[{"data":1,"prerenderedAt":1228},["ShallowReactive",2],{"page-\u002Fscaling-python-web-scrapers\u002Fweb-scraping-with-scrapy\u002F":3,"content-navigation":1077},{"id":4,"title":5,"body":6,"description":1070,"extension":1071,"meta":1072,"navigation":130,"path":1073,"seo":1074,"stem":1075,"__hash__":1076},"content\u002Fscaling-python-web-scrapers\u002Fweb-scraping-with-scrapy\u002Findex.md","Web Scraping with Scrapy",{"type":7,"value":8,"toc":1061},"minimark",[9,13,28,31,36,39,76,95,99,110,473,476,498,501,505,529,590,605,609,616,808,823,827,832,925,946,950,1009,1013,1028,1041,1047,1057],[10,11,5],"h1",{"id":12},"web-scraping-with-scrapy",[14,15,16,17,21,22,27],"p",{},"Scrapy is a complete crawling framework rather than a single-purpose library. Where ",[18,19,20],"code",{},"requests"," and BeautifulSoup hand you raw HTML and leave orchestration to you, Scrapy provides an asynchronous engine, a request scheduler, automatic retries, configurable concurrency, and an item pipeline — the infrastructure that production crawls need. This guide walks through building a real spider and the settings that keep it fast and polite. For the broader context on production architecture, see ",[23,24,26],"a",{"href":25},"\u002Fscaling-python-web-scrapers\u002F","Scaling & Deploying Python Web Scrapers",".",[29,30],"diagram-scrapy-arch",{},[32,33,35],"h2",{"id":34},"installation-and-project-structure","Installation and Project Structure",[14,37,38],{},"Scrapy installs as a single package and ships a command-line tool that scaffolds projects.",[40,41,46],"pre",{"className":42,"code":43,"language":44,"meta":45,"style":45},"language-bash shiki shiki-themes material-theme-lighter github-light github-dark","pip install scrapy\nscrapy startproject bookstore\n","bash","",[18,47,48,64],{"__ignoreMap":45},[49,50,53,57,61],"span",{"class":51,"line":52},"line",1,[49,54,56],{"class":55},"sbgvK","pip",[49,58,60],{"class":59},"s_sjI"," install",[49,62,63],{"class":59}," scrapy\n",[49,65,67,70,73],{"class":51,"line":66},2,[49,68,69],{"class":55},"scrapy",[49,71,72],{"class":59}," startproject",[49,74,75],{"class":59}," bookstore\n",[14,77,78,79,82,83,86,87,90,91,94],{},"This generates a project with a predictable layout: ",[18,80,81],{},"spiders\u002F"," holds your crawlers, ",[18,84,85],{},"items.py"," defines the data schema, ",[18,88,89],{},"pipelines.py"," processes scraped records, and ",[18,92,93],{},"settings.py"," controls concurrency, throttling, and middleware. That separation of concerns is the whole point — fetching, parsing, and storage stay in distinct, testable layers.",[32,96,98],{"id":97},"writing-your-first-spider","Writing Your First Spider",[14,100,101,102,105,106,109],{},"A spider declares where to start, how to follow links, and how to parse responses. Scrapy calls your ",[18,103,104],{},"parse"," method with each downloaded response and lets you ",[18,107,108],{},"yield"," either extracted items or new requests to follow.",[40,111,115],{"className":112,"code":113,"language":114,"meta":45,"style":45},"language-python shiki shiki-themes material-theme-lighter github-light github-dark","import scrapy\n\nclass BookSpider(scrapy.Spider):\n    name = \"books\"\n    start_urls = [\"https:\u002F\u002Fbooks.toscrape.com\u002F\"]\n\n    def parse(self, response):\n        for book in response.css(\"article.product_pod\"):\n            yield {\n                \"title\": book.css(\"h3 a::attr(title)\").get(),\n                \"price\": book.css(\"p.price_color::text\").get(),\n                \"in_stock\": bool(book.css(\"p.instock.availability\")),\n            }\n\n        # Follow pagination automatically\n        next_page = response.css(\"li.next a::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page, callback=self.parse)\n","python",[18,116,117,126,132,156,176,198,203,228,259,268,307,340,377,383,388,395,425,437],{"__ignoreMap":45},[49,118,119,123],{"class":51,"line":52},[49,120,122],{"class":121},"sVHd0","import",[49,124,63],{"class":125},"su5hD",[49,127,128],{"class":51,"line":66},[49,129,131],{"emptyLinePlaceholder":130},true,"\n",[49,133,135,139,142,146,148,150,153],{"class":51,"line":134},3,[49,136,138],{"class":137},"sbsja","class",[49,140,141],{"class":55}," BookSpider",[49,143,145],{"class":144},"sP7_E","(",[49,147,69],{"class":55},[49,149,27],{"class":144},[49,151,152],{"class":55},"Spider",[49,154,155],{"class":144},"):\n",[49,157,159,162,166,170,173],{"class":51,"line":158},4,[49,160,161],{"class":125},"    name ",[49,163,165],{"class":164},"smGrS","=",[49,167,169],{"class":168},"sjJ54"," \"",[49,171,172],{"class":59},"books",[49,174,175],{"class":168},"\"\n",[49,177,179,182,184,187,190,193,195],{"class":51,"line":178},5,[49,180,181],{"class":125},"    start_urls ",[49,183,165],{"class":164},[49,185,186],{"class":144}," [",[49,188,189],{"class":168},"\"",[49,191,192],{"class":59},"https:\u002F\u002Fbooks.toscrape.com\u002F",[49,194,189],{"class":168},[49,196,197],{"class":144},"]\n",[49,199,201],{"class":51,"line":200},6,[49,202,131],{"emptyLinePlaceholder":130},[49,204,206,209,213,215,219,222,226],{"class":51,"line":205},7,[49,207,208],{"class":137},"    def",[49,210,212],{"class":211},"sGLFI"," parse",[49,214,145],{"class":144},[49,216,218],{"class":217},"smCYv","self",[49,220,221],{"class":144},",",[49,223,225],{"class":224},"sFwrP"," response",[49,227,155],{"class":144},[49,229,231,234,237,240,242,244,248,250,252,255,257],{"class":51,"line":230},8,[49,232,233],{"class":121},"        for",[49,235,236],{"class":125}," book ",[49,238,239],{"class":121},"in",[49,241,225],{"class":125},[49,243,27],{"class":144},[49,245,247],{"class":246},"slqww","css",[49,249,145],{"class":144},[49,251,189],{"class":168},[49,253,254],{"class":59},"article.product_pod",[49,256,189],{"class":168},[49,258,155],{"class":144},[49,260,262,265],{"class":51,"line":261},9,[49,263,264],{"class":121},"            yield",[49,266,267],{"class":144}," {\n",[49,269,271,274,277,279,282,285,287,289,291,293,296,298,301,304],{"class":51,"line":270},10,[49,272,273],{"class":168},"                \"",[49,275,276],{"class":59},"title",[49,278,189],{"class":168},[49,280,281],{"class":144},":",[49,283,284],{"class":125}," book",[49,286,27],{"class":144},[49,288,247],{"class":246},[49,290,145],{"class":144},[49,292,189],{"class":168},[49,294,295],{"class":59},"h3 a::attr(title)",[49,297,189],{"class":168},[49,299,300],{"class":144},").",[49,302,303],{"class":246},"get",[49,305,306],{"class":144},"(),\n",[49,308,310,312,315,317,319,321,323,325,327,329,332,334,336,338],{"class":51,"line":309},11,[49,311,273],{"class":168},[49,313,314],{"class":59},"price",[49,316,189],{"class":168},[49,318,281],{"class":144},[49,320,284],{"class":125},[49,322,27],{"class":144},[49,324,247],{"class":246},[49,326,145],{"class":144},[49,328,189],{"class":168},[49,330,331],{"class":59},"p.price_color::text",[49,333,189],{"class":168},[49,335,300],{"class":144},[49,337,303],{"class":246},[49,339,306],{"class":144},[49,341,343,345,348,350,352,356,358,361,363,365,367,369,372,374],{"class":51,"line":342},12,[49,344,273],{"class":168},[49,346,347],{"class":59},"in_stock",[49,349,189],{"class":168},[49,351,281],{"class":144},[49,353,355],{"class":354},"sZMiF"," bool",[49,357,145],{"class":144},[49,359,360],{"class":246},"book",[49,362,27],{"class":144},[49,364,247],{"class":246},[49,366,145],{"class":144},[49,368,189],{"class":168},[49,370,371],{"class":59},"p.instock.availability",[49,373,189],{"class":168},[49,375,376],{"class":144},")),\n",[49,378,380],{"class":51,"line":379},13,[49,381,382],{"class":144},"            }\n",[49,384,386],{"class":51,"line":385},14,[49,387,131],{"emptyLinePlaceholder":130},[49,389,391],{"class":51,"line":390},15,[49,392,394],{"class":393},"sutJx","        # Follow pagination automatically\n",[49,396,398,401,403,405,407,409,411,413,416,418,420,422],{"class":51,"line":397},16,[49,399,400],{"class":125},"        next_page ",[49,402,165],{"class":164},[49,404,225],{"class":125},[49,406,27],{"class":144},[49,408,247],{"class":246},[49,410,145],{"class":144},[49,412,189],{"class":168},[49,414,415],{"class":59},"li.next a::attr(href)",[49,417,189],{"class":168},[49,419,300],{"class":144},[49,421,303],{"class":246},[49,423,424],{"class":144},"()\n",[49,426,428,431,434],{"class":51,"line":427},17,[49,429,430],{"class":121},"        if",[49,432,433],{"class":125}," next_page",[49,435,436],{"class":144},":\n",[49,438,440,442,444,446,449,451,454,456,460,462,465,467,470],{"class":51,"line":439},18,[49,441,264],{"class":121},[49,443,225],{"class":125},[49,445,27],{"class":144},[49,447,448],{"class":246},"follow",[49,450,145],{"class":144},[49,452,453],{"class":246},"next_page",[49,455,221],{"class":144},[49,457,459],{"class":458},"s99_P"," callback",[49,461,165],{"class":164},[49,463,218],{"class":464},"s_hVV",[49,466,27],{"class":144},[49,468,104],{"class":469},"skxfh",[49,471,472],{"class":144},")\n",[14,474,475],{},"Run it and export the results in one command:",[40,477,479],{"className":42,"code":478,"language":44,"meta":45,"style":45},"scrapy crawl books -o books.json\n",[18,480,481],{"__ignoreMap":45},[49,482,483,485,488,491,495],{"class":51,"line":52},[49,484,69],{"class":55},[49,486,487],{"class":59}," crawl",[49,489,490],{"class":59}," books",[49,492,494],{"class":493},"stzsN"," -o",[49,496,497],{"class":59}," books.json\n",[14,499,500],{},"Scrapy handles the request queue, concurrency, and retries while your code focuses purely on extraction.",[32,502,504],{"id":503},"selectors-css-and-xpath","Selectors: CSS and XPath",[14,506,507,508,511,512,515,516,519,520,524,525,528],{},"Scrapy's ",[18,509,510],{},"response"," object exposes both CSS and XPath selectors. CSS is concise for class- and tag-based selection; XPath is more powerful for traversing relationships and matching on text. The ",[18,513,514],{},"::text"," and ",[18,517,518],{},"::attr()"," pseudo-selectors extract text and attributes directly. If you are coming from BeautifulSoup, the mental model is similar — see ",[23,521,523],{"href":522},"\u002Fthe-complete-guide-to-python-web-scraping\u002Fparsing-html-with-beautifulsoup\u002F","Parsing HTML with BeautifulSoup"," — but Scrapy selectors are backed by the fast ",[18,526,527],{},"parsel"," library and integrate with the framework's response handling.",[40,530,532],{"className":112,"code":531,"language":114,"meta":45,"style":45},"# CSS\nresponse.css(\"h3 a::attr(title)\").get()\n# Equivalent XPath\nresponse.xpath(\"\u002F\u002Fh3\u002Fa\u002F@title\").get()\n",[18,533,534,539,561,566],{"__ignoreMap":45},[49,535,536],{"class":51,"line":52},[49,537,538],{"class":393},"# CSS\n",[49,540,541,543,545,547,549,551,553,555,557,559],{"class":51,"line":66},[49,542,510],{"class":125},[49,544,27],{"class":144},[49,546,247],{"class":246},[49,548,145],{"class":144},[49,550,189],{"class":168},[49,552,295],{"class":59},[49,554,189],{"class":168},[49,556,300],{"class":144},[49,558,303],{"class":246},[49,560,424],{"class":144},[49,562,563],{"class":51,"line":134},[49,564,565],{"class":393},"# Equivalent XPath\n",[49,567,568,570,572,575,577,579,582,584,586,588],{"class":51,"line":158},[49,569,510],{"class":125},[49,571,27],{"class":144},[49,573,574],{"class":246},"xpath",[49,576,145],{"class":144},[49,578,189],{"class":168},[49,580,581],{"class":59},"\u002F\u002Fh3\u002Fa\u002F@title",[49,583,189],{"class":168},[49,585,300],{"class":144},[49,587,303],{"class":246},[49,589,424],{"class":144},[14,591,592,593,596,597,600,601,604],{},"Use ",[18,594,595],{},".get()"," for the first match and ",[18,598,599],{},".getall()"," for a list. Both return ",[18,602,603],{},"None"," (or an empty list) instead of raising when nothing matches, which keeps parsing code resilient.",[32,606,608],{"id":607},"items-and-pipelines","Items and Pipelines",[14,610,611,612,615],{},"For anything beyond a quick export, define a schema with ",[18,613,614],{},"Item"," and process records through pipelines. An item declares the fields you expect; a pipeline validates, cleans, deduplicates, or stores each scraped record as it flows through the engine.",[40,617,619],{"className":112,"code":618,"language":114,"meta":45,"style":45},"# items.py\nimport scrapy\n\nclass BookItem(scrapy.Item):\n    title = scrapy.Field()\n    price = scrapy.Field()\n    in_stock = scrapy.Field()\n\n# pipelines.py\nclass PriceCleanPipeline:\n    def process_item(self, item, spider):\n        item[\"price\"] = float(item[\"price\"].replace(\"£\", \"\"))\n        return item\n",[18,620,621,626,632,636,653,670,685,700,704,709,718,741,800],{"__ignoreMap":45},[49,622,623],{"class":51,"line":52},[49,624,625],{"class":393},"# items.py\n",[49,627,628,630],{"class":51,"line":66},[49,629,122],{"class":121},[49,631,63],{"class":125},[49,633,634],{"class":51,"line":134},[49,635,131],{"emptyLinePlaceholder":130},[49,637,638,640,643,645,647,649,651],{"class":51,"line":158},[49,639,138],{"class":137},[49,641,642],{"class":55}," BookItem",[49,644,145],{"class":144},[49,646,69],{"class":55},[49,648,27],{"class":144},[49,650,614],{"class":55},[49,652,155],{"class":144},[49,654,655,658,660,663,665,668],{"class":51,"line":178},[49,656,657],{"class":125},"    title ",[49,659,165],{"class":164},[49,661,662],{"class":125}," scrapy",[49,664,27],{"class":144},[49,666,667],{"class":246},"Field",[49,669,424],{"class":144},[49,671,672,675,677,679,681,683],{"class":51,"line":200},[49,673,674],{"class":125},"    price ",[49,676,165],{"class":164},[49,678,662],{"class":125},[49,680,27],{"class":144},[49,682,667],{"class":246},[49,684,424],{"class":144},[49,686,687,690,692,694,696,698],{"class":51,"line":205},[49,688,689],{"class":125},"    in_stock ",[49,691,165],{"class":164},[49,693,662],{"class":125},[49,695,27],{"class":144},[49,697,667],{"class":246},[49,699,424],{"class":144},[49,701,702],{"class":51,"line":230},[49,703,131],{"emptyLinePlaceholder":130},[49,705,706],{"class":51,"line":261},[49,707,708],{"class":393},"# pipelines.py\n",[49,710,711,713,716],{"class":51,"line":270},[49,712,138],{"class":137},[49,714,715],{"class":55}," PriceCleanPipeline",[49,717,436],{"class":144},[49,719,720,722,725,727,729,731,734,736,739],{"class":51,"line":309},[49,721,208],{"class":137},[49,723,724],{"class":211}," process_item",[49,726,145],{"class":144},[49,728,218],{"class":217},[49,730,221],{"class":144},[49,732,733],{"class":224}," item",[49,735,221],{"class":144},[49,737,738],{"class":224}," spider",[49,740,155],{"class":144},[49,742,743,746,749,751,753,755,758,761,764,766,769,771,773,775,777,780,783,785,787,790,792,794,797],{"class":51,"line":342},[49,744,745],{"class":125},"        item",[49,747,748],{"class":144},"[",[49,750,189],{"class":168},[49,752,314],{"class":59},[49,754,189],{"class":168},[49,756,757],{"class":144},"]",[49,759,760],{"class":164}," =",[49,762,763],{"class":354}," float",[49,765,145],{"class":144},[49,767,768],{"class":246},"item",[49,770,748],{"class":144},[49,772,189],{"class":168},[49,774,314],{"class":59},[49,776,189],{"class":168},[49,778,779],{"class":144},"].",[49,781,782],{"class":246},"replace",[49,784,145],{"class":144},[49,786,189],{"class":168},[49,788,789],{"class":59},"£",[49,791,189],{"class":168},[49,793,221],{"class":144},[49,795,796],{"class":168}," \"\"",[49,798,799],{"class":144},"))\n",[49,801,802,805],{"class":51,"line":379},[49,803,804],{"class":121},"        return",[49,806,807],{"class":125}," item\n",[14,809,810,811,813,814,817,818,822],{},"Enable the pipeline in ",[18,812,93],{}," with ",[18,815,816],{},"ITEM_PIPELINES = {\"bookstore.pipelines.PriceCleanPipeline\": 300}",". Pipelines are where storage logic belongs — see ",[23,819,821],{"href":820},"\u002Fscaling-python-web-scrapers\u002Fstoring-and-exporting-scraped-data\u002F","Storing and Exporting Scraped Data"," for writing to databases from a pipeline.",[32,824,826],{"id":825},"concurrency-throttling-and-politeness","Concurrency, Throttling, and Politeness",[14,828,829,830,281],{},"Scrapy is concurrent by default, which makes responsible configuration essential. The key settings in ",[18,831,93],{},[40,833,835],{"className":112,"code":834,"language":114,"meta":45,"style":45},"# settings.py\nCONCURRENT_REQUESTS = 16\nCONCURRENT_REQUESTS_PER_DOMAIN = 8\nDOWNLOAD_DELAY = 0.5            # base delay between requests\nAUTOTHROTTLE_ENABLED = True    # adapt delay to server response time\nAUTOTHROTTLE_TARGET_CONCURRENCY = 4.0\nRETRY_TIMES = 3                # retry transient failures\nROBOTSTXT_OBEY = True          # respect robots.txt by default\n",[18,836,837,842,853,863,876,890,900,913],{"__ignoreMap":45},[49,838,839],{"class":51,"line":52},[49,840,841],{"class":393},"# settings.py\n",[49,843,844,847,849],{"class":51,"line":66},[49,845,846],{"class":464},"CONCURRENT_REQUESTS",[49,848,760],{"class":164},[49,850,852],{"class":851},"srdBf"," 16\n",[49,854,855,858,860],{"class":51,"line":134},[49,856,857],{"class":464},"CONCURRENT_REQUESTS_PER_DOMAIN",[49,859,760],{"class":164},[49,861,862],{"class":851}," 8\n",[49,864,865,868,870,873],{"class":51,"line":158},[49,866,867],{"class":464},"DOWNLOAD_DELAY",[49,869,760],{"class":164},[49,871,872],{"class":851}," 0.5",[49,874,875],{"class":393},"            # base delay between requests\n",[49,877,878,881,883,887],{"class":51,"line":178},[49,879,880],{"class":464},"AUTOTHROTTLE_ENABLED",[49,882,760],{"class":164},[49,884,886],{"class":885},"s39Yj"," True",[49,888,889],{"class":393},"    # adapt delay to server response time\n",[49,891,892,895,897],{"class":51,"line":200},[49,893,894],{"class":464},"AUTOTHROTTLE_TARGET_CONCURRENCY",[49,896,760],{"class":164},[49,898,899],{"class":851}," 4.0\n",[49,901,902,905,907,910],{"class":51,"line":205},[49,903,904],{"class":464},"RETRY_TIMES",[49,906,760],{"class":164},[49,908,909],{"class":851}," 3",[49,911,912],{"class":393},"                # retry transient failures\n",[49,914,915,918,920,922],{"class":51,"line":230},[49,916,917],{"class":464},"ROBOTSTXT_OBEY",[49,919,760],{"class":164},[49,921,886],{"class":885},[49,923,924],{"class":393},"          # respect robots.txt by default\n",[14,926,927,930,931,933,934,937,938,941,942,27],{},[18,928,929],{},"AUTOTHROTTLE"," is particularly valuable: it automatically adjusts delays based on server latency, slowing down when the target is under load. Combined with ",[18,932,904],{}," and the built-in retry middleware, it handles the ",[18,935,936],{},"429","\u002F",[18,939,940],{},"503"," backoff logic you would otherwise write by hand. For evading more aggressive defenses, integrate proxy and header rotation from ",[23,943,945],{"href":944},"\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Frotating-proxies-and-managing-ip-blocks\u002F","Rotating Proxies and Managing IP Blocks",[32,947,949],{"id":948},"common-mistakes-to-avoid","Common Mistakes to Avoid",[951,952,953,971,980,988,1000],"ul",{},[954,955,956,960,961,964,965,967,968,970],"li",{},[957,958,959],"strong",{},"Blocking the event loop:"," Scrapy is asynchronous. Calling ",[18,962,963],{},"time.sleep()"," or synchronous ",[18,966,20],{}," inside a spider stalls the entire engine. Use ",[18,969,867],{}," and yield requests instead.",[954,972,973,976,977,979],{},[957,974,975],{},"Disabling AutoThrottle then setting concurrency too high:"," this is the fastest route to an IP ban. Let AutoThrottle adapt, or tune ",[18,978,867],{}," conservatively.",[954,981,982,987],{},[957,983,984,985,281],{},"Putting storage logic in ",[18,986,104],{}," keep parsing pure and move persistence into pipelines so it is reusable and testable.",[954,989,990,996,997,999],{},[957,991,992,993,281],{},"Ignoring ",[18,994,995],{},"response.follow"," building absolute URLs by hand is error-prone; ",[18,998,995],{}," resolves relative links for you.",[954,1001,1002,1005,1006,1008],{},[957,1003,1004],{},"Forgetting to handle missing fields:"," always assume a selector may return ",[18,1007,603],{}," and validate before downstream processing.",[32,1010,1012],{"id":1011},"frequently-asked-questions","Frequently Asked Questions",[14,1014,1015,1018,1019,1023,1024,1027],{},[957,1016,1017],{},"Does Scrapy render JavaScript?","\nNo — Scrapy fetches raw HTML and does not execute JavaScript. For dynamic, JS-rendered sites, use a headless browser such as ",[23,1020,1022],{"href":1021},"\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Fusing-playwright-for-modern-web-automation\u002F","Playwright",", or integrate ",[18,1025,1026],{},"scrapy-playwright"," to combine the two.",[14,1029,1030,1033,1034,1036,1037,27],{},[957,1031,1032],{},"Is Scrapy overkill for small projects?","\nFor a few pages, yes — ",[18,1035,20],{}," and BeautifulSoup are simpler. Scrapy pays off when you crawl many linked pages, need retries and throttling, or run the job repeatedly. See ",[23,1038,1040],{"href":1039},"\u002Fscaling-python-web-scrapers\u002Fweb-scraping-with-scrapy\u002Fscrapy-vs-beautifulsoup-which-to-use\u002F","Scrapy vs BeautifulSoup: Which to Use",[14,1042,1043,1046],{},[957,1044,1045],{},"How do I schedule Scrapy crawls?","\nRun spiders from cron or a task scheduler, or use Scrapyd \u002F a managed service to deploy and schedule spiders with an API. The crawl itself stays the same code.",[14,1048,1049,1052,1053,1056],{},[957,1050,1051],{},"Can Scrapy resume an interrupted crawl?","\nYes. Enable a persistent job directory with ",[18,1054,1055],{},"JOBDIR"," so the scheduler and deduplication filter survive restarts, letting a stopped crawl pick up where it left off.",[1058,1059,1060],"style",{},"html pre.shiki code .sbgvK, html code.shiki .sbgvK{--shiki-light:#E2931D;--shiki-default:#6F42C1;--shiki-dark:#B392F0}html pre.shiki code .s_sjI, html code.shiki .s_sjI{--shiki-light:#91B859;--shiki-default:#032F62;--shiki-dark:#9ECBFF}html .light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html.light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html pre.shiki code .sVHd0, html code.shiki .sVHd0{--shiki-light:#39ADB5;--shiki-light-font-style:italic;--shiki-default:#D73A49;--shiki-default-font-style:inherit;--shiki-dark:#F97583;--shiki-dark-font-style:inherit}html pre.shiki code .su5hD, html code.shiki .su5hD{--shiki-light:#90A4AE;--shiki-default:#24292E;--shiki-dark:#E1E4E8}html pre.shiki code .sbsja, html code.shiki .sbsja{--shiki-light:#9C3EDA;--shiki-default:#D73A49;--shiki-dark:#F97583}html pre.shiki code .sP7_E, html code.shiki .sP7_E{--shiki-light:#39ADB5;--shiki-default:#24292E;--shiki-dark:#E1E4E8}html pre.shiki code .smGrS, html code.shiki .smGrS{--shiki-light:#39ADB5;--shiki-default:#D73A49;--shiki-dark:#F97583}html pre.shiki code .sjJ54, html code.shiki .sjJ54{--shiki-light:#39ADB5;--shiki-default:#032F62;--shiki-dark:#9ECBFF}html pre.shiki code .sGLFI, html code.shiki .sGLFI{--shiki-light:#6182B8;--shiki-default:#6F42C1;--shiki-dark:#B392F0}html pre.shiki code .smCYv, html code.shiki .smCYv{--shiki-light:#E53935;--shiki-light-font-style:italic;--shiki-default:#24292E;--shiki-default-font-style:inherit;--shiki-dark:#E1E4E8;--shiki-dark-font-style:inherit}html pre.shiki code .sFwrP, html code.shiki .sFwrP{--shiki-light:#90A4AE;--shiki-light-font-style:italic;--shiki-default:#24292E;--shiki-default-font-style:inherit;--shiki-dark:#E1E4E8;--shiki-dark-font-style:inherit}html pre.shiki code .slqww, html code.shiki .slqww{--shiki-light:#6182B8;--shiki-default:#24292E;--shiki-dark:#E1E4E8}html pre.shiki code .sZMiF, html code.shiki .sZMiF{--shiki-light:#E2931D;--shiki-default:#005CC5;--shiki-dark:#79B8FF}html pre.shiki code .sutJx, html code.shiki .sutJx{--shiki-light:#90A4AE;--shiki-light-font-style:italic;--shiki-default:#6A737D;--shiki-default-font-style:inherit;--shiki-dark:#6A737D;--shiki-dark-font-style:inherit}html pre.shiki code .s99_P, html code.shiki .s99_P{--shiki-light:#90A4AE;--shiki-light-font-style:italic;--shiki-default:#E36209;--shiki-default-font-style:inherit;--shiki-dark:#FFAB70;--shiki-dark-font-style:inherit}html pre.shiki code .s_hVV, html code.shiki .s_hVV{--shiki-light:#90A4AE;--shiki-default:#005CC5;--shiki-dark:#79B8FF}html pre.shiki code .skxfh, html code.shiki .skxfh{--shiki-light:#E53935;--shiki-default:#24292E;--shiki-dark:#E1E4E8}html pre.shiki code .stzsN, html code.shiki .stzsN{--shiki-light:#91B859;--shiki-default:#005CC5;--shiki-dark:#79B8FF}html pre.shiki code .srdBf, html code.shiki .srdBf{--shiki-light:#F76D47;--shiki-default:#005CC5;--shiki-dark:#79B8FF}html pre.shiki code .s39Yj, html code.shiki .s39Yj{--shiki-light:#39ADB5;--shiki-default:#005CC5;--shiki-dark:#79B8FF}",{"title":45,"searchDepth":66,"depth":66,"links":1062},[1063,1064,1065,1066,1067,1068,1069],{"id":34,"depth":66,"text":35},{"id":97,"depth":66,"text":98},{"id":503,"depth":66,"text":504},{"id":607,"depth":66,"text":608},{"id":825,"depth":66,"text":826},{"id":948,"depth":66,"text":949},{"id":1011,"depth":66,"text":1012},"Build production crawlers with Scrapy — project structure, spiders, selectors, item pipelines, concurrency and throttling settings, and exporting data at scale.","md",{},"\u002Fscaling-python-web-scrapers\u002Fweb-scraping-with-scrapy",{"title":5,"description":1070},"scaling-python-web-scrapers\u002Fweb-scraping-with-scrapy\u002Findex","kWLJphF-0ZUPwrwFW0HNoyvLZRRWiB7UCSPzgwTM5j4",[1078,1128,1154],{"title":1079,"path":1080,"stem":1081,"children":1082,"page":-1},"Advanced Scraping Techniques Anti Bot Evasion","\u002Fadvanced-scraping-techniques-anti-bot-evasion","advanced-scraping-techniques-anti-bot-evasion",[1083,1086,1092,1104,1116],{"title":1084,"path":1080,"stem":1085},"Advanced Python Scraping & Anti-Bot Evasion","advanced-scraping-techniques-anti-bot-evasion\u002Findex",{"title":1087,"path":1088,"stem":1089,"children":1090},"Bypass Cloudflare & Akamai with Python","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Fbypassing-cloudflare-and-akamai-protections","advanced-scraping-techniques-anti-bot-evasion\u002Fbypassing-cloudflare-and-akamai-protections\u002Findex",[1091],{"title":1087,"path":1088,"stem":1089},{"title":1093,"path":1094,"stem":1095,"children":1096,"page":-1},"Mastering Selenium for Dynamic Websites","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Fmastering-selenium-for-dynamic-websites","advanced-scraping-techniques-anti-bot-evasion\u002Fmastering-selenium-for-dynamic-websites\u002Findex",[1097,1098],{"title":1093,"path":1094,"stem":1095},{"title":1099,"path":1100,"stem":1101,"children":1102},"Python Selenium Stealth Setup Guide","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Fmastering-selenium-for-dynamic-websites\u002Fhow-to-configure-selenium-stealth-to-avoid-detection","advanced-scraping-techniques-anti-bot-evasion\u002Fmastering-selenium-for-dynamic-websites\u002Fhow-to-configure-selenium-stealth-to-avoid-detection\u002Findex",[1103],{"title":1099,"path":1100,"stem":1101},{"title":1105,"path":1106,"stem":1107,"children":1108,"page":-1},"Rotating Proxies & Managing IP Blocks","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Frotating-proxies-and-managing-ip-blocks","advanced-scraping-techniques-anti-bot-evasion\u002Frotating-proxies-and-managing-ip-blocks\u002Findex",[1109,1110],{"title":1105,"path":1106,"stem":1107},{"title":1111,"path":1112,"stem":1113,"children":1114},"Best Proxy Providers for Python Scrapers","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Frotating-proxies-and-managing-ip-blocks\u002Fbest-free-and-paid-proxy-providers-for-scraping","advanced-scraping-techniques-anti-bot-evasion\u002Frotating-proxies-and-managing-ip-blocks\u002Fbest-free-and-paid-proxy-providers-for-scraping\u002Findex",[1115],{"title":1111,"path":1112,"stem":1113},{"title":1117,"path":1118,"stem":1119,"children":1120},"Playwright for Python Web Automation","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Fusing-playwright-for-modern-web-automation","advanced-scraping-techniques-anti-bot-evasion\u002Fusing-playwright-for-modern-web-automation\u002Findex",[1121,1122],{"title":1117,"path":1118,"stem":1119},{"title":1123,"path":1124,"stem":1125,"children":1126},"Playwright vs Selenium: Python Benchmarks","\u002Fadvanced-scraping-techniques-anti-bot-evasion\u002Fusing-playwright-for-modern-web-automation\u002Fplaywright-vs-selenium-performance-benchmarks","advanced-scraping-techniques-anti-bot-evasion\u002Fusing-playwright-for-modern-web-automation\u002Fplaywright-vs-selenium-performance-benchmarks\u002Findex",[1127],{"title":1123,"path":1124,"stem":1125},{"title":1129,"path":1130,"stem":1131,"children":1132,"page":-1},"Scaling Python Web Scrapers","\u002Fscaling-python-web-scrapers","scaling-python-web-scrapers",[1133,1135,1141,1146],{"title":26,"path":1130,"stem":1134},"scaling-python-web-scrapers\u002Findex",{"title":1136,"path":1137,"stem":1138,"children":1139},"Asynchronous Scraping with asyncio and HTTPX","\u002Fscaling-python-web-scrapers\u002Fasynchronous-scraping-with-asyncio-and-httpx","scaling-python-web-scrapers\u002Fasynchronous-scraping-with-asyncio-and-httpx\u002Findex",[1140],{"title":1136,"path":1137,"stem":1138},{"title":821,"path":1142,"stem":1143,"children":1144},"\u002Fscaling-python-web-scrapers\u002Fstoring-and-exporting-scraped-data","scaling-python-web-scrapers\u002Fstoring-and-exporting-scraped-data\u002Findex",[1145],{"title":821,"path":1142,"stem":1143},{"title":5,"path":1073,"stem":1075,"children":1147},[1148,1149],{"title":5,"path":1073,"stem":1075},{"title":1040,"path":1150,"stem":1151,"children":1152},"\u002Fscaling-python-web-scrapers\u002Fweb-scraping-with-scrapy\u002Fscrapy-vs-beautifulsoup-which-to-use","scaling-python-web-scrapers\u002Fweb-scraping-with-scrapy\u002Fscrapy-vs-beautifulsoup-which-to-use\u002Findex",[1153],{"title":1040,"path":1150,"stem":1151},{"title":1155,"path":1156,"stem":1157,"children":1158,"page":-1},"The Complete Guide To Python Web Scraping","\u002Fthe-complete-guide-to-python-web-scraping","the-complete-guide-to-python-web-scraping",[1159,1162,1174,1186,1192,1204,1216],{"title":1160,"path":1156,"stem":1161},"The Complete Python Web Scraping Guide","the-complete-guide-to-python-web-scraping\u002Findex",{"title":1163,"path":1164,"stem":1165,"children":1166,"page":-1},"Regex Data Extraction in Python Scraping","\u002Fthe-complete-guide-to-python-web-scraping\u002Fextracting-data-with-regular-expressions","the-complete-guide-to-python-web-scraping\u002Fextracting-data-with-regular-expressions\u002Findex",[1167,1168],{"title":1163,"path":1164,"stem":1165},{"title":1169,"path":1170,"stem":1171,"children":1172},"Fix Unicode Errors in Python Web Scraping","\u002Fthe-complete-guide-to-python-web-scraping\u002Fextracting-data-with-regular-expressions\u002Ffixing-common-unicode-errors-in-python-scraping","the-complete-guide-to-python-web-scraping\u002Fextracting-data-with-regular-expressions\u002Ffixing-common-unicode-errors-in-python-scraping\u002Findex",[1173],{"title":1169,"path":1170,"stem":1171},{"title":1175,"path":1176,"stem":1177,"children":1178,"page":-1},"Pagination & Infinite Scroll in Python","\u002Fthe-complete-guide-to-python-web-scraping\u002Fhandling-pagination-and-infinite-scroll","the-complete-guide-to-python-web-scraping\u002Fhandling-pagination-and-infinite-scroll\u002Findex",[1179,1180],{"title":1175,"path":1176,"stem":1177},{"title":1181,"path":1182,"stem":1183,"children":1184},"Scrape Static Sites Without Getting Blocked","\u002Fthe-complete-guide-to-python-web-scraping\u002Fhandling-pagination-and-infinite-scroll\u002Fhow-to-scrape-a-static-website-without-getting-blocked","the-complete-guide-to-python-web-scraping\u002Fhandling-pagination-and-infinite-scroll\u002Fhow-to-scrape-a-static-website-without-getting-blocked\u002Findex",[1185],{"title":1181,"path":1182,"stem":1183},{"title":1187,"path":1188,"stem":1189,"children":1190},"Managing Cookies & Sessions in Python","\u002Fthe-complete-guide-to-python-web-scraping\u002Fmanaging-cookies-and-sessions","the-complete-guide-to-python-web-scraping\u002Fmanaging-cookies-and-sessions\u002Findex",[1191],{"title":1187,"path":1188,"stem":1189},{"title":1193,"path":1194,"stem":1195,"children":1196,"page":-1},"Parsing HTML with BeautifulSoup in Python","\u002Fthe-complete-guide-to-python-web-scraping\u002Fparsing-html-with-beautifulsoup","the-complete-guide-to-python-web-scraping\u002Fparsing-html-with-beautifulsoup\u002Findex",[1197,1198],{"title":1193,"path":1194,"stem":1195},{"title":1199,"path":1200,"stem":1201,"children":1202},"BeautifulSoup vs lxml Speed Comparison","\u002Fthe-complete-guide-to-python-web-scraping\u002Fparsing-html-with-beautifulsoup\u002Fbeautifulsoup-vs-lxml-which-parser-is-faster","the-complete-guide-to-python-web-scraping\u002Fparsing-html-with-beautifulsoup\u002Fbeautifulsoup-vs-lxml-which-parser-is-faster\u002Findex",[1203],{"title":1199,"path":1200,"stem":1201},{"title":1205,"path":1206,"stem":1207,"children":1208,"page":-1},"Setting Up Your Python Scraping Environment","\u002Fthe-complete-guide-to-python-web-scraping\u002Fsetting-up-your-python-scraping-environment","the-complete-guide-to-python-web-scraping\u002Fsetting-up-your-python-scraping-environment\u002Findex",[1209,1210],{"title":1205,"path":1206,"stem":1207},{"title":1211,"path":1212,"stem":1213,"children":1214},"Install Python & Requests for Beginners","\u002Fthe-complete-guide-to-python-web-scraping\u002Fsetting-up-your-python-scraping-environment\u002Fhow-to-install-python-and-requests-for-beginners","the-complete-guide-to-python-web-scraping\u002Fsetting-up-your-python-scraping-environment\u002Fhow-to-install-python-and-requests-for-beginners\u002Findex",[1215],{"title":1211,"path":1212,"stem":1213},{"title":1217,"path":1218,"stem":1219,"children":1220},"HTTP Requests & Responses for Scrapers","\u002Fthe-complete-guide-to-python-web-scraping\u002Funderstanding-http-requests-and-responses","the-complete-guide-to-python-web-scraping\u002Funderstanding-http-requests-and-responses\u002Findex",[1221,1222],{"title":1217,"path":1218,"stem":1219},{"title":1223,"path":1224,"stem":1225,"children":1226},"Extract HTML Tables with Python","\u002Fthe-complete-guide-to-python-web-scraping\u002Funderstanding-http-requests-and-responses\u002Fstep-by-step-guide-to-extracting-tables-from-html","the-complete-guide-to-python-web-scraping\u002Funderstanding-http-requests-and-responses\u002Fstep-by-step-guide-to-extracting-tables-from-html\u002Findex",[1227],{"title":1223,"path":1224,"stem":1225},1781700486721]