# Scrapy Web Scraping

Expert guidance for building web scrapers and crawlers with the Scrapy Python framework, covering best practices for spider development, data extraction, and pipeline management.
## Installation

```bash
npx skill4agent add mindrally/skills scrapy-web-scraping
```

## Project Structure

A standard Scrapy project follows this layout:

```
myproject/
    scrapy.cfg
    myproject/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            myspider.py
```
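The loader example later in this guide imports a `ProductItem` from `items.py`. A minimal sketch of what that declaration might look like, assuming the three fields the loader fills; the processors shown are illustrative choices, not part of the original guide (the `itemloaders` package ships as a Scrapy dependency):

```python
# myproject/items.py -- minimal sketch of the ProductItem assumed
# by the ItemLoader example below; processors are illustrative.
import scrapy
from itemloaders.processors import MapCompose, TakeFirst


class ProductItem(scrapy.Item):
    # TakeFirst() collapses the extracted list to a single value;
    # str.strip trims surrounding whitespace from each raw value.
    name = scrapy.Field(input_processor=MapCompose(str.strip),
                        output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(str.strip),
                         output_processor=TakeFirst())
    description = scrapy.Field(output_processor=TakeFirst())
```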
## Spider Development

Keep each spider focused on one site or one data source. Set `allowed_domains` so the crawl cannot wander off-site, override `start_requests()` when the initial requests need custom headers, cookies, or seeding logic, and keep `parse()` small, delegating detail pages to dedicated callbacks.
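A minimal sketch of such a spider; the domain, start URL, and selectors are placeholder assumptions:

```python
# Minimal spider skeleton -- domain, URLs, and selectors are
# placeholder assumptions for illustration.
import scrapy


class ProductSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["example.com"]  # keeps the crawl on-site

    def start_requests(self):
        # start_requests() lets you attach custom headers or cookies
        # to the very first requests.
        yield scrapy.Request(
            "https://example.com/catalog",
            headers={"Accept-Language": "en"},
            callback=self.parse,
        )

    def parse(self, response):
        # Keep parse() small: follow detail links and delegate
        # extraction to a dedicated callback.
        for href in response.css("a.product::attr(href)").getall():
            yield response.follow(href, callback=self.parse_product)

    def parse_product(self, response):
        yield {"url": response.url,
               "name": response.css("h1.product-title::text").get()}
```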
## Data Extraction

Prefer `ItemLoader` over ad-hoc dictionary building: it centralizes cleaning in input/output processors and keeps callbacks declarative. Use CSS selectors with the `::text` and `::attr()` pseudo-elements for simple fields, and fall back to XPath where the markup demands it.

```python
# Good practice: Using ItemLoader
from scrapy.loader import ItemLoader
from myproject.items import ProductItem

def parse_product(self, response):
    loader = ItemLoader(item=ProductItem(), response=response)
    loader.add_css('name', 'h1.product-title::text')
    loader.add_css('price', 'span.price::text')
    loader.add_xpath('description', '//div[@class="desc"]/text()')
    yield loader.load_item()
```

## Throttling and Politeness

Respect the sites you crawl: set a non-zero `DOWNLOAD_DELAY`, enable `AUTOTHROTTLE` so Scrapy adapts its pace to observed server latency, and cap `CONCURRENT_REQUESTS_PER_DOMAIN` so no single host is hammered. If a site blocks the default user agent, the third-party `scrapy-fake-useragent` package rotates user-agent strings via a downloader middleware, as sketched below.
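A sketch of the corresponding settings, assuming `scrapy-fake-useragent`'s documented middleware path; verify it against the version you install:

```python
# settings.py -- illustrative throttling + user-agent rotation setup.
DOWNLOAD_DELAY = 0.5                  # base delay between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4    # cap pressure on any one host

AUTOTHROTTLE_ENABLED = True           # adapt pace to server latency

DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in user-agent handling...
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # ...and let scrapy-fake-useragent pick a random one per request.
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
```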
## Item Pipelines

Do validation and cleanup in pipelines rather than in spider callbacks, and drop incomplete records early with `DropItem`. Remember to activate each pipeline in the `ITEM_PIPELINES` setting.

```python
from scrapy.exceptions import DropItem

class ValidationPipeline:
    def process_item(self, item, spider):
        if not item.get('name'):
            raise DropItem("Missing name field")
        return item
```

## Error Handling and Caching

Attach an `errback` to requests so DNS failures, timeouts, and non-2xx responses are logged and handled instead of silently lost (see the sketch below). During development, enable `HTTPCACHE_ENABLED` so repeated runs replay cached responses rather than re-hitting the site. For long-running crawls, the `scrapy.extensions.memusage` extension (the `MEMUSAGE_*` settings) can warn or shut the spider down before it exhausts memory.
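A sketch of errback wiring, following the pattern from Scrapy's documentation; the handler name and URL are placeholders. `failure.check()` distinguishes HTTP errors from lower-level network failures:

```python
# Illustrative errback wiring -- handler name and URL are assumptions.
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError


class RobustSpider(scrapy.Spider):
    name = "robust"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com/",
            callback=self.parse,
            errback=self.handle_error,  # called on any failure
        )

    def parse(self, response):
        self.logger.info("got %s", response.url)

    def handle_error(self, failure):
        # failure.check() tells us which kind of error occurred.
        if failure.check(HttpError):
            self.logger.error("HTTP error on %s", failure.value.response.url)
        elif failure.check(DNSLookupError, TimeoutError):
            self.logger.error("network error: %r", failure.request.url)
        else:
            self.logger.error("unexpected failure: %r", failure)
```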
## Production Settings

```python
# Recommended production settings
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
ROBOTSTXT_OBEY = True
HTTPCACHE_ENABLED = True
LOG_LEVEL = 'INFO'
```

## Testing

Use `scrapy.contracts` to attach lightweight tests directly to spider callbacks: contract directives in a callback's docstring declare the expected output, and `scrapy check` verifies them against a live response without running a full crawl.
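A sketch using the built-in contract directives; the URL and field names are placeholders:

```python
def parse_product(self, response):
    """Extract a single product page.

    @url https://example.com/product/1
    @returns items 1 1
    @returns requests 0 0
    @scrapes name price
    """
    # ... extraction logic as in the ItemLoader example above ...
```

Running `scrapy check <spider>` fetches each `@url`, invokes the callback, and asserts the `@returns` and `@scrapes` conditions.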