Scrapy
Python. Enterprise web scraping framework. Integrate via a custom downloader middleware for automatic proxy rotation on every request.
# middlewares.py
import base64
class ZentisProxyMiddleware:
    """Scrapy downloader middleware that sends every request through one proxy.

    Each outgoing request gets ``request.meta["proxy"]`` set to the configured
    proxy URL and, when credentials were supplied, a ``Proxy-Authorization``
    header using HTTP Basic authentication.

    NOTE(review): any per-request IP rotation is presumably performed by the
    proxy gateway itself; this class always targets the same URL.
    """

    def __init__(self, proxy_url, proxy_user, proxy_pass):
        """Store the proxy endpoint and pre-compute the Basic auth token.

        Args:
            proxy_url: Proxy endpoint, e.g. ``"http://host:port"``.
            proxy_user: Proxy username, or ``None`` for an unauthenticated proxy.
            proxy_pass: Proxy password, or ``None`` for an unauthenticated proxy.

        Raises:
            ValueError: If ``proxy_url`` is missing or empty.
        """
        if not proxy_url:
            # Fail at start-up instead of writing meta["proxy"] = None on every request.
            raise ValueError(
                "ZentisProxyMiddleware requires a proxy URL (set ZENTIS_PROXY_URL)"
            )
        self.proxy_url = proxy_url

        if proxy_user is None or proxy_pass is None:
            # Unset credentials must not be encoded as the literal "None:None".
            self.proxy_auth = None
        else:
            # Encode once here so process_request does no per-request base64 work.
            credentials = f"{proxy_user}:{proxy_pass}".encode()
            self.proxy_auth = base64.b64encode(credentials).decode()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the ``ZENTIS_PROXY_*`` crawler settings."""
        settings = crawler.settings
        return cls(
            proxy_url=settings.get("ZENTIS_PROXY_URL"),
            proxy_user=settings.get("ZENTIS_PROXY_USER"),
            proxy_pass=settings.get("ZENTIS_PROXY_PASS"),
        )

    def process_request(self, request, spider):
        """Attach the proxy URL and, if configured, the auth header to ``request``.

        Returns ``None`` implicitly after mutating ``request`` in place.
        """
        request.meta["proxy"] = self.proxy_url
        if self.proxy_auth is not None:
            request.headers["Proxy-Authorization"] = (
                b"Basic " + self.proxy_auth.encode()
            )


# settings.py
ZENTIS_PROXY_URL = "http://gate.zentislabs.com:8080"
ZENTIS_PROXY_USER = "customer-USERNAME-cc-US"
# Placeholder: replace with the real password (ideally loaded from the
# environment rather than committed to the repository).
ZENTIS_PROXY_PASS = "YOUR_PASSWORD"
# Register the proxy middleware with Scrapy's downloader middleware chain.
# The integer is the middleware's order value within that chain.
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ZentisProxyMiddleware": 610,
}

# Recommended Scrapy settings for proxy usage
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
DOWNLOAD_DELAY = 0.25
RETRY_TIMES = 3
# Retry on rate limiting (429) and on server/gateway errors (5xx).
RETRY_HTTP_CODES = [429, 500, 502, 503, 504]