scrapy采集数据过程中放弃下载过大的页面
将以下代码添加到 settings.py(其中 myproject 为你的项目名称):
# Goes in settings.py: dotted path to the HTTP client factory the downloader
# should use. Replace `myproject` with the name of your own project package.
DOWNLOADER_HTTPCLIENTFACTORY = 'myproject.downloader.LimitSizeHTTPClientFactory'
在 myproject/downloader.py 中创建用于放弃下载过大页面的自定义模块:
from scrapy.core.downloader.webclient import (
    ScrapyHTTPClientFactory,
    ScrapyHTTPPageGetter,
)

# Largest response body, in bytes, that will still be downloaded (1 MiB).
MAX_RESPONSE_SIZE = 1048576


class LimitSizePageGetter(ScrapyHTTPPageGetter):
    """Page getter that gives up on responses whose declared size is too big.

    Only the ``Content-Length`` header is inspected, so a response that does
    not send that header is not limited by this class.
    """

    def handleHeader(self, key, value):
        """Process one response header and abort on an oversized body.

        The header is first passed to ``ScrapyHTTPPageGetter.handleHeader``
        so normal header handling is unchanged. If the header is
        ``Content-Length`` and its value exceeds ``MAX_RESPONSE_SIZE``, the
        connection is reported as lost with the reason ``'oversized'``.

        Args:
            key: Header name; accepted as either text or bytes.
            value: Raw header value.
        """
        ScrapyHTTPPageGetter.handleHeader(self, key, value)

        # The header name may be delivered as bytes or as text depending on
        # the Python/Twisted version, so both spellings are accepted.
        if key.lower() not in ('content-length', b'content-length'):
            return

        try:
            declared_size = int(value)
        except (TypeError, ValueError):
            # A malformed Content-Length cannot be compared against the
            # limit; leave the response to the regular handling.
            return

        if declared_size > MAX_RESPONSE_SIZE:
            # NOTE(review): this only signals the loss to the protocol; confirm
            # that it also closes the underlying transport.
            self.connectionLost('oversized')


class LimitSizeHTTPClientFactory(ScrapyHTTPClientFactory):
    """HTTP client factory whose connections use ``LimitSizePageGetter``."""

    protocol = LimitSizePageGetter
相关文章