Exploring How Scrapy Works
def _next_request_from_scheduler(self, spider):  # the engine pulls the next request from the scheduler
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        return
    d = self._download(request, spider)  # register a download and get back a deferred
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))  # remove the request from the engine's slot
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())  # schedule the next pass
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d

def _handle_downloader_output(self, response, request, spider):
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        self.crawl(response, spider)  # put the new request into the slot's queue
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)  # deferred for the scraper's output
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
After each request's deferred passes through _handle_downloader_output, yet another deferred is returned, so the whole chain stays asynchronous.
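The engine code above leans entirely on Twisted's Deferred chaining: addBoth runs on success and on failure, addErrback runs only on failure, and a callback that returns another Deferred pauses the outer chain until the inner one fires. The following is a minimal, self-contained sketch of that pattern in plain Twisted (the names and URL are made up, this is not Scrapy code):

from twisted.internet import defer, reactor

def download(url):
    # stand-in for engine._download(): returns a Deferred that fires later
    d = defer.Deferred()
    reactor.callLater(0.1, d.callback, 'body of %s' % url)
    return d

def handle_output(body):
    # a callback may return another Deferred; the outer chain waits for it
    inner = defer.Deferred()
    reactor.callLater(0.1, inner.callback, body.upper())
    return inner

def report(result):
    print('done: %s' % result)

def stop(_):
    reactor.stop()

d = download('http://example.com')
d.addBoth(handle_output)                     # like the engine's addBoth: success or failure
d.addErrback(lambda f: f.printTraceback())   # like addErrback: failures only
d.addBoth(report)
d.addBoth(stop)
reactor.run()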
def enqueue_scrape(self, response, request, spider):
    slot = self.slot
    dfd = slot.add_response_request(response, request)  # append (response, request, deferred) to the scraper's queue
    def finish_scraping(_):
        slot.finish_response(response, request)
        self._check_if_closing(spider, slot)
        self._scrape_next(spider, slot)
        return _
    dfd.addBoth(finish_scraping)
    dfd.addErrback(
        lambda f: logger.error('Scraper bug processing %(request)s',
                               {'request': request},
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    self._scrape_next(spider, slot)
    return dfd

def _scrape_next(self, spider, slot):
    while slot.queue:
        response, request, deferred = slot.next_response_request_deferred()
        self._scrape(response, request, spider).chainDeferred(deferred)

def _scrape(self, response, request, spider):
    """Handle the downloaded response or failure through the spider
    callback/errback"""
    assert isinstance(response, (Response, Failure))
    dfd = self._scrape2(response, request, spider)  # returns the spider's processed output
    dfd.addErrback(self.handle_spider_error, request, response, spider)
    dfd.addCallback(self.handle_spider_output, request, response, spider)
    return dfd

def _scrape2(self, request_result, request, spider):
    """Handle the different cases of request's result being a Response or a
    Failure"""
    if not isinstance(request_result, Failure):  # the result is a Response: run it through the spider middleware
        return self.spidermw.scrape_response(
            self.call_spider, request_result, request, spider)
    else:
        # FIXME: don't ignore errors in spider middleware
        dfd = self.call_spider(request_result, request, spider)
        return dfd.addErrback(
            self._log_download_errors, request_result, request, spider)

def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)  # addCallback returns the deferred
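call_spider is where the response finally reaches user code: it fires request.callback if one was set, otherwise spider.parse, and routes failures to request.errback. As a hedged illustration of that dispatch from the spider's side (a made-up spider and URLs, not taken from Scrapy):

import scrapy

class BooksSpider(scrapy.Spider):
    # hypothetical spider, only to show which function call_spider ends up invoking
    name = 'books'
    start_urls = ['http://example.com/catalog']   # made-up URL

    def parse(self, response):
        # default callback: chosen when the Request carries no callback of its own
        for href in response.css('a.book::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_book,   # call_spider will fire this on success
                                 errback=self.on_error)      # ... or this on a Failure

    def parse_book(self, response):
        yield {'title': response.css('h1::text').extract_first()}

    def on_error(self, failure):
        self.logger.error('request failed: %r', failure)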
iterate_spider_output (scrapy/utils/spider.py) is the final callback added by call_spider; it normalizes whatever the spider callback returned into an iterable:

def iterate_spider_output(result):
    return arg_to_iter(result)  # from scrapy.utils.misc import arg_to_iter

def arg_to_iter(arg):
    """Convert an argument to an iterable. The argument can be a None, single
    value, or an iterable. Exception: if arg is a dict, [arg] will be returned
    """
    if arg is None:
        return []
    elif not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, '__iter__'):
        return arg
    else:
        return [arg]
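The effect of arg_to_iter is easiest to see from a few concrete calls; the expected values below follow directly from the code above (a minimal demonstration, not part of the original post):

from scrapy.utils.misc import arg_to_iter

print(arg_to_iter(None))             # []              -- None becomes an empty iterable
print(arg_to_iter({'title': 'x'}))   # [{'title': 'x'}] -- a dict counts as a single item
print(arg_to_iter('abc'))            # ['abc']         -- strings are single values, not iterated char by char
print(arg_to_iter([1, 2, 3]))        # [1, 2, 3]       -- other iterables pass through unchanged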
class SpiderMiddlewareManager(MiddlewareManager):

    component_name = 'spider middleware'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))

    def _add_middleware(self, mw):
        super(SpiderMiddlewareManager, self)._add_middleware(mw)
        # input hooks are appended (run in settings order); output/exception/start_requests
        # hooks are inserted at the front (run in reverse order)
        if hasattr(mw, 'process_spider_input'):
            self.methods['process_spider_input'].append(mw.process_spider_input)
        if hasattr(mw, 'process_spider_output'):
            self.methods['process_spider_output'].insert(0, mw.process_spider_output)
        if hasattr(mw, 'process_spider_exception'):
            self.methods['process_spider_exception'].insert(0, mw.process_spider_exception)
        if hasattr(mw, 'process_start_requests'):
            self.methods['process_start_requests'].insert(0, mw.process_start_requests)

    def scrape_response(self, scrape_func, response, request, spider):
        fname = lambda f: '%s.%s' % (
            six.get_method_self(f).__class__.__name__,
            six.get_method_function(f).__name__)

        def process_spider_input(response):
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    assert result is None, \
                        'Middleware %s must returns None or ' \
                        'raise an exception, got %s ' \
                        % (fname(method), type(result))
                except:
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_spider_exception']:
                result = method(response=response, exception=exception, spider=spider)
                assert result is None or _isiterable(result), \
                    'Middleware %s must returns None, or an iterable object, got %s ' % \
                    (fname(method), type(result))
                if result is not None:
                    return result
            return _failure

        def process_spider_output(result):
            for method in self.methods['process_spider_output']:
                result = method(response=response, result=result, spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must returns an iterable object, got %s ' % \
                    (fname(method), type(result))
            return result

        dfd = mustbe_deferred(process_spider_input, response)
        dfd.addErrback(process_spider_exception)
        dfd.addCallback(process_spider_output)
        return dfd

    def process_start_requests(self, start_requests, spider):
        return self._process_chain('process_start_requests', start_requests, spider)
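Any object exposing some of those four hook names gets wired in by _add_middleware. As a hedged sketch (a made-up middleware class, not part of Scrapy), a spider middleware that drops dict items missing a 'title' field and is enabled via the SPIDER_MIDDLEWARES setting might look like this:

class RequireTitleMiddleware(object):
    # hypothetical middleware: only process_spider_output is implemented,
    # so _add_middleware registers just that one hook

    def process_spider_output(self, response, result, spider):
        # must return an iterable of Requests/items (see the assert in scrape_response)
        for entry in result:
            if isinstance(entry, dict) and not entry.get('title'):
                spider.logger.debug('dropping item without title from %s', response.url)
                continue
            yield entry

# settings.py (hypothetical project):
# SPIDER_MIDDLEWARES = {
#     'myproject.middlewares.RequireTitleMiddleware': 543,
# }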
def handle_spider_output(self, result, request, response, spider):
    if not result:
        return defer_succeed(None)
    it = iter_errback(result, self.handle_spider_error, request, response, spider)
    dfd = parallel(it, self.concurrent_items,
                   self._process_spidermw_output, request, response, spider)
    return dfd

def _process_spidermw_output(self, output, request, response, spider):
    """Process each Request/Item (given in the output parameter) returned
    from the given spider
    """
    if isinstance(output, Request):  # a new Request goes back to the scheduler: keep crawling
        self.crawler.engine.crawl(request=output, spider=spider)
    elif isinstance(output, (BaseItem, dict)):  # an item is handed to the item pipeline for storage
        self.slot.itemproc_size += 1
        dfd = self.itemproc.process_item(output, spider)
        dfd.addBoth(self._itemproc_finished, output, response, spider)
        return dfd
    elif output is None:
        pass
    else:
        typename = type(output).__name__
        logger.error('Spider must return Request, BaseItem, dict or None, '
                     'got %(typename)r in %(request)s',
                     {'request': request, 'typename': typename},
                     extra={'spider': spider})
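When the output is an item, itemproc.process_item hands it through the item pipelines; each pipeline implements process_item(item, spider) and either returns the (possibly modified) item or raises DropItem, and the outcome surfaces back in _itemproc_finished. A minimal, hypothetical pipeline sketch (names and field are made up) that the dispatcher above would feed:

from scrapy.exceptions import DropItem

class CleanTitlePipeline(object):
    # hypothetical pipeline: normalizes a 'title' field and drops empty items

    def process_item(self, item, spider):
        title = item.get('title')
        if not title:
            raise DropItem('missing title')   # reported as a dropped item
        item['title'] = title.strip()
        return item                           # passed on to the next pipeline

# settings.py (hypothetical project):
# ITEM_PIPELINES = {
#     'myproject.pipelines.CleanTitlePipeline': 300,
# }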
The engine's crawl method:
def crawl(self, request, spider):
    assert spider in self.open_spiders, \
        "Spider %r not opened when crawling: %s" % (spider.name, request)
    self.schedule(request, spider)
    self.slot.nextcall.schedule()

def schedule(self, request, spider):
    self.signals.send_catch_log(signal=signals.request_scheduled,
                                request=request, spider=spider)
    if not self.slot.scheduler.enqueue_request(request):
        self.signals.send_catch_log(signal=signals.request_dropped,
                                    request=request, spider=spider)
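schedule() also broadcasts request_scheduled, and request_dropped when the scheduler rejects a request (for example, a duplicate filtered out by the dupefilter). Extensions can hook these signals; here is a hedged sketch of a tiny extension (hypothetical name and project path) counting dropped requests:

from scrapy import signals

class DroppedRequestCounter(object):
    # hypothetical extension: counts requests rejected by the scheduler

    def __init__(self):
        self.dropped = 0

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.on_request_dropped, signal=signals.request_dropped)
        return ext

    def on_request_dropped(self, request, spider):
        self.dropped += 1
        spider.logger.info('dropped %d requests so far (last: %s)', self.dropped, request.url)

# settings.py (hypothetical project):
# EXTENSIONS = {'myproject.extensions.DroppedRequestCounter': 500}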
Finally, slot.nextcall.schedule() comes down to reactor.callLater(delay, self): a Twisted timer later fires the deferred object's callback and so triggers the next engine pass.
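Scrapy wraps that pattern in a small helper (CallLaterOnce in scrapy/utils/reactor.py). The following is a simplified sketch of the idea, assuming only the behaviour visible above (schedule() arms a timer at most once, and when the timer fires it invokes the wrapped function); the class name here is made up and this is not the actual source:

from twisted.internet import reactor

class CallLaterOnceSketch(object):
    # simplified sketch of the "call later, at most once" idea

    def __init__(self, func, *args, **kwargs):
        self._func = func
        self._args = args
        self._kwargs = kwargs
        self._call = None          # pending delayed call, if any

    def schedule(self, delay=0):
        # arm a timer only if one is not already pending
        if self._call is None:
            self._call = reactor.callLater(delay, self)

    def __call__(self):
        # the timer fires: clear the pending marker and run the wrapped function
        self._call = None
        return self._func(*self._args, **self._kwargs)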
#=====>part1:数字类型#掌握:int,float#了解:Long(在python2中才有),complex# num=10# num=int(10)# print(type(num) ...