Deferred callbacks in Scrapy
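This post walks through how Scrapy wires Twisted Deferreds from the engine down to the HTTP transport. As a baseline, here is a minimal, self-contained sketch of Deferred callback semantics (plain Twisted, nothing Scrapy-specific):

from twisted.internet import defer

def success_path():
    d = defer.Deferred()
    d.addCallback(lambda result: result + 1)        # runs only on success
    d.addErrback(lambda failure: 'handled')         # skipped on success
    d.addBoth(lambda result: print('got', result))  # runs either way
    d.callback(1)                                   # prints: got 2

def failure_path():
    d = defer.Deferred()
    d.addCallback(lambda result: result + 1)        # skipped on failure
    d.addErrback(lambda failure: 'handled')         # converts the Failure into a result
    d.addBoth(lambda result: print('got', result))  # prints: got handled
    d.errback(RuntimeError('boom'))

success_path()
failure_path()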
In the engine, `_next_request_from_scheduler` pulls the next request from the scheduler and chains the handling steps onto the Deferred returned by `_download`:

def _next_request_from_scheduler(self, spider):
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        return
    d = self._download(request, spider)  # returns a Deferred
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d
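The interleaving above is deliberate: an errback only traps failures raised by steps added before it, and by returning None it turns the failure back into a (logged) success, so the later addBoth steps — removing the request, scheduling the next call — still run. A toy illustration of that scoping, with invented step names:

from twisted.internet import defer

def handle_output(_):
    raise RuntimeError('downloader output failed')

d = defer.Deferred()
d.addBoth(handle_output)                                       # step 1 fails
d.addErrback(lambda f: print('logged:', f.getErrorMessage()))  # traps step 1's failure, returns None
d.addBoth(lambda _: print('still removing request'))           # step 2 runs anyway
d.addErrback(lambda f: print('not reached on this path'))
d.callback(None)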
`_download` registers the request with the slot and attaches its own pair of callbacks before handing the Deferred back:

def _download(self, request, spider):
    slot = self.slot
    slot.add_request(request)

    def _on_success(response):
        assert isinstance(response, (Response, Request))
        if isinstance(response, Response):
            response.request = request  # tie request to response received
            logkws = self.logformatter.crawled(request, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            self.signals.send_catch_log(signal=signals.response_received,
                                        response=response, request=request, spider=spider)
        return response

    def _on_complete(_):
        slot.nextcall.schedule()
        return _

    dwld = self.downloader.fetch(request, spider)  # the downloader's fetch returns a Deferred
    dwld.addCallbacks(_on_success)
    dwld.addBoth(_on_complete)
    return dwld
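Note that `dwld.addCallbacks(_on_success)` attaches only a callback, no errback: a failed fetch bypasses `_on_success` entirely, while the `addBoth(_on_complete)` step runs on success and failure alike, so the engine loop is rescheduled either way and the result (or Failure) passes through untouched. A toy version with invented names:

from twisted.internet import defer

def _on_success(response):
    return response  # only reached on success

def _on_complete(result):
    print('always runs; result or Failure passes through')
    return result

d = defer.Deferred()
d.addCallbacks(_on_success)  # no errback: failures skip this step
d.addBoth(_on_complete)      # runs for both outcomes
d.addErrback(lambda f: print('failure reached the end:', f.getErrorMessage()))
d.errback(RuntimeError('fetch failed'))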
In the HTTP/1.1 download handler:
class HTTP11DownloadHandler(object):

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
                            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
                            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
                            fail_on_dataloss=self._fail_on_dataloss)
        return agent.download_request(request)
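The `getattr` calls let an individual spider override the project-wide size limits just by defining attributes — the documented per-spider equivalents of the DOWNLOAD_MAXSIZE / DOWNLOAD_WARNSIZE settings (the values below are arbitrary):

import scrapy

class BigFileSpider(scrapy.Spider):
    name = 'bigfile'
    download_maxsize = 100 * 1024 * 1024  # per-spider override of DOWNLOAD_MAXSIZE
    download_warnsize = 32 * 1024 * 1024  # per-spider override of DOWNLOAD_WARNSIZE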
class ScrapyAgent(object):

    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length Required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
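Two details worth noting. First, `_RequestBodyProducer` is not shown here; it is essentially a minimal `IBodyProducer` with a known length that writes the whole body to the consumer at once — which is also why passing `b''` makes Twisted emit `Content-Length: 0`. A sketch close to (but not guaranteed identical to) Scrapy's implementation:

from twisted.internet import defer
from twisted.web.iweb import IBodyProducer
from zope.interface import implementer

@implementer(IBodyProducer)
class _RequestBodyProducer(object):
    def __init__(self, body):
        self.body = body
        self.length = len(body)  # a known length lets Twisted set Content-Length

    def startProducing(self, consumer):
        consumer.write(self.body)  # deliver the whole body in one write
        return defer.succeed(None)

    def pauseProducing(self):
        pass

    def stopProducing(self):
        pass

Second, the timeout: `reactor.callLater(timeout, d.cancel)` schedules a cancellation of the Deferred itself, which fires the errback chain with a CancelledError if the download is still pending; the `addBoth(self._cb_timeout, ...)` added right after is where that outcome is inspected.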
Deeper in the chain, `_cb_bodyready` streams the response body through a `_ResponseReader` protocol; the `finished` Deferred it holds is fired once the body is fully read, which lets the rest of the callback chain proceed:

class _ResponseReader(protocol.Protocol):

    def __init__(self, finished, txresponse, request, maxsize, warnsize,
                 fail_on_dataloss):
        self._finished = finished
        self._txresponse = txresponse
        self._request = request
        self._bodybuf = BytesIO()
        self._maxsize = maxsize
        self._warnsize = warnsize
        self._fail_on_dataloss = fail_on_dataloss
        self._fail_on_dataloss_warned = False
        self._reached_warnsize = False
        self._bytes_received = 0

    def dataReceived(self, bodyBytes):  # buffer each chunk of body data as it arrives
        # This may be called several times after cancel was called with buffered data.
        if self._finished.called:
            return

        self._bodybuf.write(bodyBytes)
        self._bytes_received += len(bodyBytes)

        if self._maxsize and self._bytes_received > self._maxsize:
            logger.error("Received (%(bytes)s) bytes larger than download "
                         "max size (%(maxsize)s).",
                         {'bytes': self._bytes_received,
                          'maxsize': self._maxsize})
            # Clear buffer earlier to avoid keeping data in memory for a long time.
            self._bodybuf.truncate(0)
            self._finished.cancel()

        if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
            self._reached_warnsize = True
            logger.warning("Received more bytes than download "
                           "warn size (%(warnsize)s) in request %(request)s.",
                           {'warnsize': self._warnsize,
                            'request': self._request})

    def connectionLost(self, reason):  # called when the connection closes, i.e. the response has fully arrived
        if self._finished.called:
            return

        body = self._bodybuf.getvalue()
        if reason.check(ResponseDone):  # self._finished is the Deferred returned to the caller
            self._finished.callback((self._txresponse, body, None))  # fire the callback chain
            return

        if reason.check(PotentialDataLoss):
            self._finished.callback((self._txresponse, body, ['partial']))
            return

        if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
            if not self._fail_on_dataloss:
                self._finished.callback((self._txresponse, body, ['dataloss']))
                return

            elif not self._fail_on_dataloss_warned:
                logger.warning("Got data loss in %s. If you want to process broken "
                               "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
                               " -- This message won't be shown in further requests",
                               self._txresponse.request.absoluteURI.decode())
                self._fail_on_dataloss_warned = True

        self._finished.errback(reason)
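Putting it together: the `finished` Deferred is created when the body starts streaming, handed back up through `download_request`, decorated with callbacks by the engine, and finally fired here in `connectionLost` once the response is complete. A self-contained toy of that hand-off (all names invented for illustration):

from twisted.internet import defer

class ToyReader(object):
    """Stands in for _ResponseReader: buffers chunks, fires a Deferred at the end."""

    def __init__(self, finished):
        self._finished = finished
        self._chunks = []

    def dataReceived(self, data):
        self._chunks.append(data)

    def connectionLost(self):
        # the equivalent of self._finished.callback((txresponse, body, None))
        self._finished.callback(b''.join(self._chunks))

finished = defer.Deferred()                                          # created by the download handler
finished.addCallback(lambda body: print('engine got body:', body))   # added by "the engine"

reader = ToyReader(finished)
reader.dataReceived(b'<html>')   # chunks arrive over the wire
reader.dataReceived(b'</html>')
reader.connectionLost()          # response done: fire the chain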