Scrapy 中 Deferred 的回调
def _next_request_from_scheduler(self, spider):
    """Pull one request from the scheduler and push it through the engine.

    Returns ``None`` when the scheduler has nothing to hand out. Otherwise
    returns the deferred obtained from ``self._download`` after the
    output-handling, slot-cleanup and rescheduling steps have been chained
    onto it.
    """
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        return

    d = self._download(request, spider)  # a Deferred object for this download

    # Each addBoth step below is followed by an addErrback that only logs a
    # failure, so the later steps are still chained after a failing one.
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d


def _download(self, request, spider):
    """Register ``request`` on the slot and fetch it through the downloader.

    Returns the deferred produced by ``self.downloader.fetch`` with two
    steps attached: ``_on_success`` (success path only) and ``_on_complete``
    (success and failure).
    """
    slot = self.slot
    slot.add_request(request)

    def _on_success(response):
        # The downloader may hand back either a Response or a Request.
        assert isinstance(response, (Response, Request))
        if isinstance(response, Response):
            response.request = request  # tie request to response received
            logkws = self.logformatter.crawled(request, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            self.signals.send_catch_log(signal=signals.response_received,
                                        response=response, request=request,
                                        spider=spider)
        return response

    def _on_complete(_):
        # Schedule the slot's next call, then pass the result through as is.
        slot.nextcall.schedule()
        return _

    dwld = self.downloader.fetch(request, spider)  # the downloader's fetch
    dwld.addCallbacks(_on_success)
    dwld.addBoth(_on_complete)
    return dwld
在 HTTP11 下载处理器(HTTP11DownloadHandler)中:
class HTTP11DownloadHandler(object):
    """Download handler excerpt: builds a ScrapyAgent for each request."""

    # NOTE(review): _contextFactory, _pool, _default_maxsize,
    # _default_warnsize and _fail_on_dataloss are read here but assigned
    # outside this excerpt.
    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            # Per-spider attributes take precedence over the handler defaults.
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss)
        return agent.download_request(request)


class ScrapyAgent(object):
    """Agent excerpt: issues one HTTP request and wires up its callbacks."""

    def download_request(self, request):
        """Send ``request`` and return the deferred of the agent's request.

        The deferred gets the latency, body-ready and body-done callbacks
        attached, plus a timeout step that runs on success and failure.
        """
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty-string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None

        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout: cancel the deferred after `timeout` seconds
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d


class _ResponseReader(protocol.Protocol):
    """Protocol that buffers a response body and fires ``finished`` at the end.

    ``finished`` is a deferred. On a normal end it is called back with a
    ``(txresponse, body, flags)`` tuple; otherwise it is errbacked with the
    connection-lost reason.
    """

    def __init__(self, finished, txresponse, request, maxsize, warnsize,
                 fail_on_dataloss):
        self._finished = finished
        self._txresponse = txresponse
        self._request = request
        self._bodybuf = BytesIO()
        self._maxsize = maxsize
        self._warnsize = warnsize
        self._fail_on_dataloss = fail_on_dataloss
        self._fail_on_dataloss_warned = False
        self._reached_warnsize = False
        self._bytes_received = 0

    def dataReceived(self, bodyBytes):
        """Append a chunk of body bytes to the buffer and enforce size limits."""
        # This maybe called several times after cancel was called with buffered
        # data.
        if self._finished.called:
            return

        self._bodybuf.write(bodyBytes)
        self._bytes_received += len(bodyBytes)

        if self._maxsize and self._bytes_received > self._maxsize:
            logger.error("Received (%(bytes)s) bytes larger than download "
                         "max size (%(maxsize)s).",
                         {'bytes': self._bytes_received,
                          'maxsize': self._maxsize})
            # Clear buffer earlier to avoid keeping data in memory for a long
            # time.
            self._bodybuf.truncate(0)
            self._finished.cancel()

        if (self._warnsize and self._bytes_received > self._warnsize
                and not self._reached_warnsize):
            # Warn only once per reader.
            self._reached_warnsize = True
            logger.warning("Received more bytes than download "
                           "warn size (%(warnsize)s) in request %(request)s.",
                           {'warnsize': self._warnsize,
                            'request': self._request})

    def connectionLost(self, reason):
        """Fire ``finished`` once the connection is over.

        Called when the connection ends, i.e. the response has arrived.
        """
        if self._finished.called:
            return

        body = self._bodybuf.getvalue()
        if reason.check(ResponseDone):
            # self._finished is a deferred; this fires its callback chain.
            self._finished.callback((self._txresponse, body, None))
            return

        if reason.check(PotentialDataLoss):
            self._finished.callback((self._txresponse, body, ['partial']))
            return

        if (reason.check(ResponseFailed)
                and any(r.check(_DataLoss) for r in reason.value.reasons)):
            if not self._fail_on_dataloss:
                self._finished.callback((self._txresponse, body, ['dataloss']))
                return

            elif not self._fail_on_dataloss_warned:
                # logger.warning replaces the deprecated logger.warn alias.
                logger.warning("Got data loss in %s. If you want to process broken "
                               "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
                               " -- This message won't be shown in further requests",
                               self._txresponse.request.absoluteURI.decode())
                self._fail_on_dataloss_warned = True

        self._finished.errback(reason)
scrapy中deferred的回调的更多相关文章
- scrapy中使用LinkExtractor提取链接
le = LinkExtractor(restrict_css='ul.pager li.next') links = le.extract_links(response) 使用LinkExtra ...
- 通过实例说明在scrapy中 yield的作用
源https://www.jianshu.com/p/7c1a084853d8 开始前的准备工作: 1.MySQL下载:点我2.python MySQL驱动下载:pymysql(pyMySql,直接用 ...
- python的scrapy框架的使用 和xpath的使用 && scrapy中request和response的函数参数 && parse()函数运行机制
这篇博客主要是讲一下scrapy框架的使用,对于糗事百科爬取数据并未去专门处理 最后爬取的数据保存为json格式 一.先说一下pyharm怎么去看一些函数在源码中的代码实现 按着ctrl然后点击函数就 ...
- Objective-C中的Block回调模式
在前面的博客中提到了Block的概念和使用方法,个人感觉Block最爽的用法莫过于在回调时用block.感觉比委托回调和目标方法回调用着要顺手,好不好用还得读者亲自用一下才知道.如果 读者之前用过SS ...
- Android中的接口回调技术
Android中的接口回调技术有很多应用的场景,最常见的:Activity(人机交互的端口)的UI界面中定义了Button,点击该Button时,执行某个逻辑. 下面参见上述执行的模型,讲述James ...
- 如何优雅的处理Nodejs中的异步回调
前言 Nodejs最大的亮点就在于事件驱动, 非阻塞I/O 模型,这使得Nodejs具有很强的并发处理能力,非常适合编写网络应用.在Nodejs中大部分的I/O操作几乎都是异步的,也就是我们处理I/O ...
- js中this和回调方法循环-我们到底能走多远系列(35)
我们到底能走多远系列(35) 扯淡: 13年最后一个月了,你们在13年初的计划实现了吗?还来得及吗? 请加油~ 主题: 最近一直在写js,遇到了几个问题,可能初入门的时候都会遇到吧,总结下. 例子: ...
- Scrapy中使用Django的Model访问数据库
Scrapy中使用Django的Model进行数据库访问 当已存在Django项目的时候,直接引入Django的Model来使用比较简单 # 使用以下语句添加Django项目的目录到path impo ...
- scrapy中的下载器中间件
scrapy中的下载器中间件 下载中间件 下载器中间件是介于Scrapy的request/response处理的钩子框架. 是用于全局修改Scrapy request和response的一个轻量.底层 ...
随机推荐
- STL基础--容器
容器种类 序列容器(数组,链表) Vector, deque, list, forward list, array 关联容器(二叉树),总是有序的 set, multiset根据值排序,元素值不能修改 ...
- KMeans (K均值)算法讲解及实现
算法原理 KMeans算法是典型的基于距离的聚类算法,采用距离作为相似性的评价指标,即认为两个对象的距离越近,其相似度就越大.该算法认为簇是由距离靠近的对象组成的,因此把得到紧凑且独立的簇作为最终目标 ...
- rhel7.0解决:This system is not registered to Red Hat Subscription Management. You can use subscription-manager to register.
看这篇文章前,先说一下我的实际情况.本来要部署docker服务的,然后yum安装任何软件都不起效果,最后通过老师远程的帮助,最后成功安装上docker,老师的解决办法就是忽略这个问题,直接自己配置网络 ...
- centos7.0下增加swap分区大小
承接上篇文章扩容磁盘空间后增加根分区的大小后,来扩容swap分区的空间 检查当前的swap分区情况 # free -m # free -g [root@localhost ~]# free -m to ...
- [UE4]认识Decorator
Decorator装饰器:即为其他行为树系统中的条件语句,附着于一个Composite(组合节点)或者Task(任务节点),并定义树中的一个分支或者单个节点是否可被执行. Decorator装饰器节点 ...
- androidstudio在创建new project时,窗口太大,看不到下面确定按钮的解决方法
点击File-->setting-->Appearance将里面的Override default fonts by(not recommended)打钩去掉. 这个是目前找到唯一办法.
- vue-生存周期
beforeCreate 实例初始化之后 created 实例创建之后 beforeMount 实例挂载前 文本节点 mounted 渲染实例 防止花括 ...
- Delphi 解决Utf8ToAnsi和Utf8DeCode转换编码为空的问题
//delphi DecodeUtf8Str解决系统自带UTF8解码缺陷 function DecodeUtf8Str(const S: UTF8String): WideString; var le ...
- linux中 shell编程 判断服务是否运行
判断nginx是否运行中: if ps -ef|grep "nginx"|egrep -v grep >/dev/null then echo ok! else echo n ...
- ubuntu 16.04在真实机安装后的静态ip的配置
nssa-sensor1@nssa-sensor1:~$ vim /etc/network/interfaces 以下是编辑文件的内容# interfaces(5) file used by ifup ...