def _next_request_from_scheduler(self, spider):
    """Pop the next request from the scheduler and kick off its download.

    Returns the download deferred, or None when the scheduler is empty.
    """
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        # Nothing queued right now; the engine will be rescheduled later.
        return

    def _log_failure(message):
        # Build an errback that logs *message* together with the failure,
        # tagged with the spider for log routing.
        return lambda f: logger.info(
            message,
            exc_info=failure_to_exc_info(f),
            extra={'spider': spider},
        )

    # _download returns a deferred; chain the post-download steps onto it.
    dfd = self._download(request, spider)
    dfd.addBoth(self._handle_downloader_output, request, spider)
    dfd.addErrback(_log_failure('Error while handling downloader output'))
    dfd.addBoth(lambda _: slot.remove_request(request))
    dfd.addErrback(_log_failure('Error while removing request from slot'))
    dfd.addBoth(lambda _: slot.nextcall.schedule())
    dfd.addErrback(_log_failure('Error while scheduling new request'))
    return dfd
  20.  
  21. def _download(self, request, spider):
  22. slot = self.slot
  23. slot.add_request(request)
  24. def _on_success(response):
  25. assert isinstance(response, (Response, Request))
  26. if isinstance(response, Response):
  27. response.request = request # tie request to response received
  28. logkws = self.logformatter.crawled(request, response, spider)
  29. logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
  30. self.signals.send_catch_log(signal=signals.response_received, \
  31. response=response, request=request, spider=spider)
  32. return response
  33.  
  34. def _on_complete(_):
  35. slot.nextcall.schedule()
  36. return _
  37.  
  38. dwld = self.downloader.fetch(request, spider)#下载器fetch
  39. dwld.addCallbacks(_on_success)
  40. dwld.addBoth(_on_complete)
  41. return dwld
在 HTTP11 处理器（HTTP11DownloadHandler）中：
  43. class HTTP11DownloadHandler(object):
  44.  
  45. def download_request(self, request, spider):
  46. """Return a deferred for the HTTP download"""
  47. agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
  48. maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
  49. warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
  50. fail_on_dataloss=self._fail_on_dataloss)
  51. return agent.download_request(request)
  52.  
  53. class ScrapyAgent(object):
  54.  
  55. def download_request(self, request):
  56. timeout = request.meta.get('download_timeout') or self._connectTimeout
  57. agent = self._get_agent(request, timeout)
  58.  
  59. # request details
  60. url = urldefrag(request.url)[0]
  61. method = to_bytes(request.method)
  62. headers = TxHeaders(request.headers)
  63. if isinstance(agent, self._TunnelingAgent):
  64. headers.removeHeader(b'Proxy-Authorization')
  65. if request.body:
  66. bodyproducer = _RequestBodyProducer(request.body)
  67. elif method == b'POST':
  68. # Setting Content-Length: 0 even for POST requests is not a
  69. # MUST per HTTP RFCs, but it's common behavior, and some
  70. # servers require this, otherwise returning HTTP 411 Length required
  71. #
  72. # RFC 7230#section-3.3.2:
  73. # "a Content-Length header field is normally sent in a POST
  74. # request even when the value is 0 (indicating an empty payload body)."
  75. #
  76. # Twisted < 17 will not add "Content-Length: 0" by itself;
  77. # Twisted >= 17 fixes this;
  78. # Using a producer with an empty-string sends `0` as Content-Length
  79. # for all versions of Twisted.
  80. bodyproducer = _RequestBodyProducer(b'')
  81. else:
  82. bodyproducer = None
  83. start_time = time()
  84. d = agent.request(
  85. method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
  86. # set download latency
  87. d.addCallback(self._cb_latency, request, start_time)
  88. # response body is ready to be consumed
  89. d.addCallback(self._cb_bodyready, request)
  90. d.addCallback(self._cb_bodydone, request, url)
  91. # check download timeout
  92. self._timeout_cl = reactor.callLater(timeout, d.cancel)
  93. d.addBoth(self._cb_timeout, request, url, timeout)
  94. return d
  95.  
  96. class _ResponseReader(protocol.Protocol):
  97.  
  98. def __init__(self, finished, txresponse, request, maxsize, warnsize,
  99. fail_on_dataloss):
  100. self._finished = finished
  101. self._txresponse = txresponse
  102. self._request = request
  103. self._bodybuf = BytesIO()
  104. self._maxsize = maxsize
  105. self._warnsize = warnsize
  106. self._fail_on_dataloss = fail_on_dataloss
  107. self._fail_on_dataloss_warned = False
  108. self._reached_warnsize = False
  109. self._bytes_received = 0
  110.  
  111. def dataReceived(self, bodyBytes):#读取数据,放到缓冲
  112. # This maybe called several times after cancel was called with buffered
  113. # data.
  114. if self._finished.called:
  115. return
  116.  
  117. self._bodybuf.write(bodyBytes)
  118. self._bytes_received += len(bodyBytes)
  119.  
  120. if self._maxsize and self._bytes_received > self._maxsize:
  121. logger.error("Received (%(bytes)s) bytes larger than download "
  122. "max size (%(maxsize)s).",
  123. {'bytes': self._bytes_received,
  124. 'maxsize': self._maxsize})
  125. # Clear buffer earlier to avoid keeping data in memory for a long
  126. # time.
  127. self._bodybuf.truncate(0)
  128. self._finished.cancel()
  129.  
  130. if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
  131. self._reached_warnsize = True
  132. logger.warning("Received more bytes than download "
  133. "warn size (%(warnsize)s) in request %(request)s.",
  134. {'warnsize': self._warnsize,
  135. 'request': self._request})
  136.  
  137. def connectionLost(self, reason):#连接完成后调用,也即响应已经到达。
  138. if self._finished.called:
  139. return
  140.  
  141. body = self._bodybuf.getvalue()
  142. if reason.check(ResponseDone):#self._finisheddeferred对象
  143. self._finished.callback((self._txresponse, body, None))#回调了,
  144. return
  145.  
  146. if reason.check(PotentialDataLoss):
  147. self._finished.callback((self._txresponse, body, ['partial']))
  148. return
  149.  
  150. if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
  151. if not self._fail_on_dataloss:
  152. self._finished.callback((self._txresponse, body, ['dataloss']))
  153. return
  154.  
  155. elif not self._fail_on_dataloss_warned:
  156. logger.warn("Got data loss in %s. If you want to process broken "
  157. "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
  158. " -- This message won't be shown in further requests",
  159. self._txresponse.request.absoluteURI.decode())
  160. self._fail_on_dataloss_warned = True
  161.  
  162. self._finished.errback(reason)

scrapy中deferred的回调的更多相关文章

  1. scrapy中使用LinkExtractor提取链接

    le = LinkExtractor(restrict_css='ul.pager li.next') links = le.extract_links(response)   使用LinkExtra ...

  2. 通过实例说明在scrapy中 yield的作用

    源https://www.jianshu.com/p/7c1a084853d8 开始前的准备工作: 1.MySQL下载:点我2.python MySQL驱动下载:pymysql(pyMySql,直接用 ...

  3. python的scrapy框架的使用 和xpath的使用 && scrapy中request和response的函数参数 && parse()函数运行机制

    这篇博客主要是讲一下scrapy框架的使用,对于糗事百科爬取数据并未去专门处理 最后爬取的数据保存为json格式 一.先说一下pyharm怎么去看一些函数在源码中的代码实现 按着ctrl然后点击函数就 ...

  4. Objective-C中的Block回调模式

    在前面的博客中提到了Block的概念和使用方法,个人感觉Block最爽的用法莫过于在回调时用block.感觉比委托回调和目标方法回调用着要顺手,好不好用还得读者亲自用一下才知道.如果 读者之前用过SS ...

  5. Android中的接口回调技术

    Android中的接口回调技术有很多应用的场景,最常见的:Activity(人机交互的端口)的UI界面中定义了Button,点击该Button时,执行某个逻辑. 下面参见上述执行的模型,讲述James ...

  6. 如何优雅的处理Nodejs中的异步回调

    前言 Nodejs最大的亮点就在于事件驱动, 非阻塞I/O 模型,这使得Nodejs具有很强的并发处理能力,非常适合编写网络应用.在Nodejs中大部分的I/O操作几乎都是异步的,也就是我们处理I/O ...

  7. js中this和回调方法循环-我们到底能走多远系列(35)

    我们到底能走多远系列(35) 扯淡: 13年最后一个月了,你们在13年初的计划实现了吗?还来得及吗? 请加油~ 主题: 最近一直在写js,遇到了几个问题,可能初入门的时候都会遇到吧,总结下. 例子: ...

  8. Scrapy中使用Django的Model访问数据库

    Scrapy中使用Django的Model进行数据库访问 当已存在Django项目的时候,直接引入Django的Model来使用比较简单 # 使用以下语句添加Django项目的目录到path impo ...

  9. scrapy中的下载器中间件

    scrapy中的下载器中间件 下载中间件 下载器中间件是介于Scrapy的request/response处理的钩子框架. 是用于全局修改Scrapy request和response的一个轻量.底层 ...

随机推荐

  1. vue中滚动事件绑定的函数无法调用问题

    问题描述: 一个包含下拉加载的页面,刷新当前页然后滚动页面,能够正常触发滚动事件并调用回调函数,但是如果是进了某一个页面然后再进的该页面,滚动事件能够触发, 但是回调函数在滚动的时候只能被调用一次. ...

  2. IDC:电源系统

    ylbtech-IDC:电源系统 电源系统(Power System)是由整流设备.直流配电设备.蓄电池组.直流变换器.机架电源设备等和相关的配电线路组成的总体.电源系统为各种电机提供各种高.低频交. ...

  3. 认识hasLayout——IE浏览器css bug的一大罪恶根源

     原文地址:http://neverned.blog.163.com/blog/static/1265524200933021130561/   什么是hasLayout?hasLayout是IE特有 ...

  4. PHP 多态理解

    PHP 多态   多态性是指相同的操作或函数.过程可作用于多种类型的对象上并获得不同的结果.不同的对象,收到同一消息将可以产生不同的结果,这种现象称为多态性. 多态性允许每个对象以适合自身的方式去响应 ...

  5. 集合--(List、Set、Map)遍历、删除、比较元素时的小陷阱

    6,Map集合遍历的4中方法? 5,List遍历时如何remove元素 4.漏网之鱼-for循环递增下标方式遍历集合,并删除元素 如果你用for循环递增下标方式遍历集合,在遍历过程中删除元素,你可能会 ...

  6. python面向对象 : 反射和内置方法

    一. 反射 1. isinstance()和issubclass() isinstance( 对象名, 类名) : 判断对象所属关系,包括父类  (注:type(对象名) is 类名 : 判断对象所属 ...

  7. [UE4]控件模板参数

    创建的时候就会变成这样了.

  8. HBase常用操作命令

    HBase常用操作命令 1.进入HBase脚本客户端 #hbase shell #进入HBase脚本客户端 > whoami    #查看当前登录用户 > status           ...

  9. sso CAS

    sso:single sign on,在多个应用系统中,用户只需要登陆一次就可以访问所有相互信任的应用系统 CAS框架:Central Authentication Service是实现sso单点登录 ...

  10. C# webbrowser全掌握(二)

    全篇引用单元mshtml; 路径:C:\windows\assembly\GAC\Microsoft.mshtml\7.0.3300.0__b03f5f7f11d50a3a\Microsoft.msh ...