Agent = client.Agent

class ScrapyAgent(object):

    _Agent = Agent#为twisted的client.Agent类
_ProxyAgent = ProxyAgent
_TunnelingAgent = TunnelingAgent def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0, fail_on_dataloss=True):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
self._pool = pool
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._txresponse = None def _get_agent(self, request, timeout):#获得代理
bindaddress = request.meta.get('bindaddress') or self._bindAddress
proxy = request.meta.get('proxy')
if proxy:
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
scheme = _parse(request.url)[0]
proxyHost = to_unicode(proxyHost)
omitConnectTunnel = b'noconnect' in proxyParams
if scheme == b'https' and not omitConnectTunnel:
proxyConf = (proxyHost, proxyPort,
request.headers.get(b'Proxy-Authorization', None))
return self._TunnelingAgent(reactor, proxyConf,
contextFactory=self._contextFactory, connectTimeout=timeout,
bindAddress=bindaddress, pool=self._pool)
else:
endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
timeout=timeout, bindAddress=bindaddress)
return self._ProxyAgent(endpoint) return self._Agent(reactor, contextFactory=self._contextFactory,
connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool) def download_request(self, request):
timeout = request.meta.get('download_timeout') or self._connectTimeout
agent = self._get_agent(request, timeout) # request details
url = urldefrag(request.url)[0]
method = to_bytes(request.method)
headers = TxHeaders(request.headers)
if isinstance(agent, self._TunnelingAgent):
headers.removeHeader(b'Proxy-Authorization')
if request.body:
bodyproducer = _RequestBodyProducer(request.body)
elif method == b'POST':
# Setting Content-Length: 0 even for POST requests is not a
# MUST per HTTP RFCs, but it's common behavior, and some
# servers require this, otherwise returning HTTP 411 Length required
#
# RFC 7230#section-3.3.2:
# "a Content-Length header field is normally sent in a POST
# request even when the value is 0 (indicating an empty payload body)."
#
# Twisted < 17 will not add "Content-Length: 0" by itself;
# Twisted >= 17 fixes this;
# Using a producer with an empty-string sends `0` as Content-Length
# for all versions of Twisted.
bodyproducer = _RequestBodyProducer(b'')
else:
bodyproducer = None
start_time = time()
d = agent.request(#调用代理的请求
method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
# set download latency
d.addCallback(self._cb_latency, request, start_time)
# response body is ready to be consumed
d.addCallback(self._cb_bodyready, request)
d.addCallback(self._cb_bodydone, request, url)
# check download timeout
self._timeout_cl = reactor.callLater(timeout, d.cancel)
d.addBoth(self._cb_timeout, req
class Agent(_AgentBase):

    def __init__(self, reactor,
contextFactory=BrowserLikePolicyForHTTPS(),
connectTimeout=None, bindAddress=None,
pool=None): if not IPolicyForHTTPS.providedBy(contextFactory):
warnings.warn(
repr(contextFactory) +
" was passed as the HTTPS policy for an Agent, but it does "
"not provide IPolicyForHTTPS. Since Twisted 14.0, you must "
"pass a provider of IPolicyForHTTPS.",
stacklevel=2, category=DeprecationWarning
)
contextFactory = _DeprecatedToCurrentPolicyForHTTPS(contextFactory)
endpointFactory = _StandardEndpointFactory(
reactor, contextFactory, connectTimeout, bindAddress)
self._init(reactor, endpointFactory, pool) @classmethod
def usingEndpointFactory(cls, reactor, endpointFactory, pool=None):
"""
Create a new L{Agent} that will use the endpoint factory to figure
out how to connect to the server. """
agent = cls.__new__(cls)
agent._init(reactor, endpointFactory, pool)
return agent def _init(self, reactor, endpointFactory, pool): _AgentBase.__init__(self, reactor, pool)
self._endpointFactory = endpointFactory def _getEndpoint(self, uri): return self._endpointFactory.endpointForURI(uri) def request(self, method, uri, headers=None, bodyProducer=None):
"""
Issue a request to the server indicated by the given C{uri}. An existing connection from the connection pool may be used or a new
one may be created. I{HTTP} and I{HTTPS} schemes are supported in C{uri}. @see: L{twisted.web.iweb.IAgent.request}
"""
parsedURI = URI.fromBytes(uri)
try:
endpoint = self._getEndpoint(parsedURI)
except SchemeNotSupported:
return defer.fail(Failure())
key = (parsedURI.scheme, parsedURI.host, parsedURI.port)#key值的计算
return self._requestWithEndpoint(key, endpoint, method, parsedURI,
headers, bodyProducer,
parsedURI.originForm)
#从class HTTPConnectionPool(object)中取得一个连接
def getConnection(self, key, endpoint): # Try to get cached version:
connections = self._connections.get(key)
while connections:
connection = connections.pop(0)
# Cancel timeout:
self._timeouts[connection].cancel()
del self._timeouts[connection]
if connection.state == "QUIESCENT":#该连接为空闲状态
if self.retryAutomatically:
newConnection = lambda: self._newConnection(key, endpoint)
connection = _RetryingHTTP11ClientProtocol(
connection, newConnection)
return defer.succeed(connection)#成功 return self._newConnection(key, endpoint)

scrapy之downloader执行流程的更多相关文章

  1. Scrapy框架的执行流程解析

    这里主要介绍七个大类Command->CrawlerProcess->Crawler->ExecutionEngine->sceduler另外还有两个类:Request和Htt ...

  2. scrapy架构图与执行流程

    概览 本文描述了Scrapy的架构图.数据流动.以及个组件的相互作用 架构图与数据流 上图中各个数字与箭头代表数据的流动方向和流动顺序,具体执行流程如下: 0. Scrapy将会实例化一个Crawle ...

  3. 爬虫--Scrapy之Downloader Middleware

    下载器中间件(Downloader Middleware) 下载器中间件是介于Scrapy的request/response处理的钩子框架. 是用于全局修改Scrapy request和respons ...

  4. Scrapy五大核心组件工作流程

    一.Scrapy五大核心组件工作流程 1.核心组件 # 引擎(Scrapy) 对整个系统的数据流进行处理, 触发事务(框架核心). # 调度器(Scheduler) 用来接受引擎发过来的请求. 由过滤 ...

  5. 步步深入:MySQL架构总览->查询执行流程->SQL解析顺序

    前言: 一直是想知道一条SQL语句是怎么被执行的,它执行的顺序是怎样的,然后查看总结各方资料,就有了下面这一篇博文了. 本文将从MySQL总体架构--->查询执行流程--->语句执行顺序来 ...

  6. 第二天 ci执行流程

    第二天 ci执行流程 welcome 页面 this this->load 单入口框架index.php 两个文件夹 system application定义 定义常亮路径 载入 codeign ...

  7. 轻量级前端MVVM框架avalon - 执行流程2

    接上一章 执行流程1 在这一大堆扫描绑定方法中应该会哪些实现? 首先我们看avalon能帮你做什么? 数据填充,比如表单的一些初始值,切换卡的各个面板的内容({{xxx}},{{xxx|html}}, ...

  8. [Java编程思想-学习笔记]第4章 控制执行流程

    4.1  return 关键字return有两方面的用途:一方面指定一个方法结束时返回一个值:一方面强行在return位置结束整个方法,如下所示: char test(int score) { if ...

  9. ThinkPHP2.2框架执行流程图,ThinkPHP控制器的执行流程

    ThinkPHP2.2框架执行原理.流程图在线手册 ThinkPHP控制器的执行流程 对用户的第一次URL访问 http://<serverIp>/My/index.php/Index/s ...

随机推荐

  1. Flsk-Werkzeug-请求参数获取备忘

    Werkzeug:response,request,routing 获取请求参数:data,form,args,files,cookies,headers,method,url routing:Rul ...

  2. DS图--最小生成树

    题目描述 根据输入创建无向网.分别用Prim算法和Kruskal算法构建最小生成树.(假设:输入数据的最小生成树唯一.) 输入 顶点数n n个顶点 边数m m条边信息,格式为:顶点1 顶点2 权值 P ...

  3. 关于Javascript闭包(Closure)

    闭包(closure)是Javascript语言的一个难点,也是它的特色,很多高级应用都要依靠闭包实现. 一.变量的作用域 要理解闭包,首先必须理解Javascript特殊的变量作用域. 变量的作用域 ...

  4. STL基础--算法(已排序数据的算法,数值算法)

    已排序数据的算法 Binary search, merge, set operations 每个已排序数据算法都有一个同名的更一般的形式 vector vec = {8,9,9,9,45,87,90} ...

  5. bzoj5052: 繁忙的财政官

    求区间内相差最小的两个数的差 分sqrt(n)块,预处理两个数在块内,以及一个数在块内一个数在零散部分的情况,询问时归并排序处理两个数都在零散部分的情况,时间复杂度$O((n+q)\sqrt{n})$ ...

  6. MATLAB在三维坐标中显示图片 并 使得图片部分透明

    要画一个光路图,本来可以用proe,但是鼠标不好用,有些操作也忘了,用MATLAB画了个.下面是用到的图片. 但是三维坐标中显示彩色图片的目标没有搞定,做了个灰度图,然后用仿射程序将彩色图片贴到了二维 ...

  7. ueditor 正在读取目录

    ueditor 版本为1.3.6  项目版本为2.0 引用 <script src="../ueditor/ueditor.config.js" type="tex ...

  8. 如何获取阿里云OSS上每个文件夹的大小

    原文 https://help.aliyun.com/document_detail/88458.html?spm=a2c4g.11186623.2.11.792462b15oU02q OSS文件按照 ...

  9. [UE4][Canvas]用C++代码绘制血条(HealthBar)

    转自:http://aigo.iteye.com/blog/2275110 参考自Epic官方项目StrategyGame 血条效果: StrategyHUD.h StrategyHUD.cpp

  10. 可变卷积Deforable ConvNet 迁移训练自己的数据集 MXNet框架 GPU版

    [引言] 最近在用可变卷积的rfcn 模型迁移训练自己的数据集, MSRA官方使用的MXNet框架 环境搭建及配置:http://www.cnblogs.com/andre-ma/p/8867031. ...