针对pipelines的扩展

from scrapy.exceptions import DropItem

class CustomPipeline(object):
def __init__(self,v):
self.value = v def process_item(self, item, spider):
# 操作并进行持久化 # return表示会被后续的pipeline继续处理
return item # 表示将item丢弃,不会被后续pipeline处理
# raise DropItem() @classmethod
def from_crawler(cls, crawler):
"""
初始化时候,用于创建pipeline对象
:param crawler:
:return:
"""
val = crawler.settings.getint('MMMM')
return cls(val) def open_spider(self,spider):
"""
爬虫开始执行时,调用
:param spider:
:return:
"""
print('') def close_spider(self,spider):
"""
爬虫关闭时,被调用
:param spider:
:return:
"""
print('')

针对爬虫中间件的扩展

class SpiderMiddleware(object):

    def process_spider_input(self,response, spider):
"""
下载完成,执行,然后交给parse处理
:param response:
:param spider:
:return:
"""
pass def process_spider_output(self,response, result, spider):
"""
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
"""
return result def process_spider_exception(self,response, exception, spider):
"""
异常调用
:param response:
:param exception:
:param spider:
:return: None,继续交给后续中间件处理异常;含 Response 或 Item 的可迭代对象(iterable),交给调度器或pipeline
"""
return None def process_start_requests(self,start_requests, spider):
"""
爬虫启动时调用
:param start_requests:
:param spider:
:return: 包含 Request 对象的可迭代对象
"""
return start_requests
内置爬虫中间件:
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900, # from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'step8_king.middlewares.SpiderMiddleware': 543,
}

针对下载中间件的扩展

class DownMiddleware1(object):
def process_request(self, request, spider):
"""
请求需要被下载时,经过所有下载器中间件的process_request调用
:param request:
:param spider:
:return:
None,继续后续中间件去下载;
Response对象,停止process_request的执行,开始执行process_response
Request对象,停止中间件的执行,将Request重新调度器
raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
"""
pass def process_response(self, request, response, spider):
"""
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return:
Response 对象:转交给其他中间件process_response
Request 对象:停止中间件,request会被重新调度下载
raise IgnoreRequest 异常:调用Request.errback
"""
print('response1')
return response def process_exception(self, request, exception, spider):
"""
当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
:param response:
:param exception:
:param spider:
:return:
None:继续交给后续中间件处理异常;
Response对象:停止后续process_exception方法
Request对象:停止中间件,request将会被重新调用下载
"""
return None
默认下载中间件
{
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
} # from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'step8_king.middlewares.DownMiddleware1': 100,
# 'step8_king.middlewares.DownMiddleware2': 500,
# }

信号

from scrapy import signals

class MyExtension(object):
def __init__(self, value):
self.value = value @classmethod
def from_crawler(cls, crawler):
val = crawler.settings.getint('MMMM')
ext = cls(val) crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) return ext def spider_opened(self, spider):
print('open') def spider_closed(self, spider):
print('close')

url去重扩展

class RepeatUrl:
def __init__(self):
self.visited_url = set() @classmethod
def from_settings(cls, settings):
"""
初始化时,调用
:param settings:
:return:
"""
return cls() def request_seen(self, request):
"""
检测当前请求是否已经被访问过
:param request:
:return: True表示已经访问过;False表示未访问过
"""
if request.url in self.visited_url:
return True
self.visited_url.add(request.url)
return False def open(self):
"""
开始爬去请求时,调用
:return:
"""
print('open replication') def close(self, reason):
"""
结束爬虫爬取时,调用
:param reason:
:return:
"""
print('close replication') def log(self, request, spider):
"""
记录日志
:param request:
:param spider:
:return:
"""
print('repeat', request.url)

代理扩展

from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware

    方式一:使用默认
os.environ
{
http_proxy:http://root:woshiniba@192.168.11.11:9999/
https_proxy:http://192.168.11.11:9999/
}
方式二:使用自定义下载中间件 def to_bytes(text, encoding=None, errors='strict'):
if isinstance(text, bytes):
return text
if not isinstance(text, six.string_types):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors) class ProxyMiddleware(object):
def process_request(self, request, spider):
PROXIES = [
{'ip_port': '111.11.228.75:80', 'user_pass': ''},
{'ip_port': '120.198.243.22:80', 'user_pass': ''},
{'ip_port': '111.8.60.9:8123', 'user_pass': ''},
{'ip_port': '101.71.27.120:80', 'user_pass': ''},
{'ip_port': '122.96.59.104:80', 'user_pass': ''},
{'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]
proxy = random.choice(PROXIES)
if proxy['user_pass'] is not None:
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
print "**************ProxyMiddleware have pass************" + proxy['ip_port']
else:
print "**************ProxyMiddleware no pass************" + proxy['ip_port']
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port']) DOWNLOADER_MIDDLEWARES = {
'step8_king.middlewares.ProxyMiddleware': 500,
}

Https访问证书扩展

    Https访问时有两种情况:
1. 要爬取网站使用的可信任证书(默认支持)
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory" 2. 要爬取网站使用的自定义证书
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory" # https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate) class MySSLFactory(ScrapyClientContextFactory):
def getCertificateOptions(self):
from OpenSSL import crypto
v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
return CertificateOptions(
privateKey=v1, # pKey对象
certificate=v2, # X509对象
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None))
)
其他:
相关类
scrapy.core.downloader.handlers.http.HttpDownloadHandler
scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
相关配置
DOWNLOADER_HTTPCLIENTFACTORY
DOWNLOADER_CLIENTCONTEXTFACTORY

爬虫之scrapy扩展的更多相关文章

  1. 97、爬虫框架scrapy

    本篇导航: 介绍与安装 命令行工具 项目结构以及爬虫应用简介 Spiders 其它介绍 爬取亚马逊商品信息   一.介绍与安装 Scrapy一个开源和协作的框架,其最初是为了页面抓取 (更确切来说, ...

  2. 爬虫框架 Scrapy

    一 介绍 crapy一个开源和协作的框架,其最初是为了页面抓取 (更确切来说, 网络抓取 )所设计的,使用它可以以快速.简单.可扩展的方式从网站中提取所需的数据.但目前Scrapy的用途十分广泛,可用 ...

  3. 初识python爬虫框架Scrapy

    Scrapy,按照其官网(https://scrapy.org/)上的解释:一个开源和协作式的框架,用快速.简单.可扩展的方式从网站提取所需的数据. 我们一开始上手爬虫的时候,接触的是urllib.r ...

  4. Python 爬虫七 Scrapy

    Scrapy Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 其可以应用在数据挖掘,信息处理或存储历史数据等一系列的程序中.其最初是为了页面抓取 (更确切来说, 网络抓取 )所设 ...

  5. 5、爬虫之scrapy框架

    一 scrapy框架简介 1 介绍 Scrapy一个开源和协作的框架,其最初是为了页面抓取 (更确切来说, 网络抓取 )所设计的,使用它可以以快速.简单.可扩展的方式从网站中提取所需的数据.但目前Sc ...

  6. 05 爬虫之scrapy

    一 scrapy框架简介 01 什么是scrapy: Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架,非常出名,非常强悍.所谓的框架就是一个已经被集成了各种功能(高性能异步下载,队 ...

  7. 第三百二十四节,web爬虫,scrapy模块介绍与使用

    第三百二十四节,web爬虫,scrapy模块介绍与使用 Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 其可以应用在数据挖掘,信息处理或存储历史数据等一系列的程序中.其最初是为了 ...

  8. 爬虫之Scrapy详解

    性能相关 在编写爬虫时,性能的消耗主要在IO请求中,当单进程单线程模式下请求URL时必然会引起等待,从而使得请求整体变慢. import requests def fetch_async(url): ...

  9. 爬虫必备—Scrapy

    一.Scrapy简介 Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 其可以应用在数据挖掘,信息处理或存储历史数据等一系列的程序中.其最初是为了页面抓取 (更确切来说, 网络抓取 ...

随机推荐

  1. Ansa 自动加载用户脚本设置

    1.在安装路径(×××\AppData\Local\Apps\BETA_CAE_Systems\ansa_v16.0.0\config)找到user_ANSA_TRANSL.py: 打开文本可以发现里 ...

  2. Oracle数据库容量使用情况调查

    -- 剩余容量 select sum(bytes) FREE from DBA_FREE_SPACE where tablespace_name ='xxx'; -- 总容量 select sum(b ...

  3. [蓝桥杯]PREV-27.历届试题_蚂蚁感冒

    问题描述 长100厘米的细长直杆子上有n只蚂蚁.它们的头有的朝左,有的朝右. 每只蚂蚁都只能沿着杆子向前爬,速度是1厘米/秒. 当两只蚂蚁碰面时,它们会同时掉头往相反的方向爬行. 这些蚂蚁中,有1只蚂 ...

  4. transform、transition、animation 区别

    Transform 在部分的test case当中,每每演示transform属性的,看起来好像都是带动画.这使得小部分直觉化思维的人(包括我)认为transform属性是动画属性.而恰恰相反,tra ...

  5. sql 存储过程和触发器

    mysql----------------------------------------------------------------------------------------------- ...

  6. windows驱动开发前导知识

    从以下整理得到 https://blog.csdn.net/suxinpingtao51/article/details/8610528 http://www.cnblogs.com/bugcheck ...

  7. Java笔记Spring(九)

    完整调试springmvc源码 WebApplicationContext = new XmlWebApplicationContext();// XmlWebApplicationContext通过 ...

  8. Access Token 与 Refresh Token【转载哒科普啊】

    Access Token 与 Refresh Token   access token 是客户端访问资源服务器的令牌.拥有这个令牌代表着得到用户的授权.然而,这个授权应该是临时的,有一定有效期.这是因 ...

  9. 关于std::thread

    std::thread基本用法 1.普通函数: std::thread thread(func, param, ...) 2.类成员函数: std::thread thread(&class_ ...

  10. 【C语言基础】循环体系

    1.For循环结构: For循环的一般形式为: for (表达式1 初始化:判断条件:自增自减) { 语句块 } 2.while循环结构: while循环的一般的形式为: 表达式1 初始化 while ...