# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class DownloadtestSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects. @classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider. # Should return None or raise an exception.
return None def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response. # Must return an iterable of Request, dict or Item objects.
for i in result:
yield i def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # Should return either None or an iterable of Response, dict
# or Item objects.
pass def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated. # Must return only requests (not items).
for r in start_requests:
yield r def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name) class DownloadtestDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects. @classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware. # Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None def process_response(self, request, response, spider):
# Called with the response returned from the downloader. # Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception. # Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name) import random #设置请求头
class RandomUA():
def __init__(self):
self.user_agent=[
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'
] def process_request(self, request, spider):
request.headers['User-Agent'] = random.choice(self.user_agent) def process_response(self, request, response, spider):
response.status=201
return response #设置代理
class ProxyMiddleware:
proxy_list=[
"http://127.0.0.1:8080"
# "http://183.91.33.41:80",
# "http://111.29.3.190:80",
# "http://124.156.108.71:82"
] def process_request(self, request, spider):
ip=random.choice(self.proxy_list)
request.meta['proxy']=ip

Scrapy框架: middlewares.py设置的更多相关文章

  1. Scrapy框架: settings.py设置

    # -*- coding: utf-8 -*- # Scrapy settings for maitian project # # For simplicity, this file contains ...

  2. Scrapy框架: pipelines.py设置

    保存数据到json文件 # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your p ...

  3. Scrapy框架之高级 转

    一.CrawlSpider模板 创建项目 scrapy startproject 项目名称 查看模板 scrapy genspider -l 创建crawl模板 scrapy genspider -t ...

  4. 基于scrapy框架的爬虫

    Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 可以应用在包括数据挖掘,信息处理或存储历史数据等一系列的程序中. scrapy 框架 高性能的网络请求 高性能的数据解析 高性能的 ...

  5. scrapy框架的中间件

    中间件的使用 作用:拦截所有的请求和响应 拦截请求:process_request拦截正常的请求,process_exception拦截异常的请求 篡改请求的头信息 def process_reque ...

  6. scrapy框架设置代理

    网易音乐在单ip请求下经常会遇到网页返回码503的情况经查询,503为单个ip请求流量超限,猜测是网易音乐的一种反扒方式因原音乐下载程序采用scrapy框架,所以需要在scrapy中通过代理的方式去解 ...

  7. scrapy框架设置代理ip,headers头和cookies

    [设置代理ip] 根据最新的scrapy官方文档,scrapy爬虫框架的代理配置有以下两种方法: 一.使用中间件DownloaderMiddleware进行配置使用Scrapy默认方法scrapy s ...

  8. 网络爬虫之scrapy框架设置代理

    前戏 os.environ()简介 os.environ()可以获取到当前进程的环境变量,注意,是当前进程. 如果我们在一个程序中设置了环境变量,另一个程序是无法获取设置的那个变量的. 环境变量是以一 ...

  9. Python爬虫Scrapy框架入门(2)

    本文是跟着大神博客,尝试从网站上爬一堆东西,一堆你懂得的东西 附上原创链接: http://www.cnblogs.com/qiyeboy/p/5428240.html 基本思路是,查看网页元素,填写 ...

随机推荐

  1. FastReport.net 使用 Winform WebForm打印

    delphi用的fastreport比较多 所以.net中也研究一下用法,这个打印控件还是很简单的 只要手动设计一下写少许代码就可以打印了 甚至可以写成通用代码 以后就可以不用写代码 安装demo会同 ...

  2. python小学堂-基础调用

    tem='xiaoq {0}iang'print(tem.capitalize())#首字母大学print(tem.center(30)) #居中显示iprint(tem.endswith(" ...

  3. Java调用DB的存储过程

    2015/12/7 使用数据库存储过程的java代码:   try {            con = (Connection) DBProxy.getConnection(null);       ...

  4. vuex存取数据展示在table里-----第一次实现

    之前也看了vuex的文档,对它的原理只是了解,看代码(仅自己复习.做笔记) 流程是在组件的created中提交dispatch,然后通过action调用一个封装好的axios然后再触发mutation ...

  5. 基于QRcode的带有文字+图片的二维码的Vue组件

    1 <template> 2 <!-- 生成二维码开放接口: 3 二维码内容[通常为url] 4 二维码大小[限制为正方形] 二维码下方显示:文字 5 二维码中间显示:图片--> ...

  6. 关于C++中的非静态类成员函数指针

    昨天发现了一个问题,就是使用对类中的非静态成员函数使用std::bind时,不能像普通函数一样直接传递函数名,而是必须显式地调用&(取地址),于是引申出我们今天的问题:非静态类成员函数指针和普 ...

  7. firefox浏览器强制取消自动更新

    问题:Firefox浏览器,在浏览器的设置中已经设置了取消自动升级,实际退出Firefox浏览器重新启动浏览器后还是会升级到最新版本.影响:Firefox浏览器不同的版本的插件的支持兼容不一样,如果需 ...

  8. python bezier 曲线

    1.手写bezier公式,生成bezier代码, 如果给的点数过多,则会生成一半bezier曲线,剩下的一半就需要进行拼接: import numpy as np import matplotlib. ...

  9. 第07章 JdbcTemplate

    第07章JdbcTemplate 1. 概述 为了使JDBC更加易于使用,Spring在JDBC API上定义了一个抽象层,以此建立一个JDBC存取框架. 作为Spring JDBC框架的核心,JDB ...

  10. ShellListView

    過濾ShellListView顯示的檔案 有關這方面的元件你可以在Win3.中找到相關元件 你可以使用四個元件搭配應該就可以你所需要的功能 DriveComboBox1.FilterComboBox1 ...