Scrapy框架: middlewares.py设置

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware

#

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

class DownloadtestSpiderMiddleware(object):

    # Not all methods need to be defined. If a method is not defined,

    # scrapy acts as if the spider middleware does not modify the

    # passed objects.

    @classmethod

    def from_crawler(cls, crawler):

        # This method is used by Scrapy to create your spiders.

        s = cls()

        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        return s

    def process_spider_input(self, response, spider):

        # Called for each response that goes through the spider

        # middleware and into the spider.

        # Should return None or raise an exception.

        return None

    def process_spider_output(self, response, result, spider):

        # Called with the results returned from the Spider, after

        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.

        for i in result:

            yield i

    def process_spider_exception(self, response, exception, spider):

        # Called when a spider or process_spider_input() method

        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict

        # or Item objects.

        pass

    def process_start_requests(self, start_requests, spider):

        # Called with the start requests of the spider, and works

        # similarly to the process_spider_output() method, except

        # that it doesn’t have a response associated.

        # Must return only requests (not items).

        for r in start_requests:

            yield r

    def spider_opened(self, spider):

        spider.logger.info('Spider opened: %s' % spider.name)

class DownloadtestDownloaderMiddleware(object):

    # Not all methods need to be defined. If a method is not defined,

    # scrapy acts as if the downloader middleware does not modify the

    # passed objects.

    @classmethod

    def from_crawler(cls, crawler):

        # This method is used by Scrapy to create your spiders.

        s = cls()

        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        return s

    def process_request(self, request, spider):

        # Called for each request that goes through the downloader

        # middleware.

        # Must either:

        # - return None: continue processing this request

        # - or return a Response object

        # - or return a Request object

        # - or raise IgnoreRequest: process_exception() methods of

        #   installed downloader middleware will be called

        return None

    def process_response(self, request, response, spider):

        # Called with the response returned from the downloader.

        # Must either;

        # - return a Response object

        # - return a Request object

        # - or raise IgnoreRequest

        return response

    def process_exception(self, request, exception, spider):

        # Called when a download handler or a process_request()

        # (from other downloader middleware) raises an exception.

        # Must either:

        # - return None: continue processing this exception

        # - return a Response object: stops process_exception() chain

        # - return a Request object: stops process_exception() chain

        pass

    def spider_opened(self, spider):

        spider.logger.info('Spider opened: %s' % spider.name)

import random

#设置请求头

class RandomUA():

    def __init__(self):

        self.user_agent=[

            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.16',

            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',

            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'

        ]

    def process_request(self, request, spider):

        request.headers['User-Agent'] = random.choice(self.user_agent)

    def process_response(self, request, response, spider):

        response.status=201

        return response

#设置代理

class ProxyMiddleware:

    proxy_list=[

        "http://127.0.0.1:8080"

        # "http://183.91.33.41:80",

        # "http://111.29.3.190:80",

        # "http://124.156.108.71:82"

    ]

    def process_request(self, request, spider):

        ip=random.choice(self.proxy_list)

        request.meta['proxy']=ip

Scrapy框架: middlewares.py设置的更多相关文章

Scrapy框架: settings.py设置
# -*- coding: utf-8 -*- # Scrapy settings for maitian project # # For simplicity, this file contains ...
Scrapy框架: pipelines.py设置
保存数据到json文件 # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your p ...
Scrapy框架之高级转
一.CrawlSpider模板创建项目 scrapy startproject 项目名称查看模板 scrapy genspider -l 创建crawl模板 scrapy genspider -t ...
基于scrapy框架的爬虫
Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 可以应用在包括数据挖掘,信息处理或存储历史数据等一系列的程序中. scrapy 框架高性能的网络请求高性能的数据解析高性能的 ...
scrapy框架的中间件
中间件的使用作用:拦截所有的请求和响应拦截请求:process_request拦截正常的请求,process_exception拦截异常的请求篡改请求的头信息 def process_reque ...
scrapy框架设置代理
网易音乐在单ip请求下经常会遇到网页返回码503的情况经查询,503为单个ip请求流量超限,猜测是网易音乐的一种反扒方式因原音乐下载程序采用scrapy框架,所以需要在scrapy中通过代理的方式去解 ...
scrapy框架设置代理ip，headers头和cookies
[设置代理ip] 根据最新的scrapy官方文档,scrapy爬虫框架的代理配置有以下两种方法: 一.使用中间件DownloaderMiddleware进行配置使用Scrapy默认方法scrapy s ...
网络爬虫之scrapy框架设置代理
前戏 os.environ()简介 os.environ()可以获取到当前进程的环境变量,注意,是当前进程. 如果我们在一个程序中设置了环境变量,另一个程序是无法获取设置的那个变量的. 环境变量是以一 ...
Python爬虫Scrapy框架入门（2）
本文是跟着大神博客,尝试从网站上爬一堆东西,一堆你懂得的东西附上原创链接: http://www.cnblogs.com/qiyeboy/p/5428240.html 基本思路是,查看网页元素,填写 ...

随机推荐

Django ajax小例
urls.py: url(r'^ajaxstudents/$', views.ajaxstudents), url(r'^getstudentsinfo/$', views.getstudentsin ...
组件化框架设计之apt编译时期自动生成代码&动态类加载（二）
阿里P7移动互联网架构师进阶视频(每日更新中)免费学习请点击:https://space.bilibili.com/474380680 本篇文章将继续从以下两个内容来介绍组件化框架设计: apt编译时 ...
51.Lowest Common Ancestor of a Binary Tree（二叉树的最小公共祖先）
Level: Medium 题目描述: Given a binary tree, find the lowest common ancestor (LCA) of two given nodes ...
【知识强化】第四章网络层 4.3 IP
这节课我们来学习一下IP数据报的格式.那之所以把路由算法这一小节跳过呢,就是因为我们之后会要讲到路由的选择协议.那在路由选择协议这一块讲路由算法,我觉得是比较合适的.那我们先来看一下这节课要讲的知识. ...
MMM实现Mysql高可用
MySQL主主同步方案 l MySQL主主+Keepalived l MySQL+DRBD+Heartbeat 在企业中,数据库高可用一直是企业的重中之重,中小企业很多都是使用mysql主主方案, ...
The Preliminary Contest for ICPC Asia Nanjing 2019 D. Robots
题意:给出一个DAG,一只机器人从1点出发,等概率地选择一条出边走或者停留在原点,机器人的第k次行动消耗能量是k(无论是走还是停留都算一次行动).问1到n的期望. 解法:因为行动消耗的能量跟行动次数有 ...
修改Docker中容器的时间和宿主主机时间一致
修改Docker容器的时间和宿主时间一致在查看容器的日志的,发现时间有和宿主主机时间相差有8个小时,而且宿主主机使用的是CST时间,容器容器使用的是UTC时间主机时间 DOCKER容器的时间世界 ...
AI行业精选日报_人工智能（12·24）
特斯拉完全自动驾驶进步:新系统能识别交通信号标志 12 月 24 日,有美国车友称,升级最新的特斯拉 2019.40.50 系统后已经可以识别红绿灯和停车标志(目前仅限搭载 HW3.0 硬件的车型), ...
OpenCV常用基本处理函数（8）图像变换
傅里叶变换傅里叶变换在实际中有非常明显的物理意义,设f是一个能量有限的模拟信号,则其傅里叶变换就表示f的频谱. 图像的频率是表征图像中灰度变化剧烈程度的指标,是灰度在平面空间上的梯度.如:大面积的沙 ...
python 循环中使用多个subplot画子图像（python matplotlib use more than one subplot in loop）
在循环语句中画出多个subplot图像代码如下 http://jonathansoma.com/lede/data-studio/classes/small-multiples/long-explan ...

Scrapy框架: middlewares.py设置

Scrapy框架: middlewares.py设置的更多相关文章

随机推荐

热门专题