Scrapy框架: middlewares.py设置
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DownloadtestSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DownloadtestDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
import random
#设置请求头
class RandomUA():
def __init__(self):
self.user_agent=[
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'
]
def process_request(self, request, spider):
request.headers['User-Agent'] = random.choice(self.user_agent)
def process_response(self, request, response, spider):
response.status=201
return response
#设置代理
class ProxyMiddleware:
proxy_list=[
"http://127.0.0.1:8080"
# "http://183.91.33.41:80",
# "http://111.29.3.190:80",
# "http://124.156.108.71:82"
]
def process_request(self, request, spider):
ip=random.choice(self.proxy_list)
request.meta['proxy']=ip
Scrapy框架: middlewares.py设置的更多相关文章
- Scrapy框架: settings.py设置
# -*- coding: utf-8 -*- # Scrapy settings for maitian project # # For simplicity, this file contains ...
- Scrapy框架: pipelines.py设置
保存数据到json文件 # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your p ...
- Scrapy框架之高级 转
一.CrawlSpider模板 创建项目 scrapy startproject 项目名称 查看模板 scrapy genspider -l 创建crawl模板 scrapy genspider -t ...
- 基于scrapy框架的爬虫
Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 可以应用在包括数据挖掘,信息处理或存储历史数据等一系列的程序中. scrapy 框架 高性能的网络请求 高性能的数据解析 高性能的 ...
- scrapy框架的中间件
中间件的使用 作用:拦截所有的请求和响应 拦截请求:process_request拦截正常的请求,process_exception拦截异常的请求 篡改请求的头信息 def process_reque ...
- scrapy框架设置代理
网易音乐在单ip请求下经常会遇到网页返回码503的情况经查询,503为单个ip请求流量超限,猜测是网易音乐的一种反扒方式因原音乐下载程序采用scrapy框架,所以需要在scrapy中通过代理的方式去解 ...
- scrapy框架设置代理ip,headers头和cookies
[设置代理ip] 根据最新的scrapy官方文档,scrapy爬虫框架的代理配置有以下两种方法: 一.使用中间件DownloaderMiddleware进行配置使用Scrapy默认方法scrapy s ...
- 网络爬虫之scrapy框架设置代理
前戏 os.environ()简介 os.environ()可以获取到当前进程的环境变量,注意,是当前进程. 如果我们在一个程序中设置了环境变量,另一个程序是无法获取设置的那个变量的. 环境变量是以一 ...
- Python爬虫Scrapy框架入门(2)
本文是跟着大神博客,尝试从网站上爬一堆东西,一堆你懂得的东西 附上原创链接: http://www.cnblogs.com/qiyeboy/p/5428240.html 基本思路是,查看网页元素,填写 ...
随机推荐
- 3.jmeter jsr232 脚本获取当前测试的正在活动的线程数
jsr232 groovy 脚本获取当前测试的正在活动的线程数 (需要选择 groovy类型, 如果使用beanshell或者javascript,请根据其语法稍作修改即可) import org.a ...
- Windows重置网络命令
我们在日常使用电脑的时候会碰到网络异常,网络故障,想要针对性地去解决是很困难的. 有时候可能查遍了资料,花了大量时间,怎么搞都还是搞不定.所以,这次直接给大家分享一个通过重置网络来解决所有问题的方法. ...
- Cocos2d-x之Label
| 版权声明:本文为博主原创文章,未经博主允许不得转载. 在游戏开发中经常会使用标签文字,例如,游戏介绍,玩家积分,菜单选项,文字提示等等. LabelTTF 直接支持使用 TTF 字库 ...
- 纯css隐藏移动端滚动条解决方案(ios上流畅滑动)---转载
html代码展示(直接复制代码保存至本地文件运行即可): <!DOCTYPE html> <html lang="en"> <head> < ...
- HDFS学习笔记二
文章来源于:https://blog.csdn.net/xuejingfu1/article/details/52554174 文件写入staging(分阶段进行) 一个客户端的创建文件的请求并不直接 ...
- 2018-8-10-win10-uwp-httpClient-登陆CSDN
title author date CreateTime categories win10 uwp httpClient 登陆CSDN lindexi 2018-08-10 19:16:53 +080 ...
- shutdown - 关闭系统
总览 SYNOPSIS /sbin/shutdown [-t sec] [-arkhncfF] time [warning-message] 描述 DESCRIPTION shutdown 以一种安全 ...
- 命令 检查Linux服务器性能
一.uptime命令 这个命令可以快速查看机器的负载情况.在Linux系统中,这些数据表示等待CPU资源的进程和阻塞在不可中断IO进程(进程状态为D)的数量.这些数据可以让我们对系统资源使用有一个宏观 ...
- Matplotlib_key_point
Matplotlib官方入门教程: http://www.labri.fr/perso/nrougier/teaching/matplotlib/ 本文参考教程: http://codingpy.co ...
- excel导出简单示例(jxl jar包)
@param title excel文件名 @param excelTopName 表头中文名字(显示在第一行的中文表头内容) @param header 表头字段属性(根据该属性获取对应的属性值,表 ...