scrapy_ip_agent
#File name is rotate_useragent
# -*- coding: UTF-8 -*-
import random
import urllib2
import redis
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
from CrawlerTools.ScrapyFileSystem.config import *
class RotateUserAgentMiddleware(UserAgentMiddleware):
def __inti__(self,user_agent=""):
self.user_agent=user_agent
def process_request(self,request,spider):
user_agent_list=["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" ]
ua=random.choice(user_agent_list)
if ua:
request.headers.setdefault("User-Agent",ua)
#Get ip
cfg=config("Redis")
redisId=redis.Redis(cfg["host"],cfg["port"],1,cfg["pwd"])
res=redisId.srandmember("ipPool",1)[0].strip()
urls=res.split(":")
request.meta['proxy'] ="http://"+str(urls[0])+":"+str(urls[1])
# Use the following lines if your proxy requires authentication
#Configuration profile
DOWNLOADER_MIDDLEWARES = {
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' : None,
'CrawlerTools.rotate_useragent.RotateUserAgentMiddleware' :400
}
scrapy_ip_agent的更多相关文章
随机推荐
- c函数习记
1,user groups 篇幅 the length of an article; fgetgrent(从指定的文件来读取组格式) 相关related functions;fgetpwent hea ...
- 第二百八十六天 how can I 坚持
bug不断啊,头疼. 今天早上到的倒是挺早. 中午吃的黄焖鸡,晚上加了会班. 勇江的鱼都死了,杨建的还剩3条,晚上到家都快十点了,还洗了衣服,没捞出来呢, 希望可以请下来假吧. 晾上衣服睡觉.
- 03 javadoc
javadoc从程序源代码中抽取类.方法.成员等注释形成一个和源代码配套的API帮助文档 1.标签.命令格式: 2.使用方式: 2.1 dos命令行格式:javadoc XXX.java 2.2 ec ...
- 排序算法之快速排序(java实现)
package com.javaTest300; public class Test039 { public static void main(String[] args) {// 快速排序 int ...
- 问题-Delphi2007跟踪变量时提示“E2171 Variable 'APolygon' inaccessible here due to optimization”
问题现象:Delphi2007跟踪变量时提示“E2171 Variable 'APolygon' inaccessible here due to optimization” . 问题原因:可能是因为 ...
- Linux递归删除文件命令
Linux递归删除文件命令 find . -name "*.log.*" -exec ls {} \; find . -name "*.log.*" -exec ...
- javascript中数组的迭代等操作
- ios页面传值的几种方法
1.属性2.方法3.代理方法4.SharedApplication5.NSUserdefault6.通过一个单例的class来传递 属性这种方法传值挺方便的,只需要拿到它的指针,如果重新声明一个指针, ...
- ASP.NET网站中设置404自定义错误页面
在用ASP.NET WebForm开发一个网站时,需要自定义404错误页面. 做法是这样的 在网站根目录下建立了一个404.html的错误页面,然后在Global.asax文件中,加入如下代码: &l ...
- 教你50招提升ASP.NET性能(二十):认识你的循环
(31)Know your loops 招数31: 认识你的循环 for is the fastest way of iterating over a collection, foreach is a ...