Crawling major free proxy IP sites with Python (metaclass encapsulation)
import requests
from pyquery import PyQuery as pq

base_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}

def get_page(url):
    headers = dict(base_headers)
    print('Getting', url)
    try:
        r = requests.get(url, headers=headers)
        print('Getting result', url, r.status_code)
        if r.status_code == 200:
            return r.text
    except requests.ConnectionError:
        print('Crawling Failed', url)
    return None
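
# Hedged aside (not in the original post): get_page above calls requests.get with no
# timeout, so one unresponsive proxy site can stall the whole crawl. A variant with a
# timeout might look like this; the name and the 10-second value are assumptions.
def get_page_with_timeout(url, timeout=10):
    try:
        r = requests.get(url, headers=dict(base_headers), timeout=timeout)
        if r.status_code == 200:
            return r.text
    except requests.RequestException:
        print('Crawling Failed', url)
    return None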

# The Dao begets One: create the metaclass that collects the proxy-crawling methods
class ProxyMetaclass(type):
"""
元类,在FreeProxyGetter类中加入
__CrawlName__,__CrawlFunc__和__CrawlFuncCount__
三个参数,分别表示爬虫函数名,函数实例和爬虫函数的数量。
"""
    # __new__ runs before __init__ and controls how the class object is created
    # cls:   the metaclass itself, supplied automatically when the class statement executes
    # name:  the name of the class being created
    # bases: the tuple of base classes it inherits from
    # attrs: the dict of the class's attributes and methods
    def __new__(cls, name, bases, attrs):
        count = 0
        # add two keys to the attrs dict; each value is a list
        attrs['__CrawlFunc__'] = []
        attrs['__CrawlName__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlName__'].append(k)  # append each crawler method name to attrs['__CrawlName__']
                attrs['__CrawlFunc__'].append(v)  # append each crawler method object to attrs['__CrawlFunc__']
                print(k, v)
                # print(attrs['__CrawlName__'])
                count += 1
        for k in attrs['__CrawlName__']:
            # remove the original key/value pairs from the class dict
            attrs.pop(k)
        attrs['__CrawlFuncCount__'] = count
        print(attrs)
        return type.__new__(cls, name, bases, attrs)

# One begets Two: create the proxy getter class
class ProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, site):
        proxies = []
        print('Site', site)
        for func in self.__CrawlFunc__:
            if func.__name__ == site:
                this_page_proxies = func(self)
                for proxy in this_page_proxies:
                    print('Getting', proxy, 'from', site)
                    proxies.append(proxy)
        print(proxies)
        return proxies

    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]  # str.format fills in the page number, like %s
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])
    '''
    def crawl_proxy360(self):
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])
    '''
    def crawl_goubanjia(self):
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                yield td.text().replace(' ', '')


if __name__ == '__main__':
    # Two begets Three: instantiate ProxyGetter
    crawler = ProxyGetter()
    print(crawler.__CrawlName__)
    # Three begets all things: run every registered crawler
    for site_label in range(crawler.__CrawlFuncCount__):
        site = crawler.__CrawlName__[site_label]  # site_label indexes into the name list
        myProxies = crawler.get_raw_proxies(site)
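
The metaclass is not doing anything magical here: ProxyMetaclass simply inspects the class attributes at class-creation time and records every method whose name contains 'crawl_'. For comparison, a rough non-metaclass sketch (not from the original post; PlainProxyGetter and crawl_example are made-up names) could collect the same information by introspection after the class is defined:

class PlainProxyGetter(object):
    def crawl_example(self):
        # Hypothetical placeholder crawler, just so there is something to collect.
        yield '127.0.0.1:8080'

    def get_crawl_funcs(self):
        # Collect the bound crawl_* methods by introspection instead of via a metaclass.
        names = [n for n in dir(self) if n.startswith('crawl_')]
        return names, [getattr(self, n) for n in names]

One practical difference: ProxyMetaclass also pops the crawl_* entries out of the class dict, so they are reachable only through __CrawlFunc__, whereas this sketch leaves them as ordinary methods.
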
Output:
D:\pythontest>python proxy_ip.py
crawl_goubanjia <function ProxyGetter.crawl_goubanjia at 0x00000000035D2510>
crawl_daili66 <function ProxyGetter.crawl_daili66 at 0x00000000035D2488>
{'__qualname__': 'ProxyGetter', '__module__': '__main__', '__CrawlName__': ['crawl_goubanjia', 'crawl_daili66'], '__CrawlFunc__': [<function ProxyGetter.crawl_goubanjia at 0x00000000035D2510>, <function ProxyGetter.crawl_daili66 at 0x00000000035D2488>], 'get_raw_proxies': <function ProxyGetter.get_raw_proxies at 0x0000000035D2400>, '__CrawlFuncCount__': 2}
['crawl_goubanjia', 'crawl_daili66']
Site crawl_goubanjia
Getting http://www.goubanjia.com/free/gngn/index.shtml
Getting result http://www.goubanjia.com/free/gngn/index.shtml 403
[]
Site crawl_daili66
Crawling=== http://www.66ip.cn/1.html
Getting http://www.66ip.cn/1.html
Getting result http://www.66ip.cn/1.html 200
Getting 123.163.97.198: from crawl_daili66
Getting 36.249.109.21: from crawl_daili66
Getting 163.204.245.52: from crawl_daili66
Getting 222.189.247.207: from crawl_daili66
Getting 87.250.218.12: from crawl_daili66
Getting 118.172.176.61: from crawl_daili66
Getting 134.119.214.206: from crawl_daili66
Getting 110.74.208.154: from crawl_daili66
Crawling=== http://www.66ip.cn/2.html
Getting http://www.66ip.cn/2.html
Getting result http://www.66ip.cn/2.html 200
Getting 120.234.138.102: from crawl_daili66
Getting 110.86.136.127: from crawl_daili66
Getting 59.57.38.197: from crawl_daili66
Getting 202.62.86.94: from crawl_daili66
Getting 210.22.176.146: from crawl_daili66
Getting 180.183.136.212: from crawl_daili66
Getting 183.87.153.98: from crawl_daili66
Getting 222.124.2.186: from crawl_daili66
Getting 123.169.126.9: from crawl_daili66
Getting 123.169.126.93: from crawl_daili66
Getting 158.255.249.58: from crawl_daili66
Getting 1.198.72.242: from crawl_daili66
Crawling=== http://www.66ip.cn/3.html
Getting http://www.66ip.cn/3.html
Getting result http://www.66ip.cn/3.html 200
Getting 163.204.246.10: from crawl_daili66
Getting 186.159.112.6: from crawl_daili66
Getting 163.204.246.102: from crawl_daili66
Getting 88.87.72.72: from crawl_daili66
Getting 193.169.118.6: from crawl_daili66
Getting 196.216.220.204: from crawl_daili66
Getting 185.109.62.124: from crawl_daili66
Getting 1.193.246.78: from crawl_daili66
Getting 188.131.239.119: from crawl_daili66
Getting 1.10.188.93: from crawl_daili66
Getting 182.116.237.203: from crawl_daili66
Getting 139.99.223.230: from crawl_daili66
Crawling=== http://www.66ip.cn/4.html
Getting http://www.66ip.cn/4.html
Getting result http://www.66ip.cn/4.html 200
Getting 163.204.246.232: from crawl_daili66
Getting 117.28.96.105: from crawl_daili66
Getting 202.29.220.34: from crawl_daili66
Getting 123.169.114.80: from crawl_daili66
Getting 115.42.34.3: from crawl_daili66
Getting 41.84.131.78: from crawl_daili66
Getting 123.163.96.207: from crawl_daili66
Getting 182.35.83.12: from crawl_daili66
Getting 191.241.226.230: from crawl_daili66
Getting 202.138.236.35: from crawl_daili66
Getting 194.1.193.226: from crawl_daili66
Getting 202.158.77.122: from crawl_daili66
['123.163.97.198:9999', '36.249.109.21:9999', '163.204.245.52:9999', '222.189.247.207:9999', '87.250.218.12:44168',
'118.172.176.61:8080', '134.119.214.206:1080', '110.74.208.154:21776', '120.234.138.102:53779', '110.86.136.127:9999',
'59.57.38.197:9999', '202.62.86.94:83', '210.22.176.146:37299', '180.183.136.212:8080', '183.87.153.98:49602',
'222.124.2.186:8080', '123.169.126.9:3', '123.169.126.93:9999', '158.255.249.58:50100', '1.198.72.242:9999',
'163.204.246.10:2', '186.159.112.6:53281', '163.204.246.102:9999', '88.87.72.72:8080', '193.169.118.6:53281',
'185.109.62.124:808', '1.193.246.78:9999', '188.131.239.119:8118', '1.10.188.93:34871', '182.116.237.203:9999',
'139.99.223.230:8080', '163.204.246.232:9999', '117.28.96.105:9999', '202.29.220.34:38961', '123.169.114.80:9999',
'115.42.34.3:8080', '41.84.131.78:53281', '123.163.96.207:9999', '182.35.83.12:9999', '191.241.226.230:53281',
'202.138.236.35:56413', '194.1.193.226:35646','196.216.220.204:36739', '202.158.77.122:47284']
It looks like only one of the proxy sites actually returned data (goubanjia answered with a 403).
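
Since most free proxies are dead or very slow, the harvested list is worth filtering before use. A minimal liveness check, sketched below (it is not part of the script above; the httpbin test URL and the 5-second timeout are assumptions), routes one request through each proxy and keeps only those that answer:

def check_proxy(proxy, test_url='http://httpbin.org/ip', timeout=5):
    """Return True if an HTTP request routed through the proxy succeeds."""
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        r = requests.get(test_url, proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

# e.g. alive = [p for p in myProxies if check_proxy(p)]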