import requests
from fake_useragent import UserAgent
from lxml import etree
from urllib.parse import urljoin
import pymysql
import time ua = UserAgent() class MyException(Exception): def __init__(self, status, msg):
self.status = status
self.msg = msg
super().__init__() class XiCi: def __init__(self):
self.session = requests.Session()
self.session.headers = {
"User-Agent": ua.random,
"Host": "www.xicidaili.com"
}
self.conn = pymysql.connect(host="127.0.0.1",
port=3306,
user="root",
db="proxies")
self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor) def get_page_html(self, api):
'''通过get方法请求网页'''
response = self.session.get(url=api, headers=self.session.headers)
if response.status_code == 200:
return response def __html_to_etree(self, html):
'''将html源码转为xml'''
return etree.HTML(html) def get_next_page_url(self, response):
'''拿到下一页的url'''
selector = self.__html_to_etree(response.text)
try:
next_page_url = selector.xpath("//a[@class='next_page']/@href")[0]
next_page_url = urljoin(response.url, next_page_url)
return next_page_url
except IndexError:
raise MyException(1000, "爬取完毕") def __get_proxies_info(self, response):
'''获取到爬取的代理信息'''
selector = self.__html_to_etree(response.text)
tr_ele_list = selector.xpath("//*[@id='ip_list']//tr")
for tr in tr_ele_list:
ip = tr.xpath("td[2]/text()")
if not ip:
continue
ip = ip[0]
port = tr.xpath("td[3]/text()")[0]
type = tr.xpath("td[6]/text()")[0]
yield [ip, port, type] def __detect_availability(self, data):
'''拿到爬取的数据,检测代理是否可以使用'''
https_api = "https://icanhazip.com/"
http_api = "http://icanhazip.com/"
ip = data[0]
port = data[1]
type = data[2]
proxies = {type.lower(): "{}://{}:{}".format(type.lower(), ip, port)}
try:
if type.upper() == "HTTPS":
requests.get(https_api, headers={"User-Agent": ua.random}, proxies=proxies, timeout=3)
else:
requests.get(http_api, headers={"User-Agent": ua.random}, proxies=proxies, timeout=3)
return True
except Exception:
return False def get_usable_proxies_ip(self, response):
'''获取到可用的代理ip'''
res = self.__get_proxies_info(response)
for data in res:
if self.__detect_availability(data):
self.save_to_db(data) def save_to_db(self, data):
'''保存到数据库'''
sql = 'insert into proxies_table(ip,port,type) values(%s,%s,%s);'
print(data)
self.cursor.execute(sql, data)
self.conn.commit() def run(self, api):
'''启动入口'''
page = 1
while True:
print("爬取第{}页数据...".format(page))
response = self.get_page_html(api)
self.get_usable_proxies_ip(response)
try:
api = self.get_next_page_url(response)
except MyException as e:
if e.status == 1000:
print(e.msg)
break
page += 1
time.sleep(3) def __del__(self):
self.conn.close() if __name__ == '__main__':
api = "https://www.xicidaili.com/nn"
xici = XiCi()
xici.run(api)

python3爬虫-通过requests爬取西刺代理的更多相关文章

  1. 使用XPath爬取西刺代理

    因为在Scrapy的使用过程中,提取页面信息使用XPath比较方便,遂成此文. 在b站上看了介绍XPath的:https://www.bilibili.com/video/av30320885?fro ...

  2. Python四线程爬取西刺代理

    import requests from bs4 import BeautifulSoup import lxml import telnetlib #验证代理的可用性 import pymysql. ...

  3. 手把手教你使用Python爬取西刺代理数据(下篇)

    /1 前言/ 前几天小编发布了手把手教你使用Python爬取西次代理数据(上篇),木有赶上车的小伙伴,可以戳进去看看.今天小编带大家进行网页结构的分析以及网页数据的提取,具体步骤如下. /2 首页分析 ...

  4. Scrapy爬取西刺代理ip流程

    西刺代理爬虫 1. 新建项目和爬虫 scrapy startproject daili_ips ...... cd daili_ips/ #爬虫名称和domains scrapy genspider ...

  5. python3爬虫-使用requests爬取起点小说

    import requests from lxml import etree from urllib import parse import os, time def get_page_html(ur ...

  6. python3爬虫-通过requests爬取图虫网

    import requests from fake_useragent import UserAgent from requests.exceptions import Timeout from ur ...

  7. Python3爬虫使用requests爬取lol英雄皮肤

    本人博客:https://xiaoxiablogs.top 此次爬取lol英雄皮肤一共有两个版本,分别是多线程版本和非多线程版本. 多线程版本 # !/usr/bin/env python # -*- ...

  8. python scrapy 爬取西刺代理ip(一基础篇)(ubuntu环境下) -赖大大

    第一步:环境搭建 1.python2 或 python3 2.用pip安装下载scrapy框架 具体就自行百度了,主要内容不是在这. 第二步:创建scrapy(简单介绍) 1.Creating a p ...

  9. python+scrapy 爬取西刺代理ip(一)

    转自:https://www.cnblogs.com/lyc642983907/p/10739577.html 第一步:环境搭建 1.python2 或 python3 2.用pip安装下载scrap ...

随机推荐

  1. IDEA 单元测试

    下载所需的两个 jar 包,下载地址:Download and Install · junit-team/junit4 Wiki · GitHub junit-4.12.jar hamcrest-co ...

  2. codeforces 965E Trie+multiset

    E. Short Code time limit per test 1 second memory limit per test 256 megabytes input standard input ...

  3. 什么是javascript闭包?

    在我们开发中,也经常使用到闭包,但当有人问什么是闭包,就会可能说不上来.那就谈谈一些基本的: 一.理解闭包的概念, 简单说当function里嵌套function时,内部的function可以访问外部 ...

  4. poj Find a multiple【鸽巢原理】

    参考:https://www.cnblogs.com/ACShiryu/archive/2011/08/09/poj2356.html 鸽巢原理??? 其实不用map但是习惯了就打的map 以下C-c ...

  5. [BZOJ1331]魔板

    Description 在成功地发明了魔方之后,鲁比克先生发明了它的二维版本,称作魔板.这是一张有8个大小相同的格子的魔板: 1 2 3 4 8 7 6 5 我们知道魔板的每一个方格都有一种颜色.这8 ...

  6. I - Andy's First Dictionary(set+stringstream)

    Description Andy, 8, has a dream - he wants to produce his very own dictionary. This is not an easy ...

  7. Java 遍历Map对象的4种方法

    http://blog.csdn.net/tjcyjd/article/details/11111401

  8. LN : leetcode 283 Move Zeroes

    lc 283 Move Zeroes 283 Move Zeroes Given an array nums, write a function to move all 0's to the end ...

  9. CNN:测试一下YoloV3

    项目地址:https://pjreddie.com/darknet/yolo/ mAP提升了不少,在VS上试一把 V3 的权值: https://pjreddie.com/media/files/yo ...

  10. QT 杂记

    1.按F4切换designer和Edit视图. 2.加载同目录下的js文件: import "XXX.js" as MyJs //首字母一定要大写 3.qml 引用的js中对象.字 ...