import requests
from fake_useragent import UserAgent
from lxml import etree
from http import cookiejar
import re, time
import pymysql
import random
from requests.exceptions import Timeout

ua = UserAgent()
session = requests.Session()


class MyException(Exception):
    """Spider-specific error carrying a numeric status code and a message.

    Status codes raised in this file:
        10000 -- a city has been crawled up to its last page (normal completion).
        10003 -- the proxy pool query returned no row.
    """

    def __init__(self, status, msg):
        self.status = status
        self.msg = msg
        # Forward both values so tracebacks and str(exc) are not empty.
        super().__init__(status, msg)


class AnKeJu:
    """Crawl anjuke.com listings and store them in a local MySQL database.

    Entry pages (Beijing shown):
        new homes     https://bj.fang.anjuke.com/
        second-hand   https://beijing.anjuke.com/sale/
        rentals       https://bj.zu.anjuke.com/
    To crawl another city, replace the ``bj`` sub-domain with that city's
    abbreviation; the abbreviations are loaded into ``city_dict`` on start-up.
    """

    # Login is not implemented: ordinary users only get SMS-code login.
    is_login = False
    # Maps the city name shown on the site to its sub-domain abbreviation.
    city_dict = {}
    # pymysql connection to the ``ankeju`` database (created in __init__).
    conn = None
    # pymysql connection to the ``proxies`` database (created lazily).
    proxies = None

    # One INSERT statement per supported table; column order must match the
    # order in which the corresponding parser appends values.
    _INSERT_SQL = {
        "secondary_house": "insert into secondary_house (house_img_url,house_title,house_details,house_address,house_total_price,house_price,house_tags) values (%s,%s,%s,%s,%s,%s,%s)",
        "new_house": "insert into new_house (house_img_url,house_title,house_details,house_address,house_tags,house_type,house_is_sale,house_price) values (%s,%s,%s,%s,%s,%s,%s,%s)",
        "zu_house": "insert into zu_house (house_img_url,house_title,house_details,house_address,house_tags,house_price) values (%s,%s,%s,%s,%s,%s)",
    }

    # NOTE(review): the detail text used to be cut with ``split(<sep>)`` but the
    # separator literal was lost (it arrived as "", which makes str.split raise
    # ValueError). It was presumably an icon-font glyph from the Unicode
    # private-use area, so we cut at the first such character -- confirm
    # against the live page markup.
    _ICON_GLYPH = re.compile(r"[\ue000-\uf8ff]")

    def __init__(self):
        self.session = session
        self.session.headers = {
            "user-agent": ua.random
        }
        self.session.cookies = cookiejar.LWPCookieJar(filename="./cookies.txt")

        self._cursor = None
        self._proxy_cursor = None
        if not self.conn:
            self.conn = pymysql.connect(host="127.0.0.1",
                                        port=3306,
                                        user="root",
                                        db="ankeju")
        # Keep the cursor in its own attribute instead of overwriting the
        # connection's ``cursor`` method with a cursor object.
        self._cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        self.__get_all_city()

    def __response_to_xml(self, response):
        """Parse the response body into an lxml HTML element tree."""
        return etree.HTML(response.text)

    def __cut_at_icon(self, text):
        """Return the part of *text* before the first private-use-area glyph.

        Returns *text* unchanged when no such glyph is present.
        """
        return self._ICON_GLYPH.split(text, maxsplit=1)[0]

    def __is_verify_page(self, xml):
        """Report whether *xml* is the site's access-verification page.

        Prints a warning when it is, so an empty crawl result is not mistaken
        for a city without listings. Solving the challenge (e.g. with
        selenium) is not implemented.
        """
        if xml.xpath("//*[@id='verify_page']"):
            print("触发了访问验证页面,本页无法解析")
            return True
        return False

    def __first_href(self, response, xpath):
        """Return the first value matched by *xpath* in the page, else False."""
        matches = self.__response_to_xml(response).xpath(xpath)
        if matches:
            return matches[0]
        return False

    def __get_all_city(self):
        """Load the city-name -> sub-domain mapping from the city index page."""
        api = "https://www.anjuke.com/sy-city.html"
        headers = self.session.headers.copy()
        response = self.session.get(api, headers=headers, timeout=10)
        xml = self.__response_to_xml(response)

        city_dict = {}
        # The page groups cities alphabetically (A, B, C, ...), one
        # ``city_list`` div per letter; the last div is skipped as before.
        for letter_group in xml.xpath("//div[@class='city_list']")[0:-1]:
            # Read name and link from the same <a> so the two can never get
            # out of step with each other.
            for anchor in letter_group.xpath("a"):
                city_name = anchor.text
                city_url = anchor.get("href")
                if not city_name or not city_url:
                    continue
                match = re.search(r"//(.*?)\.", city_url)
                if match and match.group(1):
                    city_dict[city_name] = match.group(1)
        self.city_dict = city_dict

    def __is_exist_next_page(self, response):
        """Return the next-page URL of a second-hand listing page, or False."""
        return self.__first_href(response, "//*[@class='aNxt']/@href")

    def __get_html_information_v2(self, response):
        """Yield one list of field values per second-hand listing on the page.

        Field order: image URL, title, details, address, total price,
        unit price, tags.
        """
        xml = self.__response_to_xml(response)
        if self.__is_verify_page(xml):
            return

        li_xpath_list = xml.xpath("//*[@id='houselist-mod-new']//li[@class='list-item']")
        for li_xpath in li_xpath_list:
            house_info = []
            # Image URL.
            house_img_url = li_xpath.xpath("div[@class='item-img']/img/@src")[0]
            house_info.append(house_img_url)
            # Listing title.
            house_title = li_xpath.xpath("div[@class='house-details']/div[1]/a/text()")[0].strip()
            house_info.append(house_title)
            # Detail line, cut before the icon glyph.
            house_details = self.__cut_at_icon(
                li_xpath.xpath("div[@class='house-details']/div[2]")[0].xpath("string(.)").strip())
            house_info.append(house_details)
            # Address; some listings have none.
            try:
                house_address = li_xpath.xpath("div[@class='house-details']/div[3]/span/@title")[
                                    0].strip() or "暂时没有地址信息"
            except IndexError:
                house_address = "暂时没有地址信息"
            house_info.append(house_address)
            # Total price.
            house_total_price = li_xpath.xpath("div[@class='pro-price']/span[1]")[0].xpath("string(.)").strip()
            house_info.append(house_total_price)
            # Unit price.
            house_price = li_xpath.xpath("div[@class='pro-price']/span[2]/text()")[0]
            house_info.append(house_price)
            # Tags.
            house_tags = li_xpath.xpath("div[@class='house-details']/div[@class='tags-bottom']")[0].xpath(
                "string(.)").strip() or "暂无房子标签信息"
            house_info.append(house_tags)
            yield house_info

    def __get_html_information_v1(self, response):
        """Yield one list of field values per new-home listing on the page.

        Field order: image URL, title, details, address, tags, type,
        sale status, price.
        """
        xml = self.__response_to_xml(response)
        if self.__is_verify_page(xml):
            return

        div_xpath_list = xml.xpath("//div[@class='key-list imglazyload']//div[@class='item-mod ']")
        for div_xpath in div_xpath_list:
            house_info_list = []
            # Image URL.
            house_img_url = div_xpath.xpath("a[@class='pic']/img/@src")[0]
            house_info_list.append(house_img_url)
            # Listing title.
            house_title = div_xpath.xpath("div[@class='infos']/a[@class='lp-name']/h3/span/text()")[0].strip()
            house_info_list.append(house_title)
            # Details: the floor-plan line, or the ``kp-time`` text when the
            # floor-plan line is absent.
            try:
                house_details = div_xpath.xpath("div[@class='infos']/a[@class='huxing']")[0].xpath("string(.)").strip()
                house_details = re.sub(r"\s", "", house_details)
            except IndexError:
                house_details = div_xpath.xpath("div[@class='infos']/a[@class='kp-time']/text()")[0]
            house_info_list.append(house_details)
            # Address.
            house_address = div_xpath.xpath("div[@class='infos']/a[@class='address']/span/text()")[0].strip()
            house_info_list.append(house_address)
            # Tags.
            house_tags = ",".join(div_xpath.xpath("div[@class='infos']/a[@class='tags-wrap']/div/span/text()"))
            house_info_list.append(house_tags)
            # Property type; some listings have none.
            try:
                house_type = \
                    div_xpath.xpath("div[@class='infos']/a[@class='tags-wrap']/div[@class='tag-panel']/i[2]/text()")[0]
            except IndexError:
                house_type = "无"
            house_info_list.append(house_type)
            # Sale status.
            house_is_sale = div_xpath.xpath("div[@class='infos']/a[@class='tags-wrap']/div/i[1]/text()")[0]
            house_info_list.append(house_is_sale)
            # Price: either a fixed price, a "nearby price" hint, or nothing.
            try:
                house_price = div_xpath.xpath("a[@class='favor-pos']/p[@class='price']")[0].xpath("string(.)").strip()
            except IndexError:
                try:
                    house_price = div_xpath.xpath("a[@class='favor-pos']/p[2]")[0].xpath("string(.)").strip()
                except IndexError:
                    house_price = "暂无"
            house_info_list.append(house_price)
            yield house_info_list

    def __is_exist_next_page_v1(self, response):
        """Return the next-page URL of a new-home listing page, or False."""
        return self.__first_href(response, "//a[@class='next-page next-link']/@href")

    def __save_to_db(self, house_info_tuple, table_name):
        """Insert one parsed listing into *table_name* and commit.

        :param house_info_tuple: field values in the column order of the table
        :param table_name: ``secondary_house``, ``new_house`` or ``zu_house``
        :raises ValueError: if *table_name* is not one of the supported tables
        """
        try:
            sql = self._INSERT_SQL[table_name]
        except KeyError:
            raise ValueError("unsupported table: {}".format(table_name)) from None
        self._cursor.execute(sql, house_info_tuple)
        self.conn.commit()

    def __get_proxies(self):
        """Return a working ``proxies`` dict for requests, taken from the pool.

        Rows whose proxy fails the health check are deleted from the pool.

        :raises MyException: status 10003 when the pool has no rows left
        """
        if not self.proxies:
            self.__init_proxies()
        while True:
            # All columns are needed, hence "*". Pick a random row instead of
            # a random offset, which fails when the pool is smaller than the
            # offset.
            sql = "select * from proxies ORDER BY RAND() LIMIT 1"
            row = self._proxy_cursor.execute(sql)
            if not row:
                raise MyException(10003, "代理池错误")
            res = self._proxy_cursor.fetchone()
            proxy_url = "{}://{}:{}".format(res["type"].lower(), res["ip"], res["port"])
            # requests picks the proxy by the scheme of the *target* URL, so
            # map both schemes; otherwise an "http" proxy is never used for
            # the https pages crawled here.
            proxies = {"http": proxy_url, "https": proxy_url}
            if self.__check_proxies(proxies):
                return proxies
            # Drop the dead proxy and try another one.
            del_sql = "DELETE FROM proxies where id = %s"
            self._proxy_cursor.execute(del_sql, (res["id"],))
            self.proxies.commit()

    def __check_proxies(self, proxies):
        """Return True if a test request through *proxies* answers with 200."""
        api = "https://www.cnblogs.com/"
        try:
            res = requests.get(api, headers={"user-Agent": ua.random}, proxies=proxies, timeout=3)
        except Exception:
            # Any failure simply means the proxy is unusable.
            return False
        return res.status_code == 200

    def __init_proxies(self):
        """Open the connection (and a dict cursor) to the proxy-pool database."""
        self.proxies = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            user="root",
            db="proxies"
        )
        self._proxy_cursor = self.proxies.cursor(cursor=pymysql.cursors.DictCursor)

    def __fetch(self, url, retries=1):
        """GET *url* through a fresh proxy, retrying on Timeout.

        :param retries: number of extra attempts after the first timeout
        :raises Timeout: when the last attempt also times out
        """
        headers = self.session.headers
        for attempt in range(retries + 1):
            try:
                return self.session.get(url, headers=headers, proxies=self.__get_proxies(), timeout=10)
            except Timeout:
                if attempt == retries:
                    raise

    def __crawl(self, url, city, table_name, parse_page, find_next_page, done_msg):
        """Crawl from *url*, following next-page links until none is left.

        Each page's listings are produced by *parse_page* and saved into
        *table_name*. The tables have no city column; consider adding one.

        :raises MyException: status 10000 with *done_msg* once the last page
            has been stored (this is how completion is signalled to callers)
        """
        page_num = 1
        while True:
            time.sleep(3)  # be polite between page requests
            print("正在爬取 {} 第 {} 页...".format(city, page_num))
            response = self.__fetch(url)
            print("正在写入数据库...")
            for house_info in parse_page(response):
                self.__save_to_db(house_info, table_name)
            # Follow the page's own "next" link rather than counting pages.
            next_page_url = find_next_page(response)
            if not next_page_url:
                raise MyException(10000, done_msg)
            url = next_page_url
            page_num += 1

    def __start_secondary_spider(self, url, city):
        """Crawl every second-hand listing page of *city* starting at *url*."""
        self.__crawl(url, city, "secondary_house",
                     self.__get_html_information_v2,
                     self.__is_exist_next_page,
                     "{}二手房--数据爬取完毕...".format(city))

    def __start_new_house_spider(self, url, city):
        """Crawl every new-home listing page of *city* starting at *url*."""
        self.__crawl(url, city, "new_house",
                     self.__get_html_information_v1,
                     self.__is_exist_next_page_v1,
                     "{}新房--数据爬取完毕...".format(city))

    def __get_html_information_v3(self, response):
        """Yield one list of field values per rental listing on the page.

        Field order: image URL, title, details, address, tags, price.
        """
        xml = self.__response_to_xml(response)
        if self.__is_verify_page(xml):
            return

        div_xpath_list = xml.xpath("//div[@class='zu-itemmod']")
        for div_xpath in div_xpath_list:
            house_info_list = []

            house_img_url = div_xpath.xpath("a/img/@src")[0]
            house_info_list.append(house_img_url)

            house_title = div_xpath.xpath("div[@class='zu-info']/h3/a/text()")[0].strip()
            house_info_list.append(house_title)

            # Detail line, cut before the icon glyph, whitespace removed.
            house_details = self.__cut_at_icon(
                div_xpath.xpath("div[@class='zu-info']/p[@class='details-item tag']")[0].xpath(
                    "string(.)").strip())
            house_details = re.sub(r"\s", "", house_details)
            house_info_list.append(house_details)

            house_address = div_xpath.xpath("div[@class='zu-info']/address[@class='details-item']")[0].xpath(
                "string(.)").strip().replace("\xa0", "")
            house_address = re.sub(r"\s", "", house_address)
            house_info_list.append(house_address)

            house_tags = ",".join(div_xpath.xpath("div[@class='zu-info']/p[@class='details-item bot-tag']/span/text()"))
            house_info_list.append(house_tags)

            house_price = div_xpath.xpath("div[@class='zu-side']/p")[0].xpath("string(.)").strip()
            house_info_list.append(house_price)

            yield house_info_list

    def __is_exist_next_page_v3(self, response):
        """Return the next-page URL of a rental listing page, or False."""
        return self.__first_href(response, "//a[@class='aNxt']/@href")

    def __start_zu_house_spider(self, url, city):
        """Crawl every rental listing page of *city* starting at *url*."""
        self.__crawl(url, city, "zu_house",
                     self.__get_html_information_v3,
                     self.__is_exist_next_page_v3,
                     "{}租房--数据爬取完毕...".format(city))

    def __run_spider(self, city, allow_all, url_template, start_spider):
        """Run *start_spider* for *city*, then for every other city if asked.

        ``city_dict`` is only read, never consumed, so several spiders can be
        run one after another on the same instance.

        :raises KeyError: if *city* is not a known city name
        :raises MyException: any status other than 10000 (normal completion)
        """
        if not self.city_dict.get(city):
            raise KeyError("请输入正确的地区: {}".format(city))

        cities = [city]
        if allow_all:
            cities.extend(name for name in self.city_dict if name != city)

        for current_city in cities:
            start_url = url_template.format(self.city_dict[current_city])
            try:
                start_spider(start_url, current_city)
            except MyException as e:
                if e.status != 10000:
                    # A real failure (e.g. empty proxy pool): do not hide it.
                    raise
                print(e.msg)
        if allow_all:
            print("全部爬取完毕")

    def spider_zufang(self, city: str = "北京", allow_all: bool = False):
        """Crawl rental listings.

        :param city: city name as shown on the site; defaults to Beijing
        :param allow_all: when True, continue with every other known city
        """
        self.__run_spider(city, allow_all, "https://{}.zu.anjuke.com/",
                          self.__start_zu_house_spider)

    def spider_new_house(self, city: str = "北京", allow_all: bool = False):
        """Crawl new-home listings.

        :param city: city name as shown on the site; defaults to Beijing
        :param allow_all: when True, continue with every other known city
        """
        self.__run_spider(city, allow_all, "https://{}.fang.anjuke.com/",
                          self.__start_new_house_spider)

    def spider_secondary(self, city: str = "北京", allow_all: bool = False):
        """Crawl second-hand listings.

        The abbreviation (e.g. ``bj``) works here too; the site redirects it
        to the full name (``beijing``).

        :param city: city name as shown on the site; defaults to Beijing
        :param allow_all: when True, continue with every other known city
        """
        self.__run_spider(city, allow_all, "https://{}.anjuke.com/sale/",
                          self.__start_secondary_spider)

    def __del__(self):
        # Either connection may still be None if __init__ failed early or no
        # proxy was ever requested.
        if self.conn:
            self.conn.close()
        if self.proxies:
            self.proxies.close()

    def test(self):
        """Debug helper: print every rental listing parsed from one page."""
        res = self.session.get("https://al.zu.anjuke.com/", headers=self.session.headers)
        n = 1
        for i in self.__get_html_information_v3(res):
            print(n)
            print(i)
            n += 1


if __name__ == '__main__':
    anjuke = AnKeJu()
    # anjuke.spider_secondary(allow_all=True)
    # anjuke.spider_new_house(allow_all=True)
    # anjuke.spider_zufang(allow_all=True)
    # anjuke.test()

补上数据库获取到的数据。创建数据表的时候,最好添加一个 city 字段,要不然数据太乱了。

python3爬虫-通过requests获取安居客房屋信息的更多相关文章

  1. python3爬虫-通过requests获取拉钩职位信息

    import requests, json, time, tablib def send_ajax_request(data: dict): try: ajax_response = session. ...

  2. python3 爬虫之爬取安居客二手房资讯(第一版)

    #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Author;Tsukasa import requests from bs4 import Beau ...

  3. python3爬虫-通过requests爬取图虫网

    import requests from fake_useragent import UserAgent from requests.exceptions import Timeout from ur ...

  4. PyCharm+Scrapy爬取安居客楼盘信息

    一.说明 1.1 开发环境说明 开发环境--PyCharm 爬虫框架--Scrapy 开发语言--Python 3.6 安装第三方库--Scrapy.pymysql.matplotlib 数据库--M ...

  5. Python——安居客租房信息爬取(以南昌为例)

    前言: 提前安装好所需要的库. 本代码的输入仅需要某个城市的租房地址首页即可,其他自会生成. 使用前请创建所需的目录,或者为代码添加os.makedir() 支持断点重爬,重行运行即可. header ...

  6. python3爬虫抓取智联招聘职位信息代码

    上代码,有问题欢迎留言指出. # -*- coding: utf-8 -*- """ Created on Tue Aug 7 20:41:09 2018 @author ...

  7. python3爬虫-使用requests爬取起点小说

    import requests from lxml import etree from urllib import parse import os, time def get_page_html(ur ...

  8. python3爬虫之requests库基本使用

    官方文档链接(中文) https://2.python-requests.org/zh_CN/latest/ requests  基于  urllib3 ,python编写. 安装 pip insta ...

  9. Python3爬虫使用requests爬取lol英雄皮肤

    本人博客:https://xiaoxiablogs.top 此次爬取lol英雄皮肤一共有两个版本,分别是多线程版本和非多线程版本. 多线程版本 # !/usr/bin/env python # -*- ...

随机推荐

  1. JeePlus:目录

    ylbtech-JeePlus:目录 1.返回顶部 0. http://www.jeeplus.org/ 0.2.文档 http://wiki.jeeplus.org/docs/show/75 0.3 ...

  2. 构造 Codeforces Round #107 (Div. 2) B. Phone Numbers

    题目传送门 /* 构造:结构体排个序,写的有些啰嗦,主要想用用流,少些了判断条件WA好几次:( */ #include <cstdio> #include <algorithm> ...

  3. 二分图最大匹配(匈牙利算法) UVA 10080 Gopher II

    题目传送门 /* 匈牙利算法:这题比UVA_670简单,注意是要被吃的鼠的最少个数,套模板 */ #include <cstdio> #include <algorithm> ...

  4. ACM_01背包2

    背包4 Time Limit: 2000/1000ms (Java/Others) Problem Description: 有n个重量和价值分别为Wi,Vi的物品,现从这些物品中挑选出总量不超过W的 ...

  5. 全面学习ORACLE Scheduler特性(10)管理Chains

    5.2  管理Chains 5.2.1  修改Chains属性 基本上碰到修改CHAIN属性的机率不会太大,因此确实没啥可修改的,对于CHAIN对象来说,能够修改的属性只有两个:evaluation_ ...

  6. C#知识点-枚举器和迭代器

    一.几个基本概念的理解 问题一:为什么数组可以使用foreach输出各元素 答:数组是可枚举类型,它实现了一个枚举器(enumerator)对象:枚举器知道各元素的次序并跟踪它们的位置,然后返回请求的 ...

  7. EasyUI系列学习(五)-Resizable(调整大小)

    一.创建组件 1.使用标签创建可变大小的窗口 <div id="rBox" class="easyui-resizable" style="wi ...

  8. LN : leetcode 258 Add Digits

    lc 258 Add Digits lc 258 Add Digits Given a non-negative integer num, repeatedly add all its digits ...

  9. [ GDOI 2014 ] 拯救莫莉斯

    \(\\\) \(Description\) 有一个 \(N\times M\) 的网格,每个格点都有权值,图是四连通的. 现在选择一个点集,使得每个格点要么被选中,要么连通的点之一被选中. 求这个点 ...

  10. git add . 的时候报错fatal: Unable to create : …File exists.

    报错内容: $ git add . fatal: Unable to create 'E:/project/qbm_cs/.git/index.lock': File exists. Another ...