import ast
import csv
import json
import logging
import os
import random
import re
import time

import requests
import threadpool
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

logger = logging.getLogger(__name__)

_LIST_URL = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
# Seconds.  Without a timeout a stalled connection blocks the worker forever.
_REQUEST_TIMEOUT = 30
# A backslash that is NOT followed by '/', 'u' or '"'.  Every match is doubled
# before decoding -- presumably because the endpoint emits backslashes that
# json.loads() would otherwise reject.
_STRAY_BACKSLASH = re.compile(r'\\(?![/u"])')


def _fetch_hotel_page(city_letter, city_num, page, user_agent, check_in, check_out):
    """POST one hotel-list query and return the decoded JSON payload.

    Returns None (after logging a warning) when the request fails or the
    response body cannot be decoded, so the caller never works on stale data.
    """
    headers = {
        "Connection": "keep-alive",
        "origin": "http://hotels.ctrip.com",
        "Host": "hotels.ctrip.com",
        "referer": "http://hotels.ctrip.com/hotel/%s" % city_letter,
        "user-agent": user_agent,
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "StartTime": check_in,
        "DepTime": check_out,
        "RoomGuestCount": "1,1,0",
        "city": city_num,
        "page": page,
    }
    try:
        response = requests.post(_LIST_URL, headers=headers, data=data,
                                 timeout=_REQUEST_TIMEOUT)
        return json.loads(_STRAY_BACKSLASH.sub(r"\\\\", response.text))
    except (requests.RequestException, ValueError) as exc:
        logger.warning("city %s page %s: list request failed: %r", city_num, page, exc)
        return None


def _parse_price_list(raw):
    """Decode the ``htllist`` string into Python objects without executing it.

    SECURITY: the original code called eval() on this server-supplied string.
    It is now parsed as JSON, falling back to ast.literal_eval() for payloads
    written as Python literals; neither can run arbitrary code.
    """
    try:
        return json.loads(raw)
    except ValueError:
        return ast.literal_eval(raw)


def _extract_phone(contact):
    """Return the text from the last "电话" up to two characters before the last "<a".

    Returns '' when "电话" is absent; when no "<a" follows it, everything from
    "电话" to the end of the string is returned.
    """
    start = contact.rfind("电话")
    if start == -1:
        return ''
    end = contact.rfind("<a")
    if end < start:  # also covers end == -1 (marker missing)
        return contact[start:]
    return contact[start:end - 2]


def _scrape_introduction(hotel_id):
    """Download a hotel's detail page and assemble its introduction text.

    The result is the first paragraph of ``div#htlDes`` (with two boilerplate
    labels, newlines and spaces removed), followed by the phone part of
    ``span#J_realContact[data-real]`` and the text of the description span.

    Raises:
        requests.RequestException: the detail page could not be fetched.
        AttributeError, TypeError, KeyError, IndexError: an expected element
            or attribute is missing from the page.
    """
    response = requests.get('http://hotels.ctrip.com/hotel/%s.html' % hotel_id,
                            timeout=_REQUEST_TIMEOUT)
    page = BeautifulSoup(response.text, "html5lib")

    summary = page.find('div', id='htlDes').findAll('p')[0].get_text()
    summary = summary.replace('联系方式', '').replace('资质备案', '')
    summary = summary.replace('\n', '').replace(' ', '')

    contact = page.find('span', id='J_realContact')['data-real'].replace('\n', ',')
    description = page.find(
        'span', id='ctl00_MainContentPlaceHolder_hotelDetailInfo_lbDesc'
    ).get_text().replace('\n', ',')

    return summary + _extract_phone(contact) + description


def _write_page_rows(writer, listing, row_number):
    """Write one CSV row per hotel in *listing* and return the updated row counter."""
    try:
        positions = listing["hotelPositionJSON"]
        # Parsed once per page (the original re-evaluated it for every hotel).
        prices = _parse_price_list(listing["HotelMaiDianData"]["value"]["htllist"])
    except (KeyError, TypeError, ValueError, SyntaxError) as exc:
        logger.warning("skipping page with unexpected payload: %r", exc)
        return row_number

    # Entries are paired by index, exactly as the original did with aa[...][n].
    # NOTE(review): assumes both lists are in the same hotel order -- confirm.
    for position, price_info in zip(positions, prices):
        try:
            # Read every list field first so a malformed entry is skipped
            # before any time is spent on the detail request.
            hotel_name = position["name"]
            hotel_id = position["id"]
            hotel_address = position["address"]
            price = price_info["amount"]
            star_class = position["star"][-2:]
            time.sleep(random.randint(1, 3))
            introduction = _scrape_introduction(hotel_id)
            writer.writerow([row_number + 1, hotel_name, "%s元起" % price,
                             star_class, hotel_address, introduction])
        except Exception as exc:
            # Scraping is best-effort per hotel: log the reason and move on
            # instead of silently dropping the row.
            logger.warning("skipping hotel entry: %r", exc)
            continue
        row_number += 1
    return row_number


def hotel(city_letter, city_num, city_name, *, check_in="2019-02-25", check_out="2019-02-26"):
    """Scrape Ctrip hotel listings for one city into ``携程/<city_name>.csv``.

    Pages 1-99 of the hotel list are requested in turn.  For every hotel on a
    page the detail page is fetched as well, and one row (index, name, price,
    star rating, address, introduction) is written.  Random sleeps are
    inserted between requests.

    Args:
        city_letter: value appended to the ``/hotel/`` Referer URL.
        city_num: value sent as the ``city`` form field; it is also appended
            to ``has_address.json`` before scraping starts.
        city_name: used as the CSV file name.
        check_in: value sent as the ``StartTime`` form field.
        check_out: value sent as the ``DepTime`` form field.
    """
    # The city is recorded up front, i.e. even if scraping fails later on.
    with open('has_address.json', 'a+', encoding="utf-8") as progress:
        progress.write(str(city_num) + '\n')

    # exist_ok=True keeps this safe when several workers start at once.
    os.makedirs('携程', exist_ok=True)
    user_agents = UserAgent(verify_ssl=False)
    row_number = 0
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open('携程/%s.csv' % city_name, 'w+', encoding='utf-8-sig', newline='') as hotel_xie:
        writer = csv.writer(hotel_xie, dialect='excel')
        writer.writerow(['序号', '名称', '价格', '星级', '地址', '酒店介绍'])
        for page in range(1, 100):
            time.sleep(random.randint(1, 5))
            listing = _fetch_hotel_page(city_letter, city_num, page,
                                        user_agents.random, check_in, check_out)
            # A failed page is skipped; the original carried on with the
            # previous page's data and wrote duplicate rows.
            if listing is not None:
                row_number = _write_page_rows(writer, listing, row_number)
            time.sleep(random.randint(1, 4))

if __name__ == '__main__':
    # City ids that hotel() already recorded in has_address.json.  The file is
    # read once into a set; it does not exist before the first run.
    try:
        with open("has_address.json", encoding='utf-8') as done_file:
            has_num = {int(entry) for entry in done_file if entry.strip()}
    except FileNotFoundError:
        has_num = set()

    # Each line of address.json is "city_letter,city_num,city_name".
    will_req_list = []
    with open("address.json", encoding='utf-8') as address_file:
        for line in address_file:
            single_list = line.replace("\n", "").split(',')
            try:
                city_id = int(single_list[1])
            except (IndexError, ValueError):
                if line.strip():
                    logging.warning("skipping malformed address line: %r", line)
                continue
            if len(single_list) != 3:
                # hotel() takes exactly three positional arguments.
                logging.warning("skipping malformed address line: %r", line)
                continue
            if city_id in has_num:
                continue
            # threadpool expects (positional_args, keyword_args) tuples.
            will_req_list.append((single_list, None))

    pool = threadpool.ThreadPool(8)
    for req in threadpool.makeRequests(hotel, will_req_list):
        pool.putRequest(req)
    pool.wait()

# Crawl the city list: the commented-out snippet below writes address.json.
# h = {
# "Connection": "keep-alive",
# "origin": "http://hotels.ctrip.com",
# "Host": "hotels.ctrip.com",
# "referer": "http://hotels.ctrip.com/hotel/beijing1",
# "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
# "Content-Type": "application/x-www-form-urlencoded",
# }
# res = requests.get('http://hotels.ctrip.com/Domestic/Tool/AjaxGetCitySuggestion.aspx', headers=h)
# a_list = re.findall('data:(.*?),group:', res.text)
# with open('address.json', 'w+', encoding="utf-8") as f:
# for address in a_list:
# i = 0
# al = address.split(',')
# for a in al:
# city_letter = ''.join(re.findall(r'[A-Za-z]', a))
# f.write(city_letter + ',')
# city_num = re.sub("\D", "", a)
# f.write(str(city_num))
# city_name = re.sub('[A-Za-z0-9\"\|]', "", a)
# f.write(',' + str(city_name))
# f.write('\n')
# i += 1
# f.close()

使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中的更多相关文章

  1. 使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中

    import requests import time, random, csv from fake_useragent import UserAgent from bs4 import Beauti ...

  2. Python爬取猫眼电影100榜并保存到excel表格

    首先我们前期要导入的第三方类库有; 通过猫眼电影100榜的源码可以看到很有规律 如: 亦或者是: 根据规律我们可以得到非贪婪的正则表达式 """<div class ...

  3. 爬取拉勾网所有python职位并保存到excel表格 对象方式

    # 1.把之前案例,使用bs4,正则,xpath,进行数据提取. # 2.爬取拉勾网上的所有python职位. from urllib import request,parse import json ...

  4. 爬取淘宝商品数据并保存在excel中

    1.re实现 import requests from requests.exceptions import RequestException import re,json import xlwt,x ...

  5. 基于requests模块的cookie,session和线程池爬取

    目录 基于requests模块的cookie,session和线程池爬取 基于requests模块的cookie操作 基于requests模块的代理操作 基于multiprocessing.dummy ...

  6. Python+Requests+异步线程池爬取视频到本地

    1.本次项目为获取梨视频中的视频,再使用异步线程池下载视频到本地 2.获取视频时,其地址中的Url是会动态变化,不播放时src值为图片的地址,播放时src值为mp4格式 3.查看视频链接是否存在aja ...

  7. python爬取数据保存到Excel中

    # -*- conding:utf-8 -*- # 1.两页的内容 # 2.抓取每页title和URL # 3.根据title创建文件,发送URL请求,提取数据 import requests fro ...

  8. 使用pandas中的read_html函数爬取TOP500超级计算机表格数据并保存到csv文件和mysql数据库中

    参考链接:https://www.makcyun.top/web_scraping_withpython2.html #!/usr/bin/env python # -*- coding: utf-8 ...

  9. 「拉勾网」薪资调查的小爬虫,并将抓取结果保存到excel中

    学习Python也有一段时间了,各种理论知识大体上也算略知一二了,今天就进入实战演练:通过Python来编写一个拉勾网薪资调查的小爬虫. 第一步:分析网站的请求过程 我们在查看拉勾网上的招聘信息的时候 ...

随机推荐

  1. poj 2253 最短路 or 最小生成树

    Freddy Frog is sitting on a stone in the middle of a lake. Suddenly he notices Fiona Frog who is sit ...

  2. 枚举 xor

    题意:输入整数n(1<=n<=3千万),有多少对整数(a,b)满足:1<=b<=a<=n,且gcd(a,b)=a XOR b.例如:n=7时,有4对:(3,2),(5,4 ...

  3. Oracle安装连接常见错误

    oracle安装注意:安装路径url不能带中文C:\app\59428\product\11.2.0\dbhome_1\sqldeveloper打开sqldeveloper的时候,需要输入java.e ...

  4. Oracle Autonomous Health Framework (AHF) 解读

    AHF介绍 Oracle在2019年10月18日发布自治健康框架Autonomous Health Framework (AHF) 19.3,将ORAchk,EXAchk,TFA三种诊断工具合并入AH ...

  5. 关于爬虫的日常复习(11)—— 实战:flask+redis维护代理池(to be continued)

  6. MongoDB角色的授权

    开启cmd窗口切换到cd D:\programs\mongoDB\bin D:\programs\mongoDB\bin>mongo MongoDB shell version v3.4.6 c ...

  7. 多个github账号时,本地配置ssh-key

    由于需要,申请了多个github账号,但是都是在同一台电脑上操作,原来只有一个账号进行ssh操作时,推送没有遇到什么问题,现在有多个账号了,推送的时候就有点懵逼了,下面是根据网上的资料来进行多个账号, ...

  8. [洛谷P4178] Tree (点分治模板)

    题目略了吧,就是一棵树上有多少个点对之间的距离 \(\leq k\) \(n \leq 40000\) 算法 首先有一个 \(O(n^2)\) 的做法,枚举每一个点为起点,\(dfs\) 一遍可知其它 ...

  9. 使用Razor表达式 使用条件语句 来自 精通ASP-NET-MVC-5-弗瑞曼

  10. Exchange Server 2016 本地部署安装流程

    思路:一台Server 2016用作AD+DNS,一台Server 2016用作Exchange Server 2016 Exchange Server 2016 CU14 安装路径:安装路径:htt ...