import csv
import logging
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from threadpool import ThreadPool, makeRequests

logger = logging.getLogger(__name__)

# Directory that receives one CSV file per city.
OUTPUT_DIR = '艺龙'
# File listing the city codes that have already been handed to the crawler.
DONE_FILE = 'has_elong.json'
# Input file: one "city_code,city_name,city_letter" record per line.
CITY_FILE = 'elong.json'
# Last listing page to request for each city (pages are numbered from 1).
MAX_PAGE = 89
# Upper bound on the number of hotels taken from a single listing page.
MAX_HOTELS_PER_PAGE = 25
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30
# Column titles: name, price, address, star rating, theme, services, description.
CSV_HEADER = ['酒店名称', '价格', '地址', '星级', '主题', '可供服务', '酒店信息']


def _build_list_payload(city_code, city_name, page):
    """Return the form fields sent with the request for one listing page."""
    return {
        "code": "",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": "",
        "listRequest.cardNo": "",
        "listRequest.checkInDate": "2019-03-02 00:00:00",  # check-in date
        "listRequest.checkOutDate": "2019-03-03 00:00:00",  # check-out date
        "listRequest.cityID": city_code,
        "listRequest.cityName": city_name,  # city name, e.g. Beijing
        "listRequest.customLevel": "",
        "listRequest.distance": "",
        "listRequest.endLat": "",
        "listRequest.endLng": "",
        "listRequest.facilityIds": "",
        "listRequest.highPrice": "",
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": "",
        "listRequest.language": "cn",
        "listRequest.listType": "",
        "listRequest.lowPrice": "",
        "listRequest.orderFromID": "",
        "listRequest.pageIndex": page,  # page number
        "listRequest.pageSize": "",
        "listRequest.payMethod": "",
        "listRequest.personOfRoom": "",
        "listRequest.poiId": "",
        "listRequest.promotionChannelCode": "",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": "",
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": "",
        "listRequest.seoHotelStar": "",
        "listRequest.sortDirection": "",
        "listRequest.sortMethod": "",
        "listRequest.starLevels": "",
        "listRequest.startLat": "",
        "listRequest.startLng": "",
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        "listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
        "listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61",
    }


def _build_headers(city_letter, user_agent):
    """Return the HTTP headers sent with a listing-page request."""
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Content-Length': '',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://hotel.elong.com/%s/' % city_letter,
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest',
    }


def request_url(city_code, city_name, city_letter):
    """Crawl the hotel listing pages of one city and save the rows to a CSV.

    The city code is appended to ``has_elong.json`` first, then pages
    1..MAX_PAGE of the city's listing are requested one after another and
    every hotel row found is written to ``艺龙/<city_name>.csv``.

    Args:
        city_code: City identifier sent as ``listRequest.cityID``.
        city_name: City name; also used as the CSV file name.
        city_letter: URL path segment of the city's listing page.

    A page that fails to download or parse is logged and skipped so that one
    bad page does not abort the whole city.
    """
    # NOTE(review): the original had a branch for codes below 1000 that
    # prepended an empty string, i.e. it was a no-op. It may have been meant
    # to zero-pad the code -- confirm against the site before changing this.
    city_code = str(city_code)

    # NOTE(review): the city is recorded as handled *before* it is crawled,
    # so a crawl that dies half-way is not retried on the next run.
    with open(DONE_FILE, 'a+', encoding='utf-8') as done_file:
        done_file.write(city_code + '\n')

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    csv_path = os.path.join(OUTPUT_DIR, '%s.csv' % city_name)
    list_url = 'http://hotel.elong.com/%s/' % city_letter
    # Build the generator once; a fresh random User-Agent is drawn per page.
    user_agents = UserAgent(verify_ssl=False)

    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(CSV_HEADER)
        for page in range(1, MAX_PAGE + 1):
            try:
                # Random pause between pages to avoid hammering the site.
                time.sleep(random.randint(1, 4))
                res = requests.get(
                    list_url,
                    data=_build_list_payload(city_code, city_name, page),
                    headers=_build_headers(city_letter, user_agents.random),
                    timeout=REQUEST_TIMEOUT,
                )
                writer.writerows(get_info_and_req_details(res.text))
            except Exception:
                # Best effort: report the failure and move on to the next page.
                logger.exception('page %s of %s failed', page, city_name)
                continue


def _fetch_detail(hotel):
    """Download the detail page of *hotel* and return (services, description).

    Returns two empty strings when the link is missing or the download fails.
    """
    try:
        hotel_link = hotel.find('div', attrs={'class': 'h_info_pic'}).find('a').get('href')
        time.sleep(random.randint(1, 3))
        detail_html = requests.get(
            'http://hotel.elong.com%s#hotelContent' % hotel_link,
            timeout=REQUEST_TIMEOUT,
        )
        return get_details(detail_html.text)
    except Exception:
        logger.warning('could not fetch hotel detail page', exc_info=True)
        return '', ''


def get_info_and_req_details(html):
    """Parse one listing page and fetch the detail page of each hotel on it.

    Args:
        html: Markup of a listing page.

    Returns:
        A list of rows, each
        ``[name, price, address, star rating, theme, services, description]``.
        At most MAX_HOTELS_PER_PAGE rows are returned; hotels whose mandatory
        fields (name, price, address) cannot be found are skipped.
    """
    soup = BeautifulSoup(html, "lxml")
    page_list = []
    for hotel in soup.find_all('div', attrs={'class': 'h_item'}):
        if len(page_list) >= MAX_HOTELS_PER_PAGE:
            break

        # Mandatory fields: a missing element makes find() return None, and
        # the attribute access on it raises AttributeError.
        try:
            hotel_name = hotel.find('div', attrs={'class': 'h_info_pic'}).find('img').get('alt')
            hotel_price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
            hotel_add = hotel.find('p', attrs={'class': 'h_info_b2'}).find('a').get_text().replace('[', '').replace(']', '')
            hotel_ress = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')
        except AttributeError:
            continue

        # Optional fields fall back to a default when the element is absent.
        stars = hotel.find('b', attrs={'class': 'icon_stars'})
        hotel_grade = stars.get('title') if stars is not None else '经济型'

        tags = hotel.find('div', attrs={'class': 'tagList'})
        hotel_theme = tags.get_text().replace('\n', ',') if tags is not None else ''

        server, hotel_info = _fetch_detail(hotel)

        page_list.append([
            hotel_name,
            hotel_price,
            str(hotel_add) + str(hotel_ress),
            hotel_grade,
            hotel_theme,
            server,
            hotel_info,
        ])
    return page_list


def get_details(detail_html):
    """Extract the services and the description from a hotel detail page.

    Args:
        detail_html: Markup of a hotel detail page.

    Returns:
        A ``(server, hotel_info)`` tuple of strings with newlines (and, for
        the description, tabs) replaced by commas. Either value is an empty
        string when its element is not present; the two are extracted
        independently of each other.
    """
    detail = BeautifulSoup(detail_html, 'lxml')

    facilities = detail.find('ul', attrs={'class': 'dview_icon_list'})
    server = facilities.get_text().replace('\n', ',') if facilities is not None else ''

    description = detail.find('div', attrs={'class': 'dview_info'})
    if description is not None:
        hotel_info = description.get_text().replace('\n', ',').replace('\t', ',')
    else:
        hotel_info = ''

    return server, hotel_info


def _load_done_codes(path=DONE_FILE):
    """Return the set of integer city codes already recorded in *path*."""
    try:
        with open(path, encoding='utf-8') as done_file:
            return {int(line) for line in done_file if line.strip()}
    except FileNotFoundError:
        # First run: nothing has been crawled yet.
        return set()


def main():
    """Queue every city from ``elong.json`` that is not yet recorded as done
    and crawl the queue with a pool of three worker threads."""
    done_codes = _load_done_codes()
    req_list = []
    # For crawling addresses, see the companion Ctrip hotel crawler.
    with open(CITY_FILE, encoding='utf-8') as city_file:
        for line in city_file:
            if not line.strip():
                continue
            line_list = line.replace("\n", "").split(',')
            if int(line_list[0]) in done_codes:
                continue
            # (positional args, keyword args) as expected by makeRequests.
            req_list.append((line_list, None))

    pool = ThreadPool(3)
    for req in makeRequests(request_url, req_list):
        pool.putRequest(req)
    pool.wait()


if __name__ == '__main__':
    main()

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中的更多相关文章

  1. 使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中

    import requests import json import re import csv import threadpool import time, random from bs4 impo ...

  2. Python+Requests+异步线程池爬取视频到本地

    1.本次项目为获取梨视频中的视频,再使用异步线程池下载视频到本地 2.获取视频时,其地址中的Url是会动态变化,不播放时src值为图片的地址,播放时src值为mp4格式 3.查看视频链接是否存在aja ...

  3. Python爬取猫眼电影100榜并保存到excel表格

    首先我们前期要导入的第三方类库有; 通过猫眼电影100榜的源码可以看到很有规律 如: 亦或者是: 根据规律我们可以得到非贪婪的正则表达式 """<div class ...

  4. 爬取拉勾网所有python职位并保存到excel表格 对象方式

    # 1.把之前的案例,使用bs4,正则,xpath,进行数据提取. # 2.爬取拉勾网上的所有python职位. from urllib import request,parse import json ...

  5. 爬取淘宝商品数据并保存在excel中

    1.re实现 import requests from requests.exceptions import RequestException import re,json import xlwt,x ...

  6. 基于requests模块的cookie,session和线程池爬取

    目录 基于requests模块的cookie,session和线程池爬取 基于requests模块的cookie操作 基于requests模块的代理操作 基于multiprocessing.dummy ...

  7. 【原创】py3+requests+json+xlwt,爬取拉勾招聘信息

    在拉勾搜索职位时,通过谷歌F12抓取请求信息 发现请求是一个post请求,参数为: 返回的是json数据 有了上面的基础,我们就可以构造请求了 然后对获取到的响应反序列化,这样就获取到了json格式的 ...

  8. py3+requests+json+xlwt,爬取拉勾招聘信息

    在拉勾搜索职位时,通过谷歌F12抓取请求信息 发现请求是一个post请求,参数为: 返回的是json数据 有了上面的基础,我们就可以构造请求了 然后对获取到的响应反序列化,这样就获取到了json格式的 ...

  9. python爬取数据保存到Excel中

    # -*- coding:utf-8 -*- # 1.两页的内容 # 2.抓取每页title和URL # 3.根据title创建文件,发送URL请求,提取数据 import requests fro ...

随机推荐

  1. Java入门 - 导读

    原文地址:http://www.work100.net/training/java 更多教程:光束云 - 免费课程 Java入门 Java 是由 Sun Microsystems 公司于1995年5月 ...

  2. 调用Excel.Application报错的解决方法

    之前由于装了WPS后,VBA和python调用某些OFFICE的端口一直报错.网上找了无数的解决办法.也没有解决. 将注册表清理.不行. 将WPS卸载.不行. 将office重装.不行. 之后找到了个 ...

  3. 微软的github 上面 有 Docker.DotNet 嗯 作为 菜 只有欣赏的额

    .NET Client for Docker Remote API step one 需要下载的 猛戳 Docker.DotNet

  4. RMQ算法使用ST表实现

    RMQ RMQ (Range Minimum Query),指求区间最小值.普通的求区间最小值的方法是暴力. 对于一个数列: \[ A_1,~ A_2,~ A_3,~ \cdots,~ A_n \] ...

  5. Day6-Python3基础-面向对象编程

    面向过程 VS 面向对象 编程范式 编程是 程序 员 用特定的语法+数据结构+算法组成的代码来告诉计算机如何执行任务的过程 , 一个程序是程序员为了得到一个任务结果而编写的一组指令的集合,正所谓条条大 ...

  6. 每天一道Java题[9]

    题目 native关键字的作用是什么? 解答 首先,需了解JNI(Java Native Interface),它是连接Java平台与本地C代码的一个API. 其次,用native关键字声明的方法,是 ...

  7. 暑假第四周总结(HDFS编程实践,安装HBASE)

    本周根据书上以及教程的提示,对HDFS进行了编程实践,将教程所给的代码(判断文件是否存在,创建文件,读取文件)进行了应用,根据视频的讲解,对一些简单的语句有了一定的了解,但还是比较生疏.另外还根据提示 ...

  8. python笔记15

    今日内容 模块知识 内置模块 time datetime json 其他 内容回顾 & 作业题 重要知识点 构造字典和函数对应关系,避免重复的if else a=1 b=2 ==> a, ...

  9. FPGA VGA+PLL+IP核笔记

    1.实现了预定功能!整个工程,没有使用例程的25MHZ,全部统一使用50MHZ.2.分辨率使用了800*600@72HZ.3.实现了只显示白色部分,黑色部分RGB == 0,要显示背景色.VGA图形基 ...

  10. HDU_1176_DP

    http://acm.hdu.edu.cn/showproblem.php?pid=1176 简单dp,转换后跟上一题数塔一样,注意每秒只能移动一格,还有在边缘的情况. #include<ios ...