import requests
import time, random, csv
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from threadpool import ThreadPool, makeRequests def request_url(city_code, city_name, city_letter):
"""
请求主页
"""
with open('has_elong.json', 'a+', encoding='utf-8') as hs:
hs.write(city_code + '\n')
hs.close()
if city_code and int(city_code) < 1000:
city_code = '' + str(city_code)
else:
city_code = str(city_code)
with open('艺龙/%s.csv' % city_name, 'w+', encoding='utf-8-sig') as f:
cs = csv.writer(f, dialect='excel')
# [酒店名称,价格,地址,星级,主题,可供服务,酒店信息]
cs.writerow(['酒店名称', '价格', '地址', '星级', '主题', '可供服务', '酒店信息'])
# 循环1-89页
for n in range(1, 89):
url = 'http://hotel.elong.com/%s/' % city_letter
data = {
"code": "",
"listRequest.areaID": "",
"listRequest.bookingChannel": "",
"listRequest.cardNo": "",
"listRequest.checkInDate": "2019-03-02 00:00:00", # 入住时间
"listRequest.checkOutDate": "2019-03-03 00:00:00", # 离开时间
"listRequest.cityID": city_code,
"listRequest.cityName": city_name, # 北京等地区
"listRequest.customLevel": "",
"listRequest.distance": "",
"listRequest.endLat": "",
"listRequest.endLng": "",
"listRequest.facilityIds": "",
"listRequest.highPrice": "",
"listRequest.hotelBrandIDs": "",
"listRequest.isAdvanceSave": "false",
"listRequest.isAfterCouponPrice": "true",
"listRequest.isCoupon": "false",
"listRequest.isDebug": "false",
"listRequest.isLimitTime": "false",
"listRequest.isLogin": "false",
"listRequest.isMobileOnly": "true",
"listRequest.isNeed5Discount": "true",
"listRequest.isNeedNotContractedHotel": "false",
"listRequest.isNeedSimilarPrice": "false",
"listRequest.isReturnNoRoomHotel": "true",
"listRequest.isStaySave": "false",
"listRequest.isTrace": "false",
"listRequest.isUnionSite": "false",
"listRequest.keywords": "",
"listRequest.keywordsType": "",
"listRequest.language": "cn",
"listRequest.listType": "",
"listRequest.lowPrice": "",
"listRequest.orderFromID": "",
"listRequest.pageIndex": n, # 翻页
"listRequest.pageSize": "",
"listRequest.payMethod": "",
"listRequest.personOfRoom": "",
"listRequest.poiId": "",
"listRequest.promotionChannelCode": "",
"listRequest.proxyID": "ZD",
"listRequest.rankType": "",
"listRequest.returnFilterItem": "true",
"listRequest.sellChannel": "",
"listRequest.seoHotelStar": "",
"listRequest.sortDirection": "",
"listRequest.sortMethod": "",
"listRequest.starLevels": "",
"listRequest.startLat": "",
"listRequest.startLng": "",
"listRequest.taRecommend": "false",
"listRequest.themeIds": "",
"listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
"listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61"
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'no-cache',
'Content-Length': '',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
# 'Cookie':'……61b8-48a1-b398-8b9ec1903f05……',
'Host': 'hotel.elong.com',
'Origin': 'http://hotel.elong.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://hotel.elong.com/%s/' % city_letter,
'User-Agent': UserAgent(verify_ssl=False).random,
'X-Requested-With': 'XMLHttpRequest'
}
try:
time.sleep(random.randint(1, 4))
res = requests.get(url, data=data, headers=headers)
dete_list = get_info_and_req_details(res.text)
for data in dete_list:
cs.writerow(data)
except Exception:
continue
f.close() def get_info_and_req_details(html):
"""
清洗该页列表数据并向请求各个酒店的详情页
page_list = [酒店名称,价格,地址,星级,主题,可供服务,酒店信息]
"""
bs = BeautifulSoup(html, "lxml")
h_list = bs.find_all('div', attrs={'class': 'h_item'})
page_list = []
i = 0
for hotel in h_list:
if i < 25:
try:
hotel_name = hotel.find('div', attrs={'class': 'h_info_pic'}).find('img').get('alt')
hotel_price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
hotel_add = hotel.find('p', attrs={'class': 'h_info_b2'}).find('a').get_text().replace('[', '').replace(']', '')
hotel_ress = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')
try:
hotel_grade = hotel.find('b', attrs={'class': 'icon_stars'}).get('title')
except Exception:
hotel_grade = '经济型'
try:
hotel_theme = hotel.find('div', attrs={'class': 'tagList'}).get_text().replace('\n', ',')
except Exception:
hotel_theme = ''
try:
hotel_link = hotel.find('div', attrs={'class': 'h_info_pic'}).find('a').get('href')
time.sleep(random.randint(1, 3))
detail_html = requests.get('http://hotel.elong.com%s#hotelContent' % hotel_link)
server, hotel_info = get_details(detail_html.text)
except Exception:
server = ''
hotel_info = ''
except Exception:
continue
page_list.append([hotel_name, hotel_price, str(hotel_add)+str(hotel_ress), hotel_grade, hotel_theme, server, hotel_info])
i += 1
return page_list def get_details(detail_html):
"""
清洗详情页数据
"""
detail = BeautifulSoup(detail_html, 'lxml')
server = ''
hotel_info = ''
try:
server = detail.find('ul', attrs={'class': 'dview_icon_list'}).get_text().replace('\n', ',')
hotel_info = detail.find('div', attrs={'class': 'dview_info'}).get_text().replace('\n', ',').replace('\t', ',')
except Exception:
return server, hotel_info
return server, hotel_info if __name__ == '__main__':
has_num = []
req_list = []
  // 地址爬取请借鉴爬取携程酒店信息
for line in open('elong.json', encoding='utf-8'):
line_list = line.replace("\n", "").split(',')
for has in open("has_elong.json", encoding='utf-8'):
has_num.append(int(has.replace('\n', '')))
if int(line_list[0]) in has_num:
continue
# request_url(line_list[0], line_list[1], line_list[2])
line_tuple = (line_list, None)
req_list.append(line_tuple)
pool = ThreadPool(3)
requests_list = makeRequests(request_url, req_list)
[pool.putRequest(req) for req in requests_list]
pool.wait()

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中的更多相关文章

  1. 使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中

    import requests import json import re import csv import threadpool import time, random from bs4 impo ...

  2. Python+Requests+异步线程池爬取视频到本地

    1.本次项目为获取梨视频中的视频,再使用异步线程池下载视频到本地 2.获取视频时,其地址中的Url是会动态变化,不播放时src值为图片的地址,播放时src值为mp4格式 3.查看视频链接是否存在aja ...

  3. Python爬取猫眼电影100榜并保存到excel表格

    首先我们前期要导入的第三方类库有; 通过猫眼电影100榜的源码可以看到很有规律 如: 亦或者是: 根据规律我们可以得到非贪婪的正则表达式 """<div class ...

  4. 爬取拉勾网所有python职位并保存到excel表格 对象方式

    # 1.把之间案例,使用bs4,正则,xpath,进行数据提取. # 2.爬取拉钩网上的所有python职位. from urllib import request,parse import json ...

  5. 爬取淘宝商品数据并保存在excel中

    1.re实现 import requests from requests.exceptions import RequestException import re,json import xlwt,x ...

  6. 基于requests模块的cookie,session和线程池爬取

    目录 基于requests模块的cookie,session和线程池爬取 基于requests模块的cookie操作 基于requests模块的代理操作 基于multiprocessing.dummy ...

  7. 【原创】py3+requests+json+xlwt,爬取拉勾招聘信息

    在拉勾搜索职位时,通过谷歌F12抓取请求信息 发现请求是一个post请求,参数为: 返回的是json数据 有了上面的基础,我们就可以构造请求了 然后对获取到的响应反序列化,这样就获取到了json格式的 ...

  8. py3+requests+json+xlwt,爬取拉勾招聘信息

    在拉勾搜索职位时,通过谷歌F12抓取请求信息 发现请求是一个post请求,参数为: 返回的是json数据 有了上面的基础,我们就可以构造请求了 然后对获取到的响应反序列化,这样就获取到了json格式的 ...

  9. python爬取数据保存到Excel中

    # -*- conding:utf-8 -*- # 1.两页的内容 # 2.抓取每页title和URL # 3.根据title创建文件,发送URL请求,提取数据 import requests fro ...

随机推荐

  1. Cannot create PoolableConnectionFactory (Could not create connection to database server.)

    是由于mysql驱动版本太低导致. 本人 jdk版本是1.8.0_211,mysql版本是8.0.17,mysql-connector-java版本是5.1.39, 后来把mysql-connecto ...

  2. 软工作业-14组铁大FaceBook网站使用体验

    铁大facebook是面向铁道大学学生的一个空间网站,空间界面十分朴素,灰色的色调.基本可以满足日常的发动态需求,但也存在一些问题: 比如发动态不是很方便,必须要进入到某一个空间才能发动态 .就有一些 ...

  3. springboot下Caused by: java.lang.IllegalArgumentException: Property 'sqlSessionFactory' or 'sqlSessionTemplate' are required

    已检查jar包是否引入 <dependency> <groupId>org.mybatis.spring.boot</groupId> <artifactId ...

  4. 安装mysql遇到的问题

    想在自己的PC上安装mysql服务器,首先在官网下载mysql的安装文件. MySQL安装文件分两种 .msi和.zip ,.msi需要安装,.zip文件需要配置环境变量. 我首先下载的是不需要安装的 ...

  5. 揭秘webpack loader

    前言 Loader(加载器) 是 webpack 的核心之一.它用于将不同类型的文件转换为 webpack 可识别的模块.本文将尝试深入探索 webpack 中的 loader,揭秘它的工作原理,以及 ...

  6. VSCODE更改文件时,提示EACCES permission denied的解决办法(mac电脑系统)

    permission denied:权限问题 具体解决办法: 1.在项目文件夹右键-显示简介-点击右下角解锁 2.权限全部设置为读与写 3.最关键一步:点击"应用到包含的项目",这 ...

  7. 痞子衡嵌入式:语音处理工具pzh-speech诞生记(2)- 界面构建(wxFormBuilder3.8.0)

    大家好,我是痞子衡,是正经搞技术的痞子.今天痞子衡给大家介绍的是语音处理工具pzh-py-speech诞生之界面构建. 之前痞子衡设计过一个串口调试助手pzh-py-com,也专门写过一篇关于其界面构 ...

  8. Ubuntu16手动安装OpenStack

    记录大佬的博客全文转载于https://www.voidking.com/dev-ubuntu16-manual-openstack-env/ 前言 <Ubuntu16安装OpenStack&g ...

  9. Java基础系列1:深入理解Java数据类型

    Java基础系列1:深入理解Java数据类型 当初学习计算机的时候,教科书中对程序的定义是:程序=数据结构+算法,Java基础系列第一篇就聊聊Java中的数据类型. 本篇聊Java数据类型主要包括四个 ...

  10. JVM第一弹

    JVM第一弹 基本概念 JVM是可运行java代码的假想计算机,包括一套字节码指令集,一组寄存器,一个栈,一个垃圾回收.堆和一个存储方法域.JVM是运行在操作系统之上的,它与硬件没有直接的交互. 运行 ...