Writing a Python Crawler to Scrape 58.com Second-Hand Trading Data
The crawl collected about 140K records into MongoDB; the Charts library renders the statistics, and a sample chart is shown at the end.
Module 1: Build the category URL list
from bs4 import BeautifulSoup
import requests, pymongo

main_url = 'http://bj.58.com/sale.shtml'
client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text, 'lxml')
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

link_list = []
count = 0
for link in sub_menu_link:
    link = 'http://bj.58.com' + link.get('href')
    # print(link)
    if link == 'http://bj.58.com/shoujihao/':
        pass  # skip phone-number listings
    elif link == 'http://bj.58.com/tongxunyw/':
        pass  # skip telecom-business listings
    elif link == 'http://bj.58.com/tiaozao/':
        # the 'tiaozao' link appears several times in the menu; keep it once
        count += 1
        if count == 1:
            data = {'link': link}
            link_list.append(data)
    else:
        data = {'link': link}
        link_list.append(data)

for i in link_list:
    tab_link_list.insert(i)
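A quick read-back confirms what landed in MongoDB (a minimal sketch reusing the connection above; the cursor count call is the pymongo-2.x-era API used throughout this post):

# Sketch: list the stored category links and count them.
for doc in tab_link_list.find({}, {'link': 1, '_id': 0}):
    print(doc['link'])
print(tab_link_list.find().count(), 'categories stored')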
Module 2: Fetch the detail info for each item
from bs4 import BeautifulSoup
import requests, re, pymongo, sys
from multiprocessing import Pool

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
# detail_link = tc_58['detail_link']
tab_link_list = tc_58['link_list']
# tc_58_data = client['58tcData']

def getDetailUrl(page_url, tab):
    url_list = []
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')
    # collect the detail-page URLs, dropping the query string
    for url in detail_url:
        url_list.append(url.get('href').split('?')[0])
    # insert into MongoDB under a '<tab>_list' collection
    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_list = tc_58[tab + '_list']
    for i in url_list:
        count += 1
        tab_list.insert({'link': i})
    return count
original_price_patt = re.compile('原价:(.+)')

def getInfo(detail_url):
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.title.text.strip()
        view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
        current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None
        original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        original_price = re.findall(original_price_patt, original_price) if original_price else None
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None
        seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
        # An earlier version counted the seller-rating stars by splitting raw HTML:
        # level = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > span')
        # level = str(level[0]).split('\n')
        # full_count = 0
        # half_count = 0
        # for j in level:
        #     if '<span class="icon_png "></span>' == j:
        #         full_count += 1
        #     elif '<span class="icon_png smallScore"></span>' == j:
        #         half_count += 1
        # find_all on the class names is simpler:
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))
        level_count = {'full': full_count, 'half': half_count}
        desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None
        data = {
            'title': title,
            'view_count': view_count,
            'want_count': want_count,
            'current_price': current_price,
            'original_price': original_price,
            'location': location,
            'tag': tag,
            'seller_name': seller_name,
            # 'level': level,
            'level_count': level_count,
            'desc': desc,
            'link': detail_url
        }
        return data
    except Exception:
        print(sys.exc_info()[0], sys.exc_info()[1])
        return None
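Before launching the full crawl, getInfo can be spot-checked on a single stored link (a minimal sketch; 'shouji_list' is just an example collection name and assumes the list-building step below has already run for that category):

# Sketch: pull one stored detail link from a list collection and parse it.
one = tc_58['shouji_list'].find_one({}, {'link': 1, '_id': 0})
if one:
    print(getInfo(one['link']))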
# for i in tab_link_list.find({}, {'link': 1, '_id': 0}):
#     print(i['link'])
#     getDetailUrl(i['link'])

# Pattern observed on the site: each category runs to at most 70 listing pages.
def insertDetailLin(sub_menu_list):
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for i in sub_menu_list.find({}, {'link': 1, '_id': 0}):
    # for i in [{'link': 'http://bj.58.com/shouji/'}]:
        i = i['link']
        sub_menu_name = re.findall(patt, i)[0]
        print(sub_menu_name + ': ', end='')
        url_list = []
        for j in range(1, 71):
            link = i + 'pn' + str(j)
            url_list.append(link)
        cnt = 0
        for k in url_list:
            cnt = cnt + getDetailUrl(k, sub_menu_name)
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name + '_list')
    return tab_list
# for i in tab_link_list.find({}, {'link': 1, '_id': 0}):
#     print(i)
# insertDetailLin(tab_link_list)

allMenCollectionName = tc_58.collection_names()
# allMenCollectionName.remove('detail_link')
allMenCollectionName.remove('link_list')
def insertData(tab_name):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenLei = tab_name[:-5]  # strip the '_list' suffix to get the category name
    fenLei = tc_58_data[fenLei + '_data']
    tab_name = tc_58[tab_name]
    # print(tab_name)
    for i in tab_name.find({}, {'link': 1, '_id': 0}):
        data = getInfo(i['link'])
        if data:  # getInfo returns None when a fetch or parse fails
            fenLei.insert(data)
def getContinuingly(fenlei):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    tc_58 = client['58tc']
    fenlei_data = tc_58_data[fenlei + '_data']
    fenlei_list = tc_58[fenlei + '_list']
    # resume support: diff the URLs already scraped against the full URL list
    db_urls = [item['link'] for item in fenlei_data.find()]
    index_url = [item['link'] for item in fenlei_list.find()]
    x = set(db_urls)
    y = set(index_url)
    rest_of_urls = y - x
    return list(rest_of_urls)
def startgetContinuingly(fenlei):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenLei = tc_58_data[fenlei + '_data']
    # rest_of_urls = getContinuingly('chuang')
    rest_of_urls = getContinuingly(fenlei)
    # print(rest_of_urls)
    for i in rest_of_urls:
        data = getInfo(i)
        if data:
            fenLei.insert(data)
# startgetContinuingly('bijiben')

pool = Pool()
pool.map(insertData, allMenCollectionName)
# pool.map(insertData, ['chuang_list'])
# insertData(allMenCollectionName)
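One portability caveat: pool.map runs insertData in worker processes, so on platforms that spawn rather than fork (Windows, and macOS on newer Pythons) the module-level crawl should sit behind a main guard, roughly as in this sketch. Each worker already opens its own MongoClient inside insertData, which is the right pattern for multiprocessing:

# Sketch: guarded entry point so importing this module doesn't re-trigger the crawl.
if __name__ == '__main__':
    pool = Pool()
    pool.map(insertData, allMenCollectionName)
    pool.close()
    pool.join()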
Module 3: Analysis
from collections import Counter
import pymongo, charts

def getTotalCount(database, host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    # print(tab_list)
    count = 0
    for i in tab_list:
        count = count + db[i].find({}).count()
    print(count)
    return count
#getTotalCount('58tcDataNew')
#14700
def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    classify = classify + '_data'
    # location_list = [i['location'][3:] if i['location'] != '' and i['location'][:2] == '北京' else None
    #                  for i in db['bijiben_data'].find(filter={}, projection={'location': 1, '_id': 0})]
    # keep only Beijing listings, stripping the leading '北京 - ' prefix
    location_list = [i['location'][3:] for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                     if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
    loc_name = list(set(location_list))
    dic_count = {}
    for i in loc_name:
        dic_count[i] = location_list.count(i)
    return dic_count
# bijiben_area_count = getAreaByClassify(classify='yueqi')
# print(bijiben_area_count)
# danche_area_count = getAreaByClassify(classify='danche')
# sum_area_count = Counter(bijiben_area_count) + Counter(danche_area_count)
# print(sum_area_count)
def myCounter(L, database='58tcDataNew', host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    # base case: a zero-valued Counter keyed by category name
    dic_0 = {}
    for i in tab_list:
        loc = i[:-5] + '_area_count'
        dic_0[loc] = 0
    if not L:
        return Counter(dic_0)
    else:
        # recursively sum the per-category dicts into one Counter
        return Counter(L[0]) + myCounter(L[1:])
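Since Counter addition drops non-positive counts, the zero-valued seed dict never survives the sum, and myCounter reduces to an ordinary fold over Counters. An equivalent non-recursive sketch:

# Sketch: same merge without recursion or the database round-trip.
from functools import reduce
def merge_area_counts(dicts):
    return reduce(lambda a, b: a + b, map(Counter, dicts), Counter())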
def getAllCount(database='58tcDataNew', host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_all_count = {}
    for i in tab_list:
        dic = getAreaByClassify(i[:-5])
        loc = i[:-5] + '_area_count'
        dic_all_count[loc] = dic
    dic_val = [dic_all_count[x] for x in dic_all_count]
    my = myCounter(dic_val)
    dic_all_count['total_area_count'] = dict(my)
    return dic_all_count
dic_all_count = getAllCount()
# print(dic_all_count['bijiben_area_count'])
# print(dic_all_count['total_area_count'])

tmp_list = []
for i in dic_all_count['total_area_count']:
    data = {
        'name': i,
        'data': [dic_all_count['total_area_count'][i]],
        'type': 'column'
    }
    tmp_list.append(data)
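Highcharts draws series in insertion order, so sorting tmp_list by count first makes the columns easier to compare (an optional tweak):

# Optional: order districts by count, descending, before plotting.
tmp_list.sort(key=lambda s: s['data'][0], reverse=True)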
options = {
    'chart': {'zoomType': 'xy'},
    'title': {'text': '北京58同城二手交易信息发布区域分布图'},
    'subtitle': {'text': '数据来源: 58.com'},
    'xAxis': {'categories': ['']},
    'yAxis': {'title': {'text': '数量'}},
    'plotOptions': {'column': {'dataLabels': {'enabled': True}}}
}
charts.plot(tmp_list, show='inline', options=options)