Crawled about 140,000 records, stored them in MongoDB, and used the Charts library to present the aggregate statistics; a sample chart is shown at the end.

Module 1: Fetch the list of category URLs

from bs4 import BeautifulSoup
import requests,pymongo

main_url = 'http://bj.58.com/sale.shtml'
client = pymongo.MongoClient('localhost',27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text,'lxml')
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

link_list = []
count = 0
for link in sub_menu_link:
    link = 'http://bj.58.com' + link.get('href')
    #print(link)
    if link == 'http://bj.58.com/shoujihao/':
        pass    # skip the phone-number category
    elif link == 'http://bj.58.com/tongxunyw/':
        pass    # skip the telecom-business category
    elif link == 'http://bj.58.com/tiaozao/':
        count += 1          # this link appears more than once in the menu
        if count == 1:      # keep only its first occurrence
            data = {'link':link}
            link_list.append(data)
    else:
        data = {'link': link}
        link_list.append(data)

for i in link_list:
    tab_link_list.insert(i)
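
Re-running Module 1 as written would store every category link a second time. A minimal guard, assuming the same local MongoDB as above (the unique index is not in the original code), lets the database reject duplicates on its own:

from pymongo.errors import DuplicateKeyError

# a unique index on 'link' makes a repeated insert of the same category URL fail fast
tab_link_list.create_index('link', unique=True)
for i in link_list:
    try:
        tab_link_list.insert(i)
    except DuplicateKeyError:
        pass  # this category link is already stored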

Module 2: Fetch the detail information for every item

from bs4 import BeautifulSoup
import requests,re,pymongo,sys
from multiprocessing import Pool

client = pymongo.MongoClient('localhost',27017)
tc_58 = client['58tc']
# detail_link = tc_58['detail_link']
tab_link_list = tc_58['link_list']
# tc_58_data = client['58tcData']

def getDetailUrl(page_url,tab):
    url_list = []
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text,'lxml')
    detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')

    # collect the detail-page URLs
    for url in detail_url:
        url_list.append(url.get('href').split('?')[0])

    # insert into MongoDB
    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_list = tc_58[tab+'_list']
    for i in url_list:
        count += 1
        tab_list.insert({'link':i})
    return count
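
getDetailUrl can be exercised on a single list page; the pn1 suffix follows the same paging scheme that insertDetailLin builds below:

# fetch page 1 of the phone category and store its detail links in the shouji_list collection
n = getDetailUrl('http://bj.58.com/shouji/pn1', 'shouji')
print(n, 'detail links inserted')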

original_price_patt = re.compile('原价:(.+)')
def getInfo(detail_url):
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text,'lxml')
        title = soup.title.text.strip()
        view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
        current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None
        original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        original_price = re.findall(original_price_patt,original_price) if original_price else None
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None
        seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
        # level = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > span')
        # level = str(level[0]).split('\n')
        #
        # full_count = 0
        # half_count = 0
        # for j in level:
        #     if '<span class="icon_png "></span>' == j:
        #         full_count += 1
        #     elif '<span class="icon_png smallScore"></span>' == j:
        #         half_count += 1
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))

        level_count = {'full':full_count,'half':half_count}
        desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None
        data = {
            'title':title,
            'view_count':view_count,
            'want_count':want_count,
            'current_price':current_price,
            'original_price':original_price,
            'location':location,
            'tag':tag,
            'seller_name':seller_name,
            #'level':level,
            'level_count':level_count,
            'desc':desc,
            'link':detail_url
        }
        return data
    except Exception:
        # report the failure and skip this listing
        print(sys.exc_info()[0], sys.exc_info()[1])
        return None
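
A quick smoke test for getInfo, assuming a list collection such as shouji_list has already been filled by getDetailUrl: pull one stored link and parse it.

# parse a single stored detail link; prints the data dict, or None if the page failed to parse
one = tc_58['shouji_list'].find_one()
if one:
    print(getInfo(one['link']))
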
# for i in tab_link_list.find({},{'link':1,'_id':0}):
#     print(i['link'])
#     getDetailUrl(i['link'])

# observed pattern: each category list shows at most 70 pages
def insertDetailLin(sub_menu_list):
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for i in sub_menu_list.find({},{'link':1,'_id':0}):
    #for i in [{'link':'http://bj.58.com/shouji/'}]:
        i = i['link']
        sub_menu_name = re.findall(patt,i)[0]
        print(sub_menu_name+': ',end='')
        url_list = []
        for j in range(1,71):
            link = i + 'pn' + str(j)
            url_list.append(link)

        cnt = 0
        for k in url_list:
            cnt = cnt + getDetailUrl(k, sub_menu_name)
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name+'_list')
    return tab_list

# for i in tab_link_list.find({},{'link':1,'_id':0}):
#     print(i)

#insertDetailLin(tab_link_list)

allMenCollectionName = tc_58.collection_names()
#allMenCollectionName.remove('detail_link')
allMenCollectionName.remove('link_list')    # keep only the per-category *_list collections
def insertData(tab_name):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenLei = tab_name[:-5]               # strip the '_list' suffix to get the category name
    fenLei = tc_58_data[fenLei+'_data']
    tab_name = tc_58[tab_name]
    #print(tab_name)
    for i in tab_name.find({},{'link':1,'_id':0}):
        data = getInfo(i['link'])
        if data:                         # getInfo returns None on failure; don't insert None
            fenLei.insert(data)

def getContinuingly(fenlei):
    client = pymongo.MongoClient('localhost',27017)
    tc_58_data = client['58tcDataNew']
    tc_58 = client['58tc']
    fenlei_data = tc_58_data[fenlei+'_data']
    fenlei_list = tc_58[fenlei+'_list']
    db_urls = [item['link'] for item in fenlei_data.find()]      # already scraped
    index_url = [item['link'] for item in fenlei_list.find()]    # all discovered
    x = set(db_urls)
    y = set(index_url)
    rest_of_urls = y - x                                         # still to be scraped
    return list(rest_of_urls)
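
The set difference is what makes the crawl restartable; a minimal illustration of the idea with made-up URLs:

scraped = {'http://bj.58.com/a', 'http://bj.58.com/b'}
discovered = {'http://bj.58.com/a', 'http://bj.58.com/b', 'http://bj.58.com/c'}
print(discovered - scraped)  # {'http://bj.58.com/c'} -- only the unfinished URL is re-crawled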

def startgetContinuingly(fenlei):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenLei = tc_58_data[fenlei+'_data']
    #rest_of_urls = getContinuingly('chuang')
    rest_of_urls = getContinuingly(fenlei)
    #print(rest_of_urls)
    for i in rest_of_urls:
        data = getInfo(i)
        if data:                         # skip listings getInfo could not parse
            fenLei.insert(data)

# startgetContinuingly('bijiben')
if __name__ == '__main__':  # Pool's child processes re-import this module, so guard the crawl entry point
    pool = Pool()
    pool.map(insertData,allMenCollectionName)
    #pool.map(insertData,['chuang_list'])
    #insertData(allMenCollectionName)

Module 3: Analysis

from collections import Counter
import pymongo,charts

def getTotalCount(database,host=None,port=None):
    client = pymongo.MongoClient(host,port)
    db = client[database]
    tab_list = db.collection_names()
    #print(tab_list)
    count = 0
    for i in tab_list:
        count = count + db[i].find({}).count()
    print(count)
    return count

#getTotalCount('58tcDataNew')
#14700

def getAreaByClassify(classify,database='58tcDataNew',host=None,port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    classify = classify + '_data'
    #location_list = [ i['location'][3:] if i['location'] != '' and i['location'][:2] == '北京' else None for i in db['bijiben_data'].find(filter={},projection={'location':1,'_id':0})]
    # keep Beijing rows only and strip the leading '北京' from the district name
    location_list = [i['location'][3:] for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                     if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
    loc_name = list(set(location_list))
    dic_count = {}
    for i in loc_name:
        dic_count[i] = location_list.count(i)
    return dic_count

# yueqi_area_count = getAreaByClassify(classify='yueqi')
# print(yueqi_area_count)
# danche_area_count = getAreaByClassify(classify='danche')
# sum_area_count = Counter(yueqi_area_count) + Counter(danche_area_count)
# print(sum_area_count)
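
Counter addition, used in the commented lines above and inside myCounter below, merges two district tallies key by key; a standalone example with made-up counts:

from collections import Counter
print(Counter({'朝阳': 2, '海淀': 1}) + Counter({'朝阳': 1}))
# Counter({'朝阳': 3, '海淀': 1})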

def myCounter(L,database='58tcDataNew',host=None,port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    # base case: a zero-valued Counter (zero entries drop out when Counters are added)
    dic_0 = {}
    for i in tab_list:
        loc = i[:-5] + '_area_count'
        dic_0[loc] = 0

    if not L:
        return Counter(dic_0)
    else:
        # recursively fold the list of per-category dicts into a single Counter
        return Counter(L[0]) + myCounter(L[1:])

def getAllCount(database='58tcDataNew',host=None,port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_all_count = {}
    for i in tab_list:
        dic = getAreaByClassify(i[:-5])
        loc = i[:-5] + '_area_count'
        dic_all_count[loc] = dic

    dic_val = [dic_all_count[x] for x in dic_all_count]
    my = myCounter(dic_val)

    dic_all_count['total_area_count'] = dict(my)
    return dic_all_count

dic_all_count = getAllCount()
# print(dic_all_count['bijiben_area_count'])
# print(dic_all_count['total_area_count'])
#
#

tmp_list = []
for i in dic_all_count['total_area_count']:
    data = {
        'name':i,
        'data':[dic_all_count['total_area_count'][i]],
        'type':'column'
    }
    tmp_list.append(data)

options = {
    'chart'   : {'zoomType':'xy'},
    'title'   : {'text': '北京58同城二手交易信息发布区域分布图'},
    'subtitle': {'text': '数据来源: 58.com'},
    'xAxis'   : {'categories': ['']},
    'yAxis'   : {'title':{'text':'数量'}},
    'plotOptions': {'column': {'dataLabels': {'enabled': True}}}
    }
charts.plot(tmp_list,show='inline',options=options)
