实现多线程爬取数据并保存到mongodb

多线程爬取二手房网页并将数据保存到mongodb的代码：

import pymongo

import threading

import time

from lxml import etree

import requests

from queue import Queue

# URL template for one paginated index page; the placeholder takes the page number.
index_url = 'https://m.lianjia.com/gz/ershoufang/pg{}/'

# URL template for a detail page.
# NOTE(review): presumably filled with the relative href scraped from the
# index page -- nothing in this file uses it yet, so confirm before relying on it.
detail_url = 'https://m.lianjia.com{}'

# Number of index pages to crawl.
# NOTE(review): the assignment had no value in the source (a syntax error);
# 10 is a placeholder -- set it to the page count you actually want.
INDEX_PAGE_NUM = 10

# 定义一个类

# 0定义主页url队列、主页html队列、详情页url队列、html队列、内容队列

# 1获取首页url并解析详情页url

# 2获取详情页的内容

# 3保存内容

# 4设置多线程调用方法

# Connect to the MongoDB server on localhost.

client = pymongo.MongoClient('localhost')

# Select the database that stores the scraped listings.

db = client['ershoufang']

# Collection names: one for index-page records, one for detail-page records.

index = 'index_info'

detail = 'detail_info'

class lianJia():
    """Multi-threaded scraper for Lianjia second-hand housing index pages.

    Work flows through three queues:
    ``index_url_queue`` -> ``html_index_queue`` -> ``index_content_queue``.
    Fetch workers turn URLs into HTML, parse workers turn HTML into lists of
    record dicts, and one save worker writes the records to MongoDB.

    Relies on the module-level names ``INDEX_PAGE_NUM``, ``index_url``,
    ``db``, ``requests`` and ``etree``.
    """

    def __init__(self, fetch_threads=5, parse_threads=2, request_timeout=10):
        """Create the pipeline queues.

        Args:
            fetch_threads: number of threads downloading index pages.
            parse_threads: number of threads parsing downloaded HTML.
            request_timeout: seconds before an HTTP request is abandoned.

        NOTE(review): the thread counts in the source were missing
        (``range()``); the defaults here are placeholders -- tune as needed.
        """
        self.fetch_threads = fetch_threads
        self.parse_threads = parse_threads
        self.request_timeout = request_timeout

        self.index_url_queue = Queue()
        self.html_index_queue = Queue()
        self.index_content_queue = Queue()
        # Not used by any method yet; get_detail() is still a stub.
        self.detail_content_queue = Queue()

    @staticmethod
    def _first_or_none(values):
        """Return the first element of an XPath result list, or None if empty."""
        return values[0] if values else None

    def get_index(self):
        """Put the URL of every index page onto ``index_url_queue``.

        NOTE(review): the page offset was missing in the source (``i+``);
        ``i + 1`` is assumed so that numbering starts at ``pg1``.
        """
        for i in range(INDEX_PAGE_NUM):
            self.index_url_queue.put(index_url.format(i + 1))

    def get_index_html(self):
        """Worker loop: download each queued URL and queue the decoded HTML."""
        while True:
            url = self.index_url_queue.get()
            try:
                html = requests.get(url, timeout=self.request_timeout).content.decode()
                # Enqueue downstream work BEFORE task_done(), so that when
                # index_url_queue.join() returns all HTML is already queued.
                self.html_index_queue.put(html)
            except Exception as exc:
                # Thread boundary: report and keep the worker alive.
                print('请求主页失败：', url, exc)
            finally:
                # Always acknowledge the item, otherwise join() blocks forever.
                self.index_url_queue.task_done()

    def _parse_index_html(self, html):
        """Extract one record dict per listing from an index page's HTML.

        Returns:
            A list of dicts with keys ``title``, ``detail_url``,
            ``index_detail``, ``total_price`` and ``average_price``; a field
            is None when its XPath matched nothing.
        """
        tree = etree.HTML(html)
        # position()>1 skips the first <li> of the list.
        rows = tree.xpath('''//ul[@class='lists']/li[position()>1]''')
        first = self._first_or_none
        records = []
        for row in rows:
            records.append({
                'title': first(row.xpath('''./div/div[@class='item_list']/div[1]/text()''')),
                'detail_url': first(row.xpath('''./a/@href''')),
                'index_detail': first(row.xpath('''./div/div[2]/div[2]/text()''')),
                'total_price': first(row.xpath('''./div/div[2]/div[position()>2]/span[1]/em/text()''')),
                'average_price': first(row.xpath('''./div/div[@class='item_list']/div[3]/span[2]/text()''')),
            })
        return records

    def parse_index(self):
        """Worker loop: parse each queued HTML page into a list of records."""
        while True:
            html = self.html_index_queue.get()
            try:
                # One list per page is queued (not one entry per record).
                self.index_content_queue.put(self._parse_index_html(html))
            except Exception as exc:
                print('解析主页失败：', exc)
            finally:
                self.html_index_queue.task_done()

    def get_detail(self):
        """Placeholder for detail-page scraping; not implemented."""
        pass

    def save_content(self):
        """Worker loop: insert each complete record into ``db['index_info']``.

        Records missing ``title``, ``total_price`` or ``average_price`` are
        skipped.
        """
        while True:
            records = self.index_content_queue.get()
            try:
                for record in records:
                    if (record['title'] is None
                            or record['total_price'] is None
                            or record['average_price'] is None):
                        print('该数据为空，不进行保存')
                    else:
                        # Collection.insert() was removed in PyMongo 4.
                        db['index_info'].insert_one(record)
                        print('保存数据成功')
            except Exception as exc:
                print('保存数据失败：', exc)
            finally:
                self.index_content_queue.task_done()

    def run(self):
        """Start the worker threads and block until every queue is drained."""
        # Fill the URL queue synchronously. If this ran in its own thread the
        # join() calls below could see empty queues and return immediately.
        self.get_index()

        workers = []
        for _ in range(self.fetch_threads):
            workers.append(threading.Thread(target=self.get_index_html))
        for _ in range(self.parse_threads):
            workers.append(threading.Thread(target=self.parse_index))
        workers.append(threading.Thread(target=self.save_content))

        for worker in workers:
            # Workers loop forever; daemon threads die with the main thread.
            worker.daemon = True
            worker.start()

        # Join in pipeline order: each stage enqueues downstream work before
        # acknowledging its own item, so no stage is joined prematurely.
        for q in (self.index_url_queue, self.html_index_queue, self.index_content_queue):
            q.join()

if __name__=='__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    started = time.perf_counter()

    lianJia().run()

    elapsed = time.perf_counter() - started
    print('程序总耗时：', elapsed)

多线程爬取糗事百科：

# coding=utf-8

import requests

from lxml import etree

import threading

from queue import Queue

# https://docs.python.org/3/library/queue.html#module-queue

# 队列使用方法简介

# q.qsize() 返回队列的大小

# q.empty() 如果队列为空，返回True,反之False

# q.full() 如果队列满了，返回True,反之False

# q.full 与 maxsize 大小对应

# q.get([block[, timeout]]) 获取队列，timeout等待时间

# q.get_nowait() 相当q.get(False)

# q.put(item) 写入队列，timeout等待时间

# q.put_nowait(item) 相当q.put(item, False)

# q.task_done() 在完成一项工作之后，q.task_done() 函数向任务已经完成的队列发送一个信号

# q.join() 实际上意味着等到队列为空，再执行别的操作

class QiubaiSpdier:
    """Multi-threaded scraper for Qiushibaike index pages.

    Work flows through three queues:
    ``url_queue`` -> ``html_queue`` -> ``content_queue``. Fetch workers
    download pages, parse workers extract item dicts, and one save worker
    consumes the results (currently it only prints them).
    """

    def __init__(self, fetch_threads=20, parse_threads=2, request_timeout=10):
        """Set up the URL template, request headers and pipeline queues.

        Args:
            fetch_threads: number of threads downloading pages.
            parse_threads: number of threads parsing downloaded HTML.
            request_timeout: seconds before an HTTP request is abandoned.
        """
        self.fetch_threads = fetch_threads
        self.parse_threads = parse_threads
        self.request_timeout = request_timeout

        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    @staticmethod
    def _first_or_none(values):
        """Return the first element of an XPath result list, or None if empty."""
        return values[0] if values else None

    @staticmethod
    def _prefixed_or_none(values):
        """Return ``"https:"`` + first element, or None if the list is empty."""
        return "https:" + values[0] if values else None

    @staticmethod
    def _gender_from_class(values):
        """Derive the gender label from the first ``class`` attribute value.

        Takes the last space-separated token and strips the ``Icon`` suffix;
        returns None when the list is empty.
        """
        if not values:
            return None
        return values[0].split(" ")[-1].replace("Icon", "")

    def get_url_list(self):
        """Put the 13 index-page URLs (pages 1 to 13) onto ``url_queue``."""
        for page in range(1, 14):
            self.url_queue.put(self.url_temp.format(page))

    def parse_url(self):
        """Worker loop: download each queued URL and queue the decoded HTML."""
        while True:
            url = self.url_queue.get()
            try:
                print(url)
                response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
                # Enqueue downstream work BEFORE task_done(), so that when
                # url_queue.join() returns all HTML is already queued.
                self.html_queue.put(response.content.decode())
            except Exception as exc:
                # Thread boundary: report and keep the worker alive.
                print("请求失败：", url, exc)
            finally:
                # Always acknowledge the item, otherwise join() blocks forever.
                self.url_queue.task_done()

    def _extract_items(self, html_str):
        """Parse one index page into a list of item dicts.

        Each dict has keys ``content``, ``author_gender``, ``auhtor_age``,
        ``content_img``, ``author_img`` and ``stats_vote``; every field other
        than ``content`` is None when its XPath matched nothing.
        """
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # one <div> per post
        content_list = []
        for div in div_list:
            item = {}
            item["content"] = [
                text.replace("\n", "")
                for text in div.xpath(".//div[@class='content']/span/text()")
            ]
            item["author_gender"] = self._gender_from_class(
                div.xpath(".//div[contains(@class,'articleGender')]/@class"))
            # The misspelled key is kept as-is: it is part of the emitted data.
            item["auhtor_age"] = self._first_or_none(
                div.xpath(".//div[contains(@class,'articleGender')]/text()"))
            item["content_img"] = self._prefixed_or_none(
                div.xpath(".//div[@class='thumb']/a/img/@src"))
            item["author_img"] = self._prefixed_or_none(
                div.xpath(".//div[@class='author clearfix']//img/@src"))
            item["stats_vote"] = self._first_or_none(
                div.xpath(".//span[@class='stats-vote']/i/text()"))
            content_list.append(item)
        return content_list

    def get_content_list(self):
        """Worker loop: parse each queued HTML page and queue its item list."""
        while True:
            html_str = self.html_queue.get()
            try:
                self.content_queue.put(self._extract_items(html_str))
            except Exception as exc:
                print("解析失败：", exc)
            finally:
                self.html_queue.task_done()

    def save_content_list(self):
        """Worker loop: consume parsed item lists (currently prints each item)."""
        while True:
            content_list = self.content_queue.get()
            try:
                for item in content_list:
                    print(item)
            finally:
                self.content_queue.task_done()

    def run(self):
        """Start the worker threads and block until every queue is drained."""
        # Fill the URL queue synchronously. If this ran in its own thread the
        # join() calls below could see empty queues and return immediately.
        self.get_url_list()

        thread_list = []
        for _ in range(self.fetch_threads):
            thread_list.append(threading.Thread(target=self.parse_url))
        for _ in range(self.parse_threads):
            thread_list.append(threading.Thread(target=self.get_content_list))
        thread_list.append(threading.Thread(target=self.save_content_list))

        for t in thread_list:
            # Workers loop forever; daemon threads die with the main thread.
            t.daemon = True
            t.start()

        # Join in pipeline order: each stage enqueues downstream work before
        # acknowledging its own item, so no stage is joined prematurely.
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()

        print("主线程结束")

if __name__ == '__main__':
    # Script entry point: build the spider and run the whole pipeline.
    QiubaiSpdier().run()

# 如果没有调用 task_done 方法，q.join() 会一直阻塞，程序最终会卡着不动，无法终止

# 线程的设计注意：耗时的操作要分配一些线程

实现多线程爬取数据并保存到mongodb的更多相关文章

scrapy爬取海量数据并保存在MongoDB和MySQL数据库中
前言一般我们都会将数据爬取下来保存在临时文件或者控制台直接输出,但对于超大规模数据的快速读写,高并发场景的访问,用数据库管理无疑是不二之选.首先简单描述一下MySQL和MongoDB的区别:MySQ ...
sumafan:python爬虫多线程爬取数据小练习（附答案）
抓取 https://www.cnbeta.com/ 首页中新闻内容页网址, 抓取内容例子: https://hot.cnbeta.com/articles/game/825125 将抓取下来的内容页 ...
吴裕雄--天生自然python数据清洗与数据可视化：MYSQL、MongoDB数据库连接与查询、爬取天猫连衣裙数据保存到MongoDB
本博文使用的数据库是MySQL和MongoDB数据库.安装MySQL可以参照我的这篇博文:https://www.cnblogs.com/tszr/p/12112777.html 其中操作Mysql使 ...
吴裕雄--天生自然PYTHON爬虫：安装配置MongoDBy和爬取天气数据并清洗保存到MongoDB中
1.下载MongoDB 官网下载:https://www.mongodb.com/download-center#community 上面这张图选择第二个按钮上面这张图直接Next 把bin路径添加 ...
python多线程爬取斗图啦数据
python多线程爬取斗图啦网的表情数据使用到的技术点 requests请求库 re 正则表达式 pyquery解析库,python实现的jquery threading 线程 queue 队列 ' ...
Python爬虫入门教程 11-100 行行网电子书多线程爬取
行行网电子书多线程爬取-写在前面最近想找几本电子书看看,就翻啊翻,然后呢,找到了一个叫做周读的网站 ,网站特别好,简单清爽,书籍很多,而且打开都是百度网盘可以直接下载,更新速度也还可以,于是乎, ...
借助Chrome和插件爬取数据
工具 Chrome浏览器 TamperMonkey ReRes Chrome浏览器 chrome浏览器是目前最受欢迎的浏览器,没有之一,它兼容大部分的w3c标准和ecma标准,对于前端工程师在开发过程 ...
使用selenium 多线程爬取爱奇艺电影信息
使用selenium 多线程爬取爱奇艺电影信息转载请注明出处. 爬取目标:每个电影的评分.名称.时长.主演.和类型爬取思路: 源文件:(有注释) from selenium import webd ...
Python爬虫入门教程 14-100 All IT eBooks多线程爬取
All IT eBooks多线程爬取-写在前面对一个爬虫爱好者来说,或多或少都有这么一点点的收集癖 ~ 发现好的图片,发现好的书籍,发现各种能存放在电脑上的东西,都喜欢把它批量的爬取下来. 然后放着 ...

随机推荐

纯原生JS大图轮播
CSS部分: CSS: <style type="text/css"> #banner { position: relative; width: 500px; heig ...
A1140. Look-and-say Sequence
Look-and-say sequence is a sequence of integers as the following: D, D1, D111, D113, D11231, D112213 ...
WebAPI接口安全校验
通过网上查看相关WebAPI接口验证的方法,整理了一下,直接上代码,功能不复杂,有问题留言, //--------------------------------------------------- ...
JDK几个高版本的新特性
JDK 高版本的新特性 1.JDK5的新特性: 自动拆装箱见Integer部分笔记泛型增强for循环静态导入可变参数见集合部分笔记枚举是指将变量的值一一列出来,变量的值只限于列举出来的 ...
关于code::blocks的编译速度问题
在一个程序写好之后,按下F9,便可以进行编译并且运行,在2018年的寒假之中,编译速度一直困扰着我,因为每次编译都需要十秒左右的时间,体验极差.而此前,编译时间一直保持在0 second. 经过我的多 ...
5款 Mac 常用PDF阅读和编辑软件推荐
PDF和Word.TXT等文档一样,都是我们最常用的文档格式,那么一款好用的浏览或编辑PDF的工具就很有必要了,今天和大家分享5款Mac上优秀的PDF阅读和编辑工具. 以下内容来自[风云社区 SCOE ...
sklearn-woe/iv-乳腺癌分类器实战
sklearn实战-乳腺癌细胞数据挖掘 https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campai ...
LSTM时间序列预测及网络层搭建
一.LSTM预测未来一年某航空公司的客运流量给你一个数据集,只有一列数据,这是一个关于时间序列的数据,从这个时间序列中预测未来一年某航空公司的客运流量.数据形式: 二.实战 1)数据下载你可以go ...
OS + Windows 10 / office excel vlookup / CredSSP
s https://support.microsoft.com/zh-cn/help/10749/windows-10-find-product-key 查找 Windows 7 或 Windows ...
Http接口开发（自测服务端客户端）
一. Http与Https的区别 1.概念 HTTP:是互联网上应用最为广泛的一种网络协议,是一个客户端和服务器端请求和应答的标准(TCP),用于从www服务器传 ...

实现多线程爬取数据并保存到mongodb

实现多线程爬取数据并保存到mongodb的更多相关文章

随机推荐

热门专题