Breastcancer社区评论下载

首页

某个社区

某社区的一个话题

目标：获取这个网站所有话题的所有评论相关信息

python实现

# -*- coding: utf-8 -*-

"""

@Datetime: 2019/3/17

@Author: Zhang Yafei

"""

import functools

import os

import re

import traceback

import time

from concurrent.futures import ThreadPoolExecutor, wait, as_completed, ProcessPoolExecutor

from itertools import chain

from urllib.request import urljoin

import pandas as pd

from lxml import etree

from selenium import webdriver

from selenium.webdriver.chrome.options import Options

# 控制台输出所有列

pd.set_option('display.max_columns', None)

forum_info = pd.read_csv('all_forum_info.csv')

def timeit(fun):

    @functools.wraps(fun)

    def wrapper(*args, **kwargs):

        start_time = time.time()

        res = fun(*args, **kwargs)

        print('运行时间为%.6f' % (time.time() - start_time))

        return res

    return wrapper

class Download(object):

    """ 请求的下载 """

    def __init__(self, url):

        self.url = url

        self.forum_topic = '【{}】'.format(forum_info[forum_info.forum_url == url[0]].forum_topic.values[0])

        self.dir_path = os.path.join('download', self.forum_topic).replace('/', '-').replace(',', '').replace(':', '-').replace('"', '')

        if not os.path.exists(self.dir_path):

            os.mkdir(self.dir_path)

        self.chrome_options = Options()

        self.chrome_options.add_argument('--headless')

        self.chrome_options.add_argument('--disable-gpu')

        self.browser = None

        self.dispatch(pool=True)

    def dispatch(self, pool=True):

        """ 下载所有社区页面 """

        topic_count = int(self.url[1].replace(',', ''))

        pages = divmod(topic_count, 30)[0] + 1

        remain_pages = self.filter_url(pages)

        urls = [self.url[0] + '?page={}'.format(page) for page in remain_pages]

        # [self.forum_start(urls), list(map(self.forum_start, self.url))][len(self.url) > 1

        if urls and pool:

            pool = ThreadPoolExecutor(max_workers=4)

            pool.map(self.forum_start, urls, timeout=60)

            pool.shutdown()

        elif urls:

            list(map(self.forum_start, urls))

        else:

            print(self.dir_path + '共需下载{}页'.format(pages) + '\t下载完成')

    def filter_url(self, pages):

        has_pages = list(map(lambda x: int(x.strip('.html')), os.listdir(self.dir_path)))

        down_pages = list(set(range(1, pages + 1)) - set(has_pages))

        if len(down_pages):

            print(self.dir_path + '共需下载：{}页'.format(pages) + ' 已下载：{}页'.format(len(has_pages)) + ' 未下载：{}页'.format(len(down_pages)))

        return down_pages

    def forum_start(self, url, header=False):

        if header:

            browser = webdriver.Chrome()

        else:

            browser = webdriver.Chrome(chrome_options=self.chrome_options)

        browser.get(url=url)

        html = browser.page_source

        browser.close()

        file_path = os.path.join(self.dir_path, url.split('?page=')[-1] + '.html')

        with open(file_path, mode='w', encoding='utf-8') as f:

            f.write(html)

        print(url + '下载成功')

    # def close(self):

    #     self.browser.close()

class BreastCancer(object):

    """ BreastCancer社区评论下载 """

    def __init__(self):

        self.base_url = 'https://community.breastcancer.org/'

        self.all_forum_columns = ['hgroup', 'forum_topic', 'forum_url', 'count_topic', 'count_post']

        self.all_forum_file = 'all_forum_info.csv'

        self.all_topic_columns = ['forum', 'topic_url', 'founder', 'count_posts', 'count_views', 'created_time', 'file_path']

        self.all_topic_file = 'all_topic_info.csv'

        if not os.path.exists(self.all_topic_file):

            self.write_to_file(columns=self.all_topic_columns, file=self.all_topic_file, mode='w')

    def download_forums_index(self):

        """

        下载社区首页

        """

        browser = webdriver.Chrome()

        browser.get(url=self.base_url)

        html = browser.page_source

        with open('download/community.html', mode='w', encoding='utf-8') as f:

            f.write(html)

        browser.close()

    def parse_forums_index(self):

        """ 解析社区首页 获取所有社区页面id """

        # 创建csv文件, 写入表头

        if not os.path.exists(self.all_forum_file):

            data = pd.DataFrame(columns=self.all_forum_columns)

            data.to_csv(self.all_forum_file, index=False)

        # 读取html文件,提取有用信息，写入到文件尾部

        with open('download/community.html', encoding='utf-8') as f:

            response = f.read()

        response = etree.HTML(response)

        rowgroups = response.xpath('//div[@class="rowgroup"]')

        list(map(self.get_all_forums_ids, rowgroups))

    def get_all_forums_ids(self, response):

        """

        获取所有:

            hgroup、

            forum_topic(社区主题)、

            url(社区页面地址)、

            count_topic(话题数量)、

            count_post(评论数量)

        """

        hgroup = response.xpath('.//h2/text()')[0]

        forums = response.xpath('.//li')

        data_list = []

        for forum in forums:

            forum_topic = forum.xpath('h3/a/text()')[0]

            forum_url = urljoin('https://community.breastcancer.org/', forum.xpath('h3/a/@href')[0])

            count_topic = forum.xpath('.//span[@class="count_topic"]/strong/text()')[0]

            count_post = forum.xpath('.//span[@class="count_post"]/strong/text()')[0]

            data_dict = {'hgroup': hgroup, 'forum_topic': forum_topic, 'forum_url': forum_url,

                         'count_topic': count_topic,

                         'count_post': count_post}

            data_list.append(data_dict)

        df = pd.DataFrame(data=data_list)

        # 固定输出列的顺序

        df = df.loc[:, self.all_forum_columns]

        df.to_csv(self.all_forum_file, index=False, header=False, mode='a')

        print(hgroup + '数据解析完成')

    def get_all_forum_page(self, pool=True):

        """ 获取所有社区url 并下载所有社区相关页面 """

        df = pd.read_csv(self.all_forum_file)

        forum_urls = df.apply(lambda x: (x.forum_url, x.count_topic), axis=1)

        if pool:

            # 多线程下载

            # BreastCancer.thread_pool_download(forum_urls)

            # 多进程

            BreastCancer.process_pool_download(forum_urls)

        else:

            # 单线程下载

            list(map(Download, forum_urls))

    @staticmethod

    def thread_pool_download(urls):

        """ 多线程下载 """

        pool = ThreadPoolExecutor(max_workers=4)

        tasks = [pool.submit(Download, url) for url in urls]

        wait(tasks)

        # pool.map(Download, urls)

        pool.shutdown()

    @staticmethod

    def process_pool_download(urls):

        """ 多进程下载 """

        pool = ProcessPoolExecutor(max_workers=4)

        pool.map(Download, urls)

        pool.shutdown()

    def get_topic_info(self, response):

        """ 获取所有话题相关信息 """

        topic_url = urljoin(self.base_url, response.xpath('./h3/a/@href')[0])

        count_posts = response.xpath('./p[1]/span[1]/strong/text()')[0]

        count_views = response.xpath('./p[1]/span[2]/strong/text()')[0]

        founder = response.xpath('./p[2]/a/text()')[0]

        created = response.xpath('./p[2]')[0].xpath('string(.)').replace('\n', '').replace(' ', '')

        created_time = re.search('on(.*)', created).group(1)

        data_dict = {'topic_url': topic_url, 'founder': founder,

                     'count_posts': count_posts, 'count_views': count_views,

                     'created_time': created_time

                     }

        return data_dict

    @staticmethod

    def get_file_path_list(row):

        forum_topic = '【{}】'.format(row.forum_topic)

        dir_path = os.path.join('download', forum_topic).replace('/', '-').replace(',', '').replace(':', '-').replace(

            '"', '')

        file_path_list = list(map(lambda file: os.path.join(dir_path, file), os.listdir(dir_path)))

        return file_path_list

    @timeit

    def start_parse_forum_page(self, pool=False):

        """ 开始解析社区页面  """

        file_path_lists = forum_info.apply(BreastCancer.get_file_path_list, axis=1).tolist()

        file_path_lists = functools.reduce(lambda x, y: x + y, file_path_lists)

        # 每次解析前过滤已解析的页面

        file_path_lists = self.filter_file_path(file_path_lists)

        if pool:

            pool = ThreadPoolExecutor(max_workers=4)

            pool.map(self.parse_forum_page, file_path_lists)

            pool.shutdown()

        else:

            list(map(self.parse_forum_page, file_path_lists))

    def filter_file_path(self, file_path_list):

        parse_file = pd.read_csv(self.all_topic_file).file_path.unique()

        file_list = set(file_path_list) - set(parse_file)

        print('共{}个页面'.format(len(set(file_path_list))), '\n\r已解析{}个页面'.format(len(set(parse_file))))

        return list(file_list)

    def parse_forum_page(self, file_path):

        """ 解析社区页面具体功能实现 """

        with open(file_path, encoding='utf-8') as f:

            response = f.read()

        response = etree.HTML(response)

        try:

            forum = response.xpath('//div[@id="section-content"]/h1/text()')[0]

            forum = re.search('Forum: (.*)', forum).group(1)

            topics = response.cssselect('#section-content > ul.rowgroup.topic-list > li')

            data_list = list(map(self.get_topic_info, topics))

            for data_dict in data_list:

                data_dict['forum'] = forum

                data_dict['file_path'] = file_path

            BreastCancer.write_to_file(data=data_list, file=self.all_topic_file, columns=self.all_topic_columns, mode='a')

            print(file_path + '解析完成')

        except Exception:

            traceback.print_exc()

            print(file_path + '解析错误')

            return

    @staticmethod

    def write_to_file(file, data=None, columns=None, mode=None, index=False, header=False):

        if mode == 'w':

            data = pd.DataFrame(columns=columns)

            data.to_csv(file, index=False)

        elif mode == 'a':

            df = pd.DataFrame(data=data)

            df = df.loc[:, columns]

            df.to_csv(file, mode=mode, index=index, header=header)

    @staticmethod

    def all_forum_stat():

        all_hgroup_count = forum_info.hgroup.nunique()

        all_forum_count = forum_info.forum_topic.nunique()

        all_topic_count = pd.to_numeric(forum_info.count_topic.str.replace(',', ''), downcast='integer').sum()

        all_post_count = pd.to_numeric(forum_info.count_post.str.replace(',', ''), downcast='integer').sum()

        all_topic_pages = forum_info.apply(lambda x: divmod(int(x.count_topic.replace(',', '')), 30)[0] + 1,

                                           axis=1).sum()

        print('共有hgroup：{}个'.format(all_hgroup_count))

        print('共有社区：{}个'.format(all_forum_count))

        print('共有话题：{}个'.format(all_topic_count))

        print('共有评论：{}个'.format(all_post_count))

        print('共需下载：{}个页面'.format(all_topic_pages))

if __name__ == '__main__':

    breastcancer = BreastCancer()

    # breastcancer.all_forum_stat()

    # 1. 下载社区首页

    # breastcancer.parse_forums_index()

    # 2. 获取所有社区页面url

    # breastcancer.get_all_forums_ids()

    # 3. 下载所有社区页面

    # breastcancer.get_all_forum_page(pool=False)

    # 4. 解析所有社区页面

    breastcancer.start_parse_forum_page()

Breastcancer社区评论下载的更多相关文章

SQLyog 社区免费版下载
SQLyog 是一个快速而简洁的图形化管理MYSQL数据库的工具,它能够在任何地点有效地管理你的数据库,由业界著名的Webyog公司出品.使用SQLyog可以快速直观地让您从世界的任何角落通过网络来维 ...
Visual Studio 2015官方汇总包括下载和视频
7月20日 23:30 Visual Studio 2015正式版正式发布,作为微软新一代开发利器,在全地球乃至全宇宙乃至全太阳系中最强大且没有之一的IDE(上述描述来自微博用户评论)跨平台支持成 ...
无需付费，教你IDEA社区版中开发Web项目（SpringBoot\Tomcat）
1.IDEA 版本介绍最近有小伙伴私信我说 IDEA 破解怎么总是失效?难道就没有使用长一点的吗... 咳咳,除了给我留言「激活码」外,或许社区版可能完全满足你的需求. 相信有挺多小伙伴可能不清楚或 ...
应用集成-在Hexo、Hugo博客框架中使用Gitalk基于Github上仓库项目的issue无后端服务评论系统实践
关注「WeiyiGeek」公众号设为「特别关注」每天带你玩转网络安全运维.应用开发.物联网IOT学习! 希望各位看友[关注.点赞.评论.收藏.投币],助力每一个梦想. 本章目录目录 0x00 Gi ...
ActiveReports 9实战教程（1）：手把手搭建环境Visual Studio 2013 社区版
原文:ActiveReports 9实战教程(1): 手把手搭建环境Visual Studio 2013 社区版 ActiveReports 9刚刚发布3天,微软就发布了 Visual Studio ...
搭建环境Visual Studio 2013 社区版
搭建环境Visual Studio 2013 社区版 ActiveReports 9刚刚发布3天,微软就发布了 Visual Studio Community 2013 开发环境. Visual St ...
VS2017在线安装包下载
VS2017个人免费版即社区官方下载地址为:https://download.microsoft.com/download/D/1/4/D142F7E7-4D7E-4F3B-A399-5BACA91E ...
iReport 5.6.0 安装包下载&安装
iReport 5.6.0 下载方式有两种: 1.在官网社区上下载,下载地址:https://community.jaspersoft.com/project/ireport-designer/re ...
微信web开发者工具、破解文件、开发文档和开发Demo下载
关注,QQ群,微信应用号社区 511389428 下载: Win: https://pan.baidu.com/s/1bHJGEa Mac: https://pan.baidu.com/s/1slhD ...

随机推荐

Yii2.0调用sql server存储过程并获取返回值
1.首先展示创建sql server存储过程的语句,创建一个简单的存储过程,测试用. SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON GO CREATE P ...
the security settings could not be applied to the database(mysql安装error)【简记】
在安装mysql时,出现“The security settings could not be applied to the database because the connection has f ...
GIL：全局解释器锁 VS 用户程序锁
既然有了GIL锁,CPython还要多线程干什么? ''' GIL:全局解释器锁的来历四核:同一时刻真正有四个任务在运行,多核的意义在于此单核:看上去是并发的,因为进行了上下文切换,单核永远是串行 ...
socketServer并发处理socket
socketserver简单介绍 ''' socketserver:是对socket的封装,实现并发处理前两个TCP,UDP常用,后两个不常用 ''' import socketserver soc ...
(十二)Deleting Documents
Deleting a document is fairly straightforward. This example shows how to delete our previous custome ...
基于C#的钉钉SDK开发（1）--对官方SDK的重构优化
在前段时间,接触一个很喜欢钉钉并且已在内部场景广泛使用钉钉进行工厂内部管理的客户,如钉钉考勤.日常审批.钉钉投影.钉钉门禁等等方面,才体会到原来钉钉已经已经在企业上可以用的很广泛的,因此回过头来学习研 ...
在 .NET Core 中结合 HttpClientFactory 使用 Polly（下篇）
译者:王亮作者:Polly 团队原文:http://t.cn/EhZ90oq声明:我翻译技术文章不是逐句翻译的,而是根据我自己的理解来表述的(包括标题).其中可能会去除一些不影响理解但本人实在不知道如 ...
OpenStack-Neutron（5）
一. Neutron 概述 SDN(software-defined networking)软件定义网络,其所具有的灵活性和自动化优势使其成为云时代网络管理的主流. Neutron的设计目标是实现“网 ...
docker(六) 使用docker-maven-plugin插件构建docker镜像(已过时)
可以参考博客:https://blog.csdn.net/aixiaoyang168/article/details/77453974 docker-maven-plugin官网推荐在新项目中使用do ...
Python——高阶函数——map filter zip
一.map函数 1.作用:它接收一个函数 f 和一个 list,并通过把函数 f 依次作用在 list 的每个元素上,得到一个新的 list 并返回. 2.实例 def f(x): return x* ...

Breastcancer社区评论下载

Breastcancer社区评论下载的更多相关文章

随机推荐

热门专题