那些10w+的公众号都在写什么？

出于好奇，那些10w+的公众号都写了些什么，于是我写了几个脚本爬取了各行业Top的公众号文章，进行了关键词统计。

抓取数据、分析用到了3中语言：Node.js，Java，Python。废话不多说，直接上代码。

1(NODEJS)

puppeteer模拟登陆，抓取微信公众号链接：

/**

* load wechat article urls on newrank.cn

**/

const puppeteer = require('puppeteer');

//emulate iphone

const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';

const workPath = './newrank_cn1111';

const fs = require("fs");

const userName = "公众号";

const ppwwdd = "caiyongji";

if (!fs.existsSync(workPath)) {

        fs.mkdirSync(workPath)

}

const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/';

const monthlyRankUrl = "https://www.newrank.cn/public/info/list.html?period=month&type=data";

const detailUrl = "https://www.newrank.cn/public/info/detail.html?account=";

(async () => {

    const browser = await puppeteer.launch({headless: false});//set headless: true will hide chromium UI

    const page = await browser.newPage();

    await page.setUserAgent(userAgent);

    await page.setViewport({width:1920, height:1000});

    await page.setRequestInterception(true);

    //filter to block images

    page.on('request', request => {

    if (request.resourceType() === 'image')

      request.abort();

    else

      request.continue();

    });

    await page.goto(loginUrl);

    //login

    await loginOperate();

    //await page.close();

    await processMonthlyRank('.wx-right-type-list-spe a[icon=ss]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=mgs]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=cf]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=kj]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=cy]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=qc]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=ls]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=zc]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=jy]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=xs]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=zw]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=qy]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=wh]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=bk]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=jk]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=shs]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=ms]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=sj]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=lx]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=ym]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=qg]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=ty]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=mt]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=zs]');

    await processMonthlyRank('#wx_month_all');

    async function loginOperate(){

        try{

            await page.click('div[data-type=pwd]');

        }catch(err){

            console.log('login#1');

        }

        try{

            await page.type('#account_input',userName);

            await page.type('#password_input',ppwwdd);

        }catch(err){

            console.log('login#2');

        }

        try{

            await page.click('#pwd_confirm');

        }catch(err){

            console.log('login#3');

        }

    }

    async function processMonthlyRank(btn){

        const tab = await browser.newPage();

        await tab.setUserAgent(userAgent);

        await tab.setViewport({width:1920, height:1000});

        await tab.setRequestInterception(true);

        //filter to block images

        tab.on('request', request => {

        if (request.resourceType() === 'image')

          request.abort();

        else

          request.continue();

        });

        await tab.goto(monthlyRankUrl);

        try{

            await tab.click(btn);

        }catch(err){

            console.log('processMonthlyRank#1');

        }

        let fileName = await tab.evaluate(function(param){

            return document.querySelector(param).innerHTML;

        },btn);

        console.log('-------------------------'+fileName+'-------------------------');

        await scrollWait(tab);

        await waitSecond(tab);

        const sel = '.wx_main tr';

        const texts = await tab.evaluate((sel) => {

        let elements = Array.from(document.querySelectorAll(sel));

            let txt = elements.map(element => {

                return element.innerText

            })

            return txt;

        }, sel);

        console.log('total rows: '+texts.length);

        let contents='记录条数'+(texts.length-1)+'\n\n';

        texts.forEach(function(c,index){

            if(index>0){

                contents+=c+'\n\n';

            }

        });

        const fs = require("fs");

        fs.writeFileSync(workPath+'/'+fileName+'.txt',contents);

        console.log(fileName + " has been extracted to local.");

        const idSel = '.wx_main tr a[href^="detail.html"]';

        const ids = await tab.evaluate((idSel) => {

        let elements = Array.from(document.querySelectorAll(idSel));

            let txt = elements.map(element => {

                return element.innerText

            })

            return txt;

        }, idSel);

        let idContents='';

        let w_name;

        let flag =true;

        /*ids.forEach(async function(id,index){

            if(index%2!=0){

                idContents+=id+'\n';

                await getDetail(fileName,w_name,id);

                w_name =null;

            }else{

                w_name=id;

            }

        });*/

        await (async ()=>{

            for(let i=0;i<ids.length;i++){

                if(i%2!=0){

                idContents+=ids[i]+'\n';

                await getDetail(fileName,w_name,ids[i]);

                w_name =null;

            }else{

                w_name=ids[i];

            }

            }

        })();

        let idFile = 'id_'+fileName;

        fs.writeFileSync(workPath+'/'+idFile+'.txt',idContents);

        console.log(idFile + " has been extracted to local.");

        await tab.close();

    }

    async function scrollWait(p, n){

        if(n==null) n=5;

        for(let i= 0; i<n;i++){

            try{

                await p.evaluate(()=>window.scrollTo(0, document.body.scrollHeight));

                await p.waitForNavigation({timeout:500,waitUntil: ['networkidle0']});

            }catch(err){

                console.log('scroll to bottom and then wait 500 ms.');

            }

        }

    }

    async function waitSecond(p){

        try{

            await p.waitForNavigation({timeout:2000,waitUntil: ['networkidle0']});

        }catch(err){

            //console.log('wait 1 sec.');

        }

    }

    async function getDetail(cat,name,id){

        const tab = await browser.newPage();

        await tab.setUserAgent(userAgent);

        await tab.setViewport({width:1920, height:1000});

        await tab.setRequestInterception(true);

        //filter to block images

        tab.on('request', request => {

        if (request.resourceType() === 'image')

          request.abort();

        else

          request.continue();

        });

        await tab.goto(detailUrl+id);

        await waitSecond(tab);

        const sel = '#info_detail_article_top li .title a';

        const hrefs = await tab.evaluate((sel) => {

            let elements = Array.from(document.querySelectorAll(sel));

            let links = elements.map(element => {

                return element.href

            })

            return links;

        }, sel);

        let urlList='';

        hrefs.forEach(function(href,index){

            urlList+=href+"\n";

        });

        const fs = require("fs");

        if (!fs.existsSync(workPath+'/'+cat)) {

            fs.mkdirSync(workPath+'/'+cat)

        }

        fs.writeFileSync(workPath+'/'+cat+'/'+id+'_top_'+name+'.txt',urlList);

        const sel1 = '#info_detail_article_lastest li .title a';

        const hrefs1 = await tab.evaluate((sel1) => {

            let elements = Array.from(document.querySelectorAll(sel1));

            let links = elements.map(element => {

                return element.href

            })

            return links;

        }, sel1);

        let urlList1='';

        hrefs1.forEach(function(href,index){

            urlList1+=href+"\n";

        });

        fs.writeFileSync(workPath+'/'+cat+'/'+id+'_lastest_'+name+'.txt',urlList1);

        console.log(id+' '+name+' has been extracted to local.');

        await tab.close();

    }

})();

2（JAVA）

Jsoup抓取微信文章文本：Vps 安全设置 Win2003中IIS的安全设置技巧

package com;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.util.Arrays;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Executors;

import java.util.concurrent.ThreadLocalRandom;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

public class WeChatUrls extends Thread {

    private File catFile;

    final static Integer ThreadNum = 1;

    final String ERROR = "ERROR";

    private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";

    private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";

    private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn";

    public WeChatUrls(File cat) {

        this.catFile = cat;

    }

    private String getUrlProxyContent(String url) {

        String body = ERROR;

        try {

            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();

            if (doc.select("body") != null) {

                body = doc.select("body").text();

            }

        } catch (IOException e) {

            System.out.println("ERROR URL: " + url);

            e.printStackTrace();

        }

        return body;

    }

    private void write(String content, String fileName) {

        File f = new File(fileName);

        FileWriter fw = null;

        BufferedWriter bw = null;

        try {

            if (!f.exists()) {

                f.getParentFile().mkdirs();

                f.createNewFile();

            }

//             fw = new FileWriter(f.getAbsoluteFile(), true); // true表示可以追加新内容

            fw = new FileWriter(f.getAbsoluteFile()); // 表示不追加

            bw = new BufferedWriter(fw);

            bw.write(content);

            bw.close();

        } catch (Exception e) {

            e.printStackTrace();

        }

    }

    public static void main(String[] args) throws Exception {

        File baseFolder = new File(READ_URLS_FOLDER);

        File[] cataFiles = baseFolder.listFiles();

        ExecutorService service = Executors.newFixedThreadPool(ThreadNum);

        Arrays.asList(cataFiles).stream().forEach(catFile -> {

            if (catFile.isFile() && catFile.getName().startsWith("id")) {

                service.execute(new WeChatUrls(catFile));

            }

        });

        service.shutdown();

    }

    private void process() {

//        Set<String> redoSet = new HashSet<>();

        String catagory = catFile.getName().split("\\.")[0].split("_")[1];

        File urlFolder = new File(READ_URLS_FOLDER + "\\" + catagory);

        File[] urlFiles = urlFolder.listFiles();

        if (urlFiles != null) {

            Arrays.asList(urlFiles).stream().forEach(urlFile -> {

                try {

                    BufferedReader reader = new BufferedReader(new FileReader(catFile));

                    String wechatId = null;

                    int countLatest = 1;

                    int countTop = 1;

                    while ((wechatId = reader.readLine()) != null) {

                        if (urlFile.getName().startsWith(wechatId)) {

                            String wechatName = urlFile.getName().split("\\.")[0].split("_")[2];

//                            if (urlFile.length() == 0) {

//                                redoSet.add("\"" + catagory + "\",\"" + wechatName + "\",\"" + wechatId + "\"");

//                            }

                            BufferedReader r = new BufferedReader(new FileReader(urlFile));

                            String wechatUrl = null;

                            while ((wechatUrl = r.readLine()) != null) {

                                String writePath = WORK_FOLDER + "\\" + catagory + "\\"

                                        + (urlFile.getName().contains("top") ? "top" : "latest") + "\\" + wechatId

                                        + "_" + wechatName + "_"

                                        + (urlFile.getName().contains("top") ? countTop++ : countLatest++)+".txt";

                                String content = getUrlProxyContent(wechatUrl);

                                write(content, writePath);

                                System.out.println(writePath);

                                Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));

                            }

                            r.close();

                        }

                    }

                    reader.close();

                } catch (Exception e) {

                    e.printStackTrace();

                }

            });

        }

//        redoSet.stream().forEach(System.out::println);

    }

    @Override

    public void run() {

        process();

    }

}

3（PYTHON）

wordcloud生成词云：

# -*- coding: utf-8 -*-

import json

import random

import time

import os

from pyecharts import Bar,Geo,Line,Overlap

import jieba

from scipy.misc import imread

from wordcloud import WordCloud, ImageColorGenerator

import matplotlib.pyplot as plt

from collections import Counter

os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles')

stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序']

def proc(folder, type):

    fileLines = []

    rootdir = './'+folder+'/'+type

    list = os.listdir(rootdir)

    for i in range(0,len(list)):

            path = os.path.join(rootdir,list[i])

            if os.path.isfile(path):

                    try:

                        fo = open(path, 'r+')

                        fileLines += fo.readlines()

                    except:

                        print('error while processing file: ' + path)

    _str =  ' '.join(fileLines)

    words_list = []

    word_generator = jieba.cut_for_search(_str)

    for word in word_generator:

        words_list.append(word)

    words_list = [k for k in words_list if len(k)>1 and k not in stopWords]

    back_color = imread('back.jpg')

    wc = WordCloud(background_color='white',

                   max_words=2000,

                   mask=back_color,

                   max_font_size=300,

                   font_path="C:/Windows/Fonts/msyh.ttc",

                   random_state=42

                   )

    _count = Counter(words_list)

    wc.generate_from_frequencies(_count)

    image_colors = ImageColorGenerator(back_color)

    wc.recolor(color_func=image_colors)

    #plt.figure()

    #plt.imshow(wc.recolor(color_func=image_colors))

    #plt.axis('off')

    # The pil way (if you don't have matplotlib)

    image = wc.to_image()

    image.show()

    jpgFile = './'+type+'_'+folder+'.jpg'

    image.save(jpgFile)

    print('image File saved:' + jpgFile)

basedir = './'

baselist = os.listdir(basedir)

for l in range(0,len(baselist)):

        p = os.path.join(basedir,baselist[l])

        if os.path.isdir(p):

                proc(os.path.basename(p), 'top')

4

词云结果涉及23个维度，得出结果如下：

TOP500公众号文章

创业

健康

教育

乐活

企业

情感

体育娱乐

文化

文摘

幽默

政务

旅行

时事

时尚

民生

汽车

百科

科技

美体

美食

职场

财富

文章转自：https://segmentfault.com/r/1250000015997077?shareId=1210000015997081

那些10w+的公众号都在写什么？的更多相关文章

一个人的公众号，我写了1w+
大家好,我是Bypass,一个人一直保持着写博客的习惯,为此维护了一个技术公众号,致力于分享原创高质量干货,写的内容主要围绕:渗透测试.WAF绕过.代码审计.应急响应.企业安全. 一直以来,我把它当成 ...
手机QQ公众号亿级消息实时群发架构
编者按:高可用架构分享及传播在架构领域具有典型意义的文章,本文由孙子荀分享.转载请注明来自高可用架构公众号 ArchNotes. 孙子荀,2009 年在华为从事内核和分布式系统的开发工作:2011 ...
微信公众号发送消息模板(java)
这段时间接触公众号开发,写下向用户发送消息模板的接口调用先上接口代码 public static JSONObject sendModelMessage(ServletContext context ...
PHP开发微信公众号（二）消息接受与推送
上一篇文章我们知道怎么获取二维码,这样别人就可以扫描二维码来关注我们,但是别人关注后,发送消息,我们怎么进行相关处理? 这里我们就来学习下怎么处理处理这些消息,以及推送消息. 学习之前首先你需要有一个 ...
spring-boot-route（二十三）开发微信公众号
在讲微信公众号开发之前,先来大概了解一下微信公众号.微信公众号大体上可以分为服务号和订阅号,订阅号和服务号的区别如下: 服务号可以申请微信支付功能. 服务号只能由企业申请,订阅号可以有企业或个人申请. ...
php开发微信公众号获取信息LBS
1.一般的公众号都可以在微信公众平台里面设置自定义菜单和自动回复消息,如果需要获取用户位置,则必须开启服务器配置,当次功能开启后,微信公众平台的自定义菜单和自动回复则失效. 需要通过接口开发来实现微 ...
微信公众号开发C#系列-9、多公众号集中管理
1.概述通过前面8篇关于微信开发相关文章的学习,我们已经对微信常用开发有了一个比较深入的了解.前面的文章都是基于某一特定公众号的,在现实业务中同一单位个体运营着不至一个公众号,此时就需要对多个公众号 ...
微信公众号UX分析—— 学生作业小结
1. 不足: 1. 权威性:个人帐号,显得不够正式. 2. 排版问题: + 没有必要的外接端口,界面设计极度缺少排版.哪怕是个人公众号都不至于如此,更何况这是一个学校的教务平台. 3. 反应不及时或无 ...
如何玩转小程序+公众号？手把手教你JeeWx小程序CMS与公众号关联
随着微信小程序新功能.新入口的不断更新,小程序的商业价值逐步增强,特别是小程序与公众号的深度融合,已经让小程序成为各行业新的营销渠道.Jeewx平台专注小程序的开发,逐步完善小程序生态圈,通过简单操作 ...

随机推荐

【计算机视觉】阶编码本模型(Multi phase codebook model)
转自:http://www.cnblogs.com/xrwang/archive/2012/04/24/MPCBBGM.html 多阶编码本模型(Multi phase codebook model) ...
navicat连接 mysql报错1251解决方案
转自:https://blog.csdn.net/XDMFC/article/details/80263215 好不容易安装好mysql,但又出现了mysql客户端版本太低的问题.根据参考的这篇博客, ...
Myeclipse下配置SVN报错问题 svn: E175002: java.lang.RuntimeException: Could not generate DH keypair(转)
转:http://blog.csdn.net/yulong_1988/article/details/51459936 在myeclipse下安装svn插件,出现了Could not generate ...
session 的理解
oracle truncate表恢复操作
truncate恢复表 1.创建测试用表 conn elan/elan create table haha as select * from dba_users; 2.查询表数据 ) from hah ...
HDU-5155 Harry And Magic Box
题目描述在$n*m$的矩阵内每一行每一列都有钻石,问钻石分布的种类? 答案有可能很大,所以输出答案对$1000000007$取模. Input 对于每个测试用例,有两个整数$n$和\(m ...
goods商品类
php Excel 导入
php Excel 导入 public function storeSql() { $file = input('file.excel'); $path = ROOT_PATH . 'public' ...
数据库中的round()
Round函数返回一个数值,该数值是按照指定的小数位数进行四舍五入运算的结果.可是当保留位跟着的即使是5,有可能进位,也有可能舍去,机会各50%.这样就会造成在应用程序中计算有误. 参数规范语法 r ...
Codeforces 1220C. Substring Game in the Lesson
传送门首先显然的,如果 $l$ 能移动,那么 $r$ 一定可以随便移动,如果 $l$ 不动,那么 $r$ 一定不能动那么我们现在只要考虑 $l$ 的移动即可考虑找到位置 $k$ 之前的最左边的最 ...

那些10w+的公众号都在写什么？

1(NODEJS)

2（JAVA）

3（PYTHON）

4

TOP500公众号文章

创业

健康

教育

乐活

企业

情感

体育娱乐

文化

文摘

幽默

政务

旅行

时事

时尚

民生

汽车

百科

科技

美体

美食

职场

财富

那些10w+的公众号都在写什么？的更多相关文章

随机推荐

热门专题