出于好奇,我想知道那些10w+的公众号都写了些什么,于是写了几个脚本爬取了各行业Top的公众号文章,并进行了关键词统计。

抓取数据、分析用到了3种语言:Node.js,Java,Python。废话不多说,直接上代码。

1(NODEJS)

puppeteer模拟登录,抓取微信公众号链接:

/**
 * Load WeChat official-account article URLs from newrank.cn.
 *
 * Logs in with puppeteer, opens the monthly ranking page once per category,
 * writes each ranking table to <workPath>/<category>.txt and the account ids
 * to <workPath>/id_<category>.txt, then visits every ranked account's detail
 * page and stores its "top" and "lastest" article links under
 * <workPath>/<category>/.
 */
const puppeteer = require('puppeteer');
const fs = require("fs");

// Desktop Chrome (Windows 10) user agent sent by every tab.
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';
const workPath = './newrank_cn1111';
// NOTE(review): the login credentials are hard-coded; consider loading them
// from the environment before sharing this script.
const userName = "公众号";
const ppwwdd = "caiyongji";

if (!fs.existsSync(workPath)) {
  fs.mkdirSync(workPath);
}

const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/';
const monthlyRankUrl = "https://www.newrank.cn/public/info/list.html?period=month&type=data";
const detailUrl = "https://www.newrank.cn/public/info/detail.html?account=";

// Values of the `icon` attribute of the category links on the ranking page,
// in the order they are crawled.
const categoryIcons = [
  'ss', 'mgs', 'cf', 'kj', 'cy', 'qc', 'ls', 'zc', 'jy', 'xs', 'zw', 'qy',
  'wh', 'bk', 'jk', 'shs', 'ms', 'sj', 'lx', 'ym', 'qg', 'ty', 'mt', 'zs',
];

(async () => {
  // Setting headless: true will hide the Chromium UI.
  const browser = await puppeteer.launch({headless: false});
  try {
    const page = await openTab();
    await page.goto(loginUrl);
    await loginOperate(page);

    for (const icon of categoryIcons) {
      await processMonthlyRank(`.wx-right-type-list-spe a[icon=${icon}]`);
    }
    // Finally the overall monthly ranking.
    await processMonthlyRank('#wx_month_all');
  } finally {
    // Always release Chromium, otherwise the node process never exits.
    await browser.close();
  }

  /**
   * Open a new tab with the shared user agent and viewport, and with image
   * requests aborted to save bandwidth.
   * @returns {Promise<object>} the configured puppeteer page
   */
  async function openTab() {
    const tab = await browser.newPage();
    await tab.setUserAgent(userAgent);
    await tab.setViewport({width: 1920, height: 1000});
    await tab.setRequestInterception(true);
    tab.on('request', request => {
      if (request.resourceType() === 'image') {
        request.abort();
      } else {
        request.continue();
      }
    });
    return tab;
  }

  /**
   * Fill in and submit the password login form. Every step is best-effort:
   * a failing step is logged with its tag and the next step is still tried.
   * @param {object} page - tab that already shows the login page
   */
  async function loginOperate(page) {
    try {
      await page.click('div[data-type=pwd]');
    } catch (err) {
      console.log('login#1', err.message);
    }
    try {
      await page.type('#account_input', userName);
      await page.type('#password_input', ppwwdd);
    } catch (err) {
      console.log('login#2', err.message);
    }
    try {
      await page.click('#pwd_confirm');
    } catch (err) {
      console.log('login#3', err.message);
    }
  }

  /**
   * Crawl one ranking category.
   * @param {string} btn - CSS selector of the category link to click; the
   *   link's innerHTML is used as the category / file name
   */
  async function processMonthlyRank(btn) {
    const tab = await openTab();
    try {
      await tab.goto(monthlyRankUrl);
      try {
        await tab.click(btn);
      } catch (err) {
        console.log('processMonthlyRank#1', err.message);
      }
      const fileName = await tab.evaluate(
        selector => document.querySelector(selector).innerHTML,
        btn
      );
      console.log('-------------------------' + fileName + '-------------------------');
      await scrollWait(tab);
      await waitSecond(tab);

      // Dump the ranking table; the first row is the table header.
      const texts = await tab.evaluate(
        selector => Array.from(document.querySelectorAll(selector), row => row.innerText),
        '.wx_main tr'
      );
      console.log('total rows: ' + texts.length);
      const rows = texts.slice(1);
      const contents = '记录条数' + rows.length + '\n\n'
        + rows.map(row => row + '\n\n').join('');
      fs.writeFileSync(workPath + '/' + fileName + '.txt', contents);
      console.log(fileName + " has been extracted to local.");

      // The detail links are consumed in pairs: even positions are treated
      // as the account name, odd positions as the account id.
      const links = await tab.evaluate(
        selector => Array.from(document.querySelectorAll(selector), a => a.innerText),
        '.wx_main tr a[href^="detail.html"]'
      );
      let idContents = '';
      for (let i = 0; i + 1 < links.length; i += 2) {
        const name = links[i];
        const id = links[i + 1];
        idContents += id + '\n';
        try {
          await getDetail(fileName, name, id);
        } catch (err) {
          // One broken account page must not abort the whole category.
          console.log('getDetail failed for ' + id + ' ' + name + ': ' + err.message);
        }
      }
      const idFile = 'id_' + fileName;
      fs.writeFileSync(workPath + '/' + idFile + '.txt', idContents);
      console.log(idFile + " has been extracted to local.");
    } finally {
      await tab.close();
    }
  }

  /**
   * Scroll to the bottom of the page n times so lazily loaded rows appear.
   * waitForNavigation is used as a 500 ms pause: no navigation happens, so
   * the timeout is the expected outcome.
   * @param {object} p - puppeteer page
   * @param {number} [n=5] - number of scroll/wait rounds
   */
  async function scrollWait(p, n) {
    if (n == null) {
      n = 5;
    }
    for (let i = 0; i < n; i++) {
      try {
        await p.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await p.waitForNavigation({timeout: 500, waitUntil: ['networkidle0']});
      } catch (err) {
        console.log('scroll to bottom and then wait 500 ms.');
      }
    }
  }

  /**
   * Pause for up to 2 seconds (same waitForNavigation-timeout trick as
   * scrollWait); the timeout error is expected and deliberately ignored.
   * @param {object} p - puppeteer page
   */
  async function waitSecond(p) {
    try {
      await p.waitForNavigation({timeout: 2000, waitUntil: ['networkidle0']});
    } catch (err) {
      // expected: nothing navigates, the wait just times out
    }
  }

  /**
   * Return the href of every element matching selector, one per line.
   * @param {object} tab - puppeteer page
   * @param {string} selector - CSS selector of the article links
   * @returns {Promise<string>} newline-terminated list of URLs
   */
  async function collectLinks(tab, selector) {
    const hrefs = await tab.evaluate(
      sel => Array.from(document.querySelectorAll(sel), a => a.href),
      selector
    );
    return hrefs.map(href => href + "\n").join('');
  }

  /**
   * Save the "top" and "lastest" article URLs of one account.
   * @param {string} cat - category name, used as the sub-folder
   * @param {string} name - account display name
   * @param {string} id - account id appended to the detail URL
   */
  async function getDetail(cat, name, id) {
    const tab = await openTab();
    try {
      await tab.goto(detailUrl + id);
      await waitSecond(tab);

      const folder = workPath + '/' + cat;
      if (!fs.existsSync(folder)) {
        fs.mkdirSync(folder);
      }

      const topUrls = await collectLinks(tab, '#info_detail_article_top li .title a');
      fs.writeFileSync(folder + '/' + id + '_top_' + name + '.txt', topUrls);

      // "lastest" (sic) is the spelling used by the page's element id and
      // by the file names this script produces.
      const latestUrls = await collectLinks(tab, '#info_detail_article_lastest li .title a');
      fs.writeFileSync(folder + '/' + id + '_lastest_' + name + '.txt', latestUrls);

      console.log(id + ' ' + name + ' has been extracted to local.');
    } finally {
      await tab.close();
    }
  }
})().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

  

 

2(JAVA)

Jsoup抓取微信文章文本:

package com;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; public class WeChatUrls extends Thread {
private File catFile;
final static Integer ThreadNum = 1;
final String ERROR = "ERROR";
private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";
private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn"; public WeChatUrls(File cat) {
this.catFile = cat;
} private String getUrlProxyContent(String url) {
String body = ERROR;
try {
Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
if (doc.select("body") != null) {
body = doc.select("body").text();
}
} catch (IOException e) {
System.out.println("ERROR URL: " + url);
e.printStackTrace();
} return body;
} private void write(String content, String fileName) {
File f = new File(fileName);
FileWriter fw = null;
BufferedWriter bw = null;
try {
if (!f.exists()) {
f.getParentFile().mkdirs();
f.createNewFile();
}
// fw = new FileWriter(f.getAbsoluteFile(), true); // true表示可以追加新内容
fw = new FileWriter(f.getAbsoluteFile()); // 表示不追加
bw = new BufferedWriter(fw);
bw.write(content);
bw.close();
} catch (Exception e) {
e.printStackTrace();
}
} public static void main(String[] args) throws Exception {
File baseFolder = new File(READ_URLS_FOLDER);
File[] cataFiles = baseFolder.listFiles();
ExecutorService service = Executors.newFixedThreadPool(ThreadNum);
Arrays.asList(cataFiles).stream().forEach(catFile -> {
if (catFile.isFile() && catFile.getName().startsWith("id")) {
service.execute(new WeChatUrls(catFile));
}
});
service.shutdown();
} private void process() {
// Set<String> redoSet = new HashSet<>();
String catagory = catFile.getName().split("\\.")[0].split("_")[1];
File urlFolder = new File(READ_URLS_FOLDER + "\\" + catagory);
File[] urlFiles = urlFolder.listFiles();
if (urlFiles != null) {
Arrays.asList(urlFiles).stream().forEach(urlFile -> {
try {
BufferedReader reader = new BufferedReader(new FileReader(catFile));
String wechatId = null;
int countLatest = 1;
int countTop = 1;
while ((wechatId = reader.readLine()) != null) {
if (urlFile.getName().startsWith(wechatId)) {
String wechatName = urlFile.getName().split("\\.")[0].split("_")[2];
// if (urlFile.length() == 0) {
// redoSet.add("\"" + catagory + "\",\"" + wechatName + "\",\"" + wechatId + "\"");
// }
BufferedReader r = new BufferedReader(new FileReader(urlFile));
String wechatUrl = null;
while ((wechatUrl = r.readLine()) != null) {
String writePath = WORK_FOLDER + "\\" + catagory + "\\"
+ (urlFile.getName().contains("top") ? "top" : "latest") + "\\" + wechatId
+ "_" + wechatName + "_"
+ (urlFile.getName().contains("top") ? countTop++ : countLatest++)+".txt";
String content = getUrlProxyContent(wechatUrl);
write(content, writePath);
System.out.println(writePath);
Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));
}
r.close();
}
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
});
}
// redoSet.stream().forEach(System.out::println); } @Override
public void run() {
process();
}
}

  

 

3(PYTHON)

wordcloud生成词云:

# -*- coding: utf-8 -*-
"""Build one word-cloud image per category from the downloaded article texts."""
import json
import random
import time
import os
from pyecharts import Bar,Geo,Line,Overlap
import jieba
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is not available in newer SciPy releases;
    # matplotlib's imread also returns the image as an array.
    from matplotlib.pyplot import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter

os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles')

# Words that are dropped from the frequency count.
stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序']


def proc(folder, type):
    """Generate, show and save the word cloud for one category.

    Reads every file in ./<folder>/<type>, segments the text with jieba,
    counts words longer than one character that are not in stopWords, and
    saves the cloud as ./<type>_<folder>.jpg. back.jpg in the working
    directory is used as both the mask and the colour source.

    :param folder: category folder name
    :param type: sub-folder to read, e.g. 'top'
    """
    rootdir = './'+folder+'/'+type
    if not os.path.isdir(rootdir):
        print('skip, folder not found: ' + rootdir)
        return

    fileLines = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            continue
        try:
            # `with` closes the handle even when reading fails.
            with open(path, 'r') as fo:
                fileLines += fo.readlines()
        except Exception:
            print('error while processing file: ' + path)

    _str = ' '.join(fileLines)
    stop = set(stopWords)
    words_list = [k for k in jieba.cut_for_search(_str)
                  if len(k) > 1 and k not in stop]
    if not words_list:
        print('skip, no words found in: ' + rootdir)
        return

    back_color = imread('back.jpg')
    wc = WordCloud(background_color='white',
                   max_words=2000,
                   mask=back_color,
                   max_font_size=300,
                   font_path="C:/Windows/Fonts/msyh.ttc",
                   random_state=42
                   )
    _count = Counter(words_list)
    wc.generate_from_frequencies(_count)
    image_colors = ImageColorGenerator(back_color)
    wc.recolor(color_func=image_colors)

    # The PIL way (works without a matplotlib figure).
    image = wc.to_image()
    image.show()
    jpgFile = './'+type+'_'+folder+'.jpg'
    image.save(jpgFile)
    print('image File saved:' + jpgFile)


basedir = './'
for entry in os.listdir(basedir):
    p = os.path.join(basedir, entry)
    if os.path.isdir(p):
        proc(os.path.basename(p), 'top')

  

 

4

词云结果涉及23个维度,得出结果如下:

TOP500公众号文章

创业

健康

教育

乐活

企业

情感

体育娱乐

文化

文摘

幽默

政务

旅行

时事

时尚

民生

汽车

百科

科技

美体

美食

职场

财富

文章转自:https://segmentfault.com/r/1250000015997077?shareId=1210000015997081

那些10w+的公众号都在写什么?的更多相关文章

  1. 一个人的公众号,我写了1w+

    大家好,我是Bypass,一个人一直保持着写博客的习惯,为此维护了一个技术公众号,致力于分享原创高质量干货,写的内容主要围绕:渗透测试.WAF绕过.代码审计.应急响应.企业安全. 一直以来,我把它当成 ...

  2. 手机QQ公众号亿级消息实时群发架构

    编者按:高可用架构分享及传播在架构领域具有典型意义的文章,本文由孙子荀分享.转载请注明来自高可用架构公众号 ArchNotes.   孙子荀,2009 年在华为从事内核和分布式系统的开发工作:2011 ...

  3. 微信公众号发送消息模板(java)

    这段时间接触公众号开发,写下向用户发送消息模板的接口调用 先上接口代码 public static JSONObject sendModelMessage(ServletContext context ...

  4. PHP开发微信公众号(二)消息接受与推送

    上一篇文章我们知道怎么获取二维码,这样别人就可以扫描二维码来关注我们,但是别人关注后,发送消息,我们怎么进行相关处理? 这里我们就来学习下怎么处理处理这些消息,以及推送消息. 学习之前首先你需要有一个 ...

  5. spring-boot-route(二十三)开发微信公众号

    在讲微信公众号开发之前,先来大概了解一下微信公众号.微信公众号大体上可以分为服务号和订阅号,订阅号和服务号的区别如下: 服务号可以申请微信支付功能. 服务号只能由企业申请,订阅号可以有企业或个人申请. ...

  6. php开发微信公众号获取信息LBS

    1.一般的公众号都可以在微信公众平台里面设置自定义菜单和自动回复消息,如果需要获取用户位置,则必须开启 服务器配置,当次功能开启后,微信公众平台的自定义菜单和自动回复则失效. 需要通过接口开发来实现微 ...

  7. 微信公众号开发C#系列-9、多公众号集中管理

    1.概述 通过前面8篇关于微信开发相关文章的学习,我们已经对微信常用开发有了一个比较深入的了解.前面的文章都是基于某一特定公众号的,在现实业务中同一单位个体运营着不至一个公众号,此时就需要对多个公众号 ...

  8. 微信公众号UX分析—— 学生作业小结

    1. 不足: 1. 权威性:个人帐号,显得不够正式. 2. 排版问题: + 没有必要的外接端口,界面设计极度缺少排版.哪怕是个人公众号都不至于如此,更何况这是一个学校的教务平台. 3. 反应不及时或无 ...

  9. 如何玩转小程序+公众号?手把手教你JeeWx小程序CMS与公众号关联

    随着微信小程序新功能.新入口的不断更新,小程序的商业价值逐步增强,特别是小程序与公众号的深度融合,已经让小程序成为各行业新的营销渠道.Jeewx平台专注小程序的开发,逐步完善小程序生态圈,通过简单操作 ...

随机推荐

  1. 【图像处理】【计算机视觉】findContours的使用

    原文地址:findContours函数参数说明及相关函数作者:鸳都学童 findContours函数,这个函数的原型为: void findContours(InputOutputArray imag ...

  2. hive查询结果保存

    参考: https://blog.csdn.net/zhuce1986/article/details/39586189 一.保存结果到本地 方法1:调用hive标准输出,将查询结果写到指定的文件中 ...

  3. Nginx_Ubuntu

    一. 基本步骤 1.1 环境准备 开始前,请确认gcc g++开发类库是否装好,默认已经安装. 注: 等待linux下载更新功能准备好了 重启系统 在执行下载安装命令,如执行命令没有问题可以继续往下走 ...

  4. nodejs 写服务器解决中文乱码问题

    nodejs 写服务器解决中文乱码问题:https://blog.csdn.net/worldmakewayfordream/article/details/77483423     本文链接:htt ...

  5. centos7 源码编译安装 php

    准备工作 下载 PHP 源码包并解压 $ wget https://www.php.net/distributions/php-7.2.19.tar.bz2 $ yum -y install bzip ...

  6. git reset –mixed –soft –hard命令解释。

    直接看官方的解释. 其中HEAD代表版本库,index代表暂存区,另外还有一个我们增删改代码的工作区.所以官方解释翻译过来就是: --hard : 回退版本库,暂存区,工作区.(因此我们修改过的代码就 ...

  7. jQuery-点击按钮页面滚动到顶部,底部,指定位置

    $('.scroll_top').click(function(){$('html,body').animate({scrollTop: '0px'}, 800);}); //页面滚动至顶部 $('. ...

  8. 097、如何实现Service伸缩?(Swarm04)

    参考https://www.cnblogs.com/CloudMan6/p/7885667.html   上一节部署了只有一个副本的Service,不过对于web服务,我们通常会运行多个实例,这样可以 ...

  9. 运维LVS三种模式十种调度算法

    一.LVS简介 LVS(Linux Virtual Server)即Linux虚拟服务器,是由章文嵩博士主导的开源负载均衡项目,目前LVS已经被集成到Linux内核模块中.该项目在Linux内核中实现 ...

  10. ELK-全文检索技术-使用总结

    一.概念 1.1 基础概念 ELK: 是ElasticSearch,LogStash以及Kibana三个产品的首字母缩写 lucene : apache 的全文搜索引擎工具包 elasticsearc ...