出于好奇,想知道那些10w+的公众号都写了些什么,于是我写了几个脚本爬取了各行业Top的公众号文章,并进行了关键词统计。

抓取数据、分析用到了3种语言:Node.js,Java,Python。废话不多说,直接上代码。

1(NODEJS)

puppeteer模拟登录,抓取微信公众号链接:

/**
 * Load WeChat article URLs from newrank.cn.
 *
 * Logs in with Puppeteer, visits every category tab of the monthly ranking
 * page and writes the following files below `workPath`:
 *   <category>.txt                       text of the ranking table rows
 *   id_<category>.txt                    one account id per line
 *   <category>/<id>_top_<name>.txt       URLs from the "top" article list
 *   <category>/<id>_lastest_<name>.txt   URLs from the "lastest" article list
 * (`<category>` is the innerHTML of the clicked tab button.)
 **/
const fs = require("fs");
const puppeteer = require('puppeteer');

// Desktop Chrome (Windows 10) user agent applied to every tab.
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';
const workPath = './newrank_cn1111';
// Credentials may be supplied through the environment; the literals are the fallback.
const userName = process.env.NEWRANK_USER || "公众号";
const ppwwdd = process.env.NEWRANK_PASSWORD || "caiyongji";
const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/';
const monthlyRankUrl = "https://www.newrank.cn/public/info/list.html?period=month&type=data";
const detailUrl = "https://www.newrank.cn/public/info/detail.html?account=";

// `icon` attribute values of the category tabs, in crawl order.
const categoryIcons = [
    'ss', 'mgs', 'cf', 'kj', 'cy', 'qc', 'ls', 'zc', 'jy', 'xs', 'zw', 'qy',
    'wh', 'bk', 'jk', 'shs', 'ms', 'sj', 'lx', 'ym', 'qg', 'ty', 'mt', 'zs',
];

if (!fs.existsSync(workPath)) {
    fs.mkdirSync(workPath);
}

(async () => {
    // Set headless: true to hide the Chromium UI.
    const browser = await puppeteer.launch({headless: false});
    try {
        const page = await openTab();
        await page.goto(loginUrl);
        await loginOperate(page);

        for (const icon of categoryIcons) {
            await processMonthlyRank('.wx-right-type-list-spe a[icon=' + icon + ']');
        }
        await processMonthlyRank('#wx_month_all');
    } finally {
        // Always release the browser, otherwise the Node process never exits.
        await browser.close();
    }

    /** Resolve after `ms` milliseconds. */
    function delay(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /**
     * Open a new tab with the shared user agent and viewport, and with image
     * requests aborted.
     * @returns {Promise<object>} the configured Puppeteer page
     */
    async function openTab() {
        const tab = await browser.newPage();
        await tab.setUserAgent(userAgent);
        await tab.setViewport({width: 1920, height: 1000});
        await tab.setRequestInterception(true); // filter to block images
        tab.on('request', request => {
            if (request.resourceType() === 'image') {
                request.abort();
            } else {
                request.continue();
            }
        });
        return tab;
    }

    /**
     * Fill in and submit the password login form on `page`.
     * Each step is best-effort: a failure is logged and the next step still runs.
     */
    async function loginOperate(page) {
        try {
            await page.click('div[data-type=pwd]');
        } catch (err) {
            console.log('login#1', err.message);
        }
        try {
            await page.type('#account_input', userName);
            await page.type('#password_input', ppwwdd);
        } catch (err) {
            console.log('login#2', err.message);
        }
        try {
            await page.click('#pwd_confirm');
        } catch (err) {
            console.log('login#3', err.message);
        }
        // Give the login request time to finish before other tabs are opened.
        await waitSecond(page);
    }

    /**
     * Crawl one ranking tab.
     * @param {string} btn CSS selector of the tab button to click; its
     *        innerHTML is used as the category/file name.
     */
    async function processMonthlyRank(btn) {
        const tab = await openTab();
        try {
            await tab.goto(monthlyRankUrl);
            try {
                await tab.click(btn);
            } catch (err) {
                console.log('processMonthlyRank#1');
            }
            const fileName = await tab.evaluate(selector => {
                const button = document.querySelector(selector);
                return button ? button.innerHTML : null;
            }, btn);
            if (fileName === null) {
                console.log('processMonthlyRank: no element matches ' + btn + ', skipped.');
                return;
            }
            console.log('-------------------------' + fileName + '-------------------------');
            await scrollWait(tab);
            await waitSecond(tab);

            const rows = await tab.$$eval('.wx_main tr', els => els.map(el => el.innerText));
            console.log('total rows: ' + rows.length);
            // The first row is not written (presumably the table header).
            const dataRows = rows.slice(1);
            const contents = '记录条数' + dataRows.length + '\n\n'
                + dataRows.map(row => row + '\n\n').join('');
            fs.writeFileSync(workPath + '/' + fileName + '.txt', contents);
            console.log(fileName + " has been extracted to local.");

            // The matched links alternate: even index is treated as the
            // account name, odd index as the account id.
            const ids = await tab.$$eval('.wx_main tr a[href^="detail.html"]',
                els => els.map(el => el.innerText));
            let idContents = '';
            for (let i = 1; i < ids.length; i += 2) {
                const accountName = ids[i - 1];
                const accountId = ids[i];
                idContents += accountId + '\n';
                try {
                    await getDetail(fileName, accountName, accountId);
                } catch (err) {
                    // One broken account must not abort the whole crawl.
                    console.log('getDetail failed for ' + accountId + ': ' + err.message);
                }
            }
            const idFile = 'id_' + fileName;
            fs.writeFileSync(workPath + '/' + idFile + '.txt', idContents);
            console.log(idFile + " has been extracted to local.");
        } finally {
            await tab.close();
        }
    }

    /**
     * Scroll page `p` to the bottom `n` times (default 5), pausing 500 ms
     * after each scroll.
     */
    async function scrollWait(p, n) {
        const rounds = n == null ? 5 : n;
        for (let i = 0; i < rounds; i++) {
            try {
                await p.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
            } catch (err) {
                console.log('scrollWait: scroll failed: ' + err.message);
            }
            console.log('scroll to bottom and then wait 500 ms.');
            await delay(500);
        }
    }

    /** Pause for two seconds. The page argument is kept for call-site compatibility. */
    async function waitSecond(p) {
        await delay(2000);
    }

    /**
     * Save the article links shown on an account's detail page.
     * @param {string} cat  category name; used as the sub-folder of workPath
     * @param {string} name account display name (part of the file name)
     * @param {string} id   account id appended to detailUrl
     */
    async function getDetail(cat, name, id) {
        const tab = await openTab();
        try {
            await tab.goto(detailUrl + id);
            await waitSecond(tab);

            const folder = workPath + '/' + cat;
            if (!fs.existsSync(folder)) {
                fs.mkdirSync(folder);
            }

            const topLinks = await tab.$$eval('#info_detail_article_top li .title a',
                els => els.map(el => el.href));
            fs.writeFileSync(folder + '/' + id + '_top_' + name + '.txt',
                topLinks.map(href => href + "\n").join(''));

            // "lastest" is the spelling used by the page's element id; the
            // file name keeps it too.
            const latestLinks = await tab.$$eval('#info_detail_article_lastest li .title a',
                els => els.map(el => el.href));
            fs.writeFileSync(folder + '/' + id + '_lastest_' + name + '.txt',
                latestLinks.map(href => href + "\n").join(''));

            console.log(id + ' ' + name + ' has been extracted to local.');
        } finally {
            await tab.close();
        }
    }
})().catch(err => {
    console.error(err);
    process.exitCode = 1;
});

  

 

2(JAVA)

Jsoup抓取微信文章文本:

package com;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Downloads the body text of WeChat articles.
 *
 * Input layout under READ_URLS_FOLDER:
 *   id_&lt;category&gt;.txt                        one account id per line
 *   &lt;category&gt;/&lt;id&gt;_top_&lt;name&gt;.txt        one article URL per line
 *   &lt;category&gt;/&lt;id&gt;_lastest_&lt;name&gt;.txt    one article URL per line
 *
 * Output under WORK_FOLDER:
 *   &lt;category&gt;/(top|latest)/&lt;id&gt;_&lt;name&gt;_&lt;n&gt;.txt
 *
 * One instance handles one id_&lt;category&gt;.txt file; main() submits an
 * instance per category file to a fixed thread pool.
 */
public class WeChatUrls extends Thread {
    private File catFile;
    final static Integer ThreadNum = 1;
    /** Written as the article content when a URL cannot be fetched. */
    final String ERROR = "ERROR";
    private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
    private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";
    private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn";

    private final static String ID_FILE_PREFIX = "id_";
    private final static String TXT_SUFFIX = ".txt";
    private final static String TOP_MARKER = "_top_";
    // Spelling as it appears in the input file names.
    private final static String LATEST_MARKER = "_lastest_";

    public WeChatUrls(File cat) {
        this.catFile = cat;
    }

    /**
     * Fetches a URL and returns the text of its body element.
     *
     * @param url article URL
     * @return the body text, or ERROR when the request fails
     */
    private String getUrlProxyContent(String url) {
        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
            return doc.select("body").text();
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException: Jsoup rejects malformed URLs.
            System.out.println("ERROR URL: " + url);
            e.printStackTrace();
            return ERROR;
        }
    }

    /**
     * Writes content to fileName, replacing any existing file and creating
     * missing parent folders. Failures are printed, not thrown.
     */
    private void write(String content, String fileName) {
        File f = new File(fileName).getAbsoluteFile();
        File parent = f.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        // FileWriter without the append flag overwrites the file.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(f))) {
            bw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        File[] cataFiles = new File(READ_URLS_FOLDER).listFiles();
        if (cataFiles == null) {
            System.out.println("Cannot list folder: " + READ_URLS_FOLDER);
            return;
        }
        ExecutorService service = Executors.newFixedThreadPool(ThreadNum);
        Arrays.stream(cataFiles)
                .filter(f -> f.isFile() && f.getName().startsWith(ID_FILE_PREFIX))
                .forEach(f -> service.execute(new WeChatUrls(f)));
        service.shutdown();
    }

    /** Removes a trailing ".txt" from name, if present. */
    private static String stripTxt(String name) {
        return name.endsWith(TXT_SUFFIX)
                ? name.substring(0, name.length() - TXT_SUFFIX.length())
                : name;
    }

    /** Reads the non-blank, trimmed lines of file; returns what was read so far on I/O failure. */
    private static List<String> readIds(File file) {
        List<String> ids = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                // A blank id would match every file name as a prefix.
                if (!line.isEmpty()) {
                    ids.add(line);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return ids;
    }

    /**
     * Downloads every URL listed in urlFile to
     * outputDir/filePrefix_&lt;n&gt;.txt, n starting at 1, sleeping a random
     * 500-3000 ms between requests.
     *
     * @return false when the thread was interrupted, true otherwise
     */
    private boolean fetchArticles(File urlFile, String outputDir, String filePrefix) {
        int count = 1;
        try (BufferedReader r = new BufferedReader(new FileReader(urlFile))) {
            String wechatUrl;
            while ((wechatUrl = r.readLine()) != null) {
                wechatUrl = wechatUrl.trim();
                if (wechatUrl.isEmpty()) {
                    continue;
                }
                String writePath = outputDir + File.separator + filePrefix + "_" + (count++) + TXT_SUFFIX;
                write(getUrlProxyContent(wechatUrl), writePath);
                System.out.println(writePath);
                Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Preserve the interrupt so the caller and the pool can stop.
            Thread.currentThread().interrupt();
            return false;
        }
        return true;
    }

    /** Processes every URL file in the category folder that belongs to an id listed in catFile. */
    private void process() {
        String catName = catFile.getName();
        if (!catName.startsWith(ID_FILE_PREFIX)) {
            System.out.println("Not an id file: " + catName);
            return;
        }
        String catagory = stripTxt(catName.substring(ID_FILE_PREFIX.length()));
        File[] urlFiles = new File(READ_URLS_FOLDER + File.separator + catagory).listFiles();
        if (urlFiles == null) {
            return;
        }
        // Read the id list once instead of once per URL file.
        List<String> wechatIds = readIds(catFile);
        String categoryDir = WORK_FOLDER + File.separator + catagory;

        for (File urlFile : urlFiles) {
            if (!urlFile.isFile()) {
                continue;
            }
            String fileName = urlFile.getName();
            for (String wechatId : wechatIds) {
                String kind;
                String prefix;
                // Match the full "<id>_top_" / "<id>_lastest_" prefix so an id
                // that is a prefix of another id, or a name that contains
                // "top", cannot be misclassified.
                if (fileName.startsWith(wechatId + TOP_MARKER)) {
                    kind = "top";
                    prefix = wechatId + TOP_MARKER;
                } else if (fileName.startsWith(wechatId + LATEST_MARKER)) {
                    kind = "latest";
                    prefix = wechatId + LATEST_MARKER;
                } else {
                    continue;
                }
                String wechatName = stripTxt(fileName.substring(prefix.length()));
                boolean completed = fetchArticles(urlFile,
                        categoryDir + File.separator + kind,
                        wechatId + "_" + wechatName);
                if (!completed) {
                    return;
                }
                break;
            }
        }
    }

    @Override
    public void run() {
        process();
    }
}

  

 

3(PYTHON)

wordcloud生成词云:

# -*- coding: utf-8 -*-
"""Build one word-cloud image per category folder from downloaded article text.

For every sub-folder of the working directory, the text files under
``<folder>/top`` are tokenised with jieba, counted, and rendered with
wordcloud using ``back.jpg`` as mask and colour source. The image is shown
and saved as ``./top_<folder>.jpg``.
"""
import json
import random
import time
import os
from pyecharts import Bar,Geo,Line,Overlap
import jieba
try:
    from scipy.misc import imread
except ImportError:
    # Newer SciPy releases no longer ship scipy.misc.imread; matplotlib's
    # imread also returns the image as an array.
    from matplotlib.pyplot import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter

os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles')

# Tokens excluded from the frequency count.
stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序']


def proc(folder, type):
    """Render and save the word cloud for ``./<folder>/<type>``.

    :param folder: name of the category folder below the working directory
    :param type: sub-folder holding the text files (the script passes 'top')

    Prints a message and returns without producing an image when the folder
    does not exist or no usable words are found.
    """
    rootdir = './'+folder+'/'+type
    if not os.path.isdir(rootdir):
        print('skip, folder not found: ' + rootdir)
        return

    fileLines = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if not os.path.isfile(path):
            continue
        try:
            # Read-only; the context manager closes the handle.
            with open(path, 'r') as fo:
                fileLines += fo.readlines()
        except (OSError, UnicodeError):
            print('error while processing file: ' + path)

    _str = ' '.join(fileLines)
    excluded = set(stopWords)
    # Keep tokens longer than one character that are not stop words.
    words_list = [k for k in jieba.cut_for_search(_str)
                  if len(k) > 1 and k not in excluded]
    if not words_list:
        print('skip, no words found in: ' + rootdir)
        return

    back_color = imread('back.jpg')
    wc = WordCloud(background_color='white',
                   max_words=2000,
                   mask=back_color,
                   max_font_size=300,
                   font_path="C:/Windows/Fonts/msyh.ttc",
                   random_state=42
                   )
    _count = Counter(words_list)
    wc.generate_from_frequencies(_count)
    # Colour the words from the mask image.
    image_colors = ImageColorGenerator(back_color)
    wc.recolor(color_func=image_colors)
    # The PIL way (no matplotlib figure needed).
    image = wc.to_image()
    image.show()
    jpgFile = './'+type+'_'+folder+'.jpg'
    image.save(jpgFile)
    print('image File saved:' + jpgFile)


basedir = './'
baselist = os.listdir(basedir)
for entry in baselist:
    p = os.path.join(basedir, entry)
    if os.path.isdir(p):
        proc(os.path.basename(p), 'top')

  

 

4

词云结果涉及23个维度,得出结果如下:

TOP500公众号文章

创业

健康

教育

乐活

企业

情感

体育娱乐

文化

文摘

幽默

政务

旅行

时事

时尚

民生

汽车

百科

科技

美体

美食

职场

财富

文章转自:https://segmentfault.com/r/1250000015997077?shareId=1210000015997081

那些10w+的公众号都在写什么?的更多相关文章

  1. 一个人的公众号,我写了1w+

    大家好,我是Bypass,一个人一直保持着写博客的习惯,为此维护了一个技术公众号,致力于分享原创高质量干货,写的内容主要围绕:渗透测试.WAF绕过.代码审计.应急响应.企业安全. 一直以来,我把它当成 ...

  2. 手机QQ公众号亿级消息实时群发架构

    编者按:高可用架构分享及传播在架构领域具有典型意义的文章,本文由孙子荀分享.转载请注明来自高可用架构公众号 ArchNotes.   孙子荀,2009 年在华为从事内核和分布式系统的开发工作:2011 ...

  3. 微信公众号发送消息模板(java)

    这段时间接触公众号开发,写下向用户发送消息模板的接口调用 先上接口代码 public static JSONObject sendModelMessage(ServletContext context ...

  4. PHP开发微信公众号(二)消息接受与推送

    上一篇文章我们知道怎么获取二维码,这样别人就可以扫描二维码来关注我们,但是别人关注后,发送消息,我们怎么进行相关处理? 这里我们就来学习下怎么处理处理这些消息,以及推送消息. 学习之前首先你需要有一个 ...

  5. spring-boot-route(二十三)开发微信公众号

    在讲微信公众号开发之前,先来大概了解一下微信公众号.微信公众号大体上可以分为服务号和订阅号,订阅号和服务号的区别如下: 服务号可以申请微信支付功能. 服务号只能由企业申请,订阅号可以有企业或个人申请. ...

  6. php开发微信公众号获取信息LBS

    1.一般的公众号都可以在微信公众平台里面设置自定义菜单和自动回复消息,如果需要获取用户位置,则必须开启 服务器配置,当次功能开启后,微信公众平台的自定义菜单和自动回复则失效. 需要通过接口开发来实现微 ...

  7. 微信公众号开发C#系列-9、多公众号集中管理

    1.概述 通过前面8篇关于微信开发相关文章的学习,我们已经对微信常用开发有了一个比较深入的了解.前面的文章都是基于某一特定公众号的,在现实业务中同一单位个体运营着不至一个公众号,此时就需要对多个公众号 ...

  8. 微信公众号UX分析—— 学生作业小结

    1. 不足: 1. 权威性:个人帐号,显得不够正式. 2. 排版问题: + 没有必要的外接端口,界面设计极度缺少排版.哪怕是个人公众号都不至于如此,更何况这是一个学校的教务平台. 3. 反应不及时或无 ...

  9. 如何玩转小程序+公众号?手把手教你JeeWx小程序CMS与公众号关联

    随着微信小程序新功能.新入口的不断更新,小程序的商业价值逐步增强,特别是小程序与公众号的深度融合,已经让小程序成为各行业新的营销渠道.Jeewx平台专注小程序的开发,逐步完善小程序生态圈,通过简单操作 ...

随机推荐

  1. PEP8-python编码规范(下)

    1.结尾逗号 结尾的逗号通常是可选的,除了在构成一个元素的元组时是强制性需要的(在Python 2 中,它们对 print 语句有语义).为了清晰起见,建议将后者用括号括起来(在技术上是多余的). Y ...

  2. python基础--面向对象之封装

    # 在python中用双下划线,开头的方式将属性隐藏起来(设置成私有的) # 但其实这只是一种变形操作,而且仅仅在类定义阶段会发生变形 # 类中所有双下划线开头的如__x都会在类定义的时候自动形成:_ ...

  3. 关于Docx动态控制word模板文件的数据

    博客:https://www.cnblogs.com/24klr/ github: https://github.com/luoruiemail/Dynamic_Word_Web 参考资料:https ...

  4. ${pagecontext.request.contextpath}绝对路径理解

    ${pageContext.request.contextPath}是JSP取得绝对路径的方法,等价于<%=request.getContextPath()%> .也就是取出部署的应用程序 ...

  5. CSS基本样式-文本属性

    字体属性 文本属性呢,自我认为就是写文档的一些格式属性,例如:字体颜色,字体加粗,字体大小,字体类型等,而且我们在输出测试用例报告的时候也可以用到这些属性,对测试报告进行优化. <html> ...

  6. 关于setter 和 getter方法的一些总结(初级)

    1.最基础的set 和 get 准备工作 Person.h @interface Person : NSObject { NSString *_hobby; // ObjC建议成员变量带"_ ...

  7. centos7 源码编译安装 php

    准备工作 下载 PHP 源码包并解压 $ wget https://www.php.net/distributions/php-7.2.19.tar.bz2 $ yum -y install bzip ...

  8. QQ音乐爬虫

    #今日目标 **QQ音乐爬虫** 今天要爬取的是QQ音乐任意歌手的所有音乐歌词,因为笔者是周杰伦的忠实粉丝,所以专门写了个爬虫来爬取他的音乐的歌词,因为他的音乐在咪咕音乐可以听,所以便没有去爬取. 好 ...

  9. javaweb: request.getParameter()、request.setAttribute()与request.getAttribute()的作用 (转)

    出处:https://blog.csdn.net/qq_41937388/article/details/87972914 1.request.getParameter()方法是获取通过类似post, ...

  10. 深入理解java虚拟机(1)走进jvm

    1.JDK:java程序设计语言.java虚拟机.javaAPI 二.自动内存管理机制 ----------------------------------------------------- 1. ...