How can a Python crawler scrape Zhihu topics?
I'm building 观点 (Guandian), whose "rooms" are similar to Zhihu's topics, so I needed a way to crawl them. After quite a bit of fiddling I finally got it working. The code is written in Python; if you don't know the language, please pick it up on your own, and if you do, just read the code below — it definitely works.
#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true (non-truncating) division
from Queue import Queue
from __builtin__ import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {
'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With':'XMLHttpRequest',
'Referer':'https://www.zhihu.com/topics',
'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue = Queue()  # receiving queue (topics waiting to be crawled)
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base="" conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
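    # Fetch a URL with a 3-second timeout and return the raw HTML, or None on any error.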
try:
req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should really be added here
html = response.read()
return html
except:
pass
    return None

def getTopics():
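    # Scrape the category list from /topics and insert any new (data_id, name) pairs into classify_new.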
url = 'https://www.zhihu.com/topics'
print url
try:
req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be added here as well
html = response.read().decode('utf-8')
print html
soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})
        for li in lis:
data_id=li.get('data-id')
name=li.text
curr.execute('select id from classify_new where name=%s',(name))
y= curr.fetchone()
if not y:
curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
conn.commit()
except Exception as e:
print "get topic error",e def get_extension(name):
where=name.rfind('.')
if where!=-1:
return name[where:len(name)]
    return None

def which_platform():
sys_str = platform.system()
    return sys_str

def GetDateString():
when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
foldername = str(when)
    return foldername

def makeDateFolder(par, classify):
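    # Create par/<today's date>/<classify> if it does not exist yet and return the folder path (None on failure).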
try:
if os.path.isdir(par):
newFolderName=par + '//' + GetDateString() + '//' +str(classify)
if which_platform()=="Linux":
newFolderName=par + '/' + GetDateString() + "/" +str(classify)
if not os.path.isdir( newFolderName ):
os.makedirs( newFolderName )
return newFolderName
else:
return None
except Exception,e:
print "kk",e
        return None

def download_img(url, classify):
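    # Download a topic avatar into the dated folder and return the relative path that gets stored in the DB.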
try:
extention=get_extension(url)
if(extention is None):
return None
req = urllib2.Request(url)
resp = urllib2.urlopen(req,None,3)
dataimg=resp.read()
name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
top="E://topic_pic"
folder=makeDateFolder(top, classify)
filename=None
if folder is not None:
filename =folder+"//"+name
try:
if "e82bab09c_m" in str(url):
return True
if not os.path.exists(filename):
file_object = open(filename,'w+b')
file_object.write(dataimg)
file_object.close()
return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
else:
print "file exist"
return None
except IOError,e1:
print "e1=",e1
pass
except Exception as e:
print "eee",e
pass
    return None  # if the image was not downloaded, fall back to the original site's link

def getChildren(node, name):
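    # Fetch the topic's /hot page and push any child topics that are not yet in the rooms table onto the queue.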
global queue,nodeSet
try:
url="https://www.zhihu.com/topic/"+str(node)+"/hot"
html=get_html(url)
if html is None:
return
soup = BeautifulSoup(html)
p_ch='父话题'
node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
topic_cla=soup.find('div', {'class' : 'child-topic'})
if topic_cla is not None:
try:
p_ch=str(topic_cla.text)
                aList = soup.find_all('a', {'class': 'zm-item-tag'})  # grab all child topic links
if u'子话题' in p_ch:
for a in aList:
token=a.get('data-token')
a=str(a).replace('\n','').replace('\t','').replace('\r','')
start=str(a).find('>')
end=str(a).rfind('</a>')
new_node=str(str(a)[start+1:end])
                        curr.execute('select id from rooms where name=%s', (new_node))  # make sure the name is not already taken
y= curr.fetchone()
if not y:
print "y=",y,"new_node=",new_node,"token=",token
queue.put((token,new_node,node_name))
except Exception as e:
print "add queue error",e
except Exception as e:
print "get html error",e def getContent(n,name,p,top_id):
try:
global counter
        curr.execute('select id from rooms where name=%s', (name))  # make sure the name is not already taken
y= curr.fetchone()
print "exist?? ",y,"n=",n
if not y:
url="https://www.zhihu.com/topic/"+str(n)+"/hot"
html=get_html(url)
if html is None:
return
soup = BeautifulSoup(html)
title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
description=soup.find('div',{'class':'zm-editable-content'})
if description is not None:
                description = description.text
            if (u"未归类" in title or u"根话题" in title):  # still allow these into the DB, to avoid an infinite loop
                description = None
            tag_path = download_img(pic_path, top_id)
print "tag_path=",tag_path
if (tag_path is not None) or tag_path==True:
if tag_path==True:
tag_path=None
                father_id = 2  # default parent is the "杂谈" (misc) room
curr.execute('select id from rooms where name=%s',(p))
results = curr.fetchall()
for r in results:
father_id=r[0]
name=title
                curr.execute('select id from rooms where name=%s', (name))  # make sure the name is not already taken
y= curr.fetchone()
print "store see..",y
if not y:
friends_num=0
temp = time.time()
x = time.localtime(float(temp))
create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now
create_time
creater_id=None
room_avatar=tag_path
is_pass=1
has_index=0
reason_id=None
#print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                    ###################### content that qualifies for insertion into the DB
counter=counter+1
curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
                    conn.commit()  # must commit right away, otherwise the parent node cannot be found later
if counter % 200==0:
print "current node",name,"num",counter
except Exception as e:
print "get content error",e def work():
global queue
curr.execute('select id,node,parent,name from classify where status=1')
results = curr.fetchall()
for r in results:
top_id=r[0]
node=r[1]
parent=r[2]
name=r[3]
try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                n, name, p = queue.get()  # pop the head node (id, name, parent)
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the children of the node we just popped
conn.commit()
except Exception as e:
print "what's wrong",e def new_work():
global queue
curr.execute('select id,data_id,name from classify_new_copy where status=1')
results = curr.fetchall()
for r in results:
top_id=r[0]
data_id=r[1]
name=r[2]
try:
get_topis(data_id,name,top_id)
except:
            pass

def get_topis(data_id, name, top_id):
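    # Page through the TopicsPlazzaListV2 endpoint 20 topics at a time and crawl each topic tree found there.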
global queue
url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
isGet = True;
offset = -20;
data_id=str(data_id)
while isGet:
offset = offset + 20
values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
try:
msg=None
try:
data = urllib.urlencode(values)
request = urllib2.Request(url,data,headers)
response = urllib2.urlopen(request,None,5)
html=response.read().decode('utf-8')
json_str = json.loads(html)
ms=json_str['msg']
if len(ms) <5:
break
msg=ms[0]
except Exception as e:
print "eeeee",e
#print msg
if msg is not None:
soup = BeautifulSoup(str(msg))
blks = soup.find_all('div', {'class' : 'blk'})
for blk in blks:
page=blk.find('a').get('href')
if page is not None:
node=page.replace("/topic/","") #将更多的种子入库
parent=name
ne=blk.find('strong').text
try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the head node (id, name, parent)
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the children of the node we just popped
conn.commit()
except Exception as e:
print "what's wrong",e
except urllib2.URLError, e:
print "error is",e
            pass

if __name__ == '__main__':
i=0
while i<400:
new_work()
i=i+1
A word about the database: I'm not attaching a dump here — just create the tables yourself from the fields used in the code, since it really is simple. I used MySQL; build whatever fits your own needs.
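To save a bit of guesswork, here is a minimal sketch of the tables the script touches, with the columns inferred from the INSERT/SELECT statements above. The column types and lengths are my own assumptions (the classify table used only by work() is left out), so adjust everything to your own needs:

# One-off setup script (a sketch; assumes the 'zhihu' database already exists, as in the crawler above).
import MySQLdb as mdb

DDL = """
CREATE TABLE IF NOT EXISTS classify_new (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255)
) DEFAULT CHARSET=utf8;

CREATE TABLE IF NOT EXISTS classify_new_copy (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255),
    status TINYINT DEFAULT 1
) DEFAULT CHARSET=utf8;

CREATE TABLE IF NOT EXISTS rooms (
    id INT AUTO_INCREMENT PRIMARY KEY,
    father_id INT,
    name VARCHAR(255),
    friends_num INT,
    description TEXT,
    create_time DATETIME,
    creater_id INT,
    room_avatar VARCHAR(255),
    is_pass TINYINT,
    has_index TINYINT,
    reason_id INT
) DEFAULT CHARSET=utf8;
"""

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
cur = conn.cursor()
for statement in DDL.split(';'):
    if statement.strip():
        cur.execute(statement)  # create each table if it is missing
conn.commit()
conn.close()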
If anything is unclear, come find me at 去转盘网 (which I also built); the QQ group number is kept up to date there. I won't post a QQ number here in case the system blocks the post.