python爬虫如何爬知乎的话题？

因为要做观点，观点的屋子类似于知乎的话题，所以得想办法把它给爬下来，搞了半天最终还是妥妥地搞定了。代码是用python写的，不懂的麻烦自学哈！懂的直接看代码，绝对可用。

#coding:utf-8

"""

@author:haoning

@create time:2015.8.5

"""

from __future__ import division  # 精确除法

from Queue import Queue

from __builtin__ import False

import json

import os

import re

import platform

import uuid

import urllib

import urllib2

import sys

import time

import MySQLdb as mdb

from bs4 import BeautifulSoup

# Python 2 idiom: re-import ``sys`` so that ``setdefaultencoding`` is
# reachable, then make UTF-8 the default codec used for implicit
# str/unicode conversions throughout this script.
reload(sys)

sys.setdefaultencoding( "utf-8" )

# HTTP headers attached to the POST request built in get_topis().
# NOTE(review): the Cookie value is a hard-coded browser session captured by
# the author; it presumably has to be replaced with a fresh one before the
# script can work -- confirm, and avoid committing real cookies.
headers = {

   'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',

   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',

   'X-Requested-With':'XMLHttpRequest',

   'Referer':'https://www.zhihu.com/topics',

   'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'

}

# MySQL connection settings (hard-coded; the database name 'zhihu' is passed
# directly to mdb.connect below).
DB_HOST = '127.0.0.1'

DB_USER = 'root'

DB_PASS = 'root'

queue= Queue() #receive queue: (token, name, parent name) tuples waiting to be crawled

# NOTE(review): nodeSet, keywordSet, stop, offset, level, maxLevel and base
# are not read by any code visible in this file -- they look like leftovers.
nodeSet=set()

keywordSet=set()

stop=0

offset=-20

level=0

maxLevel=7

# Number of rows inserted into `rooms` so far; incremented by getContent().
counter=0

base=""

# A single connection and cursor are opened at import time and shared by
# every function below. Autocommit is off, so writers call conn.commit().
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')

conn.autocommit(False)

curr = conn.cursor()

def get_html(url):
    """Fetch ``url`` with a plain GET and return the raw response body.

    Args:
        url: Absolute URL to request.

    Returns:
        Whatever ``response.read()`` yields, or ``None`` when opening the
        URL or reading the body fails.
    """
    response = None
    try:
        req = urllib2.Request(url)
        # 3-second timeout; a proxy should be plugged in here if needed.
        response = urllib2.urlopen(req, None, 3)
        return response.read()
    except Exception as e:
        # Best-effort fetch: report the problem and fall through to None.
        # Unlike a bare ``except`` this lets KeyboardInterrupt / SystemExit
        # propagate, so the crawler can still be stopped with Ctrl-C.
        print("get html failed %s %s" % (url, e))
    finally:
        if response is not None:
            response.close()
    return None

def getTopics():

    url = 'https://www.zhihu.com/topics'

    print url

    try:

        req = urllib2.Request(url)

        response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞�

        html = response.read().decode('utf-8')

        print html

        soup = BeautifulSoup(html)

        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

        for li in lis:

            data_id=li.get('data-id')

            name=li.text

            curr.execute('select id from classify_new where name=%s',(name))

            y= curr.fetchone()

            if not y:

                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))

        conn.commit()

    except Exception as e:

        print "get topic error",e

def get_extension(name):
    """Return the file extension of ``name``, including the leading dot.

    Only the last path component is inspected, and any query string or
    fragment is ignored, so a dot in the host name or in a directory
    (``http://a.com/img``) no longer produces a bogus extension such as
    ``.com/img``.

    Args:
        name: A file name, path or URL.

    Returns:
        The extension (for example ``".jpg"``), or ``None`` when the last
        path component contains no dot.
    """
    # Cut off "?query" and "#fragment" so they cannot leak into the result.
    for marker in ('?', '#'):
        cut = name.find(marker)
        if cut != -1:
            name = name[:cut]
    # rfind returns -1 when there is no slash, which makes the slice start
    # at 0 and keeps the whole string.
    basename = name[name.rfind('/') + 1:]
    where = basename.rfind('.')
    if where == -1:
        return None
    return basename[where:]

def which_platform():
    """Report the operating-system name given by ``platform.system()``."""
    return platform.system()

def GetDateString():
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    today = time.localtime()
    return time.strftime('%Y-%m-%d', today)

def makeDateFolder(par,classify):

    try:

        if os.path.isdir(par):

            newFolderName=par + '//' + GetDateString() + '//'  +str(classify)

            if which_platform()=="Linux":

                newFolderName=par + '/' + GetDateString() + "/" +str(classify)

            if not os.path.isdir( newFolderName ):

                os.makedirs( newFolderName )

            return newFolderName

        else:

            return None

    except Exception,e:

        print "kk",e

    return None 

def download_img(url,classify):

    try:

        extention=get_extension(url)

        if(extention is None):

            return None

        req = urllib2.Request(url)

        resp = urllib2.urlopen(req,None,3)

        dataimg=resp.read()

        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention

        top="E://topic_pic"

        folder=makeDateFolder(top, classify)

        filename=None

        if folder is not None:

            filename  =folder+"//"+name

        try:

            if "e82bab09c_m" in str(url):

                return True

            if not os.path.exists(filename):

                file_object = open(filename,'w+b')

                file_object.write(dataimg)

                file_object.close()

                return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name

            else:

                print "file exist"

                return None

        except IOError,e1:

            print "e1=",e1

            pass

    except Exception as e:

        print "eee",e

        pass

    return None #如果没有下载下来就利用原来网站的链接

def getChildren(node,name):

    global queue,nodeSet

    try:

        url="https://www.zhihu.com/topic/"+str(node)+"/hot"

        html=get_html(url)

        if html is None:

            return

        soup = BeautifulSoup(html)

        p_ch='父话题'

        node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text

        topic_cla=soup.find('div', {'class' : 'child-topic'})

        if topic_cla is not None:

            try:

                p_ch=str(topic_cla.text)

                aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点

                if u'子话题' in p_ch:

                    for a in aList:

                        token=a.get('data-token')

                        a=str(a).replace('\n','').replace('\t','').replace('\r','')

                        start=str(a).find('>')

                        end=str(a).rfind('</a>')

                        new_node=str(str(a)[start+1:end])

                        curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同

                        y= curr.fetchone()

                        if not y:

                            print "y=",y,"new_node=",new_node,"token=",token

                            queue.put((token,new_node,node_name))

            except Exception as e:

                print "add queue error",e

    except Exception as e:

        print "get html error",e

def getContent(n,name,p,top_id):

    try:

        global counter

        curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同

        y= curr.fetchone()

        print "exist?? ",y,"n=",n

        if not y:

            url="https://www.zhihu.com/topic/"+str(n)+"/hot"

            html=get_html(url)

            if html is None:

                return

            soup = BeautifulSoup(html)

            title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text

            pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')

            description=soup.find('div',{'class':'zm-editable-content'})

            if description is not None:

                description=description.text

            if (u"未归类" in title or u"根话题" in title): #允许入库，避免死循环

                description=None

            tag_path=download_img(pic_path,top_id)

            print "tag_path=",tag_path

            if (tag_path is not None) or tag_path==True:

                if tag_path==True:

                    tag_path=None

                father_id=2 #默认为杂谈

                curr.execute('select id from rooms where name=%s',(p))

                results = curr.fetchall()

                for r in results:

                    father_id=r[0]

                name=title

                curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同

                y= curr.fetchone()

                print "store see..",y

                if not y:

                    friends_num=0

                    temp = time.time()

                    x = time.localtime(float(temp))

                    create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now

                    create_time

                    creater_id=None

                    room_avatar=tag_path

                    is_pass=1

                    has_index=0

                    reason_id=None

                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id

                    ######################有资格入库的内容

                    counter=counter+1

                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))

                    conn.commit() #必须时时进入数据库，不然找不到父节点

                    if counter % 200==0:

                        print "current node",name,"num",counter

    except Exception as e:

        print "get content error",e       

def work():

    global queue

    curr.execute('select id,node,parent,name from classify where status=1')

    results = curr.fetchall()

    for r in results:

        top_id=r[0]

        node=r[1]

        parent=r[2]

        name=r[3]

        try:

            queue.put((node,name,parent)) #首先放入队列

            while queue.qsize() >0:

                n,p=queue.get() #顶节点出队

                getContent(n,p,top_id)

                getChildren(n,name) #出队内容的子节点

            conn.commit()

        except Exception as e:

            print "what's wrong",e  

def new_work():
    """Crawl every category in ``classify_new_copy`` whose ``status`` is 1.

    Calls ``get_topis(data_id, name, id)`` for each selected row.

    Returns:
        None. A failure in one category is printed and the next category
        is processed.
    """
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id=r[0]
        data_id=r[1]
        name=r[2]
        try:
            get_topis(data_id,name,top_id)
        except Exception as e:
            # Keep going with the next category, but do not hide the
            # failure. KeyboardInterrupt is no longer swallowed.
            print("new_work error %s %s" % (data_id, e))

def get_topis(data_id,name,top_id):

    global queue

    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'

    isGet = True;

    offset = -20;

    data_id=str(data_id)

    while isGet:

        offset = offset + 20

        values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}

        try:

            msg=None

            try:

                data = urllib.urlencode(values)

                request = urllib2.Request(url,data,headers)

                response = urllib2.urlopen(request,None,5)

                html=response.read().decode('utf-8')

                json_str = json.loads(html)

                ms=json_str['msg']

                if len(ms) <5:

                    break

                msg=ms[0]

            except Exception as e:

                print "eeeee",e

            #print msg

            if msg is not None:

                soup = BeautifulSoup(str(msg))

                blks = soup.find_all('div', {'class' : 'blk'})

                for blk in blks:

                    page=blk.find('a').get('href')

                    if page is not None:

                        node=page.replace("/topic/","") #将更多的种子入库

                        parent=name

                        ne=blk.find('strong').text

                        try:

                            queue.put((node,ne,parent)) #首先放入队列

                            while queue.qsize() >0:

                                n,name,p=queue.get() #顶节点出队

                                size=queue.qsize()

                                if size > 0:

                                    print size

                                getContent(n,name,p,top_id)

                                getChildren(n,name) #出队内容的子节点

                            conn.commit()

                        except Exception as e:

                            print "what's wrong",e

        except urllib2.URLError, e:

            print "error is",e

            pass 

if __name__ == '__main__':
    # Run the whole crawl 400 times in a row.
    for _ in range(400):
        new_work()

说下数据库的问题，我这里就不传附件了，看字段自己建立，因为这确实太简单了，我是用的mysql，你看自己的需求自己建。

有什么不懂的麻烦去转盘网找我，因为这个也是我开发的，上面会及时更新qq群号，这里不留qq号啥的，以免被系统给K了。

python爬虫如何爬知乎的话题？的更多相关文章

Python爬虫之爬取慕课网课程评分
BS是什么? BeautifulSoup是一个基于标签的文本解析工具.可以根据标签提取想要的内容,很适合处理html和xml这类语言文本.如果你希望了解更多关于BS的介绍和用法,请看Beautiful ...
[Python爬虫] Selenium爬取新浪微博客户端用户信息、热点话题及评论 (上)
转载自:http://blog.csdn.net/eastmount/article/details/51231852 一. 文章介绍源码下载地址:http://download.csdn.net/ ...
from appium import webdriver 使用python爬虫,批量爬取抖音app视频（requests+Fiddler+appium）
使用python爬虫,批量爬取抖音app视频(requests+Fiddler+appium) - 北平吴彦祖 - 博客园 https://www.cnblogs.com/stevenshushu/p ...
【Python必学】Python爬虫反爬策略你肯定不会吧？
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 正文 Python爬虫反爬策略三部曲,拥有这三步曲就可以在爬虫界立足了: ...
初次尝试python爬虫，爬取小说网站的小说。
本次是小阿鹏,第一次通过python爬虫去爬一个小说网站的小说. 下面直接上菜. 1.首先我需要导入相应的包,这里我采用了第三方模块的架包,requests.requests是python实现的简单易 ...
Python爬虫之爬取站内所有图片
title date tags layut Python爬虫之爬取站内所有图片 2018-10-07 Python post 目标是 http://www.5442.com/meinv/ 如需在非li ...
python爬虫实战---爬取大众点评评论
python爬虫实战—爬取大众点评评论(加密字体) 1.首先打开一个店铺找到评论很多人学习python,不知道从何学起.很多人学习python,掌握了基本语法过后,不知道在哪里寻找案例上手.很多已经 ...
Python爬虫之爬取淘女郎照片示例详解
这篇文章主要介绍了Python爬虫之爬取淘女郎照片示例详解,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友们下面随着小编来一起学习学习吧本篇目标抓取淘宝MM ...
Python 爬虫模拟登陆知乎
在之前写过一篇使用python爬虫爬取电影天堂资源的博客,重点是如何解析页面和提高爬虫的效率.由于电影天堂上的资源获取权限是所有人都一样的,所以不需要进行登录验证操作,写完那篇文章后又花了些时间研究了 ...

随机推荐

这些 Drawable 的小技巧，你都了解吗？
一.前言在 Android 的开发过程中,Drawable 经常会被用到,一般会用 Drawable 为 View 设置一个显示的效果.而在 Android 下,也提供了很多 Drawable 的默 ...
Yahoo网站性能优化的34条军规
1.尽量减少HTTP请求次数终端用户响应的时间中,有80%用于下载各项内容,这部分时间包括下载页面中的图像.样式表.脚本.Flash等.通过减少页面中的元素可以减少HTTP请求的次数,这是提高网页速 ...
ThinkPHP5.0 实现 app支付宝支付功能
前几天做项目,要求要用到支付宝接口,第一次做,弄了好几天各种坑啊,简单写一下我做支付宝支付的过程,希望对也是第一次做支付宝支付的童鞋有帮助, 不懂的可以先去支付平台看一下支付宝支付的文档,我是下的d ...
JAVA IO分析一：File类、字节流、字符流、字节字符转换流
因为工作事宜,又有一段时间没有写博客了,趁着今天不是很忙开始IO之路:IO往往是我们忽略但是却又非常重要的部分,在这个讲究人机交互体验的年代,IO问题渐渐成了核心问题. 一.File类在讲解File ...
java-8u151-64安装与配置环境变量
去oracle官网下载 java jdk for developments(最新发布的java9与java8有很大差别,选择8就够用了) 我是装在默认的C盘里的,直接配置环境变量了新建JAVA_HO ...
NGUI_Input
九.输入框Input 1.凡是用户可以输入文本的地方,几乎都用输入框,有登录账号和密码.输入角色名称.输入聊天内容 2.手动拼接输入框,拖动预制体的就不再说了 (1).创建一个Sprite作为输入框的 ...
NGUI_Button
十.按钮,Button 1.按钮的核心作用: 按钮能够接收单击并触发响应事件按钮单击时能同时触发多个响应事件按钮可以有普通.悬停.单击.禁用等多个状态的不同表现广泛的说,按钮的核心在于接收事件 ...
webrtc视频数据render流程
DotNetCore跨平台~xUnit生成xml报告
在CI/CD流行至极的今天,你的项目没有自动化测试绝对是不可以接受的,在进行自动化部署和持续集成时,我们的dotnet core项目也是可以实现自动化的,之前说过gitlab,jenkins对持续集成 ...
poj：4091:The Closest M Points
poj:4091:The Closest M Points 题目描写叙述每到饭点,就又到了一日几度的小L纠结去哪吃饭的时候了.由于有太多太多好吃的地方能够去吃,而小L又比較懒不想走太远,所以小L会 ...

python爬虫如何爬知乎的话题？

python爬虫如何爬知乎的话题？的更多相关文章

随机推荐

热门专题