多线程网页爬虫 python 实现（二）

#!/usr/bin/env python

#coding=utf-8

import threading

import urllib

import re

import time

# Crawl-progress counters shared by the crawler and its worker threads
# (they are rebound through ``global`` statements elsewhere in this file):
#   cur        - bumped once per URL that has been processed
#   last       - value of ``cur`` at which the depth counter is advanced
#   totalcount - running total of URLs that have entered the queue
#   depth      - breadth-first level currently being reported in the log
cur = last = totalcount = depth = 0

# Lock that worker threads hold while touching the counters above, the
# crawl queue and the log file.
t_mutex = threading.Condition()

class Mycrawler:
    """Breadth-first crawler that feeds URLs from a ``CrawQueue`` to fetchers.

    ``crawling`` processes the queue in the calling thread; ``crawling2``
    hands the URLs out to ``crawlerThread`` workers in batches of
    ``threadnum``.  Every visited URL and every depth change is written to
    ``log2.txt`` in the current working directory.
    """

    def __init__(self, crawlername, seeds, threadnum):
        """Create the crawler, queue the seed URL(s) and open the log file.

        crawlername -- label stored on the instance.
        seeds       -- one URL string, or a list/tuple of URL strings.
        threadnum   -- number of worker threads started per batch by
                       ``crawling2``.
        """
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        # ``open`` instead of the Python-2-only ``file`` builtin.
        self.logfile = open('log2.txt', 'w')

    def initQueue(self, seeds):
        """Push the seed URL(s) and initialise the depth bookkeeping."""
        global last
        global totalcount

        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, (list, tuple)):
            for seed in seeds:
                self.crawqueue.push(seed)

        # Everything queued so far belongs to depth 0.
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def getLinks(self, url):
        """Download ``url`` and return the links found in it, or None.

        Delegates to ``crawlerThread.getLinks`` so that the single-threaded
        and the threaded crawl share one download implementation.  The
        thread object is only used as a holder and is never started.
        """
        return crawlerThread(url, 0, self).getLinks(url)

    def crawling(self):
        """Crawl the whole queue sequentially in the calling thread."""
        global cur
        global depth
        global last
        global totalcount

        self.log(">>>Depth "+str(depth)+":\n")
        try:
            while self.crawqueue.getQueueCount() != 0:
                url = self.crawqueue.pop()
                if url is None:
                    continue
                # Log only after the None check: log() concatenates strings.
                self.log(url)
                self.crawqueue.addToVisited(url)

                links = self.getLinks(url)
                if links is None:
                    print('None')
                    self.crawqueue.addToFailed(url)
                else:
                    beforenum = self.crawqueue.getQueueCount()
                    self.crawqueue.addLinks(links)
                    afternum = self.crawqueue.getQueueCount()
                    totalcount += afternum-beforenum

                # A failed URL still counts as processed (the threaded
                # worker does the same); otherwise ``cur`` could never
                # reach ``last`` and the depth marker would stop advancing.
                cur += 1
                if cur == last:
                    depth += 1
                    self.log(">>>Depth "+str(depth)+":\n")
                    last = totalcount
        finally:
            self.logfile.flush()

    def crawling2(self):
        """Crawl the queue with batches of ``threadnum`` worker threads.

        Each worker is waited on for at most 30 seconds, so a hung download
        delays one batch only.  Raises ValueError when ``threadnum`` is
        smaller than 1, because no URL would ever be taken off the queue.
        """
        global last
        global totalcount
        global depth

        if self.threadnum < 1:
            raise ValueError(
                "threadnum must be at least 1, got %r" % (self.threadnum,))

        self.log(">>>Depth "+str(depth)+":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

        try:
            while self.crawqueue.getQueueCount() != 0:
                # Track the current batch only: re-joining a worker that
                # already timed out would cost another 30 s in every round.
                del self.threadpools[:]
                for i in range(self.threadnum):
                    # Workers that outlived their join timeout may still be
                    # adding links, so the pop has to happen under the lock.
                    with t_mutex:
                        url = self.crawqueue.pop()
                    if url is None:
                        break
                    crawthread = crawlerThread(url, i, self)
                    self.threadpools.append(crawthread)
                    crawthread.start()

                for crawthread in self.threadpools:
                    crawthread.join(30)
        finally:
            with t_mutex:
                self.logfile.flush()

    def log(self, content):
        """Append ``content`` plus a newline to the log file."""
        self.logfile.write(content+"\n")

    def close(self):
        """Close the log file; ``log`` must not be called afterwards."""
        self.logfile.close()

class crawlerThread(threading.Thread):
    """Worker thread that downloads one URL and queues the links it finds."""

    # Compiled once for all workers: any double-quoted absolute http:// URL.
    _LINK_RE = re.compile(r'"(http://.+?)"', re.DOTALL)

    def __init__(self, url, tid, mycrawler):
        """Remember the URL to fetch, a thread index and the owning crawler.

        url       -- address this worker downloads.
        tid       -- index of the worker inside its batch.
        mycrawler -- crawler whose ``crawqueue`` and ``log`` are used.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        """Fetch ``self.url``, then update queue and counters under the lock."""
        global cur
        global last
        global totalcount
        global depth

        # ``with`` guarantees the lock is released even if a call raises;
        # a lock left held would block every other worker forever.
        with t_mutex:
            self.mycrawler.log(self.url)

        # The download itself runs outside the lock so workers overlap.
        links = self.getLinks(self.url)

        with t_mutex:
            crawqueue = self.mycrawler.crawqueue
            crawqueue.addToVisited(self.url)
            if links is None:
                crawqueue.addToFailed(self.url)
            else:
                beforenum = crawqueue.getQueueCount()
                crawqueue.addLinks(links)
                afternum = crawqueue.getQueueCount()
                totalcount += afternum-beforenum

            # Failed URLs count as processed too, so ``cur`` can reach
            # ``last`` and the depth marker keeps advancing.
            cur += 1
            if cur == last:
                depth += 1
                self.mycrawler.log(">>>Depth "+str(depth)+":\n")
                last = totalcount

    def getLinks(self, url):
        """Download ``url`` and return the list of quoted http:// links.

        Returns None, after printing a message, when the download fails.
        """
        try:
            page = urllib.urlopen(url)
            try:
                html = page.read()
            finally:
                # Always release the connection, even if read() fails.
                page.close()
        except Exception:
            # Best effort: any download error marks the URL as failed, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('Failed downloading and saving %s' % (url,))
            return None
        return self._LINK_RE.findall(html)

class CrawQueue:
    """FIFO queue of pending URLs plus lists of visited and failed URLs.

    New URLs are inserted at the front of ``queue`` and taken from the
    back, so URLs leave in the order in which they were pushed.
    """

    def __init__(self):
        """Start with no pending, visited or failed URLs."""
        self.queue, self.visited, self.failed = [], [], []

    def getQueue(self):
        """Return the list of pending URLs (the list itself, not a copy)."""
        return self.queue

    def getVisited(self):
        """Return the list of visited URLs (the list itself, not a copy)."""
        return self.visited

    def getFailed(self):
        """Return the list of failed URLs (the list itself, not a copy)."""
        return self.failed

    def push(self, url):
        """Queue ``url`` unless it is empty, already pending or visited."""
        if url == "" or url in self.queue or url in self.visited:
            return
        self.queue.insert(0, url)

    def pop(self):
        """Remove and return the oldest pending URL, or None when empty."""
        if not self.queue:
            return None
        return self.queue.pop()

    def isEmpty(self):
        """Return 1 when nothing is pending, otherwise 0."""
        return 0 if self.queue else 1

    def addToVisited(self, url):
        """Record ``url`` as visited."""
        self.visited.append(url)

    def addToFailed(self, url):
        """Record ``url`` as failed."""
        self.failed.append(url)

    def remove(self, url):
        """Drop ``url`` from the pending list (ValueError if absent)."""
        self.queue.remove(url)

    def getVisitedCount(self):
        """Return how many URLs have been recorded as visited."""
        return len(self.visited)

    def getQueueCount(self):
        """Return how many URLs are still pending."""
        return len(self.queue)

    def addLinks(self, links):
        """Push every URL in ``links``, applying the same filter as push()."""
        for candidate in links:
            self.push(candidate)

if __name__=="__main__":
    # Script entry point: crawl outward from one seed URL with a
    # user-chosen number of worker threads.
    seeds="http://www.douban.com/"

    # Keep asking until a positive integer is entered: non-numeric input
    # used to abort with a traceback, and a count below 1 makes the
    # threaded crawl loop forever because no URL is ever dequeued.
    while True:
        try:
            threadnum=int(raw_input("设置线程数:"))
        except ValueError:
            threadnum=0
        if threadnum>=1:
            break
        print("线程数必须是大于 0 的整数")

    crawlername="小小爬虫"
    mycrawler=Mycrawler(crawlername,seeds,threadnum)
    mycrawler.crawling2()

多线程网页爬虫 python 实现（二）的更多相关文章

多线程网页爬虫 python 实现
采用了多线程和锁机制,实现了广度优先算法的网页爬虫. 对于一个网络爬虫,如果要按广度遍历的方式下载,它就是这样干活的: 1.从给定的入口网址把第一个网页下载下来 2.从 ...
python网页爬虫开发之二
1.网站robots robotparser模块首先加载robots.txt文件,然后通过can_fetch()函数确定指定的用户代理是否允许访问网页. 2.识别网站技术 3.下载网页使用urlli ...
python 网页爬虫+保存图片+多线程+网络代理
今天,又算是浪费了一天了.python爬虫,之前写过简单的版本,那个时候还不懂原理,现在算是收尾吧. 以前对网页爬虫不了解,感觉非常神奇,但是解开这面面纱,似乎里面的原理并不是很难掌握.首先,明白一个 ...
Python爬虫初学（二）—— 爬百度贴吧
Python爬虫初学(二)-- 爬百度贴吧昨天初步接触了爬虫,实现了爬取网络段子并逐条阅读等功能,详见Python爬虫初学(一). 今天准备对百度贴吧下手了,嘿嘿.依然是跟着这个博客学习的,这次仿照 ...
Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱（转）
原文:http://www.52nlp.cn/python-网页爬虫-文本处理-科学计算-机器学习-数据挖掘曾经因为NLTK的缘故开始学习Python,之后渐渐成为我工作中的第一辅助脚本语言,虽然开 ...
Python网页爬虫（一）
很多时候我们想要获得网站的数据,但是网站并没有提供相应的API调用,这时候应该怎么办呢?还有的时候我们需要模拟人的一些行为,例如点击网页上的按钮等,又有什么好的解决方法吗?这些正是python和网页爬 ...
Python爬虫学习：二、爬虫的初步尝试
我使用的编辑器是IDLE,版本为Python2.7.11,Windows平台. 本文是博主原创随笔,转载时请注明出处Maple2cat|Python爬虫学习:二.爬虫的初步尝试 1.尝试抓取指定网页 ...
【Python】Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
本文转载自:https://www.cnblogs.com/colipso/p/4284510.html 好文 mark http://www.52nlp.cn/python-%E7%BD%91%E9 ...
Python 3实现网页爬虫
1 什么是网页爬虫网络爬虫( 网页蜘蛛,网络机器人,网页追逐者,自动索引,模拟程序)是一种按照一定的规则自动地抓取互联网信息的程序或者脚本,从互联网上抓取对于我们有价值的信息.Tips:自动提取网页 ...

随机推荐

JavaScript注释
JavaScript注释有两种方式: 1.单行注释. 2.多行注释. 单行注释单行注释以“//”开头. <script type="text/javascript"> ...
HDU 4283 区间DP You Are the One
题解我使用记忆化搜索写的.
mac下secureCRT 客户端 $redis-cli回车后没有反应的解决办法
启动redis server后,SecureCRT进入redis-cli,输入不断在后面追加IP:Port显示设置当前的Session Options-->Terminal-->Emula ...
WCF学习-协议绑定
文章:无废话WCF入门教程三[WCF的宿主] 讲了net.tcp协议的wcf绑定.
【bzoj3671】[Noi2014]随机数生成器贪心
题目描述输入第1行包含5个整数,依次为 x_0,a,b,c,d ,描述小H采用的随机数生成算法所需的随机种子.第2行包含三个整数 N,M,Q ,表示小H希望生成一个1到 N×M 的排列来填入她 N ...
算法复习——虚树（消耗战bzoj2286）
题目: Description 在一场战争中,战场由n个岛屿和n-1个桥梁组成,保证每两个岛屿间有且仅有一条路径可达.现在,我军已经侦查到敌军的总部在编号为1的岛屿,而且他们已经没有足够多的能源维系战 ...
Win7开启SNMP服务
通过SNMP监控Windows主机需要在被监控的服务器上安装简单网络管理协议(SNMP)的Windows组件,以Windows 7系统为例: 首先,在控制面板中找到“卸载程序”: 在弹出的窗口中单击“ ...
【2018.10.27】CXM笔记
一个数大约有 $O(\sqrt{n}/\log^2 n)$ 个约数. 1. 一个棋盘,每个格子最开始都是白的.可以按一个格子,它马跳(日字跳)能到达的 $8$ 个格子反色(当前格不反色).问有多少种方 ...
win10安装virtualbox发生严重错误
转载自:http://blog.csdn.net/ljw124213/article/details/50545101 Windows 10 系统在安装VirtualBox即将完毕时,突然回退,提示错 ...
栅格网络流（cogs 750）
[问题描述] Bob 觉得一般图的最大流问题太难了,他不知道如何解决,于是他想尝试一个简单点的:栅格网络中的最大流问题,这个虽说简单了一点,但对 Bob 来说依旧太难,现在他有个麻烦需要你帮忙:给你一 ...

多线程网页爬虫 python 实现（二）

多线程网页爬虫 python 实现（二）的更多相关文章

随机推荐

热门专题