python核心编程中网络爬虫的例子

 #!/usr/bin/env python

 import cStringIO                    #

 import formatter                    #

 from htmllib import HTMLParser         # We use various classes in these modules for parsing HTML.

 import httplib                        # We only need an exception from this module

 import os                            # This provides various file system functions

 import sys                            # We are just using argv for command-line arguments

 import urllib                        # We only need the urlretrieve()function for downloading Web pages

 import urlparse                        # We use the urlparse()and urljoin()functions for URL manipulation

 class Retriever(object):

     __slots__ = ('url','file')

     def __init__(self,url):

         self.url, self.file = self.get_file(url)

     def get_file(self, url, default='index.html'):

         'Create usable local filename from URL'

         parsed = urlparse.urlparse(url)                     # ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')

         host = parsed.netloc.split('@')[-1].split(':')[0]    # 'www.baidu.com'

         filepath = '%s%s' % (host,parsed.path)                # 'www.baidu.com'

         if not os.path.splitext(parsed.path)[1]:            # ''

             filepath = os.path.join(filepath, default)        # 'www.baidu.com\\index.html'

         linkdir = os.path.dirname(filepath)                    # 'www.baidu.com'

         if not os.path.isdir(linkdir):                        # False

             if os.path.exists(linkdir):                        # False

                 os.unlink(linkdir)

             os.makedirs(linkdir)                            # make a directory named by link directory on the hard disc

         return url, filepath

     def download(self):

         'Download URL to specific name file'

         try:

             retval = urllib.urlretrieve(self.url, self.file)

         except (IOError, httplib.InvalidURL) as e:

             retval = (('*** ERROR:bad URL "%s": %s' % (self.url,e)),)

         return retval

     def parse_links(self):

         'Parse out the links found in downloaded HTML file'

         f = open(self.file, 'r')

         data = f.read()

         f.close()

         parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO())))

         parser.feed(data)

         parser.close()

         return parser.anchorlist

 class Crawler(object):

     count = 0                                                # the number of objects downloaded from the internet

     def __init__(self, url):

         self.q = [url]                                        # a queue of links to download

         self.seen = set()                                    # a set containing all the links that we have seen(downloaded) already

         parsed = urlparse.urlparse(url)

         host = parsed.netloc.split('@')[-1].split(':')[0]

         self.dom = '.'.join(host.split('.')[-2:])            # 'b.a.i.d.u'

     def get_page(self, url, media=False):

         'Download page & parse links, add to queue if nec'

         r = Retriever(url)

         fname = r.download()[0]                                # 'www.baidu.com\\index.html'

         if fname[0] == '*':                                    # 'w'

             print fname, '... skipping parse'

             return

         Crawler.count += 1                                    #

         print '\n(', Crawler.count, ')'                        # (1)

         print 'URL:', url                                    # URL: http://www.baidu.com

         print 'FILE:', fname                                # FILE: www.baidu.com\\index.html

         self.seen.add(url)                                    # set(['http://www.baidu.com'])

         ftype = os.path.splitext(fname)[1]                    # '.html'

         if ftype not in ('.htm', '.html'):                    # False

             return

         for link in r.parse_links():

             if link.startswith('mailto:'):                    # False

                 print '... discarded, mailto link'

                 continue

             if not media:                                    # False

                 ftype = os.path.splitext(link)[1]

                 if ftype in ('.mp3','.mp4','.m4v','.wav'):

                     print '... discarded, media file'

                     continue

             if not link.startswith('http://'):                # False

                 link = urlparse.urljoin(url, link)

             print '*', link,

             if link not in self.seen:                        # True

                 if self.dom not in link:                    # False

                     print '... discarded, not in domain'

                 else:

                     if link not in self.q:

                         self.q.append(link)

                         print '... new, added to Q'

                     else:

                         print '... discarded, already in Q'

             else:

                 print '... discarded, already processed'

     def go(self, media=False):

         'Process next page in queue (if any)'

         while self.q:

             url = self.q.pop()

             self.get_page(url, media)

 def main():
     """Read a starting URL from argv or the prompt and crawl from it.

     The URL is the first command-line argument if present, otherwise it is
     read interactively.  An empty URL (or Ctrl-C / EOF at the prompt) ends
     the program without crawling.  A URL without a recognised scheme is
     wrapped as ``http://<url>/``.
     """
     if len(sys.argv) > 1:
         url = sys.argv[1]
     else:
         try:
             url = raw_input('Enter starting URL:')
         except (KeyboardInterrupt, EOFError):
             url = ''

     if not url:
         return

     # 'https://' must be recognised too; otherwise such a URL would be
     # rewritten into the invalid 'http://https://.../'.
     if not url.startswith(('http://', 'https://', 'ftp://')):
         url = 'http://%s/' % url

     robot = Crawler(url)
     robot.go()

 # Start the crawler only when this file is run as a script, not on import.
 if __name__ == '__main__':

         main()

python核心编程中网络爬虫的例子的更多相关文章

python核心编程中的对象值比较VS对象身份比较(转载)
转载地址: https://blog.csdn.net/Mluka/article/details/51076786 在python核心编程第四章中,P69在优化下面这段代码时提出了:对象值比较VS对 ...
Python核心编程（网络编程）
1.python socket模块内置方法 2.tcp服务器伪代码 3.tcp客户端伪代码 4.socket模块属性 5.一个简单的tcp客户端和服务端服务端代码: # encoding:utf-8 ...
Python核心编程-描述符
python中,什么描述符.描述符就是实现了"__get__"."__set__"或"__delete__" 方法中至少一个的对象.什么是非 ...
python核心编程第二版笔记
python核心编程第二版笔记由网友提供:open168 python核心编程--笔记(很详细,建议收藏) 解释器options:1.1 –d 提供调试输出1.2 –O 生成优化的字节码(生成 ...
python核心编程--笔记
python核心编程--笔记的解释器options: 1.1 –d 提供调试输出 1.2 –O 生成优化的字节码(生成.pyo文件) 1.3 –S 不导入site模块以在启动时查找pyt ...
Python核心编程第二版(中文).pdf 目录整理
python核心编程目录 Chapter1:欢迎来到python世界!-页码:7 1.1什么是python 1.2起源 :罗萨姆1989底创建python 1.3特点 1.3.1高级 1.3.2面向 ...
Python核心编程的四大神兽：迭代器、生成器、闭包以及装饰器
生成器生成器是生成一个值的特殊函数,它具有这样的特点:第一次执行该函数时,先从头按顺序执行,在碰到yield关键字时该函数会暂停执行该函数后续的代码,并且返回一个值:在下一次调用该函数执行时,程 ...
python核心编程--笔记（不定时更新）(转)
的解释器options: 1.1 –d 提供调试输出 1.2 –O 生成优化的字节码(生成.pyo文件) 1.3 –S 不导入site模块以在启动时查找python路径 1.4 –v ...
python核心编程笔记（转）
解释器options: 1.1 –d 提供调试输出 1.2 –O 生成优化的字节码(生成.pyo文件) 1.3 –S 不导入site模块以在启动时查找python路径 1.4 –v 冗 ...

随机推荐

html自定义垂直导航菜单（加强版--自定义传入menu参数，支持JSONArray、JSArray、JSONObject、JSObject）
在上一篇中我简单写了个html自定义垂直导航菜单,缺点很明显,里面的数据是固定死的,不能动态更改数据. 这里我重写了一个修改版的垂直二级导航菜单,将原先的menuBox.init(config);修改 ...
使用jenkins自动化构建android和ios应用
背景随着业务需求的演进,工程的复杂度会逐渐增加,自动化的践行日益强烈.事实上,工程的自动化一直是我们努力的目标,能有效提高我们的生产效率,最大化减少人为出错的概率,实现一些复杂的业务需求应变.场景如 ...
Angular4+NodeJs+MySQL 入门-04 接口调用类
上一篇文章说一下,后台接口的创建,这篇说一下如果调用接口. 创建一个目录helpers 此目录下有三个文件分别是 ApiClient.ts.clientMiddleware.ts.Core.ts,前面 ...
org.springframework.dao.DataIntegrityViolationException: could not execute statement; SQL [n/a]; constraint [null]; nested exception is org.hibernate.
今天报了这个异常,这是页面报的 org.springframework.dao.DataIntegrityViolationException: could not execute statement ...
Column 'orders' in order clause is ambiguous
今天报了这个错误。原因是:使用sql查询语句通过join查表时,没有指定orders是哪张表的字段;这种情况发生在自关联查询中。
进入保护模式（一）——《x86汇编语言：从实模式到保护模式》读书笔记12
之前已经做了一些理论上的铺垫,这次我们就可以看代码了. 一.代码清单 ;代码清单11-1 ;文件名:c11_mbr.asm ;文件说明:硬盘主引导扇区代码 ;创建日期:2011-5-16 19:54 ...
JS识别不同浏览器信息
众所周知,不同浏览器兼容是不一致的,然而今天我在Coding的时候深深体会到那个痛苦,一样的代码在Firefox里面是没问题的,可以根据索引找到对应的对象元素然后进行操作,但是同样的却获取不到对象元 ...
Windows与Unix思想
Unix与Windows的思想 Unix中的哲学是"一切皆文件",这里的一切皆文件是一个广泛的概念,有一些特殊的设备文件,在/dev目录下物理设备在Unix中就对应一个特殊的设备 ...
vue2.0 饿了么项目学习总结
最近在GitHub上发现一个基于vue2.0的饿了么项目.本着互联网的分享精神,现在将我自己所理解的,所总结的经验分享给大家.本篇文字我将从学习的角度向大家分享. 在学习本项目之前我已经将vue2.0 ...
pod install 出错
今天使用cocoapods的时候在执行pod install出错,如下: 使用很多方法都不行,但是问题感觉应该是需要升级,所有就找到升级cocoapods:sudo gem install -n /u ...

python核心编程中网络爬虫的例子

python核心编程中网络爬虫的例子的更多相关文章

随机推荐

热门专题