aiohttp爬虫的模板，类的形式

 import asyncio
 from timeit import default_timer as timer
 from urllib.parse import urldefrag, urljoin, urlsplit

 import aiohttp
 import async_timeout
 from lxml import html

 from db import DBData

 class Crawler:
     """Depth-limited asynchronous crawler built on aiohttp and asyncio queues.

     Worker tasks take ``(url, depth, retries)`` tuples from ``self.Q``, fetch
     the page and queue the links found on it. Every URL that was fetched
     successfully (or was already in ``self.cache``) is pushed onto
     ``self.db_Q``, which writer tasks drain through ``self.db_data``.
     """

     def __init__(self, **kwargs):
         """Set up queues, counters, the event loop and the DB helper.

         Args:
             **kwargs: ``domains`` (iterable of start URLs, required),
                 ``max_depth`` (int, required), and optionally
                 ``max_retries`` (default 3) and ``max_workers`` (default 10).

         Raises:
             KeyError: If ``domains`` or ``max_depth`` is missing.
         """
         self.domains = kwargs["domains"]
         self.max_depth = kwargs["max_depth"]
         self.max_retries = kwargs.get("max_retries", 3)
         self.max_workers = kwargs.get("max_workers", 10)

         # Resolve the loop before creating the queues: older Python versions
         # bind a Queue to the current loop at construction time, and newer
         # ones raise RuntimeError from get_event_loop() when none is set.
         try:
             self.loop = asyncio.get_event_loop()
         except RuntimeError:
             self.loop = asyncio.new_event_loop()
             asyncio.set_event_loop(self.loop)

         self.Q = asyncio.Queue()
         self.db_Q = asyncio.Queue()
         self.cache = set()  # URLs that were fetched successfully
         self.count = 0  # number of successful fetches
         self.db_data = DBData()
         # NOTE(review): presumably wipes data from a previous crawl —
         # confirm against DBData.clear_crawler.
         self.db_data.clear_crawler()

     async def get(self, url, timeout):
         """Fetch ``url`` and return the response body as text.

         Args:
             url: Address to request with ``self.session``.
             timeout: Value passed to ``async_timeout.timeout``.

         Returns:
             The decoded response text.
         """
         # async_timeout must be entered with ``async with``; the plain
         # ``with`` form is deprecated/removed in current releases.
         async with async_timeout.timeout(timeout):
             async with self.session.get(url) as response:
                 return await response.text()

     async def extract_urls(self, url, timeout=10):
         """Download ``url`` and return the set of links found in ``<a href>``.

         Links are resolved against ``url`` so relative hrefs become
         fetchable, fragments are dropped (they never reach the server), and
         anything that is not http(s) — ``mailto:``, ``javascript:`` and the
         like — is skipped because it cannot be requested.

         Args:
             url: Page to download and parse.
             timeout: Timeout forwarded to :meth:`get`.

         Returns:
             A set of absolute http(s) URLs.
         """
         tree = html.fromstring(await self.get(url, timeout))
         # NOTE: the original restriction of links to ``self.domains`` was
         # commented out, so links to any host are followed.
         links = set()
         for href in tree.xpath("//a/@href"):
             try:
                 absolute, _fragment = urldefrag(urljoin(url, href.strip()))
                 scheme = urlsplit(absolute).scheme
             except ValueError:
                 # Malformed href (e.g. a broken IPv6 literal): ignore it.
                 continue
             if scheme in ("http", "https"):
                 links.add(absolute)
         return links

     async def worker(self):
         """Process queue items forever; meant to be cancelled by :meth:`run`.

         A failed fetch is re-queued until it has been retried
         ``self.max_retries`` times, after which the error is printed. A
         successful fetch records the URL and, while the depth limit allows,
         queues the links found on the page.
         """
         while True:
             url, depth, retries = await self.Q.get()
             # try/finally guarantees exactly one task_done() per get(), so
             # Q.join() cannot hang on an unexpected error.
             try:
                 if url in self.cache:
                     self.db_Q.put_nowait(url)
                     continue
                 try:
                     new_urls = await self.extract_urls(url)
                 except asyncio.CancelledError:
                     # Never swallow cancellation (it is an Exception subclass
                     # on older Python versions).
                     raise
                 except Exception as e:
                     # ``<`` gives exactly max_retries retries after the
                     # first attempt.
                     if retries < self.max_retries:
                         self.Q.put_nowait((url, depth, retries + 1))
                     else:
                         print("Error in %s: %s" % (url, repr(e)))
                     continue
                 self.cache.add(url)
                 self.count += 1
                 self.db_Q.put_nowait(url)
                 print("Depth: %s Retry: %s Visited: %s" % (depth, retries, url))
                 if depth + 1 <= self.max_depth:
                     for x in new_urls:
                         # Each newly found URL starts with a fresh retry
                         # budget instead of inheriting the parent's count.
                         self.Q.put_nowait((x, depth + 1, 0))
             finally:
                 self.Q.task_done()

     async def run(self):
         """Run crawl workers and DB writers until both queues are drained."""
         # The session picks up the running loop itself; the ``loop``
         # argument is deprecated in aiohttp.
         async with aiohttp.ClientSession() as session:
             self.session = session
             coros = [self.worker() for _ in range(self.max_workers)]
             coros += [self.write_to_db() for _ in range(self.max_workers)]
             tasks = [asyncio.ensure_future(c) for c in coros]
             try:
                 # join() tracks unfinished items, so no warm-up sleep is
                 # needed: items queued before run() already count.
                 await self.Q.join()
                 await self.db_Q.join()
             finally:
                 for task in tasks:
                     task.cancel()
                 # Let the cancellations finish so no pending task is
                 # destroyed when the loop closes.
                 await asyncio.gather(*tasks, return_exceptions=True)

     def start(self):
         """Crawl every configured domain in turn, then close the event loop.

         Prints the elapsed time and the number of successful requests made
         for each domain. The crawler cannot be started again afterwards
         because its loop is closed.
         """
         try:
             for domain in self.domains:
                 print("Crawling %s start..." % domain)
                 self.Q.put_nowait((domain, 0, 0))
                 requests_before = self.count
                 start_time = timer()
                 self.loop.run_until_complete(self.run())
                 runtime = timer() - start_time
                 print("Crawling %s end. Exec time: %s. Requests: %s" % (
                     domain, runtime, self.count - requests_before))
         finally:
             # Close once, after all domains: closing inside the loop made
             # the second domain fail with "Event loop is closed".
             self.loop.close()

     async def write_to_db(self):
         """Persist URLs from ``self.db_Q`` forever; cancelled by :meth:`run`.

         A URL is added only when ``self.db_data.check_url`` returns ``None``.
         Errors from the DB helper are printed and the item is still marked
         done, so ``db_Q.join()`` cannot block forever.
         """
         while True:
             address = await self.db_Q.get()
             try:
                 if await self.db_data.check_url(address) is None:
                     # NOTE(review): check_url is awaited but add_url is not;
                     # if DBData.add_url is a coroutine function this call
                     # never runs and needs ``await`` — confirm in db.py.
                     self.db_data.add_url(address)
                     print("Write to DB: %s" % address)
             except asyncio.CancelledError:
                 raise
             except Exception as e:
                 print("Error writing %s to DB: %s" % (address, repr(e)))
             finally:
                 self.db_Q.task_done()

 if __name__ == "__main__":
     # Demo run: crawl the Yahoo news front page and follow links one
     # level deep.
     crawler = Crawler(
         domains=["https://www.yahoo.com/news/"],
         max_depth=1,
     )
     crawler.start()

aiohttp爬虫的模板，类的形式的更多相关文章

C++模板类的使用
1.定义模板类通过类似于下面的语法可以定义一个模板类: template<typename T> class Job : public virtual RefBase { public: ...
C++：类模板与模板类
6.3 类模板和模板类所谓类模板,实际上是建立一个通用类,其数据成员.成员函数的返回值类型和形参类型不具体指定,用一个虚拟的类型来代表.使用类模板定义对象时,系统会根据实参的类型来取代类模板中虚拟类型从 ...
C++ 模板类解析
具体模板类作用这边就不细说了,下面主要是描述下模板类的使用方法以及注意的一些东西. #include <iostream> using namespace std; template &l ...
使用模板类导致error LNK2019: 无法解析的外部符号
原地址 1.定义模板类: template<class T> class Stack {....}; 2.定义模板成员函数: 每个函数头都要以相同的模板声明打头,并将类限定符改成:类名&l ...
开涛spring3(7.2) - 对JDBC的支持之 7.2 JDBC模板类
7.2 JDBC模板类 7.2.1 概述 Spring JDBC抽象框架core包提供了JDBC模板类,其中JdbcTemplate是core包的核心类,所以其他模板类都是基于它封装完成的,JDB ...
7.2 C++模板类实例化
参考:http://www.weixueyuan.net/view/6399.html 总结: array < int >表明用int类型来代替模板类中的类参数“T”,编译器会将模板类ar ...
[C++]模板类和模板函数
参考: C++ 中模板使用详解 C++模板详解概念为了避免因重载函数定义不全面而带来的调用错误,引入了模板机制定义模板是C++支持参数化多态的工具,使用模板可以使用户为类或者函数声明一种一般模 ...
(转)JDBC模板类。
Spring JDBC抽象框架core包提供了JDBC模板类,其中JdbcTemplate是core包的核心类,所以其他模板类都是基于它封装完成的,JDBC模板类是第一种工作模式. JdbcTempl ...
spring3：对JDBC的支持之 JDBC模板类
7.2 JDBC模板类 7.2.1 概述 Spring JDBC抽象框架core包提供了JDBC模板类,其中JdbcTemplate是core包的核心类,所以其他模板类都是基于它封装完成的,JDB ...

随机推荐

[翻译] KVNProgress
KVNProgress KVNProgress is a fully customizable progress HUD that can be full screen or not. KVNProg ...
POP动画[1]
POP动画[1] pop动画是facebook扩展CoreAnimation的,使用极其方便:) 1:Spring系列的弹簧效果(两个动画kPOPLayerBounds与kPOPLayerCorner ...
JavaScript学习---JavaScript深入学习
对象的概念对象分类[3种]: ECMAScript(JS自己的对象), BOM(浏览器对象) DOM(文档对象,操作HTML的) 11种内置对象: Array ,String ...
windows系统镜像微软官方资源便捷下载教程
今天跟小师弟学到了一个下载软件的好办法,省得到各种网站下载带有病毒,插件的资源. 这个神奇的网站叫做 MSDN, 我告诉你,这是一个私人维护的网站,里面有各种官方软件的下载地址.可以直接用下载工具 ...
一次失败的尝试hdfs的java客户端编写（在linux下使用eclipse）
一次失败的尝试hdfs的java客户端编写(在linux下使用eclipse) 给centOS安装图形界面 GNOME桌面环境 https://blog.csdn.net/wh211212/artic ...
fzu_oop_east 第二次作业
这次有四题: 题目1:(这题本身没难度,就是听说格式比较坑,好像) 代码: #include<iostream> #include<cstdio> using namespac ...
GitLab-CI与GitLab-Runner
一.持续集成(Continuous Integration) 要了解GitLab-CI与GitLab Runner,我们得先了解持续集成是什么. 持续集成是一种软件开发实践,即团队开发成员经常集成他们 ...
安装Jdk，tomcat【转载】
一.下载安装对应的jdk,并配置Java环境. 官网下载地址: http://www.oracle.com/technetwork/java/javase/downloads/jdk-6u26-dow ...
tcp通讯中socket套接字accept和listen的关系
今天看到一个文章,客户端的connect在服务端调用accept之前,突然想到这可以建立正常的连接么?以前从没细细的思考过listen accept connect之间的关系,带着疑问学习了一下,记录 ...
POJ-2452 Sticks Problem 二分+RMQ
题目链接: https://cn.vjudge.net/problem/POJ-2452 题目大意: 给出一个数组a,求最大的j-i满足 i<j && a[i] ... a[j] ...

aiohttp爬虫的模板，类的形式

aiohttp爬虫的模板，类的形式的更多相关文章

随机推荐

热门专题