aiohttp笔记
简介
aiohttp 需要 Python 3.5.3 及以上版本。它基于 asyncio 和协程实现,既可以作为客户端使用(例如编写爬虫),也可以作为服务器端使用,十分高效。
官方文档
采集模板
一批,一次性采集
import asyncio
import logging
import time
from aiohttp import ClientSession, ClientTimeout
# Log line layout: timestamp, level, source file and function, then the message.
logging.basicConfig(
    format='[%(asctime)s] - %(levelname)s in %(filename)s.%(funcName)s: %(message)s',
    level=logging.INFO,
)

# Request headers used when the caller does not supply any.
HEADERS = {
    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}

# Default value for the request timeout.
TIMEOUT = 15
class AioCrawl:
    """Fetch a fixed batch of URLs concurrently and block until every request is done."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _get_loop():
        """Return the current event loop, creating and installing one if none is set.

        ``asyncio.get_event_loop()`` raises RuntimeError when the thread has no
        current loop (non-main threads, and the main thread on newer Python
        versions), so fall back to a fresh loop in that case.
        """
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    async def fetch(self, url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None):
        """Fetch coroutine: request ``url`` and return the raw response body.

        :param url: address to request
        :param method: 'POST' (case-insensitive) sends a POST; anything else is treated as 'GET'
        :param headers: request headers; falls back to HEADERS when empty
        :param timeout: value passed to ``ClientTimeout(total=...)``
        :param cookies: cookies for the session, or None
        :param data: POST form data; anything that is not a non-empty dict becomes ``{}``
        :return: the response body as returned by ``response.read()``
        :raises Exception: any error raised by aiohttp propagates unchanged
        """
        method = 'POST' if method.upper() == 'POST' else 'GET'
        headers = headers if headers else HEADERS
        timeout = ClientTimeout(total=timeout)
        cookies = cookies if cookies else None
        data = data if data and isinstance(data, dict) else {}
        # Only POST carries a body; GET is sent without a data argument.
        request_kwargs = {'data': data} if method == 'POST' else {}
        async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
            async with session.request(method, url, **request_kwargs) as response:
                return await response.read()

    def prepare_fetch(self, urls):
        """Wrap one ``fetch`` coroutine per URL in a future bound to the current loop.

        :param urls: iterable of URLs
        :return: list of futures, in the same order as ``urls``
        """
        loop = self._get_loop()
        return [asyncio.ensure_future(self.fetch(url), loop=loop) for url in urls]

    def crawl_batch_urls(self, urls):
        """Run the whole batch to completion and return the finished futures.

        Failures are not raised here; inspect each future with
        ``exception()`` / ``result()``.

        :param urls: sized collection of URLs (``len(urls)`` is logged)
        :return: list of completed futures (empty when ``urls`` is empty)
        """
        future_list = self.prepare_fetch(urls)
        if future_list:
            # asyncio.wait() raises ValueError on an empty collection, so skip it.
            self._get_loop().run_until_complete(asyncio.wait(future_list))
        self.logger.info('采集完一批: {}'.format(len(urls)))
        return future_list
if __name__ == '__main__':
    # Demo: fetch the same page five times in one batch and time the run.
    crawler = AioCrawl()
    batch = ['https://www.sina.com.cn'] * 5
    started = time.time()  # the author's note: a run takes 2-4 seconds
    finished = crawler.crawl_batch_urls(batch)
    print(time.time() - started)
    for done in finished:
        error = done.exception()
        if error:
            print(error)
        else:
            print(len(done.result()))
动态添加任务
import asyncio
import time
from threading import Thread
from aiohttp import ClientSession, ClientTimeout
# Request headers used when the caller does not supply any.
HEADERS = {
    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}

# Default value for the request timeout.
TIMEOUT = 15
def start_loop(loop):
    """Install ``loop`` as the current event loop, then run it until it is stopped."""
    asyncio.set_event_loop(loop)
    loop.run_forever()
async def fetch(url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None):
    """Fetch coroutine: request ``url`` and return its status and raw body.

    :param url: address to request (also printed before the request starts)
    :param method: 'POST' (case-insensitive) sends a POST; anything else is treated as 'GET'
    :param headers: request headers; falls back to HEADERS when empty
    :param timeout: value passed to ``ClientTimeout(total=...)``
    :param cookies: cookies for the session, or None
    :param data: POST form data; anything that is not a non-empty dict becomes ``{}``
    :return: ``(response.status, content)``
    :raises Exception: any error raised by aiohttp propagates unchanged
    """
    print(url)
    method = 'POST' if method.upper() == 'POST' else 'GET'
    headers = headers if headers else HEADERS
    timeout = ClientTimeout(total=timeout)
    cookies = cookies if cookies else None
    data = data if data and isinstance(data, dict) else {}
    # Only POST carries a body; GET is sent without a data argument.
    request_kwargs = {'data': data} if method == 'POST' else {}
    async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
        async with session.request(method, url, **request_kwargs) as response:
            content = await response.read()
            return response.status, content
def callback(future):
    """Done-callback: print the future's result, or the error it raised, then its type and repr."""
    try:
        outcome = future.result()
        print(outcome)
    except Exception as exc:
        print(exc)
    print(type(future), future, sep='\n')
if __name__ == '__main__':
    # Run the event loop in a background thread so this thread can keep submitting work.
    loop = asyncio.new_event_loop()
    # Thread.setDaemon() is deprecated; the daemon flag is passed to the constructor instead.
    t = Thread(target=start_loop, args=(loop,), daemon=True)
    t.start()
    f = asyncio.run_coroutine_threadsafe(fetch('https://www.sina.com.cn'), loop)
    f.add_done_callback(callback)  # attach the callback to the returned future
    # Keep the main thread alive; otherwise the process exits (the loop thread
    # is a daemon) before the result can be printed.
    time.sleep(5)
动态添加任务,封装成类
import asyncio
import logging
import time
from threading import Thread
from aiohttp import ClientSession, ClientTimeout, TCPConnector
# Request headers used when the caller does not supply any.
HEADERS = {
    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    ),
}

# Default value for the request timeout.
TIMEOUT = 15
def start_loop(loop):
    """Install ``loop`` as the current event loop, then run it until it is stopped."""
    asyncio.set_event_loop(loop)
    loop.run_forever()
class AioCrawl:
    """Crawler whose event loop runs in a background daemon thread, so tasks can be added at any time."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Start the event loop in a daemon thread. Thread.setDaemon() is
        # deprecated, so the flag is passed to the constructor.
        self.event_loop = asyncio.new_event_loop()
        self.t = Thread(target=start_loop, args=(self.event_loop,), daemon=True)
        self.t.start()
        # Requests submitted but not yet handled by callback().
        # NOTE(review): incremented in add_tasks() and decremented in callback(),
        # which presumably run on different threads, without a lock - confirm
        # whether exact counts matter.
        self.concurrent = 0

    async def fetch(self, url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None, proxy=None):
        """Fetch coroutine: request ``url`` and return its status and raw body.

        :param url: str
        :param method: 'GET' or 'POST' (anything that is not 'POST' is treated as 'GET')
        :param headers: dict(); falls back to HEADERS when empty
        :param timeout: int, passed to ``ClientTimeout(total=...)``
        :param cookies: cookies for the session, or None
        :param data: dict(); anything that is not a non-empty dict becomes ``{}``
        :param proxy: str, forwarded to the request as ``proxy``
        :return: (status, content)
        :raises Exception: any error raised by aiohttp propagates unchanged
        """
        method = 'POST' if method.upper() == 'POST' else 'GET'
        headers = headers if headers else HEADERS
        timeout = ClientTimeout(total=timeout)
        cookies = cookies if cookies else None
        data = data if data and isinstance(data, dict) else {}
        # Disable certificate verification (ssl=False replaces the deprecated verify_ssl=False).
        tcp_connector = TCPConnector(ssl=False)
        # Only POST carries a body; GET is sent without a data argument.
        request_kwargs = {'proxy': proxy}
        if method == 'POST':
            request_kwargs['data'] = data
        async with ClientSession(headers=headers, timeout=timeout, cookies=cookies, connector=tcp_connector) as session:
            async with session.request(method, url, **request_kwargs) as response:
                content = await response.read()
                return response.status, content

    def callback(self, future):
        """Done-callback for futures created by add_tasks().

        Logs a warning when the request raised or returned a non-200 status,
        prints the body length, and always decrements the concurrency counter.
        The original notes list the intended follow-up work as:
        1. convert the outcome into a Result object
        2. write it to the database
        """
        # add_tasks() stores the URL on the future; tolerate futures without it.
        url = getattr(future, 'url', None)
        try:
            error = future.exception()
            if error is None:
                code, msg = 1, 'success'
                status, data = future.result()
            else:
                code, msg = 0, str(error)
                status, data = None, b''
            data_len = len(data) if data else 0
            if code == 0 or (status is not None and status != 200):  # report minor failures
                self.logger.warning('<url="{}", code={}, msg="{}", status={}, data(len):{}>'.format(
                    url, code, msg, status, data_len))
            print(len(data))
        finally:
            # Decrement even if reporting fails, so the counter cannot leak.
            self.concurrent -= 1

    def add_tasks(self, tasks):
        """Schedule one fetch per task on the background event loop.

        :param tasks: iterable of URLs, each passed to ``fetch`` as ``url``
        :return: list of the futures created, one per task
        """
        futures = []
        for task in tasks:
            # run_coroutine_threadsafe takes a coroutine object and the target event loop.
            future = asyncio.run_coroutine_threadsafe(self.fetch(task), self.event_loop)
            # Must be set before the callback is attached: the callback may run immediately.
            future.url = task
            future.add_done_callback(self.callback)
            self.concurrent += 1
            futures.append(future)
        return futures
if __name__ == '__main__':
    crawler = AioCrawl()
    # Simulate tasks arriving over time: one batch of two URLs per second, five times.
    for _ in range(5):
        crawler.add_tasks(['https://www.sina.com.cn'] * 2)
        time.sleep(1)
aiohttp笔记的更多相关文章
- aiohttp的笔记之TCPConnector
TCPConnector维持链接池,限制并行连接的总量,当池满了,有请求退出再加入新请求.默认是100,limit=0的时候是无限制 1.use_dns_cache: 使用内部DNS映射缓存用以查询D ...
- Python开发【笔记】:aiohttp搭建简易聊天室
简易聊天室: 1.入口main.py import logging import jinja2 import aiohttp_jinja2 from aiohttp import web from a ...
- python 学习笔记 aiohttp
asyncio可以实现单进程并发IO操作,如果仅用在客户端,发挥的威力并不大,如果把asyncio用在服务器端,由于http链接就是IO操作, 因此可以用单线程+coroutine实现多客户的高并发支 ...
- 《用OpenResty搭建高性能服务端》笔记
概要 《用OpenResty搭建高性能服务端》是OpenResty系列课程中的入门课程,主讲人:温铭老师.课程分为10个章节,侧重于OpenResty的基本概念和主要特点的介绍,包括它的指 ...
- DAY7-Python学习笔记
前记: 这几天在弄小程序,view页面的开发很简单,但是在加载图片上遇到了问题,小程序的大小不能超过2M,所以大部分的图片内容要通过request请求服务器来获取,这里之前学习小程序的时候是通过网站A ...
- git-简单流程(学习笔记)
这是阅读廖雪峰的官方网站的笔记,用于自己以后回看 1.进入项目文件夹 初始化一个Git仓库,使用git init命令. 添加文件到Git仓库,分两步: 第一步,使用命令git add <file ...
- js学习笔记:webpack基础入门(一)
之前听说过webpack,今天想正式的接触一下,先跟着webpack的官方用户指南走: 在这里有: 如何安装webpack 如何使用webpack 如何使用loader 如何使用webpack的开发者 ...
- SQL Server技术内幕笔记合集
SQL Server技术内幕笔记合集 发这一篇文章主要是方便大家找到我的笔记入口,方便大家o(∩_∩)o Microsoft SQL Server 6.5 技术内幕 笔记http://www.cnbl ...
- PHP-自定义模板-学习笔记
1. 开始 这几天,看了李炎恢老师的《PHP第二季度视频》中的“章节7:创建TPL自定义模板”,做一个学习笔记,通过绘制架构图.UML类图和思维导图,来加深理解. 2. 整体架构图 ...
随机推荐
- canvas 压缩图片的大小
使用 signature_pad canvas 库生成的图片太大.但又没有提供方法来压缩. 当然这是根据你canvas的画布大小决定的,某些原因导致我的画布就得是那么大. 随随便便一个图片转化为bas ...
- MySQL5.0存储过程教程
Introduction 简介 MySQL 5.0 新特性教程是为需要了解5.0版本新特性的MySQL老用户而写的.简单的来说是介绍了“存储过程.触发器.视图.信息架构视图”,在此感谢译者陈朋奕的努力 ...
- 多国语言解决方案gnu.gettext + poedit
1.工具简介 1.1.关于i18n i18n其来源是英文单词 internationalization的首末字符i和n,18为中间的字符数是“国际化”的简称. i10n为资源本地化,全称为Locali ...
- Atitit.注解and属性解析(2)---------语法分析 生成AST attilax总结 java .net
Atitit.注解and属性解析(2)---------语法分析 生成AST attilax总结 java .net 1. 应用场景:::因为要使用ui化的注解 1 2. 使用解释器方式来实现生成 ...
- Atitit.解决org.hibernate.DuplicateMappingException: Duplicate class/entity mapping
Atitit.解决org.hibernate.DuplicateMappingException: Duplicate class/entity mapping 1. 排除流程::: @Depreca ...
- Path相关方法解说(二)
今天咱们一起来看看Path里 XXXTo 相关的一类方法. watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQv/font/5a6L5L2T/fontsize/4 ...
- docker容器跑tomcat遇到的坑
使用docker容器跑tomcat,由于同一个宿主机上跑了多个容器,再加上宿主机本身跑了很多进程,导致系统总的进程数达到了8000+,而容器中tomcat的启动脚本中会调用自带的setenv.sh,在 ...
- 页面跳转时候拼接在url后面的多个 参数获取
function GetRequest() { var url = location.search; var theRequest = new Object(); if (url.indexOf(&q ...
- URL与URI
1.URI是统一资源标识符,是一个用于标识某一互联网资源名称的字符串. 该种标识允许用户对任何(包括本地和互联网)的资源通过特定的协议进行交互操作.URI由包括确定语法和相关协议的方案所定义.由是三个 ...
- 数据库事务隔离级别<转>
数据库事务的隔离级别有4个,由低到高依次为Read uncommitted.Read committed.Repeatable read.Serializable,这四个级别可以逐个解决脏读.不可重复 ...