aiohttp_spider
aiohttp_spider_def:
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree
start_url = 'http://news.baidu.com/'
waitting_urs = []
seen_uels = set()
stoppint = False
sem = asyncio.Semaphore(10) # 现在并发为3个
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
async def fetch(url, session):
async with sem:
# await asyncio.sleep(1)
try:
async with session.get(url, headers=headers, timeout=1) as resp:
print('url status:{}'.format(resp.status))
# if resp.status in [200, 201]:
data = etree.HTML(await resp.read())
return data
except Exception as e:
print('错误为:{} url:{}'.format(e, url))
def extract_urls(html):
try:
for url in html.xpath('//a/@href'):
if url and url.startswith("http") and url not in seen_uels:
if re.findall(r'baidu', url):
waitting_urs.append(url)
except:
pass
async def init_urls(url, session):
html = await fetch(url, session)
seen_uels.add(url)
extract_urls(html)
async def article_handler(url, session, pool):
# 获取文章详情
html = await fetch(url, session)
seen_uels.add(url)
extract_urls(html)
try:
title = html.xpath('//title/text()')[0].strip()
print('title:{}'.format(title))
async with pool.acquire() as conn:
async with conn.cursor() as cursor:
try:
# 插入
await cursor.execute('insert into async_test_async(title) values("{}")'.format(title))
# 插入数据
await cursor.execute("insert into async_test_async(title) values('{}')".format(title))
# 查询数据
await cursor.execute("select * from async_test_async")
data = await cursor.fetchall()
print("data:", data)
# 更新数据
await cursor.execute("update async_test_async set title='{}' where id={}".format('update', 10168))
# 删除数据
await cursor.execute("delete from async_test_async where id={}".format(10174))
except:
pass
except:
pass
async def consumer(pool):
async with aiohttp.ClientSession() as session:
while not stoppint:
if len(waitting_urs) < 10:
if url not in seen_uels:
asyncio.ensure_future(init_urls(url, session))
url = waitting_urs.pop()
print('start get url:{}'.format(url))
if re.findall(r'baidu', url):
if url not in seen_uels:
print('waitting_urs:{}'.format(waitting_urs[0: 3]))
asyncio.ensure_future(article_handler(url, session, pool))
await asyncio.sleep(0.1)
async def main(loop):
pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda', loop=loop,
charset='utf8', autocommit=True)
async with aiohttp.ClientSession() as session:
html = await fetch(start_url, session)
seen_uels.add(start_url)
extract_urls(html)
asyncio.ensure_future(consumer(pool))
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
loop.run_forever()
aiohttp_spider_class:
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree
start_url = 'http://news.baidu.com/'
waitting_urs = []
seen_uels = set()
stoppint = False
sem = asyncio.Semaphore(10) # 现在并发为3个
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
class async_text(object):
async def fetch(self, url, session):
print("self:", self)
async with sem:
# await asyncio.sleep(1)
try:
async with session.get(url, headers=headers, timeout=1) as resp:
print('url status:{}'.format(resp.status))
# if resp.status in [200, 201]:
data = etree.HTML(await resp.read())
return data
except Exception as e:
print('错误为:{} url:{}'.format(e, url))
def extract_urls(self, html):
try:
for url in html.xpath('//a/@href'):
if url and url.startswith("http") and url not in seen_uels:
if re.findall(r'baidu', url):
waitting_urs.append(url)
except:
pass
async def init_urls(self, url, session):
html = await self.fetch(self, url, session)
seen_uels.add(url)
self.extract_urls(self, html)
async def article_handler(self, url, session, pool):
# 获取文章详情
html = await self.fetch(self, url, session)
seen_uels.add(url)
self.extract_urls(self, html)
try:
title = html.xpath('//title/text()')[0].strip()
print('title:{}'.format(title))
async with pool.acquire() as conn:
async with conn.cursor() as cur:
try:
# 插入
await cur.execute('insert into async_test_async(title) values("{}")'.format(title))
except:
pass
except:
pass
async def consumer(self, pool):
async with aiohttp.ClientSession() as session:
while not stoppint:
if len(waitting_urs) < 10:
if url not in seen_uels:
asyncio.ensure_future(self.init_urls(self, url, session))
url = waitting_urs.pop()
print('start get url:{}'.format(url))
if re.findall(r'baidu', url):
if url not in seen_uels:
print('waitting_urs:{}'.format(waitting_urs[0: 3]))
asyncio.ensure_future(self.article_handler(self, url, session, pool))
await asyncio.sleep(0.1)
@classmethod
async def main(self, loop):
pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda',
loop=loop,
charset='utf8', autocommit=True)
async with aiohttp.ClientSession() as session:
html = await self.fetch(self, start_url, session)
seen_uels.add(start_url)
self.extract_urls(self, html)
asyncio.ensure_future(self.consumer(self, pool))
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(async_text.main(loop))
loop.run_forever()
aiohttp_spider的更多相关文章
随机推荐
- 010.MongoDB备份恢复
一 MongoDB备份 1.1 备份概述 mongodb数据备份和还原主要分为二种,一种是针对于库的mongodump和mongorestore,一种是针对库中表的mongoexport和mongoi ...
- CodeForces - 1230C(思维/暴力)
题意 https://vjudge.net/problem/CodeForces-1230C 给了你总共有21张多米诺骨牌,每张牌有两个面,然后给你一个无向图,保证没有环和一个顶点多条边的情况存在.现 ...
- numpy中多维数组的绝对索引
这涉及到吧多维数组映射为一维数组. 对于3维数组,有公式: def MAP(x,y,z): return y_s * z_s * x + z_s * y + z 此公式可以推广到N维 测试代码:(两个 ...
- CF1253E Antenna Coverage(DP)
本题难点在正确性证明. 令 \(f_i\) 表示 \([1,i]\) 被全部覆盖的最小花费.答案为 \(f_m\). 首先发现,添加一个区间 \([0,0]\) 不会影响答案.所以 \(f_i\) 的 ...
- 实例属性和方法的动态处理(__getattr__)
正常情况下,当调用类的方法或属性时,如果不存在,就会报错 要避免这个错误,除了可以加上那个要调用但不存在的属性外,Python还有另一个机制,那就是写一个__getattr__()方法,动态返回一个属 ...
- jmeter进行接口测试--csv参数化,数据驱动-转
首先我们要有一个接口测试用例存放的地方,我们这里用EXCEL模板管理,里面包含用例编号.入参.优先级.请求方式.url等等. 1:新建一个txt文件,命名为sjqd,后缀名改为csv,右键excel格 ...
- 基于 H5 + WebGL 实现的地铁站 3D 可视化系统
前言 工业互联网,物联网,可视化等名词在我们现在信息化的大背景下已经是耳熟能详,日常生活的交通,出行,吃穿等可能都可以用信息化的方式来为我们表达,在传统的可视化监控领域,一般都是基于 Web SCAD ...
- mysql Hash索引和BTree索引区别
Hash仅支持=.>.>=.<.<=.between.BTree可以支持like模糊查询 索引是帮助mysql获取数据的数据结构.最常见的索引是Btree索引和Hash索引. ...
- Object(Asp.NET核心机制内置对象汇总)
ASP.NET有个大佬,HttpContext(在.Net Core中依然是它)Http请求的上下文,任何一个环节都是需要HttpContext的,需要的参数信息,处理的中间结果,最终的结果,都是放在 ...
- python基础(16):内置函数(二)
1. lamda匿名函数 为了解决⼀些简单的需求⽽设计的⼀句话函数 # 计算n的n次⽅ def func(n): return n**n print(func(10)) f = lambda n: n ...