aiohttp_spider
aiohttp_spider_def:
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree
# Entry page the crawl starts from.
start_url = 'http://news.baidu.com/'
# Queue of discovered URLs waiting to be fetched (appended by extract_urls, popped by consumer).
waitting_urs = []
# URLs that have already been passed through fetch().
seen_uels = set()
# Loop flag read by consumer(); the visible code never sets it to True.
stoppint = False
sem = asyncio.Semaphore(10) # at most 10 fetches hold the semaphore at the same time
# Request headers sent with every fetch.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
async def fetch(url, session):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    The request is made while holding the module-level semaphore ``sem`` and
    sends the module-level ``headers`` with ``timeout=1``. The response
    status is printed; the body is parsed whatever the status is.

    Returns whatever ``etree.HTML`` produces. If anything raises, the error
    is printed and ``None`` is returned instead.
    """
    async with sem:
        try:
            async with session.get(url, headers=headers, timeout=1) as response:
                print('url status:{}'.format(response.status))
                body = await response.read()
                return etree.HTML(body)
        except Exception as exc:
            print('错误为:{} url:{}'.format(exc, url))
    return None
def extract_urls(html):
    """Queue every new Baidu link found in a parsed page.

    Args:
        html: Parsed page exposing ``xpath`` (as returned by ``fetch``), or
            ``None`` when the download failed.

    Every ``href`` that starts with ``http``, contains ``baidu``, is not in
    ``seen_uels`` and is not already waiting in ``waitting_urs`` is appended
    to ``waitting_urs``. Nothing is returned.
    """
    if html is None:
        # fetch() returns None after a failed request; there is nothing to scan.
        return

    try:
        hrefs = html.xpath('//a/@href')
    except Exception as exc:
        # Link extraction is best effort: report the problem and move on
        # instead of silently swallowing it.
        print('extract_urls failed: {}'.format(exc))
        return

    # Snapshot of the queue so the same link is not queued twice.
    queued = set(waitting_urs)
    for url in hrefs:
        if not url or not url.startswith("http"):
            continue
        if 'baidu' not in url or url in seen_uels or url in queued:
            continue
        waitting_urs.append(url)
        queued.add(url)
async def init_urls(url, session):
    """Fetch a page only to harvest its links.

    The URL is recorded in ``seen_uels`` once the fetch has finished, then
    the result of the fetch is handed to ``extract_urls``.
    """
    page = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(page)
async def article_handler(url, session, pool):
    """Fetch an article page, queue its links and write its title to MySQL.

    Args:
        url: Page to download.
        session: aiohttp client session used for the request.
        pool: aiomysql connection pool used for the database statements.

    The URL is added to ``seen_uels`` after the fetch. If the page could not
    be fetched or has no ``<title>``, the function returns without touching
    the database. Database errors are printed, not raised, so one bad page
    cannot stop the crawl.
    """
    html = await fetch(url, session)
    seen_uels.add(url)
    if html is None:
        # fetch() already printed the failure; there is nothing to parse.
        return
    extract_urls(html)

    titles = html.xpath('//title/text()')
    if not titles:
        print('no title found, skipping url:{}'.format(url))
        return
    title = titles[0].strip()
    print('title:{}'.format(title))

    # The title comes from a remote page, so every statement passes its values
    # as parameters instead of formatting them into the SQL text.
    insert_sql = "insert into async_test_async(title) values(%s)"
    try:
        async with pool.acquire() as conn:
            async with conn.cursor() as cursor:
                # NOTE(review): the original issues the insert twice (this
                # looks like a CRUD demo); kept so the rows written are
                # unchanged. Confirm whether one insert is enough.
                await cursor.execute(insert_sql, (title,))
                await cursor.execute(insert_sql, (title,))
                # Read back the whole table.
                await cursor.execute("select * from async_test_async")
                data = await cursor.fetchall()
                print("data:", data)
                # Update and delete with the hard-coded ids from the original.
                await cursor.execute(
                    "update async_test_async set title=%s where id=%s",
                    ('update', 10168),
                )
                await cursor.execute(
                    "delete from async_test_async where id=%s",
                    (10174,),
                )
    except Exception as exc:
        # Best effort, as before, but the failure is now visible.
        print('database error:{} url:{}'.format(exc, url))
async def consumer(pool):
    """Drain ``waitting_urs`` and schedule a handler task for each new URL.

    Args:
        pool: aiomysql connection pool passed on to ``article_handler``.

    Runs until the module-level ``stoppint`` flag becomes true. Each pass
    pops one URL from the queue and, if it contains ``baidu`` and is not in
    ``seen_uels``, schedules ``article_handler`` for it. When fewer than 10
    URLs are waiting, the previously popped URL is also passed to
    ``init_urls`` if it is still unseen.
    """
    async with aiohttp.ClientSession() as session:
        # URL popped on the previous pass. It starts as None so the low-queue
        # check below cannot read an unassigned variable.
        last_url = None
        while not stoppint:
            if last_url is not None and len(waitting_urs) < 10:
                if last_url not in seen_uels:
                    asyncio.ensure_future(init_urls(last_url, session))
                # Consider each popped URL only once for the refill.
                last_url = None

            if not waitting_urs:
                # Nothing to pop yet: yield to the running fetches instead of
                # raising IndexError on an empty list.
                await asyncio.sleep(0.1)
                continue

            url = waitting_urs.pop()
            last_url = url
            print('start get url:{}'.format(url))
            if 'baidu' in url and url not in seen_uels:
                print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                asyncio.ensure_future(article_handler(url, session, pool))
            # Always yield so the scheduled tasks get a chance to run.
            await asyncio.sleep(0.1)
async def main(loop):
    """Create the MySQL pool, seed the URL queue and start the consumer.

    The start page is fetched with a short-lived client session, recorded in
    ``seen_uels`` and scanned for links. ``consumer`` is then scheduled as a
    background task; this coroutine returns without waiting for it.
    """
    # NOTE(review): connection settings, including credentials, are hard-coded.
    pool = await aiomysql.create_pool(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='root',
        db='cfda',
        loop=loop,
        charset='utf8',
        autocommit=True,
    )
    async with aiohttp.ClientSession() as session:
        first_page = await fetch(start_url, session)
        seen_uels.add(start_url)
        extract_urls(first_page)
    asyncio.ensure_future(consumer(pool))
if __name__ == "__main__":
    # main() only schedules the consumer task, so the loop has to keep
    # running after main() itself has finished.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main(event_loop))
    event_loop.run_forever()
aiohttp_spider_class:
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree
# Entry page the crawl starts from.
start_url = 'http://news.baidu.com/'
# Queue of discovered URLs waiting to be fetched (appended by extract_urls, popped by consumer).
waitting_urs = []
# URLs that have already been passed through fetch().
seen_uels = set()
# Loop flag read by consumer(); the visible code never sets it to True.
stoppint = False
sem = asyncio.Semaphore(10) # at most 10 fetches hold the semaphore at the same time
# Request headers sent with every fetch.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
class async_text(object):
    """Class-based variant of the Baidu news crawler.

    The class keeps no state of its own: the URL queue, the seen set, the
    semaphore and the request headers are the module-level ``waitting_urs``,
    ``seen_uels``, ``sem`` and ``headers``. Start a crawl with
    ``async_text.main(loop)``.
    """

    async def fetch(self, url, session):
        """Download ``url`` and parse the body into an lxml HTML tree.

        Returns whatever ``etree.HTML`` produces, or ``None`` when the request
        or the parsing raised (the error is printed).
        """
        print("self:", self)
        async with sem:
            try:
                async with session.get(url, headers=headers, timeout=1) as resp:
                    print('url status:{}'.format(resp.status))
                    return etree.HTML(await resp.read())
            except Exception as e:
                print('错误为:{} url:{}'.format(e, url))
        return None

    def extract_urls(self, html):
        """Append every new Baidu link found in ``html`` to ``waitting_urs``.

        ``html`` may be ``None`` (failed fetch), in which case nothing happens.
        Links already in ``seen_uels`` or already queued are skipped.
        """
        if html is None:
            return
        try:
            hrefs = html.xpath('//a/@href')
        except Exception as exc:
            # Best effort: report the problem instead of hiding it.
            print('extract_urls failed: {}'.format(exc))
            return
        # Snapshot of the queue so the same link is not queued twice.
        queued = set(waitting_urs)
        for url in hrefs:
            if not url or not url.startswith("http"):
                continue
            if 'baidu' not in url or url in seen_uels or url in queued:
                continue
            waitting_urs.append(url)
            queued.add(url)

    async def init_urls(self, url, session):
        """Fetch ``url``, mark it as seen and queue the links it contains."""
        html = await self.fetch(url, session)
        seen_uels.add(url)
        self.extract_urls(html)

    async def article_handler(self, url, session, pool):
        """Fetch an article page, queue its links and store its title.

        Returns early, without touching the database, when the page could not
        be fetched or has no ``<title>``. Database errors are printed, not
        raised.
        """
        html = await self.fetch(url, session)
        seen_uels.add(url)
        if html is None:
            return
        self.extract_urls(html)

        titles = html.xpath('//title/text()')
        if not titles:
            print('no title found, skipping url:{}'.format(url))
            return
        title = titles[0].strip()
        print('title:{}'.format(title))

        try:
            async with pool.acquire() as conn:
                async with conn.cursor() as cur:
                    # The title comes from a remote page: pass it as a
                    # parameter instead of formatting it into the SQL text.
                    await cur.execute(
                        "insert into async_test_async(title) values(%s)",
                        (title,),
                    )
        except Exception as exc:
            print('database error:{} url:{}'.format(exc, url))

    async def consumer(self, pool):
        """Drain ``waitting_urs`` until the module-level ``stoppint`` is true.

        Each pass pops one URL and schedules ``article_handler`` for it when it
        contains ``baidu`` and is not in ``seen_uels``. When fewer than 10 URLs
        are waiting, the previously popped URL is also passed to ``init_urls``
        if it is still unseen.
        """
        async with aiohttp.ClientSession() as session:
            # URL popped on the previous pass. It starts as None so the
            # low-queue check cannot read an unassigned variable.
            last_url = None
            while not stoppint:
                if last_url is not None and len(waitting_urs) < 10:
                    if last_url not in seen_uels:
                        asyncio.ensure_future(self.init_urls(last_url, session))
                    # Consider each popped URL only once for the refill.
                    last_url = None

                if not waitting_urs:
                    # Nothing to pop yet: yield instead of raising IndexError.
                    await asyncio.sleep(0.1)
                    continue

                url = waitting_urs.pop()
                last_url = url
                print('start get url:{}'.format(url))
                if 'baidu' in url and url not in seen_uels:
                    print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                    asyncio.ensure_future(self.article_handler(url, session, pool))
                # Always yield so the scheduled tasks get a chance to run.
                await asyncio.sleep(0.1)

    @classmethod
    async def main(cls, loop):
        """Create the MySQL pool, seed the queue and start the consumer task.

        An instance is created here so the other methods can be called as
        ordinary bound methods instead of passing ``self`` by hand.
        """
        spider = cls()
        # NOTE(review): connection settings, including credentials, are hard-coded.
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda',
                                          loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await spider.fetch(start_url, session)
            seen_uels.add(start_url)
            spider.extract_urls(html)
        asyncio.ensure_future(spider.consumer(pool))
if __name__ == "__main__":
    # main() only schedules the consumer task, so the loop has to keep
    # running after main() itself has finished.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(async_text.main(event_loop))
    event_loop.run_forever()
aiohttp_spider的更多相关文章
随机推荐
- Android程序中,内嵌ELF可执行文件-- Android开发C语言混合编程总结
前言 都知道的,Android基于Linux系统,然后覆盖了一层由Java虚拟机为核心的壳系统.跟一般常见的Linux+Java系统不同的,是其中有对硬件驱动进行支持,以避开GPL开源协议限制的HAL ...
- JVM-2-JVM结构
什么是JVM JVM是可运行Java代码的假想计算机 (或者理解为一种规范),包括一套字节码指令集.一组寄存器.一个栈.一个垃圾回收,堆 和 一个存储方法域.JVM是运行在操作系统之上的 ...
- Java哲学家进餐问题|多线程
Java实验三 多线程 哲学家进餐问题: 5个哲学家共用一张圆桌,分别坐在周围的5张椅子上, 在圆桌上有5个碗和5只筷子(注意是5只筷子,不是5双), 碗和筷子交替排列.他们的生活方式是交替地进行思考 ...
- 201871010110 - 李华 《面向对象程序设计(java)》第一周学习总结
项目 内容 《面向对象程序设计(java)》 https://www.cnblogs.com/nwnu-daizh/ 这个作业的要求在哪里 https://i.cnblogs.com/Ed ...
- LG5239 回望京都 组合数+暴力
问题描述 LG5239 题解 我就是个傻逼,鉴定完毕. 连 \(C_m^n=C_{m-1}^n+C_{m-1}^{n-1}\) 都忘了. 所以暴力求出 \(1000\) 以内的 \(C_i^j\) , ...
- vue 指令和修饰符
1. v-text v-text主要用来更新textContent,可以等同于JS的text属性. <span v-text="msg"></span> 这两者 ...
- oracle中utl_raw
RAW,类似于CHAR,声明方式RAW(L),L为长度,以字节为单位,作为数据库列最大2000,作为变量最大32767字节.LONG RAW,类似于LONG,作为数据库列最大存储2G字节的数据,作为变 ...
- 开放平台API接口安全策略汇总
在设计开放平台接口过程中,往往会涉及接口传输安全性相关的问题,本文对接口加密及签名的相关知识做了一个总结,在方便自己查阅的同时也分享给大家做一些参考. 接口安全性问题思考 接口安全性问题主要来源于几方 ...
- mysql 优化之 is null ,is not null 索引使用测试
关于mysql优化部分,有很多网友说尽量避免使用is null, is not null,select * 等,会导致索引失效,性能降低?那是否一定收到影响呢?真的就不会使用索引了吗? 本文的测试数据 ...
- 【LOJ#575】【LNR#2】不等关系(容斥,动态规划,分治FFT)
[LOJ#575][LNR#2]不等关系(容斥,动态规划,分治FFT) 题面 LOJ 题解 一个暴力\(dp\),设\(f[i][j]\)表示考虑完了前\(i\)个位置,其中最后一个数在前面所有数中排 ...