aiohttp_spider
aiohttp_spider_def:
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree
# Page the crawl starts from.
start_url = 'http://news.baidu.com/'
# Discovered URLs that are still waiting to be crawled; the consumer loop
# takes work from the end of this list with pop().
waitting_urs = []
# URLs that have already been handed to a fetch, used for de-duplication.
seen_uels = set()
# Polled by the consumer loop; the loop keeps running while this is False.
stoppint = False
sem = asyncio.Semaphore(10) # limits fetch() to 10 requests in flight at once
# Request headers sent with every fetch (an Internet Explorer 11 user agent).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
async def fetch(url, session):
    """Download *url* and return the body parsed as an lxml HTML tree.

    The request is made while holding the module-level semaphore ``sem`` and
    uses the shared ``headers`` and a one second timeout.

    Args:
        url: Address to request.
        session: aiohttp client session used to issue the GET.

    Returns:
        The result of ``etree.HTML`` on the response body, or ``None`` when
        any exception was raised (the error is printed, not re-raised).
    """
    async with sem:
        try:
            async with session.get(url, headers=headers, timeout=1) as response:
                print('url status:{}'.format(response.status))
                body = await response.read()
                return etree.HTML(body)
        except Exception as err:
            print('错误为:{} url:{}'.format(err, url))
    return None
def extract_urls(html):
    """Append every not-yet-seen Baidu link found in *html* to the queue.

    A link is queued when it is non-empty, starts with ``http``, contains the
    substring ``baidu`` and is not already in ``seen_uels``.

    Args:
        html: Parsed document exposing ``xpath`` (as returned by ``fetch``),
            or ``None`` when the download failed.

    Returns:
        None. Matching links are appended to the module-level
        ``waitting_urs`` list.
    """
    if html is None:
        # A failed download yields no document, so there is nothing to scan.
        return
    try:
        links = html.xpath('//a/@href')
    except Exception as exc:
        # Best effort: report the problem instead of silently discarding it.
        print('extract_urls failed: {}'.format(exc))
        return
    for url in links:
        if url and url.startswith("http") and url not in seen_uels:
            if 'baidu' in url:
                waitting_urs.append(url)
async def init_urls(url, session):
    """Fetch *url* only to harvest its links into the crawl queue.

    Args:
        url: Address of the page to download.
        session: aiohttp client session used for the request.

    Returns:
        None.
    """
    # Mark the URL before awaiting: if the same address is popped again while
    # this request is still in flight, it is then recognised as already seen
    # instead of being fetched a second time.
    seen_uels.add(url)
    html = await fetch(url, session)
    extract_urls(html)
async def article_handler(url, session, pool):
    """Fetch an article page, queue its links and store its title in MySQL.

    Args:
        url: Address of the page to download.
        session: aiohttp client session used for the request.
        pool: aiomysql connection pool used to run the SQL statements.

    Returns:
        None. Nothing is written when the download fails or the page has no
        ``<title>`` text; database errors are printed, not re-raised.
    """
    # Mark the URL before awaiting so that a duplicate popped while this
    # request is still in flight is recognised as already handled.
    seen_uels.add(url)
    html = await fetch(url, session)
    if html is None:
        # fetch() signals a failed download by returning None.
        return
    extract_urls(html)

    titles = html.xpath('//title/text()')
    if not titles:
        return
    title = titles[0].strip()
    print('title:{}'.format(title))

    insert_sql = "insert into async_test_async(title) values(%s)"
    try:
        async with pool.acquire() as conn:
            async with conn.cursor() as cursor:
                # The title comes from a downloaded page, so values are passed
                # as query parameters rather than formatted into the SQL text;
                # this avoids SQL injection and breakage on quote characters.
                # NOTE(review): the insert is executed twice, as in the
                # previous version of this function - confirm whether a single
                # row per article is what is actually wanted.
                await cursor.execute(insert_sql, (title,))
                await cursor.execute(insert_sql, (title,))
                # Read back the whole table and print it.
                await cursor.execute("select * from async_test_async")
                data = await cursor.fetchall()
                print("data:", data)
                # NOTE(review): the ids below are hard-coded sample values.
                await cursor.execute(
                    "update async_test_async set title=%s where id=%s",
                    ('update', 10168),
                )
                await cursor.execute(
                    "delete from async_test_async where id=%s",
                    (10174,),
                )
    except Exception as exc:
        # Keep the crawler running, but make the failure visible.
        print('database error:{} url:{}'.format(exc, url))
async def consumer(pool):
    """Drain the URL queue, scheduling one crawl task per unseen URL.

    Runs until the module-level ``stoppint`` flag becomes true. URLs that
    contain ``baidu`` are scheduled with ``article_handler``; any other URL
    is scheduled with ``init_urls`` so that only its links are harvested.

    Args:
        pool: aiomysql connection pool passed on to ``article_handler``.

    Returns:
        None.
    """
    async with aiohttp.ClientSession() as session:
        while not stoppint:
            if not waitting_urs:
                # Nothing queued right now: yield to the running fetch tasks
                # instead of popping from an empty list (IndexError).
                await asyncio.sleep(0.1)
                continue

            # Take the URL first; earlier code read ``url`` before it was
            # ever assigned, which raised UnboundLocalError.
            url = waitting_urs.pop()
            print('start get url:{}'.format(url))
            if url not in seen_uels:
                if 'baidu' in url:
                    print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                    asyncio.ensure_future(article_handler(url, session, pool))
                else:
                    asyncio.ensure_future(init_urls(url, session))
            # Short pause so scheduled tasks get a chance to start.
            await asyncio.sleep(0.1)
async def main(loop):
    """Create the MySQL pool, seed the queue and start the consumer.

    Args:
        loop: Event loop handed to ``aiomysql.create_pool``.

    Returns:
        None. The consumer is scheduled as a background task and is not
        awaited here.
    """
    pool = await aiomysql.create_pool(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='root',
        db='cfda',
        loop=loop,
        charset='utf8',
        autocommit=True,
    )
    # Seed the queue with the links found on the start page.
    async with aiohttp.ClientSession() as session:
        start_page = await fetch(start_url, session)
        seen_uels.add(start_url)
        extract_urls(start_page)
    asyncio.ensure_future(consumer(pool))
if __name__ == "__main__":
    # main() only sets things up and schedules the consumer task, so the loop
    # has to keep running afterwards for the crawl to make progress.
    loop = asyncio.get_event_loop()
    bootstrap = main(loop)
    loop.run_until_complete(bootstrap)
    loop.run_forever()
aiohttp_spider_class:
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree
# Page the crawl starts from.
start_url = 'http://news.baidu.com/'
# Discovered URLs that are still waiting to be crawled; the consumer loop
# takes work from the end of this list with pop().
waitting_urs = []
# URLs that have already been handed to a fetch, used for de-duplication.
seen_uels = set()
# Polled by the consumer loop; the loop keeps running while this is False.
stoppint = False
sem = asyncio.Semaphore(10) # limits fetch() to 10 requests in flight at once
# Request headers sent with every fetch (an Internet Explorer 11 user agent).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
class async_text(object):
    """Class-based crawler: fetch pages, queue their links, store titles.

    The crawl state (``waitting_urs``, ``seen_uels``, ``stoppint``, ``sem``,
    ``headers``) lives in module-level names; the instance itself holds no
    attributes. ``main`` is the entry point and creates the instance whose
    methods do the work.
    """

    async def fetch(self, url, session):
        """Download *url* and return the body parsed as an lxml HTML tree.

        Args:
            url: Address to request.
            session: aiohttp client session used to issue the GET.

        Returns:
            The result of ``etree.HTML`` on the response body, or ``None``
            when any exception was raised (the error is printed).
        """
        async with sem:
            try:
                async with session.get(url, headers=headers, timeout=1) as resp:
                    print('url status:{}'.format(resp.status))
                    return etree.HTML(await resp.read())
            except Exception as e:
                print('错误为:{} url:{}'.format(e, url))
        return None

    def extract_urls(self, html):
        """Append every not-yet-seen Baidu link found in *html* to the queue.

        Args:
            html: Parsed document exposing ``xpath``, or ``None`` when the
                download failed.

        Returns:
            None. Matching links are appended to ``waitting_urs``.
        """
        if html is None:
            # A failed download yields no document to scan.
            return
        try:
            links = html.xpath('//a/@href')
        except Exception as exc:
            # Best effort: report the problem instead of hiding it.
            print('extract_urls failed: {}'.format(exc))
            return
        for url in links:
            if url and url.startswith("http") and url not in seen_uels:
                if 'baidu' in url:
                    waitting_urs.append(url)

    async def init_urls(self, url, session):
        """Fetch *url* only to harvest its links into the crawl queue.

        Args:
            url: Address of the page to download.
            session: aiohttp client session used for the request.
        """
        # Mark before awaiting so a duplicate popped while this request is in
        # flight is recognised as already seen.
        seen_uels.add(url)
        html = await self.fetch(url, session)
        self.extract_urls(html)

    async def article_handler(self, url, session, pool):
        """Fetch an article page, queue its links and store its title.

        Args:
            url: Address of the page to download.
            session: aiohttp client session used for the request.
            pool: aiomysql connection pool used for the insert.

        Returns:
            None. Nothing is written when the download fails or the page has
            no ``<title>`` text; database errors are printed, not re-raised.
        """
        # Mark before awaiting, for the same reason as in init_urls().
        seen_uels.add(url)
        html = await self.fetch(url, session)
        if html is None:
            return
        self.extract_urls(html)

        titles = html.xpath('//title/text()')
        if not titles:
            return
        title = titles[0].strip()
        print('title:{}'.format(title))

        try:
            async with pool.acquire() as conn:
                async with conn.cursor() as cur:
                    # The title comes from a downloaded page, so it is passed
                    # as a query parameter rather than formatted into the SQL
                    # text (avoids injection and quote breakage).
                    await cur.execute(
                        "insert into async_test_async(title) values(%s)",
                        (title,),
                    )
        except Exception as exc:
            # Keep the crawler running, but make the failure visible.
            print('database error:{} url:{}'.format(exc, url))

    async def consumer(self, pool):
        """Drain the URL queue, scheduling one crawl task per unseen URL.

        Runs until the module-level ``stoppint`` flag becomes true. URLs
        containing ``baidu`` go to ``article_handler``; any other URL goes to
        ``init_urls`` so that only its links are harvested.

        Args:
            pool: aiomysql connection pool passed on to ``article_handler``.
        """
        async with aiohttp.ClientSession() as session:
            while not stoppint:
                if not waitting_urs:
                    # Nothing queued right now: yield to the running fetch
                    # tasks instead of popping from an empty list.
                    await asyncio.sleep(0.1)
                    continue

                # Take the URL first; earlier code read ``url`` before it was
                # ever assigned, which raised UnboundLocalError.
                url = waitting_urs.pop()
                print('start get url:{}'.format(url))
                if url not in seen_uels:
                    if 'baidu' in url:
                        print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                        asyncio.ensure_future(self.article_handler(url, session, pool))
                    else:
                        asyncio.ensure_future(self.init_urls(url, session))
                # Short pause so scheduled tasks get a chance to start.
                await asyncio.sleep(0.1)

    @classmethod
    async def main(cls, loop):
        """Create the MySQL pool, seed the queue and start the consumer.

        Args:
            loop: Event loop handed to ``aiomysql.create_pool``.

        Returns:
            None. The consumer is scheduled as a background task and is not
            awaited here.
        """
        # Work on a real instance so the other methods can use ordinary bound
        # calls instead of passing the class around as ``self``.
        spider = cls()
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda',
                                          loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await spider.fetch(start_url, session)
            seen_uels.add(start_url)
            spider.extract_urls(html)
        asyncio.ensure_future(spider.consumer(pool))
if __name__ == "__main__":
    # main() only sets things up and schedules the consumer task, so the loop
    # has to keep running afterwards for the crawl to make progress.
    loop = asyncio.get_event_loop()
    bootstrap = async_text.main(loop)
    loop.run_until_complete(bootstrap)
    loop.run_forever()
aiohttp_spider的更多相关文章
随机推荐
- UML工具-1-StarUML下载及破解
UML工具-StarUML 下载地址 http://staruml.io/
- [Linux]线程分离状态的理解
在任何一个时间点上,线程是可结合的(joinable),或者是分离的(detached).一个可结合的线程能够被其他线程收回其资源和杀死:在被其他线程回收之前,它的存储器资源(如栈)是不释放的.相反, ...
- CSharpGL(54)用基于图像的光照(IBL)来计算PBR的Specular部分
CSharpGL(54)用基于图像的光照(IBL)来计算PBR的Specular部分 接下来本系列将通过翻译(https://learnopengl.com)这个网站上关于PBR的内容来学习PBR(P ...
- python request获取ip、获取登录设备
from flask import request 获取ip request.remote_addr 获取登录设备 request.user_agent.string
- Java设计模式:Factory Method(工厂方法)模式
概念定义 工厂方法(Factory Method)模式,又称多态工厂(Polymorphic Factory)模式或虚拟构造器(Virtual Constructor)模式.工厂方法模式通过定义工厂抽 ...
- Python - ^在正则表达式中的作用
^在正则表达式中有两个作用,一是表达以什么开头,二是表达对什么取反.有时候经常傻傻的分不清楚,接下来给大家详细介绍该怎么用这个^准备一个python文件test.py,借用re.search函数举例说 ...
- C#判断dataGridView1 点击的是哪一列上的按钮
private void dataGridView1_CellContentClick(object sender, DataGridViewCellEventArgs e) { ) { DataGr ...
- 采坑 - LODOP,打印预览
结合 layui.弹出框内容样式如下: 红框表示,左右的内边距. 图一 打印预览的样式如下:红框表示,左右的内边距. 图二 要根据图二的左右内边距,去修改图一的左右内边距.不然会影响正文内容高度的判断 ...
- 自定义滚动条样式纯(css)
啥都不说先看图: 注: 只适合chrom,不适用IE和fireFox 下面展示代码: <html lang="en"> <head> <meta ch ...
- 模仿UIApplication创建单例
UIApplicationMain: 1.创建UIApplication--应用程序唯一标识:可设置状态栏.识别联网状态.设置数字.打电话.发邮件.发短信.打开网页 2.创建UIApplication ...