aiohttp_spider
aiohttp_spider_def:
import asyncio
import re

import aiohttp
import aiomysql
from lxml import etree

start_url = 'http://news.baidu.com/'
waiting_urls = []
seen_urls = set()
stopping = False
sem = asyncio.Semaphore(10)  # allow at most 10 concurrent requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}


async def fetch(url, session):
    # Download a page and parse it into an lxml tree; returns None on failure.
    async with sem:
        try:
            async with session.get(url, headers=headers, timeout=1) as resp:
                print('url status:{}'.format(resp.status))
                return etree.HTML(await resp.read())
        except Exception as e:
            print('error:{} url:{}'.format(e, url))


def extract_urls(html):
    # Queue up absolute links to baidu pages that have not been crawled yet.
    if html is None:
        return
    for url in html.xpath('//a/@href'):
        if url and url.startswith("http") and url not in seen_urls:
            if re.findall(r'baidu', url):
                waiting_urls.append(url)


async def init_urls(url, session):
    # Seed the queue with the links found on one page.
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)


async def article_handler(url, session, pool):
    # Fetch an article page, harvest its links, then exercise
    # insert/select/update/delete against MySQL with its title.
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
    if html is None:
        return
    titles = html.xpath('//title/text()')
    if not titles:
        return
    title = titles[0].strip()
    print('title:{}'.format(title))
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            try:
                # Insert (parameterized to avoid SQL injection)
                await cursor.execute("insert into async_test_async(title) values(%s)", (title,))
                # Select
                await cursor.execute("select * from async_test_async")
                data = await cursor.fetchall()
                print("data:", data)
                # Update
                await cursor.execute("update async_test_async set title=%s where id=%s", ('update', 10168))
                # Delete
                await cursor.execute("delete from async_test_async where id=%s", (10174,))
            except Exception as e:
                print('db error:{}'.format(e))


async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if not waiting_urls:
                # Nothing queued yet; wait for the handlers to find more links.
                await asyncio.sleep(0.5)
                continue
            url = waiting_urls.pop()
            print('start get url:{}'.format(url))
            if re.findall(r'baidu', url) and url not in seen_urls:
                print('waiting_urls:{}'.format(waiting_urls[0:3]))
                asyncio.ensure_future(article_handler(url, session, pool))
            await asyncio.sleep(0.1)


async def main(loop):
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root',
                                      db='cfda', loop=loop, charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        await init_urls(start_url, session)
    asyncio.ensure_future(consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    loop.run_forever()
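Both versions write to a table named async_test_async in the cfda database, but the post never shows its schema. The sketch below is an assumption inferred from the insert and update statements above (an auto-increment id plus a title column), not the author's actual DDL; run something like it once before starting the spider:

import asyncio
import aiomysql

# Hypothetical schema: only the columns the spider actually references.
CREATE_SQL = """
create table if not exists async_test_async (
    id int primary key auto_increment,
    title varchar(255) not null
) character set utf8
"""

async def prepare_table(loop):
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root',
                                      password='root', db='cfda', loop=loop,
                                      autocommit=True)
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(CREATE_SQL)
    pool.close()
    await pool.wait_closed()

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(prepare_table(loop))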
aiohttp_spider_class:
import asyncio
import re

import aiohttp
import aiomysql
from lxml import etree

start_url = 'http://news.baidu.com/'
waiting_urls = []
seen_urls = set()
stopping = False
sem = asyncio.Semaphore(10)  # allow at most 10 concurrent requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}


class AsyncSpider(object):

    async def fetch(self, url, session):
        # Download a page and parse it into an lxml tree; returns None on failure.
        async with sem:
            try:
                async with session.get(url, headers=headers, timeout=1) as resp:
                    print('url status:{}'.format(resp.status))
                    return etree.HTML(await resp.read())
            except Exception as e:
                print('error:{} url:{}'.format(e, url))

    def extract_urls(self, html):
        # Queue up absolute links to baidu pages that have not been crawled yet.
        if html is None:
            return
        for url in html.xpath('//a/@href'):
            if url and url.startswith("http") and url not in seen_urls:
                if re.findall(r'baidu', url):
                    waiting_urls.append(url)

    async def init_urls(self, url, session):
        html = await self.fetch(url, session)
        seen_urls.add(url)
        self.extract_urls(html)

    async def article_handler(self, url, session, pool):
        # Fetch an article page, harvest its links, then store its title.
        html = await self.fetch(url, session)
        seen_urls.add(url)
        self.extract_urls(html)
        if html is None:
            return
        titles = html.xpath('//title/text()')
        if not titles:
            return
        title = titles[0].strip()
        print('title:{}'.format(title))
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                try:
                    # Insert (parameterized to avoid SQL injection)
                    await cur.execute("insert into async_test_async(title) values(%s)", (title,))
                except Exception as e:
                    print('db error:{}'.format(e))

    async def consumer(self, pool):
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if not waiting_urls:
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print('start get url:{}'.format(url))
                if re.findall(r'baidu', url) and url not in seen_urls:
                    print('waiting_urls:{}'.format(waiting_urls[0:3]))
                    asyncio.ensure_future(self.article_handler(url, session, pool))
                await asyncio.sleep(0.1)

    @classmethod
    async def main(cls, loop):
        # Create an instance and call methods on it, instead of passing
        # the class around as an explicit self argument.
        spider = cls()
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root',
                                          password='root', db='cfda', loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            await spider.init_urls(start_url, session)
        asyncio.ensure_future(spider.consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(AsyncSpider.main(loop))
    loop.run_forever()
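In both versions the stopping flag is declared but never set, so the crawler runs until the process is killed. A minimal shutdown sketch (an addition, not part of the original post; assuming the function-based version's main() and module-level stopping, and Unix-only because it relies on loop.add_signal_handler) would flip the flag on Ctrl-C and then stop the loop so run_forever() returns:

import asyncio
import signal

def request_stop(loop):
    # Flip the module-level flag checked by consumer(); its while loop
    # exits on the next iteration. Stop the event loop shortly after,
    # giving in-flight article handlers a moment to finish.
    global stopping
    stopping = True
    loop.call_later(2, loop.stop)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.add_signal_handler(signal.SIGINT, request_stop, loop)
    loop.run_until_complete(main(loop))
    loop.run_forever()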