aiohttp_spider
aiohttp_spider_def (function-based version):
import asyncio
import re
import aiohttp
import aiomysql
from lxml import etree
start_url = 'http://news.baidu.com/'
waiting_urls = []
seen_urls = set()
stopping = False
sem = asyncio.Semaphore(10)  # limit concurrency to 10 in-flight requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
async def fetch(url, session):
    async with sem:
        try:
            async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=1)) as resp:
                print('url status:{}'.format(resp.status))
                data = etree.HTML(await resp.read())
                return data
        except Exception as e:
            print('error:{}  url:{}'.format(e, url))
def extract_urls(html):
    try:
        for url in html.xpath('//a/@href'):
            if url and url.startswith("http") and url not in seen_urls:
                if re.findall(r'baidu', url):
                    waiting_urls.append(url)
    except Exception:
        pass
async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
async def article_handler(url, session, pool):
    # fetch the article page, mark it as seen, and harvest new links
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
    try:
        title = html.xpath('//title/text()')[0].strip()
        print('title:{}'.format(title))
        async with pool.acquire() as conn:
            async with conn.cursor() as cursor:
                try:
                    # insert (parameterized to avoid SQL injection)
                    await cursor.execute("insert into async_test_async(title) values(%s)", (title,))
                    # select
                    await cursor.execute("select * from async_test_async")
                    data = await cursor.fetchall()
                    print("data:", data)
                    # update
                    await cursor.execute("update async_test_async set title=%s where id=%s", ('update', 10168))
                    # delete
                    await cursor.execute("delete from async_test_async where id=%s", (10174,))
                except Exception:
                    pass
    except Exception:
        pass
async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if not waiting_urls:
                # nothing queued yet; give the crawlers time to find links
                await asyncio.sleep(0.5)
                continue
            url = waiting_urls.pop()
            print('start get url:{}'.format(url))
            if re.findall(r'baidu', url) and url not in seen_urls:
                print('waiting_urls:{}'.format(waiting_urls[0:3]))
                asyncio.ensure_future(article_handler(url, session, pool))
                await asyncio.sleep(0.1)
async def main(loop):
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda', loop=loop,
                                      charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        extract_urls(html)
    asyncio.ensure_future(consumer(pool))
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    loop.run_forever() 
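A note on the shared waiting_urls list: pop() on a plain list only stays safe here because asyncio runs these coroutines on a single thread, and the consumer has to poll and sleep when the list is empty. An asyncio.Queue expresses the same producer/consumer handoff more directly. Below is a minimal sketch of that alternative, not the original code; the names url_queue, producer, queue_consumer and demo are illustrative:
import asyncio

async def producer(url_queue, urls):
    # whatever discovers links (extract_urls above) would enqueue them here
    for url in urls:
        await url_queue.put(url)

async def queue_consumer(url_queue):
    while True:
        url = await url_queue.get()   # suspends until a URL is available, no polling
        try:
            print('start get url:{}'.format(url))
            # fetch/parse would go here, e.g. await article_handler(url, ...)
        finally:
            url_queue.task_done()     # lets url_queue.join() track completion

async def demo():
    url_queue = asyncio.Queue()       # replaces the global waiting_urls list
    task = asyncio.ensure_future(queue_consumer(url_queue))
    await producer(url_queue, ['http://news.baidu.com/'])
    await url_queue.join()            # returns once every queued URL is handled
    task.cancel()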
aiohttp_spider_class (class-based version):
import asyncio
import re
import aiohttp
import aiomysql
from lxml import etree
start_url = 'http://news.baidu.com/'
waiting_urls = []
seen_urls = set()
stopping = False
sem = asyncio.Semaphore(10)  # limit concurrency to 10 in-flight requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
class AsyncTest(object):
    async def fetch(self, url, session):
        async with sem:
            try:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=1)) as resp:
                    print('url status:{}'.format(resp.status))
                    data = etree.HTML(await resp.read())
                    return data
            except Exception as e:
                print('error:{}  url:{}'.format(e, url))
    def extract_urls(self, html):
        try:
            for url in html.xpath('//a/@href'):
                if url and url.startswith("http") and url not in seen_urls:
                    if re.findall(r'baidu', url):
                        waiting_urls.append(url)
        except Exception:
            pass
    async def init_urls(self, url, session):
        html = await self.fetch(url, session)
        seen_urls.add(url)
        self.extract_urls(html)
    async def article_handler(self, url, session, pool):
        # fetch the article page, mark it as seen, and harvest new links
        html = await self.fetch(url, session)
        seen_urls.add(url)
        self.extract_urls(html)
        try:
            title = html.xpath('//title/text()')[0].strip()
            print('title:{}'.format(title))
            async with pool.acquire() as conn:
                async with conn.cursor() as cur:
                    try:
                        # insert (parameterized to avoid SQL injection)
                        await cur.execute("insert into async_test_async(title) values(%s)", (title,))
                    except Exception:
                        pass
        except Exception:
            pass
    async def consumer(self, pool):
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if not waiting_urls:
                    # nothing queued yet; give the crawlers time to find links
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print('start get url:{}'.format(url))
                if re.findall(r'baidu', url) and url not in seen_urls:
                    print('waiting_urls:{}'.format(waiting_urls[0:3]))
                    asyncio.ensure_future(self.article_handler(url, session, pool))
                    await asyncio.sleep(0.1)
    @classmethod
    async def main(cls, loop):
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda',
                                          loop=loop, charset='utf8', autocommit=True)
        spider = cls()  # the methods are instance methods, so build an instance here
        async with aiohttp.ClientSession() as session:
            html = await spider.fetch(start_url, session)
            seen_urls.add(start_url)
            spider.extract_urls(html)
        asyncio.ensure_future(spider.consumer(pool))
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(AsyncTest.main(loop))
    loop.run_forever()
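Both versions start the loop with asyncio.get_event_loop() plus run_forever(), the pre-3.7 style. On Python 3.7+ the same startup can be written with asyncio.run(), which creates and closes the loop itself; the price is that the entry coroutine must stay alive instead of relying on run_forever(). A minimal sketch, reusing fetch(), extract_urls() and consumer() from the function-based version above (the name amain is illustrative):
import asyncio
import aiohttp
import aiomysql

async def amain():
    loop = asyncio.get_running_loop()   # the loop asyncio.run() created for us
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root',
                                      password='root', db='cfda', loop=loop,
                                      charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)   # seed the crawl as main() does
        seen_urls.add(start_url)
        extract_urls(html)
    await consumer(pool)   # await the consumer instead of ensure_future + run_forever

if __name__ == "__main__":
    asyncio.run(amain())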