福利爬虫妹子图之获取种子url

import os

import uuid

from lxml import html

import aiofiles

import logging

from ruia import Spider, Request

from ruia_ua import middleware

from aiohttp探究.db import MotorBase

import datetime

# URL template for a listing page; the launcher fills `{}` with a page number.
demo = "https://www.mzitu.com/page/{}/"

class BaiduImgSpider(Spider):
    """Collect the image URLs of every gallery linked from a listing page.

    Flow, as implemented below:

    * ``parse`` handles a listing page and yields one request per gallery.
    * ``next_page`` handles a gallery's first page, derives the URL of every
      image in that gallery and inserts one record per image into the
      ``mzitu2`` collection of the ``img_data`` database.
    * ``save_img`` writes a downloaded image to disk. No request issued by
      this class currently routes to it.
    """

    start_urls = []

    # Root directory for files written by ``save_img``.
    img_path = 'data/'

    # Browser-like headers shared by every request/record built here.
    _COMMON_HEADERS = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    }

    @staticmethod
    def _build_record(url, title, img_id, headers):
        """Assemble the document stored for a single image."""
        return {
            'url': url,
            "status": "0",
            'title': title,
            "img_id": img_id,
            "headers": headers,
            "crawler_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

    async def parse(self, res):
        """Yield one gallery request for each entry of a listing page.

        :param res: response of a listing page; ``res.html`` holds the markup.
        :return: async generator of ``Request`` objects handled by
            ``next_page``; the gallery title travels in ``metadata["name"]``.
        """
        # Stored on the instance because ``next_page`` writes through it.
        self.mongo_db = MotorBase().get_db('img_data')

        root = html.fromstring(res.html)
        url_list = root.xpath("//ul[@id='pins']/li/a/@href")
        name_list = root.xpath("//ul[@id='pins']/li/a/img/@alt")

        headers = {**self._COMMON_HEADERS, 'referer': 'https://www.mzitu.com/mm/'}

        for name, url in zip(name_list, url_list):
            # Each request gets its own dict so that anything which later
            # modifies one request's headers cannot affect the others.
            yield Request(url, headers=dict(headers), callback=self.next_page,
                          metadata={"name": name}, res_type='text')

    async def next_page(self, res):
        """Store one record per image of the gallery whose first page is ``res``.

        The first image URL is read from the page; the URLs of the remaining
        images are derived by replacing the ``01.`` counter in that URL with
        the zero-padded page number.

        :param res: response of a gallery's first page, carrying the gallery
            title in ``res.metadata["name"]``.
        """
        root = html.fromstring(res.html)
        name = res.metadata.get("name")
        refere_url = res.url

        img_url_node = root.xpath("//div[@class='main-image']/p/a/img/@src")
        if not img_url_node:
            # Nothing to derive further URLs from; storing a record without
            # a URL would be useless.
            logging.warning('No image found on %s, skipping', refere_url)
            return
        img_url = img_url_node[0]

        # The text of the second-to-last pager link is used as the page count.
        page_text = root.xpath("//div[@class='pagenavi']/a[last()-1]/span/text()")
        try:
            max_page_num = int(page_text[0])
        except (IndexError, ValueError):
            logging.warning('No usable page count on %s, storing first image only', refere_url)
            max_page_num = 1

        base_headers = {
            **self._COMMON_HEADERS,
            'if-modified-since': 'Thu, 15 Nov 2018 04:24:11 GMT',
            'if-none-match': '"5becf4eb-1b7d4"',
        }

        # NOTE(review): the first record uses the string "1" as img_id while
        # the following ones use an int; kept as-is in case stored data or a
        # consumer depends on it -- confirm before unifying.
        datas = [
            self._build_record(img_url, name, "1", {**base_headers, 'referer': refere_url}),
        ]

        # Only the last "01." is treated as the image counter, so an earlier
        # occurrence elsewhere in the URL is left untouched.
        prefix, marker, suffix = img_url.rpartition("01.")
        if marker:
            for page in range(2, max_page_num + 1):
                page_tag = str(page).zfill(2)
                datas.append(self._build_record(
                    f"{prefix}{page_tag}.{suffix}",
                    name,
                    page,
                    # A separate headers dict per record: sharing one dict and
                    # mutating its referer would leave every record with the
                    # referer of the last page.
                    {**base_headers, 'referer': f"{refere_url}{page_tag}"},
                ))
        elif max_page_num > 1:
            logging.warning('Cannot derive further image URLs from %s', img_url)

        await self.mongo_db.mzitu2.insert_many(datas)

    async def save_img(self, res):
        """Write the downloaded image in ``res`` below ``img_path``.

        The file is stored as ``<img_path>/<name>/<id>.<extension>``, with the
        extension taken from the image URL.

        :param res: response whose ``metadata`` holds ``url``, ``name`` and
            ``id``, and whose ``html`` attribute holds the image bytes.
        """
        url = res.metadata.get("url")
        name = res.metadata.get("name")
        img_id = res.metadata.get("id")

        # Look for the extension in the last path segment only, and cope with
        # a URL that has none instead of raising IndexError.
        file_part = (url or "").rsplit("/", 1)[-1]
        _, dot, img_type = file_part.rpartition(".")
        img_name = f"{img_id}.{img_type}" if dot else str(img_id)

        # The title comes from scraped markup; path separators in it would
        # otherwise create unintended nested directories.
        safe_name = str(name).replace("/", "_").replace("\\", "_")
        img_dir = os.path.join(self.img_path, safe_name)
        os.makedirs(img_dir, exist_ok=True)

        img_file = os.path.join(img_dir, img_name)
        async with aiofiles.open(img_file, 'wb') as fp:
            await fp.write(res.html)
        logging.info('Img downloaded successfully in %s', img_file)

if __name__ == '__main__':

    word = '妹子图'  # output directory name

    pages = 201  # number of listing pages to crawl

    BaiduImgSpider.img_path = word + "/"

    # range(pages) would request page 0 and never request page `pages`.
    # NOTE(review): assumes listing pages are numbered from 1 -- confirm.
    BaiduImgSpider.start_urls = [demo.format(page) for page in range(1, pages + 1)]

    BaiduImgSpider.start(middleware=middleware)

db.py

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

class MotorBase:
    """Hand out Motor database handles, creating each one only once.

    Motor documentation: https://github.com/mongodb/motor
    """

    # Class-wide cache mapping a database name to its Motor database object.
    _db = {}

    # Declared next to ``_db``; none of the methods below read or write it.
    _collection = {}

    def __init__(self, loop=None):
        """Remember the event loop to hand to Motor.

        :param loop: event loop to use; when falsy, the loop returned by
            ``asyncio.get_event_loop()`` is used instead.
        """
        self.motor_uri = ''
        self.loop = loop if loop else asyncio.get_event_loop()

    def client(self, db):
        """Create a Motor client for ``db`` on the local MongoDB server.

        :param db: database name, appended to the connection URI.
        :return: a new ``AsyncIOMotorClient`` bound to ``self.loop``.
        """
        uri = f"mongodb://localhost:27017/{db}"
        self.motor_uri = uri
        return AsyncIOMotorClient(uri, io_loop=self.loop)

    def get_db(self, db='test'):
        """Return the database handle for ``db``, building it on first use.

        :param db: database name
        :return: the motor db instance
        """
        cache = self._db
        if db in cache:
            return cache[db]
        handle = self.client(db)[db]
        cache[db] = handle
        return handle

福利爬虫妹子图之获取种子url的更多相关文章

关于如何爬虫妹子图网的源码分析 c#实现
网上也出现一些抓取妹子图的python 代码,今天我们用c#实现爬虫过程. 请看我的网站: www.di81.com private void www_94xmn_Com(string url, st ...
爬虫实战【5】送福利！Python获取妹子图上的内容
[插入图片,妹子图首页] 哈,只敢放到这个地步了. 今天给直男们送点福利,通过今天的代码,可以把你的硬盘装的满满的~ 下面就开始咯! 第一步:如何获取一张图片假如我们知道某张图片的url,如何获取到 ...
python妹子图爬虫5千张高清大图突破防盗链福利5千张福利高清大图
meizitu-spider python通用爬虫-绕过防盗链爬取妹子图这是一只小巧方便,强大的爬虫,由python编写所需的库有 requests BeautifulSoup os lxml 伪 ...
Python协程爬取妹子图(内有福利，你懂得~)
项目说明: 1.项目介绍本项目使用Python提供的协程+scrapy中的选择器的使用(相当好用)实现爬取妹子图的(福利图)图片,这个学会了,某榴什么的.pow(2, 10)是吧! 2.用到的知 ...
Scrapy框架实战-妹子图爬虫
Scrapy这个成熟的爬虫框架,用起来之后发现并没有想象中的那么难.即便是在一些小型的项目上,用scrapy甚至比用requests.urllib.urllib2更方便,简单,效率也更高.废话不多说, ...
Python爬虫入门教程 2-100 妹子图网站爬取
妹子图网站爬取---前言从今天开始就要撸起袖子,直接写Python爬虫了,学习语言最好的办法就是有目的的进行,所以,接下来我将用10+篇的博客,写爬图片这一件事情.希望可以做好. 为了写好爬虫,我们 ...
Python3爬虫系列：理论+实验+爬取妹子图实战
Github: https://github.com/wangy8961/python3-concurrency-pics-02 ,欢迎star 爬虫系列: (1) 理论 Python3爬虫系列01 ...
[Python爬虫]煎蛋网OOXX妹子图爬虫（1）——解密图片地址
之前在鱼C论坛的时候,看到很多人都在用Python写爬虫爬煎蛋网的妹子图,当时我也写过,爬了很多的妹子图片.后来煎蛋网把妹子图的网页改进了,对图片的地址进行了加密,所以论坛里面的人经常有人问怎么请求的 ...
Python Scrapy 爬取煎蛋网妹子图实例（一）
前面介绍了爬虫框架的一个实例,那个比较简单,这里在介绍一个实例爬取煎蛋网妹子图,遗憾的是上周煎蛋网还有妹子图了,但是这周妹子图变成了随手拍, 不过没关系,我们爬图的目的是为了加强实战应用,管 ...

随机推荐

[luogu1327][生活大爆炸石头剪子布]
题目地址 https://www.luogu.org/problemnew/show/P1328 题目描述石头剪刀布是常见的猜拳游戏:石头胜剪刀,剪刀胜布,布胜石头.如果两个人出拳一样,则不分胜负. ...
关于next.js中的css
css进行了全局和局部的限制 export default () => ( <div className='hello'> <p>Hello World</p> ...
EOJ2018.10 月赛
EOJ2018.10 月赛题目一览表(Green color indicate understand and Accept) 来源考察知识点完成时间 A oxx 的小姐姐们 EOJ 数学+思维 ...
mfc editline 变为大框框
属性:
数据类型、位运算、sizeof()函数
数据精度,依次升高.(负数必须使用有符号类型) 不同精度的数据间运算,所得结果为高精度类型. 数据类型详细信息如下图: 整型数据的数制:十进制(32).八进制(032,以0开头).十六进制(0x32, ...
STM32 --- 断言（assert_param）的开启和使用
默认,STM32的assert_param是没有开启检测,需要 #define USE_FULL_ASSERT 开启后,才能检测形参是否符合要求 // #define assert_param(exp ...
eclipse中编辑properties文件无法看到中文
如果在eclipse中编辑properties文件无法看到中文则参考“Eclipse开发环境配置-indigo.docx”添加propedit插件.
vuex2.0源码分析
当我们用vue在开发的过程中,经常会遇到以下问题多个vue组件共享状态 Vue组件间的通讯在项目不复杂的时候,我们会利用全局事件bus的方式解决,但随着复杂度的提升,用这种方式将会使得代码难以维护 ...
Ubuntu 16.04及以上安装/卸载 Docker-CE
前言 本文仅针对Ubuntu 18.10、18.04、16.04的x86_64的OS与架构下的Docker-CE的安装 卸载老版本 如果已安装,请卸载它们: sudo apt-get remove d ...
stm32启动文件ld md hd cl vl xl分析及选择
startup_stm32f10x_cl.s互联型的STM32F105xx,STM32F107xxstartup_stm32f10x_hd.s 大容量的STM32F101xx,STM32F102xx, ...

福利爬虫妹子图之获取种子url

福利爬虫妹子图之获取种子url的更多相关文章

随机推荐

热门专题