python 爬虫005-爬虫实例

实例一：扒取猫眼电影TOP100 的信息

#!/usr/bin/env python

# -*- coding: utf-8 -*-

"""

扒取猫眼电影TOP100 的信息

"""

import re

import json

from multiprocessing import Pool

import requests

from requests.exceptions import RequestException

def get_page_content(url, timeout=10):
    """Download a page and return its HTML source.

    :param url: Address of the page to fetch.
    :param timeout: Seconds to wait for the server before giving up. Requests
        does not time out by default, so without this a stalled connection
        would block the calling worker indefinitely.
    :return: The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Connection problems and timeouts are reported as "no content".
        return None
    if response.status_code != 200:
        return None
    return response.text

# Matches one <dd> entry of the board page. Capture groups, in order:
# ranking index, value of the data-src attribute, text of the "name" link,
# and the two text fragments of the adjacent <i> tags inside "score".
# Written as a raw string so that ``\d`` is not an invalid string escape.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?class="name"><a.*?>(.*?)</a>.*?class="score"><i.*?>(.*?)</i><i.*?>(.*?)</i>.*?</dd>',
    re.S)


def parse_html(html):
    """Extract the movie entries from a board page with a regular expression.

    :param html: Page source as a string. ``None`` or an empty string (for
        example after a failed download) is accepted and yields nothing.
    :return: Generator of dicts with the keys ``index``, ``image``, ``title``
        and ``score``; ``score`` is the two captured score fragments
        concatenated.
    """
    if not html:
        # Nothing to parse; re.findall would raise TypeError on None.
        return
    for index, image, title, score_head, score_tail in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            'score': score_head + score_tail,
        }

def write_to_file(dic):
    """Append one record to the ``result`` file as a single JSON line.

    :param dic: JSON-serialisable mapping to store.
    :return: None
    """
    with open('result', mode='a', encoding='utf-8') as out:
        # ensure_ascii=False writes non-ASCII text as-is rather than as
        # \uXXXX escapes.
        serialized = json.dumps(dic, ensure_ascii=False)
        out.writelines((serialized, "\n"))

def main(offset):
    """Scrape one board page and store every movie entry found on it.

    :param offset: Value of the ``offset`` query parameter selecting the page.
    :return: None
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_page_content(url)
    if html is None:
        # get_page_content returns None when the download fails; handing that
        # to the regex parser would raise TypeError inside the worker.
        print('Failed to fetch page: %s' % url)
        return
    for item in parse_html(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # Page offsets 0, 10, ..., 90.
    page_offsets = list(range(0, 100, 10))
    # multiprocessing.Pool spreads the pages over four worker processes.
    workers = Pool(4)
    workers.map(main, page_offsets)
    workers.close()
    workers.join()

实例二：扒取头条网中的街拍图片

#!/usr/bin/env python

# -*- coding: utf-8 -*-

"""

扒取头条网中的街拍图片

"""

import requests

import os

from hashlib import md5

from multiprocessing import Pool

def get_page(offset):
    """Call the search endpoint and return its decoded JSON result.

    :param offset: Value of the ``offset`` query parameter (paging position).
    :return: The decoded JSON body when the server answers with HTTP 200;
        ``None`` for any other status code, when the request fails, or when
        the body is not valid JSON.
    """
    url = 'http://www.toutiao.com/search_content'
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    try:
        # Requests does not time out by default; bound the wait so a stalled
        # connection cannot hang the worker.
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON.
        print(e)
    return None

def get_images(json):
    """Iterate over the images referenced by a search result.

    :param json: Decoded search response (a dict), or ``None``/empty when the
        request failed. The parameter name is kept for compatibility with
        existing callers even though it matches the stdlib module name.
    :return: Generator of dicts with the keys ``title`` (the ``title`` of the
        result entry) and ``image`` (the ``url`` of one element of that
        entry's ``image_detail`` list).
    """
    if not json:
        # A failed request yields no data; calling .get on None would raise.
        return
    for item in json.get("data") or []:
        title = item.get("title")
        for image in item.get("image_detail") or []:
            yield {
                'title': title,
                'image': image.get("url"),
            }

def download_image(item):
    """Download one image into a directory named after its title.

    The file name is the MD5 hex digest of the image URL plus ``.jpg``, so a
    URL that was already downloaded is not fetched again.

    :param item: Dict with the keys ``image`` (URL) and ``title`` (directory
        name, relative to the current working directory).
    :return: None
    """
    url = item.get("image")
    title = item.get("title")
    if not url or not title:
        # Without a URL there is nothing to fetch, and without a title there
        # is no directory to store it in.
        return

    try:
        # exist_ok avoids the check-then-create race between pool workers
        # that handle entries sharing the same title.
        os.makedirs(title, exist_ok=True)
    except OSError as e:
        print(e)
        return

    file_path = os.path.join(
        title, md5(url.encode("utf-8")).hexdigest() + ".jpg")
    if os.path.exists(file_path):
        return

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            with open(file_path, 'wb') as fp:
                fp.write(response.content)
    except (requests.RequestException, OSError) as e:
        print('Failed to Save Image')
        print(e)

def main(offset):
    """Fetch one page of search results and download every image in it.

    :param offset: Paging offset passed to the search request.
    :return: None
    """
    # Step 1: fetch the JSON result of the search API.
    content = get_page(offset)
    if not content:
        # get_page returns None when the request fails; there is nothing to
        # iterate over in that case.
        print('No search result for offset %s' % offset)
        return
    # Step 2: walk over the title / image URL pairs of every search record.
    for item in get_images(content):
        print(item)
        # Step 3: download the image.
        download_image(item)

if __name__ == '__main__':
    GROUP_START = 1
    GROUP_END = 2
    # One offset per group, in steps of 20, from GROUP_START to GROUP_END
    # inclusive.
    offsets = list(range(GROUP_START * 20, (GROUP_END + 1) * 20, 20))

    # Hand the offsets to a pool of four worker processes.
    workers = Pool(4)
    workers.map(main, offsets)

    # Stop accepting work, then wait for the workers to exit.
    workers.close()
    workers.join()

实例三：扒取妹子图中的所有图片

#!/usr/bin/python

# coding=utf-8

"""

爬取妹子网图片到本地

"""

import os

import requests

from pyquery import PyQuery as pq

from multiprocessing import Pool

import time

class MeiZi:
    """Crawler that saves the album pictures of mzitu.com to local folders.

    The index pages list albums; every album has one picture per page. Each
    picture is stored under ``<root>/<album title>/<picture file name>``.
    """

    def __init__(self, root="F:\\meizitu", timeout=10):
        """
        :param root: Directory under which one folder per album is created.
        :param timeout: Seconds to wait for any single HTTP request.
        """
        self.index_url = "http://www.mzitu.com/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            'Referer': self.index_url
        }
        self.root = root
        self.timeout = timeout

    @staticmethod
    def _page_total(text):
        """Convert the text of a pager link to an int; 0 if it is no number."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return 0

    def get_document(self, url):
        """Fetch a page and parse it with pyquery.

        :param url: Address of the page.
        :return: The parsed document on HTTP 200, otherwise ``None`` (the
            failure is printed).
        """
        try:
            # Send the same headers as the image requests and bound the wait.
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout)
            if response.status_code == 200:
                return pq(response.text)
            print("[页面访问失败]:status_code:%s,url:%s" % (
                response.status_code, url))
        except Exception as e:
            # Best-effort crawl: report the problem and carry on.
            print(e)
        return None

    def index_page_range(self):
        """Return the numbers of all index pages.

        The total is read from the second-to-last link of the pager on the
        index page.

        :return: ``range(1, total + 1)``; an empty range when the index page
            cannot be fetched or the pager text is not a number.
        """
        doc = self.get_document(self.index_url)
        if not doc:
            return range(0)
        total = self._page_total(doc('.nav-links a:nth-last-child(2)').text())
        # Page URLs are built as "page/<n>", so numbering starts at 1 and must
        # include the last page.
        return range(1, total + 1)

    def get_items(self, page):
        """Yield the albums listed on one index page.

        :param page: Index page number.
        :return: Generator of dicts with the keys ``item_url`` and
            ``item_title``.
        """
        doc = self.get_document(self.index_url + "page/" + str(page))
        if not doc:
            return
        for item in doc('#pins li span a').items():
            yield {
                'item_url': item.attr('href'),
                'item_title': item.text()
            }

    def _save_main_image(self, doc, title):
        """Save the main picture of an album page, if the page has one."""
        img_src = doc('.main-image img').attr('src')
        if img_src:
            self.save_image(title, img_src)

    def save_item_details(self, item):
        """Save every picture of one album.

        :param item: Dict with the keys ``item_url`` and ``item_title`` as
            produced by :meth:`get_items`.
        :return: None
        """
        item_url = item.get("item_url")
        item_title = item.get("item_title")
        if not item_url:
            return
        doc = self.get_document(item_url)
        if not doc:
            return

        # Number of pages in the album; 0 when the pager is missing, in which
        # case only the first page is saved.
        page_count = self._page_total(
            doc('.pagenavi a:nth-last-child(2)').text())
        self._save_main_image(doc, item_title)

        # The remaining pages live at "<album url>/<n>" for n = 2..page_count.
        base_url = item_url.rstrip("/")
        for page in range(2, page_count + 1):
            doc = self.get_document(base_url + "/" + str(page))
            if doc:
                self._save_main_image(doc, item_title)

    def save_image(self, title, img_src):
        """Download ``img_src`` into the folder named ``title`` under the root.

        A picture whose target file already exists is not downloaded again.

        :param title: Album title, used as the folder name.
        :param img_src: Picture URL; its base name becomes the file name.
        :return: None
        """
        if not img_src:
            return
        save_dir = os.path.join(self.root, title)
        try:
            # exist_ok: several worker processes may create the same folder.
            os.makedirs(save_dir, exist_ok=True)
        except OSError as e:
            print("[創建文件夾失敗]:%s" % save_dir)
            print(e)
            return

        pic_path = os.path.join(save_dir, os.path.basename(img_src))
        if os.path.exists(pic_path):
            return
        try:
            response = requests.get(
                img_src, headers=self.headers, timeout=self.timeout)
            if response.status_code == 200:
                with open(pic_path, 'wb') as f:
                    f.write(response.content)
                print(pic_path)
            else:
                print("[图片访问失败]status_code:%s,url：%s" % (
                    response.status_code, img_src))
        except Exception as e:
            print(e)

    def main(self, page):
        """Save all pictures of all albums listed on one index page.

        :param page: Index page number.
        :return: None
        """
        for item in self.get_items(page):
            self.save_item_details(item)

if __name__ == '__main__':
    start_time = time.time()
    mz = MeiZi()
    page_count = mz.index_page_range()
    if page_count:
        # One task per index page, spread over ten worker processes.
        pool = Pool(10)
        pool.map(mz.main, page_count)
        pool.close()
        pool.join()
    else:
        # index_page_range gives nothing usable when the index page cannot be
        # read; pool.map would raise TypeError on None.
        print("No index pages found; nothing to download.")
    print("times:", time.time() - start_time)

实例四：扒取1688商品宝贝的数据包

#!/usr/bin/python

# coding=utf-8

"""

扒取1688淘宝图片到本地

"""

import os

import requests

from pyquery import PyQuery as pq

import json

import re

def save_image(root, pic_name, img_src):
    """Download a picture and store it as ``root/pic_name``.

    The target folder is created when missing, and a picture whose target
    file already exists is left untouched.

    :param root: Folder to save into.
    :param pic_name: File name of the saved picture, e.g. ``a.jpg``.
    :param img_src: URL of the picture.
    :return: None
    """
    # Make sure the destination folder is there.
    if not os.path.exists(root):
        try:
            os.makedirs(root)
        except Exception as err:
            print("[創建文件夾失敗]:%s" % root)
            print(err)

    target = os.path.join(root, pic_name)
    if os.path.exists(target):
        # Already downloaded earlier.
        return

    try:
        reply = requests.get(img_src)
        if reply.status_code != 200:
            print("[图片访问失败]status_code:%s,url：%s" % (
                reply.status_code, img_src))
            return
        with open(target, 'wb') as out:
            out.write(reply.content)
            print("下载完成：", target)
    except Exception as err:
        print(err)

def main(root, detail_url, cookie):
    """Download the main and detail pictures of one product page.

    Main pictures are saved as ``主图_<n><ext>`` and detail pictures as
    ``详情_<n>.<ext>`` inside ``root``.

    :param root: Folder the pictures are saved into.
    :param detail_url: URL of the product detail page.
    :param cookie: Value sent as the ``cookie`` request header.
    :return: None
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, br',
        'upgrade-insecure-requests': '',
        # Cookie supplied by the caller.
        'cookie': cookie,
    }
    resp = requests.get(url=detail_url, headers=header, timeout=10)
    doc = pq(resp.text)

    # Main pictures: each tab element carries a JSON "data-imgs" attribute
    # whose "original" entry is the picture URL.
    for loop, image in enumerate(doc('.content li.tab-trigger'), start=1):
        raw = image.get('data-imgs')
        if not raw:
            # json.loads(None) would raise; skip tabs without picture data.
            continue
        src = json.loads(raw).get('original')
        if not src:
            continue
        pic_type = os.path.splitext(src)[1]
        save_image(root, '主图_%s%s' % (loop, pic_type), src)

    # Detail pictures: the container's "data-tfs-url" attribute points to a
    # separate document that holds the <img> tags.
    detail = doc('#desc-lazyload-container').attr('data-tfs-url')
    if not detail:
        # Requesting a missing URL would raise instead of downloading.
        print("No detail-picture URL found on the page; skipping detail pictures.")
        return
    resp = requests.get(url=detail, timeout=10)
    if resp.status_code == 200:
        # Raw string so that ``\.`` is not an invalid string escape. Each
        # match is a (url, extension) pair.
        src_match = re.findall(r'<img.*?(https://.*?\.(jpg|png|jpeg|gif))',
                               resp.text)
        for loop, (src, extension) in enumerate(src_match, start=1):
            save_image(root, '详情_%s.%s' % (loop, extension), src)

if __name__ == '__main__':
    # Cookie copied from the browser after logging in to 1688 by hand.
    cookie = 'XXXXXXXXXXXX'
    # Exact URL of the 1688 product detail page.
    detail_url = 'https://detail.1688.com/offer/36413052665.html?spm=b26110380.sw1688.mof001.34.xEXARY'
    # Folder for the pictures; pick a different one for every run so that
    # earlier downloads are not mixed up with new ones.
    root = 'F:\\ShopData\\777'

    # Run the download.
    main(root=root, detail_url=detail_url, cookie=cookie)
    print("数据包下载完成！")

***微信扫一扫，关注“python测试开发圈”，了解更多测试教程！***

python 爬虫005-爬虫实例的更多相关文章

Python爬虫框架Scrapy实例（三）数据存储到MongoDB
Python爬虫框架Scrapy实例(三)数据存储到MongoDB任务目标:爬取豆瓣电影top250,将数据存储到MongoDB中. items.py文件复制代码# -*- coding: utf-8 ...
Python 开发轻量级爬虫08
Python 开发轻量级爬虫 (imooc总结08--爬虫实例--分析目标) 怎么开发一个爬虫?开发一个爬虫包含哪些步骤呢? 1.确定要抓取得目标,即抓取哪些网站的哪些网页的哪部分数据. 本实例确定抓 ...
Python 开发轻量级爬虫04
Python 开发轻量级爬虫 (imooc总结04--url管理器) 介绍抓取URL管理器 url管理器用来管理待抓取url集合和已抓取url集合. 这里有一个问题,遇到一个url,我们就抓取它的内容 ...
Python 开发轻量级爬虫01
Python 开发轻量级爬虫 (imooc总结01--课程目标) 课程目标:掌握开发轻量级爬虫为什么说是轻量级的呢?因为一个复杂的爬虫需要考虑的问题场景非常多,比如有些网页需要用户登录了以后才能够访 ...
Python开发简单爬虫 - 慕课网
课程链接:Python开发简单爬虫环境搭建: Eclipse+PyDev配置搭建Python开发环境 Python入门基础教程用Eclipse编写Python程序课程目录第1章课程介绍 ...
Python实战：爬虫的基础
网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本.另外一些不常使用的名字还有蚂蚁.自动索引.模拟程序或者蠕 ...
爬虫新手学习2-爬虫进阶(urllib和urllib2 的区别、url转码、爬虫GET提交实例、批量爬取贴吧数据、fidder软件安装、有道翻译POST实例、豆瓣ajax数据获取)
1.urllib和urllib2区别实例 urllib和urllib2都是接受URL请求相关模块,但是提供了不同的功能,两个最显著的不同如下: urllib可以接受URL,不能创建设置headers的 ...
c#代码天气接口一分钟搞懂你的博客为什么没人看看完python这段爬虫代码，java流泪了c#沉默了图片二进制转换与存入数据库相关 C#7.0--引用返回值和引用局部变量 JS直接调用C#后台方法（ajax调用） Linq To Json SqlServer 递归查询
天气预报的程序.程序并不难. 看到这个需求第一个想法就是只要找到合适天气预报接口一切都是小意思,说干就干,立马跟学生沟通价格. 不过谈报价的过程中,差点没让我一口老血喷键盘上,话说我们程序猿的人 ...
【网络爬虫】【python】网络爬虫（四）：scrapy爬虫框架（架构、win/linux安装、文件结构）
scrapy框架的学习,目前个人觉得比较详尽的资料主要有两个: 1.官方教程文档.scrapy的github wiki: 2.一个很好的scrapy中文文档:http://scrapy-chs.rea ...
Python 爬虫1——爬虫简述
Python除了可以用来开发Python Web之后,其实还可以用来编写一些爬虫小工具,可能还有人不知道什么是爬虫的. 一.爬虫的定义: 爬虫——网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区 ...

随机推荐

细说PHP7
PHP7带来的新东西 1.类型的声明. 可以使用字符串(string), 整数 (int), 浮点数 (float), 以及布尔值 (bool),来声明函数的参数类型与函数返回值. declare(s ...
Oracle 通过undo块查看事务信息（转）
数据库版本:Oracle 11.2.0.3 RAC 实验目的:通过undo块查看Oracle事务信息实验细节:1 开始一个事务SQL> select * from t1; ID NAME- ...
Python 函数的使用小结
函数的好处:提高代码复用性,简化代码,代码可扩展. 函数只有调用的时候才会被执行. 1.参数: 形参&实参:位置参数,属于必填参数:默认值参数,为非必填参数,没有传值时使用默认值:关键字参数: ...
dubbo总结
一 .Dubbo产生背景单一应用架构当网站流量很小时,只需一个应用,将所有功能都部署在一起,以减少部署节点和成本.此时,用于简化增删改查工作量的数据访问框架(ORM) 是关键. 垂直应用架构当访 ...
杭电1019Least Common Multiple
地址:http://acm.hdu.edu.cn/showproblem.php?pid=1019 题目: Problem Description The least common multiple ...
LRU算法---缓存淘汰算法
计算机中的缓存大小是有限的,如果对所有数据都缓存,肯定是不现实的,所以需要有一种淘汰机制,用于将一些暂时没有用的数据给淘汰掉,以换入新鲜的数据进来,这样可以提高缓存的命中率,减少磁盘访问的次数. LR ...
《Java入门第三季》第二章认识 Java 中的字符串
什么是 Java 中的字符串.1.在Java的世界里,字符串被作为String类型的对象处理. 2.通用的初始化的方式:String s = new String("i love you & ...
在python3下使用OpenCV 抓取摄像头图像提取蓝色
工作中需要对摄像头进行调试, Python平台大大提高调试效率. 从网找到段代码, 可以从摄像头图像中抠出蓝色. import cv2 import numpy as np cap = cv2.Vi ...
RabbitMQ学习之（一）_初步了解RabbitMQ、RabbitMQ的使用流程、为什么要使用RabbitMQ、RabbitMQ的应用场景
初识RabbitMQ RabbitMQ是一个在AMQP协议基础上实现的消息队列系统, 是一个消息代理.它的核心原理非常简单:接收和发送消息.你可以把它想像成一个邮局:你把信件放入邮箱,邮递员就会把信件 ...
Ant Design 常用命令汇总
Ant Design React 安装 1. 安装脚手架工具# antd-init 是一个用于演示 antd 如何使用的脚手架工具,真实项目建议使用 dva-cli. $ npm install an ...

python 爬虫005-爬虫实例

实例一：扒取猫眼电影TOP100 的信息

实例二：扒取头条网中的街拍图片

实例三：扒取妹子图中的所有图片

实例四：扒取1688商品宝贝的数据包

python 爬虫005-爬虫实例的更多相关文章

随机推荐

热门专题