python3爬虫-分析Ajax，抓取今日头条街拍美图

# coding=utf-8

from urllib.parse import urlencode

import requests

from requests.exceptions import RequestException,Timeout

import json

from bs4 import BeautifulSoup

from pymongo import MongoClient

from multiprocessing import Pool

import os

import string

from hashlib import md5

def get_response(url):
    """Fetch ``url`` with a browser-like User-Agent and a 5-second timeout.

    Returns the ``requests`` response object when the request completes.
    When the request times out, a message is printed and ``None`` is
    returned. Only ``Timeout`` is handled here; any other exception raised
    by ``requests.get`` propagates to the caller.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
    )
    try:
        reply = requests.get(url, headers={"user-agent": user_agent}, timeout=5)
    except Timeout:
        print(url + 'request timeout')
        return None
    print(url + 'request success')
    return reply

def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (timeout, request error or non-200 status).
    """
    params = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab",
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(params)
    print(url)

    try:
        response = get_response(url)
    except RequestException:
        print('request error')
        return None

    # get_response() returns None when the request timed out; touching
    # .status_code on it would raise AttributeError, which the
    # RequestException handler above does not cover.
    if response is None:
        return None

    print(response.status_code)
    if response.status_code == 200:
        return response.text
    return None

def conn_mongodb():
    """Return the ``jiepai`` collection of the ``jiepai`` database.

    A new ``MongoClient`` for ``localhost:27017`` is created on every call.
    """
    return MongoClient('localhost', 27017)['jiepai']['jiepai']

def save_image_url(data):
    """Insert or update one image-group document, keyed by its title.

    Args:
        data: Mapping holding the fields to store; its ``title`` value is
            used as the match filter, and all of its fields are written
            with ``$set``. A new document is inserted when no document
            with that title exists (upsert).
    """
    collection = conn_mongodb()
    # Collection.update() was deprecated in PyMongo 3 and removed in
    # PyMongo 4. update_one() with upsert=True is the direct replacement
    # for the single-document update the original call performed.
    collection.update_one(
        {'title': data.get('title')},
        {'$set': data},
        upsert=True,
    )

def get_image_url():
    """Query every stored document, keeping only title and images_list.

    Returns the result of ``find`` on the collection obtained from
    ``conn_mongodb()``; the ``_id`` field is excluded by the projection.
    """
    projection = {'title': 1, 'images_list': 1, '_id': 0}
    return conn_mongodb().find({}, projection)

def _safe_dir_name(title):
    """Turn an article title into a string usable as a directory name."""
    name = str(title).strip(string.punctuation)
    # Remove the characters Windows forbids in file names; '/' and '\\'
    # would additionally be read as path separators.
    name = name.translate(str.maketrans('', '', '\\/:*?"<>|'))
    # A title made only of punctuation would otherwise collapse to the
    # parent directory itself.
    return name or 'untitled'


def download_image(data, base_dir=None):
    """Download every image of the given documents into per-title folders.

    Images are written to ``<base_dir>/jiepai/<title>/<md5-of-content>.jpg``.

    Args:
        data: An iterable of documents (mappings with ``title`` and
            ``images_list`` keys). A single document dict is also accepted
            and treated as a one-element list, so the function can be
            mapped directly over individual documents.
        base_dir: Directory under which the ``jiepai`` folder is created.
            Defaults to the directory containing this file.

    Documents without a title, empty image URLs, timed-out requests and
    failed requests are skipped instead of aborting the whole batch.
    """
    if isinstance(data, dict):
        data = [data]
    if base_dir is None:
        base_dir = os.path.abspath(os.path.dirname(__file__))

    # os.path.join keeps the paths portable (the hard-coded backslashes only
    # worked on Windows); exist_ok=True avoids the exists()/mkdir() race
    # when several pool workers create the same folder at once.
    root = os.path.join(base_dir, 'jiepai')
    os.makedirs(root, exist_ok=True)

    for item in data:
        title = item.get('title')
        print(title)
        if title is None:
            continue

        images_list = item.get('images_list') or []
        print('images_lsit', images_list)

        folder = os.path.join(root, _safe_dir_name(title))
        os.makedirs(folder, exist_ok=True)

        for image_url in images_list:
            print(image_url)
            if not image_url:
                continue
            try:
                response = get_response(image_url)
            except RequestException:
                # One unreachable image must not kill the whole worker.
                print('request error')
                continue
            # get_response() returns None when the request timed out.
            if response is None:
                continue

            content = response.content
            # Naming the file after the content hash de-duplicates
            # identical images inside one folder.
            image_name = md5(content).hexdigest() + '.jpg'
            with open(os.path.join(folder, image_name), 'wb') as f:
                f.write(content)
            print('download success')

def parse_page_index(html):
    """Extract titled image groups from a search-result JSON and store them.

    Args:
        html: JSON text of one search-result page, or ``None``/empty when
            the page could not be fetched.

    For every entry of the top-level ``data`` list that has a ``title``, a
    document ``{'title': ..., 'images_list': [...]}`` is built from the
    ``url`` fields of its ``image_detail`` entries (duplicates removed,
    original order kept) and passed to ``save_image_url``. Entries without
    a title are skipped: they cannot be keyed in the database and have no
    usable folder name for the later download step.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; a non-JSON body
        # (e.g. an HTML error page) is ignored rather than crashing.
        print('invalid json')
        return
    if not isinstance(payload, dict):
        return

    # `or []` covers both a missing key and an explicit JSON null.
    for item in payload.get('data') or []:
        title = item.get('title')
        if title is None:
            continue

        urls = (image.get('url') for image in item.get('image_detail') or [])
        # dict.fromkeys de-duplicates while keeping first-seen order.
        images_list = list(dict.fromkeys(url for url in urls if url))

        group = {'title': title, 'images_list': images_list}
        print(group)
        save_image_url(group)

def main(offset, keyword='街拍'):
    """Crawl one search-result page and store the image groups it lists.

    Args:
        offset: Result offset of the page to fetch.
        keyword: Search keyword; defaults to the keyword this script was
            written for.
    """
    html = get_page_index(offset, keyword)
    # get_page_index() returns None on timeout, request error or a non-200
    # status; there is nothing to parse in that case.
    if html is None:
        return
    parse_page_index(html)

if __name__ == "__main__":
    # Step 1 (disabled): crawl the image links with multiple processes and
    # save them to MongoDB.
    # groups = [x*20 for x in range(0,5)]
    # pool = Pool()
    # pool.map(main, groups)

    # Step 2: read the links back from MongoDB and download the images with
    # multiple processes.
    # The cursor can only be consumed once, so materialize it first and hand
    # that list (not the exhausted cursor) to the pool.
    documents = list(get_image_url())

    # download_image() iterates over a collection of documents, so every
    # task is wrapped as a one-document list. The context manager releases
    # the worker processes once the (blocking) map call has finished.
    with Pool() as pool:
        pool.map(download_image, [[doc] for doc in documents])

python3爬虫-分析Ajax，抓取今日头条街拍美图的更多相关文章

分析Ajax抓取今日头条街拍美图
spider.py # -*- coding:utf-8 -*- from urllib import urlencode import requests from requests.exceptio ...
【Python3网络爬虫开发实战】分析Ajax爬取今日头条街拍美图
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理.作者:haoxuan10 本节中,我们以今日头条为例来尝试通过分析Ajax请求 ...
分析Ajax爬取今日头条街拍美图-崔庆才思路
站点分析源码及遇到的问题代码结构方法定义需要的常量关于在代码中遇到的问题 01. 数据库连接 02.今日头条的反爬虫机制 03. json解码遇到的问题 04. 关于response.tex ...
关于爬虫的日常复习（9）—— 实战：分析Ajax抓取今日头条街拍美图
python爬虫之分析Ajax请求抓取今日头条街拍美图（七）
python爬虫之分析Ajax请求抓取今日头条街拍美图 一.分析网站 1.进入浏览器,搜索今日头条,在搜索栏搜索街拍,然后选择图集这一栏. 2.按F12打开开发者工具,刷新网页,这时网页回弹到综合 ...
15-分析Ajax请求并抓取今日头条街拍美图
流程框架: 抓取索引页内容:利用requests请求目标站点,得到索引网页HTML代码,返回结果. 抓取详情页内容:解析返回结果,得到详情页的链接,并进一步抓取详情页的信息. 下载图片与保存数据库:将 ...
Python Spider 抓取今日头条街拍美图
""" 抓取今日头条街拍美图 """ import os import time import requests from hashlib ...
分析Ajax请求并抓取今日头条街拍美图
项目说明本项目以今日头条为例,通过分析Ajax请求来抓取网页数据. 有些网页请求得到的HTML代码里面并没有我们在浏览器中看到的内容.这是因为这些信息是通过Ajax加载并且通过JavaScript渲 ...
【Python3网络爬虫开发实战】6.4-分析Ajax爬取今日头条街拍美图【华为云技术分享】
[摘要] 本节中,我们以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法.这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来. 1. 准备工作在本节 ...
转：【Python3网络爬虫开发实战】6.4-分析Ajax爬取今日头条街拍美图
[摘要] 本节中,我们以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法.这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来. 1. 准备工作在本节 ...

随机推荐

vb调试dll
1.有两个工程BW_DetectCard.vbp(生成dll)及识别卡检测软件.vbp(生成exe) 2.打开工程<识别卡检测软件.vbp>,在文件--添加工程--现存,找到要引用的dll ...
使用NPOI读取Excel出错
使用NPOI读取Excel出错,错误信息:java.io.IOException: Invalid header signature; read 4503608217567241, expected ...
你应该将应用迁移到Spring 4的五个原因
本文来源于我在InfoQ中文站翻译的文章,原文地址是:http://www.infoq.com/cn/news/2015/12/five-reasons-to-migrate-spring4 Rafa ...
C++ Primer Plus的若干收获--（二）
哎,真是不想吐槽考驾照的艰辛历程了.跑到大西郊,顶着大太阳,一天就能摸上个十几分钟二十分钟的车,简直不要太坑爹,这两天真是做的我屁股疼的不行. .. 今天果断不去了.仅仅可惜我的大阿根廷啊,坚持到最后 ...
SQL Server事务详解
事务定义: 事务是单个的工作单元.如果某一事务成功,则在该事务中进行的所有数据更改均会提交,成为数据库中的永久组成部分.如果事务遇到错误且必须取消或回滚,则所有数据更改均被清除. 事务三种运行模式: ...
NodeJS CSV导出文件名和内容乱码解决
// 解决不同浏览器下载文件名称乱码 var userAgent = (req.headers['user-agent']||'').toLowerCase(); res.set('Content-T ...
Android 仿今日头条频道管理（上）（GridView之间Item的移动和拖拽）
前言常常逛今日头条.发现它的频道管理功能做的特别赞.交互体验很好.如图: watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQv/font/5a6L5L2T/fo ...
如何在Linux下统计高速网络中的流量
参考: http://www.geekfan.net/5558/ http://blog.jobbole.com/23638/ http://www.csdn.net/article/2014-03- ...
vs2015创建webService
自定义 Collection View 布局
自定义 Collection View 布局 answer-huang 29 Mar 2014 分享文章 UICollectionView 在 iOS6 中第一次被引入,也是 UIKit 视图类中的一 ...

python3爬虫-分析Ajax，抓取今日头条街拍美图

python3爬虫-分析Ajax，抓取今日头条街拍美图的更多相关文章

随机推荐

热门专题