分析ajax请求抓取今日头条关键字美图

 # 目标：抓取今日头条关键字美图

 # 思路：

 # 一、分析目标站点

 # 二、构造ajax请求，用requests请求到索引页的内容，正则+BeautifulSoup得到索引url

 # 三、对索引url请求，得到图片url与标题，下载并保存到数据库，本次使用MongoDB

 # 四、开启循环与多进程，对多页内容遍历与抓取

 #问题一、为什么要构造请求

 #为什么要构造请求，举个例子，第一屏的内容我们看到的实际url是：

 # http://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1

 # 后面有一大串参数，这些参数就是请求的一些‘设定’，表示关键词，加载的页数，等等，是一个字典的形式，

 # 如果人为去传这些数据显然十分繁琐，我们需要将这字典编码成一定格式加载请求函数里面。

 import os

 from json import JSONDecodeError

 from multiprocessing.pool import Pool

 import requests

 from urllib.parse import urlencode

 import json

 import pymongo

 from bs4 import BeautifulSoup

 from requests.exceptions import RequestException

 import re

 from config import *

 # connect=False defers the actual connection until the first operation.
 # The Pool in the __main__ block forks workers after this module-level
 # client is created; PyMongo advises against sharing a connected client
 # across fork, so connecting lazily lets each worker connect on its own.
 client = pymongo.MongoClient(MONGO_URL, connect=False)

 db = client[MONGO_DB]

 def get_index_page(offset, keyword):
     """Fetch one page of Toutiao search results as raw response text.

     Args:
         offset: Value sent as the ``offset`` query parameter.
         keyword: Search keyword sent as the ``keyword`` query parameter.

     Returns:
         The response body text when the server answers with status 200;
         ``None`` on any other status or when the request fails.
     """
     query = {
         'offset': offset,
         'format': 'json',
         'keyword': keyword,
         'autoload': 'true',
         # Page size: matches the count=20 of the example URL and the
         # step of 20 used when the offsets are generated.
         'count': 20,
         'cur_tab': 1,
     }
     url = 'http://www.toutiao.com/search_content/?' + urlencode(query)
     try:
         # Without a timeout a stalled connection would block this pool
         # worker indefinitely.
         response = requests.get(url, timeout=10)
     except RequestException:
         print('请求不到索引页面！')
         return None
     if response.status_code == 200:
         return response.text
     return None

 def parse_index_page(html):
     """Yield the article URLs contained in an index-page JSON response.

     Args:
         html: JSON text returned by the index request; may be ``None``
             when that request failed.

     Yields:
         Each non-empty ``article_url`` found in the entries of the
         top-level ``data`` list. Nothing is yielded when the text is
         missing, is not valid JSON, or has no usable ``data`` list.
     """
     try:
         payload = json.loads(html)
     except (TypeError, ValueError):
         # TypeError: html is None; ValueError covers JSONDecodeError.
         return
     if not isinstance(payload, dict):
         return
     entries = payload.get('data')
     if not isinstance(entries, list):
         # Covers both a missing key and an explicit "data": null.
         return
     for item in entries:
         if not isinstance(item, dict):
             continue
         article_url = item.get('article_url')
         if article_url:
             yield article_url

 def get_detail_page(url):
     """Fetch the HTML of one article detail page.

     Args:
         url: Address of the detail page.

     Returns:
         The response text when the server answers with status 200;
         ``None`` for an empty URL, any other status, or a failed request.
     """
     if not url:
         return None
     try:
         # The timeout keeps a stalled connection from hanging the worker.
         response = requests.get(url, timeout=10)
     except RequestException:
         return None
     if response.status_code == 200:
         return response.text
     return None

 def save_to_mongo(result):
     """Insert one parsed article record into the configured collection.

     Args:
         result: Document (dict) to store.

     Returns:
         ``True`` when the insert produced a document id, else ``False``.
     """
     # NOTE(review): MONG_TABLE must exist in config; the second script in
     # this file spells it MONGO_TABLE -- confirm which name config defines.
     # insert_one replaces Collection.insert, which is deprecated and was
     # removed in PyMongo 4.
     outcome = db[MONG_TABLE].insert_one(result)
     if outcome.inserted_id is not None:
         print('存储到MongoDB成功',result)
         return True
     return False

 def parse_detail_page(html, url):
     """Extract the title and gallery image URLs from a detail page.

     Args:
         html: HTML text of the detail page.
         url: Address the page was fetched from; copied into the result.

     Returns:
         A dict with keys ``title``, ``url`` and ``images`` built from the
         first ``var gallery = ...;`` assignment that parses as a JSON
         object, or ``None`` when the page has no such assignment.
     """
     soup = BeautifulSoup(html, 'lxml')
     title_tag = soup.title
     # A page without <title>, or with a title that has no single string,
     # gets an empty title instead of raising AttributeError.
     title = ''
     if title_tag is not None and title_tag.string:
         title = title_tag.string
     for raw in re.findall(r'var gallery = (.*?);', html, re.S):
         try:
             gallery = json.loads(raw)
         except ValueError:
             # Not valid JSON (e.g. the non-greedy match was cut at a ';'
             # inside a string); try the next match.
             continue
         if not isinstance(gallery, dict):
             continue
         sub_images = gallery.get("sub_images") or []
         images = [entry.get('url') for entry in sub_images]
         return {
             'title': title,
             'url': url,
             'images': images,
         }
     return None

 def download_image(result):
     """Download every image of one article into a directory named by title.

     The directory is created inside the current working directory. If an
     entry with that name already exists, the article is skipped.

     Args:
         result: Dict with an ``images`` list of URLs and a ``title``.

     Returns:
         ``None``.
     """
     image_list = result.get('images') or []
     image_title = result.get('title')
     print('正在下载：%s'%image_title)
     # No title means there is nothing to name the directory after.
     if not image_title or os.path.lexists(image_title):
         return None
     os.mkdir(image_title)
     for image in image_list:
         try:
             response = requests.get(image, timeout=10)
         except RequestException:
             # Best effort: one failed image should not abort the rest.
             continue
         if response.status_code != 200:
             continue
         filename = image.split('/')[-1] + '.jpg'
         # Write via an explicit path instead of os.chdir(): the process
         # working directory then stays unchanged even when a download
         # fails part-way through.
         with open(os.path.join(image_title, filename), 'wb') as f:
             f.write(response.content)
         print('正在下载:%s'%image)
     return None

 def main(offset):
     """Scrape one index page and download the images of each article on it.

     Args:
         offset: Offset of the index page to process.
     """
     index_html = get_index_page(offset, KEYWORDS)
     if not index_html:
         # get_index_page returns None on failure; passing that on would
         # make json.loads raise inside parse_index_page.
         return
     for url in parse_index_page(index_html):
         detail_html = get_detail_page(url)
         if not detail_html:
             continue
         result = parse_detail_page(detail_html, url)
         if result:
             # Persisting to MongoDB is currently disabled:
             # save_to_mongo(result)
             download_image(result)

 if __name__ == '__main__':
     # One offset per index page; offsets advance in steps of 20.
     groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
     pool = Pool()
     try:
         pool.map(main, groups)
     finally:
         # Release the worker processes even if a task raised.
         pool.close()
         pool.join()

 #对比老司机所写

 import json

 import os

 from urllib.parse import urlencode

 import pymongo

 import requests

 from bs4 import BeautifulSoup

 from requests.exceptions import ConnectionError

 import re

 from multiprocessing import Pool

 from hashlib import md5

 from json.decoder import JSONDecodeError

 from config import *

 # Module-level MongoDB handle; connect=False creates the client without
 # connecting immediately.
 # NOTE(review): presumably so that Pool workers connect on their own
 # after the fork -- confirm against the PyMongo fork-safety notes.
 client = pymongo.MongoClient(MONGO_URL, connect=False)

 db = client[MONGO_DB]

 def get_page_index(offset, keyword):
     """Fetch one page of Toutiao search results as raw response text.

     Args:
         offset: Value sent as the ``offset`` query parameter.
         keyword: Search keyword sent as the ``keyword`` query parameter.

     Returns:
         The response body text on status 200; ``None`` on any other
         status or when the request fails.
     """
     data = {
         'autoload': 'true',
         'count': 20,
         'cur_tab': 3,
         'format': 'json',
         'keyword': keyword,
         'offset': offset,
     }
     url = 'http://www.toutiao.com/search_content/' + '?' + urlencode(data)
     try:
         # The timeout keeps a stalled connection from hanging the worker.
         response = requests.get(url, timeout=10)
     except requests.exceptions.RequestException:
         # RequestException is the base of ConnectionError and also covers
         # timeouts, which the narrower handler let escape.
         print('Error occurred')
         return None
     if response.status_code == 200:
         return response.text
     return None

 def download_image(url):
     """Download one image and hand its bytes to ``save_image``.

     Args:
         url: Address of the image.

     Returns:
         ``None``; failures are skipped silently (best effort).
     """
     print('Downloading', url)
     if not url:
         # A missing URL would make requests raise MissingSchema.
         return None
     try:
         response = requests.get(url, timeout=10)
     except requests.exceptions.RequestException:
         # Covers connection errors, timeouts and malformed URLs.
         return None
     if response.status_code == 200:
         save_image(response.content)
     return None

 def save_image(content):
     """Store image bytes as ``<cwd>/<md5 of content>.jpg``.

     The path is printed first. Nothing is written if a file with that
     path already exists.

     Args:
         content: Raw image bytes.
     """
     digest = md5(content).hexdigest()
     file_path = os.getcwd() + '/' + digest + '.' + 'jpg'
     print(file_path)
     if os.path.exists(file_path):
         return
     with open(file_path, 'wb') as out:
         out.write(content)

 def parse_page_index(text):
     """Yield the article URLs contained in an index-page JSON response.

     Args:
         text: JSON text returned by the index request; may be ``None``
             when that request failed.

     Yields:
         Each non-empty ``article_url`` found in the entries of the
         top-level ``data`` list. Nothing is yielded when the text is
         missing, is not valid JSON, or has no usable ``data`` list.
     """
     try:
         data = json.loads(text)
     except (TypeError, ValueError):
         # TypeError: text is None; ValueError covers JSONDecodeError.
         return
     if not isinstance(data, dict):
         return
     entries = data.get('data')
     if not isinstance(entries, list):
         # Covers both a missing key and an explicit "data": null.
         return
     for item in entries:
         if not isinstance(item, dict):
             continue
         article_url = item.get('article_url')
         if article_url:
             yield article_url

 def get_page_detail(url):
     """Fetch the HTML of one article detail page.

     Args:
         url: Address of the detail page.

     Returns:
         The response text on status 200; ``None`` for an empty URL, any
         other status, or a failed request.
     """
     if not url:
         return None
     try:
         # The timeout keeps a stalled connection from hanging the worker.
         response = requests.get(url, timeout=10)
     except requests.exceptions.RequestException:
         # Base class of ConnectionError; also covers timeouts and
         # malformed URLs.
         print('Error occurred')
         return None
     if response.status_code == 200:
         return response.text
     return None

 def parse_page_detail(html, url):
     """Extract title and gallery images from a detail page and download them.

     Each image URL found is passed to ``download_image`` before the
     result is returned.

     Args:
         html: HTML text of the detail page; may be ``None`` when the
             fetch failed.
         url: Address the page was fetched from; copied into the result.

     Returns:
         A dict with keys ``title``, ``url`` and ``images``, or ``None``
         when the page is missing or has no parsable ``var gallery``
         object containing ``sub_images``.
     """
     if not html:
         # get_page_detail returns None on failure.
         return None
     soup = BeautifulSoup(html, 'lxml')
     title_nodes = soup.select('title')
     title = title_nodes[0].get_text() if title_nodes else ''
     match = re.search('var gallery = (.*?);', html, re.S)
     if not match:
         return None
     try:
         data = json.loads(match.group(1))
     except ValueError:
         # The captured text is not valid JSON.
         return None
     if not isinstance(data, dict) or 'sub_images' not in data:
         return None
     sub_images = data.get('sub_images') or []
     # Drop entries without a URL so download_image never receives None.
     images = [item.get('url') for item in sub_images if item.get('url')]
     for image in images:
         download_image(image)
     return {
         'title': title,
         'url': url,
         'images': images,
     }

 def save_to_mongo(result):
     """Insert one parsed article record into the configured collection.

     Args:
         result: Document (dict) to store.

     Returns:
         ``True`` when the insert produced a document id, else ``False``.
     """
     # insert_one replaces Collection.insert, which is deprecated and was
     # removed in PyMongo 4.
     outcome = db[MONGO_TABLE].insert_one(result)
     if outcome.inserted_id is not None:
         print('Successfully Saved to Mongo', result)
         return True
     return False

 def main(offset):
     """Scrape one index page and store every article found on it.

     Args:
         offset: Offset of the index page to process.
     """
     text = get_page_index(offset, KEYWORD)
     if not text:
         # get_page_index returns None on failure; json.loads(None) would
         # raise TypeError inside parse_page_index.
         return
     for url in parse_page_index(text):
         if not url:
             continue
         html = get_page_detail(url)
         if not html:
             # Skip pages that could not be fetched instead of handing
             # None to the HTML parser.
             continue
         result = parse_page_detail(html, url)
         if result:
             save_to_mongo(result)

 # Guarded so that importing this module -- which worker processes do
 # under the spawn start method -- does not start another pool.
 if __name__ == '__main__':
     # One offset per index page; offsets advance in steps of 20.
     groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
     pool = Pool()
     try:
         pool.map(main, groups)
     finally:
         # Release the worker processes even if a task raised.
         pool.close()
         pool.join()

分析ajax请求抓取今日头条关键字美图的更多相关文章

python学习(26)分析ajax请求抓取今日头条cosplay小姐姐图片
分析ajax请求格式,模拟发送http请求,从而获取网页代码,进而分析取出需要的数据和图片.这里分析ajax请求,获取cosplay美女图片. 登陆今日头条,点击搜索,输入cosplay 下面查看浏览 ...
爬虫（八）：分析Ajax请求抓取今日头条街拍美图
(1):分析网页分析ajax的请求网址,和需要的参数.通过不断向下拉动滚动条,发现请求的参数中offset一直在变化,所以每次请求通过offset来控制新的ajax请求. (2)上代码 a.通过aj ...
Python爬虫学习==>第十一章：分析Ajax请求-抓取今日头条信息
学习目的: 解决AJAX请求的爬虫,网页解析库的学习,MongoDB的简单应用正式步骤 Step1:流程分析抓取单页内容:利用requests请求目标站点,得到单个页面的html代码,返回结果: ...
通过分析Ajax请求抓取今日头条街拍图集
代码: import os import re import json import time from hashlib import md5 from multiprocessing import ...
python3爬虫-分析Ajax，抓取今日头条街拍美图
# coding=utf-8 from urllib.parse import urlencode import requests from requests.exceptions import Re ...
python爬虫---实现项目(二) 分析Ajax请求抓取数据
这次我们来继续深入爬虫数据,有些网页通过请求的html代码不能直接拿到数据,我们所需的数据是通过ajax渲染到页面上去的,这次我们来看看如何分析ajax 我们这次所使用的网络库还是上一节的Reques ...
python爬虫之分析Ajax请求抓取抓取今日头条街拍美图（七）
python爬虫之分析Ajax请求抓取抓取今日头条街拍美图一.分析网站 1.进入浏览器,搜索今日头条,在搜索栏搜索街拍,然后选择图集这一栏. 2.按F12打开开发者工具,刷新网页,这时网页回弹到综合 ...
分析Ajax来爬取今日头条街拍美图并保存到MongDB
前提:.需要安装MongDB 注:因今日投票网页发生变更,如下代码不保证能正常使用 #!/usr/bin/env python #-*- coding: utf-8 -*- import json i ...
15-分析Ajax请求并抓取今日头条街拍美图
流程框架: 抓取索引页内容:利用requests请求目标站点,得到索引网页HTML代码,返回结果. 抓取详情页内容:解析返回结果,得到详情页的链接,并进一步抓取详情页的信息. 下载图片与保存数据库:将 ...

随机推荐

Java中为什么long能自动转换成float类型
刷题时候看到一个float和long相互转换的问题,float向long转换的时候不会报错,一个4个字节一个8个字节,通过baidu找到了答案. 下面转载自http://blog.csdn.net/s ...
使用Intellij IDEA生成JavaDoc
以下是常用的注释标签,规范书写生成的文档中才能显示: @author 作者 @version 版本 @see 参考转向 @param 参数说明 @return 返回值说明 @exception 异常说 ...
centOS7安装nodejs(8.4.0)（详细步骤）
1.使用rpm查看是否安装gcc.make 若如下图有输出版本详细表示已安装,则无需再次安装,直接下一步(输入rpm -qa 包名称) 若没有安装则执行以下命令安装: yum install gcc ...
python中super()的一些用法
在看python高级编程这本书的时候,在讲到super的时候,产生了一些疑惑,super在python中的用法跟其他的语言有一些不一样的地方,在网上找了一些资料,发现基本上很少有文章能把我的疑惑讲明白 ...
【Python】文件目录比较工具filecmp和difflib
在一些运维场景中,常常需要比较两个环境中的应用目录结构(是否有文件/目录层面上的增删)以及比较两个环境中同名文件内容的不同(即文件层面上的改).Python自带了两个内建模块可以很好地完成这个工作,f ...
java操作数据库的通用的类
package cn.dao; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; ...
linux下各种解压方法
linux下各种格式的压缩包的压缩.解压方法.但是部分方法我没有用到,也就不全,希望大家帮我补充,我将随时修改完善,谢谢! .tar 解包:tar xvf FileName.tar 打包:t ...
Sublime Text3下使用Python，REPL的安装与快捷键设置方法
前提条件:连接外网 1.安装管理插件(CTRL+SHIFT+P),找到Package Control:install package一项,回车后继续选择SublimeREPL插件,进行安装: ...
Beta 第一天
一.今日任务重新熟悉整体项目对整个项目在未来的beta冲刺中进程有一个合理的规划由于我们送出的是一个负责前端的成员,引入的也是一个负责前端工作的女生,(女生做起美工比起男生更加得心应手吧)所以我 ...
Twisted 安全信道
1.安装python的SSL插件pyOpenSSL pip install pyopenssl 2.安装OpenSSL工具包 sudo apt-get install openssl sudo apt ...

分析ajax请求抓取今日头条关键字美图

分析ajax请求抓取今日头条关键字美图的更多相关文章

随机推荐

热门专题