import requests
from fake_useragent import UserAgent
from requests.exceptions import Timeout
from urllib.parse import quote, unquote
import re, json, os, hashlib
from lxml import etree
import time
# Originally intended to download images with multiprocessing and a queue;
# that was never implemented, so these names are currently unused.
from multiprocessing import Process, Queue, Pool

userAgent = UserAgent()

# Default headers for requests against the main tuchong.com host.
headers = {
    "User-Agent": userAgent.random,
    "Host": "tuchong.com",
    "Referer": "https://tuchong.com/explore/"
}
baseUrl = "https://tuchong.com/rest/tag-categories/"
baseTagUrl = "https://tuchong.com/rest/tags/"
tagReferer = "https://tuchong.com/tags/"

# Per-request timeout in seconds, passed to every requests call.
timeout = 5
s = requests.Session()

# Directory the images are written to; download() creates it on demand.
BASE_PATH = r"D:\tuchong"

# Tag names collected by getAllTag(), keyed by category.
dic = {
    "subject": [],
    "style": [],
    "equipment": [],
    "location": [],
}

# Category key -> label used in progress messages.
categoriesDict = {
    "subject": "题材",
    "style": "风格",
    "equipment": "器材",
    "location": "地区",
}


def getCategoryPage(url, category, page=1):
    """Fetch one page of the tag listing for a category.

    Args:
        url: Base URL of the tag-categories endpoint.
        category: Category key appended to ``url``.
        page: 1-based page number.

    Returns:
        The response (with a ``category`` attribute attached) when the
        status code is 200, otherwise ``None``. Network errors are printed
        and also yield ``None``.
    """
    params = {
        "page": page,
        "count": 20
    }
    try:
        response = s.get(url=url + category, headers=headers,
                         timeout=timeout, params=params)
    except requests.exceptions.RequestException as e:
        # Covers timeouts as well as connection failures.
        print(e)
        return None
    if response.status_code != 200:
        return None
    response.category = category
    return response


def getTagNameUrl(response):
    """Extract the tag names from a category-page response.

    Returns:
        A list of ``tag_name`` values, or ``None`` when ``response`` is
        falsy. A payload without ``data``/``tag_list`` gives an empty list.
    """
    if not response:
        return None
    data = response.json().get("data") or {}
    tag_list = data.get("tag_list") or []
    return [tag.get("tag_name") for tag in tag_list]


def getNextPageUrl(response):
    """Yield the page numbers that follow page 1 (2..pages inclusive).

    Yields nothing when ``response`` is falsy or the payload reports no
    page count.
    """
    if not response:
        return
    data = response.json().get("data") or {}
    pages = int(data.get("pages") or 0)
    for page in range(2, pages + 1):
        yield page


def _collectTagPage(category, page):
    """Fetch one category page, store its tag names in ``dic``, then pause."""
    print("获取 -{}- 第 <{}> 页tagName信息.........".format(categoriesDict.get(category), page))
    response = getCategoryPage(url=baseUrl, category=category, page=page)
    dic[category].extend(getTagNameUrl(response) or [])
    time.sleep(1)
    return response


def getAllTag():
    """Fill ``dic`` with every tag name of every category."""
    try:
        # Visit the explore page first so the session picks up any cookies.
        s.get(url="https://tuchong.com/explore/", headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Best-effort warm-up: keep going even if it fails.
        print(e)
    for category in categoriesDict:
        first_response = _collectTagPage(category, 1)
        # The page count is read from the first page's payload.
        for page in getNextPageUrl(first_response):
            _collectTagPage(category, page)


def getTagPage(url, tag, page):
    """Fetch one page of posts for a tag.

    Args:
        url: Base URL of the tags endpoint.
        tag: Tag name; it is URL-quoted before use.
        page: 1-based page number.

    Returns:
        The response when the status code is 200, otherwise ``None``.
        Network errors are printed and also yield ``None``.
    """
    tag = quote(tag)
    params = {
        "page": page,
        "count": 20,
        "order": "weekly"
    }
    # Use a per-request copy so the tag-specific Referer does not leak into
    # the shared module-level headers.
    request_headers = dict(headers, Referer=tagReferer + tag + "/")
    try:
        response = s.get(url=url + tag + "/posts", params=params,
                         headers=request_headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if response.status_code == 200:
        return response
    return None


def getImagesInfo(response):
    """Yield ``[title, post_url]`` for every post in a tag-page response.

    Yields nothing when ``response`` is falsy, when the payload's
    ``result`` is ``"INVALID"``, or when it carries no ``postList``.
    """
    print('---')
    if not response:
        return
    payload = response.json()
    if payload.get("result") == "INVALID":
        print("数据取完了")
        return
    for post in payload.get("postList") or []:
        # A post may come without a title; treat that as an empty string.
        title = (post.get("title") or "").strip()
        yield [title, post.get("url")]


def get_md5(img_url):
    """Return the hex MD5 digest of ``img_url`` (used as the file name)."""
    return hashlib.md5(img_url.encode("utf-8")).hexdigest()


def download(imgsUrl):
    """Download every image URL in ``imgsUrl`` into ``BASE_PATH``.

    Files are named ``<md5 of url>.jpg``. URLs that fail to download or do
    not answer with status 200 are skipped.
    """
    if not imgsUrl:
        return
    os.makedirs(BASE_PATH, exist_ok=True)
    for img_url in imgsUrl:
        print("正在下载{}...".format(img_url))
        try:
            response = requests.get(url=img_url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        if response.status_code != 200:
            # Do not save an error page as a .jpg file.
            continue
        with open(os.path.join(BASE_PATH, get_md5(img_url) + ".jpg"), "wb") as f:
            f.write(response.content)


def gogo(tagname):
    """Crawl every page of ``tagname`` and download the images of each post.

    Stops at the first page that cannot be fetched or contains no posts.
    """
    page = 1
    while True:
        response = getTagPage(url=baseTagUrl, tag=tagname, page=page)
        print("开始爬取 {} 第 {} 页...".format(tagname, page))
        if not response:
            return
        post_count = 0
        for info in getImagesInfo(response):
            post_count += 1
            download(putImageUrl(info))
        if post_count == 0:
            # No posts on this page: the tag is exhausted. Without this
            # check the loop would keep requesting further pages forever.
            return
        page += 1
        time.sleep(5)


def putImageUrl(img_url_title_list):
    """Return the image URLs found on a post page.

    Args:
        img_url_title_list: ``[title, post_url]`` as yielded by
            ``getImagesInfo``.

    Returns:
        A list of ``src`` values of the ``<img>`` elements directly inside
        ``<article class="post-content">``, or ``None`` when there is no
        URL, the request fails, or the page cannot be parsed.
    """
    if not img_url_title_list:
        return None
    post_url = img_url_title_list[1]
    if not post_url:
        return None
    # Post pages may be served from a per-user subdomain, so drop the fixed
    # "Host" entry and let requests derive it from the URL.
    page_headers = {k: v for k, v in headers.items() if k != "Host"}
    try:
        response = s.get(url=post_url, headers=page_headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    html = etree.HTML(response.text)
    if html is None:
        return None
    return html.xpath("//article[@class='post-content']/img/@src")


def downloadImage():
    """Run ``gogo`` for every tag name collected in ``dic``."""
    for tagname_list in dic.values():
        for tagname in tagname_list:
            gogo(tagname)


def run():
    """Collect all tag names, then crawl and download each tag's images."""
    getAllTag()
    print("所有tag信息获取完毕.........")
    print("开始获取每个tag的内容.........")
    downloadImage()


if __name__ == '__main__':
    run()

python3爬虫-通过requests爬取图虫网的更多相关文章

  1. 爬取图虫网 示例网址 https://wangxu.tuchong.com/23892889/

    #coding=gbk import requests from fake_useragent import UserAgent from lxml import etree import urlli ...

  2. python3爬虫-使用requests爬取起点小说

    import requests from lxml import etree from urllib import parse import os, time def get_page_html(ur ...

  3. python3爬虫-通过requests爬取西刺代理

    import requests from fake_useragent import UserAgent from lxml import etree from urllib.parse import ...

  4. Python3爬虫使用requests爬取lol英雄皮肤

    本人博客:https://xiaoxiablogs.top 此次爬取lol英雄皮肤一共有两个版本,分别是多线程版本和非多线程版本. 多线程版本 # !/usr/bin/env python # -*- ...

  5. 爬虫 Scrapy框架 爬取图虫图片并下载

    items.py,根据需求确定自己的数据要求 # -*- coding: utf-8 -*- # Define here the models for your scraped items # # S ...

  6. Python爬虫训练:爬取酷燃网视频数据

    前言 本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理 项目目标 爬取酷燃网视频数据 https://krcom.cn/ 环境 Py ...

  7. python爬虫基础应用----爬取校花网视频

    一.爬虫简单介绍 爬虫是什么? 爬虫是首先使用模拟浏览器访问网站获取数据,然后通过解析过滤获得有价值的信息,最后保存到自己库中的程序. 爬虫程序包括哪些模块? python中的爬虫程序主要包括,re ...

  8. python3 爬虫教学之爬取链家二手房(最下面源码) //以更新源码

    前言 作为一只小白,刚进入Python爬虫领域,今天尝试一下爬取链家的二手房,之前已经爬取了房天下的了,看看链家有什么不同,马上开始. 一.分析观察爬取网站结构 这里以广州链家二手房为例:http:/ ...

  9. 【Python3爬虫】我爬取了七万条弹幕,看看RNG和SKT打得怎么样

    一.写在前面 直播行业已经火热几年了,几个大平台也有了各自独特的“弹幕文化”,不过现在很多平台直播比赛时的弹幕都基本没法看的,主要是因为网络上的喷子还是挺多的,尤其是在观看比赛的时候,很多弹幕不是喷选 ...

随机推荐

  1. Node.js 的安装

    Node.js 是一个基于 Chrome V8 引擎的 JavaScript 的运行环境,简单的说就是运行在服务端的 JavaScript.所以学起来还是比较容易接受的. Node.js 使用事件驱动 ...

  2. Install Python on Mac

    1. 从官网下载最新版Python 3.X 后安装:由于Mac OS X El Capitan中默认已经集成了 Python 2.7,因此需要在Terminal中输入 Python3 来检测是否安装成 ...

  3. git 打包报错:Maven Build时提示:Failed to execute goal org.apache.maven.plugins:maven-surefire-plugin:2.12.4:test

    1.使用git 升级 服务命令 mvn  deploy -e 之后报错: Failed to execute goal org.apache.maven.plugins:maven-surefire- ...

  4. SQL 语句及关键字的用法

    一.SELECT select [ALL|DISTINCT] select_list [into new table] FROM table_source [where serch_conditaio ...

  5. 最近选购MP3而有感便携追求音质的一些心得

    之前的创新小石头MP3的耳机接口松动了.考虑到它已经服役了4年了.所以我准备重新买一个.而小石头出色的外放,我决定让给宝宝当玩具. 选购心得MP3的时候,原来的主导思想,是在低价位的里面考虑一台国际品 ...

  6. [翻译] JSAnimatedImagesView

    JSAnimatedImagesView 本人测试的效果: Description:描述 Easy to use UIView subclass to quickly add a cool anima ...

  7. 有用的JS函数

    1. QueryString function queryString(key) { var re = new RegExp("[?&]" + key + "=( ...

  8. August 17th 2017 Week 33rd Thursday

    Fate is responsible for shuffling, but the game of cards is our own! 命运负责洗牌,但是玩牌的是我们自己! Today, I upd ...

  9. php独特的语法

    今天写一个程序的时候遇到一个很有意思的问题,这个和php独特的语法有关,首先我们看一下代码是怎么写的. <?php $db = mysql_connect('localhost','root', ...

  10. ZT Android Debuggerd的分析及使用方法

    Android Debuggerd的分析及使用方法 分类: 移动开发 android framework 2012-12-28 12:00 983人阅读 评论(0) 收藏 举报 目录(?)[+] An ...