Python-爬取妹子图(单线程和多线程版本)

一、参考文章

上述文章中的代码讲述得非常清楚，我的基本思路也是这样。本篇文章中的代码仅仅做了一些异常处理和日志显示方面的优化，写此文章主要是当做笔记，方便以后查阅，修改的地方如下：

1、异常处理下面在代码中会单独标红

2、多线程版使用了multiprocessing这个库（其Pool创建的实际上是进程而不是线程），需要在main函数开始调用freeze_support()，防止打包成exe之后，运行时创建进程失败

3、多线程版本加了一个命令行自定义线程个数功能

二、单线程版本

 #coding=utf-8

 import requests

 from bs4 import BeautifulSoup

 import os

 # Single-process crawler: walks every listing page of the site, and for each
 # album link found there downloads all of its images into
 # <cwd>/mzitu/<cleaned title>/.

 all_url = 'http://www.mzitu.com'

 # HTTP headers used for the listing pages and the album pages.
 Hostreferer = {
     'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
     'Referer':'http://www.mzitu.com'
 }

 # HTTP headers used for the image files themselves. According to the original
 # note, this Referer is what gets past the site's hotlink protection.
 Picreferer = {
     'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
     'Referer':'http://i.meizitu.net'
 }

 start_html = requests.get(all_url, headers = Hostreferer)

 # Root directory for all downloads. It is absolute, so nothing below depends
 # on the current working directory.
 path = os.getcwd() + '/mzitu/'

 # Find the number of listing pages: the second-to-last 'page-numbers' link.
 # NOTE(review): presumably the last such link is the "next page" button --
 # confirm against the live markup.
 soup = BeautifulSoup(start_html.text, "html.parser")
 page = soup.find_all('a', class_='page-numbers')
 max_page = page[-2].text

 same_url = 'http://www.mzitu.com/page/'

 # Listing pages are numbered from 1 (the multi-process version of this script
 # also starts at 1); the original loop started at 0 and requested /page/0.
 for n in range(1, int(max_page) + 1):
     ul = same_url + str(n)
     start_html = requests.get(ul, headers = Hostreferer)
     soup = BeautifulSoup(start_html.text, "html.parser")

     # find() returns None when the container is missing; skip the page
     # instead of crashing with AttributeError.
     postlist = soup.find('div', class_ = 'postlist')
     if postlist is None:
         print('第', n, '页解析失败，跳过')
         continue
     all_a = postlist.find_all('a', target = '_blank')

     for a in all_a:  # one <a> per album on this listing page
         title = a.get_text()
         if title == '':
             continue
         print("准备扒取：" + title)

         # Windows cannot create a directory whose name contains '?'.
         album_dir = path + title.strip().replace('?', '')
         already_existed = os.path.exists(album_dir)
         os.makedirs(album_dir, exist_ok=True)

         href = a['href']
         html = requests.get(href, headers = Hostreferer)
         mess = BeautifulSoup(html.text, "html.parser")
         # Number of images in the album, read from the 11th <span> on the
         # album page.
         # NOTE(review): this fixed index depends on the page layout -- verify.
         pic_max = mess.find_all('span')[10].text

         # A directory that existed before this run and already holds at least
         # pic_max files is treated as a finished album.
         if already_existed and len(os.listdir(album_dir)) >= int(pic_max):
             print('已经保存完毕，跳过')
             continue

         for num in range(1, int(pic_max) + 1):  # one sub-page per image
             pic = href+'/'+str(num)
             html = requests.get(pic, headers = Hostreferer)
             mess = BeautifulSoup(html.text, "html.parser")
             pic_url = mess.find('img', alt = title)
             # Skip sub-pages where the <img> is missing or has no 'src'
             # attribute; both would otherwise raise.
             if pic_url is None or 'src' not in pic_url.attrs:
                 continue
             print(pic_url['src'])

             html = requests.get(pic_url['src'],headers = Picreferer)
             file_name = pic_url['src'].split(r'/')[-1]
             # Write via an absolute path (no os.chdir) and let the context
             # manager close the file even if write() fails.
             with open(os.path.join(album_dir, file_name), 'wb') as f:
                 f.write(html.content)

         print('完成')

     print('第',n,'页完成')

三、多线程版本

 #coding=utf-8

 import requests

 from bs4 import BeautifulSoup

 import os

 from multiprocessing import Pool

 from multiprocessing import freeze_support

 import sys

 # Request headers sent with the listing-page and album-page requests.
 header = {
     'Referer': 'http://www.mzitu.com',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
 }

 # Alternative request headers whose Referer points at the image host.
 Picreferer = {
     'Referer': 'http://i.meizitu.net',
     'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
 }

 def find_MaxPage():
     """Return the listing-page count shown on the site's home page.

     Fetches the home page with the module-level ``header`` and returns the
     text of the second-to-last ``<a class="page-numbers">`` element. The
     value is returned as a string; callers convert it with ``int()``.
     """
     home_page = requests.get('http://www.mzitu.com', headers=header)
     parsed = BeautifulSoup(home_page.text, "html.parser")
     pager_links = parsed.find_all('a', class_='page-numbers')
     # NOTE(review): presumably the last pager link is the "next page" button,
     # which is why the second-to-last one is read -- confirm against the markup.
     return pager_links[-2].text

 def Download(href, title, path):
     """Download every image of one album into ``path`` + cleaned ``title``.

     Args:
         href: URL of the album page; image sub-pages are ``href/1`` .. ``href/N``.
         title: Album title. It is used as the ``alt`` value to locate the
             ``<img>`` tag and, with surrounding whitespace and '?' removed,
             as the directory name.
         path: Root download directory; it is concatenated directly with the
             cleaned title, so it is expected to end with a path separator.

     Returns:
         1 if the album directory already holds at least as many files as the
         album has images (nothing is downloaded), otherwise None.
     """
     html = requests.get(href, headers = header)
     soup = BeautifulSoup(html.text, 'html.parser')
     # Number of images in the album, read from the 11th <span> on the page.
     # NOTE(review): this fixed index depends on the page layout -- verify.
     pic_max = soup.find_all('span')[10].text

     album_dir = path + title.strip().replace('?', '')
     if os.path.exists(album_dir) and len(os.listdir(album_dir)) >= int(pic_max):
         print('妹子已待命，继续准备下一个妹子' + title)
         return 1

     print(f"发现妹子资源{pic_max}个，准备中：" + title)
     # exist_ok: a directory left behind by an interrupted run must not abort
     # the retry with FileExistsError.
     os.makedirs(album_dir, exist_ok=True)

     for num in range(1, int(pic_max) + 1):
         pic = href + '/' + str(num)
         html = requests.get(pic, headers = header)
         mess = BeautifulSoup(html.text, "html.parser")
         pic_url = mess.find('img', alt = title)
         # Skip sub-pages where the <img> is missing or has no 'src'
         # attribute; both would otherwise raise.
         if pic_url is None or 'src' not in pic_url.attrs:
             continue
         print(f"{title}：{pic_url['src']}")

         # NOTE(review): the single-threaded script fetches images with the
         # Picreferer headers; this version uses `header` -- confirm which one
         # the image host requires.
         html = requests.get(pic_url['src'], headers = header)
         file_name = pic_url['src'].split(r'/')[-1]
         # Write via an absolute path instead of os.chdir(), and let the
         # context manager close the file even if write() fails.
         with open(os.path.join(album_dir, file_name), 'wb') as f:
             f.write(html.content)

     print('妹子已就绪，客官请慢用：' + title)

 if __name__ == '__main__':
     # Required so that a frozen Windows executable can start worker processes.
     freeze_support()

     # Number of worker processes in the pool: 1 by default, or the first
     # command-line argument when one is given.
     count = 1
     if len(sys.argv) >= 2:
         count = int(sys.argv[1])
     pool = Pool(count)
     # The original f-string contained a stray '$' that was printed literally.
     print(f'初始化下载线程个数{count}')

     # Root directory for all downloads (absolute, ends with a separator).
     path = os.getcwd() + '/mzitu_mutil/'

     max_page = find_MaxPage()
     print(f'捕获{max_page}页妹子，请耐心等待下载完成')

     same_url = 'http://www.mzitu.com/page/'
     for n in range(1, int(max_page) + 1):
         each_url = same_url + str(n)
         # Fetch one listing page and collect the album links on it.
         start_html = requests.get(each_url, headers = header)
         soup = BeautifulSoup(start_html.text, "html.parser")

         # find() returns None when the container is missing; skip the page
         # instead of crashing with AttributeError.
         postlist = soup.find('div', class_ = 'postlist')
         if postlist is None:
             print(f'第{n}页解析失败，跳过')
             continue

         for a in postlist.find_all('a', target = '_blank'):
             title = a.get_text()
             if title == '':
                 continue
             href = a['href']
             # Without error_callback, an exception raised inside Download is
             # stored in the unused AsyncResult and never reported. The default
             # argument binds the current title to this callback.
             pool.apply_async(
                 Download,
                 args = (href, title, path),
                 error_callback = lambda exc, title=title: print(f'下载失败：{title}（{exc!r}）'),
             )

     # No more tasks will be submitted; wait for the queued downloads to finish.
     pool.close()
     pool.join()
     print('所有妹子已就绪，客官请慢用')

四、资源下载

　　资源下载地址：Python爬取妹子图-单线程和多线程版本

Python-爬取妹子图(单线程和多线程版本)的更多相关文章

python爬取妹子图全站全部图片-可自行添加-线程-进程爬取，图片去重
from bs4 import BeautifulSoupimport sys,os,requests,pymongo,timefrom lxml import etreedef get_fenlei ...
Python 爬取妹子图(技术是无罪的)
... #!/usr/bin/env python import urllib.request from bs4 import BeautifulSoup def crawl(url): header ...
Python 爬取妹子图(技术是无罪的)
... import requests from bs4 import BeautifulSoup import os import sys class mzitu(): def html(self, ...
Python协程爬取妹子图(内有福利，你懂得~)
项目说明: 1.项目介绍本项目使用Python提供的协程+scrapy中的选择器的使用(相当好用)实现爬取妹子图的(福利图)图片,这个学会了,某榴什么的.pow(2, 10)是吧! 2.用到的知 ...
Python3爬虫系列：理论+实验+爬取妹子图实战
Github: https://github.com/wangy8961/python3-concurrency-pics-02 ,欢迎star 爬虫系列: (1) 理论 Python3爬虫系列01 ...
python 爬取妹子
爬取妹子图片网址:https://www.mzitu.com/jiepai/ 2019-06-13 环境WIN10 1903 python 3.7.3 个人习惯先在IDLE中进行调试 import ...
Python网络爬虫 | Scrapy爬取妹子图网站全站照片
根据现有的知识,写了一个下载妹子图(meizitu.com)Scrapy脚本,把全站两万多张照片下载到了本地. 网站的分析网页的网址分析打开网站,发现网页的网址都是以 http://www.mei ...
python爬取斗图网中的 “最新套图”和“最新表情”
1.分析斗图网斗图网地址:http://www.doutula.com 网站的顶部有这两个部分: 先分析“最新套图” 发现地址栏变成了这个链接,我们在点击第二页可见,每一页的地址栏只有后面的pag ...
Python爬取斗图表情，让你成为斗图大佬
话不多说,上结果(只爬了10页内容) 上代码:(可直接运行) 用到Xpath #encoding:utf-8 # __author__ = 'donghao' # __time__ = 2018/ ...

随机推荐

Apriori算法思想和其python实现
第十一章使用Apriori算法进行关联分析一．导语 "啤酒和尿布"问题属于经典的关联分析.在零售业,医药业等我们经常需要是要关联分析.我们之所以要使用关联分析,其目的是为了从大 ...
使用WSL连接Docker for Windows
在Windows下安装Docker for Windows Cotana搜索功能,打开Windows的Hype-v功能(注:会影响Virtualbox和Vmware的使用)并重启电脑. 从Docker ...
自定义完美的ViewPager 真正无限循环的轮播图
网上80%的思路关于Android轮播图无限循环都是不正确的,不是真正意义上的无限循环, 其思路大多是将ViewPager的getCount方法返回值设置为Integer.MAX_VALUE, 然后呢 ...
Eclipse开发前，常用设置
设置工作空间的项目编码, 防止出现乱码 Window - Preferences - General - Workspace 将"Text file encoding" ...
Reflection的getCallerClass静态方法
Reflection的getCallerClass的使用博客分类: java基础 Reflection的getCallerClass的使用:可以得到调用者的类.这个方法是很好用的. 0 和小于0 ...
Myeclipse10.7.1 导出war包报错
myeclipse10.7.1 导出war问题解决办法myeclipse10破解后,导出war包时报"SECURITY ALERT: INTEGERITY CHECK ERROR" ...
Jmeter4.0分布式测试时启动Jmeter.server时报错
最近又开始研究Jmeter,将新版本4.0下载下来体验,准备远程分布式测试,又出现一些问题,废话不多说,直入主题把! Windows 系统启动Jmeter 4.0的JmeterServer.ba ...
2101: Bake Off
Description Davy decided to start a weekend market stall where he sells his famous cakes. For the fi ...
玩转Spring MVC(五)----在spring中整合log4j
在前边的基础上,本文主要总结一下如何在spring 中配置log4j,在本文末尾会给出完整项目的链接. 首先是web.xml中要新添加的代码:  &l ...
ftp研究
工作中经常用到ftp,最近闲下心来,仔细研究下ftp这个协议. FTP(文件传输协议)工作原理目前在网络上,如果你想把文件和其他人共享.最方便的办法莫过于将文件放FTP服务器上,然后其他人通过FTP ...