Python3网络爬虫

# 最近在实验楼学习了爬取妹子图，发现运行的时候不是很流畅，有些图片下载不下来。

 # coding: utf-8

 import re

 import threading

 from urllib.request import urlopen

 from urllib.error import HTTPError

 from bs4 import BeautifulSoup

 import meizi_series_nextpage

 def loadurl(url):
     """Fetch ``url`` and return the raw response body.

     :param url: address to load.
     :returns: the body as ``bytes`` on success, or the empty string ``''``
         on any failure. ``''`` is the sentinel the retry loop in ``meizi``
         compares against.
     """
     try:
         # A timeout keeps a stalled server from blocking the crawler, and the
         # ``with`` block closes the connection once the body has been read.
         with urlopen(url, timeout=5) as conn:
             return conn.read()
     except HTTPError as e:
         # Report the failure with the '' sentinel; handing the exception
         # object back would make the caller treat it as page content.
         print("http error in urlopen() %s " % e)
         return ''
     except Exception as e:
         print("unkown exception in conn.read() %s "%e)
         return ''

 def meizi(url,path):
     """Crawl the front page and every tag page linked from it.

     Loads ``url``, collects the links found inside ``<div class="tags">``
     elements, runs ``meizi_series_nextpage.nextpage`` on ``url`` itself and
     then on each distinct tag link in its own thread, waiting for all
     threads to finish.

     :param url: address of the front page.
     :param path: directory prefix passed on to ``nextpage``.
     :returns: ``None``. Returns early if the front page cannot be loaded.
     """
     print('start open meiziwang')

     # Bounded retry: a permanently unreachable page must not spin forever.
     max_attempts = 5
     html = ''
     for _ in range(max_attempts):
         html = loadurl(url)
         # Only a non-'' bytes/str body counts as success; anything else
         # (the '' sentinel or an error object) is treated as a failed load.
         if isinstance(html, (bytes, str)) and html != '':
             break
         print('load', url,'error')
     else:
         print('load', url, 'failed after', max_attempts, 'attempts')
         return

     # Name the parser explicitly so the result does not depend on which
     # optional parsers happen to be installed.
     mnvtp = BeautifulSoup(html, 'html.parser')
     taglists = mnvtp.findAll("div",{"class":"tags"})
     # De-duplicate once and reuse the result for printing and crawling.
     tag_urls = list(set(re.findall('<a.*?href="(.*?)".*?>','%s'%taglists)))
     print(tag_urls)
     print(len(tag_urls))
     print('open meiziwang over')

     # Process the front page itself before fanning out to the tag pages.
     meizi_series_nextpage.nextpage(url,path)

     # One worker thread per tag link; start them all, then wait for all.
     threads = [
         threading.Thread(target=meizi_series_nextpage.nextpage, args=(tag_url, path))
         for tag_url in tag_urls
     ]
     for t in threads:
         t.start()
     for t in threads:
         t.join()

 if __name__ == '__main__':
     # Script entry point: crawl the site, then report that the run is over.
     start_url = 'http://www.meizitu.com'
     save_root = 'D:\\MeiZi\\'
     meizi(start_url, save_root)
     print('Spider Stop')

# coding: utf-8

import re

from urllib.request import urlopen

from urllib.error import HTTPError

from bs4 import BeautifulSoup

import meizi_series_getpage

def loadurl(url):
    """Fetch ``url`` with a 5 second timeout and return the response body.

    :param url: address to load.
    :returns: the body as ``bytes`` on success. On any failure the error is
        printed and the empty string ``''`` is returned, which is the value
        ``nextpage`` compares against to detect a failed load.
    """
    try:
        # ``with`` closes the connection as soon as the body has been read.
        with urlopen(url, timeout=5) as conn:
            return conn.read()
    except HTTPError as e:
        print(e)
    except Exception as e:
        print(e)
    # Explicit failure sentinel: falling off the end would return None,
    # which an ``html == ''`` check does not recognise as a failure.
    return ''

def nextpage(url,path):
    """Walk every pagination page of one listing and process each of them.

    Loads ``url``, reads the links inside ``<div id="wp_page_numbers">``,
    turns them into full URLs and passes each one, in sorted order, to
    ``meizi_series_getpage.tag_series`` together with a sub-directory of
    ``path`` named after the last URL segment (without its extension).

    :param url: address of the first listing page.
    :param path: parent directory for this listing's output.
    :returns: ``None``. Returns early if the page cannot be loaded.
    """
    # Last segment of the URL; also the fallback "page link" when the page
    # has no pagination block.
    url_tail = url.split("/")[-1]
    # Everything before "/a/" with "/a/" re-appended: the prefix that the
    # pagination links are joined onto.
    url_head = url.split("/a/")[0] + "/a/"
    # Output sub-directory for this listing.
    path = path+"\\"+url_tail.split(".",1)[0]

    # Bounded retry; both None and '' are treated as a failed load so the
    # check works whichever failure value loadurl reports.
    max_attempts = 5
    html = None
    for _ in range(max_attempts):
        html = loadurl(url)
        if html is not None and html != '':
            break
        print('load', url,'error')
    else:
        print('load', url, 'failed after', max_attempts, 'attempts')
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    mnvtp = BeautifulSoup(html, 'html.parser')
    pager = mnvtp.findAll("div",{"id":"wp_page_numbers"})
    page_links = sorted(set(re.findall('<a.*?href="(.*?)".*?>','%s'%pager)))
    if not page_links:
        # No pagination block: the listing consists of this single page.
        page_links = [url_tail]

    print("正在获取首页所有子标签Url:%s"%url)
    completeurl = sorted(url_head + link for link in page_links)
    for page_url in completeurl:
        print("正在获取子标签下所有套图url路径")
        meizi_series_getpage.tag_series(page_url,path)

# coding: utf-8

import time

from urllib.request import urlopen

from urllib.request import Request

from urllib.error import HTTPError

from urllib.request import urlretrieve

import os

import re

from bs4 import BeautifulSoup

import urllib

from urllib import parse

def picurl(url,path):
    """Download every image of one picture set into ``path``.

    Creates ``path`` when it does not exist, loads the page at ``url``,
    takes the first ``<p>`` inside the ``picture`` div (falling back to the
    ``postContent`` div) and hands all ``<img src>`` values found there to
    ``pic_list``.

    :param url: address of the picture-set page.
    :param path: directory the images are saved into.
    :returns: ``False`` when the page cannot be loaded or no image container
        matches; ``None`` after the images have been handed to ``pic_list``.
    """
    if os.path.exists(path):
        print(path,'目录已存在')
    else:
        print("正在创建目录:%s"%path)
        # exist_ok guards against the directory appearing between the check
        # above and this call (e.g. when several crawls run concurrently).
        os.makedirs(path, exist_ok=True)

    # Bounded retry: a permanently unreachable page must not spin forever.
    max_attempts = 5
    html = ''
    for _ in range(max_attempts):
        html = loadurl(url)
        if html != '':
            break
    else:
        print('load', url, 'error')
        return False

    # Decode before matching so the patterns run on the page text itself
    # rather than on the ``b'...'`` repr that "%s" formatting of bytes yields.
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    rePicContent1 = '<div.*?id="picture.*?>.*?<p>(.*?)</p>'
    rePicContent2 = '<div.*?class="postContent.*?>.*?<p>(.*?)</p>'
    rePicList = '<img.*?src="(.*?)".*?>'

    # re.S lets "." match newlines, so a container split over several lines
    # of HTML is still matched as one unit.
    picContent = re.findall(rePicContent1, html, re.S)
    if not picContent:
        picContent = re.findall(rePicContent2, html, re.S)
    if not picContent:
        print('无法匹配到对应的图片url')
        return False

    picList = re.findall(rePicList, picContent[0], re.S)
    pic_list(picList,path)

def pic_list(picList,path):
    """Save each image URL in ``picList`` into ``path``, in list order.

    A thin loop wrapper: every URL is announced on stdout and then passed
    to ``save_pic``.
    """
    for image_url in picList:
        print("获取图片地址：%s"%image_url)
        save_pic(image_url,path)

def save_pic(url,path):
    """Download the image at ``url`` into directory ``path``.

    The file name is the last segment of the URL's path component. A file
    that already exists is left untouched; otherwise up to three download
    attempts are made through ``download``.

    :param url: address of the image.
    :param path: directory the image is stored in.
    :returns: ``True`` if the file already exists or was downloaded,
        ``False`` if no file name can be derived from ``url`` or every
        attempt failed.
    """
    # Take the name from the URL path so query strings are ignored and
    # extensions other than .jpg no longer raise IndexError.
    name = parse.urlsplit(url).path.rsplit('/', 1)[-1]
    if not name:
        print(url + " ：Failed to download")
        return False

    # os.path.join picks the right separator for the running platform.
    filename = os.path.join(path, name)
    print(filename + ':start')  # progress message on the console

    if os.path.exists(filename):
        print(filename,'已存在,跳过')
        return True

    # Number of download attempts before giving up.
    tryTimes = 3
    for _ in range(tryTimes):
        if download(url,filename):
            # Report success from inside the loop so that a download that
            # succeeds on the final attempt is not reported as a failure.
            print(filename + ": over")
            return True

    print(url + " ：Failed to download")
    return False

def download(url,filename):
    """Fetch ``url`` and write the response body to ``filename``.

    The request carries a browser-like User-Agent and is limited to a
    5 second timeout, so a stalled transfer counts as a failure instead of
    blocking the caller.

    :param url: address of the resource to fetch.
    :param filename: path of the file to write (opened in binary mode).
    :returns: ``True`` when the file was written, ``False`` on any error
        (the error is printed).
    """
    # No explicit Host header: urllib fills it in from the URL, so the
    # request stays valid for images served from any host.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    }
    try:
        req = Request(url,headers=headers)
        # Read the whole body first so a failed transfer never creates a file.
        with urlopen(req, timeout=5) as resp:
            body = resp.read()
        # ``with`` closes the file even if the write itself fails.
        with open(filename,'wb') as f:
            f.write(body)
        return True
    except Exception as e:
        # Covers HTTPError as well as timeouts, bad URLs and I/O errors.
        print(e)
        return False

def loadurl(url):
    """Fetch ``url`` with a 5 second timeout and return the response body.

    :param url: address to load.
    :returns: the body as ``bytes`` on success, or the empty string ``''``
        on any failure. An ``HTTPError`` is swallowed silently; any other
        exception prints a short notice first.
    """
    try:
        # ``with`` closes the connection once the body has been read, so
        # repeated calls do not accumulate open sockets.
        with urlopen(url,timeout=5) as conn:
            return conn.read()
    except HTTPError:
        return ''
    except Exception:
        print("unkown exception in conn.read()")
        return ''

有时间再来解释代码含义。第一段代码是主函数，其余两段代码按照导入的模块名分别创建对应的 .py 文件即可。

Python3网络爬虫的更多相关文章

Python3 网络爬虫（请求库的安装）
Python3 网络爬虫(请求库的安装) 爬虫可以简单分为几步:抓取页面,分析页面和存储数据在页面爬取的过程中我们需要模拟浏览器向服务器发送请求,所以需要用到一些python库来实现HTTP的请求操 ...
崔庆才Python3网络爬虫开发实战电子版书籍分享
资料下载地址: 链接:https://pan.baidu.com/s/1WV-_XHZvYIedsC1GJ1hOtw 提取码:4o94 <崔庆才Python3网络爬虫开发实战>高清中文版P ...
《Python3 网络爬虫开发实战》开发环境配置过程中踩过的坑
<Python3 网络爬虫开发实战>学习资料:https://www.cnblogs.com/waiwai14/p/11698175.html 如何从墙内下载Android Studio: ...
《Python3 网络爬虫开发实战》学习资料
<Python3 网络爬虫开发实战> 学习资料百度网盘:https://pan.baidu.com/s/1PisddjC9e60TXlCFMgVjrQ
Python3网络爬虫开发实战PDF高清完整版免费下载|百度云盘
百度云盘:Python3网络爬虫开发实战高清完整版免费下载提取码:d03u 内容简介本书介绍了如何利用Python 3开发网络爬虫,书中首先介绍了环境配置和基础知识,然后讨论了urllib.req ...
转：【Python3网络爬虫开发实战】 requests基本用法
1. 准备工作在开始之前,请确保已经正确安装好了requests库.如果没有安装,可以参考1.2.1节安装. 2. 实例引入 urllib库中的urlopen()方法实际上是以GET方式请求网页,而 ...
Python3网络爬虫(四)：使用User Agent和代理IP隐藏身份《转》
https://blog.csdn.net/c406495762/article/details/60137956 运行平台:Windows Python版本:Python3.x IDE:Sublim ...
Python3网络爬虫（1）：利用urllib进行简单的网页抓取
1.开发环境 pycharm2017.3.3 python3.5 2.网络爬虫的定义网络爬虫,也叫网络蜘蛛(web spider),如果把互联网比喻成一个蜘蛛网,spider就是一只在网上爬来爬去的 ...
python3网络爬虫系统学习：第一讲基本库urllib
在python3中爬虫常用基本库为urllib以及requests 本文主要描述urllib的相关内容 urllib包含四个模块:requests——模拟发送请求 error——异常处理模块 pars ...
《Python3网络爬虫开发实战》
推荐:★ ★ ★ ★ ★ 第1章开发环境配置第2章网页基础知识第3章网络爬虫基础第4章基本库的使用第5章解析库的使用第6章数据存储第7章 Ajax数据爬取第8章动态渲染页面 ...

随机推荐

第九章 Criteria查询及注解
第九章 Criteria查询及注解9.1 使用Criteria查询数据 9.1.1 条件查询 Criteria查询步骤: 1)使用session接口的cr ...
HTML、CSS、JS 样式
把一个数组(一维或二维等)的内容转化为对应的字符串.相当于把print_r($array)显示出来的内容赋值给一个变量.$data= array('hello',',','world','!'); $ ...
JS作用域就这么几句话
JavaScript的作用域一直以来是前端开发中比较难以理解的知识点,对于JavaScript的作用域主要记住几句话,走遍天下都不怕... 一.“JavaScript中无块级作用域” 在Java或C# ...
[Day03] 循环语句、list相关练习题
用户输入两个数,求平均值. 让用户一直输入数字,如果输入的是'0',终止程序打印所有数字之和. 让用户一直输入数字(只输入数字),如果没输入任何值,终止程序打印所有输入数字的平均值. 求出这个list ...
hdu4417 Super Mario
Problem Description Mario is world-famous plumber. His “burly” figure and amazing jumping ability re ...
express4.x的使用
①.安装 npm install -g express ②.创建应用 express [目录] 会在目录下生成 node_modules, 存放所有的项目依赖库.(每个项目管理自己的依赖,与Ma ...
Openstack Swift 原理、架构与 API 介绍
OpenStack Swift 开源项目提供了弹性可伸缩.高可用的分布式对象存储服务,适合存储大规模非结构化数据.本文将深入介绍 Swift 的基本设计原理.对称式的系统架构和 RESTful API ...
.Net程序员学用Oracle系列(23)：视图理论、物化视图
1.视图理论 1.1.视图的存储 1.2.视图的作用 1.3.视图的工作机制 1.4.视图的依赖性 1.5.可更新的连接视图 1.6.内联视图 2.物化视图 2.1.刷新物化视图 2.2.物化视图日志 ...
The authenticity of host '172.16.33.53 (172.16.33.53)' can't be established的问题(日志六)
用ssh登录一个机器(换过ip地址)会出现如下错误 weiguohui@weiguohui1-virtual-machine:~/.ssh$ ssh 172.16.33.53The authentic ...
精选this关键字的指向规律你记住了吗
1.首先要明确: 谁最终调用函数,this指向谁 this指向的永远只可能是对象!!!!! this指向谁永远不取决于this写在哪,而取 ...

Python3网络爬虫

Python3网络爬虫的更多相关文章

随机推荐

热门专题