Spider practice: crawling all links from a static web page with BeautifulSoup
# Get the href attribute of every <a> tag on the Baidu homepage:
# import requests
# from bs4 import BeautifulSoup
# # html = requests.get('http://en.wikipedia.org/wiki/Kevin_Bacon')
# html = requests.get('http://www.baidu.com')
# bs = BeautifulSoup(html.text, 'html.parser')
# for link in bs.find_all(lambda tag: 'href' in tag.attrs):
#     print(link.attrs['href'])

# import requests
# import re
# from bs4 import BeautifulSoup
# # html = requests.get('http://en.wikipedia.org/wiki/Kevin_Bacon')
# html = requests.get('http://www.baidu.com')
# bs = BeautifulSoup(html.text, 'html.parser')
# for link in bs.find_all('', {'href': re.compile(r'\.com')}):
#     print(link.attrs['href'])

# import requests
# from bs4 import BeautifulSoup
# html = requests.get('http://www.baidu.com')
# bs = BeautifulSoup(html.text, 'html.parser')
# for link in bs.find_all('a'):
#     if 'href' in link.attrs:
#         print(link.attrs['href'])
import requests
from bs4 import BeautifulSoup

def geturl(url):
    html = requests.get(url)
    bs = BeautifulSoup(html.text, 'html.parser')
    return bs.find_all('a')

links = geturl('http://www.baidu.com')
for link in links:
    if 'href' in link.attrs:
        print(link.attrs['href'])
http://news.baidu.com
http://www.hao123.com
http://map.baidu.com
http://v.baidu.com
http://tieba.baidu.com
http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1
//www.baidu.com/more/
http://home.baidu.com
http://ir.baidu.com
http://www.baidu.com/duty/
http://jianyi.baidu.com/
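# Some of the hrefs above are protocol-relative (e.g. //www.baidu.com/more/).
# If absolute URLs are needed, a minimal sketch using the standard library's
# urllib.parse.urljoin, reusing the `links` list from the cell above
# (the `base` variable name is just illustrative):
from urllib.parse import urljoin

base = 'http://www.baidu.com'
for link in links:
    if 'href' in link.attrs:
        # urljoin resolves relative and protocol-relative hrefs against the base URL
        print(urljoin(base, link.attrs['href']))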
import requests
import re
from bs4 import BeautifulSoup

def geturl(url):
    html = requests.get(url)
    bs = BeautifulSoup(html.text, 'html.parser')
    return bs.find_all('', {'href': re.compile('http://')})

# links = geturl('http://www.baidu.com')
# print(links)
# links_in_news = set(geturl(links[1].attrs['href']))  # set(list) removes duplicates
# for link in links_in_news:
#     print(link.attrs['href'])
print('-------------------------------------------------------------------------')
links = geturl('http://www.baidu.com')
for link in links:
    if '//news.' in link.attrs['href']:
        links_in_news = set(geturl(link.attrs['href']))  # set(list) removes duplicates
        for link in links_in_news:
            print(link.attrs['href'])
        break
-------------------------------------------------------------------------
http://baijiahao.baidu.com/s?id=1670182176542294758
http://baijiahao.baidu.com/s?id=1670237336710694101
http://baijiahao.baidu.com/s?id=1670287125142703268
http://baijiahao.baidu.com/s?id=1670255408896313915
http://baijiahao.baidu.com/s?id=1670191066909619203
http://v.baidu.com/
http://baijiahao.baidu.com/s?id=1670253988609166598
http://news.cctv.com/2020/06/23/ARTIHsG0yhCaD2YJUSFy7Qwt200623.shtml
http://baijiahao.baidu.com/s?id=1670286769270600802
http://news.cctv.com/2020/06/23/ARTIpnapIHyb413WeY46ShDy200623.shtml
http://m.top.cnr.cn/bdxw/20200623/t20200623_525141426.html
http://world.people.com.cn/n1/2020/0623/c1002-31756267.html
http://m.news.cctv.com/2020/06/23/ARTIDAQdwzQFMOkbW2Z0ehEk200623.shtml
http://baijiahao.baidu.com/s?id=1670245143050480742
http://m.news.cctv.com/2020/06/18/ARTIYNwiYAjjHBmGeAXpERs3200618.shtml
http://m.xinhuanet.com/yn/2020-06/23/c_139161263.htm
http://baijiahao.baidu.com/s?id=1670194818426496533
http://baijiahao.baidu.com/s?id=1670232858345398185
http://www.xinhuanet.com/2020-06/23/c_1126147531.htm
http://baijiahao.baidu.com/s?id=1670251112933488182
http://baijiahao.baidu.com/s?id=1670254276238905964
http://baijiahao.baidu.com/s?id=1670255017218969710
http://music.baidu.com/
http://m.top.cnr.cn/bdxw/20200623/t20200623_525141422.html
http://app.cctv.com/special/cportal/detail/arti/index.html?id=Arti8bFV6wkTJPYEkaZYVvoC200622&fromapp=cctvnews&version=805&allow_comment=1&allow_comment=1
http://map.baidu.com/
http://baijiahao.baidu.com/s?id=1670243226621040644
http://baijiahao.baidu.com/s?id=1670254944449236682
http://net.china.cn/chinese/index.htm
http://baijiahao.baidu.com/s?id=1670250874637091231
http://baijiahao.baidu.com/s?id=1670232858345398185
http://baijiahao.baidu.com/s?id=1670289098569528699
http://baijiahao.baidu.com/s?id=1670247580845339645
http://baijiahao.baidu.com/s?id=1670254849012760202
http://m.top.cnr.cn/bdxw/20200623/t20200623_525141424.html
http://baijiahao.baidu.com/s?id=1670246144336669257
http://baijiahao.baidu.com/s?id=1670254276238905964
http://app.cctv.com/special/cportal/detail/arti/index.html?id=ArtiLXGGutc9OLD23xo3Y3dN200622&fromapp=cctvnews&version=805&allow_comment=1&allow_comment=1
http://www.qstheory.cn/zt2019/llxjj/index.htm
http://www.cyberpolice.cn/wfjb/
http://baijiahao.baidu.com/s?id=1670250874637091231
http://baijiahao.baidu.com/s?id=1670239896280719334
http://baijiahao.baidu.com/s?id=1670248053773599893
http://image.baidu.com/
http://baijiahao.baidu.com/s?id=1670243226621040644
http://news.baidu.com/
http://tieba.baidu.com/
http://wenku.baidu.com/
http://report.12377.cn:13225/toreportinputNormal_anis.do
http://www.xinhuanet.com/politics/2020-06/23/c_1126149333.htm
http://app.cctv.com/special/cportal/detail/arti/index.html?id=ArtiA1FM8grjZNDdJ15XVvv8200623&fromapp=cctvnews&version=727
http://downpack.baidu.com/baidunews_AndroidPhone_1014720b.apk
http://www.bjjubao.org/
http://www.qstheory.cn/zt2017/xcgcdd19djs/index.htm
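# Note that set() above deduplicates BeautifulSoup Tag objects rather than the href
# strings, so the same URL can still be printed more than once (a few of the
# baijiahao links above repeat). A minimal sketch that deduplicates on the href
# value instead, reusing the geturl function defined above:
seen_hrefs = set()
for link in geturl('http://news.baidu.com'):
    href = link.attrs['href']
    if href not in seen_hrefs:      # keep only the first occurrence of each URL
        seen_hrefs.add(href)
        print(href)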
li = [1, 2, 2, 3, 4, 3, 6, 4, 3]
s = set(li)  # set(list) removes duplicates
print(s)
{1, 2, 3, 4, 6}
# Recursively crawl every link, then the links on each linked page, and so on:
import requests
import re
from requests import exceptions
from bs4 import BeautifulSoup

pages = set()

def geturl(url):
    global pages
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Catch connection errors
    try:
        html = requests.get(url, headers=headers)
    except exceptions.ConnectionError:
        # print(url)
        print("Connection error")
    else:
        bs = BeautifulSoup(html.text, 'html.parser')
        links = set(bs.find_all('', {'href': re.compile('^(http://)|^(https://)')}))  # set removes duplicate tags
        if links:
            for link in links:
                if link.attrs['href'] not in pages:  # skip URLs we have already visited
                    pages.add(link.attrs['href'])
                    print(link.attrs['href'])
                    geturl(link.attrs['href'])  # recurse into the new page
                else:
                    print("Already crawled!")

home_link = 'http://www.baidu.com'
geturl(home_link)
print('end....')
https://wenku.baidu.com
https://www.baidu.com/cache/icon/favicon.ico
Already crawled!
https://www.baidu.com/cache/icon/favicon.svg
Already crawled!
https://jingyan.baidu.com
https://passport.baidu.com/v2/?reg&tpl=exp&u=http%3A%2F%2Fjingyan.baidu.com%2F
https://www.baidu.com/favicon.ico
Already crawled!
https://www.baidu.com/img/baidu.svg
Already crawled!
https://passport.baidu.com/v2/?ucenterfeedback#reg
http://www.baidu.com/
https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F&sms=5
https://passport.baidu.com/export/app/index.html
https://downpack.baidu.com/ppSecurityCenter_AndroidPhone_passport.apk
Already crawled!
https://itunes.apple.com/cn/app/bai-du-quan-zhong-xin-shou/id695439229
https://www.apple.com.cn/iphone/
https://www.apple.com/kw/iphone/
https://www.apple.com/lae/iphone/
https://www.apple.com/gn/iphone/
https://support.apple.com/fr-gn
https://support.apple.com/ko-kr
https://support.apple.com/en-al
https://support.apple.com/fr-sn
https://support.apple.com/ru-ru
https://www.apple.com/ru/
https://www.apple.com/kr/
https://www.apple.com/la/
--- several hundred lines removed here for brevity ---
KeyboardInterrupt:
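# The run above never terminates on its own (it had to be stopped with a manual
# KeyboardInterrupt), and deep recursion can also hit Python's recursion limit.
# A minimal sketch of the same idea with a depth limit added
# (the depth/max_depth parameter names are just illustrative):
import requests
import re
from requests import exceptions
from bs4 import BeautifulSoup

pages = set()

def geturl(url, depth=0, max_depth=2):
    if depth > max_depth:               # stop recursing once the depth limit is reached
        return
    try:
        html = requests.get(url, timeout=10)
    except exceptions.RequestException:  # base class for all requests errors
        print("Request failed:", url)
        return
    bs = BeautifulSoup(html.text, 'html.parser')
    for link in bs.find_all('', {'href': re.compile('^https?://')}):
        href = link.attrs['href']
        if href not in pages:
            pages.add(href)
            print('  ' * depth + href)  # indent by depth to show the crawl tree
            geturl(href, depth + 1, max_depth)

geturl('http://www.baidu.com')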
# Enhance the code above: crawling bare links alone is usually not very useful,
# so add a few features:
# 1) The title of every linked page
#    h1 --> span
# 2) The first paragraph of text
#    div#mw-content-text --> p
# 3) The edit link
#    li#ca-edit --> span --> a
import requests
from bs4 import BeautifulSoup
from requests import exceptions
import re

pages = set()

def geturl(url):
    global pages
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Catch request errors: requests.get must sit inside the try block,
    # otherwise these handlers can never fire
    try:
        html = requests.get(url, headers=headers)
    except exceptions.ConnectTimeout:
        print("Connection to the remote server timed out")
        return
    except exceptions.ConnectionError:
        print("Connection error")
        return
    except exceptions.HTTPError:
        print("HTTP error")
        return
    except exceptions.RetryError:
        print("Retries exhausted")
        return
    except exceptions.TooManyRedirects:
        print("Too many redirects")
        return
    except exceptions.Timeout:
        print("Request timed out")
        return
    bs = BeautifulSoup(html.text, 'html.parser')
    try:
        print(bs.h1)
        # print(bs.find(id='mw-content-text').find_all('p')[0])
        # print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    links = set(bs.find_all('', {'href': re.compile('^(http://)|^(https://)')}))  # set removes duplicate tags
    if links:
        for link in links:
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    newPage = link.attrs['href']
                    print('-' * 20)
                    print(newPage)
                    pages.add(newPage)
                    geturl(newPage)
                else:
                    print("Already crawled!")

# home_link = 'https://baike.baidu.com/'
home_link = 'https://baike.hk.xileso.top/wiki/Wikipedia:首页'
geturl(home_link)
print('end....')
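# Both recursive crawlers above follow every absolute link they find, which is how a
# crawl that starts on baidu.com quickly wanders off to apple.com in the output shown
# earlier. A common option is to follow only links on the same domain; a minimal sketch
# using the standard library's urllib.parse (the same_domain helper name is illustrative):
from urllib.parse import urlparse

def same_domain(href, base='www.baidu.com'):
    # keep only links whose host is the base host or a subdomain of its registered domain
    host = urlparse(href).netloc
    return host == base or host.endswith('.' + base.split('.', 1)[1])

print(same_domain('http://news.baidu.com/'))   # True  (subdomain of baidu.com)
print(same_domain('https://www.apple.com/'))   # False (external site)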
# The example from the book (en.wikipedia.org is not reachable here without a proxy, so it cannot be run as-is)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                print('-' * 20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')
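# The book's recursive approach will eventually hit Python's default recursion limit
# (about 1000 frames) on a site as heavily interlinked as Wikipedia. A minimal iterative
# sketch of the same idea using an explicit queue instead of recursion (not from the book,
# just illustrative, and it cannot be run here for the same reason as the example above):
from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import deque
import re

pages = set()
queue = deque([''])                 # start from the Wikipedia front page

while queue:
    pageUrl = queue.popleft()
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        href = link.attrs['href']
        if href not in pages:       # enqueue each /wiki/ path only once
            pages.add(href)
            print(href)
            queue.append(href)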