安装使用

# 安装 pip3 install beautifulsoup4

from bs4 import BeautifulSoup

soup=BeautifulSoup(ret.text,'html.parser')  # 传数据

# Pass an open file object; the with-block closes the handle once parsing is finished.
# The parser is named explicitly, matching the string-based call above.
with open('a.html','r') as f:
    soup=BeautifulSoup(f,'html.parser')

# html.parser内置解析器，速度稍微慢一些，但是不需要装第三方模块

# lxml：速度快一些，但是需要安装 pip3 install lxml

soup=BeautifulSoup(ret.text,'html.parser')

soup=BeautifulSoup(ret.text,'lxml')

# find（找到的第一个）

# find_all(找到的所有)

# 找页面所有的li标签

li_list=soup.find_all(name='li')

.text     # 全部拿出来，拼在一起

.string    # 标签内只有一个文本节点时返回该文本；包含多个子节点时返回None

.strings   # 全部拿出来生成一个迭代器

遍历文档树

html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"id="id_p"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

soup=BeautifulSoup(html_doc,'lxml')

# 1.美化+自动补全闭合标签

print(soup.prettify())

# 2.（通过.来查找，只能找到第一个）

head=soup.head  # Tag对象

title=head.title  # Tag对象

# 3.获取标签的名称

p=soup.body

print(p.name)  # body    #没什么卵用

# 4.获取标签的属性

p=soup.p

# 获取class属性,可以有多个，所以拿到是列表

# 方式一

print(p['class'])  # ['title']

print(p['id'])     # id_p

# 方式二

print(p.get('class'))   # ['title']

print(p.attrs['class'])  # ['title']

# 5.获取标签内容

p=soup.p

print(p)  # <p class="title" id="id_p"><b>The Dormouse's story</b></p>

print(p.text)  # The Dormouse's story

print(p.string) # The Dormouse's story

# 6.嵌套选择

title = soup.head.title.text  # 获取head下第一个title的内容

print(title)  # The Dormouse's story

# 7.子节点、子孙节点

p1=soup.p.children   # 迭代器

p2=soup.p.contents  # 列表

print(p1)  # <list_iterator object at 0x000001FA66E0C4A8>

print(list(p1))  # [<b>The Dormouse's story</b>]

print(p2)  # [<b>The Dormouse's story</b>]

# 8.父节点、祖先节点

p1=soup.p.parent  # 直接父节点

p2=soup.p.parents

print(p1)

# # print(len(list(p2)))

print(list(p2))

# 9.兄弟节点

print(soup.a.next_sibling) #下一个兄弟

print(soup.a.previous_sibling) #上一个兄弟

print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象

print(soup.a.previous_siblings) #上面的兄弟们=>生成器对象

查找文档树

# 查找文档树（find，find_all），速度比遍历文档树慢

# 两个配合着使用（soup.p.find()）

# 五种过滤器: 字符串、正则表达式、列表、True、方法

# 1.字符串查找 引号内是字符串

p=soup.find(name='p')  # <p class="title" id="id_p"><b>The Dormouse's story</b></p>

p=soup.find(name='body')  # the first <body> tag, including everything inside it

print(p)

# 查找类名是title的所有标签,class是关键字，class_

ret=soup.find_all(class_='title')  # [<p class="title" id="id_p"><b>The Dormouse's story</b></p>]

# 找id为xx的标签

ret=soup.find_all(id='id_p')  # [<p class="title" id="id_p"><b>The Dormouse's story</b></p>]

# href属性为http://example.com/elsie的标签

ret=soup.find_all(href='http://example.com/elsie') # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# 2.正则表达式

import re

reg=re.compile('^id')  # ^id开头

ret=soup.find_all(id=reg)  # 查找以id开头的id标签

print(ret)  # [<p class="title" id="id_p"><b>The Dormouse's story</b></p>]

# 3.列表

# or关系

ret=soup.find_all(id=['id_psafdsaf','link1'])  # 查找id有id_psafdsaf或者link1的标签

ret=soup.find_all(class_=['title1','story'])  # 查找类名有title1或者story的标签

# AND relationship: every keyword filter passed in the same call must match

ret=soup.find_all(class_='title',name='p') # tags named p whose class is "title"

# 4.true

# 所有有名字的标签

ret=soup.find_all(name=True)

#所有有id的标签

ret=soup.find_all(id=True)

# 所有有href属性的

ret=soup.find_all(href=True)

print(ret)

# 5.方法

def has_class_but_no_id(tag):
    """Filter function for ``find_all``: accept tags with a ``class`` attribute but no ``id``.

    Args:
        tag: an object exposing ``has_attr(name)``.

    Returns:
        True when ``tag`` has a ``class`` attribute and lacks an ``id``
        attribute, otherwise False.
    """
    # Without a class attribute the tag is rejected and "id" is never checked.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))

# 6.其他使用

ret=soup.find_all(attrs={'class':"title"})

ret=soup.find_all(attrs={'id':"id_p1",'class':'title'})

print(ret)

# 7.limit(限制条数)

# soup.find() 相当于 find_all(limit=1)，但返回的是单个标签（找不到时返回None），而不是列表

ret=soup.find_all(name=True,limit=2)

print(len(ret)) # 2

# 8.recursive

# recursive=False (只找儿子)不递归查找，只找第一层

ret=soup.body.find_all(name='p',recursive=False)

print(ret)

选择器介绍

# bs4：自己的选择器，css选择器

# lxml：css选择器，xpath选择器

# selenium：自己的选择器，css选择器，xpath选择器

# scrapy框架：自己的选择器，css选择器，xpath选择器

# css选择器，xpath选择器会用了，它就是个通行证

CSS选择器

# Tag对象.select("css选择器")

from bs4 import BeautifulSoup

import requests

# Request https://www.mzitu.com/202340/1 through /4, sending a Referer built from the previous index.
for i in range(1, 5):

    i1 = str(i - 1)  # previous index as text, appended to the Referer header below

    i = str(i)  # rebinds the loop variable to its string form for URL concatenation

    url = 'https://www.mzitu.com/202340/' + i

    # NOTE(review): no timeout is passed; presumably a stalled connection blocks indefinitely - confirm.
    # NOTE(review): proxies only has an 'http' key while the URL is https - verify the proxy is actually applied.
    ret = requests.get(url,headers={'User-Agent': 'request', 'Referer': 'https://www.mzitu.com/206122/' + i1},proxies={'http': '47.115.54.89'}, )

# NOTE(review): this statement is outside the loop, so only the last response is parsed - confirm that is intended.
soup = BeautifulSoup(ret.text, 'lxml')

案例：

#   div>p：儿子      div p：子子孙孙

#   找div下最后一个a标签 div a:last-child

print(soup.select('#list-1 li:nth-child(1)')[0].text)    # 取第一个li标签

print(soup.select('#list-1 li:nth-child(2)')[0].text)    # 取第二个li标签

print(soup.select('#list-1 li:nth-last-child(1)')[0].text)    # 取倒数第一个li标签

print(soup.select('#list-1 li:nth-last-child(2)')[0].text)    # 取倒数第二个li标签

print(soup.p.select('.sister'))  # 可以组合使用。

print(soup.select('.sister span'))

print(soup.select('#link1'))

print(soup.select('#link1 span'))

print(soup.select('#list-2 .element.xxx'))

print(soup.select('#list-2')[0].select('.element')) #可以一直select,但其实没必要,一条select就可以了

# 2、获取属性

print(soup.select('#list-2 h1')[0].attrs)

href = soup.select('body > div.main > div.content > div.main-image > p > a > img')

print(href[0].attrs['src'])  # 图片链接

# 3、获取内容

# .get_text()

# .text

# .string

# .strings 变成迭代器

print(soup.select('#list-2 h1')[0].get_text())

xpath选择器

# xpath选择

# / 从根节点选取  /a   从根节点开始，往下找a标签（子）

# //从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置  //a 从根节点开始找a标签（子子孙孙中所有a）

# . 	选取当前节点。

# .. 	选取当前节点的父节点。

# @ 	选取属性。

# 取值 /text()

# 取属性 /@属性名

使用：

from lxml import etree

html=etree.HTML(doc) # 传字符串

html=etree.parse('search.html',etree.HTMLParser())  # 传文件

# 1 文本获取   标签后加：/text() ********重点

a=html.xpath('//body//a[@href="image1.html"]/text()')

a=html.xpath('//body//a/text()')

# 2 属性获取  标签后：/@href   ********重点

a=html.xpath('//body//a/@href')

# 注意从1 开始取（不是从0）

a=html.xpath('//body//a[3]/@href')

# 3 所有节点

a=html.xpath('//*')

# 4 指定节点（结果为列表）

a=html.xpath('//head')

# 5 子节点，子孙节点

a=html.xpath('//div/a')

a=html.xpath('//body/a') #无数据

a=html.xpath('//body//a')

# 6 父节点

a=html.xpath('//body//a[@href="image1.html"]/..')

a=html.xpath('//body//a[@href="image1.html"]')

a=html.xpath('//body//a[1]/..')

也可以这样

a=html.xpath('//body//a[1]/parent::*')

# 7 属性匹配

a=html.xpath('//body//a[@href="image1.html"]')

# 8 属性多值匹配

a 标签有多个class类，直接匹配就不可以了，需要用contains

a=html.xpath('//body//a[@class="li"]')

a=html.xpath('//body//a[@href="image1.html"]')

a=html.xpath('//body//a[contains(@class,"li")]')

a=html.xpath('//body//a[contains(@class,"li")]/text()')

a=html.xpath('//body//a[contains(@class,"li")]/@name')

# 9 多属性匹配 or 和 and （了解）

a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')

a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')

a=html.xpath('//body//a[contains(@class,"li")]/text()')

# 10 按序选择

a=html.xpath('//a[2]/text()')

a=html.xpath('//a[2]/@href')

# 取最后一个（了解）

a=html.xpath('//a[last()]/@href')

a=html.xpath('//a[last()]/text()')

# 位置小于3的

a=html.xpath('//a[position()<3]/@href')

a=html.xpath('//a[position()<3]/text()')

# Second from last: last() is the final node, so last()-1 is the one before it
# (last()-2 would select the third from last).

a=html.xpath('//a[last()-1]/@href')

# 11 节点轴选择

ancestor：祖先节点

使用了* 获取所有祖先节点

a=html.xpath('//a/ancestor::*')

# # 获取祖先节点中的div

a=html.xpath('//a/ancestor::div')

a=html.xpath('//a/ancestor::div/a[2]/text()')

# attribute：属性值

a=html.xpath('//a[1]/attribute::*')

a=html.xpath('//a[1]/@href')

# child：直接子节点

a=html.xpath('//a[1]/child::*')

a=html.xpath('//a[1]/img/@src')

descendant：所有子孙节点

a=html.xpath('//a[6]/descendant::*')

following:当前节点之后所有节点(递归)

a=html.xpath('//a[1]/following::*')

a=html.xpath('//a[1]/following::*[1]/@href')

# following-sibling:当前节点之后同级节点（同级）

a=html.xpath('//a[1]/following-sibling::*')

a=html.xpath('//a[1]/following-sibling::a')

a=html.xpath('//a[1]/following-sibling::*[2]')

a=html.xpath('//a[1]/following-sibling::*[2]/@href')

bs4使用的更多相关文章

bs4 python解析html
使用文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/ python的编码问题比较恶心. decode解码encode编码在文件 ...
【bs4】安装beautifulsoup
Debian/Ubuntu,install $ apt-get install python-bs4 easy_install/pip $ easy_install beautifulsoup4 $ ...
使用bs4对海投网内容信息进行提取并存入mongodb数据库
example: http://xyzp.haitou.cc/article/722427.html 首先是直接下载好每个页面,可以使用 os.system( "wget " ...
python爬虫主要就是五个模块：爬虫启动入口模块，URL管理器存放已经爬虫的URL和待爬虫URL列表，html下载器，html解析器，html输出器同时可以掌握到urllib2的使用、bs4（BeautifulSoup）页面解析器、re正则表达式、urlparse、python基础知识回顾（set集合操作）等相关内容。
本次python爬虫百度百科,里面详细分析了爬虫的步骤,对每一步代码都有详细的注释说明,可通过本案例掌握python爬虫的特点: 1.爬虫调度入口(crawler_main.py) # coding: ...
BS4爬取糗百
-- coding: cp936 -- import urllib,urllib2 from bs4 import BeautifulSoup user_agent='Mozilla/5.0 (Win ...
Python爬虫(十五)_案例：使用bs4的爬虫
本章将从Python案例讲起:所使用bs4做一个简单的爬虫案例,更多内容请参考:Python学习指南案例:使用BeautifulSoup的爬虫我们以腾讯社招页面来做演示:http://hr.ten ...
Python：bs4的使用
概述 bs4 全名 BeautifulSoup,是编写 python 爬虫常用库之一,主要用来解析 html 标签. 一.初始化 from bs4 import BeautifulSoup soup ...
Python：bs4中 string 属性和 text 属性的区别及背后的原理
刚开始接触 bs4 的时候,我也很迷茫,觉得 string 属性和 text 属性是一样的,不明白为什么要分成两个属性. html = '<p>hello world</p>' ...
bs4模块
1.导入模块 from bs4 import BeautifulSoup 2.创建对象 Beautiful Soup支持Python标准库中的HTML解析器,还支持一些第三方的解析器,如果我们不安装它 ...
秋名山老司机（BS4与正则的比拼）
因为嘉伟思杯里的一个脚本题目,16进制计算,python3正则还没学,所以没写出来.大佬跟我说也可以用BS4,从DOM上下手,直接爬下来直接一个eval就搞定了,eval可以像这样计算16进制,eva ...

随机推荐

关于nw的简单应用
最近使用到了桌面开发应用nw.js.进行简单的介绍一下,基本用法 nwjs实际上是基于node js的,支持node js的所有api 中文官网https://nwjs.org.cn/ 第一步.在官网 ...
JavaScript对象(一)
Part One:对象的创建对象的创建,可以使用new Object() 或者 Object.create(),该方法为静态函数 var foo = Object.create({x:1,y:2}); ...
第三篇：Linux的基本操作与文件管理（纯命令行模式下）（下）
接上篇介绍完软件的管理(查询.删除.安装)之后,本篇将介绍Linux的文件和目录的管理. 如何浏览Linux的目录(文件夹),就像Windows一样,我们平时需要打开各个目录,去里面找一找曾经悄悄存储 ...
使用 notepad 正则转换字符串
一..在一堆字符串中找出某一个特定格式的字符串,例如如下需要摘出 WMID_abc WMID_def WMID_ghi {"abc",WMID_abc,oid_abc} {&qu ...
Windows Server 2012 R2 域证书服务搭建
网管大叔说要给每个人颁发一个证书,这个证书很耗电 1.在服务器管理器中添加角色和功能下一步下一步勾选Active Directory证书服务下一步下一步勾选证书颁发机构,证书颁发机构Web ...
express第三方中间件研究之bodyParser中间件
转载至:http://www.cnblogs.com/lianer/p/5178693.html 接触nodejs已有一段时间了,但最近才开始落实项目,于是使用express应用生成器生成了一个应用. ...
云CRM系统安全吗
云CRM系统有一个特点只要连接互联网就能够进行访问,这种访问可以是移动端也可以是电脑端的,而且本地CRM系统只允许电脑端访问.云CRM系统将数据存储在云服务器上,很多人就会问云CRM系统安全吗?下面和 ...
Swift 4.0 中的错误处理及抛出错误
在Swift的标准库,很多方法名后都带有'throws'这个关键词, 'throws'表示该方法在执行过程中遇到错误则抛出,但不会crash. 下面是Swift标准库中的一个构造方法,String.D ...
Journal of Proteome Research | 人类牙槽骨蛋白的蛋白质组学和n端分析:改进的蛋白质提取方法和LysargiNase消化策略增加了蛋白质组的覆盖率和缺失蛋白的识别 | (解读人：卜繁宇)
文献名:Proteomic and N-Terminomic TAILS Analyses of Human Alveolar Bone Proteins: Improved Protein Extr ...
jmeter3.3 接口压测入门和软件下载
Jmeter3.3软件下载地址 https://download.csdn.net/download/qq_36625806/11076556 简单的使用教程 1.启动Jmeter 双击jmeter ...

bs4使用

安装使用

遍历文档树

查找文档树

选择器介绍

CSS选择器

xpath选择器

bs4使用的更多相关文章

随机推荐

热门专题