Python爬虫之pyquery库的基本使用
# String initialization: build a PyQuery document directly from an HTML string.
html = '''
<div>
<ul>
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
# The import must sit on its own line; fused with the closing ''' above it
# is a syntax error.
from pyquery import PyQuery as pq
doc = pq(html)
# Select every <li> element with a CSS selector and print the result.
print(doc('li'))

# URL initialization
from pyquery import PyQuery as pq

# Passing url= builds the document from the page at that address
# (this performs a network request).
doc = pq(url="http://www.baidu.com")
print(doc('head'))

# File initialization
from pyquery import PyQuery as pq

# Passing filename= builds the document from a local HTML file;
# demo.html is looked up relative to the current working directory.
doc = pq(filename="demo.html")
print(doc("li"))

# Basic CSS selectors
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# In CSS selector syntax an id is prefixed with "#" and a class with ".";
# a space means "descendant of". This selects the <li> elements inside
# .list inside #container.
print(doc("#container .list li"))

# Finding elements
# Child elements
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)

# find() searches the descendants of the current selection.
lis = items.find('li')
print(type(lis))
print(lis)

# children() returns only the direct children.
lis = items.children()
print(type(lis))
print(lis)

# children() also accepts a selector to filter the direct children.
lis = items.children('.active')
print(lis)

# Parent elements
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
# parent() returns the direct parent of the selection.
container = items.parent()
print(type(container))
print(container)

# Second example: the list is wrapped in one more <div class="wrap">.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
# parents() returns all ancestors, not just the direct parent.
parents = items.parents()
print(type(parents))
print(parents)

# parents() also accepts a selector to filter the ancestors.
parents = items.parents('.wrap')
print(parents)
# Sibling elements
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Note: ".item-0.active" has no space between the two class selectors, so it
# matches a single element carrying both classes.
li = doc('.list .item-0.active')
# siblings() returns the other elements on the same level.
print(li.siblings())
# siblings() also accepts a selector to filter them.
print(li.siblings('.active'))

# Traversal
# Single element
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)

# Multiple elements
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# items() gives an iterable over the matched elements, so each <li> can be
# handled on its own inside the loop.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

# Getting information
# Getting attributes
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
a = doc(".item-0.active a")
print(a)
# Two ways to read an attribute: call attr() with the attribute name, or use
# attribute-style access on .attr.
print(a.attr("href"))
print(a.attr.href)

# Getting text
print(a.text())

# Getting HTML
from pyquery import PyQuery as pq

# `html` here is the sample markup assigned earlier in the file.
doc = pq(html)
li = doc(".item-0.active")
print(li)
# html() gives the markup inside the <li> tag.
print(li.html())

# DOM manipulation
# add_class / remove_class
from pyquery import PyQuery as pq

# `html` here is the sample markup assigned earlier in the file.
doc = pq(html)
li = doc(".item-0.active")
print(li)

# Remove the "active" class, show the result, then put it back.
li.remove_class("active")
print(li)
li.add_class("active")
print(li)

# attr / css
# Called with a name and a value, attr() sets an attribute on the element.
li.attr("name", "link")
print(li)
# css() with a property and a value sets an inline style.
li.css("font-size", "14px")
print(li)

# remove
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
Hello,World
<p>This is a paragraph</p>
</div>
'''
doc = pq(html)
wrap = doc(".wrap")
print(wrap.text())
# Remove the <p> child from the document, then print the text that is left.
paragraph = wrap.find("p")
paragraph.remove()
print(wrap.text())

# Pseudo-class selectors
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)

# Each selector is applied in turn and its matches are printed.
selectors = (
    'li:first-child',       # <li> that is the first child of its parent
    'li:last-child',        # <li> that is the last child of its parent
    'li:nth-child(2)',      # <li> that is the second child of its parent
    'li:gt(2)',             # every <li> after index 2 (indices start at 0)
    'li:nth-child(2n)',     # <li> at even child positions (nth-child counts from 1)
    'li:contains(second)',  # <li> whose text contains "second"
)
for selector in selectors:
    li = doc(selector)
    print(li)
Python爬虫之pyquery库的基本使用的更多相关文章
- Python爬虫之PyQuery使用(六)
Python爬虫之PyQuery使用 PyQuery简介 pyquery能够通过选择器精确定位 DOM 树中的目标并进行操作.pyquery相当于jQuery的python实现,可以用于解析HTML网 ...
- python爬虫之urllib库(三)
python爬虫之urllib库(三) urllib库 访问网页都是通过HTTP协议进行的,而HTTP协议是一种无状态的协议,即记不住来者何人.举个栗子,天猫上买东西,需要先登录天猫账号进入主页,再去 ...
- python爬虫之urllib库(二)
python爬虫之urllib库(二) urllib库 超时设置 网页长时间无法响应的,系统会判断网页超时,无法打开网页.对于爬虫而言,我们作为网页的访问者,不能一直等着服务器给我们返回错误信息,耗费 ...
- python爬虫之urllib库(一)
python爬虫之urllib库(一) urllib库 urllib库是python提供的一种用于操作URL的模块,python2中是urllib和urllib2两个库文件,python3中整合在了u ...
- Python爬虫之selenium库使用详解
Python爬虫之selenium库使用详解 本章内容如下: 什么是Selenium selenium基本使用 声明浏览器对象 访问页面 查找元素 多个元素查找 元素交互操作 交互动作 执行JavaS ...
- Mac os 下 python爬虫相关的库和软件的安装
由于最近正在放暑假,所以就自己开始学习python中有关爬虫的技术,因为发现其中需要安装许多库与软件所以就在这里记录一下以避免大家在安装时遇到一些不必要的坑. 一. 相关软件的安装: 1. h ...
- python爬虫(四)_urllib2库的基本使用
本篇我们将开始学习如何进行网页抓取,更多内容请参考:python学习指南 urllib2库的基本使用 所谓网页抓取,就是把URL地址中指定的网络资源从网络流中读取出来,保存到本地.在Python中有很 ...
- python爬虫之PyQuery的基本使用
PyQuery库也是一个非常强大又灵活的网页解析库,如果你有前端开发经验的,都应该接触过jQuery,那么PyQuery就是你非常绝佳的选择,PyQuery 是 Python 仿照 jQuery 的严 ...
- python爬虫之requests库
在python爬虫中,要想获取url的原网页,就要用到众所周知的强大好用的requests库,在2018年python文档年度总结中,requests库使用率排行第一,接下来就开始简单的使用reque ...
随机推荐
- ASP.NET 一个数据访问层的封装
刚通过开通写博客的申请,向博客园的大佬致敬,由于一直以来都在网上搜索大家的思想,也有翻遍整个百度都有的找不到的时候,作为一个网民理应为互联网贡献一点东西. 下面是我工作后受一个师傅的影响对数据库访问层 ...
- Java 11新功能抢先了解
目前 Oracle 已经发布了 Java Development Kit 10,下个版本 JDK 11 也即将发布.本文介绍 Java 11 的新功能. 根据Oracle新出台的每6个月发布一次Jav ...
- Java:多态乃幸福本源
01 多态是什么 在我刻板的印象里,西游记里的那段孙悟空和二郎神的精彩对战就能很好的解释“多态”这个词:一个孙悟空,能七十二变:一个二郎神,也能七十二变:他们都可以变成不同的形态,但只需要悄悄地喊一声 ...
- influxdb使用说明
前言 influxdb是目前比较流行的时间序列数据库. 何谓时间序列数据库?什么是时间序列数据库,最简单的定义就是数据格式里包含Timestamp字段的数据,比如某一时间环境的温度,CPU的使用率等. ...
- android 测试那点事
1.拨号*#*#98284#*#* 2.查看imei号:拔号 *#06# 3.抓取 MTK Log *#*#3646633#*#* 高通平台 *#62564# 4.查看手机的cpu架构信息:adb s ...
- Scrum到底是个神马玩意儿
从前有一种非常火爆的体育运动,对阵双方各派出11位猛男,在宽阔的草皮球场内争抢一颗可怜的小皮球.哪方能够通过团队协作拿到皮球,并且运送到对方场地的特定位置即得分. 没错,你没有走错片场,快到超级碗里来 ...
- 阿里云弹性容器实例产品 ECI ——云原生时代的基础设施
阿里云弹性容器实例产品 ECI ——云原生时代的基础设施 1. 什么是 ECI 弹性容器实例 ECI (Elastic Container Instance) 是阿里云在云原生时代为用户提供的基础计算 ...
- JVM(三)对象的生死判定和算法详解
好的文章是能把各个知识点,通过逻辑关系串连起来,让人豁然开朗的同时又记忆深刻. 导读:对象除了生死之外,还有其他状态吗?对象真正的死亡,难道只经历一次简单的判定?如何在垂死的边缘"拯救" ...
- 痞子衡嵌入式:语音处理工具Jays-PySPEECH诞生记(6)- 文语合成实现(pyttsx3, eSpeak1.48.04)
大家好,我是痞子衡,是正经搞技术的痞子.今天痞子衡给大家介绍的是语音处理工具Jays-PySPEECH诞生之文语合成实现. 文语合成是Jays-PySPEECH的核心功能,Jays-PySPEECH借 ...
- Spring基础系列-参数校验
原创作品,可以转载,但是请标注出处地址:https://www.cnblogs.com/V1haoge/p/9953744.html Spring中使用参数校验 概述 JSR 303中提出了Bea ...