# Initialize from a string
from pyquery import PyQuery as pq

html = '''
<div>
<ul>
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''

# Build a PyQuery document directly from the markup string.
doc = pq(html)
# Calling the document with a CSS selector returns the matching elements.
print(doc('li'))

# Initialize from a URL
from pyquery import PyQuery as pq

# With the url= keyword PyQuery fetches the page itself and parses the
# response, so this example needs network access.
doc = pq(url="http://www.baidu.com")
head = doc("head")
print(head)

# Initialize from a file
from pyquery import PyQuery as pq

# With the filename= keyword PyQuery reads and parses a local file;
# "demo.html" is a relative path, so it must exist in the working directory.
doc = pq(filename="demo.html")
list_items = doc("li")
print(list_items)

# Basic CSS selectors
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''

doc = pq(html)

# Selector syntax reminder: an id is prefixed with "#", a class with ".".
# The selector below reads: every <li> inside an element with class "list"
# that is itself inside the element whose id is "container".
selector = "#container .list li"
print(doc(selector))

# Finding elements
# Child elements
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''

doc = pq(html)

# Start from the <ul class="list"> element.
items = doc('.list')
print(type(items))
print(items)

# find() searches the descendants of the selection for the selector.
lis = items.find('li')
print(type(lis))
print(lis)

# children() with no argument returns the direct children of the selection.
lis = items.children()
print(type(lis))
print(lis)

# children() also accepts a selector to filter the direct children.
lis = items.children('.active')
print(lis)

# Parent elements
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''

doc = pq(html)
items = doc('.list')

# parent() returns the immediate parent of the selection.
container = items.parent()
print(type(container))
print(container)

# Same list again, this time wrapped in an extra <div class="wrap"> so that
# the <ul> has more than one ancestor.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)
items = doc('.list')

# parents() returns the ancestors of the selection.
parents = items.parents()
print(type(parents))
print(parents)

# parents() also accepts a selector to keep only the matching ancestors.
parents = items.parents('.wrap')
print(parents)

# Sibling elements
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# Note: ".item-0.active" is written without a space between the two classes,
# so it matches a single element that carries both classes.
li = doc('.list .item-0.active')

# siblings() returns the other elements that share the same parent.
print(li.siblings())
# siblings() also accepts a selector to filter those siblings.
print(li.siblings('.active'))

# Traversal
# Single element
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# This selector matches exactly one <li>, so it can be printed directly.
li = doc('.item-0.active')
print(li)

# Multiple elements
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# items() gives an iterable over the matched elements, one PyQuery object
# per element, so each <li> can be handled separately in a loop.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

# Getting information
# Getting attributes
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# The <a> inside the <li> that carries both "item-0" and "active".
a = doc(".item-0.active a")
print(a)

# Two ways to read an attribute: call attr() with the attribute name ...
print(a.attr("href"))
# ... or use attribute-style access on attr.
print(a.attr.href)

# Getting text
print(a.text())

# Getting the inner HTML
from pyquery import PyQuery as pq

# Reuses the `html` string assigned in the preceding example.
doc = pq(html)

li = doc(".item-0.active")
print(li)

# html() returns the markup found inside the <li> tag.
inner_markup = li.html()
print(inner_markup)

# DOM manipulation
# add_class / remove_class
from pyquery import PyQuery as pq

# Reuses the `html` string assigned in an earlier example.
doc = pq(html)

li = doc(".item-0.active")
print(li)

# Drop the "active" class, then put it back, printing after each step.
li.remove_class("active")
print(li)

li.add_class("active")
print(li)

# attr and css
# attr() with two arguments sets the attribute on the element.
li.attr("name", "link")
print(li)

# css() with two arguments sets a style property on the element.
li.css("font-size", "14px")
print(li)

# remove
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
Hello,World
<p>This is a paragraph</p>
</div>
'''

doc = pq(html)
wrap = doc(".wrap")

# Text of the wrapper before the <p> is removed.
print(wrap.text())

# Remove the <p> element, then print the wrapper's text again to show
# what is left.
paragraph = wrap.find("p")
paragraph.remove()
print(wrap.text())

# Pseudo-class selectors
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# Each selector is applied in turn and its matches printed.
selectors = (
    # <li> elements that are the first child of their parent
    'li:first-child',
    # <li> elements that are the last child of their parent
    'li:last-child',
    # <li> elements that are the second child of their parent
    'li:nth-child(2)',
    # matched <li> elements whose zero-based index is greater than 2
    'li:gt(2)',
    # <li> elements at even positions (nth-child counts from 1: 2nd, 4th, ...)
    'li:nth-child(2n)',
    # <li> elements whose text contains "second"
    'li:contains(second)',
)
for selector in selectors:
    li = doc(selector)
    print(li)

Python爬虫之pyquery库的基本使用的更多相关文章

  1. Python爬虫之PyQuery使用(六)

    Python爬虫之PyQuery使用 PyQuery简介 pyquery能够通过选择器精确定位 DOM 树中的目标并进行操作。pyquery相当于jQuery的Python实现,可以用于解析HTML网 ...

  2. python爬虫之urllib库(三)

    python爬虫之urllib库(三) urllib库 访问网页都是通过HTTP协议进行的,而HTTP协议是一种无状态的协议,即记不住来者何人.举个栗子,天猫上买东西,需要先登录天猫账号进入主页,再去 ...

  3. python爬虫之urllib库(二)

    python爬虫之urllib库(二) urllib库 超时设置 网页长时间无法响应的,系统会判断网页超时,无法打开网页.对于爬虫而言,我们作为网页的访问者,不能一直等着服务器给我们返回错误信息,耗费 ...

  4. python爬虫之urllib库(一)

    python爬虫之urllib库(一) urllib库 urllib库是python提供的一种用于操作URL的模块,python2中是urllib和urllib2两个库文件,python3中整合在了u ...

  5. Python爬虫之selenium库使用详解

    Python爬虫之selenium库使用详解 本章内容如下: 什么是Selenium selenium基本使用 声明浏览器对象 访问页面 查找元素 多个元素查找 元素交互操作 交互动作 执行JavaS ...

  6. Mac os 下 python爬虫相关的库和软件的安装

      由于最近正在放暑假,所以就自己开始学习python中有关爬虫的技术,因为发现其中需要安装许多库与软件所以就在这里记录一下以避免大家在安装时遇到一些不必要的坑. 一. 相关软件的安装:   1. h ...

  7. python爬虫(四)_urllib2库的基本使用

    本篇我们将开始学习如何进行网页抓取,更多内容请参考:python学习指南 urllib2库的基本使用 所谓网页抓取,就是把URL地址中指定的网络资源从网络流中读取出来,保存到本地.在Python中有很 ...

  8. python爬虫之PyQuery的基本使用

    PyQuery库也是一个非常强大又灵活的网页解析库,如果你有前端开发经验的,都应该接触过jQuery,那么PyQuery就是你非常绝佳的选择,PyQuery 是 Python 仿照 jQuery 的严 ...

  9. python爬虫之requests库

    在python爬虫中,要想获取url的原网页,就要用到众所周知的强大好用的requests库,在2018年python文档年度总结中,requests库使用率排行第一,接下来就开始简单的使用reque ...

随机推荐

  1. Linux清空文件内容

    日志文件太多,需要清空: echo "" > mylog.log

  2. System.Data.Entity.Infrastructure.DbUpdateException

    异常描述:   捕捉到 System.Data.Entity.Infrastructure.DbUpdateException  HResult=-2146233087  Message=无法更新 E ...

  3. Python爬虫入门教程 13-100 斗图啦表情包多线程爬取

    斗图啦表情包多线程爬取-写在前面 今天在CSDN博客,发现好多人写爬虫都在爬取一个叫做斗图啦的网站,里面很多表情包,然后瞅了瞅,各种实现方式都有,今天我给你实现一个多线程版本的.关键技术点 aioht ...

  4. 《连连看》算法c语言演示(自动连连看)

    (图片是游戏的示意图,来自互联网,与本文程序无关) 看题目就知道是写给初学者的,没需要的就别看了,自己都觉得怪无聊的. 很多游戏的耐玩性都来自精巧的算法,特别是人工智能的水平.比如前几天看了著名的Al ...

  5. 前端笔记之JavaScript(一)初识JavaScript

    一.JavaScript简介 1.1网页分层 web前端一共分三层: 结构层 HTML         : 负责搭建页面结构 样式层 CSS          : 负责页面的美观 行为层 JavaSc ...

  6. 前后端数据加密传输 RSA非对称加密

    任务需求:要求登陆时将密码加密之后再进行传输到后端. 经过半天查询摸索折腾,于是有了如下成果: 加密方式:RSA非对称加密.实现方式:公钥加密,私钥解密.研究进度:javascript与java端皆已 ...

  7. SQLServer特殊字符/生僻字与varchar

    对于中文版的SQL SERVER,默认安装后使用的默认排序规则为Chinese_PRC_CI_AS,在此排序规则下,使用varchar类型可以“正常存取”中文字符以及一些东南亚国家的字符,同时v ...

  8. 开发函数计算的正确姿势 —— 使用 Fun Local 本地运行与调试

    前言 首先介绍下在本文出现的几个比较重要的概念: 函数计算(Function Compute): 函数计算是一个事件驱动的服务,通过函数计算,用户无需管理服务器等运行情况,只需编写代码并上传.函数计算 ...

  9. Servlet+Tomcat总结

    Tomcat的缺省端口是多少,怎么修改 1.找到Tomcat目录下的conf文件夹 2.进入conf文件夹里面找到server.xml文件 3.打开server.xml文件 4.在server.xml ...

  10. 痞子衡嵌入式:飞思卡尔Kinetis开发板OpenSDA调试器那些事(上)- 背景与架构

    大家好,我是痞子衡,是正经搞技术的痞子.今天痞子衡给大家介绍的是飞思卡尔Kinetis MCU开发板板载OpenSDA调试器(上篇). 众所周知,嵌入式软件开发几乎离不开调试器,因为写一个稍有代码规模 ...