#!/usr/bin/env python
# -*- coding: utf-8 -*- import requests
from pyquery import PyQuery as pq url = 'http://www.136book.com/huaqiangu/'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36'
' (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
} # 请求网页链接 获取页面源码
r = requests.get(url, headers=headers).text
doc = pq(r)
# 获取网页的全部章节链接 这里去掉了前面的最新章节部分 直接从第一章开始
# div#book_detail:nth-child(2) 选取的是第二个div的内容
links = doc('div#book_detail:nth-child(2) li a').items() for link in links: download_url = link.attr('href')
# 请求每个章节
download_page = requests.get(download_url, headers=headers).text
# 获取每一章节的源码
doc = pq(download_page)
# 获取每一章小说的内容
contents = doc('div#content').text()
with open('花千骨.txt', 'a+', encoding='utf8') as f:
f.write(link.text()+"\n\n")
f.write(contents+"\n\n")
print("写入文件完成!请查看")

1.安装方法

pip install pyquery

2.引用方法

from pyquery import PyQuery as pq

3.简介

 pyquery 是类型jquery 的一个专供python使用的html解析的库,使用方法类似bs4。

4.使用方法

  4.1 初始化方法:

from pyquery import PyQuery as pq
doc =pq(html) #解析html字符串
doc =pq("http://news.baidu.com/") #解析网页
doc =pq("./a.html") #解析html 文本

4.2 基本CSS选择器

from pyquery import PyQuery as pq
html = '''
<div id="wrap">
<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
'''
doc = pq(html)
print doc("#wrap .s_from link")

  运行结果:

<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>

  #是查找id的标签  .是查找class 的标签  link 是查找link 标签 中间的空格表示里层

  4.3 查找子元素

from pyquery import PyQuery as pq
html = '''
<div id="wrap">
<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
'''
#查找子元素
doc = pq(html)
items=doc("#wrap")
print(items)
print("类型为:%s"%type(items))
link = items.find('.s_from')
print(link)
link = items.children()
print(link)

  运行结果:

<div id="wrap">
<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
类型为:<class 'pyquery.pyquery.PyQuery'>
<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul> <ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul>

  根据运行结果可以发现返回结果类型为pyquery,并且find方法和children 方法都可以获取里层标签

  4.4查找父元素

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
items=doc(".s_from")
print(items)
#查找父元素
parent_href=items.parent()
print(parent_href)

  运行结果:

<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul> <div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>

  parent可以查找出外层标签包括的内容,与之类似的还有parents,可以获取所有外层节点

  4.5 查找兄弟元素

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
items=doc("link.active1.a123")
print(items)
#查找兄弟元素
siblings_href=items.siblings()
print(siblings_href)

  运行结果:

<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>

<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  根据运行结果可以看出,siblings 返回了同级的其他标签

  结论:子元素查找,父元素查找,兄弟元素查找,这些方法返回的结果类型都是pyquery类型,可以针对结果再次进行选择

  4.6 遍历查找结果

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link").items()
for it in its:
print(it)

  运行结果:

<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>

<link class="active2" href="http://asda1.com">asdadasdad12312</link>

<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  4.7获取属性信息

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link").items()
for it in its:
print(it.attr('href'))
print(it.attr.href)

  运行结果:

http://asda.com
http://asda.com
http://asda1.com
http://asda1.com
http://asda2.com
http://asda2.com

  4.8 获取文本

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link").items()
for it in its:
print(it.text())

  运行结果

asdadasdad12312
asdadasdad12312
asdadasdad12312

  4.9 获取 HTML信息

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link").items()
for it in its:
print(it.html())

  运行结果:

<a>asdadasdad12312</a>
asdadasdad12312
asdadasdad12312

5.常用DOM操作

  5.1 addClass removeClass

  添加,移除class标签

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link").items()
for it in its:
print("添加:%s"%it.addClass('active1'))
print("移除:%s"%it.removeClass('active1'))

  运行结果

添加:<link class="active1 a123" href="http://asda.com"><a>asdadasdad12312</a></link>

移除:<link class="a123" href="http://asda.com"><a>asdadasdad12312</a></link>

添加:<link class="active2 active1" href="http://asda1.com">asdadasdad12312</link>

移除:<link class="active2" href="http://asda1.com">asdadasdad12312</link>

添加:<link class="movie1 active1" href="http://asda2.com">asdadasdad12312</link>

移除:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  需要注意的是已经存在的class标签不会继续添加

  5.2 attr css

  attr 为获取/修改属性 css 添加style属性

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link").items()
for it in its:
print("修改:%s"%it.attr('class','active'))
print("添加:%s"%it.css('font-size','14px'))

  运行结果

C:\Python27\python.exe D:/test_his/test_re_1.py
修改:<link class="active" href="http://asda.com"><a>asdadasdad12312</a></link> 添加:<link class="active" href="http://asda.com" style="font-size: 14px"><a>asdadasdad12312</a></link> 修改:<link class="active" href="http://asda1.com">asdadasdad12312</link> 添加:<link class="active" href="http://asda1.com" style="font-size: 14px">asdadasdad12312</link> 修改:<link class="active" href="http://asda2.com">asdadasdad12312</link> 添加:<link class="active" href="http://asda2.com" style="font-size: 14px">asdadasdad12312</link>

  attr css操作直接修改对象的

  5.3 remove

  remove 移除标签

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("div")
print('移除前获取文本结果:\n%s'%its.text())
it=its.remove('ul')
print('移除后获取文本结果:\n%s'%it.text())

  运行结果

移除前获取文本结果:
hello nihao
asdasd
asdadasdad12312
asdadasdad12312
asdadasdad12312
移除后获取文本结果:
hello nihao

  其他DOM方法参考:

  http://pyquery.readthedocs.io/en/latest/api.html

6.伪类选择器

from pyquery import PyQuery as pq
html = '''
<div href="wrap">
hello nihao
<ul class="s_from">
asdasd
<link class='active1 a123' href="http://asda.com"><a>helloasdadasdad12312</a></link>
<link class='active2' href="http://asda1.com">asdadasdad12312</link>
<link class='movie1' href="http://asda2.com">asdadasdad12312</link>
</ul>
</div>
''' doc = pq(html)
its=doc("link:first-child")
print('第一个标签:%s'%its)
its=doc("link:last-child")
print('最后一个标签:%s'%its)
its=doc("link:nth-child(2)")
print('第二个标签:%s'%its)
its=doc("link:gt(0)") #从零开始
print("获取0以后的标签:%s"%its)
its=doc("link:nth-child(2n-1)")
print("获取奇数标签:%s"%its)
its=doc("link:contains('hello')")
print("获取文本包含hello的标签:%s"%its)

  运行结果

第一个标签:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>

最后一个标签:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

第二个标签:<link class="active2" href="http://asda1.com">asdadasdad12312</link>

获取0以后的标签:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link> 获取奇数标签:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link> 获取文本包含hello的标签:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>

Python网络爬虫神器PyQuery的使用方法的更多相关文章

  1. python爬虫神器PyQuery的使用方法

    你是否觉得 XPath 的用法多少有点晦涩难记呢? 你是否觉得 BeautifulSoup 的语法多少有些悭吝难懂呢? 你是否甚至还在苦苦研究正则表达式却因为少些了一个点而抓狂呢? 你是否已经有了一些 ...

  2. 《精通python网络爬虫》笔记

    <精通python网络爬虫>韦玮 著 目录结构 第一章 什么是网络爬虫 第二章 爬虫技能概览 第三章 爬虫实现原理与实现技术 第四章 Urllib库与URLError异常处理 第五章 正则 ...

  3. 《精通Python网络爬虫》|百度网盘免费下载|Python爬虫实战

    <精通Python网络爬虫>|百度网盘免费下载|Python爬虫实战 提取码:7wr5 内容简介 为什么写这本书 网络爬虫其实很早就出现了,最开始网络爬虫主要应用在各种搜索引擎中.在搜索引 ...

  4. python网络爬虫学习笔记

    python网络爬虫学习笔记 By 钟桓 9月 4 2014 更新日期:9月 4 2014 文章文件夹 1. 介绍: 2. 从简单语句中開始: 3. 传送数据给server 4. HTTP头-描写叙述 ...

  5. Python网络爬虫

    http://blog.csdn.net/pi9nc/article/details/9734437 一.网络爬虫的定义 网络爬虫,即Web Spider,是一个很形象的名字. 把互联网比喻成一个蜘蛛 ...

  6. 如何利用Python网络爬虫爬取微信朋友圈动态--附代码(下)

    前天给大家分享了如何利用Python网络爬虫爬取微信朋友圈数据的上篇(理论篇),今天给大家分享一下代码实现(实战篇),接着上篇往下继续深入. 一.代码实现 1.修改Scrapy项目中的items.py ...

  7. 利用Python网络爬虫爬取学校官网十条标题

    利用Python网络爬虫爬取学校官网十条标题 案例代码: # __author : "J" # date : 2018-03-06 # 导入需要用到的库文件 import urll ...

  8. Python网络爬虫学习总结

    1.检查robots.txt 让爬虫了解爬取该网站时存在哪些限制. 最小化爬虫被封禁的可能,而且还能发现和网站结构相关的线索. 2.检查网站地图(robots.txt文件中发现的Sitemap文件) ...

  9. python网络爬虫入门范例

    python网络爬虫入门范例 Windows用户建议安装anaconda,因为有些套件难以安装. 安装使用pip install * 找出所有含有特定标签的HTML元素 找出含有特定CSS属性的元素 ...

随机推荐

  1. secure-file-priv特性

    转载自:https://segmentfault.com/a/1190000009333563 当出现:1290 - The MySQL server is running with the --se ...

  2. python 进程和线程-简介及进程

    进程和线程的关系及应用 参考链接:https://www.liaoxuefeng.com/wiki/1016959663602400/1017627212385376 多任务: 什么叫“多任务”呢?简 ...

  3. HDU2023求平均成绩 - biaobiao88

    题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=2023 求平均成绩 Problem Description 假设一个班有n(n<=50)个学生,每 ...

  4. 使用Docker Compose 部署Nexus后提示:Unable to create directory /nexus-data/instance

    场景 Ubuntu Server 上使用Docker Compose 部署Nexus(图文教程): https://blog.csdn.net/BADAO_LIUMANG_QIZHI/article/ ...

  5. 破解压缩包的几种方式(zip伪加密 爆破 CRC32碰撞 已知明文攻击)

    zip伪加密 zip文件是由3部分组成,详见文末 压缩源文件数据区+压缩源文件目录区+压缩源文件目录结束标志 在压缩源文件数据区有个2字节的 全局方式位标记 ,在压缩源文件目录区也有个2字节的 全局方 ...

  6. WampServer出现You don’t have permission to access/on this server提示

    WampServer出现You don’t have permission to access/on this server提示 本地搭建WampServer,输入http://127.0.0.1访问 ...

  7. 使用Python3导出MySQL查询数据

    整理个Python3导出MySQL查询数据d的脚本. Python依赖包: pymysql xlwt Python脚本: #!/usr/bin/env python # -*- coding: utf ...

  8. Django框架(三)-- orm增删改查、Django生命周期

    一.orm介绍 1.什么是orm ORM即Object Relational Mapping,全称对象关系映射. 2.使用orm的优缺点 优点: 不用写SQL语句 开发效率高 缺点: SQL的效率低 ...

  9. Linux的网络参数设置

    前面讲解了lLinux 的IP组成,下面就讲一下Linux的网络设置和数据传递. 其实这地方对运维的人员来说,不会要精通,但还是要了解.必要时刻还会用到的 电脑之间数据的传递: 数据的传递要分为下面几 ...

  10. 《linux就该这么学》课堂笔记14 Apache、SELinux、虚拟主机

    1.目前能够提供Web网络服务的程序有IIS(Windows系统中默认的Web服务程序,是一款图形化的网站管理工具).Nginx和Apache(RHEL 7系统中默认的Web服务程序)等. Apach ...