A first taste of a small Python crawler for blog posts
I have been hooked on learning Python lately, so while the enthusiasm lasts I gave a very basic crawler a try. I wrote one in Java before. This version only fetches raw HTML, with no HTML parser and no regular expressions, and since it checks candidate URLs one by one in a loop its efficiency is certainly low.
import urllib.request as urllib2
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
ua_agent = random.choice(ua_list)

# ua_agent_dict = {'User-Agent': ua_agent}
# print(ua_agent_dict)
# request = urllib2.Request(url=url)
# request.add_header(**ua_agent_dict)

def checkPageExists(url, ua_agent):
    """Return True if the page answers with HTTP 200, False otherwise."""
    request = urllib2.Request(url=url)
    request.add_header('User-Agent', ua_agent)  # the header name is 'User-Agent', not 'User_Agent'
    try:
        code = urllib2.urlopen(request).code
    except IOError:
        return False
    return code == 200

checkPageExists('https://www.cnblogs.com/Frank99/p/91111024.html', ua_agent=ua_agent)

url_prefix = 'https://www.cnblogs.com/Frank99/p/'
url_subfix = '.html'
# https://www.cnblogs.com/Frank99/p/

def getHtml(url, ua_agent):
    """Download the raw HTML of a page as bytes."""
    request = urllib2.Request(url=url)
    request.add_header('User-Agent', ua_agent)
    print('Reading data from page {} ...'.format(url))
    response = urllib2.urlopen(request)
    print('Finished reading data from page {} ...'.format(url))
    return response.read()

def write_html2file(html, file_name):
    """Decode the HTML bytes and save them to a local file."""
    with open(file_name, 'w', encoding='utf-8') as f:
        print('Saving file {} ...'.format(file_name))
        f.write(html.decode())
        print('Finished saving file {} ...'.format(file_name))

if __name__ == '__main__':
    # Fetch and save every post id in the range whose page actually exists.
    list(map(lambda i: write_html2file(getHtml(url_prefix + str(i) + url_subfix, ua_agent=ua_agent), str(i) + url_subfix),
             [i for i in range(9111123, 9111125) if checkPageExists(url_prefix + str(i) + url_subfix, ua_agent=ua_agent)]))
    # The same crawl written as an explicit loop:
    # for i in range(9111123, 9111125):
    #     url = url_prefix + str(i) + url_subfix
    #     file_name = str(i) + url_subfix
    #     if checkPageExists(url, ua_agent=ua_agent):
    #         html = getHtml(url, ua_agent=ua_agent)
    #         write_html2file(html, file_name)
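A side note on the commented-out lines near the top: add_header(**ua_agent_dict) would not work as written, because add_header expects a key and a value as two plain arguments and 'User-Agent' is not a valid keyword name. As a minimal sketch of my own (the fetch helper below is not part of the original script), the header dict can instead be handed straight to urllib.request.Request:

import urllib.request
import random

ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]

def fetch(url):
    # Pass the header dict to the Request constructor instead of calling add_header() afterwards.
    request = urllib.request.Request(url, headers={'User-Agent': random.choice(ua_list)})
    with urllib.request.urlopen(request) as response:
        return response.read()

# fetch('https://www.cnblogs.com/Frank99/p/9111123.html')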
A second pass at the same script, this time reading the URL prefix and suffix from the console:

import urllib.request as urllib2
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
ua_agent = random.choice(ua_list)

# ua_agent_dict = {'User-Agent': ua_agent}
# print(ua_agent_dict)
# request = urllib2.Request(url=url)
# request.add_header(**ua_agent_dict)

def checkPageExists(url, ua_agent):
    """Return True if the page answers with HTTP 200, False otherwise."""
    request = urllib2.Request(url=url)
    request.add_header('User-Agent', ua_agent)
    try:
        code = urllib2.urlopen(request).code
    except IOError:
        return False
    return code == 200

# checkPageExists('https://www.cnblogs.com/Frank99/p/91111024.html', ua_agent=ua_agent)
# https://www.cnblogs.com/Frank99/p/

def getHtml(url, ua_agent):
    """Download the raw HTML of a page as bytes."""
    request = urllib2.Request(url=url)
    request.add_header('User-Agent', ua_agent)
    print('Reading data from page {} ...'.format(url))
    response = urllib2.urlopen(request)
    print('Finished reading data from page {} ...'.format(url))
    return response.read()

def write_html2file(html, file_name):
    """Decode the HTML bytes and save them to a local file."""
    with open(file_name, 'w', encoding='utf-8') as f:
        print('Saving file {} ...'.format(file_name))
        f.write(html.decode())
        print('Finished saving file {} ...'.format(file_name))

if __name__ == '__main__':
    # url_prefix = 'https://www.cnblogs.com/Frank99/p/'
    # url_subfix = '.html'
    url_prefix = input('Enter the URL prefix of the pages to crawl: ')
    url_subfix = input('Enter the URL suffix of the pages to crawl: ')
    list(map(lambda i: write_html2file(getHtml(url_prefix + str(i) + url_subfix, ua_agent=ua_agent), str(i) + url_subfix),
             (i for i in range(5400017, 9111125) if checkPageExists(url_prefix + str(i) + url_subfix, ua_agent=ua_agent))))
    # The same crawl written as an explicit loop (over a smaller id range):
    for i in range(9111123, 9111125):
        url = url_prefix + str(i) + url_subfix
        file_name = str(i) + url_subfix
        if checkPageExists(url, ua_agent=ua_agent):
            html = getHtml(url, ua_agent=ua_agent)
            write_html2file(html, file_name)
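One thing checkPageExists hides is why a page was skipped: it swallows every IOError, so a 404 looks the same as a DNS failure. As a rough variation of my own (not from the original post), urllib.error lets the two cases be told apart:

import urllib.request
from urllib.error import HTTPError, URLError

def check_page_exists(url, ua_agent):
    # Variation on checkPageExists: report why a page was skipped instead of silently returning False.
    request = urllib.request.Request(url, headers={'User-Agent': ua_agent})
    try:
        return urllib.request.urlopen(request).code == 200
    except HTTPError as err:   # the server answered, but with a 4xx/5xx status
        print('{} -> HTTP {}'.format(url, err.code))
    except URLError as err:    # network problem, bad host name, and so on
        print('{} -> {}'.format(url, err.reason))
    return False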
Say whaaat... On to something slightly bigger: crawling Baidu Tieba list pages (see the example URLs below).
# https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=100
# https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=10
import urllib.request as urllib2
from urllib import parse
import random

class TieBa(object):
    def __init__(self, **kw):
        for key in kw:
            if key == 'name':
                self.__name = kw[key]
            elif key == 'start':
                self.__start = kw[key]
            elif key == 'end':
                self.__end = kw[key]
            # elif key == 'url':
            #     self.__url = kw[key]

    def set_name(self, name):
        self.__name = name

    def get_name(self):
        return self.__name

    def set_start(self, start):
        self.__start = start

    def get_start(self):
        return self.__start

    def set_end(self, end):
        self.__end = end

    def get_end(self):
        return self.__end

    def spider_html(self):
        '''
        Crawl the tieba list pages and save each one to a local file.
        '''
        name = self.__name
        start = self.__start
        end = self.__end
        words = {'kw': name}
        name = parse.urlencode(words)
        url_prefix = r'https://tieba.baidu.com/f?'
        url_suffix = r'&ie=utf-8&pn='
        url = url_prefix + name + url_suffix
        start = int(start)
        end = int(end)
        for page in range(start, end):  # note: the end page is exclusive here
            page_url = url + str(page)  # build a fresh URL each pass instead of appending to the previous one
            print(page_url)
            html = self.__get_html(page, page_url)
            file_name = '{}-{}.html'.format(words['kw'], page)
            self.__write2file(file_name, html)

    def __get_html(self, page, url):
        ua_list = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
                   "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
                   "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
                   "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]
        request = urllib2.Request(url)
        request.add_header('User-Agent', random.choice(ua_list))
        response = urllib2.urlopen(request)
        print('Crawling page {} ...'.format(page))
        html = response.read()
        print('Finished crawling page {} ...'.format(page))
        return html

    def __write2file(self, file_name, html):
        print('Saving the HTML to a file ...')
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(html.decode())
        print('Saved the HTML to a file ...')

if __name__ == '__main__':
    tb = TieBa()
    tb.set_name(input('Enter the tieba name: '))
    tb.set_start(input('Enter the start page: '))
    tb.set_end(input('Enter the end page: '))
    tb.spider_html()
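Since the constructor also accepts keyword arguments, the interactive setters can be skipped. A short usage sketch (the tieba name and page range here are just example values):

tb = TieBa(name='布袋戏', start=1, end=3)  # same fields the setters would otherwise fill in
tb.spider_html()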
Condition is used to synchronize threads; underneath it is still a Lock or an RLock (an RLock can be re-acquired several times by the thread that already holds it).
from threading import (Thread, Condition)

class XiaoAI(Thread):
    def __init__(self, cond, name='XiaoAI'):
        super().__init__(name=name)
        self.cond = cond

    def run(self):
        with self.cond:
            self.cond.wait()
            print('{name}: Here!'.format(name=self.name))
            self.cond.notify()
            self.cond.wait()
            print('{name}: Sure!'.format(name=self.name))
            self.cond.notify()

class TianMao(Thread):
    def __init__(self, cond, name='TianMao'):
        super().__init__(name=name)
        self.cond = cond

    def run(self):
        with self.cond:  # was `with cond:`, which only worked because a global `cond` happened to exist
            print('{name}: XiaoAI?'.format(name=self.name))
            self.cond.notify()
            self.cond.wait()
            print("{name}: Let's trade lines of classical poetry.".format(name=self.name))
            self.cond.notify()
            self.cond.wait()

if __name__ == '__main__':
    cond = Condition()
    xiao = XiaoAI(cond)
    tian = TianMao(cond)
    xiao.start()  # the start order matters: XiaoAI must already be waiting before TianMao notifies
    tian.start()
    xiao.join()
    tian.join()
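A tiny sketch of my own to illustrate the point above: a Condition just wraps a lock (an RLock unless you pass one in), and the with-block is shorthand for acquire/release around the wait/notify calls.

from threading import Condition, Lock

cond = Condition()          # uses an RLock internally when no lock is given
# cond = Condition(Lock())  # or share an explicit lock with other primitives

cond.acquire()              # what `with cond:` does on entry
cond.notify_all()           # wake every thread currently blocked in cond.wait()
cond.release()              # what `with cond:` does on exit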
from threading import (Thread, Semaphore)
from urllib.parse import urlencode
import requests
import chardet
import logging
from os import path
import random
import re

logging.basicConfig(level=logging.DEBUG)

# https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=100
class TieBaSpider(Thread):
    def __init__(self, url, sem, name='TieBaSpider'):
        super(TieBaSpider, self).__init__(name=name)
        self.url = url
        self.sem = sem

    def _save(self, text):
        parent_dir = r'D:\tieba'  # the target directory must already exist
        file_name = path.join(parent_dir, path.split(re.sub(r'[%|=|&|?]', '', self.url))[1]) + '.html'
        with open(file_name, 'w', encoding='utf-8') as fw:
            fw.write(text)
            fw.flush()
        return 1

    def run(self):
        # ua_list = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        #            "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        #            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        #            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        #            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]
        # header = {'User-Agent': random.choice(ua_list)}
        response = requests.get(self.url)  # requests.get(self.url, headers=header)
        content = response.content
        logging.info(response.encoding)
        # result = chardet.detect(content)
        # logging.info(result)
        # code = result.get('encoding', 'utf-8')
        self._save(content.decode(response.encoding))
        self.sem.release()  # give the permit back so the producer can start the next spider

class UrlProducer(Thread):
    def __init__(self, tb_name, sem, pages_once=3, start_index=1, end_index=9):  # (end_index - start_index) % pages_once == 0
        super(UrlProducer, self).__init__(name=tb_name)
        self.tb_name = urlencode(tb_name)
        self.sem = sem
        logging.info(self.tb_name)
        self.pages_once = pages_once
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        for page_idx in range(self.start_index, self.end_index + 1):
            self.sem.acquire()  # blocks once 3 spiders are already running
            url_prefix = r'https://tieba.baidu.com/f?'
            url_suffix = r'&fr=ala0&tpl='
            self.url = url_prefix + self.tb_name + url_suffix + str(page_idx)
            tb_spider = TieBaSpider(self.url, self.sem)
            tb_spider.start()

if __name__ == '__main__':
    kw_dict = dict(kw=r'国家地理')
    sem = Semaphore(3)  # allow at most 3 spider threads at a time
    url_producer = UrlProducer(kw_dict, sem=sem)
    url_producer.start()
    url_producer.join()
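The producer/spider pair above passes a single Semaphore around to cap concurrency at three: the producer acquires a permit before starting a spider, and each spider releases it when done. A stripped-down sketch of the same idea (the worker function and the sleep are placeholders of mine):

from threading import Semaphore, Thread
import time

sem = Semaphore(3)                 # at most 3 workers hold a permit at once

def worker(idx):
    with sem:                      # acquire a permit on entry, release it automatically on exit
        print('worker {} running'.format(idx))
        time.sleep(1)

threads = [Thread(target=worker, args=(i,)) for i in range(9)]
for t in threads:
    t.start()
for t in threads:
    t.join()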
Free IP proxy pool
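No pool is implemented here. As a placeholder sketch of the building block such a pool would feed (the proxy addresses and the helper name are made up for illustration), requests can route a request through a proxy picked from a list:

import random
import requests

proxy_pool = ['http://10.0.0.1:8080', 'http://10.0.0.2:3128']  # placeholder addresses

def fetch_via_proxy(url):
    proxy = random.choice(proxy_pool)
    # Route both http and https traffic through the chosen proxy.
    return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)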