Baidu Image Crawler (Python)
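
The script below is a multi-threaded crawler for Baidu image search, written for Python 2. Worker threads pull keyword/page tasks from a MySQL database (sosogif), request Baidu's JSON image interface, download the returned pictures (GIFs are filed separately), and write each picture's metadata back into the database; a simple proxy-rotation scheme switches the HTTP proxy when requests start failing. Two helper modules, imitate_browser and utils, are imported but not included in the post; hypothetical stand-ins for them and a sketch of the assumed table layout are given after the listing.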

#coding:utf-8
"""
Created on 2015-9-17

@author: huangxie
"""
import time,math,os,re,urllib,urllib2,cookielib
from bs4 import BeautifulSoup
import uuid
import json
from threading import Thread
from Queue import Queue
import MySQLdb as mdb
import sys
import threading
import utils
import imitate_browser
from MySQLdb.constants.REFRESH import STATUS

reload(sys)
sys.setdefaultencoding('utf-8')

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'
proxy = {u'http':u'222.39.64.13:8118'}
TOP_URL="http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"
KEYWORD_URL="https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"
61 """
62
63 i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
64
65 'Accept':'json;q=0.9,*/*;q=0.8',
66
67 'Accept-Charset':'utf-8;q=0.7,*;q=0.3',
68
69 'Accept-Encoding':'gzip',
70
71 'Connection':'close',
72
73 'Referer':None #注意如果依然不能抓取的话,这里可以设置抓取网站的host
74
75 }
76
77 """
78
79 i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
80
81
82

def GetDateString():
    x = time.localtime(time.time())
    foldername = str(x.__getattribute__("tm_year"))+"-"+str(x.__getattribute__("tm_mon"))+"-"+str(x.__getattribute__("tm_mday"))
    return foldername

class BaiduImage(threading.Thread):

    def __init__(self):
        Thread.__init__(self)
        self.browser=imitate_browser.BrowserBase()
        self.chance=0
        self.chance1=0
        self.request_queue=Queue()
        self.wait_ana_queue=Queue()
        #self.key_word_queue.put((("动态图", 0, 24)))
        self.count=0
        self.mutex = threading.RLock()  # reentrant lock: the same thread may re-acquire a lock it already holds
        self.commit_count=0
        self.ID=500
        self.next_proxy_set = set()
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')
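
    # Overall flow: start_work() spawns worker threads; each worker loops
    # get_pic() and prepare_request(). prepare_request() takes an unvisited
    # keyword from the `info` table and expands it into per-page tasks
    # (24 images per page); get_pic() pops a task, fetches Baidu's JSON result
    # page (through a proxy picked from the `proxy` table when requests start
    # failing), and parse_json() downloads each image and records its metadata
    # in `pic_info` via anaylis_info().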
"""
def run(self):
while True:
self.get_pic()
"""
def work(self,item):
print "start thread",item
while True: #MAX_REQUEST条以上则等待
self.get_pic()
self.prepare_request()
def format_keyword_url(self,keyword):
return KEYWORD_URL.format(wd=keyword).encode('utf-8')
def generateSeed(self,url):
html = self.browser.openurl(url).read()
if html:
try:
soup = BeautifulSoup(html)
trs = soup.find('div', id='rs').find('table').find_all('tr') #获得所有行
for tr in trs:
ths=tr.find_all('th')
for th in ths:
a=th.find_all('a')[0]
keyword=a.text.strip()
if "动态图" in keyword or "gif" in keyword:
print "keyword",keyword
self.dbcurr.execute('select id from info where word=%s',(keyword))
y = self.dbcurr.fetchone()
if not y:
self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)',(keyword))
self.dbconn.commit()
except:
pass

    def prepare_request(self):
        self.lock()
        self.dbcurr.execute('select * from info where status=0')
        result = self.dbcurr.fetchone()
        if result:
            id,word,status,page_num,left_num,how_many=result
            self.request_queue.put((id,word,page_num))
            if page_num==0 and left_num==0 and how_many==0:  # a fresh keyword: expand it into per-page tasks
                url=self.format_keyword_url(word)
                self.generateSeed(url)
                html=""
                try:
                    url=self.format_top_url(word, page_num, 24)
                    html = self.browser.openurl(url).read()
                except Exception as err:
                    print "err",err
                    #pass
                if html!="":
                    how_many=self.how_many(html)
                    print "how_many",how_many
                    if how_many==None:
                        how_many=0
                    t=math.ceil(how_many/24.0/100)  # only the first 1/100 of the result pages are needed
                    num = int(t)
                    for i in xrange(0,num-1):
                        self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)',(word,0,i*24,num-i,how_many))
                    self.dbcurr.execute('update info SET status=1 WHERE id=%s',(id,))  # mark as visited
                    self.dbconn.commit()
        self.unlock()

    def start_work(self,req_max):
        for item in xrange(req_max):
            t = threading.Thread(target=self.work, args=(item,))
            t.setDaemon(True)
            t.start()

    def lock(self):  # acquire the lock
        self.mutex.acquire()

    def unlock(self):  # release the lock
        self.mutex.release()

    def get_para(self,url,key):
        values = url.split('?')[-1]
        for key_value in values.split('&'):
            value=key_value.split('=')
            if value[0]==key:
                return value[1]
        return None

    def makeDateFolder( self,par,child):
        #self.lock()
        if os.path.isdir( par ):
            path=par + '//' + GetDateString()
            newFolderName = path+'//'+child
            if not os.path.isdir(path):
                os.mkdir(path)
            if not os.path.isdir( newFolderName ):
                os.mkdir( newFolderName )
            return newFolderName
        else:
            return par
        #self.unlock()

    def parse_json(self,data):
        ipdata = json.loads(data)
        try:
            if ipdata['imgs']:
                for n in ipdata['imgs']:  # each picture entry in the result
                    if n['objURL']:
                        try:
                            proxy_support = urllib2.ProxyHandler(proxy)
                            opener = urllib2.build_opener(proxy_support)
                            urllib2.install_opener(opener)
                            #print "proxy",proxy
                            self.lock()
                            self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL'],))
                            y = self.dbcurr.fetchone()
                            #print "y=",y
                            if y:
                                print "database exist"
                                self.unlock()  # unlock before continue
                                continue
                            else:
                                real_extension=utils.get_extension(n['objURL'])
                                req = urllib2.Request(n['objURL'],headers=i_headers)
                                resp = urllib2.urlopen(req,None,5)
                                dataimg=resp.read()
                                name=str(uuid.uuid1())
                                filename=""
                                if len(real_extension)>4:
                                    real_extension=".gif"
                                real_extension=real_extension.lower()
                                if real_extension==".gif":
                                    filename =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                                    self.count+=1
                                else:
                                    filename =self.makeDateFolder("E://sosogif", "o"+str(self.count % 20))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                                    self.count+=1
                                """
                                name=str(uuid.uuid1())
                                filename=""
                                if len(real_extension)>4:
                                    real_extension=".gif"
                                filename =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                                self.count+=1
                                """
                                try:
                                    if not os.path.exists(filename):
                                        file_object = open(filename,'w+b')
                                        file_object.write(dataimg)
                                        file_object.close()
                                        self.anaylis_info(n,filename,real_extension)  # record the picture in the database
                                    else:
                                        print "file exist"
                                except IOError,e1:
                                    print "e1=",e1
                                    pass
                                self.unlock()
                        except IOError,e2:
                            #print "e2=",e2
                            self.unlock()  # release the lock when the download fails after it was taken
                            self.chance1+=1
        except Exception as parse_error:
            print "parse_error",parse_error
            pass

    def title_dealwith(self,title):
        """Strip the <strong>...</strong> markup Baidu puts around the matched keyword in fromPageTitle."""
        #print "title",title
        a=title.find("<strong>")
        temp1=title[0:a]
        b=title.find("</strong>")
        temp2=title[a+8:b]
        temp3=title[b+9:len(title)]
        return (temp1+temp2+temp3).strip()

    def anaylis_info(self,n,filename,real_extension):
        print "success."
        #if self.wait_ana_queue.qsize()!=0:
            #n,filename,real_extension=self.wait.ana_queue.get()
        #self.lock()
        objURL=n['objURL']  # image URL
        fromURLHost=n['fromURLHost']  # host of the source site
        width=n['width']  # width
        height=n['height']  # height
        di=n['di']  # unique identifier
        type=n['type']  # image format
        fromPageTitle=n['fromPageTitle']  # title of the source page
        keyword=self.title_dealwith(fromPageTitle)
        cs=n['cs']  # unknown field
        os=n['os']  # unknown field (shadows the os module inside this method)
        temp = time.time()
        x = time.localtime(float(temp))
        acTime = time.strftime("%Y-%m-%d %H:%M:%S",x)  # crawl time
        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        y = self.dbcurr.fetchone()
        if not y:
            print 'add pic',filename
            self.commit_count+=1
            self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension))
            if self.commit_count==10:
                self.dbconn.commit()
                self.commit_count=0
        #self.unlock()

    def format_top_url(self,word,pn,rn):
        url = TOP_URL.format(word=word, pn=pn,rn=rn).encode('utf-8')
        return url

    def how_many(self,data):
        try:
            ipdata = json.loads(data)
            if ipdata['displayNum']>0:
                how_many=ipdata['displayNum']
                return int(how_many)
            else:
                return 0
        except Exception as e:
            pass

    def get_pic(self):
        """
        word="gif"
        pn=0
        rn=24
        if self.key_word_queue.qsize()!=0:
            word,pn,rn=self.key_word_queue.get()
        url=self.format_top_url(word,pn,rn)
        global proxy
        if url:
            try:
                html=""
                try:
                    req = urllib2.Request(url,headers=i_headers)
                    response = urllib2.urlopen(req, None,5)
                    #print "url",url
                    html = self.browser.openurl(url).read()
                except Exception as err:
                    print "err",err
                    #pass
                if html:
                    how_many=self.how_many(html)
                    #how_many=10000
                    print "how_many",how_many
                    word=self.get_para(url,"word")
                    rn=int(self.get_para(url,"rn"))
                    t=math.ceil(how_many/rn)
                    num = int(t)
                    for item in xrange(0,num-1):
        """
        try:
            global proxy
            print "size of queue",self.request_queue.qsize()
            if self.request_queue.qsize()!=0:
                id,word,page_num = self.request_queue.get()
                u=self.format_top_url(word,page_num,24)
                self.lock()
                self.dbcurr.execute('update info SET status=1 WHERE id=%s',(id,))
                self.dbconn.commit()
                if self.chance >0 or self.chance1>1:  # switch proxy if either failure counter says the current one is bad
                    if self.ID % 100==0:
                        self.dbcurr.execute("select count(*) from proxy")
                        for r in self.dbcurr:
                            count=r[0]
                        if self.ID>count:
                            self.ID=50
                    self.dbcurr.execute("select * from proxy where ID=%s",(self.ID,))
                    results = self.dbcurr.fetchall()
                    for r in results:
                        protocol=r[1]
                        ip=r[2]
                        port=r[3]
                        pro=(protocol,ip+":"+port)
                        if pro not in self.next_proxy_set:
                            self.next_proxy_set.add(pro)
                    self.chance=0
                    self.chance1=0
                    self.ID+=1
                self.unlock()
                proxy_support = urllib2.ProxyHandler(proxy)
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                html=""
                try:
                    req = urllib2.Request(u,headers=i_headers)
                    #print "u=",u
                    response = urllib2.urlopen(req, None,5)
                    html = response.read()
                    if html:
                        #print "html",type(html)
                        self.parse_json(html)
                except Exception as ex1:
                    #print "error=",ex1
                    self.chance+=1
                    if self.chance>0 or self.chance1>1:
                        if len(self.next_proxy_set)>0:
                            protocol,socket=self.next_proxy_set.pop()
                            proxy= {protocol:socket}
                            print "change proxy finished<<",proxy,self.ID
        except Exception as e:
            print "error1",e
            pass

if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)
    #app.generateSeed()
    while 1:
        pass
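
The listing imports two helper modules, imitate_browser and utils, that are not included in the post. The stand-ins below are hypothetical: they are reconstructed only from how the crawler calls them (BrowserBase().openurl(url).read() and utils.get_extension(objURL)), so the author's real implementations may differ. They target the same Python 2 environment as the script above.

#coding:utf-8
# Hypothetical stand-ins for the missing helper modules; only the two entry
# points used by the crawler are sketched.
import os
import urllib2
import urlparse

class BrowserBase(object):
    """Minimal imitate_browser.BrowserBase: open a URL with a browser-like User-Agent."""
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/31.0.1650.48'}

    def openurl(self, url, timeout=10):
        req = urllib2.Request(url, headers=self.headers)
        return urllib2.urlopen(req, None, timeout)  # the caller does .read()

def get_extension(url):
    """Minimal utils.get_extension: file extension of the path part of a URL."""
    path = urlparse.urlparse(url).path
    return os.path.splitext(path)[1]  # e.g. ".gif"; empty string if the URL has no extension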
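
The script also assumes a MySQL database named sosogif with three tables: info (the keyword/page task queue), pic_info (metadata of downloaded pictures) and proxy (candidate HTTP proxies). The exact schema is not part of the post; the sketch below is inferred from the queries in the code, the column types are guesses, and the column order of info follows the tuple unpacked in prepare_request().

#coding:utf-8
# Hypothetical schema for the sosogif database; assumes the database itself
# already exists (CREATE DATABASE sosogif DEFAULT CHARSET utf8).
import MySQLdb as mdb

ddl = [
    """CREATE TABLE IF NOT EXISTS info (
        id INT AUTO_INCREMENT PRIMARY KEY,
        word VARCHAR(255) NOT NULL,        -- search keyword
        status TINYINT NOT NULL DEFAULT 0, -- 0 = not crawled yet, 1 = taken
        page_num INT NOT NULL DEFAULT 0,   -- pn offset passed to Baidu
        left_num INT NOT NULL DEFAULT 0,
        how_many INT NOT NULL DEFAULT 0    -- displayNum reported by Baidu
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS pic_info (
        ID INT AUTO_INCREMENT PRIMARY KEY,
        objURL VARCHAR(1024),
        fromURLHost VARCHAR(255),
        width INT,
        height INT,
        di VARCHAR(64),
        type VARCHAR(16),
        keyword VARCHAR(255),
        cs VARCHAR(64),
        os VARCHAR(64),
        acTime DATETIME,
        filename VARCHAR(512),
        real_extension VARCHAR(16)
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS proxy (
        ID INT AUTO_INCREMENT PRIMARY KEY,
        protocol VARCHAR(16),  -- read as r[1] in get_pic()
        ip VARCHAR(64),        -- r[2]
        port VARCHAR(8)        -- r[3]; text, because the code concatenates ip+":"+port
    ) DEFAULT CHARSET=utf8""",
]

if __name__ == '__main__':
    conn = mdb.connect('127.0.0.1', 'root', 'root', 'sosogif', charset='utf8')
    cur = conn.cursor()
    for stmt in ddl:
        cur.execute(stmt)
    conn.commit()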