crawler
# !/usr/bin/env python
# encoding:UTF-8
from util import request_url
import re
import os
import sys
#from __future__ import print_function
from pptx import Presentation
from pptx.util import Inches
import PIL
class Crawler(object):
    """Scrape mp.weixin.qq.com articles and rebuild their pictures as a .pptx.

    The index page (``main_url``) links to article pages.  For each article
    the title, the text paragraphs and the pictures are extracted with
    regular expressions.  Pictures are scaled to fit a 720x540 box and stored
    in ``data_path/<title>/``; the text goes to ``text_data.txt`` in the same
    directory and the pictures are placed one per slide in
    ``ppt_path/<title>.pptx``.

    Relies on names imported at file level: ``request_url``, ``re``, ``os``
    and ``Presentation``.
    """

    # The HTML entity for an ampersand.  Built by concatenation on purpose:
    # if this source is ever HTML-unescaped (which is what turned the original
    # replace() call into a no-op), the entity must not collapse to a bare "&".
    _AMP_ENTITY = "&" + "amp;"

    def __init__(self):
        """Set the index URL, compile the scraping regexes, set output dirs."""
        self.main_url = "https://mp.weixin.qq.com/s?__biz=MzA3NzIwMDM3OA==&mid=209853452&idx=1&sn=bd40e9622dca2e5bd52af08bbf870861&pass_ticket=8MmcYuwV6RkFHjUHOnxmzVg%2FEhQYTM26Zg%2FO2ZpgJVGyL6ewBt5fJc%2BEsNkytOiN"
        # Article body container.
        self.media_content_pattern = re.compile('<div class="rich_media_content " id="js_content">.*?</div>', re.S)
        # Links to article pages on the index page.
        self.item_pattern = re.compile('<p><a href="(.*?)" target=', re.S)
        self.title_pattern = re.compile('<h2 class="rich_media_title" id="activity-name">(.*?)</h2>', re.S)
        # One styled paragraph; each one is either a picture or a text element.
        self.elements_content_pattern = re.compile('<p style=.*?</p>', re.S)
        self.png_pattern = re.compile('data-src="(.*?)"', re.S)
        self.datatype_pattern = re.compile('data-type="(.*?)"', re.S)
        self.text_pattern = re.compile('>(.*?)</', re.S)
        self.picid_pattern = re.compile('http://mmbiz.qpic.cn/mmbiz/(.*?)/', re.S)
        self.pic_fmt_pattern = re.compile('wx_fmt=(.*?)$', re.S)
        self.data_path = "../data/"
        self.ppt_path = "../ppt/"

    @classmethod
    def _unescape_url(cls, item_url):
        """Return *item_url* with HTML-escaped ampersands turned into "&"."""
        return item_url.replace(cls._AMP_ENTITY, "&")

    @staticmethod
    def _safe_name(title):
        """Return *title* with path separators replaced by "_".

        The title comes from a remote page and is used as a directory and
        file name, so it must not be able to introduce extra path components.
        """
        for sep in ("/", os.sep):
            title = title.replace(sep, "_")
        return title

    @staticmethod
    def _fit_within(w, h, w_box, h_box):
        """Return (width, height) of a w x h image scaled into w_box x h_box.

        The aspect ratio is kept; each side is at least 1 pixel so that very
        elongated images do not produce an invalid zero-sized target.
        """
        # 1.0 forces float division on Python 2.
        factor = min(1.0 * w_box / w, 1.0 * h_box / h)
        return max(1, int(w * factor)), max(1, int(h * factor))

    def get_item_list(self):
        """Fetch the index page and process every article linked from it.

        Returns:
            The list of article URLs exactly as found in the page.  The list
            is empty when the index page could not be fetched or contains no
            article body.
        """
        item_list = []
        ret, main_page = request_url(self.main_url)
        if ret == -1 or main_page == "":
            print("Request main page failed!")
            return item_list
        if not self.media_content_pattern.findall(main_page):
            return item_list
        # NOTE(review): links are looked up in the whole page, not only in
        # the matched content div; kept as in the original code.
        for item_url in self.item_pattern.findall(main_page):
            print(item_url)
            item_list.append(item_url)
            title = self.get_item(item_url)
            # An empty title can be caused by an HTML-escaped URL; retry once
            # with the ampersands unescaped.
            if title == "" and self._AMP_ENTITY in item_url:
                title = self.get_item(self._unescape_url(item_url))
        return item_list

    def _extract_elements(self, media_content):
        """Split an article body into ("png"|"text", data) tuples.

        For "png" the data is the picture URL (with a ``wx_fmt`` query added
        from ``data-type`` when the URL lacks one); for "text" it is the
        newline-joined text fragments of the paragraph.  Elements without
        usable data are dropped.
        """
        elements = []
        for element_content in self.elements_content_pattern.findall(media_content):
            element_data = ""
            # NOTE(review): only http:// picture sources are recognised;
            # anything else is handled as text.
            if 'data-src="http://' in element_content:
                element_type = "png"
                sources = self.png_pattern.findall(element_content)
                if sources:
                    element_data = sources[0]
                    if "wx_fmt" not in element_data:
                        # The format is needed later for the file extension.
                        datatypes = self.datatype_pattern.findall(element_content)
                        if datatypes:
                            element_data += "?wx_fmt=%s" % datatypes[0]
                        else:
                            element_data = ""
            else:
                element_type = "text"
                fragments = self.text_pattern.findall(element_content)
                # Paragraphs with more than 3 fragments are skipped.
                if len(fragments) <= 3:
                    element_data = "\n".join(fragments)
            if element_data:
                elements.append((element_type, element_data))
        return elements

    def get_item(self, item_url):
        """Download one article and build its text file and .pptx deck.

        Returns:
            "bad"   when the page could not be fetched,
            ""      when no title was found,
            "exist" when the target .pptx file already exists,
            None    when the article body was not found,
            otherwise the article title.
        """
        ret, item_page = request_url(item_url)
        if ret == -1 or item_page == "":
            print("Request item page failed! %s" % item_url)
            return "bad"

        titles = self.title_pattern.findall(item_page)
        title = titles[0].strip().replace("(ppt)", "") if titles else ""
        if title == "":
            print("title is null!%s" % item_url)
            return ""

        safe_title = self._safe_name(title)
        item_path = self.data_path + safe_title
        ppt_file = self.ppt_path + safe_title + ".pptx"
        if os.path.exists(ppt_file):
            return "exist"

        contents = self.media_content_pattern.findall(item_page)
        if not contents:
            return None

        elements = self._extract_elements(contents[0])
        if elements:
            if not os.path.exists(item_path):
                os.makedirs(item_path)
            text_parts = []
            picfile_list = []
            for element_type, element_data in elements:
                if element_type == "text":
                    text_parts.append(element_data)
                    continue
                picfile = self.download_pic(element_data, item_path)
                if picfile != "":
                    picfile_list.append(picfile)
            self.write_text_content("".join(text_parts), item_path)
            self.creat_ppt(title, ppt_file, picfile_list)
        return title

    def _pic_name_parts(self, url):
        """Return (picid, fmt) derived from a picture URL, or None.

        ``picid`` is the id segment of an mmbiz URL, or the URL itself with
        "/", ":", "." and "?" replaced by "_" when that segment is missing.
        ``fmt`` is the ``wx_fmt`` query value; None is returned without it.
        """
        ids = self.picid_pattern.findall(url)
        if ids and ids[0] != "":
            picid = ids[0]
        else:
            picid = url.replace("/", "_").replace(":", "_").replace(".", "_").replace("?", "_")
        fmts = self.pic_fmt_pattern.findall(url)
        if not fmts or fmts[0] == "":
            return None
        fmt = fmts[0].split("&")[0].split("?")[0]
        return picid, fmt

    def download_pic(self, url, path):
        """Download a picture, scale it into a 720x540 box and store it.

        Args:
            url: picture URL; must carry a ``wx_fmt`` query value, which is
                used as the file extension.
            path: directory that receives the scaled picture.

        Returns:
            The path of the stored picture, or "" when the download failed,
            the format could not be determined or the data is not an image.
        """
        from PIL import Image  # "import PIL" alone does not load PIL.Image

        ret, pic_page = request_url(url)
        if ret == -1 or pic_page == "":
            return ""
        name_parts = self._pic_name_parts(url)
        if name_parts is None:
            print("Get pic fmt failed!%s" % url)
            return ""
        picid, fmt = name_parts

        filename_bak = self.data_path + picid + "_bak.%s" % fmt
        filename = path + "/" + picid + ".%s" % fmt
        # Binary mode: the payload is raw image bytes.
        with open(filename_bak, "wb") as fp:
            fp.write(pic_page)
        try:
            pil_image = Image.open(filename_bak)
            w, h = pil_image.size
            self.resize(w, h, 720, 540, pil_image, filename)
        except (IOError, OSError):
            # Not a decodable image: skip it like any other failed download.
            print("Open pic failed!%s" % url)
            return ""
        finally:
            # The unscaled copy is only a temporary file.  os.remove is used
            # instead of a shell command because the name comes from the URL.
            if os.path.exists(filename_bak):
                os.remove(filename_bak)
        return filename

    def resize(self, w, h, w_box, h_box, pil_image, outfile):
        """Scale *pil_image* to fit a w_box x h_box box and save it.

        The aspect ratio is retained.  *w* and *h* are the current size of
        the image; the result is written to *outfile*.
        """
        from PIL import Image  # "import PIL" alone does not load PIL.Image

        width, height = self._fit_within(w, h, w_box, h_box)
        # ANTIALIAS is the old name of LANCZOS and no longer exists in
        # current Pillow releases; fall back to it only on old versions.
        resample = getattr(Image, "LANCZOS", None)
        if resample is None:
            resample = Image.ANTIALIAS
        out = pil_image.resize((width, height), resample)
        out.save(outfile)

    def write_text_content(self, text_data, path):
        """Write *text_data* to ``text_data.txt`` inside directory *path*."""
        filename = path + "/" + "text_data.txt"
        with open(filename, "w") as fp:
            fp.write(text_data)

    def creat_ppt(self, title_content, ppt_file, picfile_list):
        """Create *ppt_file* from the ``default.pptx`` template.

        The deck gets a title slide (layout 0) showing *title_content*,
        followed by one slide of layout 6 per picture in *picfile_list* with
        the picture placed at the top-left corner.
        """
        prs = Presentation("default.pptx")
        title_slide = prs.slides.add_slide(prs.slide_layouts[0])
        title_slide.shapes.title.text = title_content
        graph_slide_layout = prs.slide_layouts[6]
        for picfile in picfile_list:
            slide = prs.slides.add_slide(graph_slide_layout)
            slide.shapes.add_picture(picfile, 0, 0)
        # Saving fails when the output directory is missing, so create it.
        out_dir = os.path.dirname(ppt_file)
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        prs.save(ppt_file)
def test_ppt():
    """Probe slide layouts 0-11 of ``default.pptx``.

    For every layout index a fresh copy of the template gets one slide of
    that layout and is saved as ``../ppt/<index>.pptx``.  Indices for which
    any step fails (for example a missing layout or a slide without
    placeholder 0) are skipped.
    """
    for i in range(12):
        try:
            # Reload the template each time so every file holds one slide.
            prs = Presentation("default.pptx")
            slide = prs.slides.add_slide(prs.slide_layouts[i])
            # Looked up only to skip layouts that have no placeholder 0.
            _ = slide.placeholders[0]
            prs.save("../ppt/%s.pptx" % i)
        except Exception:
            # Best-effort probe: skip this layout, but let KeyboardInterrupt
            # and SystemExit through (a bare except would swallow them).
            continue
def test_layout(i):
    """Save ``../ppt/<i>.pptx`` with one slide of layout *i*.

    ``test.jpeg`` is inserted into placeholder 1 of the new slide; the deck
    is based on the ``default.pptx`` template.
    """
    image_name = "test.jpeg"
    deck = Presentation("default.pptx")
    new_slide = deck.slides.add_slide(deck.slide_layouts[i])
    new_slide.placeholders[1].insert_picture(image_name)
    deck.save("../ppt/%s.pptx" % i)
def test_empty_layout():
    """Save ``../ppt/6.pptx``: one slide of layout 6 with ``test.jpeg``.

    The picture is added as a free shape at position (0, 0); the deck is
    based on the ``default.pptx`` template.
    """
    layout_index = 6
    deck = Presentation("default.pptx")
    new_slide = deck.slides.add_slide(deck.slide_layouts[layout_index])
    new_slide.shapes.add_picture("test.jpeg", 0, 0)
    deck.save("../ppt/%s.pptx" % layout_index)
if __name__ == "__main__":
    # Script entry point: convert one hard-coded article.
    # Alternative manual runs:
    #   test_empty_layout()
    #   test_layout(int(sys.argv[1]))
    #   test_ppt()
    #   crawler.get_item_list()
    crawler = Crawler()
    item_url = "http://mp.weixin.qq.com/s?__biz=MzA3NzIwMDM3OA==&mid=206906414&idx=1&sn=484555cf9c8efd164d06f6f6d0a6c19e&scene=21#wechat_redirect"
    # Unescape HTML-escaped ampersands in a URL copied from page source.
    # The entity is built by concatenation so that HTML-unescaping of this
    # file cannot turn the call back into the no-op replace("&", "&").
    item_url = item_url.replace("&" + "amp;", "&")
    crawler.get_item(item_url)
    print("done!")
util.py
# !/usr/bin/env python
# encoding:UTF-8
import urllib2
def request_url(url, repeat=3):
    """Fetch *url*, trying up to *repeat* times.

    Args:
        url: address to request.
        repeat: maximum number of attempts.

    Returns:
        A tuple ``(ret, content)``: ``ret`` is 0 on success and -1 when every
        attempt failed; ``content`` is the response body, or '' when nothing
        was read.
    """
    ret = -1  # failure
    content = ''
    for _ in range(repeat):
        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            try:
                content = response.read()
            finally:
                # Close the connection even when read() fails.
                response.close()
            ret = 0  # success
            break
        except Exception:
            # Any request error just triggers the next attempt; unlike a bare
            # except this lets KeyboardInterrupt and SystemExit propagate.
            continue
    return (ret, content)
crawler的更多相关文章
- A web crawler design for data mining
Abstract The content of the web has increasingly become a focus for academic research. Computer prog ...
- [CareerCup] 10.5 Web Crawler 网络爬虫
10.5 If you were designing a web crawler, how would you avoid getting into infinite loops? 这道题问如果让我们 ...
- Crawler & Ajax:WebBrowser C#
Crawler 與 Ajax http://net.zdnet.com.cn/network_security_zone/2007/1005/536329.shtml WebBrowser: 利用We ...
- (92) Web Crawling: How can I build a web crawler from scratch? - Quora
(92) Web Crawling: How can I build a web crawler from scratch? - Quora How can I build a web crawler ...
- (92) Is there a better crawler than Scrapy? - Quora
(92) Is there a better crawler than Scrapy? - Quora Is there a better crawler than Scrapy?Edit
- 使用Crawler框架搭建自己的爬虫框架MyCrawler
自己写一个爬虫框架的目的: 完美架构 在实际的数据采集编码过程中,发现代码比较乱,抓取数据,存储数据的代码混杂在一起,为了构建比较完美的数据采集框架 敏捷开发 将数据采集进行标准流程化,每个标准流程都 ...
- Py之Crawler:爬虫利用随机选取代理访问服务器的方法实现下载某网址上所有的图片到指定文件夹——Jason niu
#Py之Crawler:爬虫利用随机选取代理访问服务器的方法实现下载某网址上所有的图片到指定文件夹 import urllib.request import os import random def ...
- 使用Node.js搭建数据爬虫crawler
0. 通用爬虫框架包括: (1) 将爬取url加入队列,并获取指定url的前端资源(crawler爬虫框架主要使用Crawler类进行抓取网页) (2)解析前端资源,获取指定所需字段的值,即获取有价值 ...
- [开源 .NET 跨平台 Crawler 数据采集 爬虫框架: DotnetSpider] [一] 初衷与架构设计
[DotnetSpider 系列目录] 一.初衷与架构设计 二.基本使用 三.配置式爬虫 四.JSON数据解析与配置系统 五.如何做全站采集 为什么要造轮子 同学们可以去各大招聘网站查看一下爬虫工程师 ...
随机推荐
- 10款基于jquery的web前端特效及源码下载
1.jQuery时间轴插件:jQuery Timelinr 这是一款可用于展示历史和计划的时间轴插件,尤其比较适合一些网站展示发展历程.大事件等场景.该插件基于jQuery,可以滑动切换.水平和垂直滚 ...
- 10款免费CSS编辑器应对于Linux和Ubuntu
您是否在使用Linux和Ubuntu的,不知道在哪里可以找到一些优秀且免费的CSS编辑器用于Linux和Ubuntu的?如果你的答案是肯定的,然后停止幻想,开始浏览这个帖子里,我们展示了前10名,并免 ...
- HDU 1954 Subway tree systems (树的最小表示法)
题意:用一个字符串表示树,0代表向下走,1代表往回走,求两棵树是否同构. 分析:同构的树经过最小表示会转化成两个相等的串. 方法:递归寻找每一棵子树,将根节点相同的子树的字符串按字典序排列,递归回去即 ...
- Centos文本方式安装情况下lvm分区的创建
作者:马 岩(Furzoom) (http://www.cnblogs.com/furzoom/)版权声明:本文的版权归作者与博客园共同所有.转载时请在明显地方注明本文的详细链接,未经作者同意请不要删 ...
- UINavigationController 与 UITabBarController
http://www.cnblogs.com/YouXianMing/p/3756904.html // index start from 1. UITabBarItem *newsItem = [[ ...
- linux服务器修改ssh默认22端口方法
1.登录服务器,打开sshd_config文件 # vim /etc/ssh/sshd_config 2.找到#Port 22,默认是注释掉的,先把前面的#号去掉,再插入一行设置成你想要的端口号,注意 ...
- 工厂方法模式与IoC/DI控制反转和依赖注入
IoC——Inversion of Control 控制反转 DI——Dependency Injection 依赖注入 要想理解上面两个概念,就必须搞清楚如下的问题: 参与者都有谁? 依赖:谁 ...
- setEllipsize(TruncateAt where)
void android.widget.TextView.setEllipsize(TruncateAt where) public void setEllipsize (TextUtils.Trun ...
- Delphi XE5教程12:注释和编译器指示字
内容源自Delphi XE5 UPDATE 2官方帮助<Delphi Reference>,本人水平有限,欢迎各位高人修正相关错误!也欢迎各位加入到Delphi学习资料汉化中来,有兴趣者可 ...
- 【转】RunTime.getRunTime().addShutdownHook用法
Runtime.getRuntime().addShutdownHook(shutdownHook); 这个方法的含义说明: 这个方法的意思就是在jvm中增加一个关闭的钩子,当jvm关闭的时候,会执行 ...