#coding:utf-8
import re, os, shutil, sys
import urllib2, socket, cookielib
from threading import Thread, stack_size, Lock
from Queue import Queue
import time
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(
                fileobj=StringIO(resp.read()),
                mode="r"
            )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO(deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
    try:             # so on top of all there's this workaround:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)
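
# --- usage sketch (not part of the original flow): how the handler above
# plugs into urllib2; the URL is an illustrative placeholder ---
def _gzip_opener_demo():
    # build_opener() chains the handler with the default ones, so any
    # gzip/deflate response body comes back transparently decompressed
    opener = urllib2.build_opener(ContentEncodingProcessor())
    return opener.open('http://example.com').read()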

class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self, timeout=10, threads=None, stacksize=32768*16, loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support, encoding_support, urllib2.HTTPHandler)
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads   # always set, so push() can test it safely
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        # requests still queued + answers not yet popped + requests in flight
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req, repeat=3):
        if not self.threads:
            print 'no thread, return get instead'
            return self.get(req, repeat)
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True, timeout=10)
            self.q_ans.task_done()
        except Exception:
            data = ['', '']
        return data

    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got', req
            self.q_ans.put((req, ans))
            try:
                self.q_req.task_done()
            except Exception:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1)  # don't spam

    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception as what:
            print what
            return False

    def get(self, req, repeat=3):
        '''
        http GET a req (url string or urllib2.Request);
        retries up to `repeat` more times on failure;
        returns the html text on success, '' on failure
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception as what:
            print what, req
            if repeat > 0:
                return self.get(req, repeat-1)
            else:
                print 'GET Failed', req
                return ''
        return data

    def post(self, req, repeat=3):
        '''
        http POST the req (a urllib2.Request with data set);
        returns the response text when non-empty, True when it is empty,
        and False when the argument is not a urllib2.Request
        '''
        if not isinstance(req, urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req, repeat)
            if r:
                return r
            else:
                return True
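
# --- usage sketch: the multi-threaded pattern from the docstring above,
# spelled out; `urls` and the result dict are illustrative placeholders ---
def _fetcher_demo(urls):
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    results = {}
    while f.taskleft():
        url, html = f.pop()   # blocks up to 10s, then yields ['','']
        results[url] = html
    return results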

class SiteCopyer:
    def __init__(self, url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)
        except Exception as what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception as what:
            print what

    def full_link(self, link, baseurl=None):
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?', 1)[0]
        if not link.startswith('http://'):
            if link.startswith('/'):
                link = '/'.join(baseurl.split('/', 3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/', 2)[0]
                    link = link[3:]
                link = baseurl + '/' + link
            else:
                link = baseurl.rsplit('/', 1)[0] + '/' + link
        return link
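
    # worked examples for full_link() (illustrative values), assuming
    # baseurl = 'http://host/a/b/page.html':
    #   '/x.css'     -> 'http://host/x.css'
    #   '../x.css'   -> 'http://host/a/x.css'
    #   'img/x.png'  -> 'http://host/a/b/img/x.png'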

    def link_alias(self, link):
        link = self.full_link(link)
        name = link.rsplit('/', 1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]
            alias = '/media/css/' + name
        elif '.js' in name:
            name = name[:name.find('.js')+3]
            alias = '/media/js/' + name
        else:
            alias = '/media/image/' + name
        return alias

    def strip_link(self, link):
        if link and (link[0] in ['"', "'"]):
            link = link[1:]
        while link and (link[-1] in ['"', "'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ['<', "'", '"']) and ('feed' not in link):
            return link
        else:
            return ''
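
    # worked examples for strip_link() (illustrative values):
    #   '"/media/a.css"'    -> '/media/a.css'    (surrounding quotes stripped)
    #   'logo.png/'         -> 'logo.png'        (trailing slash stripped)
    #   'http://host/feed/' -> ''                (feed links are skipped)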

    def copy(self):
        page = self.f.get(self.baseurl)
        # collect css/js/image references from the start page
        links = re.compile(r'<link[^>]*href=(.*?)[ >]', re.I).findall(page)
        links.extend(re.compile(r'<script[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page))
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        # rewrite references to local media/ paths and queue the downloads
        for link in set(links):
            page = page.replace(link, self.link_alias(link)[1:])
            self.f.push(self.full_link(link))
        open(self.home+'/index.html', 'w').write(page)
        while self.f.taskleft():
            url, page = self.f.pop()
            if url.endswith('.css'):
                # css files may pull in further assets via url(...)
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in set(links):
                    self.f.push(self.full_link(link, url))
                    page = page.replace(link, self.link_alias(link)[1:].replace("media", ".."))
            print 'write to', self.home + self.link_alias(url)
            try:
                open(self.home + self.link_alias(url), 'w').write(page)
            except Exception as what:
                print what

if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python " + sys.argv[0] + " url"
