This is an alternative to OfflineExplorer.

Thanks to the article[1] listed in the Reference section; I modified several of its lines to adapt the script to my blog. Here is the change list (the line numbers refer to the original article's listing; a short snippet just before the full script shows these spots in context):

1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the page source of my cnblogs blog.

2. L394: set url to a list page of your blog near the end; as noted in the script, that page must contain a link to the last page.

3. L396: set output to a directory on your local disk.

Enjoy it!
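For quick reference, the three adjusted spots look roughly like this in the listing below (the pager id in getAllListUrl, and the url/output settings in the __main__ block); adapt url and output to your own blog and disk:

    # in getAllListUrl(): match the pager <div> that actually exists in the cnblogs page source
    pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))

    # in the __main__ block: a list page of your blog, and a local output directory
    url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
    output = "/tmp/my_tmp/cnblogs"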

# -*- coding: utf-8 -*-

# cnblogs blog backup. Usage: edit the url and output values at the bottom of this file, then run it.

import urllib2
import re
import os
import sys
# from HTMLParser import HTMLParser
import html5lib
# from xml.etree.ElementTree import ElementTree
from urlparse import urlparse
import xml
import codecs
import traceback
import time

# class MyHTMLParser(HTMLParser):
#     def handle_starttag(self, tag, attrs):
#         # if tag.lower() == "img":
#         print "Encountered the beginning of a %s tag, attrs size %d" % (tag, len(attrs))
#         for x in attrs:
#             print "name %s, value %s" % (x[0], x[1])
#     def handle_endtag(self, tag):
#         print "Encountered the end of a %s tag" % tag
#     def handle_startendtag(self, tag, attrs):
#         print "Encountered the beginning of a %s tag, attrs size %d" % (tag, len(attrs))
#         for x in attrs:
#             print "name %s, value %s" % (x[0], x[1])

# number of download attempts per resource (资源尝试次数)
gTestTime = 5

def DownloadFile(url, output):
    responseText = None
    dirssPath = None
    try:
        res = urlparse(url)
        url = res.scheme + "://" + res.netloc + res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
        dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
        dirss_ansi = dirss.decode('utf-8')
        if not os.path.exists(dirss_ansi):
            os.makedirs(dirss_ansi)
        global gTestTime
        count = gTestTime
        while True:
            if count < 0:
                break
            count = count - 1
            header = {"User-Agent": "Mozilla-Firefox5.0"}
            if not url.startswith("http://"):
                break
            try:
                # print "url: %s:%d" % (url, count)
                time.sleep(0.5)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi, "wb")
                    responseText = response.read()
                    if url.endswith(".js"):
                        responseText = responseText.replace("http://", "")
                        responseText = responseText.replace("https://", "")
                    resourceFile.write(responseText)
                    resourceFile.close()
                break
            except Exception, e:
                print "DownloadFile: %s:%s:%d" % (e, url, count)
                # exstr = traceback.format_exc()
                # print exstr
    except Exception, e:
        pass
        # exstr = traceback.format_exc()
        # print exstr
    return (responseText, url, output)


def ReadCss(css):
# print "ReadCss"
mode = 'url\(\"?([^)]+)\"?\)'
pattern = re.compile(mode)
try:
text = css[0]
if css[0] == None:
return
strMatch = pattern.findall(text)
size = len(strMatch)
# print "size: ",size
for i in range(0,size,1):
one = strMatch[i]
newurl = GetConcatUrl(css[1],one)
DownloadFile(newurl,css[2])
except Exception,e:
pass
# exstr = traceback.format_exc()
# print exstr def Download(url,output):
    # try:
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()
    document = html5lib.parse(data)

    # download the images hosted on cnblogs
    imgElements = document.findall('.//{0}img'.format(namespace))
    # print "imgElements %d" % len(imgElements)
    for img in imgElements:
        src = img.attrib["src"]
        # print "src %s" % src
        try:
            res = urlparse(src)
            # skip images that are not hosted on cnblogs (非cnblogs的图片不下载)
            if not res.netloc.endswith(".cnblogs.com"):
                print "image not download: %s:%s" % (src, res.netloc)
                continue
        except Exception, e:
            pass
        DownloadFile(src, output)

    # download linked resources (stylesheets etc.) and the resources referenced inside the CSS
    linkElements = document.findall('.//{0}link'.format(namespace))
    # print "linkElements %d" % len(linkElements)
    for link in linkElements:
        href = link.attrib["href"]
        # print "href %s" % href
        text = DownloadFile(href, output)
        if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
            ReadCss(text)

    # download scripts
    scriptElements = document.findall('.//{0}script'.format(namespace))
    # print "scriptElements %d" % len(scriptElements)
    for script in scriptElements:
        if script.attrib.has_key("src"):
            src = script.attrib["src"]
            # print "src %s" % src
            DownloadFile(src, output)

    # save the page itself, with protocol prefixes stripped so links become relative
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = GetHtmlName(url)
    output = output.decode("utf-8") + "/" + htmlName + ".htm"
    data = data.replace("http://", "")
    data = data.replace("https://", "")
    data = data.replace("www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml")

    resourceFile = open(output, "wb")
    resourceFile.write(data)
    resourceFile.close()


def GetConcatUrl(url, png):
    # e.g. png: "../images/f_icon.png" -- url: http://static.csdn.net/public/common/toolbar/css/index.css
    count = 0
    index = png.find("..")
    startindex = None
    while index != -1:
        count = count + 1
        startindex = index + 2
        index = png.find("..", startindex)
    second = png[startindex:]
    length = len(url)
    index = url.rfind("/")
    endindex = 0
    while count >= 0 and index != -1:
        endindex = index
        index = url.rfind("/", 0, endindex)
        count = count - 1
    first = url[0:endindex]
    return first + second


def getAllListUrl(url):
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()
    # By default, the document will be an xml.etree element instance. Whenever possible,
    # html5lib chooses the accelerated ElementTree implementation
    # (i.e. xml.etree.cElementTree on Python 2.x).
    document = html5lib.parse(data)
    namespace = "{http://www.w3.org/1999/xhtml}"
    # get the bottom pager, e.g. <div id="homepage1_HomePageDays_BottomPager" class="topicListFooter">
    pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
    print("Debug>len(pageList)=%d" % len(pageList))
    # get <div class="pager">
    alinks = list(pageList[0])
    # get the links inside <div class="pager">, like: <a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
    alinks1 = list(alinks[0])
    lastArticle = alinks1[len(alinks1) - 1]
    # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
    lastArticleHref = lastArticle.attrib["href"]
    lastPageIndex = lastArticleHref.rfind("=")
    lastPageNum = int(lastArticleHref[lastPageIndex + 1:])
    urlInfo = lastArticleHref[0:lastPageIndex]
    # build the URL of every list page, page=1 .. page=lastPageNum
    urlList = []
    for x in xrange(1, lastPageNum + 1):
        listUrl = urlInfo + "=" + str(x)
        urlList.append(listUrl)
    return urlList


def getArticleList(url):
    # collect the URL of every article (获取所有的文章url)
    # HTML structure notes kept from the original script:
    # <div id="article_toplist" class="list"></div>
    # <div id="article_list" class="list">
    #   <div class="list_item article_item">
    #     <div class="article_title">
    #       <span class="ico ico_type_Original"></span>
    #       <h1><span class="link_title"><a href="/infoworld/article/details/18984183">
    #     <div class="article_manage">
    #       <span class="link_postdate"></span>
    urlList = getAllListUrl(url)
    print "文章页数(number of pages) ", len(urlList)
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    allLists = []
    strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")  # "parsing page {0}"
    pageNum = 0
    global gTestTime
    for one in urlList:
        tryCount = gTestTime  # try count
        pageNum = pageNum + 1
        pageNumStr = strPage.format(pageNum)
        print pageNumStr
        while tryCount > 0:
            try:
                tryCount = tryCount - 1
                time.sleep(0.5)  # requesting too fast makes the server stop responding
                request = urllib2.Request(one, None, header)
                response = urllib2.urlopen(request)
                data = response.read()
                document = html5lib.parse(data, encoding="utf-8")
                namespace = "{http://www.w3.org/1999/xhtml}"
                # .//{0}div[@id=\'article_toplist\']
                # topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
                # articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
                articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
                allLists = allLists + articleLists
                break
            except Exception, e:
                print "getArticleList %s:%s:%d" % (e, one, tryCount)

    count = 0  # number of articles (文章数)
    artices = []
    for article in allLists:
        count = count + 1
        alink = article.find(".//{0}a".format(namespace))
        # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
        href = alink.attrib["href"]
        # oneHref = "http://blog.csdn.net" + href
        oneHref = href
        childElement = list(alink)
        linkIter = alink.itertext()
        title = "".encode("utf-8")
        for x in linkIter:
            title = title + x.strip().encode("utf-8")
        artices.append([oneHref, title])
    return artices


def GetUserName(url):
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = ""
    htmlNameIndex1 = url.rfind("/", 0, htmlNameIndex)
    htmlName = url[htmlNameIndex1 + 1:htmlNameIndex]
    # if htmlNameIndex + 1 == urlLen:
    #     htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
    #     htmlName = url[htmlNameIndex + 1:urlLen - 1]
    # else:
    #     htmlName = url[htmlNameIndex + 1:]
    return htmlName


def GetHtmlName(url):
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = ""
    if htmlNameIndex + 1 == urlLen:
        htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
        htmlName = url[htmlNameIndex + 1:urlLen - 1]
    else:
        htmlName = url[htmlNameIndex + 1:]
    return htmlName


# The url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page
# must contain a link to the last page. For example, if GnagWang has 20 pages in total,
# a URL like the one above is recommended.
def Start(url, output):
    print "备份开始"  # backup started
    lists = getArticleList(url)
    username = GetUserName(url)
    output_username = output + "/" + username
    output_username = output_username.replace("\\", "/")
    if not os.path.exists(output_username.decode("utf-8")):
        os.mkdir(output_username.decode("utf-8"))
    totalNum = len(lists)
    print "总文章数(number of articles): %d" % totalNum

    # generate the index page (生成首页文件)
    doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
    charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
    indexHtml = output_username + ".htm"
    f = open(indexHtml.decode("utf-8"), "w")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '</head>'
    print >> f, '<frameset cols=\"20%,*\">'
    navigationHtmlName = username + '-navigation.htm'
    print >> f, '<frame src=\"' + navigationHtmlName + '\" />'
    firstHtmlName = GetHtmlName(lists[0][0])
    print >> f, '<frame src=\"' + username + '/' + firstHtmlName + '.htm\" name=\"showframe\">'
    print >> f, '</frameset>'
    print >> f, '</html>'
    f.close()

    # generate the navigation page (生成导航文件)
    navigationHtml = output + "/" + navigationHtmlName
    # f = open(navigationHtml.decode("utf-8"), "w")
    f = codecs.open(navigationHtml.decode("utf-8"), "w", "utf-8-sig")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
    print >> f, '</head>'
    print >> f, '<body>'
    count = 0
    for x in lists:
        count = count + 1
        articleIdHtml = username + "/" + GetHtmlName(x[0]) + ".htm"
        print >> f, '<a href=\"' + articleIdHtml + '\" target=\"showframe\">' + str(count) + '.' + x[1].decode("utf-8") + '</a><br /><br />'
    print >> f, '</body>'
    print >> f, '</html>'
    f.close()

    print "开始下载文章"  # start downloading the articles
    currentNum = 0
    strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
    global gTestTime
    for x in lists:
        count = gTestTime
        currentNum = currentNum + 1
        while True:
            if count < 0:
                break
            count = count - 1
            try:
                time.sleep(1)  # requesting too fast makes the server return 503 errors
                strPageTemp = strPage.format(totalNum, currentNum)
                strPageTemp = strPageTemp + x[1]
                print strPageTemp  # may occasionally fail with "output is not utf-8"; print x[0] when debugging
                print "\n"
                Download(x[0], output_username)
                break
            except Exception, e:
                # exstr = traceback.format_exc()
                # print exstr
                pass


# As noted above, the url must be a list page like http://www.cnblogs.com/GnagWang/default.html?page=21
# that contains a link to the last page.
if __name__ == '__main__':
    url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
    # output = "C:/Users/apple/Desktop/新建文件夹"
    output = "/tmp/my_tmp/cnblogs"
    Start(url, output)
    # Download("http://blog.csdn.net/dcraw/article/details/6858820",
    #          "C:/Users/apple/Desktop/新建文件夹/infoworld")

Reference:

[1] http://blog.csdn.net/llrraa2010/article/details/35540845
