How to backup your blogs on cnblogs

This is an alternative to OfflineExplorer.

Thanks to the article [1] listed in the Reference section; I modified several lines of its script to adapt it to my blog. Here is the change list:

1. L193, change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager". Because I can't find "homepage1_BottomPager" in the source code of my cnblog web page at all.

2. L394, set url to your last page.

3. L396, set the output directory on your local disk.

Enjoy it!

 #! encoding=utf-8

 #cnblogs blog backup. Usage: edit the url and output values at the bottom of the file, then run the script.

 import urllib2

 import re

 import os

 import sys

 # from HTMLParser import HTMLParser

 import html5lib

 # from xml.etree.ElementTree import ElementTree

 from urlparse import urlparse

 import xml

 import codecs

 import traceback

 import time

 # class MyHTMLParser(HTMLParser):

 #     def handle_starttag(self, tag, attrs):

 #         # if tag.lower() == "img":

 #             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))

 #             for x in attrs:

 #                 print "name %s,value %s" % (x[0],x[1])

 #     def handle_endtag(self, tag):

 #         print "Encountered the end of a %s tag" % tag

 #     def handle_startendtag(self, tag, attrs):

 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))

 #         for x in attrs:

 #             print "name %s,value %s" % (x[0],x[1])

 # Retry budget for network requests; read by DownloadFile, getArticleList and Start.

 gTestTime = 5

 def DownloadFile(url,output):
     """Fetch one resource and mirror it on disk under ``output``.

     The query string and fragment of ``url`` are dropped, and the body is
     stored at ``<output>/<host>/<path>``.  For ``.js`` resources every
     ``http://`` and ``https://`` prefix is stripped from the body before it
     is written.

     Nothing is fetched when the scheme is not http/https, when the URL has
     no path, or when the local file already exists.  Up to ``gTestTime``
     attempts are made, 0.5 s apart.

     Returns:
         ``(responseText, url, output)``.  ``responseText`` is the body that
         was written, or None when nothing was downloaded.  ``url`` is the
         normalised URL and ``output`` is passed through unchanged.
     """
     responseText = None
     try:
         res = urlparse(url)
         url = res.scheme+"://"+res.netloc+res.path
         path = res.path
         index = path.rfind('/')
         # Decide before touching the disk, so unsupported or path-less URLs
         # neither create directories nor trigger requests.
         if res.scheme not in ("http","https") or index == -1:
             return (responseText,url,output)

         host = res.netloc.encode("utf-8")
         dirss = output + "/" + host + path[0:index].encode("utf-8")
         dirssPath = output + "/" + host + path.encode("utf-8")
         dirss_ansi = dirss.decode('utf-8')
         dirssPath_ansi = dirssPath.decode("utf-8")
         if not os.path.exists(dirss_ansi):
             os.makedirs(dirss_ansi)
         # Already mirrored (or the path names a directory): skip the request.
         if os.path.exists(dirssPath_ansi):
             return (responseText,url,output)

         header={"User-Agent": "Mozilla-Firefox5.0"}
         for remaining in range(gTestTime-1,-1,-1):
             try:
                 time.sleep(0.5)
                 request = urllib2.Request(url,None,header)
                 response = urllib2.urlopen(request)
                 try:
                     body = response.read()
                 finally:
                     response.close()
                 if url.endswith(".js"):
                     body = body.replace("http://","").replace("https://","")
                 # Open the file only after the body is fully read, so a
                 # failed read cannot leave an empty file that blocks retries.
                 with open(dirssPath_ansi,"wb") as resourceFile:
                     resourceFile.write(body)
                 responseText = body
                 break
             except Exception as e:
                 print("DownloadFile: %s:%s:%d" % (e,url,remaining))
     except Exception:
         # Best effort: one broken resource must not abort the backup, but
         # the failure should at least be visible.
         traceback.print_exc()
     return (responseText,url,output)

 def ReadCss(css):
     """Download every resource referenced through ``url(...)`` in a stylesheet.

     Args:
         css: the tuple returned by DownloadFile, i.e.
             ``(stylesheet_text, stylesheet_url, output_dir)``.  Nothing is
             done when ``stylesheet_text`` is None.

     Each reference is resolved against the stylesheet URL with GetConcatUrl
     and fetched with DownloadFile.  Inline ``data:`` URIs are skipped.
     """
     text = css[0]
     if text is None:
         return
     # Optional single or double quotes are matched outside the capture group
     # so they never end up in the reference.
     pattern = re.compile(r'url\(\s*[\'"]?([^\'")]+)[\'"]?\s*\)')
     for one in pattern.findall(text):
         one = one.strip()
         if not one or one.lower().startswith("data:"):
             continue
         try:
             newurl = GetConcatUrl(css[1],one)
             DownloadFile(newurl,css[2])
         except Exception as e:
             # Best effort: report and carry on with the remaining references.
             print("ReadCss: %s:%s" % (e,one))

 def Download(url,output):
     """Save one article page and the resources it references.

     Steps:
       1. Fetch ``url`` and parse it with html5lib.
       2. Download every ``<img>`` whose host ends with ``.cnblogs.com``.
       3. Download every ``<link>`` target; stylesheets are also scanned
          with ReadCss for ``url(...)`` references.
       4. Download every ``<script>`` that has a ``src``.
       5. Write the page to ``<output>/<GetHtmlName(url)>.htm`` with all
          ``http://`` / ``https://`` prefixes removed (the XHTML namespace
          URI is restored afterwards).

     Elements missing the relevant attribute are skipped.  Errors from
     fetching or writing the page itself propagate to the caller.
     """
     header={"User-Agent": "Mozilla-Firefox5.0"}
     namespace = "{http://www.w3.org/1999/xhtml}"
     request = urllib2.Request(url,None,header)
     response = urllib2.urlopen(request)
     try:
         data = response.read()
     finally:
         response.close()
     document = html5lib.parse(data)

     for img in document.findall('.//{0}img'.format(namespace)):
         src = img.attrib.get("src")
         if not src:
             continue
         try:
             res = urlparse(src)
             # Images hosted outside cnblogs are not downloaded.
             if not res.netloc.endswith(".cnblogs.com"):
                 print("image not download: %s:%s" % (src,res.netloc))
                 continue
         except Exception:
             # Host check failed; fall through and let DownloadFile decide.
             pass
         DownloadFile(src,output)

     for link in document.findall('.//{0}link'.format(namespace)):
         href = link.attrib.get("href")
         if not href:
             continue
         text = DownloadFile(href,output)
         if link.attrib.get("rel","").lower() == "stylesheet":
             ReadCss(text)

     for script in document.findall('.//{0}script'.format(namespace)):
         src = script.attrib.get("src")
         if src:
             DownloadFile(src,output)

     htmlName = GetHtmlName(url)
     output = output.decode("utf-8") + "/"+htmlName+".htm"
     # Stripping the scheme turns absolute links into paths relative to the
     # mirrored <host>/<path> tree written by DownloadFile.
     data = data.replace("http://","")
     data = data.replace("https://","")
     data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")
     with open(output,"wb") as resourceFile:
         resourceFile.write(data)

 def GetConcatUrl(url,png):
     """Resolve a reference found in a stylesheet against the stylesheet URL.

     Args:
         url: absolute URL of the stylesheet, e.g.
             ``http://static.csdn.net/public/common/toolbar/css/index.css``.
         png: the reference as written inside ``url(...)``, e.g.
             ``../images/f_icon.png``.  Surrounding whitespace and quotes
             are ignored.

     Returns:
         The absolute URL of the referenced resource.  Plain relative,
         ``..``-relative, root-relative and absolute references are all
         handled.
     """
     try:
         from urlparse import urljoin  # Python 2, like the rest of this script
     except ImportError:
         from urllib.parse import urljoin
     reference = png.strip().strip("'\"").strip()
     return urljoin(url,reference)

 def getAllListUrl(url):
     """Return the URLs of all article-list pages of a blog.

     ``url`` is fetched and the pager
     ``<div id="homepage1_HomePageDays_BottomPager">`` is searched for links
     ending in ``=<number>``.  The highest number found (the page number of
     ``url`` itself is considered too) is taken as the page count, and one
     URL per page ``1..count`` is built from that link.

     When no pager or no numbered link is found, ``[url]`` is returned so
     that a blog without a pager can still be backed up.
     """
     header={"User-Agent": "Mozilla-Firefox5.0"}
     request = urllib2.Request(url,None,header)
     response = urllib2.urlopen(request)
     try:
         data = response.read()
     finally:
         response.close()
     # html5lib returns an xml.etree element tree by default.
     document = html5lib.parse(data)
     namespace = "{http://www.w3.org/1999/xhtml}"
     pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
     print( "Debug>len(pageList)=%d"%len(pageList) )

     # Candidates: the requested URL plus every link inside the pager.
     candidates = [url]
     if pageList:
         for alink in pageList[0].findall('.//{0}a'.format(namespace)):
             candidates.append(alink.attrib.get("href",""))

     lastPageNum = 0
     urlInfo = None
     for href in candidates:
         sep = href.rfind("=")
         if sep == -1:
             continue
         try:
             pageNum = int(href[sep+1:])
         except ValueError:
             # Not a "...page=<n>" link, e.g. some other query value.
             continue
         if pageNum > lastPageNum:
             lastPageNum = pageNum
             urlInfo = href[0:sep]

     if urlInfo is None:
         return [url]
     return [urlInfo+"="+str(x) for x in range(1,lastPageNum+1)]

 def getArticleList(url):
     """Collect the link and title of every article of a blog.

     The list pages come from getAllListUrl(url).  Each page is fetched with
     up to ``gTestTime`` attempts, 0.5 s apart.  A page that keeps failing
     is reported and skipped.  Articles are the ``<div class="postTitle">``
     elements; the first ``<a>`` inside each supplies the href and title.

     Returns:
         A list of ``[href, title]`` pairs, where ``title`` is the UTF-8
         encoded link text.  Entries without a link or href are skipped.
     """
     urlList = getAllListUrl(url)
     print("文章页数(number of pages)  %d" % len(urlList))
     header={"User-Agent": "Mozilla-Firefox5.0"}
     namespace = "{http://www.w3.org/1999/xhtml}"
     allLists = []
     pageNum = 0
     for one in urlList:
         pageNum = pageNum + 1
         print("分析 第 {0} 页 ".format(pageNum))
         for tryCount in range(gTestTime-1,-1,-1):
             try:
                 time.sleep(0.5) # the server stops responding when hit too fast
                 request = urllib2.Request(one,None,header)
                 response = urllib2.urlopen(request)
                 try:
                     data = response.read()
                 finally:
                     response.close()
                 # NOTE(review): html5lib >= 1.0 appears to have renamed the
                 # ``encoding`` keyword to ``transport_encoding`` - confirm
                 # against the installed version.
                 document = html5lib.parse(data,encoding="utf-8")
                 allLists.extend(document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace)))
                 break
             except Exception as e:
                 print("getArticleList %s:%s:%d" % (e,one,tryCount))

     artices = []
     for article in allLists:
         alink = article.find(".//{0}a".format(namespace))
         # href looks like http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html
         if alink is None or "href" not in alink.attrib:
             continue
         title = "".join([x.strip().encode("utf-8") for x in alink.itertext()])
         artices.append([alink.attrib["href"],title])
     return artices

 def GetUserName(url):
     """Return the path segment that sits just before the final "/" of url.

     For ``http://www.cnblogs.com/yaoyansi/default.html?page=4`` this is
     ``yaoyansi``.
     """
     lastSlash = url.rfind("/")
     previousSlash = url.rfind("/",0,lastSlash)
     return url[previousSlash+1:lastSlash]

 def GetHtmlName(url):
     """Return the last path segment of url, used to name the saved page.

     Everything after the final "/" is returned as is, query string
     included.  When url ends with "/", the segment before that trailing
     slash is returned instead.
     """
     lastSlash = url.rfind("/")
     if lastSlash + 1 != len(url):
         return url[lastSlash+1:]
     # Trailing slash: take the segment between the last two slashes.
     previousSlash = url.rfind("/",0,lastSlash)
     return url[previousSlash+1:len(url)-1]

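 #url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain a link to the last page. For example, GnagWang has 20 pages in total, so the URL above is the recommended one.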

 def Start(url,output):
     """Back up a whole blog into ``output``.

     Args:
         url: a list-page URL of the blog, e.g.
             ``http://www.cnblogs.com/<user>/default.html?page=19``.
         output: UTF-8 encoded path of the destination directory.

     Files produced, where ``<user>`` is GetUserName(url):
       * ``<output>/<user>.htm``: frameset index page.
       * ``<output>/<user>-navigation.htm``: numbered list of article links.
       * ``<output>/<user>/<name>.htm``: one page per article, written by
         Download, which also mirrors the resources under the same directory.

     Each article is attempted up to ``gTestTime`` times, 1 s apart.  A
     failure is reported and the backup continues with the next article.
     """
     print("备份开始")
     lists = getArticleList(url)
     username = GetUserName(url)
     # str.replace returns a new string, so the result must be kept.
     output_username = (output+"/"+username).replace("\\","/")
     if not os.path.exists(output_username.decode("utf-8")):
         # makedirs also creates missing parent directories.
         os.makedirs(output_username.decode("utf-8"))
     totalNum = len(lists)
     print("总文章数(number of articles): %d" % totalNum)
     if not lists:
         # Nothing to index or download; the count above already says so.
         return

     # Index page: a frameset with the navigation on the left.
     doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
     charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
     indexHtml = output_username + ".htm"
     navigationHtmlName = username+'-navigation.htm'
     firstHtmlName = GetHtmlName(lists[0][0])
     indexLines = [
         doctype,
         '<html>',
         '<head>',
         charset,
         '</head>',
         '<frameset cols=\"20%,*\">',
         '<frame src=\"'+navigationHtmlName+'\" />',
         '<frame src=\"'+username+'/'+firstHtmlName+'.htm\" name=\"showframe\">',
         '</frameset>',
         '</html>',
     ]
     with open(indexHtml.decode("utf-8"),"w") as f:
         for line in indexLines:
             f.write(line+"\n")

     # Navigation page: one numbered link per article.
     navigationHtml = output+"/"+navigationHtmlName
     navigationHead = [
         doctype,
         '<html>',
         '<head>',
         charset,
         '<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>',
         '</head>',
         '<body>',
     ]
     with codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig") as f:
         for line in navigationHead:
             f.write(line+"\n")
         count = 0
         for x in lists:
             count = count + 1
             articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
             f.write('<a href=\"'+articleIdHtml + '\" target=\"showframe\">'+str(count)+'.'+x[1].decode("utf-8")+'</a><br /><br />'+"\n")
         f.write('</body>'+"\n")
         f.write('</html>'+"\n")

     print("开始下载文章")
     currentNum = 0
     for x in lists:
         currentNum = currentNum+1
         try:
             # Kept outside the retry loop: a console that cannot render the
             # title must not stop the article from being downloaded.
             print("{0}:{1}.".format(totalNum,currentNum)+x[1])
             print(x[0])
             print("\n")
         except Exception:
             pass
         for remaining in range(gTestTime-1,-1,-1):
             try:
                 time.sleep(1) # requests that come too fast get a 503 from the server
                 Download(x[0],output_username)
                 break
             except Exception as e:
                 print("Start: %s:%s:%d" % (e,x[0],remaining))

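 #url must look like http://www.cnblogs.com/GnagWang/default.html?page=21, and that page must contain a link to the last page. For example, GnagWang has 20 pages in total, so the URL above is the recommended one.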

 if __name__=='__main__':
     # Usage: python <this script> [list_page_url [output_dir]]
     # Without arguments the defaults below are used.
     url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
     # Windows example: output = "C:/Users/apple/Desktop/新建文件夹"
     output = "/tmp/my_tmp/cnblogs"
     if len(sys.argv) > 1:
         url = sys.argv[1]
     if len(sys.argv) > 2:
         output = sys.argv[2]
     Start(url,output)
     # To save a single article instead:
     # Download("http://blog.csdn.net/dcraw/article/details/6858820",
     #     "C:/Users/apple/Desktop/新建文件夹/infoworld")

Reference:

[1] http://blog.csdn.net/llrraa2010/article/details/35540845

How to backup your blogs on cnblogs的更多相关文章

Usual tiny skills & solutions
Ubuntu and Win10 - double OS 2016-02-21 Yesterday I helped my friend install Ubuntu (14.04 LTS) on h ...
MySQL 使用XtraBackup的shell脚本介绍
mysql_backup.sh是关于MySQL的一个使用XtraBackup做备份的shell脚本,实现了简单的完整备份和增量备份.以及邮件发送备份信息等功能.功能目前还比较简单,后续将继续完善和增加 ...
Troubleshooting Failed Requests Using Tracing in IIS 8.5
https://docs.microsoft.com/en-us/iis/troubleshoot/using-failed-request-tracing/troubleshooting-faile ...
我心中的核心组件（可插拔的AOP）~第十五回　我的日志组件Logger.Core（策略，模版方法，工厂，单例等模式的使用）
回到目录之前的讲过两篇关于日志组件的文章,分别是<第一回日志记录组件之自主的Vlog>和<第三回日志记录组件之log4net>,而今天主要说一下我自己开发的另一种日志 ...
【编译原理】c++实现自下而上语法分析及中间代码(四元式)生成
写在前面:本博客为本人原创,严禁任何形式的转载!本博客只允许放在博客园(.cnblogs.com),如果您在其他网站看到这篇博文,请通过下面这个唯一的合法链接转到原文! 本博客全网唯一合法URL:ht ...
【编译原理】c++实现自下而上语法分析器
写在前面:本博客为本人原创,严禁任何形式的转载!本博客只允许放在博客园(.cnblogs.com),如果您在其他网站看到这篇博文,请通过下面这个唯一的合法链接转到原文! 本博客全网唯一合法URL:ht ...
【编译原理】c++实现自上而下语法分析器
写在前面:本博客为本人原创,严禁任何形式的转载!本博客只允许放在博客园(.cnblogs.com),如果您在其他网站看到这篇博文,请通过下面这个唯一的合法链接转到原文! 本博客全网唯一合法URL:ht ...
【编译原理】c++实现词法分析器
写在前面:本博客为本人原创,严禁任何形式的转载!本博客只允许放在博客园(.cnblogs.com),如果您在其他网站看到这篇博文,请通过下面这个唯一的合法链接转到原文! 本博客全网唯一合法URL:ht ...
Associate File Type with Qt In Mac Os and Win
Win Registry Question One day, my boss want me to finish one function which let the users can double ...

随机推荐

js/jquery 回调函数的定义方法
基本写法: 带参数的回调函数以上回调函数,直接传入function作为参数,同样,还可以传入json对象作为参数...如下. 该方法的优势是可以定义多个回调函数....类似$.ajax回调函数中的s ...
Highcharts使用指南
统计分析报表Highcharts使用指南一.前言(Preface)阅览本文,您可以了解:1.Highcharts使用方法2.Highcharts数据动态加载3.Highcharts自动刷新数据4.H ...
Python::re 模块 -- 在Python中使用正则表达式
前言这篇文章,并不是对正则表达式的介绍,而是对Python中如何结合re模块使用正则表达式的介绍.文章的侧重点是如何使用re模块在Python语言中使用正则表达式,对于Python表达式的语法和详细 ...
ArcGIS for Android_离在线一体化核心技术基本流程
核心思想: a.数据首先存储于ArcSDE中,要素添加GlobleID,图层数据启用数据归档或开启版本化.b.然后将ArcSDE数据库托管于ArcGIS for Server作为数据存储.c.在Arc ...
JS手札
Node JS 关于JS调用被调用:exports.cv=cv; cv为类,可以使用其方法cv.***: cv为函数名,可以使用其函数cv( , ): 调用: var cv=require(cv); ...
Collections.reverse 代码思考-超越昨天的自己系列(13)
点进Collections.reverse的代码瞄了眼,然后就开始了一些基础知识的收集. 现在发现知道的越多,知道不知道的越多. 列几个记录下: reverse方法源码: /** * Reverses ...
UNITY5以后怎么改GUI文字
提要:以前是UNITY4,后来用了新的UI,于是GUIText这种东西就没有了,研究了很久.... ---------------------------- 这里我想拖个GUI文字框显示FPS,于是代 ...
eclipse连接mysql，插入数据时乱码
问题:如果eclipse中项目的编码方式为utf-8 插入数据后,在数据库中查看后,汉字出现乱码情况解决方法: 1.在获取连接的时候将conn = DriverManager.getConnecti ...
Jenkins的配置（rpm red hat方式）
Jenkins的配置文件位置 #### sudo chown -R admin /usr/lib/jenkins sudo chgrp -R admin /usr/lib/jenkins #### s ...
python之环境搭建windows版
1.先到python官网下载属于自己的的python版本,有linux版,有mac版,有windows版:https://www.python.org/downloads/windows/ 2.下载完 ...

How to backup your blogs on cnblogs

How to backup your blogs on cnblogs的更多相关文章

随机推荐

热门专题