Organizing a small Python web crawler
Coding makes me happy!!!
I don't know why, but when I run into something I love, I just want to keep going. I could stay home for a month, but I can't go a single day without my computer.
Level of mastery: I now have a much clearer picture of Python. Automated ops work could probably be done in Python too; keep it up.
What it does:
Crawl the relevant pages of the target site and save the results to both a local file and a DB.
Local file:

DB:

Overall logic:

1. Read the configuration file
def ReadLocalFiles() :
    # Dictionary used to store the variables and values from the conf file
    returndict={}
    # Counter for the current line number, used in error messages
    linenumber = 0
    # Open the file read-only; the script runs from crontab, so the full path is required
    localfiles = open(r'/root/python-script/geturl/main.conf')
    # Read the whole file into a list
    readfilelines = localfiles.readlines()
    # Loop over the list and pick out the values
    for line in readfilelines :
        # Increment the line counter
        linenumber = linenumber + 1
        # Strip trailing whitespace with rstrip
        line = line.rstrip()
        # Skip lines that start with '#'
        if line.startswith('#') :
            continue
        # Skip empty lines
        elif len(line) == 0 :
            continue
        # Otherwise try to parse the line
        else:
            # Wrap the parsing in try so a bad line stays under control
            try:
                returndict[line.split('=')[0]] = line.split('=')[1]
            # On failure, report the offending line
            except:
                # Print the error message with a timestamp
                print (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+"line %d %s error,please check" % (linenumber,line))
                # Close the file before exiting
                localfiles.close()
                # Exit with a non-zero status
                sys.exit(-1)
    # Close the file and return the dictionary
    localfiles.close()
    return returndict
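For reference, a sketch of what main.conf might contain, based on the keys read in main() further down; every value here is a placeholder, not the real configuration:

# main.conf -- hypothetical example, all values below are placeholders
url=http://www.example123.com
download_page=5
savelocalfileenable=1
download_local_files=/root/python-script/geturl/downfiles
savedbenable=1
dbhost=localhost
dbport=3306
dbuser=root
dbpasswd=xxxxxx
dbschema=downpage
dbcharset=utf8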
2. Clear the previously downloaded file contents
def DeleteHisFiles(update_file):
    # Check whether update_file already exists
    if os.path.isfile(update_file):
        try:
            # Open the file for reading and writing
            download_files = open(update_file,'r+')
            # Truncate the file contents
            download_files.truncate()
            # Close the file
            download_files.close()
        except:
            # Report that truncating failed
            print (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+"Truncate " + update_file + "error , please check it")
    # If the file does not exist under that path
    else :
        # A new file will be created when results are written
        print (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+"Build New downfiles ok")
3. Check whether the site URL has changed
def DealwithURL(url):
    # Fetch the page content into r
    r = requests.get(url)
    # Regex that looks for a meta refresh redirect
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    # Search r.text for the pattern
    findurl=re.findall(pattern,r.text)
    # If a redirect tag was found
    if findurl:
        # Regex that captures everything after url= up to the closing quote
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        # Take the first match as the redirect target
        transferurl = re.findall(pattern,r.text)[0]
        # Return the redirect target
        return transferurl
    else :
        # Otherwise return True, meaning the URL is still current
        return True
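In the complete code below, a small wrapper named gettrueurl ties steps 3 and 4 together: when DealwithURL returns True the original URL is kept, otherwise the redirect target is handed to GetNewURL:

def gettrueurl(url):
    # DealwithURL returns True when the page has no meta-refresh redirect
    if DealwithURL(url)==True:
        return url
    else :
        # otherwise resolve the redirect page and pull the new address out of it
        return GetNewURL(DealwithURL(url))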
4. Get the updated URL
def GetNewURL(url):
    # Fetch the page content into r
    r = requests.get(url)
    # Decode as utf-8
    r.encoding='utf-8'
    # Regex that captures the text between alert and ">
    pattern = re.compile('alert(.*)">')
    # Search r.text for the pattern
    findurl=re.findall(pattern,r.text)
    # Join the matches into a single string
    findurl_str = (" ".join(findurl))
    # 1. Split findurl_str on the first space and keep the first piece
    # 2. Drop that piece's first two characters (the opening parenthesis and quote of the alert call)
    return (findurl_str.split(' ',1)[0][2:])
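A quick illustration of that slicing, using a made-up alert message; the wording on the real page will differ:

# Hypothetical matched text -- the real page's alert message will differ
findurl_str = "('http://www.example123.com 网站最新地址"
print (findurl_str.split(' ',1)[0][2:])   # prints http://www.example123.com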
5. Save the new URL back into the local configuration file
def SaveLocalUrl(untreatedurl,treatedurl):
    # If the two values are identical there is nothing to update
    if untreatedurl == treatedurl :
        pass
    else :
        # Wrap the rewrite in try so a failure stays under control
        try:
            # Open the conf file read-only
            fileconf = open(r'/root/python-script/geturl/main.conf','r')
            # Start with an empty buffer for the rewritten content
            rewritestr = ""
            # Loop over every line of the conf file
            for readline in fileconf:
                # If untreatedurl appears in this line
                if re.search(untreatedurl,readline):
                    # Replace untreatedurl with treatedurl in this line
                    readline = re.sub(untreatedurl,treatedurl,readline)
                    # Append the updated line to the buffer
                    rewritestr = rewritestr + readline
                else :
                    # Otherwise append the line unchanged
                    rewritestr = rewritestr + readline
            # Close the file once it has been read
            fileconf.close()
            # Reopen the conf file for writing
            fileconf = open(r'/root/python-script/geturl/main.conf','w')
            # Write the rewritten content back
            fileconf.write(rewritestr)
            # Close the file
            fileconf.close()
        except:
            # Log that the new URL was obtained but the conf file could not be rewritten
            print ("get new url but open files ng write to logs,please check main.conf")
6. Pick a list page at random and get the total page count
def GetTitleInfo(url,down_page,update_file,headers):
    # Build the list path with a random number between 1 and 8
    title = '/list/'+str(random.randint(1,8))
    # The full list URL is url + title + '.html'
    titleurl = url + title + '.html'
    # Fetch the list page into r
    r = requests.get(titleurl)
    # Detect and use the page's own encoding
    r.encoding = chardet.detect(r.content)['encoding']
    # Regex that captures the total page count from pager text like ' 当前:x/y页 '
    pattern = re.compile(' 当前:.*/(.*)页 ')
    # Search r.text for the pattern
    getpagenumber = re.findall(pattern,r.text)
    # Join the matches into a single string
    getpagenumber = (" ".join(getpagenumber))
    GetListURLinfo(url , title , int(getpagenumber) , int(down_page),update_file,headers)
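For instance, assuming the pager on the list page reads something like " 当前:3/120页 " (a made-up value), the regex pulls out the total:

import re
# Hypothetical pager text -- the real total will differ
sample = '<span> 当前:3/120页 </span>'
print (re.findall(' 当前:.*/(.*)页 ', sample))   # prints ['120']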
7. Download the detail URLs
def GetListURLinfo(sourceurl , title , getpagenumber , total,update_file,headers):
    # Clamp total to a sane range
    if total >= 100:
        total = 100
    if total <= 1:
        total = 2
    # Use total as the upper bound for the random page number
    getpagenumber = total
    # Loop total times
    for number in range(0,total) :
        try:
            # Register the timeout handler
            signal.signal(signal.SIGALRM,handler)
            # Arm a 3-second alarm; if it fires, an exception is raised
            signal.alarm(3)
            # Pick a random page no greater than getpagenumber
            url = sourceurl + title + '-' + str(random.randint(1,getpagenumber)) + '.html'
            # Fetch the page into r
            r = requests.get(url)
            # Regex that matches entries starting with <div class="info"><h2> and ending with </a><em></em></h2>
            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            # Detect and use the page's own encoding
            r.encoding = chardet.detect(r.content)['encoding']
            # Collect every matching entry
            allurl = re.findall(pattern,r.text)
            # Loop over the matched entries
            for lineurl in allurl:
                try:
                    # Register the timeout handler
                    signal.signal(signal.SIGALRM,handler)
                    # Arm a 3-second alarm; if it fires, an exception is raised
                    signal.alarm(3)
                    # Regex that captures the href between <a href=" and " title
                    pattern = re.compile('<a href="(.*)" title')
                    # Extract the detail URL from this entry
                    titleurl = re.findall(pattern,lineurl)
                    # Regex that captures the title between title=" and " target=
                    pattern = re.compile('title="(.*)" target=')
                    # Extract the title from this entry
                    titlename = re.findall(pattern,lineurl)
                    signal.alarm(0)
                # Raised when the inner alarm fires
                except AssertionError:
                    print (lineurl,titlename , "Timeout Error: the cmd 10s have not finished")
                    continue
        # Raised when the outer alarm fires
        except AssertionError:
            print ("GetlistURL Error Continue")
            continue
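The 3-second timeout relies on a small SIGALRM handler (it appears in the complete code below); when the alarm fires it raises AssertionError, which the except blocks above catch:

def handler(signum,frame):
    # turn SIGALRM into an AssertionError so the surrounding try can skip the slow request
    raise AssertionError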
8. Get the ed2k address for each URL
def GetDownloadURL(sourceurl,titleurl,titlename,update_file,headers):
    # Convert titleurl and titlename from lists to strings
    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))
    # Fetch the detail page into r
    r = requests.get((sourceurl+downurlstr))
    # Regex that captures the text between autocomplete="on"> and /</textarea></div>
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')
    # Extract the ed2k link(s)
    downurled2k = re.findall(pattern,r.text)
    # Join the matches into a single string
    downurled2kstr = (" ".join(downurled2k))
    WriteLocalDownloadURL(update_file , downurled2kstr,titlename)
    # Print the title and the ed2k link
    print (downnamestr , downurled2kstr)
    # savedburl is the full detail-page URL, stored in the DB as the source
    savedburl = sourceurl + downurlstr
    SaveDB(time.strftime('%Y-%m-%d %H:%M',time.localtime(time.time())),titlename[0],downurled2kstr,savedburl)
    # print (time.strftime('%Y-%m-%d %H:%M',time.localtime(time.time())))
9. Write the results to the local file
def WriteLocalDownloadURL(downfile,listfile):
    # Open downfile in append mode
    urlfile = open(downfile,'a+')
    # Write each entry on its own line
    for line in listfile :
        urlfile.write(line+'\n')
    # Close the file
    urlfile.close()
10. Write the results into the DB
def SaveDB(nowdate,tilename,downurl,savedburl):
    # Get a cursor from the global db connection
    cursor = db.cursor()
    # Select statement that checks whether this download URL is already stored
    sql = "select count(nurl) from downurl where nurl = '%s';" % (downurl)
    cursor.execute(sql)
    # Fetch the result of the query
    data = cursor.fetchone()
    # Only insert when the count is '0', i.e. the URL is not stored yet
    if '0' == str(data[0]):
        # Insert statement for the new record
        sql = "insert into downurl values ('%s','%s','%s','%s');" % (nowdate,tilename,savedburl,downurl)
        # Try to execute and commit the insert
        try:
            cursor.execute(sql)
            db.commit()
        except:
            # Roll back on failure
            db.rollback()
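The insert above assumes a four-column downurl table in the downpage schema. A one-off setup sketch is shown below; only the nurl column name is confirmed by the queries, the other column names and types are assumptions:

import pymysql
# Hypothetical table setup -- column names other than nurl are guesses
db = pymysql.connect(host='localhost',port=3306,user='root',passwd='xxxxxx',db='downpage',charset='utf8')
cursor = db.cursor()
cursor.execute("create table if not exists downurl ("
               "ndate varchar(20),"       # download timestamp (assumed name)
               "nname varchar(255),"      # title (assumed name)
               "nsofturl varchar(255),"   # source detail-page URL (assumed name)
               "nurl text)")              # ed2k link, the column checked for duplicates
db.commit()
db.close()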
Complete code:
root@Linux:~/python-script/geturl # cat main.py
#!/usr/bin/python3
import requests
import re
import chardet
import random
import signal
import time
import os
import sys
import pymysql
def DealwithURL(url):
    r = requests.get(url)
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    findurl=re.findall(pattern,r.text)
    if findurl:
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        transferurl = re.findall(pattern,r.text)[0]
        return transferurl
    else :
        return True
def GetNewURL(url):
    r = requests.get(url)
    r.encoding='utf-8'
    pattern = re.compile('alert(.*)">')
    findurl=re.findall(pattern,r.text)
    findurl_str = (" ".join(findurl))
    return (findurl_str.split(' ',1)[0][2:])
def gettrueurl(url):
    if DealwithURL(url)==True:
        return url
    else :
        return GetNewURL(DealwithURL(url))
def SaveLocalUrl(untreatedurl,treatedurl):
    if untreatedurl == treatedurl :
        pass
    else :
        try:
            fileconf = open(r'/root/python-script/geturl/main.conf','r')
            rewritestr = ""
            for readline in fileconf:
                if re.search(untreatedurl,readline):
                    readline = re.sub(untreatedurl,treatedurl,readline)
                    rewritestr = rewritestr + readline
                else :
                    rewritestr = rewritestr + readline
            fileconf.close()
            fileconf = open(r'/root/python-script/geturl/main.conf','w')
            fileconf.write(rewritestr)
            fileconf.close()
        except:
            print ("get new url but open files ng write to logs")
def handler(signum,frame):
    raise AssertionError
def SaveDB(nowdate,tilename,downurl,savedburl):
    cursor = db.cursor()
    sql = "select count(nurl) from downurl where nurl = '%s';" % (downurl)
    cursor.execute(sql)
    data = cursor.fetchone()
    if '0' == str(data[0]):
        sql = "insert into downurl values ('%s','%s','%s','%s');" % (nowdate,tilename,savedburl,downurl)
        try:
            cursor.execute(sql)
            db.commit()
        except:
            db.rollback()
def WriteLocalDownloadURL(downfile,listfile):
    urlfile = open(downfile,'a+')
    for line in listfile :
        urlfile.write(line+'\n')
    urlfile.close()
def GetDownloadURL(sourceurl,titleurl,titlename,update_file,headers,enabledb):
    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))
    r = requests.get((sourceurl+downurlstr))
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')
    downurled2k = re.findall(pattern,r.text)
    downurled2kstr = (" ".join(downurled2k))
    print (downnamestr , downurled2kstr)
    if 1 == enabledb :
        savedburl = sourceurl + downurlstr
        SaveDB(time.strftime('%Y-%m-%d %H:%M',time.localtime(time.time())),titlename[0],downurled2kstr,savedburl)
    returnstr = titlename[0]+" "+downurled2kstr
    return returnstr
def ReadLocalFiles() :
    returndict={}
    linenumber = 0
    localfiles = open(r'/root/python-script/geturl/main.conf')
    readfilelines = localfiles.readlines()
    for line in readfilelines :
        linenumber = linenumber + 1
        line = line.rstrip()
        if line.startswith('#') :
            continue
        elif len(line) == 0 :
            continue
        else:
            try:
                returndict[line.split('=')[0]] = line.split('=')[1]
            except:
                print ("line %d %s error,please check" % (linenumber,line))
                sys.exit(-1)
    return returndict
def GetListURLinfo(sourceurl , title , getpagenumber , total,update_file,headers,enablewritefile,enabledb):
    returnwriteurl = []
    if total >= 100:
        total = 100
    if total <= 1:
        total = 2
    getpagenumber = total
    for number in range(0,total) :
        try:
            signal.signal(signal.SIGALRM,handler)
            signal.alarm(3)
            url = sourceurl + title + '-' + str(random.randint(1,getpagenumber)) + '.html'
            r = requests.get(url)
            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            r.encoding = chardet.detect(r.content)['encoding']
            allurl = re.findall(pattern,r.text)
            for lineurl in allurl:
                try:
                    signal.signal(signal.SIGALRM,handler)
                    signal.alarm(3)
                    pattern = re.compile('<a href="(.*)" title')
                    titleurl = re.findall(pattern,lineurl)
                    pattern = re.compile('title="(.*)" target=')
                    titlename = re.findall(pattern,lineurl)
                    returnwriteurl.append(GetDownloadURL(sourceurl,titleurl,titlename,update_file,headers,enabledb))
                    signal.alarm(0)
                except AssertionError:
                    print (lineurl,titlename , "Timeout Error: the cmd 10s have not finished")
                    continue
        except AssertionError:
            print ("GetlistURL Error Continue")
            continue
    if 1 == enablewritefile:
        WriteLocalDownloadURL(update_file,returnwriteurl)
def GetTitleInfo(url,down_page,update_file,headers,savelocalfileenable,savedbenable):
    title = '/list/'+str(random.randint(1,8))
    titleurl = url + title + '.html'
    r = requests.get(titleurl)
    r.encoding = chardet.detect(r.content)['encoding']
    pattern = re.compile(' 当前:.*/(.*)页 ')
    getpagenumber = re.findall(pattern,r.text)
    getpagenumber = (" ".join(getpagenumber))
    GetListURLinfo(url , title , getpagenumber , down_page,update_file,headers,int(savelocalfileenable),int(savedbenable))
def write_logs(time,logs):
    loginfo = str(time)+logs
    try:
        logfile = open(r'logs','a+')
        logfile.write(loginfo)
        logfile.close()
    except:
        print ("Write logs error,code:154")
def DeleteHisFiles(update_file):
    if os.path.isfile(update_file):
        try:
            download_files = open(update_file,'r+')
            download_files.truncate()
            download_files.close()
        except:
            print ("Delete " + update_file + "Error --code:166")
    else :
        print ("Build New downfiles")
def main():
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome","Accept": "text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,*/*;q=0.8"}
    readconf = ReadLocalFiles()
    try:
        file_url = readconf['url']
        down_page = readconf['download_page']
        savelocalfileenable = readconf['savelocalfileenable']
        savedbenable = readconf['savedbenable']
        if 1 == int(savelocalfileenable) :
            update_file = readconf['download_local_files']
        else :
            # keep update_file defined even when local saving is disabled
            update_file = ''
        if 1 == int(savedbenable) :
            dbhost = readconf['dbhost']
            dbport = readconf['dbport']
            dbuser = readconf['dbuser']
            dbpasswd = readconf['dbpasswd']
            dbschema = readconf['dbschema']
            dbcharset = readconf['dbcharset']
    except:
        print ("Get local conf error,please check url or down_page conf")
        sys.exit(-1)
    if 1 == int(savelocalfileenable) :
        DeleteHisFiles(update_file)
    untreatedurl = file_url
    treatedurl = gettrueurl(untreatedurl)
    SaveLocalUrl(untreatedurl,treatedurl)
    url = treatedurl
    if 1 == int(savedbenable):
        global db
        # dbconnect ='''
        # host='localhost',port=3306,user='root',passwd='flzx3QC#',db='downpage',charset='utf8'
        # '''
        db = pymysql.connect(host='localhost',port=3306,user='root',passwd='flzx3QC#',db='downpage',charset='utf8')
        # print (dbconnect)
        # db = pymysql.connect(dbconnect)
    else:
        db = ''
    if 1 == int(savelocalfileenable) or 0 == int(savelocalfileenable):
        GetTitleInfo(url,int(down_page),update_file,headers,int(savelocalfileenable),int(savedbenable))
    if 1 == int(savedbenable):
        # only close the connection when one was actually opened
        db.close()
if __name__=="__main__":
    main()
root@Linux:~/python-script/geturl #
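Since main.conf is opened with an absolute path, the script is meant to run from cron. A hypothetical crontab entry (the schedule and log path are assumptions, not taken from the original setup) could look like:

# run the crawler every day at 02:30 -- hypothetical schedule
30 2 * * * /usr/bin/python3 /root/python-script/geturl/main.py >> /root/python-script/geturl/cron.log 2>&1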
End