Organizing a small Python web crawler
Coding makes me happy!!!
I don't know why, but when I run into something I love, I just want to keep going. I could stay home for a month, but I can't go a single day without my computer.
Level of mastery: I now have a much clearer picture of Python. Automated ops work could probably be done in Python too; keep it up.
What it does:
Crawl the relevant pages of the target site and save the results to both a local file and a DB.
Local file:

DB:

Overall logic:

1. Read the configuration file
def ReadLocalFiles() :
    # Dictionary used to store the variables and values from the conf file
    returndict={}
    # Counter for the current line number, used in error messages
    linenumber = 0
    # Open the file read-only; the script runs from crontab, so the full path is required
    localfiles = open(r'/root/python-script/geturl/main.conf')
    # Read the whole file into a list
    readfilelines = localfiles.readlines()
    # Loop over the list and pick out the values
    for line in readfilelines :
        # Increment the line counter
        linenumber = linenumber + 1
        # Strip trailing whitespace with rstrip
        line = line.rstrip()
        # Skip lines that start with '#'
        if line.startswith('#') :
            continue
        # Skip empty lines
        elif len(line) == 0 :
            continue
        # Otherwise try to parse the line
        else:
            # Wrap the parsing in try so a bad line stays under control
            try:
                returndict[line.split('=')[0]] = line.split('=')[1]
            # On failure, report the offending line
            except:
                # Print the error message with a timestamp
                print (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+"line %d %s error,please check" % (linenumber,line))
                # Close the file before exiting
                localfiles.close()
                # Exit with a non-zero status
                sys.exit(-1)
    # Close the file and return the dictionary
    localfiles.close()
    return returndict
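For reference, a sketch of what main.conf might contain, based on the keys read in main() further down; every value here is a placeholder, not the real configuration:

# main.conf -- hypothetical example, all values below are placeholders
url=http://www.example123.com
download_page=5
savelocalfileenable=1
download_local_files=/root/python-script/geturl/downfiles
savedbenable=1
dbhost=localhost
dbport=3306
dbuser=root
dbpasswd=xxxxxx
dbschema=downpage
dbcharset=utf8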
2. Clear the previously downloaded file contents
def DeleteHisFiles(update_file):
    # Check whether update_file already exists
    if os.path.isfile(update_file):
        try:
            # Open the file for reading and writing
            download_files = open(update_file,'r+')
            # Truncate the file contents
            download_files.truncate()
            # Close the file
            download_files.close()
        except:
            # Report that truncating failed
            print (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+"Truncate " + update_file + "error , please check it")
    # If the file does not exist under that path
    else :
        # A new file will be created when results are written
        print (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+"Build New downfiles ok")
3. Check whether the site URL has changed
def DealwithURL(url):
    # Fetch the page content into r
    r = requests.get(url)
    # Regex that looks for a meta refresh redirect
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    # Search r.text for the pattern
    findurl=re.findall(pattern,r.text)
    # If a redirect tag was found
    if findurl:
        # Regex that captures everything after url= up to the closing quote
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        # Take the first match as the redirect target
        transferurl = re.findall(pattern,r.text)[0]
        # Return the redirect target
        return transferurl
    else :
        # Otherwise return True, meaning the URL is still current
        return True
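In the complete code below, a small wrapper named gettrueurl ties steps 3 and 4 together: when DealwithURL returns True the original URL is kept, otherwise the redirect target is handed to GetNewURL:

def gettrueurl(url):
    # DealwithURL returns True when the page has no meta-refresh redirect
    if DealwithURL(url)==True:
        return url
    else :
        # otherwise resolve the redirect page and pull the new address out of it
        return GetNewURL(DealwithURL(url))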
4. Get the updated URL
def GetNewURL(url):
    # Fetch the page content into r
    r = requests.get(url)
    # Decode as utf-8
    r.encoding='utf-8'
    # Regex that captures the text between alert and ">
    pattern = re.compile('alert(.*)">')
    # Search r.text for the pattern
    findurl=re.findall(pattern,r.text)
    # Join the matches into a single string
    findurl_str = (" ".join(findurl))
    # 1. Split findurl_str on the first space and keep the first piece
    # 2. Drop that piece's first two characters (the opening parenthesis and quote of the alert call)
    return (findurl_str.split(' ',1)[0][2:])
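A quick illustration of that slicing, using a made-up alert message; the wording on the real page will differ:

# Hypothetical matched text -- the real page's alert message will differ
findurl_str = "('http://www.example123.com 网站最新地址"
print (findurl_str.split(' ',1)[0][2:])   # prints http://www.example123.com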
5. Save the new URL back into the local configuration file
def SaveLocalUrl(untreatedurl,treatedurl):
    # If the two values are identical there is nothing to update
    if untreatedurl == treatedurl :
        pass
    else :
        # Wrap the rewrite in try so a failure stays under control
        try:
            # Open the conf file read-only
            fileconf = open(r'/root/python-script/geturl/main.conf','r')
            # Start with an empty buffer for the rewritten content
            rewritestr = ""
            # Loop over every line of the conf file
            for readline in fileconf:
                # If untreatedurl appears in this line
                if re.search(untreatedurl,readline):
                    # Replace untreatedurl with treatedurl in this line
                    readline = re.sub(untreatedurl,treatedurl,readline)
                    # Append the updated line to the buffer
                    rewritestr = rewritestr + readline
                else :
                    # Otherwise append the line unchanged
                    rewritestr = rewritestr + readline
            # Close the file once it has been read
            fileconf.close()
            # Reopen the conf file for writing
            fileconf = open(r'/root/python-script/geturl/main.conf','w')
            # Write the rewritten content back
            fileconf.write(rewritestr)
            # Close the file
            fileconf.close()
        except:
            # Log that the new URL was obtained but the conf file could not be rewritten
            print ("get new url but open files ng write to logs,please check main.conf")
6. Pick a list page at random and get the total page count
def GetTitleInfo(url,down_page,update_file,headers):
    # Build the list path with a random number between 1 and 8
    title = '/list/'+str(random.randint(1,8))
    # The full list URL is url + title + '.html'
    titleurl = url + title + '.html'
    # Fetch the list page into r
    r = requests.get(titleurl)
    # Detect and use the page's own encoding
    r.encoding = chardet.detect(r.content)['encoding']
    # Regex that captures the total page count from pager text like ' 当前:x/y页 '
    pattern = re.compile(' 当前:.*/(.*)页 ')
    # Search r.text for the pattern
    getpagenumber = re.findall(pattern,r.text)
    # Join the matches into a single string
    getpagenumber = (" ".join(getpagenumber))
    GetListURLinfo(url , title , int(getpagenumber) , int(down_page),update_file,headers)
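For instance, assuming the pager on the list page reads something like " 当前:3/120页 " (a made-up value), the regex pulls out the total:

import re
# Hypothetical pager text -- the real total will differ
sample = '<span> 当前:3/120页 </span>'
print (re.findall(' 当前:.*/(.*)页 ', sample))   # prints ['120']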
7. Download the detail URLs
def GetListURLinfo(sourceurl , title , getpagenumber , total,update_file,headers):
    # Clamp total to a sane range
    if total >= 100:
        total = 100
    if total <= 1:
        total = 2
    # Use total as the upper bound for the random page number
    getpagenumber = total
    # Loop total times
    for number in range(0,total) :
        try:
            # Register the timeout handler
            signal.signal(signal.SIGALRM,handler)
            # Arm a 3-second alarm; if it fires, an exception is raised
            signal.alarm(3)
            # Pick a random page no greater than getpagenumber
            url = sourceurl + title + '-' + str(random.randint(1,getpagenumber)) + '.html'
            # Fetch the page into r
            r = requests.get(url)
            # Regex that matches entries starting with <div class="info"><h2> and ending with </a><em></em></h2>
            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            # Detect and use the page's own encoding
            r.encoding = chardet.detect(r.content)['encoding']
            # Collect every matching entry
            allurl = re.findall(pattern,r.text)
            # Loop over the matched entries
            for lineurl in allurl:
                try:
                    # Register the timeout handler
                    signal.signal(signal.SIGALRM,handler)
                    # Arm a 3-second alarm; if it fires, an exception is raised
                    signal.alarm(3)
                    # Regex that captures the href between <a href=" and " title
                    pattern = re.compile('<a href="(.*)" title')
                    # Extract the detail URL from this entry
                    titleurl = re.findall(pattern,lineurl)
                    # Regex that captures the title between title=" and " target=
                    pattern = re.compile('title="(.*)" target=')
                    # Extract the title from this entry
                    titlename = re.findall(pattern,lineurl)
                    signal.alarm(0)
                # Raised when the inner alarm fires
                except AssertionError:
                    print (lineurl,titlename , "Timeout Error: the cmd 10s have not finished")
                    continue
        # Raised when the outer alarm fires
        except AssertionError:
            print ("GetlistURL Error Continue")
            continue
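The 3-second timeout relies on a small SIGALRM handler (it appears in the complete code below); when the alarm fires it raises AssertionError, which the except blocks above catch:

def handler(signum,frame):
    # turn SIGALRM into an AssertionError so the surrounding try can skip the slow request
    raise AssertionError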
8. Get the ed2k address for each URL
def GetDownloadURL(sourceurl,titleurl,titlename,update_file,headers):
    # Convert titleurl and titlename from lists to strings
    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))
    # Fetch the detail page into r
    r = requests.get((sourceurl+downurlstr))
    # Regex that captures the text between autocomplete="on"> and /</textarea></div>
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')
    # Extract the ed2k link(s)
    downurled2k = re.findall(pattern,r.text)
    # Join the matches into a single string
    downurled2kstr = (" ".join(downurled2k))
    WriteLocalDownloadURL(update_file , downurled2kstr,titlename)
    # Print the title and the ed2k link
    print (downnamestr , downurled2kstr)
    # savedburl is the full detail-page URL, stored in the DB as the source
    savedburl = sourceurl + downurlstr
    SaveDB(time.strftime('%Y-%m-%d %H:%M',time.localtime(time.time())),titlename[0],downurled2kstr,savedburl)
    # print (time.strftime('%Y-%m-%d %H:%M',time.localtime(time.time())))
9. Write the results to the local file
def WriteLocalDownloadURL(downfile,listfile):
    # Open downfile in append mode
    urlfile = open(downfile,'a+')
    # Write each entry on its own line
    for line in listfile :
        urlfile.write(line+'\n')
    # Close the file
    urlfile.close()
10. Write the results into the DB
def SaveDB(nowdate,tilename,downurl,savedburl):
    # Get a cursor from the global db connection
    cursor = db.cursor()
    # Select statement that checks whether this download URL is already stored
    sql = "select count(nurl) from downurl where nurl = '%s';" % (downurl)
    cursor.execute(sql)
    # Fetch the result of the query
    data = cursor.fetchone()
    # Only insert when the count is '0', i.e. the URL is not stored yet
    if '0' == str(data[0]):
        # Insert statement for the new record
        sql = "insert into downurl values ('%s','%s','%s','%s');" % (nowdate,tilename,savedburl,downurl)
        # Try to execute and commit the insert
        try:
            cursor.execute(sql)
            db.commit()
        except:
            # Roll back on failure
            db.rollback()
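The insert above assumes a four-column downurl table in the downpage schema. A one-off setup sketch is shown below; only the nurl column name is confirmed by the queries, the other column names and types are assumptions:

import pymysql
# Hypothetical table setup -- column names other than nurl are guesses
db = pymysql.connect(host='localhost',port=3306,user='root',passwd='xxxxxx',db='downpage',charset='utf8')
cursor = db.cursor()
cursor.execute("create table if not exists downurl ("
               "ndate varchar(20),"       # download timestamp (assumed name)
               "nname varchar(255),"      # title (assumed name)
               "nsofturl varchar(255),"   # source detail-page URL (assumed name)
               "nurl text)")              # ed2k link, the column checked for duplicates
db.commit()
db.close()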
Complete code:
root@Linux:~/python-script/geturl # cat main.py
#!/usr/bin/python3
import requests
import re
import chardet
import random
import signal
import time
import os
import sys
import pymysql
def DealwithURL(url):
    r = requests.get(url)
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    findurl=re.findall(pattern,r.text)
    if findurl:
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        transferurl = re.findall(pattern,r.text)[0]
        return transferurl
    else :
        return True
def GetNewURL(url):
    r = requests.get(url)
    r.encoding='utf-8'
    pattern = re.compile('alert(.*)">')
    findurl=re.findall(pattern,r.text)
    findurl_str = (" ".join(findurl))
    return (findurl_str.split(' ',1)[0][2:])
def gettrueurl(url):
    if DealwithURL(url)==True:
        return url
    else :
        return GetNewURL(DealwithURL(url))
def SaveLocalUrl(untreatedurl,treatedurl):
    if untreatedurl == treatedurl :
        pass
    else :
        try:
            fileconf = open(r'/root/python-script/geturl/main.conf','r')
            rewritestr = ""
            for readline in fileconf:
                if re.search(untreatedurl,readline):
                    readline = re.sub(untreatedurl,treatedurl,readline)
                    rewritestr = rewritestr + readline
                else :
                    rewritestr = rewritestr + readline
            fileconf.close()
            fileconf = open(r'/root/python-script/geturl/main.conf','w')
            fileconf.write(rewritestr)
            fileconf.close()
        except:
            print ("get new url but open files ng write to logs")
def handler(signum,frame):
    raise AssertionError
def SaveDB(nowdate,tilename,downurl,savedburl):
    cursor = db.cursor()
    sql = "select count(nurl) from downurl where nurl = '%s';" % (downurl)
    cursor.execute(sql)
    data = cursor.fetchone()
    if '0' == str(data[0]):
        sql = "insert into downurl values ('%s','%s','%s','%s');" % (nowdate,tilename,savedburl,downurl)
        try:
            cursor.execute(sql)
            db.commit()
        except:
            db.rollback()
def WriteLocalDownloadURL(downfile,listfile):
    urlfile = open(downfile,'a+')
    for line in listfile :
        urlfile.write(line+'\n')
    urlfile.close()
def GetDownloadURL(sourceurl,titleurl,titlename,update_file,headers,enabledb):
    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))
    r = requests.get((sourceurl+downurlstr))
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')
    downurled2k = re.findall(pattern,r.text)
    downurled2kstr = (" ".join(downurled2k))
    print (downnamestr , downurled2kstr)
    if 1 == enabledb :
        savedburl = sourceurl + downurlstr
        SaveDB(time.strftime('%Y-%m-%d %H:%M',time.localtime(time.time())),titlename[0],downurled2kstr,savedburl)
    returnstr = titlename[0]+" "+downurled2kstr
    return returnstr
def ReadLocalFiles() :
    returndict={}
    linenumber = 0
    localfiles = open(r'/root/python-script/geturl/main.conf')
    readfilelines = localfiles.readlines()
    for line in readfilelines :
        linenumber = linenumber + 1
        line = line.rstrip()
        if line.startswith('#') :
            continue
        elif len(line) == 0 :
            continue
        else:
            try:
                returndict[line.split('=')[0]] = line.split('=')[1]
            except:
                print ("line %d %s error,please check" % (linenumber,line))
                sys.exit(-1)
    return returndict
def GetListURLinfo(sourceurl , title , getpagenumber , total,update_file,headers,enablewritefile,enabledb):
    returnwriteurl = []
    if total >= 100:
        total = 100
    if total <= 1:
        total = 2
    getpagenumber = total
    for number in range(0,total) :
        try:
            signal.signal(signal.SIGALRM,handler)
            signal.alarm(3)
            url = sourceurl + title + '-' + str(random.randint(1,getpagenumber)) + '.html'
            r = requests.get(url)
            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            r.encoding = chardet.detect(r.content)['encoding']
            allurl = re.findall(pattern,r.text)
            for lineurl in allurl:
                try:
                    signal.signal(signal.SIGALRM,handler)
                    signal.alarm(3)
                    pattern = re.compile('<a href="(.*)" title')
                    titleurl = re.findall(pattern,lineurl)
                    pattern = re.compile('title="(.*)" target=')
                    titlename = re.findall(pattern,lineurl)
                    returnwriteurl.append(GetDownloadURL(sourceurl,titleurl,titlename,update_file,headers,enabledb))
                    signal.alarm(0)
                except AssertionError:
                    print (lineurl,titlename , "Timeout Error: the cmd 10s have not finished")
                    continue
        except AssertionError:
            print ("GetlistURL Error Continue")
            continue
    if 1 == enablewritefile:
        WriteLocalDownloadURL(update_file,returnwriteurl)
def GetTitleInfo(url,down_page,update_file,headers,savelocalfileenable,savedbenable):
    title = '/list/'+str(random.randint(1,8))
    titleurl = url + title + '.html'
    r = requests.get(titleurl)
    r.encoding = chardet.detect(r.content)['encoding']
    pattern = re.compile(' 当前:.*/(.*)页 ')
    getpagenumber = re.findall(pattern,r.text)
    getpagenumber = (" ".join(getpagenumber))
    GetListURLinfo(url , title , getpagenumber , down_page,update_file,headers,int(savelocalfileenable),int(savedbenable))
def write_logs(time,logs):
    loginfo = str(time)+logs
    try:
        logfile = open(r'logs','a+')
        logfile.write(loginfo)
        logfile.close()
    except:
        print ("Write logs error,code:154")
def DeleteHisFiles(update_file):
    if os.path.isfile(update_file):
        try:
            download_files = open(update_file,'r+')
            download_files.truncate()
            download_files.close()
        except:
            print ("Delete " + update_file + "Error --code:166")
    else :
        print ("Build New downfiles")
def main():
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome","Accept": "text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,*/*;q=0.8"}
    readconf = ReadLocalFiles()
    try:
        file_url = readconf['url']
        down_page = readconf['download_page']
        savelocalfileenable = readconf['savelocalfileenable']
        savedbenable = readconf['savedbenable']
        if 1 == int(savelocalfileenable) :
            update_file = readconf['download_local_files']
        else :
            # keep update_file defined even when local saving is disabled
            update_file = ''
        if 1 == int(savedbenable) :
            dbhost = readconf['dbhost']
            dbport = readconf['dbport']
            dbuser = readconf['dbuser']
            dbpasswd = readconf['dbpasswd']
            dbschema = readconf['dbschema']
            dbcharset = readconf['dbcharset']
    except:
        print ("Get local conf error,please check url or down_page conf")
        sys.exit(-1)
    if 1 == int(savelocalfileenable) :
        DeleteHisFiles(update_file)
    untreatedurl = file_url
    treatedurl = gettrueurl(untreatedurl)
    SaveLocalUrl(untreatedurl,treatedurl)
    url = treatedurl
    if 1 == int(savedbenable):
        global db
        # dbconnect ='''
        # host='localhost',port=3306,user='root',passwd='flzx3QC#',db='downpage',charset='utf8'
        # '''
        db = pymysql.connect(host='localhost',port=3306,user='root',passwd='flzx3QC#',db='downpage',charset='utf8')
        # print (dbconnect)
        # db = pymysql.connect(dbconnect)
    else:
        db = ''
    if 1 == int(savelocalfileenable) or 0 == int(savelocalfileenable):
        GetTitleInfo(url,int(down_page),update_file,headers,int(savelocalfileenable),int(savedbenable))
    if 1 == int(savedbenable):
        # only close the connection when one was actually opened
        db.close()
if __name__=="__main__":
    main()
root@Linux:~/python-script/geturl #
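Since main.conf is opened with an absolute path, the script is meant to run from cron. A hypothetical crontab entry (the schedule and log path are assumptions, not taken from the original setup) could look like:

# run the crawler every day at 02:30 -- hypothetical schedule
30 2 * * * /usr/bin/python3 /root/python-script/geturl/main.py >> /root/python-script/geturl/cron.log 2>&1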
End