# -*- coding: utf-8 -*-
"""
Some helper functions by metaphy, 2007-04-03, copyleft
version 0.2
"""
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
    """Check whether a URL exists via an HTTP HEAD request (adapted from others)."""
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:    # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                     # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

def gGetHtmlLines(url):
    """Fetch the HTML source of url and return it as a list of lines."""
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except:
        print "gGetHtmlLines() error!"
        return
"""get html src,return string"""
def gGetHtml(url):
if url==None : return
if not httpExists(url): return
try:
page = urllib.urlopen(url)
html = page.read()
page.close()
return html
except:
print "gGetHtml() error!"
return """根据url获取文件名"""
def gGetFileName(url):
    if url is None: return None
    if url == "": return ""
    arr = url.split("/")
    return arr[-1]

def gRandFilename(type):
    """Generate a random file name ending in the given extension (type)."""
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))  # random uppercase letter
        fname = fname + chr(random.randint(48, 57))  # random digit
    return fname + '.' + type

def gGetAbslLink(url, link):
    """Given a page url and a link found on it, return the link's absolute address."""
    if url is None or link is None: return
    if url == '' or link == '': return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

def gGetRegList(linesList, regx):
    """Match regx against each line in linesList and return the captured groups as a list."""
    if linesList is None: return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList
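
# A quick worked example of gGetRegList (hypothetical input, left as a comment):
#   gGetRegList(['<img src="a/b.jpg">'], r'src\s*="?(\S+)\.jpg')  ->  ['a/b']
# i.e. the regex captures whatever sits between src=" and the .jpg suffix.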
"""根据url下载文件,文件名参数指定"""
def gDownloadWithFilename(url,savePath,file):
#参数检查,现忽略
try:
urlopen=urllib.URLopener()
fp = urlopen.open(url)
data = fp.read()
fp.close()
file=open(savePath + file,'w+b')
file.write(data)
file.close()
except IOError:
print "download error!"+ url """根据url下载文件,文件名自动从url获取"""
def gDownload(url,savePath):
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download every jpg referenced on the page at downloadUrl."""
    lines = gGetHtmlLines(downloadUrl)
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)
    if lists is None: return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        ### print gGetFileName(jpg)

def gGetHttpAddr(url):
    """Return the site root (scheme + host) of a URL."""
    if url == '': return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
    """Return the parent directory address of a URL."""
    if url == '': return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

def gGetHttpAddrFatherAssign(url, link):
    """Resolve a '..'-style link against url and return its absolute address."""
    if url == '': return ''
    if link == '': return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    numOfFather = 0  # number of parent levels to climb
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink
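
# Worked example (hypothetical values, left as a comment):
#   gGetHttpAddrFatherAssign('http://example.com/a/b/page.htm', '../img/x')
#   -> numOfFather = 1, partLink = '/img/x', partUrl = 'http://example.com/a'
#   -> returns 'http://example.com/a/img/x'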

def gGetHtmlLink(url):
    """Collect the htm/html links found on the page at url and return them as a list."""
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    if lines is None: return rtnList
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

def gDownloadAllJpg(url, savePath):
    """Download the jpgs on the page at url and on the htm pages it links to."""
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # also grab the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

def test():
    """Simple test run."""
    u = 'http://www.gs.xinhuanet.com/news/2007-01/31/content_9188207_1.htm'
    save = 'd:/tmp/'
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

test()
