下载1000次网页资源

1,普通循环方式下载1000次,非常慢

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import time
import urllib
import urllib2 total_times = 1000 def worker(url):
try:
f = urllib2.urlopen(url,timeout=10800)
body = f.read()
except:
print sys.exc_info()
return 0
return 1 if __name__ == "__main__": for i in range(total_times):
url = "http://web.kuaipan.cn/static/images/pc.png"
worker(url) #root:~/test # time ./c.py
#real 4m6.700s
#user 0m1.192s
#sys 0m1.736s

2,使用进程池下载,有点慢

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import time
import urllib
import urllib2
import multiprocessing total_times = 1000 def worker(url):
try:
f = urllib2.urlopen(url,timeout=10800)
body = f.read()
except:
print sys.exc_info()
return 0
return 1 if __name__ == "__main__": pool_size = multiprocessing.cpu_count() * 2
pool = multiprocessing.Pool(processes=pool_size) for i in range(total_times):
url = "http://web.kuaipan.cn/static/images/pc.png"
pool.apply_async(worker, (url,)) pool.close()
pool.join() #root:~/test # time ./pc.py
#real 1m43.668s
#user 0m1.480s
#sys 0m1.628s

3,使用twisted网络库,同样发起1000次请求,耗时减少为15s左右,性能提升很多,很快

#!/usr/bin/python
# Benchmark 3: 1000 concurrent requests through the Twisted reactor (~15s).

from sys import argv
from pprint import pformat
#from twisted.internet.task import react
from twisted.internet import reactor
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers

total_times = 1000
times = 0  # completed-request counter, bumped in cbShutdown


def cbRequest(response):
    """Response-received callback: start reading the body."""
    #print 'Response version:', response.version
    #print 'Response code:', response.code
    #print 'Response phrase:', response.phrase
    #print 'Response headers:'
    #print pformat(list(response.headers.getAllRawHeaders()))
    d = readBody(response)
    d.addCallback(cbBody)
    return d


def cbBody(body):
    """Body-received callback: discard the bytes (benchmark only)."""
    #print 'Response body:'
    #print body
    data = body


def cbShutdown(ignored):
    """Fires on success or failure; stop the reactor after the last request."""
    global times
    times = times + 1
    # Equivalent to the original `total_times - 1 < times`, but readable.
    if times >= total_times:
        reactor.stop()


def curl(url):
    """Issue one asynchronous GET for *url* and wire up the callbacks."""
    agent = Agent(reactor)
    d = agent.request(
        'GET', url,
        Headers({'User-Agent': ['Twisted Web Client Example']}),
        None)
    d.addCallback(cbRequest)
    d.addBoth(cbShutdown)
    return d


if __name__ == '__main__':
    for i in range(total_times):
        curl("http://web.kuaipan.cn/static/images/pc.png")
    reactor.run()

# root:~/test # time ./tc.py
# real 0m15.480s
# user 0m3.596s
# sys  0m0.720s

4,使用twisted网络库长连接,耗时也是很少,很快

#!/usr/bin/python
# Benchmark 4: 1000 requests over persistent (keep-alive) connections
# using Twisted's HTTPConnectionPool (~13s).

from sys import argv
from pprint import pformat
#from twisted.internet.task import react
from twisted.internet import reactor
from twisted.internet.defer import Deferred, DeferredList
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent, HTTPConnectionPool
from twisted.web.http_headers import Headers

total_times = 1000
times = 0  # completed-request counter, bumped in cbShutdown


class IgnoreBody(Protocol):
    """Body consumer that discards all data and fires *deferred* when done."""

    def __init__(self, deferred):
        self.deferred = deferred

    def dataReceived(self, bytes):
        pass

    def connectionLost(self, reason):
        self.deferred.callback(None)


def cbRequest(response):
    """Drain the response body so the connection can return to the pool."""
    #print 'Response code:', response.code
    finished = Deferred()
    response.deliverBody(IgnoreBody(finished))
    return finished


# One shared pool + agent so TCP connections are reused across requests —
# this is the whole point of the "long connection" benchmark.
pool = HTTPConnectionPool(reactor)
agent = Agent(reactor, pool=pool)


def requestGet(url):
    d = agent.request('GET', url)
    d.addCallback(cbRequest)
    return d


def cbShutdown(ignored):
    """Fires on success or failure; stop the reactor after the last request."""
    global times
    times = times + 1
    if times >= total_times:
        reactor.stop()


def curl(url):
    """Issue one GET for *url* through the shared pooled agent.

    BUG FIX: the original built a fresh ``Agent(reactor)`` here, which
    bypassed the connection pool entirely and made this benchmark
    identical to the non-pooled one.
    """
    d = agent.request(
        'GET', url,
        Headers({'User-Agent': ['Twisted Web Client Example']}),
        None)
    d.addCallback(cbRequest)
    d.addBoth(cbShutdown)
    return d


for i in range(total_times):
    curl("http://web.kuaipan.cn/static/images/pc.png")
reactor.run()

# root:~/test # time ./tpc.py
# real 0m12.817s
# user 0m3.508s
# sys  0m0.528s

更多twisted参考:https://twistedmatrix.com/documents/current/web/howto/client.html#auto4

golang使用循环下载方式,和python使用循环下载方式耗时差不多,4分钟时间,瓶颈应该在网络

package main

// Benchmark: sequential download of the same URL, repeated totaltimes times.

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// NOTE(review): the literal was lost when the article was scraped; the
// surrounding text says the benchmark runs 1000 downloads.
var totaltimes = 1000

// worker fetches url once and prints the body length; errors are ignored.
func worker(url string) {
	response, err := http.Get(url)
	if err != nil {
		return
	}
	defer response.Body.Close()
	body, _ := ioutil.ReadAll(response.Body)
	fmt.Println(len(body))
}

func main() {
	for i := 0; i < totaltimes; i++ {
		worker("http://web.kuaipan.cn/static/images/pc.png")
	}
}

//root:~/test # time ./got > goresult
//
//real 4m45.257s
//user 0m0.628s
//sys  0m0.632s

golang使用协程池方式模拟下载1000次,性能也要差很多(而且容易出现网络错误,最近出的go version go1.2rc4 linux/amd64要好一点 ,go1.1问题很多)

package main

// Benchmark: the same downloads distributed over a fixed pool of goroutines
// fed through an unbuffered channel.

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"sync"
)

// NOTE(review): both literals were lost when the article was scraped;
// totaltimes is stated as 1000 in the text, poolsize is a guess — confirm.
var totaltimes = 1000
var poolsize = 16

// worker drains linkChan, downloading each URL and printing the body length.
func worker(linkChan chan string, wg *sync.WaitGroup) {
	// Decreasing internal counter for wait-group as soon as goroutine finishes
	defer wg.Done()
	for url := range linkChan {
		response, err := http.Get(url)
		if err != nil {
			// BUG FIX: the original `return` killed the goroutine on the
			// first error; skip just this URL instead.
			continue
		}
		body, _ := ioutil.ReadAll(response.Body)
		// BUG FIX: `defer` inside the loop would hold every body open
		// until the worker exits; close each one as soon as it is read.
		response.Body.Close()
		fmt.Println(len(body))
		//fmt.Println("Resp code", response.StatusCode)
	}
}

func main() {
	lCh := make(chan string)
	wg := new(sync.WaitGroup)
	// Adding routines to workgroup and running them.
	for i := 0; i < poolsize; i++ {
		wg.Add(1)
		go worker(lCh, wg)
	}
	for i := 0; i < totaltimes; i++ {
		lCh <- "http://web.kuaipan.cn/static/images/pc.png"
	}
	close(lCh)
	// Waiting for all goroutines to finish (otherwise they die as main routine dies)
	wg.Wait()
}

//root:~/test # time ./gotest > goresult
//
//real 0m25.250s
//user 0m0.772s
//sys  0m0.380s

twisted支持定时器,我们可以用来动态添加任务

from twisted.web.client import getPage
from twisted.internet import reactor class Getter(object): def __init__(self):
self._sequence = 0
self._results = []
self._errors = [] def add(self, url):
d = getPage(url)
d.addCallbacks(self._on_success, self._on_error)
d.addCallback(self._on_finish)
self._sequence += 1 def _on_finish(self, *narg):
self._sequence -= 1
print len(self._results), len(self._errors)
# if not self._sequence:
# reactor.stop() _on_success = lambda self, *res: self._results.append(res)
_on_error = lambda self, *err: self._errors.append(err) def run(self):
reactor.run()
return self._results, self._errors def jobtimer():
for url in ('http://www.google.com', 'http://www.yahoo.com', 'http://www.baidu.com'):
g.add(url)
reactor.callLater(1,jobtimer) reactor.callLater(2,jobtimer) #定时添加任务
g = Getter()
results, errors = g.run() #print len(results)
#print len(errors)

使用python网络库下载的更多相关文章

  1. 基于协程的Python网络库gevent

    import gevent def test1(): print 12 gevent.sleep(0) print 34 def test2(): print 56 gevent.sleep(0) p ...

  2. Python网络爬虫 - 下载图片

    下载博客园的logo from urllib.request import urlretrieve from urllib.request import urlopen from bs4 import ...

  3. python 第三方库下载

    C:\Python27\Scripts 路径下: easy_install.exe: C:\Python27\Scripts>easy_install.exe pycrypto pip.exe: ...

  4. python 第三方库下载地址

    http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml

  5. python基于协程的网络库gevent、eventlet

    python网络库也有了基于协程的实现,比较著名的是 gevent.eventlet 它两之间的关系可以参照 Comparing gevent to eventlet, 本文主要简单介绍一下event ...

  6. python常用库

    本文由 伯乐在线 - 艾凌风 翻译,Namco 校稿.未经许可,禁止转载!英文出处:vinta.欢迎加入翻译组. Awesome Python ,这又是一个 Awesome XXX 系列的资源整理,由 ...

  7. 156个Python网络爬虫资源

    本列表包含Python网页抓取和数据处理相关的库. 网络相关 通用 urllib - 网络库(标准库) requests - 网络库 grab - 网络库(基于pycurl) pycurl - 网络库 ...

  8. Python常用库大全

    环境管理 管理 Python 版本和环境的工具 p – 非常简单的交互式 python 版本管理工具. pyenv – 简单的 Python 版本管理工具. Vex – 可以在虚拟环境中执行命令. v ...

  9. python的库小全

    环境管理 管理 Python 版本和环境的工具 p – 非常简单的交互式 python 版本管理工具. pyenv – 简单的 Python 版本管理工具. Vex – 可以在虚拟环境中执行命令. v ...

随机推荐

  1. 浅谈 non-blocking I/O Multiplexing + poll/epoll 的正确使用

    在前面的文章中曾经粗略讲过poll,那时是用阻塞IO实现,在发送和接收数据量都较小情况下和网络状况良好的情况下是基本没有问题的,read 不会只接收部分数据,write 也不会一直阻塞.但实际上pol ...

  2. bzoj 1303: [CQOI2009]中位数图

    题目链接 给n个数,一个值b, 统计所有以b为中位数的序列的个数.序列长度为奇数.数字在1-n之间, 每个数只出现一次. 如果一个数大于b, 那么将他赋值为1, 小于b赋值为-1, 记录数组中b出现的 ...

  3. poj 1091 跳骚

    /** 题意: 求对于小于m的n个数, 求x1*a1 + x2*a2+x3*a3........+xn*an = 1 即求 a1,a2,a3,....an 的最大公约数为1 , a1,a2....an ...

  4. mac定时任务

    <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" ...

  5. 关注SSO

    https://wiki.jasig.org/display/CASC/Configuring+the+Jasig+CAS+Client+for+Java+in+the+web.xml 其余的看osc ...

  6. HelloX项目github协同开发指南

    概述 为了提高协同开发效率,HelloX项目已托管到github网站上.根据目前的开发进展,创建了下列几个子项目: HelloX操作系统内核项目:https://github.com/hellox-p ...

  7. Azure Traffic Manager 现可与 Azure 网站集成!

     编辑人员注释:本文章由 WindowsAzure 网站团队高级专家级工程师 Jim Cheshire撰写. AzureTraffic Manager 已经推出有一段时间,这是一种跨多个区域管理网 ...

  8. HDU 4416 Good Article Good sentence(后缀自动机)

    [题目链接] http://acm.hdu.edu.cn/showproblem.php?pid=4416 [题目大意] 给出一个字符串,然后,给出一个字符串集合,问在该字符串中出现,且不在字符串集合 ...

  9. Java实现HTML转PDF的总结

    Java实现HTML转PDF的几种方法—主要解决中文乱码问题 第一:同事在做HTML转PDF中遇到乱码问题 ********************************************** ...

  10. 百度统计和CNZZ到底有哪些不同?

    作为互联网门外汉,同时作为不专业的站长,我总是有很多问题想知道,比如百度统计和CNZZ在数据统计上总是有差异,就好像王婆卖瓜,自卖自夸,又好像公说公有理,婆说婆有理. 作为专业的不专业的市场人员,好奇 ...