for循环和多线程 + selenium

实例一

for循环

# -*- coding: utf-8 -*-

"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description: Collect article links from paginated listing pages with Selenium,
processing the configured sections one after another in a plain for loop.
"""
import functools
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Shared Chrome options used whenever a headless driver is requested.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')


def timeit(func):
    """
    Decorator that prints how long each call of ``func`` took.

    Durations under 60 seconds are printed in seconds, longer ones as
    minutes plus seconds.

    :param func: callable to time
    :return: wrapper that returns whatever ``func`` returns
    """

    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        elapsed = time.time() - start
        if elapsed < 60:
            print(f'花费时间:\t{round(elapsed, 2)}秒')
        else:
            # Named minutes/seconds so the builtin ``min`` is not shadowed.
            minutes, seconds = divmod(elapsed, 60)
            print(f'花费时间\t{round(minutes)}分\t{round(seconds, 2)}秒')
        return ret

    return inner


class PolicyUrlDownload(object):
    """Collect the ``href`` of every link matched by an XPath on a series of pages."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        """
        :param url: URL template containing one ``{}`` placeholder for the page number
        :param pages_num: number of pages to visit; pages are numbered from 1
        :param output_file: text file that the collected links are appended to
        :param a_xpath: XPath selecting the ``<a>`` elements whose ``href`` is saved
        :param headless: start Chrome with the shared headless options when True
        """
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        """
        Load one listing page and append ``<page>\\t<href>`` lines to the output file.

        :param page: 1-based page number written in front of every link
        :param url: address of the listing page
        """
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # find_elements_by_xpath was removed in Selenium 4.3; this form
            # works on both Selenium 3 and 4.
            titles = self.driver.find_elements(By.XPATH, self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        """Visit every page in order, then always shut the browser down."""
        try:
            for page, url in enumerate(self.url_list, start=1):
                self.start(page, url)
        finally:
            # quit() ends the whole driver session; close() only closes the
            # current window and was skipped entirely when a page raised.
            self.driver.quit()


@timeit
def main(setting):
    """
    Run one download job.

    :param setting: dict of keyword arguments for ``PolicyUrlDownload``
    """
    policy_data = PolicyUrlDownload(**setting)
    policy_data.run()


if __name__ == '__main__':
    start_time = time.time()
    print('######################## 开始下载 #########################')

    # One configuration per site section; both append to the same output file.
    settings = [
        {
            'output_file': '药品供应保障综合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': '药品供应保障综合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        }
    ]

    for setting in settings:
        main(setting)

    print('下载成功, 共花费时间 ', round(time.time() - start_time, 2), '秒')

结果

下载成功, 共花费时间  28.46 秒

多线程

# -*- coding: utf-8 -*-

"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description: Collect article links from paginated listing pages with Selenium,
running each configured section in its own thread with its own driver.
"""
import functools
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Shared Chrome options used whenever a headless driver is requested.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')


def timeit(func):
    """
    Decorator that prints how long each call of ``func`` took.

    Durations under 60 seconds are printed in seconds, longer ones as
    minutes plus seconds.

    :param func: callable to time
    :return: wrapper that returns whatever ``func`` returns
    """

    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        elapsed = time.time() - start
        if elapsed < 60:
            print(f'花费时间:\t{round(elapsed, 2)}秒')
        else:
            # Named minutes/seconds so the builtin ``min`` is not shadowed.
            minutes, seconds = divmod(elapsed, 60)
            print(f'花费时间\t{round(minutes)}分\t{round(seconds, 2)}秒')
        return ret

    return inner


class PolicyUrlDownload(object):
    """Collect the ``href`` of every link matched by an XPath on a series of pages."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        """
        :param url: URL template containing one ``{}`` placeholder for the page number
        :param pages_num: number of pages to visit; pages are numbered from 1
        :param output_file: text file that the collected links are appended to
        :param a_xpath: XPath selecting the ``<a>`` elements whose ``href`` is saved
        :param headless: start Chrome with the shared headless options when True
        """
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        """
        Load one listing page and append ``<page>\\t<href>`` lines to the output file.

        :param page: 1-based page number written in front of every link
        :param url: address of the listing page
        """
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # find_elements_by_xpath was removed in Selenium 4.3; this form
            # works on both Selenium 3 and 4.
            titles = self.driver.find_elements(By.XPATH, self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        """Visit every page in order, then always shut the browser down."""
        try:
            for page, url in enumerate(self.url_list, start=1):
                self.start(page, url)
        finally:
            # quit() ends the whole driver session; close() only closes the
            # current window and was skipped entirely when a page raised.
            self.driver.quit()


@timeit
def main(setting):
    """
    Run one download job; each call builds its own driver.

    :param setting: dict of keyword arguments for ``PolicyUrlDownload``
    """
    policy_data = PolicyUrlDownload(**setting)
    policy_data.run()


if __name__ == '__main__':
    start_time = time.time()
    print('######################## 开始下载 #########################')

    # One configuration per site section. Both append to the same output file,
    # so the relative order of lines from the two sections is not fixed.
    settings = [
        {
            'output_file': '药品供应保障综合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': '药品供应保障综合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        }
    ]

    with ThreadPoolExecutor() as pool:
        # Consuming the iterator re-raises any exception from a worker;
        # a bare pool.map(...) call would drop it silently.
        list(pool.map(main, settings))

    print('下载成功, 共花费时间 ', round(time.time() - start_time, 2), '秒')

结果

花费时间:      18.04秒

实例二

顺序执行

# -*- coding: utf-8 -*-
"""Download the pages listed in a link file one by one with a single Selenium driver."""
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import numpy as np


def _url_to_filename(url):
    """Return the file name a URL is stored under: md5 of the URL plus ``.html``."""
    return md5(url.encode('utf-8')).hexdigest() + '.html'


def _read_urls(lines):
    """Return the second tab-separated field of every line that has one."""
    urls = []
    for line in lines:
        fields = line.strip().split('\t')
        # Blank or malformed lines have no second column; skip them
        # instead of failing with IndexError.
        if len(fields) > 1:
            urls.append(fields[1])
    return urls


class PolicyPageDownload(object):
    """Save the rendered HTML of a list of URLs into a directory."""

    def __init__(self, file, dir_name, url_list):
        """
        :param file: path of the link file the URLs came from (stored only)
        :param dir_name: directory the ``.html`` files are written to
        :param url_list: URLs this instance downloads
        """
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def start(self, url):
        """
        Load ``url`` and write its page source to ``<dir_name>/<md5(url)>.html``.

        :param url: page address
        :return: None
        """
        self.driver.get(url)
        response = self.driver.page_source
        print(f'make request to {url}')
        target = os.path.join(self.dir_name, _url_to_filename(url))
        with open(target, 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Download every URL in order, then always shut the browser down."""
        try:
            for url in self.urls:
                self.start(url)
        finally:
            # Without the finally, a failing page left Chrome running.
            self.driver.quit()


def filter_urls(dir_name, urls):
    """
    Drop URLs that already have a non-empty file in ``dir_name``.

    :param dir_name: download directory
    :param urls: candidate URLs
    :return: URLs still to download, in input order, without duplicates
    """
    has_file = {
        name for name in os.listdir(dir_name)
        if os.path.getsize(os.path.join(dir_name, name)) > 0
    }
    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # the same page is not fetched (and overwritten) twice.
    name_by_url = {url: _url_to_filename(url) for url in dict.fromkeys(urls)}
    encode_urls = set(name_by_url.values()) - has_file
    down_urls = [url for url, name in name_by_url.items() if name in encode_urls]
    print(f'共{len(set(urls))}\t已下载{len(set(has_file))}\t 还需下载{len(encode_urls)}')
    return down_urls


def run(url_list, file=None, dir_name=None):
    """
    Download ``url_list`` with a fresh ``PolicyPageDownload``.

    :param url_list: URLs to download
    :param file: link file path; falls back to the module-level ``setting``
    :param dir_name: output directory; falls back to the module-level ``setting``
    """
    # Earlier callers passed only url_list and relied on the global
    # ``setting`` dict; keep that working when the arguments are omitted.
    if file is None:
        file = setting['file']
    if dir_name is None:
        dir_name = setting['dir_name']
    policy = PolicyPageDownload(url_list=url_list, file=file, dir_name=dir_name)
    policy.run()


def main(file, dir_name):
    """
    Read URLs from ``file`` and download those not yet present in ``dir_name``.

    :param file: tab-separated link file; the URL is the second column
    :param dir_name: output directory, created when missing
    """
    os.makedirs(dir_name, exist_ok=True)
    with open(file, 'r', encoding='utf-8') as inputfile:
        urls = _read_urls(inputfile)
    urls = filter_urls(dir_name, urls)
    if not urls:
        # Nothing left to fetch: do not start a browser at all.
        return
    run(urls, file=file, dir_name=dir_name)


if __name__ == '__main__':
    start_time = time.time()
    setting = {
        'file': '药品供应保障综合的管理.txt',
        'dir_name': '药品供应保障综合的管理'
    }
    main(**setting)

    print('下载成功, 共花费时间 ', round(time.time() - start_time, 2), '秒')

多线程

# -*- coding: utf-8 -*-
"""Download the pages listed in a link file with several threads, one Selenium driver per thread."""
import functools
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import numpy as np

# Upper bound on the number of URL chunks, i.e. worker threads and drivers.
WORKER_COUNT = 4


def _url_to_filename(url):
    """Return the file name a URL is stored under: md5 of the URL plus ``.html``."""
    return md5(url.encode('utf-8')).hexdigest() + '.html'


def _read_urls(lines):
    """Return the second tab-separated field of every line that has one."""
    urls = []
    for line in lines:
        fields = line.strip().split('\t')
        # Blank or malformed lines have no second column; skip them
        # instead of failing with IndexError.
        if len(fields) > 1:
            urls.append(fields[1])
    return urls


class PolicyPageDownload(object):
    """Save the rendered HTML of a list of URLs into a directory."""

    def __init__(self, file, dir_name, url_list):
        """
        :param file: path of the link file the URLs came from (stored only)
        :param dir_name: directory the ``.html`` files are written to
        :param url_list: URLs this instance downloads
        """
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def start(self, url):
        """
        Load ``url`` and write its page source to ``<dir_name>/<md5(url)>.html``.

        :param url: page address
        :return: None
        """
        self.driver.get(url)
        response = self.driver.page_source
        print(f'make request to {url}')
        target = os.path.join(self.dir_name, _url_to_filename(url))
        with open(target, 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Download every URL in order, then always shut the browser down."""
        try:
            for url in self.urls:
                self.start(url)
        finally:
            # Without the finally, a failing page left Chrome running.
            self.driver.quit()


def filter_urls(dir_name, urls):
    """
    Drop URLs that already have a non-empty file in ``dir_name``.

    :param dir_name: download directory
    :param urls: candidate URLs
    :return: URLs still to download, in input order, without duplicates
    """
    has_file = {
        name for name in os.listdir(dir_name)
        if os.path.getsize(os.path.join(dir_name, name)) > 0
    }
    # dict.fromkeys removes duplicates while keeping first-seen order, so two
    # threads never fetch the same page and write the same file.
    name_by_url = {url: _url_to_filename(url) for url in dict.fromkeys(urls)}
    encode_urls = set(name_by_url.values()) - has_file
    down_urls = [url for url, name in name_by_url.items() if name in encode_urls]
    print(f'共{len(set(urls))}\t已下载{len(set(has_file))}\t 还需下载{len(encode_urls)}')
    return down_urls


def run(url_list, file=None, dir_name=None):
    """
    Download ``url_list`` with a fresh ``PolicyPageDownload`` (one driver per call).

    :param url_list: URLs to download
    :param file: link file path; falls back to the module-level ``setting``
    :param dir_name: output directory; falls back to the module-level ``setting``
    """
    # Earlier callers passed only url_list and relied on the global
    # ``setting`` dict; keep that working when the arguments are omitted.
    if file is None:
        file = setting['file']
    if dir_name is None:
        dir_name = setting['dir_name']
    policy = PolicyPageDownload(url_list=url_list, file=file, dir_name=dir_name)
    policy.run()


def main(file, dir_name):
    """
    Read URLs from ``file`` and download the missing ones in parallel.

    :param file: tab-separated link file; the URL is the second column
    :param dir_name: output directory, created when missing
    """
    os.makedirs(dir_name, exist_ok=True)
    with open(file, 'r', encoding='utf-8') as inputfile:
        urls = _read_urls(inputfile)
    urls = filter_urls(dir_name, urls)
    if not urls:
        # Nothing left to fetch: do not start any browser.
        return

    # array_split yields empty arrays when there are fewer URLs than chunks;
    # drop those so no driver is started for nothing, and convert back to
    # plain lists of str.
    chunks = [
        chunk.tolist()
        for chunk in np.array_split(urls, WORKER_COUNT)
        if chunk.size
    ]
    worker = functools.partial(run, file=file, dir_name=dir_name)
    with ThreadPoolExecutor(max_workers=len(chunks)) as pool:
        # Consuming the iterator re-raises any exception from a worker;
        # a bare pool.map(...) call would drop it silently.
        list(pool.map(worker, chunks))


if __name__ == '__main__':
    start_time = time.time()
    setting = {
        'file': '药品供应保障综合的管理.txt',
        'dir_name': '药品供应保障综合的管理'
    }
    main(**setting)

    print('下载成功, 共花费时间 ', round(time.time() - start_time, 2), '秒')

运行结果

#  50 for循环: 下载成功, 共花费时间  48.62 秒
# 150 for循环: 共花费时间 150.22 秒
# 150 多线程: 共花费时间 80.84 秒
  • 结论: 创建 driver 的开销较大,应尽量只创建一次并重复使用;并发时多个线程不能共用同一个 driver,每个线程必须各自创建。
  • 使用技巧总结: 创建多个线程(个数最好与 CPU 核数相同),每个线程各自创建一个 driver。

  

selenium实现并发的更多相关文章

  1. 使用jenkins pipeline,并发selenium测试 --- 你值得了解

    一.契机 相信很多使用selenium进行UI测试,再对接jenkins时,都是简单的在jenkins上将命令输入就完事了. 但是,相信你一定会遇到以下问题: 1.你需要同时跑不同文件或不同类的用例, ...

  2. selenium 并发执行测试用例

    转帖: 要想多线程并发的运行WebDriver,必须同时满足2个条件,首先你的测试程序是多线程,其次需要用到Selenium Server(selenium-server-standalone-XXX ...

  3. Selenium & Webdriver 远程测试和多线程并发测试

    Selenium & Webdriver 远程测试和多线程并发测试 Selenium Webdriver自动化测试,初学者可以使用selenium ide录制脚本,然后生成java程序导入ec ...

  4. selenium从入门到应用 - 8,selenium+testNG实现多线程的并发测试

    本系列所有代码 https://github.com/zhangting85/simpleWebtest本文将介绍一个Java+TestNG+Maven+Selenium的web自动化测试脚本环境下s ...

  5. Selenium 2 & WebDriver &多线程 并发

    我用的是Selenium2,至于它的背景和历史就不赘述了.Selenium2也叫WebDriver.下面讲个例子,用WebDriver+java来写个自动化测试的程序.(如果能用firefox去测试的 ...

  6. selenium grid解决多台电脑进行并发执行测试脚本

    1 两台计算机,一台计算机既做HUB,又做Node 机器A设置HUB的步骤: 1 运行---输入cmd 2 输入: cd c:/ 3  输入: java -jar selenium-server-st ...

  7. selenium 常见面试题以及答案(Java版)

    1.怎么 判断元素是否存在? 判断元素是否存在和是否出现不同, 判断是否存在意味着如果这个元素压根就不存在, 就会抛出NoSuchElementException 这样就可以使用try catch,如 ...

  8. 搭建selenium grid简单配置

    1.使用selenium提供的服务端独立jar包 :服务端.客户端都是运行于java7环境. 2.启动hub: hub配置文件如下: Java -jar selenium-server-standal ...

  9. 关于selenium的CI、框架……

    这段时间除了项目测试外,主要在做web自动化的事情,大致总结一下吧,总体的设计模式pageobject+pagefactory+testng的数据驱动,项目用maven来构建,使用jenkins集成, ...

随机推荐

  1. 【Xcode】sh: pause: command not found

    system("pause"); 只适合于DOS和Windows系统,不适合Linux系统. 直接删掉就可以. 或者改为: #include <unistd.h> pa ...

  2. 代码仓库gogs的基本配置使用

    目录 一.基本功能介绍 主板说明 页面说明 用户设置 二.仓库 新建仓库 迁移仓库 仓库介绍 三.组织和团队 创建新组织 创建团队 一.基本功能介绍 主板说明 图中1表示自己个人账户下的仓库(所有权属 ...

  3. .NET 云原生架构师训练营(建立系统观)--学习笔记

    目录 目标 ASP .NET Core 什么是系统 什么是系统思维 系统分解 什么是复杂系统 作业 目标 通过整体定义去认识系统 通过分解去简化对系统的认识 ASP .NET Core ASP .NE ...

  4. VirtualBox 同时添加 NAT 和 Host-Only 网卡出现无法上网的情况

    如果网卡1是 NAT,网卡2是 Host-Only,可以 ping 通 baidu.com. 如果网卡1是 Host-Only,网卡2是 NAT,无法 ping 通 baidu.com. 使用 nmc ...

  5. UVA11951 Area 题解

    Content 小 S 想买下一块地.他所在的城市可以看成一个 \(n\times m\) 的网格,要购买所处在 \((i,j)\) 的网格需要缴税 \(c_{i,j}\) 元,如果一块地里面有多个网 ...

  6. generating project in interactive mode

    解决方案:加个参数 -DarchetypeCatalog=internal 让它不要从远程服务器上取catalog

  7. git clone报错: Out of memory, malloc failed (tried to allocate 524288000 bytes)

    IDEA 拉取项目报错:Out of memory, malloc failed (tried to allocate 524288000 bytes) 执行 git config --global ...

  8. Android 控件使用教程(二)—— RecyclerView 展示图片

    简介 在上一篇博文中,介绍了大家已经很熟悉的布局控件ListView,在这篇文章中,我将使用比较新.功能也更强大的RecyclerView. RecyclerView 首先,要用这个控件,你需要在gr ...

  9. 【九度OJ】题目1054:字符串内排序 解题报告

    [九度OJ]题目1054:字符串内排序 解题报告 标签(空格分隔): 九度OJ [LeetCode] http://ac.jobdu.com/problem.php?pid=1054 题目描述: 输入 ...

  10. 【LeetCode】205. Isomorphic Strings 解题报告(Java & Python)

    作者: 负雪明烛 id: fuxuemingzhu 个人博客: http://fuxuemingzhu.cn/ 目录 题目描述 题目大意 解题方法 字典保存位置 字典保存映射 日期 题目地址:http ...