90. Distributed training with TensorFlow: multiple machines, multiple GPUs, asynchronous training
'''
Created on 2017-05-28  @author: weizhen
'''
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import mnist_inference

BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.01
TRAINING_STEPS = 1000
LEARNING_RATE_DECAY = 0.99
REGULARAZTION_RATE = 0.0001
# Path where the model is saved
MODEL_SAVE_PATH = "/path/to/model"
# Path to the MNIST data
DATA_PATH = "/path/to/data"

# The run is configured through flags. Writing a separate program for every task would
# not scale, so this section uses command-line arguments supplied at launch time to
# configure the program that runs in each task.
FLAGS = tf.app.flags.FLAGS
# Specify whether the current process is a parameter server or a worker. A parameter
# server is only responsible for maintaining and managing the TensorFlow variables;
# a worker runs the backpropagation step of every training iteration.
tf.app.flags.DEFINE_string('job_name', 'worker', '"ps" or "worker"')
# Specify the parameter-server addresses in the cluster
tf.app.flags.DEFINE_string(
    'ps_hosts', 'tf-ps0:2222,tf-ps1:1111',
    'Comma-separated list of hostname:port for the parameter server jobs. '
    'e.g. "tf-ps0:2222,tf-ps1:1111"')
# Specify the worker addresses in the cluster
tf.app.flags.DEFINE_string(
    'worker_hosts', 'tf-worker0:2222,tf-worker1:1111',
    'Comma-separated list of hostname:port for the worker jobs. '
    'e.g. "tf-worker0:2222,tf-worker1:1111"')
# Specify the task ID of the current process. TensorFlow starts the service on the port
# taken from the parameter-server/worker address list. Note that both parameter servers
# and workers are numbered starting from 0.
tf.app.flags.DEFINE_integer(
    'task_id', 0, 'Task ID of the worker/replica running the training.')


# Build the TensorFlow computation graph and return the operations that have to run in
# every training iteration. To keep the distributed part of the code prominent, this
# section wraps the model definition in a function.
def build_model(x, y_, is_chief):
    regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
    # Compute the forward pass of the network
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)
    # Compute the loss and define the backpropagation step
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)
    # Define the operation that runs in every iteration
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    return global_step, loss, train_op


# Main routine for training the distributed deep-learning model
def main(argv=None):
    # Parse the flags and configure the TensorFlow cluster through tf.train.ClusterSpec
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create a Server from the ClusterSpec and the current task
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id)

    # A parameter server only manages the TensorFlow variables and never runs the
    # training loop; server.join() blocks on this statement indefinitely.
    if FLAGS.job_name == 'ps':
        server.join()

    # Define the operations run by the workers. One of the workers is the chief worker:
    # besides computing backpropagation, it also writes summaries and saves the model.
    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH, one_hot=True)

    # tf.train.replica_device_setter chooses the device that executes each operation.
    # It automatically places all parameters on the parameter servers and places the
    # computation on the current worker.
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_id, cluster=cluster)):
        x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')
        # Define the operations needed to train the model
        global_step, loss, train_op = build_model(x, y_, is_chief)
        # Define the saver used to save the model
        saver = tf.train.Saver()
        # Define the summary (logging) operation
        summary_op = tf.summary.merge_all()
        # Define the variable-initialization operation
        init_op = tf.global_variables_initializer()
        # tf.train.Supervisor manages the common bookkeeping of training a deep-learning
        # model: queue operations, checkpointing, summary writing, and session creation.
        sv = tf.train.Supervisor(
            is_chief=is_chief,        # whether this worker is the chief; only the chief saves the model and writes summaries
            logdir=MODEL_SAVE_PATH,   # directory for checkpoints and summaries
            init_op=init_op,          # initialization operation
            summary_op=summary_op,    # summary operation
            saver=saver,              # saver used to save the model
            global_step=global_step,  # current iteration count, used in the checkpoint file names
            save_model_secs=60,       # interval in seconds between checkpoints
            save_summaries_secs=60)   # interval in seconds between summary writes
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        # Create the session through tf.train.Supervisor; non-chief workers wait here
        # until the chief has initialized the variables.
        sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
        step = 0
        start_time = time.time()

        # Run the training loop. tf.train.Supervisor writes the summaries and saves the
        # model during the loop, so those steps do not have to be called explicitly.
        while not sv.should_stop():
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, global_step_value = sess.run(
                [train_op, loss, global_step], feed_dict={x: xs, y_: ys})
            if global_step_value >= TRAINING_STEPS:
                break
            # Print training information every so often
            if step > 0 and step % 100 == 0:
                duration = time.time() - start_time
                # Every worker advances the shared global step, so global_step_value
                # directly gives the total number of batches consumed so far.
                sec_per_batch = duration / global_step_value
                format_str = ("After %d training steps (%d global steps), "
                              "loss on training batch is %g. (%.3f sec/batch)")
                print(format_str % (step, global_step_value, loss_value, sec_per_batch))
            step += 1
    sv.stop()


if __name__ == "__main__":
    tf.app.run()
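
For reference, the cluster described by the default flags has two parameter servers and two workers, so four processes have to be started, one per machine (or one per terminal when testing locally). A sketch of the launch commands, assuming the script above is saved as dist_train.py (the file name is an assumption; the flags are the ones defined in the script and the hostnames must resolve on your network):

python dist_train.py --job_name=ps --task_id=0 --ps_hosts=tf-ps0:2222,tf-ps1:1111 --worker_hosts=tf-worker0:2222,tf-worker1:1111
python dist_train.py --job_name=ps --task_id=1 --ps_hosts=tf-ps0:2222,tf-ps1:1111 --worker_hosts=tf-worker0:2222,tf-worker1:1111
python dist_train.py --job_name=worker --task_id=0 --ps_hosts=tf-ps0:2222,tf-ps1:1111 --worker_hosts=tf-worker0:2222,tf-worker1:1111
python dist_train.py --job_name=worker --task_id=1 --ps_hosts=tf-ps0:2222,tf-ps1:1111 --worker_hosts=tf-worker0:2222,tf-worker1:1111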
Below is the output of a training run. Training does not actually begin until every machine in the cluster has been brought up. Because each worker applies its gradient updates independently of the others (no cross-replica synchronization is used), this setup trains asynchronously, as the title says.
2017-05-28 22:38:45.122523: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.122960: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.123285: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.123847: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.124201: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.125153: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.125514: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.126016: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:47.211250: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:887] Found device 0 with properties:
name: GeForce 940MX
major: 5 minor: 0 memoryClockRate (GHz) 1.189
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
2017-05-28 22:38:47.211668: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:908] DMA: 0
2017-05-28 22:38:47.211848: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:918] 0: Y
2017-05-28 22:38:47.212045: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce 940MX, pci bus id: 0000:01:00.0)
2017-05-28 22:38:47.375428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job ps -> {0 -> tf-ps0:2222, 1 -> tf-ps1:1111}
2017-05-28 22:38:47.376363: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222, 1 -> tf-worker1:1111}
2017-05-28 22:38:47.380830: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_server_lib.cc:240] Started server with target: grpc://localhost:2222
Extracting /path/to/data\train-images-idx3-ubyte.gz
Extracting /path/to/data\train-labels-idx1-ubyte.gz
Extracting /path/to/data\t10k-images-idx3-ubyte.gz
Extracting /path/to/data\t10k-labels-idx1-ubyte.gz
2017-05-28 22:38:58.243494: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:38:58.244680: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:38:58.247390: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:08.248725: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:08.249804: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:08.251307: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:18.253692: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:18.254576: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:18.255448: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:28.257660: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:28.258782: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:28.260428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
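
The listing imports mnist_inference, which is not reproduced in this post. For completeness, here is a minimal sketch of what such a module might contain, assuming the same two-layer fully connected network used in the earlier MNIST posts of this series (the layer size and variable names here are assumptions). The detail that matters for the distributed script is that inference() registers the regularization terms in the 'losses' collection, which build_model() later sums via tf.get_collection('losses'):

import tensorflow as tf

# Network dimensions (assumed values)
INPUT_NODE = 784    # 28x28 MNIST images, flattened
OUTPUT_NODE = 10    # ten digit classes
LAYER1_NODE = 500   # size of the single hidden layer


def get_weight_variable(shape, regularizer):
    # Create a weight variable and register its regularization loss in the
    # 'losses' collection so the training script can add it to the total loss.
    weights = tf.get_variable(
        "weights", shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
    if regularizer is not None:
        tf.add_to_collection('losses', regularizer(weights))
    return weights


def inference(input_tensor, regularizer):
    # Forward pass of a two-layer fully connected network.
    with tf.variable_scope('layer1'):
        weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer)
        biases = tf.get_variable(
            "biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.0))
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases)
    with tf.variable_scope('layer2'):
        weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer)
        biases = tf.get_variable(
            "biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.0))
        layer2 = tf.matmul(layer1, weights) + biases
    return layer2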