DDPG DDPG介绍2

ddpg输出的不是行为的概率, 而是具体的行为, 用于连续动作 (continuous action) 的预测

公式推导 推导

代码实现的gym的pendulum游戏,这个游戏是连续动作的

pendulum环境介绍

代码实践

"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example. View more on my tutorial page: https://morvanzhou.github.io/tutorials/ Using:
tensorflow 1.0
gym 0.8.0
""" import tensorflow as tf
import numpy as np
import gym
import time np.random.seed(1)
tf.set_random_seed(1) ##################### hyper parameters #################### MAX_EPISODES = 200
MAX_EP_STEPS = 200
lr_a = 0.001 # learning rate for actor
lr_c = 0.001 # learning rate for critic
gamma = 0.9 # reward discount
REPLACEMENT = [
dict(name='soft', tau=0.01),
dict(name='hard', rep_iter_a=600, rep_iter_c=500)
][0] # you can try different target replacement strategies
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32 RENDER = True
OUTPUT_GRAPH = True
ENV_NAME = 'Pendulum-v0' ############################### Actor #################################### class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, replacement):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.replacement = replacement
self.t_replace_counter = 0 with tf.variable_scope('Actor'):
# 这个网络用于及时更新参数
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True) ##这个网络不及时更新参数, 用于预测action
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False) self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replace = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, scope, trainable):#根据state预测action的网络
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.3)
init_b = tf.constant_initializer(0.1)
net = tf.layers.dense(s, 30, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l1',
trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
bias_initializer=init_b, name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s}) if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replace)
else:
if self.t_replace_counter % self.replacement['rep_iter_a'] == 0:
self.sess.run(self.hard_replace)
self.t_replace_counter += 1 def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
# ys = policy;
# xs = policy's parameters;
# a_grads = the gradients of the policy to get more Q
# tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) with tf.variable_scope('A_train'):
opt = tf.train.AdamOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))#对eval_net的参数更新 ############################### Critic #################################### class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, replacement, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.replacement = replacement with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = tf.stop_gradient(a) # stop critic update flows to actor
# 这个网络用于及时更新参数
self.q = self._build_net(S, self.a, 'eval_net', trainable=True) # 这个网络不及时更新参数, 用于评价actor
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_#target计算 with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))#计算loss with tf.variable_scope('C_train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)#训练 with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim) if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replacement = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replacement = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, a, scope, trainable):#Q网络,计算Q(s,a)
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.1)
init_b = tf.constant_initializer(0.1) with tf.variable_scope('l1'):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q def learn(self, s, a, r, s_):
self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replacement)
else:
if self.t_replace_counter % self.replacement['rep_iter_c'] == 0:
self.sess.run(self.hard_replacement)
self.t_replace_counter += 1 ##################### Memory #################### class Memory(object):
def __init__(self, capacity, dims):
self.capacity = capacity
self.data = np.zeros((capacity, dims))
self.pointer = 0 def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % self.capacity # replace the old memory with new memory
self.data[index, :] = transition
self.pointer += 1 def sample(self, n):
assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
indices = np.random.choice(self.capacity, size=n)
return self.data[indices, :] import pdb; pdb.set_trace()
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1) state_dim = env.observation_space.shape[0]#
action_dim = env.action_space.shape[0]#1 连续动作,一维
action_bound = env.action_space.high#[2] # all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_') sess = tf.Session() # Create actor and critic.
# They are actually connected to each other, details can be seen in tensorboard or in this picture:
actor = Actor(sess, action_dim, action_bound, lr_a, REPLACEMENT)
critic = Critic(sess, state_dim, action_dim, lr_c, gamma, REPLACEMENT, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)# # 将 critic 产出的 dQ/da 加入到 Actor 的 Graph 中去 sess.run(tf.global_variables_initializer()) M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1) if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph) var = 3 # control exploration t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0 for j in range(MAX_EP_STEPS): if RENDER:
env.render() # Add exploration noise
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a) M.store_transition(s, a, r / 10, s_) if M.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
b_M = M.sample(BATCH_SIZE)
b_s = b_M[:, :state_dim]
b_a = b_M[:, state_dim: state_dim + action_dim]
b_r = b_M[:, -state_dim - 1: -state_dim]
b_s_ = b_M[:, -state_dim:] critic.learn(b_s, b_a, b_r, b_s_)
actor.learn(b_s) s = s_
ep_reward += r if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
if ep_reward > -300:
RENDER = True
break print('Running time: ', time.time()-t1)

深度增强学习--DDPG的更多相关文章

  1. 深度增强学习--DPPO

    PPO DPPO介绍 PPO实现 代码DPPO

  2. 深度增强学习--A3C

    A3C 它会创建多个并行的环境, 让多个拥有副结构的 agent 同时在这些并行环境上更新主结构中的参数. 并行中的 agent 们互不干扰, 而主结构的参数更新受到副结构提交更新的不连续性干扰, 所 ...

  3. 深度增强学习--DQN的变形

    DQN的变形 double DQN prioritised replay dueling DQN

  4. 深度增强学习--Actor Critic

    Actor Critic value-based和policy-based的结合 实例代码 import sys import gym import pylab import numpy as np ...

  5. 深度增强学习--Policy Gradient

    前面都是value based的方法,现在看一种直接预测动作的方法 Policy Based Policy Gradient 一个介绍 karpathy的博客 一个推导 下面的例子实现的REINFOR ...

  6. 深度增强学习--Deep Q Network

    从这里开始换个游戏演示,cartpole游戏 Deep Q Network 实例代码 import sys import gym import pylab import random import n ...

  7. 常用增强学习实验环境 II (ViZDoom, Roboschool, TensorFlow Agents, ELF, Coach等) (转载)

    原文链接:http://blog.csdn.net/jinzhuojun/article/details/78508203 前段时间Nature上发表的升级版Alpha Go - AlphaGo Ze ...

  8. 马里奥AI实现方式探索 ——神经网络+增强学习

    [TOC] 马里奥AI实现方式探索 --神经网络+增强学习 儿时我们都曾有过一个经典游戏的体验,就是马里奥(顶蘑菇^v^),这次里约奥运会闭幕式,日本作为2020年东京奥运会的东道主,安倍最后也已经典 ...

  9. 增强学习 | AlphaGo背后的秘密

    "敢于尝试,才有突破" 2017年5月27日,当今世界排名第一的中国棋手柯洁与AlphaGo 2.0的三局对战落败.该事件标志着最新的人工智能技术在围棋竞技领域超越了人类智能,借此 ...

随机推荐

  1. python和语法糖

       语法糖(syntactic sugar)是指编程语言中可以更容易的表达一个操作的语法,它可以使程序员更加容易去使用这门语言:操作可以变得更加清晰.方便,或者更加符合程序员的编程习惯.(百度百科的 ...

  2. SnagIt截图后无法在编辑器打开,不显示截图内容的解决办法(转)

    方法1: 用SnagIt截图后,弹出的编辑器里不显示刚才截图的内容,解决办法如下: 完全退出Snagit和编辑器,删除以下文件夹: Win7用户 C:\Users\Administrator\AppD ...

  3. 用webpy实现12306余票查询

    效果

  4. try catch finally 执行顺序面试题总结

    在网上看到一些异常处理的面试题,试着总结一下,先看下面代码,把这个方法在main中进行调用打印返回结果,看看结果输出什么. public static int testBasic(){ int i = ...

  5. spark启动问题,发现任务都是在localhost下面运行的,原来启动spark-shell的时候需要带主节点的参数

    在Spark 集群上运行一个应用,只需通过master的 spark://IP:PORT 链接传递到SparkContext构造器 在集群上运行交互式的Spark 命令, 运行如下命令: MASTER ...

  6. CodeVS1169 传纸条 [DP补完计划]

    题目传送门 题目描述 Description 小渊和小轩是好朋友也是同班同学,他们在一起总有谈不完的话题.一次素质拓展活动中,班上同学安排做成一个m行n列的矩阵,而小渊和小轩被安排在矩阵对角线的两端, ...

  7. 设计模式-命令模式(Command Pattern)

    本文由@呆代待殆原创,转载请注明出处:http://www.cnblogs.com/coffeeSS/ 命令模式简述 命令模式的主要作用是将“行为请求者”和“行为实现者”解耦.举个例子,假如我们现在要 ...

  8. 谜题54:Null与Void

    下面仍然是经典的Hello World程序的另一个变种.那么,这个变种将打印什么呢? public class Null { public static void greet() { System.o ...

  9. 子查询在DELETE语句中的应用

    子查询在DELETE 中唯一可以应用的位置就是WHERE 子句,使用子查询可以完成复杂的数据删除控制.其使用方式与SELECT 语句中的子查询基本相同,而且也可以使用相关子查询等高级的特性.下面的SQ ...

  10. grunt-contrib-qunit安装过程中phantomjs安装报错问题解决

    今天自己fork了一个github上别人写的一个关于grunt项目的一个小demo(https://github.com/cowboy/jquery-tiny-pubsub),主要是想学习下grunt ...