DDPG DDPG介绍2

DDPG 输出的不是各个行为的概率, 而是一个具体的行为, 因此适用于连续动作 (continuous action) 的预测

公式推导 推导

代码实现的是 gym 的 Pendulum 游戏, 这个游戏的动作是连续的

pendulum环境介绍

代码实践

"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example. View more on my tutorial page: https://morvanzhou.github.io/tutorials/ Using:
tensorflow 1.0
gym 0.8.0
""" import tensorflow as tf
import numpy as np
import gym
import time np.random.seed(1)
tf.set_random_seed(1) ##################### hyper parameters #################### MAX_EPISODES = 200
MAX_EP_STEPS = 200
lr_a = 0.001 # learning rate for actor
lr_c = 0.001 # learning rate for critic
gamma = 0.9 # reward discount
REPLACEMENT = [
dict(name='soft', tau=0.01),
dict(name='hard', rep_iter_a=600, rep_iter_c=500)
][0] # you can try different target replacement strategies
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32 RENDER = True
OUTPUT_GRAPH = True
ENV_NAME = 'Pendulum-v0' ############################### Actor #################################### class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, replacement):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.replacement = replacement
self.t_replace_counter = 0 with tf.variable_scope('Actor'):
# 这个网络用于及时更新参数
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True) ##这个网络不及时更新参数, 用于预测action
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False) self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replace = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, scope, trainable):#根据state预测action的网络
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.3)
init_b = tf.constant_initializer(0.1)
net = tf.layers.dense(s, 30, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l1',
trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
bias_initializer=init_b, name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s}) if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replace)
else:
if self.t_replace_counter % self.replacement['rep_iter_a'] == 0:
self.sess.run(self.hard_replace)
self.t_replace_counter += 1 def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
# ys = policy;
# xs = policy's parameters;
# a_grads = the gradients of the policy to get more Q
# tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) with tf.variable_scope('A_train'):
opt = tf.train.AdamOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))#对eval_net的参数更新 ############################### Critic #################################### class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, replacement, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.replacement = replacement with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = tf.stop_gradient(a) # stop critic update flows to actor
# 这个网络用于及时更新参数
self.q = self._build_net(S, self.a, 'eval_net', trainable=True) # 这个网络不及时更新参数, 用于评价actor
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_#target计算 with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))#计算loss with tf.variable_scope('C_train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)#训练 with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim) if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replacement = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replacement = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, a, scope, trainable):#Q网络,计算Q(s,a)
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.1)
init_b = tf.constant_initializer(0.1) with tf.variable_scope('l1'):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q def learn(self, s, a, r, s_):
self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replacement)
else:
if self.t_replace_counter % self.replacement['rep_iter_c'] == 0:
self.sess.run(self.hard_replacement)
self.t_replace_counter += 1 ##################### Memory #################### class Memory(object):
def __init__(self, capacity, dims):
self.capacity = capacity
self.data = np.zeros((capacity, dims))
self.pointer = 0 def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % self.capacity # replace the old memory with new memory
self.data[index, :] = transition
self.pointer += 1 def sample(self, n):
assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
indices = np.random.choice(self.capacity, size=n)
return self.data[indices, :] import pdb; pdb.set_trace()
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1) state_dim = env.observation_space.shape[0]#
action_dim = env.action_space.shape[0]#1 连续动作,一维
action_bound = env.action_space.high#[2] # all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_') sess = tf.Session() # Create actor and critic.
# They are actually connected to each other, details can be seen in tensorboard or in this picture:
actor = Actor(sess, action_dim, action_bound, lr_a, REPLACEMENT)
critic = Critic(sess, state_dim, action_dim, lr_c, gamma, REPLACEMENT, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)# # 将 critic 产出的 dQ/da 加入到 Actor 的 Graph 中去 sess.run(tf.global_variables_initializer()) M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1) if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph) var = 3 # control exploration t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0 for j in range(MAX_EP_STEPS): if RENDER:
env.render() # Add exploration noise
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a) M.store_transition(s, a, r / 10, s_) if M.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
b_M = M.sample(BATCH_SIZE)
b_s = b_M[:, :state_dim]
b_a = b_M[:, state_dim: state_dim + action_dim]
b_r = b_M[:, -state_dim - 1: -state_dim]
b_s_ = b_M[:, -state_dim:] critic.learn(b_s, b_a, b_r, b_s_)
actor.learn(b_s) s = s_
ep_reward += r if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
if ep_reward > -300:
RENDER = True
break print('Running time: ', time.time()-t1)

深度增强学习--DDPG的更多相关文章

  1. 深度增强学习--DPPO

    PPO DPPO介绍 PPO实现 代码DPPO

  2. 深度增强学习--A3C

    A3C 它会创建多个并行的环境, 让多个拥有副结构的 agent 同时在这些并行环境上更新主结构中的参数. 并行中的 agent 们互不干扰, 而主结构的参数更新受到副结构提交更新的不连续性干扰, 所 ...

  3. 深度增强学习--DQN的变形

    DQN的变形 double DQN prioritised replay dueling DQN

  4. 深度增强学习--Actor Critic

    Actor Critic value-based和policy-based的结合 实例代码 import sys import gym import pylab import numpy as np ...

  5. 深度增强学习--Policy Gradient

    前面都是value based的方法,现在看一种直接预测动作的方法 Policy Based Policy Gradient 一个介绍 karpathy的博客 一个推导 下面的例子实现的REINFOR ...

  6. 深度增强学习--Deep Q Network

    从这里开始换个游戏演示,cartpole游戏 Deep Q Network 实例代码 import sys import gym import pylab import random import n ...

  7. 常用增强学习实验环境 II (ViZDoom, Roboschool, TensorFlow Agents, ELF, Coach等) (转载)

    原文链接:http://blog.csdn.net/jinzhuojun/article/details/78508203 前段时间Nature上发表的升级版Alpha Go - AlphaGo Ze ...

  8. 马里奥AI实现方式探索 ——神经网络+增强学习

    [TOC] 马里奥AI实现方式探索 --神经网络+增强学习 儿时我们都曾有过一个经典游戏的体验,就是马里奥(顶蘑菇^v^),这次里约奥运会闭幕式,日本作为2020年东京奥运会的东道主,安倍最后也已经典 ...

  9. 增强学习 | AlphaGo背后的秘密

    "敢于尝试,才有突破" 2017年5月27日,当今世界排名第一的中国棋手柯洁与AlphaGo 2.0的三局对战落败.该事件标志着最新的人工智能技术在围棋竞技领域超越了人类智能,借此 ...

随机推荐

  1. C#取出字符串中的数字或字母

    string str20 = "ABC123"; string strSplit1,strSplit2; //取出字符串中所有的英文字母 strSplit1 = Regex.Rep ...

  2. 玩转树莓派 - 修改Raspbian软件源加快软件下载速度

    这是 meelo 原创的 玩转树莓派 系列文章 步骤1:登录到Raspbian的命令行界面 步骤2:修改Raspbian的软件源 软件源是Linux系统免费的应用程序安装仓库,很多的应用软件都会这收录 ...

  3. [水煮 ASP.NET Web API2 方法论](1-5)ASP.NET Web API Scaffolding(模板)

    问题 我们想快速启动一个 ASP.NET Web API 解决方案. 解决方案 APS.NET 模板一开始就支持 ASP.NET Web API.使用模板往我们的项目中添加 Controller,在我 ...

  4. C#获取网页信息核心方法(入门一)

    目录:信息采集入门系列目录 下面记录的是我自己整理的C#请求页面核心类,主要有如下几个方法 1.HttpWebRequest Get请求获得页面html 2.HttpWebRequest Post请求 ...

  5. shadownsocks SSR 账号密码注册 可1元体验一天

    shadownsocks SSR 账号密码注册 可1元体验一天 注册地址 https://www.cup123.club/register?aff=809

  6. 最短路-Bellmanford

    简介: 给定一个图和一个源点,求源点到其余点的最短路径,图中有可能存在负权边. 算法步骤 1.初始化:将除源点外的所有顶点的最短距离估计值 dist[v] ← +∞, dist[s] ←0; 2.迭代 ...

  7. Project interpreter not specified(eclipse+pydev) (转)

    [小记] 最近因为想配置Android的开发环境,把原来的MyEclipse5.5删了,下载了最新的Eclipse3.7版本,因为之前在进行Python开 发,就下载了最新的Pydev2.4版本,安装 ...

  8. LOJ #6282. 数列分块入门 6-分块(单点插入、单点查询、数据随机生成)

    #6282. 数列分块入门 6 内存限制:256 MiB时间限制:500 ms标准输入输出 题目类型:传统评测方式:文本比较 上传者: hzwer 提交提交记录统计测试数据讨论 1   题目描述 给出 ...

  9. 26、Flask实战第26天:cms用户模型定义

    编辑cms.models.py from exts import db from datetime import datetime class CMSUser(db.Model): __tablena ...

  10. 更改Xamarin Android App名称

    更改Xamarin Android App名称   Xamarin Android生成的App名称默认和项目名一致.修改该名称有两种方式.   第一种方式:右击Android项目,选择“属性”命令,然 ...