深度增强学习--DDPG
ddpg输出的不是行为的概率, 而是具体的行为, 用于连续动作 (continuous action) 的预测

代码实现的gym的pendulum游戏,这个游戏是连续动作的

"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example. View more on my tutorial page: https://morvanzhou.github.io/tutorials/ Using:
tensorflow 1.0
gym 0.8.0
""" import tensorflow as tf
import numpy as np
import gym
import time np.random.seed(1)
tf.set_random_seed(1) ##################### hyper parameters #################### MAX_EPISODES = 200
MAX_EP_STEPS = 200
lr_a = 0.001 # learning rate for actor
lr_c = 0.001 # learning rate for critic
gamma = 0.9 # reward discount
REPLACEMENT = [
dict(name='soft', tau=0.01),
dict(name='hard', rep_iter_a=600, rep_iter_c=500)
][0] # you can try different target replacement strategies
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32 RENDER = True
OUTPUT_GRAPH = True
ENV_NAME = 'Pendulum-v0' ############################### Actor #################################### class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, replacement):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.replacement = replacement
self.t_replace_counter = 0 with tf.variable_scope('Actor'):
# 这个网络用于及时更新参数
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True) ##这个网络不及时更新参数, 用于预测action
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False) self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replace = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, scope, trainable):#根据state预测action的网络
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.3)
init_b = tf.constant_initializer(0.1)
net = tf.layers.dense(s, 30, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l1',
trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
bias_initializer=init_b, name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s}) if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replace)
else:
if self.t_replace_counter % self.replacement['rep_iter_a'] == 0:
self.sess.run(self.hard_replace)
self.t_replace_counter += 1 def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
# ys = policy;
# xs = policy's parameters;
# a_grads = the gradients of the policy to get more Q
# tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) with tf.variable_scope('A_train'):
opt = tf.train.AdamOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))#对eval_net的参数更新 ############################### Critic #################################### class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, replacement, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.replacement = replacement with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = tf.stop_gradient(a) # stop critic update flows to actor
# 这个网络用于及时更新参数
self.q = self._build_net(S, self.a, 'eval_net', trainable=True) # 这个网络不及时更新参数, 用于评价actor
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_#target计算 with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))#计算loss with tf.variable_scope('C_train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)#训练 with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim) if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replacement = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replacement = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)] def _build_net(self, s, a, scope, trainable):#Q网络,计算Q(s,a)
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.1)
init_b = tf.constant_initializer(0.1) with tf.variable_scope('l1'):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q def learn(self, s, a, r, s_):
self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replacement)
else:
if self.t_replace_counter % self.replacement['rep_iter_c'] == 0:
self.sess.run(self.hard_replacement)
self.t_replace_counter += 1 ##################### Memory #################### class Memory(object):
def __init__(self, capacity, dims):
self.capacity = capacity
self.data = np.zeros((capacity, dims))
self.pointer = 0 def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % self.capacity # replace the old memory with new memory
self.data[index, :] = transition
self.pointer += 1 def sample(self, n):
assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
indices = np.random.choice(self.capacity, size=n)
return self.data[indices, :] import pdb; pdb.set_trace()
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1) state_dim = env.observation_space.shape[0]#
action_dim = env.action_space.shape[0]#1 连续动作,一维
action_bound = env.action_space.high#[2] # all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_') sess = tf.Session() # Create actor and critic.
# They are actually connected to each other, details can be seen in tensorboard or in this picture:
actor = Actor(sess, action_dim, action_bound, lr_a, REPLACEMENT)
critic = Critic(sess, state_dim, action_dim, lr_c, gamma, REPLACEMENT, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)# # 将 critic 产出的 dQ/da 加入到 Actor 的 Graph 中去 sess.run(tf.global_variables_initializer()) M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1) if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph) var = 3 # control exploration t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0 for j in range(MAX_EP_STEPS): if RENDER:
env.render() # Add exploration noise
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a) M.store_transition(s, a, r / 10, s_) if M.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
b_M = M.sample(BATCH_SIZE)
b_s = b_M[:, :state_dim]
b_a = b_M[:, state_dim: state_dim + action_dim]
b_r = b_M[:, -state_dim - 1: -state_dim]
b_s_ = b_M[:, -state_dim:] critic.learn(b_s, b_a, b_r, b_s_)
actor.learn(b_s) s = s_
ep_reward += r if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
if ep_reward > -300:
RENDER = True
break print('Running time: ', time.time()-t1)
深度增强学习--DDPG的更多相关文章
- 深度增强学习--DPPO
PPO DPPO介绍 PPO实现 代码DPPO
- 深度增强学习--A3C
A3C 它会创建多个并行的环境, 让多个拥有副结构的 agent 同时在这些并行环境上更新主结构中的参数. 并行中的 agent 们互不干扰, 而主结构的参数更新受到副结构提交更新的不连续性干扰, 所 ...
- 深度增强学习--DQN的变形
DQN的变形 double DQN prioritised replay dueling DQN
- 深度增强学习--Actor Critic
Actor Critic value-based和policy-based的结合 实例代码 import sys import gym import pylab import numpy as np ...
- 深度增强学习--Policy Gradient
前面都是value based的方法,现在看一种直接预测动作的方法 Policy Based Policy Gradient 一个介绍 karpathy的博客 一个推导 下面的例子实现的REINFOR ...
- 深度增强学习--Deep Q Network
从这里开始换个游戏演示,cartpole游戏 Deep Q Network 实例代码 import sys import gym import pylab import random import n ...
- 常用增强学习实验环境 II (ViZDoom, Roboschool, TensorFlow Agents, ELF, Coach等) (转载)
原文链接:http://blog.csdn.net/jinzhuojun/article/details/78508203 前段时间Nature上发表的升级版Alpha Go - AlphaGo Ze ...
- 马里奥AI实现方式探索 ——神经网络+增强学习
[TOC] 马里奥AI实现方式探索 --神经网络+增强学习 儿时我们都曾有过一个经典游戏的体验,就是马里奥(顶蘑菇^v^),这次里约奥运会闭幕式,日本作为2020年东京奥运会的东道主,安倍最后也已经典 ...
- 增强学习 | AlphaGo背后的秘密
"敢于尝试,才有突破" 2017年5月27日,当今世界排名第一的中国棋手柯洁与AlphaGo 2.0的三局对战落败.该事件标志着最新的人工智能技术在围棋竞技领域超越了人类智能,借此 ...
随机推荐
- C#取出字符串中的数字或字母
string str20 = "ABC123"; string strSplit1,strSplit2; //取出字符串中所有的英文字母 strSplit1 = Regex.Rep ...
- 玩转树莓派 - 修改Raspbian软件源加快软件下载速度
这是 meelo 原创的 玩转树莓派 系列文章 步骤1:登录到Raspbian的命令行界面 步骤2:修改Raspbian的软件源 软件源是Linux系统免费的应用程序安装仓库,很多的应用软件都会这收录 ...
- [水煮 ASP.NET Web API2 方法论](1-5)ASP.NET Web API Scaffolding(模板)
问题 我们想快速启动一个 ASP.NET Web API 解决方案. 解决方案 APS.NET 模板一开始就支持 ASP.NET Web API.使用模板往我们的项目中添加 Controller,在我 ...
- C#获取网页信息核心方法(入门一)
目录:信息采集入门系列目录 下面记录的是我自己整理的C#请求页面核心类,主要有如下几个方法 1.HttpWebRequest Get请求获得页面html 2.HttpWebRequest Post请求 ...
- shadownsocks SSR 账号密码注册 可1元体验一天
shadownsocks SSR 账号密码注册 可1元体验一天 注册地址 https://www.cup123.club/register?aff=809
- 最短路-Bellmanford
简介: 给定一个图和一个源点,求源点到其余点的最短路径,图中有可能存在负权边. 算法步骤 1.初始化:将除源点外的所有顶点的最短距离估计值 dist[v] ← +∞, dist[s] ←0; 2.迭代 ...
- Project interpreter not specified(eclipse+pydev) (转)
[小记] 最近因为想配置Android的开发环境,把原来的MyEclipse5.5删了,下载了最新的Eclipse3.7版本,因为之前在进行Python开 发,就下载了最新的Pydev2.4版本,安装 ...
- LOJ #6282. 数列分块入门 6-分块(单点插入、单点查询、数据随机生成)
#6282. 数列分块入门 6 内存限制:256 MiB时间限制:500 ms标准输入输出 题目类型:传统评测方式:文本比较 上传者: hzwer 提交提交记录统计测试数据讨论 1 题目描述 给出 ...
- 26、Flask实战第26天:cms用户模型定义
编辑cms.models.py from exts import db from datetime import datetime class CMSUser(db.Model): __tablena ...
- 更改Xamarin Android App名称
更改Xamarin Android App名称 Xamarin Android生成的App名称默认和项目名一致.修改该名称有两种方式. 第一种方式:右击Android项目,选择“属性”命令,然 ...