Hands-on ML and TF Chapter 16 Reinforcement Learning
Policy Gradients
import gym
import numpy as np
import tensorflow as tf

def reset_graph(seed=42):
    # notebook helper: clear the default graph and fix the random seeds for reproducibility
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

reset_graph()
n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01
initializer = tf.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards
def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards]
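A quick sanity check of these two helpers (the expected outputs are easy to verify by hand): with a discount rate of 0.8, the rewards [10, 0, -50] are discounted back to [-22, -40, -50], and normalization is then applied across all episodes jointly.
print(discount_rewards([10, 0, -50], discount_rate=0.8))
# [-22. -40. -50.]
print(discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8))
# two arrays with zero mean and unit standard deviation computed over both episodes together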
env = gym.make("CartPole-v0")
n_games_per_update = 10
n_max_steps = 1000
n_iterations = 250
save_iterations = 10
discount_rate = 0.95
with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print("\rIteration: {}".format(iteration), end="")
        all_rewards = []
        all_gradients = []
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)
        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(all_rewards)
                                      for step, reward in enumerate(rewards)], axis=0)
            ## mean_gradients has the same shape as the corresponding variable: it is the
            ## reward-weighted average of that variable's gradients over every step of the
            ## 10 episodes played in this update.
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "./my_policy_net_pg.ckpt")
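To watch the trained policy, a short sketch like the one below (an illustrative addition, not part of the training code above) restores the checkpoint and keeps sampling actions from the network until the episode ends:
with tf.Session() as sess:
    saver.restore(sess, "./my_policy_net_pg.ckpt")
    obs = env.reset()
    total_reward = 0
    for step in range(n_max_steps):
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})  # sample an action from the policy
        obs, reward, done, info = env.step(action_val[0][0])
        total_reward += reward
        if done:
            break
    print("Episode reward:", total_reward)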
Markov Decision Process
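A minimal Q-Value Iteration sketch for a small three-state MDP; the transition probabilities and rewards below are illustrative assumptions in the spirit of the book's example, not code copied from the book.
import numpy as np

nan = np.nan  # marks impossible actions
T = np.array([  # T[s, a, s']: transition probabilities
    [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
    [[0.0, 1.0, 0.0], [nan, nan, nan], [0.0, 0.0, 1.0]],
    [[nan, nan, nan], [0.8, 0.1, 0.1], [nan, nan, nan]],
])
R = np.array([  # R[s, a, s']: rewards
    [[10., 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
    [[0.0, 0.0, 0.0], [nan, nan, nan], [0.0, 0.0, -50.]],
    [[nan, nan, nan], [40., 0.0, 0.0], [nan, nan, nan]],
])
possible_actions = [[0, 1, 2], [0, 2], [1]]

Q = np.full((3, 3), -np.inf)            # -inf for impossible actions
for s, actions in enumerate(possible_actions):
    Q[s, actions] = 0.0                 # start the possible actions at 0

discount_rate = 0.95
for iteration in range(100):            # Q-Value Iteration
    Q_prev = Q.copy()
    for s in range(3):
        for a in possible_actions[s]:
            Q[s, a] = np.sum([T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp]))
                              for sp in range(3)])

print(Q)                                # estimated optimal Q-Values
print(np.argmax(Q, axis=1))             # optimal action in each state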
Learning to Play Ms. Pac-Man Using Deep Q-Learning
This section first deals with installing the gym[atari] module on Windows.
After looking through quite a few resources, I found that most solutions boil down to this command:
pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py
This approach comes from OpenAI Gym Atari on Windows.
It is also mentioned in this gym GitHub issue.
The post How to Install OpenAI Gym in a Windows Environment installs the Atari module the same way.
Alternatively, you can download a .whl file from the following page and install it offline:
https://github.com/Kojoley/atari-py/releases
To install a .whl file you first need the wheel package:
pip install wheel
Since Anaconda installs the wheel package by default, this step can usually be skipped.
pip install atari_py-0.1.7-cp36-cp36m-win_amd64.whl
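A quick way to confirm the installation worked (my own check, not from the references above): list the bundled Atari ROMs and build the Ms. Pac-Man environment.
import atari_py
import gym

print(len(atari_py.list_games()), "Atari ROMs found")   # should be non-zero
env = gym.make("MsPacman-v0")
print(env.action_space)                                  # Discrete(9)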
Animating the interaction between the agent and the environment
# import the required packages
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# store each rendered frame in frames
frames = []
n_max_steps = 1000
n_change_steps = 10
obs = env.reset()
for step in range(n_max_steps):
    img = env.render(mode="rgb_array")
    frames.append(img)
    if step % n_change_steps == 0:
        action = env.action_space.sample() # play randomly
    obs, reward, done, info = env.step(action)
    if done:
        break
# define the helper functions that draw the animation
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    plt.close() # or else nbagg sometimes plots in the previous cell
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)
# call the function to display the animation
video = plot_animation(frames)
plt.show()
The Deep Q-Learning algorithm
The first step is to preprocess the observation: crop and downsample it to an 88x80 grayscale image, improve the contrast, and shift the pixel values into the [-128, 127] range (the Q-network below rescales them to [-1, 1]).
import os  # needed later to check for an existing checkpoint
import gym
import matplotlib.pyplot as plt
import numpy as np
env = gym.make('MsPacman-v0')
obs = env.reset()
plt.imshow(obs)
plt.show()
mspacman_color = np.array([210, 164, 74]).mean()
def preprocess_observation(obs):
    img = obs[1:176:2, ::2] # crop and downsize
    img = img.mean(axis=2) # to greyscale
    img[img == mspacman_color] = 0 # improve contrast
    img = (img - 128).astype(np.int8) # shift to the [-128, 127] range (the Q-network rescales to [-1, 1])
    return img.reshape(88, 80, 1)
pre_obs = preprocess_observation(obs).reshape(88,80)
plt.imshow(pre_obs, cmap ='gray')
plt.show()
The DQN algorithm
# ------------------------------ Build the DQN computation graph -----------------------------------
from tensorflow.contrib.layers import convolution2d, fully_connected
input_height = 88
input_width = 80
input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10 # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n # 9 discrete actions are available
initializer = tf.variance_scaling_initializer()
def q_network(X_state, name):
    prev_layer = X_state / 128.0 # scale pixel intensities to the [-1.0, 1.0] range.
    with tf.variable_scope(name) as scope:
        for n_maps, kernel_size, strides, padding, activation in zip(
                conv_n_maps, conv_kernel_sizes, conv_strides,
                conv_paddings, conv_activation):
            prev_layer = tf.layers.conv2d(
                prev_layer, filters=n_maps, kernel_size=kernel_size,
                strides=strides, padding=padding, activation=activation,
                kernel_initializer=initializer)
        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
        hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
                                 activation=hidden_activation,
                                 kernel_initializer=initializer)
        outputs = tf.layers.dense(hidden, n_outputs,
                                  kernel_initializer=initializer)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return outputs, trainable_vars_by_name
# -------------------------------- Copying parameters from the online DQN to the target DQN ----------------------------------
X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
                                            input_channels])
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")
copy_ops = [target_var.assign(online_vars[var_name])
            for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops)
# ------------------------------ Build the loss function and optimizer for the online DQN -----------------------------------
learning_rate = 0.001
momentum = 0.95
with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keepdims=True)
    error = tf.abs(y - q_value)
    clipped_error = tf.clip_by_value(error, 0.0, 1.0)
    linear_error = 2 * (error - clipped_error)
    # Huber-like loss: quadratic for errors below 1.0, linear above, which keeps gradients bounded
    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)
    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
#---------------------------- ReplayMemory class, used instead of the deque from the book (faster sampling) -----------------------------------
class ReplayMemory:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.buf = np.empty(shape=maxlen, dtype=np.object)
        self.index = 0
        self.length = 0

    def append(self, data):
        self.buf[self.index] = data
        self.length = min(self.length + 1, self.maxlen)
        self.index = (self.index + 1) % self.maxlen

    def sample(self, batch_size, with_replacement=True):
        if with_replacement:
            indices = np.random.randint(self.length, size=batch_size) # faster
        else:
            indices = np.random.permutation(self.length)[:batch_size]
        return self.buf[indices]
replay_memory_size = 500000
replay_memory = ReplayMemory(replay_memory_size)
#---------------------------------- Sampling batches from the replay memory --------------------------------------
def sample_memories(batch_size):
    cols = [[], [], [], [], []] # state, action, reward, next_state, continue
    for memory in replay_memory.sample(batch_size):
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)
#------------------------------- The epsilon-greedy exploration policy -----------------------------------------
eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000
def epsilon_greedy(q_values, step):
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs) # random action
    else:
        return np.argmax(q_values) # optimal action
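A quick check of the exploration schedule (my own illustration): epsilon decays linearly from 1.0 down to 0.1 over the first 2,000,000 training steps and then stays at 0.1.
for step in (0, 500000, 1000000, 2000000, 3000000):
    print(step, max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps))
# prints 1.0, 0.775, 0.55, 0.1, 0.1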
#------------------------------- The DQN training loop --------------------------------------------
n_steps = 4000000 # total number of training steps
training_start = 10000 # start training after 10,000 game iterations
training_interval = 4 # run a training step every 4 game iterations
save_steps = 1000 # save the model every 1,000 training steps
copy_steps = 10000 # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.99
skip_start = 90 # Skip the start of every game (it's just waiting time).
batch_size = 50
iteration = 0 # game iterations
checkpoint_path = "./my_dqn.ckpt"
done = True # env needs to be reset
loss_val = np.infty
game_length = 0
total_max_q = 0
mean_max_q = 0.0
with tf.Session() as sess:
    if os.path.isfile(checkpoint_path + ".index"):
        saver.restore(sess, checkpoint_path)
    else:
        init.run()
        copy_online_to_target.run()
    while True:
        step = global_step.eval()
        if step >= n_steps:
            break
        iteration += 1
        print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f} ".format(
            iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")
        if done: # game over, start again
            obs = env.reset()
            for skip in range(skip_start): # skip the start of each game
                obs, reward, done, info = env.step(0)
            state = preprocess_observation(obs)

        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        next_state = preprocess_observation(obs)

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        # Compute statistics for tracking progress (not shown in the book)
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            total_max_q = 0.0
            game_length = 0

        if iteration < training_start or iteration % training_interval != 0:
            continue # only train after warmup period and at regular intervals

        # Sample memories and use the target DQN to produce the target Q-Value
        X_state_val, X_action_val, rewards, X_next_state_val, continues = (
            sample_memories(batch_size))
        next_q_values = target_q_values.eval(
            feed_dict={X_state: X_next_state_val})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        y_val = rewards + continues * discount_rate * max_next_q_values

        # Train the online DQN
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val, X_action: X_action_val, y: y_val})

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            copy_online_to_target.run()

        # And save regularly
        if step % save_steps == 0:
            saver.save(sess, checkpoint_path)
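Once some training has been done, a minimal evaluation sketch (an illustrative addition, not from the training code above) is to load the checkpoint and let the online DQN play one episode greedily:
with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        state = preprocess_observation(obs)
        q_values = online_q_values.eval(feed_dict={X_state: [state]})  # Q-Values for the current state
        obs, reward, done, info = env.step(np.argmax(q_values))        # act greedily
        total_reward += reward
    print("Episode reward:", total_reward)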