Policy Granients

import tensorflow as tf

reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1 learning_rate = 0.01 initializer = tf.variance_scaling_initializer() X = tf.placeholder(tf.float32, shape=[None, n_inputs]) hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1) y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
gradient_placeholders.append(gradient_placeholder)
grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed) init = tf.global_variables_initializer()
saver = tf.train.Saver()
def discount_rewards(rewards, discount_rate):
discounted_rewards = np.zeros(len(rewards))
cumulative_rewards = 0
for step in reversed(range(len(rewards))):
cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
discounted_rewards[step] = cumulative_rewards
return discounted_rewards def discount_and_normalize_rewards(all_rewards, discount_rate):
all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
flat_rewards = np.concatenate(all_discounted_rewards)
reward_mean = flat_rewards.mean()
reward_std = flat_rewards.std()
return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
env = gym.make("CartPole-v0")

n_games_per_update = 10
n_max_steps = 1000
n_iterations = 250
save_iterations = 10
discount_rate = 0.95 with tf.Session() as sess:
init.run()
for iteration in range(n_iterations):
print("\rIteration: {}".format(iteration), end="")
all_rewards = []
all_gradients = []
for game in range(n_games_per_update):
current_rewards = []
current_gradients = []
obs = env.reset()
for step in range(n_max_steps):
action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
obs, reward, done, info = env.step(action_val[0][0])
current_rewards.append(reward)
current_gradients.append(gradients_val)
if done:
break
all_rewards.append(current_rewards)
all_gradients.append(current_gradients) all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
feed_dict = {}
for var_index, gradient_placeholder in enumerate(gradient_placeholders):
mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
for game_index, rewards in enumerate(all_rewards)
for step, reward in enumerate(rewards)], axis=0)
## mean_gradients 是个标量,为一个Variable在10个episode的所有step的梯度的平均值。
feed_dict[gradient_placeholder] = mean_gradients
sess.run(training_op, feed_dict=feed_dict)
if iteration % save_iterations == 0:
saver.save(sess, "./my_policy_net_pg.ckpt")

Markov Decision Process

Learning to Play Ms. Pac-Man Using Deep Q-Learning

这部分首先解决在Windows下安装gym[atari]模块

查阅了很多资料,发现很多解决办法都是使用这个连接:

pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py

这种的方法的出处是OpenAI Gym Atari on Windows

这个gym的github issue中也提到这个方法。

这个How to Install OpenAI Gym in a Windows Environment也是使用这个方法安装atari模块

也可以在下面这个网址上下载whl进行离线安装

https://github.com/Kojoley/atari-py/releases

对于安装whl格式的文件,首先要安装wheel包

pip install wheel

由于anconda中默认安装了wheel包,所以上面的步骤可以省略。

pip install atari_py-0.1.7-cp36-cp36m-win_amd64.whl

绘制智能体和环境交互的动画

# 导入需要的包
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# 将每一帧的画面存到frames中
frames = [] n_max_steps = 1000
n_change_steps = 10 obs = env.reset()
for step in range(n_max_steps):
img = env.render(mode="rgb_array")
frames.append(img)
if step % n_change_steps == 0:
action = env.action_space.sample() # play randomly
obs, reward, done, info = env.step(action)
if done:
break
# 定义绘制动画的函数
def update_scene(num, frames, patch):
patch.set_data(frames[num])
return patch, def plot_animation(frames, repeat=False, interval=40):
plt.close() # or else nbagg sometimes plots in the previous cell
fig = plt.figure()
patch = plt.imshow(frames[0])
plt.axis('off')
return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)
# 调用函数显示动画
video = plot_animation(frames)
plt.show()

DQL算法部分

首先是对obs进行预处理,转换为88x80的灰度图像并增强对比度。

import gym
import matplotlib.pyplot as plt
import numpy as np env = gym.make('MsPacman-v0')
obs = env.reset()
plt.imshow(obs)
plt.show() mspacman_color = np.array([210, 164, 74]).mean() def preprocess_observation(obs):
img = obs[1:176:2,::2] # crop and downsize
img = img.mean(axis=2) # to greyscale
img[img == mspacman_color] = 0 # improve constrast
img = (img - 128)/128 -1 # normalize from -1. to 1.
return img.reshape(88,80,1) pre_obs = preprocess_observation(obs).reshape(88,80)
plt.imshow(pre_obs, cmap ='gray')
plt.show()

DQN算法

# ------------------------------构建DQN的计算图-----------------------------------------
from tensorflow.contrib.layers import convolution2d, fully_connected input_height = 88
input_width = 80
input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10 # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n # 9 discrete actions are available
initializer = tf.variance_scaling_initializer() def q_network(X_state, name):
prev_layer = X_state / 128.0 # scale pixel intensities to the [-1.0, 1.0] range.
with tf.variable_scope(name) as scope:
for n_maps, kernel_size, strides, padding, activation in zip(
conv_n_maps, conv_kernel_sizes, conv_strides,
conv_paddings, conv_activation):
prev_layer = tf.layers.conv2d(
prev_layer, filters=n_maps, kernel_size=kernel_size,
strides=strides, padding=padding, activation=activation,
kernel_initializer=initializer)
last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
activation=hidden_activation,
kernel_initializer=initializer)
outputs = tf.layers.dense(hidden, n_outputs,
kernel_initializer=initializer)
trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
scope=scope.name)
trainable_vars_by_name = {var.name[len(scope.name):]: var
for var in trainable_vars}
return outputs, trainable_vars_by_name # --------------------------------两个DQN之间复制参数的过程----------------------------------
X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
input_channels])
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target") copy_ops = [target_var.assign(online_vars[var_name])
for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops) # ------------------------------构建actor的损失函数和优化器
learning_rate = 0.001
momentum = 0.95 with tf.variable_scope("train"):
X_action = tf.placeholder(tf.int32, shape=[None])
y = tf.placeholder(tf.float32, shape=[None, 1])
q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
axis=1, keepdims=True)
error = tf.abs(y - q_value)
clipped_error = tf.clip_by_value(error, 0.0, 1.0)
linear_error = 2 * (error - clipped_error)
loss = tf.reduce_mean(tf.square(clipped_error) + linear_error) global_step = tf.Variable(0, trainable=False, name='global_step')
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss, global_step=global_step) init = tf.global_variables_initializer()
saver = tf.train.Saver() #----------------------------replay mem的设置代替书中过的deque-----------------------------------
class ReplayMemory:
def __init__(self, maxlen):
self.maxlen = maxlen
self.buf = np.empty(shape=maxlen, dtype=np.object)
self.index = 0
self.length = 0 def append(self, data):
self.buf[self.index] = data
self.length = min(self.length + 1, self.maxlen)
self.index = (self.index + 1) % self.maxlen def sample(self, batch_size, with_replacement=True):
if with_replacement:
indices = np.random.randint(self.length, size=batch_size) # faster
else:
indices = np.random.permutation(self.length)[:batch_size]
return self.buf[indices] replay_memory_size = 500000
replay_memory = ReplayMemory(replay_memory_size) #----------------------------------从replay mem中进行采样--------------------------------------
def sample_memories(batch_size):
cols = [[], [], [], [], []] # state, action, reward, next_state, continue
for memory in replay_memory.sample(batch_size):
for col, value in zip(cols, memory):
col.append(value)
cols = [np.array(col) for col in cols]
return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1) #-------------------------------epsilon贪婪策略-----------------------------------------
eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000 def epsilon_greedy(q_values, step):
epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
if np.random.rand() < epsilon:
return np.random.randint(n_outputs) # random action
else:
return np.argmax(q_values) # optimal action #-------------------------------DQN的训练过程--------------------------------------------
n_steps = 4000000 # total number of training steps
training_start = 10000 # start training after 10,000 game iterations
training_interval = 4 # run a training step every 4 game iterations
save_steps = 1000 # save the model every 1,000 training steps
copy_steps = 10000 # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.99
skip_start = 90 # Skip the start of every game (it's just waiting time).
batch_size = 50
iteration = 0 # game iterations
checkpoint_path = "./my_dqn.ckpt"
done = True # env needs to be reset loss_val = np.infty
game_length = 0
total_max_q = 0
mean_max_q = 0.0 with tf.Session() as sess:
if os.path.isfile(checkpoint_path + ".index"):
saver.restore(sess, checkpoint_path)
else:
init.run()
copy_online_to_target.run()
while True:
step = global_step.eval()
if step >= n_steps:
break
iteration += 1
print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f} ".format(
iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")
if done: # game over, start again
obs = env.reset()
for skip in range(skip_start): # skip the start of each game
obs, reward, done, info = env.step(0)
state = preprocess_observation(obs) # Online DQN evaluates what to do
q_values = online_q_values.eval(feed_dict={X_state: [state]})
action = epsilon_greedy(q_values, step) # Online DQN plays
obs, reward, done, info = env.step(action)
next_state = preprocess_observation(obs) # Let's memorize what happened
replay_memory.append((state, action, reward, next_state, 1.0 - done))
state = next_state # Compute statistics for tracking progress (not shown in the book)
total_max_q += q_values.max()
game_length += 1
if done:
mean_max_q = total_max_q / game_length
total_max_q = 0.0
game_length = 0 if iteration < training_start or iteration % training_interval != 0:
continue # only train after warmup period and at regular intervals # Sample memories and use the target DQN to produce the target Q-Value
X_state_val, X_action_val, rewards, X_next_state_val, continues = (
sample_memories(batch_size))
next_q_values = target_q_values.eval(
feed_dict={X_state: X_next_state_val})
max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
y_val = rewards + continues * discount_rate * max_next_q_values # Train the online DQN
_, loss_val = sess.run([training_op, loss], feed_dict={
X_state: X_state_val, X_action: X_action_val, y: y_val}) # Regularly copy the online DQN to the target DQN
if step % copy_steps == 0:
copy_online_to_target.run() # And save regularly
if step % save_steps == 0:
saver.save(sess, checkpoint_path)

Hands-on ML and TF Chapter16 Reinforcement Learning的更多相关文章

  1. (转) Deep Learning Research Review Week 2: Reinforcement Learning

      Deep Learning Research Review Week 2: Reinforcement Learning 转载自: https://adeshpande3.github.io/ad ...

  2. [转]Deep Reinforcement Learning Based Trading Application at JP Morgan Chase

    Deep Reinforcement Learning Based Trading Application at JP Morgan Chase https://medium.com/@ranko.m ...

  3. repost: Deep Reinforcement Learning

    From: http://wanghaitao8118.blog.163.com/blog/static/13986977220153811210319/ accessed 2016-03-10 深度 ...

  4. Reinforcement Learning,微信公众号:DRL学习

    欢迎大家关注微信公众号:DRL学习,我们一起来学习强化学习和深度强化学习的算法及现状应用问题. 强化学习简单说就是学习如何最大化未来奖励的预期总和,以及agent学会在环境中做出的行动序列,其中随机状 ...

  5. Statistics and Samples in Distributional Reinforcement Learning

    郑重声明:原文参见标题,如有侵权,请联系作者,将会撤销发布! arXiv:1902.08102v1 [stat.ML] 21 Feb 2019 Abstract 我们通过递归估计回报分布的统计量,提供 ...

  6. Machine Learning Algorithms Study Notes(5)—Reinforcement Learning

    Reinforcement Learning 对于控制决策问题的解决思路:设计一个回报函数(reward function),如果learning agent(如上面的四足机器人.象棋AI程序)在决定 ...

  7. (转) Playing FPS games with deep reinforcement learning

    Playing FPS games with deep reinforcement learning 博文转自:https://blog.acolyer.org/2016/11/23/playing- ...

  8. (zhuan) Deep Reinforcement Learning Papers

    Deep Reinforcement Learning Papers A list of recent papers regarding deep reinforcement learning. Th ...

  9. Learning Roadmap of Deep Reinforcement Learning

    1. 知乎上关于DQN入门的系列文章 1.1 DQN 从入门到放弃 DQN 从入门到放弃1 DQN与增强学习 DQN 从入门到放弃2 增强学习与MDP DQN 从入门到放弃3 价值函数与Bellman ...

随机推荐

  1. vue中watch深度监听

    监听基本类型的都是浅度监听 watch的深度监听,监听复杂类型都是深度监听(funciton ,arrat ,object) // 监听对象 data(){ return { a:{ b:, c: } ...

  2. vue2.0+按需引入element-ui报错

    项目使用vue脚手架自动生成的,vue版本为^2.5.16.项目中需要按需使用element-ui,根据element-ui的官方文档,一开始在babel.config.js文件中修改配置 modul ...

  3. mysql sleep 死锁例子

    表结构 CREATE TABLE `orders` ( `order_id` int(11) NOT NULL, `order_addr` varchar(255) DEFAULT NULL ) EN ...

  4. Oracle【账户管理】

    Oracle学习大致体系oracle管理系统介绍(客户端和服务器端的交互模式)oracle数据库的数据管理(增删改查 查询)oracle账户管理oracle二维表管理   --创建表   --维护表  ...

  5. C++第二次作业--函数

    1.为什么要用函数 创建 C++ 函数时,会定义函数做什么,然后通过调用函数来完成已定义的任务.通过函数我们可以实现代码复用,即可以重复使用和在各种适用情况下使用,函数的存在增强了程序的可读性.并且函 ...

  6. 小程序UI设计(8)-布局分解-FlexBox的align-content应用

    FlexBox的align-content到底是什么鬼,我也搞了好半天才开发出来,目前工具中WViewRow组件使用了此属性,WViewColumn中此属性不起作用.下图是justify-conten ...

  7. C++虚函数作用原理(一)——虚函数如何在C++语言逻辑中存在

    C++多态,接触其实也没太长的时间.上课的时候老师总是不停的讲,多态可以实现利用一个基类对象调用不同继承类的成员函数.我就会觉得很伤脑筋,这个的原理到底是什么?是什么呢? 开始的时候我觉得自己应该能够 ...

  8. Pursuit For Artifacts CodeForces - 652E (Tarjan+dfs)

    Pursuit For Artifacts CodeForces - 652E Johnny is playing a well-known computer game. The game are i ...

  9. 【墨西哥区域赛】Carpet

    原题: 题意: 给你一个树,有1e5个节点,让你把这个树放在一个长1e6宽20的网格图里,要求一个格子放一个节点,树边之间不能相交 这是一道构造题 因为树的形状可能性很多,很复杂,所以不能简单猜测,而 ...

  10. python学习笔记(三)条件判断和循环

    1.条件判断语句 Python中条件选择语句的关键字为:if .elif .else这三个.其基本形式如下: 1 2 3 4 5 6 7 8 9 age_of_cc = 27   age = int( ...