The previous posts were all about value-based methods; now we look at a policy-based approach, which predicts the action directly.

Policy Gradient

An introduction

Karpathy's blog post

A derivation

The example below implements the REINFORCE algorithm.
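For reference, this is the update the code performs (the standard Monte-Carlo policy gradient formulation; the notation here is mine, not taken from the linked posts). The policy network \pi_\theta(a|s) outputs action probabilities, and after each full episode the parameters are moved by

    \theta \leftarrow \theta + \alpha \sum_{t=0}^{T-1} G_t \, \nabla_\theta \log \pi_\theta(a_t \mid s_t), \qquad G_t = \sum_{k=t}^{T-1} \gamma^{\,k-t} r_k

i.e. actions that led to a high discounted return G_t are made more probable. In the code, discount_rewards computes G_t (and then normalizes it), while the categorical-crossentropy trick described in build_model supplies the \log \pi_\theta(a_t|s_t) term.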

Example code

import sys
import gym
import pylab
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

EPISODES = 1000

# REINFORCE, one kind of policy gradient method
# This is Policy Gradient agent for the Cartpole
# In this example, we use REINFORCE algorithm which uses monte-carlo update rule
class REINFORCEAgent:
    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = True
        self.load_model = False
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the Policy Gradient
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.hidden1, self.hidden2 = 24, 24

        # create model for policy network
        self.model = self.build_model()

        # lists for the states, actions and rewards
        self.states, self.actions, self.rewards = [], [], []

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_reinforce.h5")

    # approximate policy using Neural Network
    # state is input and probability of each action is output of network
    def build_model(self):
        model = Sequential()
        model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
        model.summary()
        # Using categorical crossentropy as a loss is a trick to easily
        # implement the policy gradient. Categorical cross entropy is defined
        # H(p, q) = -sum(p_i * log(q_i)). For the action taken, a, you set
        # p_a = advantage. q_a is the output of the policy network, which is
        # the probability of taking the action a, i.e. policy(s, a).
        # All other p_i are zero, thus we have H(p, q) = -A * log(policy(s, a)),
        # so minimizing this loss maximizes A * log(policy(s, a)).
        model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate))
        return model

    # using the output of policy network, pick action stochastically
    def get_action(self, state):
        policy = self.model.predict(state, batch_size=1).flatten()
        # sample an action according to the predicted probabilities
        return np.random.choice(self.action_size, 1, p=policy)[0]

    # In Policy Gradient, Q function is not available.
    # Instead agent uses sample returns for evaluating policy
    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    # save <s, a, r> of each step
    def append_sample(self, state, action, reward):
        self.states.append(state)
        self.rewards.append(reward)
        self.actions.append(action)

    # update policy network every episode
    def train_model(self):
        '''
        example:
self.states:[array([[-0.00647736, -0.04499117, 0.02213829, -0.00486359]]), array([[-0.00737719, -0.24042351, 0.02204101, 0.2947212 ]]), array([[-0.01218566, -0.04562261, 0.02793544, 0.00907036]]), array([[-0.01309811, -0.24113382, 0.02811684, 0.31043471]]), array([[-0.01792078, -0.04642351, 0.03432554, 0.02674995]]), array([[-0.01884925, -0.24202048, 0.03486054, 0.33006229]]), array([[-0.02368966, -0.04741166, 0.04146178, 0.04857336]]), array([[-0.0246379 , -0.24310286, 0.04243325, 0.35404415]]), array([[-0.02949995, -0.43880168, 0.04951413, 0.65979978]]), array([[-0.03827599, -0.2444025 , 0.06271013, 0.38310959]]), array([[-0.04316404, -0.44035616, 0.07037232, 0.69488702]]), array([[-0.05197116, -0.63637999, 0.08427006, 1.00886738]]), array([[-0.06469876, -0.83251953, 0.10444741, 1.32677873]]), array([[-0.08134915, -0.63885961, 0.13098298, 1.06852366]]), array([[-0.09412634, -0.44569036, 0.15235346, 0.8196508 ]]), array([[-0.10304015, -0.25294509, 0.16874647, 0.57850069]]), array([[-0.10809905, -0.44997994, 0.18031649, 0.91923131]]), array([[-0.11709865, -0.25769299, 0.19870111, 0.68820344]])]
self.rewards:[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -100]
self.actions:[0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1]
        '''
        episode_length = len(self.states)
        discounted_rewards = self.discount_rewards(self.rewards)
        '''
        example:
discounted_rewards: array([ -68.58863868, -70.29155422, -72.01167093, -73.74916255,-75.5042046 , -77.27697434, -79.06765085, -80.876415 , -82.7034495 , -84.54893889, -86.41306958, -88.29602988,-90.19800998, -92.119202 , -94.0598,-96.02,-98., -100. ])
        '''
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)  # the normalized returns are used as the training targets
        '''
        example (after normalization):
array([ 1.59468271, 1.41701722, 1.23755712, 1.05628429, 0.87318042,
0.68822702, 0.50140541, 0.3126967 , 0.12208185, -0.0704584 ,
-0.26494351, -0.46139311, -0.65982705, -0.86026537, -1.06272832,
-1.26723636, -1.47381013, -1.6824705 ])
        '''
        update_inputs = np.zeros((episode_length, self.state_size))   # shape (18, 4) in the example above
        advantages = np.zeros((episode_length, self.action_size))     # shape (18, 2) in the example above
        for i in range(episode_length):
            update_inputs[i] = self.states[i]
            advantages[i][self.actions[i]] = discounted_rewards[i]

        self.model.fit(update_inputs, advantages, epochs=1, verbose=0)
        self.states, self.actions, self.rewards = [], [], []


if __name__ == "__main__":
    # In case of CartPole-v1, you can play until 500 time step
    env = gym.make('CartPole-v1')
    # get size of state and action from environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # make REINFORCE agent
    agent = REINFORCEAgent(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        # import pdb; pdb.set_trace()  # debugging breakpoint from the original post, disabled so training runs uninterrupted
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # give a -100 penalty if the pole falls before the 500-step limit
            reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r> to the memory
            agent.append_sample(state, action, reward)

            score += reward
            state = next_state

            if done:
                # every episode, agent learns from sample returns
                agent.train_model()

                # every episode, plot the play time
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_reinforce.png")
                print("episode:", e, " score:", score)

                # if the mean of scores of last 10 episode is bigger than 490
                # stop training
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    sys.exit()

        # save the model
        if e % 50 == 0:
            agent.model.save_weights("./save_model/cartpole_reinforce.h5")
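The only non-obvious part of the script is the loss. Below is a minimal numpy-only sketch (my own illustration, not part of the original script, with made-up numbers) showing that categorical crossentropy with the advantage placed at the index of the taken action, as built in train_model, reduces to -A * log(policy(s, a)):

import numpy as np

# Sketch: one state with two actions, mirroring one row of `advantages` in train_model
policy = np.array([0.7, 0.3])   # softmax output of the policy network (hypothetical values)
action = 0                       # the action that was actually sampled
advantage = 1.59                 # its normalized discounted return

y_true = np.zeros_like(policy)
y_true[action] = advantage       # the "label" row handed to model.fit

cross_entropy = -np.sum(y_true * np.log(policy))
print(cross_entropy)                           # ~0.567
print(-advantage * np.log(policy[action]))     # identical value

Minimizing this loss raises log policy(s, a) when the normalized return is positive and lowers it when it is negative, which is exactly the REINFORCE update stated above.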
