深度增强学习--Deep Q Network

从这里开始换一个游戏来演示：CartPole 游戏。

 import sys

 import gym

 import pylab

 import random

 import numpy as np

 from collections import deque

 from keras.layers import Dense

 from keras.optimizers import Adam

 from keras.models import Sequential

 EPISODES = 300

 # DQN agent for the CartPole environment.

 # It uses a neural network to approximate the Q function of Q-learning,

 # together with an experience replay memory and a fixed target Q-network.

 class DQNAgent:
     """Deep Q-Network agent with experience replay and a target network.

     Two identically shaped networks are kept: ``model`` is trained on every
     step, while ``target_model`` only changes when ``update_target_model``
     copies the weights across. Transitions are stored in a bounded replay
     memory and sampled uniformly at random for training.
     """

     def __init__(self, state_size, action_size):
         """Set hyper parameters, build both networks and sync their weights.

         Args:
             state_size: Number of values in one state vector (network input).
             action_size: Number of discrete actions (network output).
         """
         # Rendering is on by default; set to False to train without a window.
         self.render = True
         # When True, weights are loaded from ./save_model/cartpole_dqn.h5.
         self.load_model = False

         # get size of state and action
         self.state_size = state_size
         self.action_size = action_size

         # These are hyper parameters for the DQN
         self.discount_factor = 0.99
         self.learning_rate = 0.001
         self.epsilon = 1.0            # current exploration probability
         self.epsilon_decay = 0.999    # multiplied in on every stored sample
         self.epsilon_min = 0.01       # decay stops once epsilon is not above this
         self.batch_size = 64
         self.train_start = 1000       # minimum memory size before training

         # create replay memory using deque (oldest samples drop out first)
         self.memory = deque(maxlen=2000)

         # create main model and target model
         self.model = self.build_model()
         self.target_model = self.build_model()

         # initialize target model
         self.update_target_model()

         if self.load_model:
             self.model.load_weights("./save_model/cartpole_dqn.h5")

     def build_model(self):
         """Build and compile the Q-network.

         The network takes a state as input and outputs one Q value per
         action: two hidden ReLU layers of 24 units and a linear output layer,
         trained with MSE loss and the Adam optimizer.

         Returns:
             The compiled Keras ``Sequential`` model.
         """
         model = Sequential()
         model.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
         model.add(Dense(24, activation='relu',
                         kernel_initializer='he_uniform'))
         model.add(Dense(self.action_size, activation='linear',
                         kernel_initializer='he_uniform'))
         model.summary()
         # NOTE(review): newer Keras releases name this argument
         # `learning_rate`; `lr` is kept for the Keras version this file
         # targets -- confirm against the installed version.
         model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
         return model

     def update_target_model(self):
         """Copy the weights of the main model into the target model."""
         self.target_model.set_weights(self.model.get_weights())

     def get_action(self, state):
         """Choose an action with an epsilon-greedy policy.

         Args:
             state: State passed straight to ``model.predict``; the first row
                 of the prediction is used.

         Returns:
             A random action index with probability ``epsilon``, otherwise the
             index of the largest predicted Q value.
         """
         if np.random.rand() <= self.epsilon:
             return random.randrange(self.action_size)
         q_value = self.model.predict(state)
         return np.argmax(q_value[0])

     def append_sample(self, state, action, reward, next_state, done):
         """Store one transition <s, a, r, s', done> and decay epsilon."""
         self.memory.append((state, action, reward, next_state, done))
         if self.epsilon > self.epsilon_min:
             self.epsilon *= self.epsilon_decay

     def train_model(self):
         """Fit the main model on one random mini-batch from replay memory.

         Does nothing until the memory holds at least ``train_start`` samples.
         For every sampled transition only the Q value of the action taken is
         replaced: by ``reward`` for terminal transitions, otherwise by
         ``reward + discount_factor * max_a' Q_target(s', a')``.
         """
         if len(self.memory) < self.train_start:
             return

         # Never request more samples than the memory holds; this clamped
         # size is used for every array, loop and the fit call below.
         batch_size = min(self.batch_size, len(self.memory))
         mini_batch = random.sample(self.memory, batch_size)

         states = np.zeros((batch_size, self.state_size))
         next_states = np.zeros((batch_size, self.state_size))
         actions, rewards, dones = [], [], []
         for row, (state, action, reward, next_state, done) in enumerate(
                 mini_batch):
             states[row] = state
             next_states[row] = next_state
             actions.append(action)
             rewards.append(reward)
             dones.append(done)

         target = self.model.predict(states)
         target_val = self.target_model.predict(next_states)

         # Q Learning: get maximum Q value at s' from target model
         # (off-policy update); terminal transitions use the reward alone.
         rewards = np.asarray(rewards, dtype=float)
         best_next = np.amax(target_val, axis=1)
         td_target = np.where(np.asarray(dones, dtype=bool),
                              rewards,
                              rewards + self.discount_factor * best_next)
         target[np.arange(batch_size), np.asarray(actions)] = td_target

         # and do the model fit!
         self.model.fit(states, target, batch_size=batch_size,
                        epochs=1, verbose=0)

 if __name__ == "__main__":
     # Training script: runs up to EPISODES episodes of CartPole-v1, training
     # the agent on every step and plotting the score after every episode.
     import os  # local import: only this script section needs it

     # The plot and the weights are written into these directories; create
     # them up front so the first save does not fail on a fresh checkout.
     os.makedirs("./save_graph", exist_ok=True)
     os.makedirs("./save_model", exist_ok=True)

     # In case of CartPole-v1, maximum length of episode is 500
     env = gym.make('CartPole-v1')

     # get size of state and action from environment
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n

     agent = DQNAgent(state_size, action_size)

     scores, episodes = [], []

     for e in range(EPISODES):
         done = False
         score = 0
         # NOTE(review): reset() is used as returning the observation only and
         # step() as returning 4 values -- presumably the classic gym API;
         # confirm against the installed gym version.
         state = env.reset()
         state = np.reshape(state, [1, state_size])

         while not done:
             if agent.render:
                 env.render()

             # get action for the current state and go one step in environment
             action = agent.get_action(state)
             next_state, reward, done, info = env.step(action)
             next_state = np.reshape(next_state, [1, state_size])

             # if an action make the episode end, then gives penalty of -100
             # (not applied when the score so far is 499, i.e. the episode
             # ran to its maximum length)
             reward = reward if not done or score == 499 else -100

             # save the sample <s, a, r, s'> to the replay memory
             agent.append_sample(state, action, reward, next_state, done)

             # every time step do the training
             agent.train_model()

             score += reward
             state = next_state

             if done:
                 # every episode update the target model to be same with model
                 agent.update_target_model()

                 # every episode, plot the play time; adding 100 back undoes
                 # the -100 penalty so the score reflects the steps survived
                 score = score if score == 500 else score + 100
                 scores.append(score)
                 episodes.append(e)
                 pylab.plot(episodes, scores, 'b')
                 pylab.savefig("./save_graph/cartpole_dqn.png")
                 print("episode:", e, "  score:", score, "  memory length:",
                       len(agent.memory), "  epsilon:", agent.epsilon)

                 # if the mean of scores of last 10 episode is bigger than 490
                 # stop training
                 if np.mean(scores[-min(10, len(scores)):]) > 490:
                     # Save first: exiting here skips the periodic save
                     # below, which would discard the solved weights.
                     agent.model.save_weights("./save_model/cartpole_dqn.h5")
                     sys.exit()

         # save the model
         if e % 50 == 0:
             agent.model.save_weights("./save_model/cartpole_dqn.h5")

深度增强学习--Deep Q Network的更多相关文章

AlphaGo的前世今生（一）Deep Q Network and Game Search Tree：Road to AI Revolution
这一个专题将会是有关AlphaGo的前世今生以及其带来的AI革命,总共分成三节.本人水平有限,如有错误还望指正.如需转载,须征得本人同意. Road to AI Revolution(通往AI革命之路 ...
强化学习系列之:Deep Q Network (DQN)
文章目录 [隐藏] 1. 强化学习和深度学习结合 2. Deep Q Network (DQN) 算法 3. 后续发展 3.1 Double DQN 3.2 Prioritized Replay 3. ...
Deep Q Network(DQN)原理解析
1. 前言在前面的章节中我们介绍了时序差分算法(TD)和Q-Learning,当状态和动作空间是离散且维数不高时可使用Q-Table储存每个状态动作对的Q值,而当状态和动作空间是高维连续时,使用Q- ...
【转】【强化学习】Deep Q Network(DQN)算法详解
原文地址:https://blog.csdn.net/qq_30615903/article/details/80744083 DQN(Deep Q-Learning)是将深度学习deeplearni ...
深度增强学习--DDPG
DDPG DDPG介绍2 ddpg输出的不是行为的概率, 而是具体的行为, 用于连续动作 (continuous action) 的预测公式推导推导代码实现的gym的pendulum游戏,这个游 ...
深度增强学习--A3C
A3C 它会创建多个并行的环境, 让多个拥有副结构的 agent 同时在这些并行环境上更新主结构中的参数. 并行中的 agent 们互不干扰, 而主结构的参数更新受到副结构提交更新的不连续性干扰, 所 ...
深度增强学习--Actor Critic
Actor Critic value-based和policy-based的结合实例代码 import sys import gym import pylab import numpy as np ...
深度增强学习--Policy Gradient
前面都是value based的方法,现在看一种直接预测动作的方法 Policy Based Policy Gradient 一个介绍 karpathy的博客一个推导下面的例子实现的REINFOR ...
深度增强学习--DPPO
PPO DPPO介绍 PPO实现代码DPPO

随机推荐

自旋锁spin_lock和raw_spin_lock【转】
转自:http://blog.csdn.net/droidphone/article/details/7395983 版权声明:本文为博主原创文章,未经博主允许不得转载. 目录(?)[-] 临界区Cr ...
python基础===两个list合并成一个dict的方法
def Run(): list2 = [, , , , ]; list3 = ["a", "b", "c", "d", " ...
(十四)git操作
https://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000
servlet+forward和direct区别
Servlet:是用于 java 编写的服务器端程序,其使用 java servlet API,当客户机发送请求到服务器时,服务器可以将请求信息发送给 servlet,并让 servlet 建立起服务 ...
Ubuntu系统进程管理笔记
前言今天对前端服务器进行迁移,本来前端服务器就一台,都是放置前端静态文件的地方,应该是比较简单的.唯一的问题是由于Nginx需要给ie8浏览器个i同https访问支持,不得不对Nginx进行重新编译 ...
Asp.Net Core 项目实战参考
Asp.Net Core 项目实战链接 http://www.cnblogs.com/fonour/p/5904530.html
《JAVA8实战》读书笔记之传递方法和传递lambda
传递方法: 假设你有一个Apple类,它有一个getColor方法,还有一个变量inventory保存着一个Apples的列表.你可能想要选出所有的绿苹果,并返回一个列表.通常我们用筛选(fil ...
[BZOJ1475]方格取数网络流最小割
1475: 方格取数 Time Limit: 5 Sec Memory Limit: 64 MBSubmit: 1025 Solved: 512[Submit][Status][Discuss] ...
【原创】Maven cobertura整合多个子项目下的单测覆盖率报告
今天在调试一个UT job的时候发现找不到cobertura报告文件,后来发现在Maven的子项目里找到了对应的代码覆盖率报告,但都是分散在每个子项目下面的,看起来很不方便.就在想是不是可以把这些 ...
HDU1009：FatMouse' Trade(初探贪心，wait)
FatMouse prepared M pounds of cat food, ready to trade with the cats guarding the warehouse containi ...

深度增强学习--Deep Q Network

深度增强学习--Deep Q Network的更多相关文章

随机推荐

热门专题