From here on the demo switches to a different game: CartPole.

Deep Q Network

Example code

import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 300


# DQN Agent for the Cartpole
# it uses a Neural Network to approximate Q-learning's Q function,
# plus an experience replay memory & a fixed target Q network
class DQNAgent:
    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = True
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_dqn.h5")

    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
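
    # For CartPole the network is 4 -> 24 -> 24 -> 2: the output layer is
    # linear rather than softmax because it regresses unbounded Q values,
    # one per action.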

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            # predict Q(s, a) for every action (shape (1, 2) here) and
            # take the action with the largest Q value
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])
    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
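        # Note: epsilon is multiplied by 0.999 once per stored sample, so it
        # decays from 1.0 down to the 0.01 floor after roughly
        # ln(0.01) / ln(0.999) ≈ 4600 environment steps.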

    # pick samples randomly from replay memory (with batch_size)
    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        # mini_batch is a list of 64 transitions, e.g.
        # (array([[-0.04263461, -0.00657423, 0.00506589, -0.00200269]]), 0, 1.0,
        #  array([[-0.04276609, -0.20176846, 0.00502584, 0.29227427]]), False)
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)               # shape (64, 2)
        target_val = self.target_model.predict(update_target)   # shape (64, 2)

        for i in range(self.batch_size):
            # Q Learning: get maximum Q value at s' from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # off-policy update: r + gamma * max_a' Q_target(s', a')
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        # and do the model fit!
        self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)
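
    # Note on the fixed target network: target_val above comes from
    # target_model, whose weights are only copied from the online model once
    # per episode (see update_target_model in the main loop below), so the
    # regression target does not shift at every gradient step.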


if __name__ == "__main__":
    # In case of CartPole-v1, maximum length of episode is 500
    env = gym.make('CartPole-v1')
    # get size of state and action from environment
    state_size = env.observation_space.shape[0]   # 4 for CartPole-v1
    action_size = env.action_space.n              # 2 for CartPole-v1

    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # if an action makes the episode end, give a penalty of -100
            reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(state, action, reward, next_state, done)
            # every time step do the training
            agent.train_model()
            score += reward
            state = next_state

            if done:
                # every episode update the target model to be same with model
                agent.update_target_model()

                # every episode, plot the play time
                # (add the 100 back so the plotted score is the episode length)
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_dqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # if the mean of scores of the last 10 episodes is bigger
                # than 490, stop training
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    sys.exit()

        # save the model every 50 episodes
        if e % 50 == 0:
            agent.model.save_weights("./save_model/cartpole_dqn.h5")
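
The core of train_model above is the Q-learning target r + gamma * max_a' Q_target(s', a'). As a standalone illustration only (the batch below uses made-up toy numbers, not values from a real run), the same per-sample loop can be written in vectorized NumPy:

import numpy as np

# a hypothetical mini-batch of 3 transitions in a 2-action environment
q_online = np.array([[1.0, 2.0],        # Q(s, .) from the online model
                     [0.5, 0.1],
                     [3.0, 1.0]])
q_target_next = np.array([[1.5, 0.8],   # Q(s', .) from the target model
                          [2.0, 2.5],
                          [0.0, 0.0]])
actions = np.array([1, 0, 1])
rewards = np.array([1.0, 1.0, -100.0])
dones = np.array([False, False, True])
gamma = 0.99

# Q-learning target: r + gamma * max_a' Q_target(s', a'), or just r when done
td_target = rewards + gamma * np.max(q_target_next, axis=1) * (~dones)

# only the entry for the action actually taken is changed; the other outputs
# keep the online prediction, so they contribute zero to the MSE loss
y = q_online.copy()
y[np.arange(len(actions)), actions] = td_target
print(y)   # the regression targets that would be passed to model.fit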

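Once ./save_model/cartpole_dqn.h5 has been written, the trained agent can be checked with exploration turned off. This is only a minimal sketch, assuming the DQNAgent class from the listing above is available in the same script and the same gym API (step returning four values):

import gym
import numpy as np

# greedy evaluation of a saved model, reusing DQNAgent from the listing above
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)
agent.model.load_weights("./save_model/cartpole_dqn.h5")
agent.epsilon = 0.0          # act greedily, no exploration
agent.render = False

for episode in range(5):
    state = np.reshape(env.reset(), [1, state_size])
    done, score = False, 0
    while not done:
        action = agent.get_action(state)
        state, reward, done, info = env.step(action)
        state = np.reshape(state, [1, state_size])
        score += reward
    print("evaluation episode", episode, "score:", score)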