深度增强学习--A3C
它会创建多个并行的环境, 让多个拥有副结构的 agent 同时在这些并行环境上更新主结构中的参数. 并行中的 agent 们互不干扰, 而主结构的参数更新受到副结构提交更新的不连续性干扰, 所以更新的相关性被降低, 收敛性提高

import threading
import numpy as np
import tensorflow as tf
import pylab
import time
import gym
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K # global variables for threading
episode = 0
scores = [] EPISODES = 2000 # This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole
# In this example, we use A3C algorithm
class A3CAgent:
def __init__(self, state_size, action_size, env_name):
# get size of state and action
self.state_size = state_size
self.action_size = action_size # get gym environment name
self.env_name = env_name # these are hyper parameters for the A3C
self.actor_lr = 0.001
self.critic_lr = 0.001
self.discount_factor = .99
self.hidden1, self.hidden2 = 24, 24
self.threads = 8 #8个线程并行 # create model for actor and critic network
self.actor, self.critic = self.build_model() # method for training actor and critic network
self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] self.sess = tf.InteractiveSession()
K.set_session(self.sess)
self.sess.run(tf.global_variables_initializer()) # approximate policy and value using Neural Network
# actor -> state is input and probability of each action is output of network
# critic -> state is input and value of state is output of network
# actor and critic network share first hidden layer
def build_model(self):
state = Input(batch_shape=(None, self.state_size))
shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state) actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared)
action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden) value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared)
state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden) actor = Model(inputs=state, outputs=action_prob)
critic = Model(inputs=state, outputs=state_value) actor._make_predict_function()
critic._make_predict_function() actor.summary()
critic.summary() return actor, critic # make loss function for Policy Gradient
# [log(action probability) * advantages] will be input for the back prop
# we add entropy of action probability to loss
def actor_optimizer(self):
action = K.placeholder(shape=(None, self.action_size))
advantages = K.placeholder(shape=(None, )) policy = self.actor.output good_prob = K.sum(action * policy, axis=1)
eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
loss = -K.sum(eligibility) entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) actor_loss = loss + 0.01*entropy optimizer = Adam(lr=self.actor_lr)
updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
train = K.function([self.actor.input, action, advantages], [], updates=updates)
return train # make loss function for Value approximation
def critic_optimizer(self):
discounted_reward = K.placeholder(shape=(None, )) value = self.critic.output loss = K.mean(K.square(discounted_reward - value)) optimizer = Adam(lr=self.critic_lr)
updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
train = K.function([self.critic.input, discounted_reward], [], updates=updates)
return train # make agents(local) and start training
def train(self):
# self.load_model('./save_model/cartpole_a3c.h5')
agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,
self.action_size, self.state_size) for i in range(self.threads)]#建立8个local agent for agent in agents:
agent.start() while True:
time.sleep(20) plot = scores[:]
pylab.plot(range(len(plot)), plot, 'b')
pylab.savefig("./save_graph/cartpole_a3c.png") self.save_model('./save_model/cartpole_a3c.h5') def save_model(self, name):
self.actor.save_weights(name + "_actor.h5")
self.critic.save_weights(name + "_critic.h5") def load_model(self, name):
self.actor.load_weights(name + "_actor.h5")
self.critic.load_weights(name + "_critic.h5") # This is Agent(local) class for threading
class Agent(threading.Thread):
def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):
threading.Thread.__init__(self) self.states = []
self.rewards = []
self.actions = [] self.index = index
self.actor = actor
self.critic = critic
self.optimizer = optimizer
self.env_name = env_name
self.discount_factor = discount_factor
self.action_size = action_size
self.state_size = state_size # Thread interactive with environment
def run(self):
global episode
env = gym.make(self.env_name)
while episode < EPISODES:
state = env.reset()
score = 0
while True:
action = self.get_action(state)
next_state, reward, done, _ = env.step(action)
score += reward self.memory(state, action, reward) state = next_state if done:
episode += 1
print("episode: ", episode, "/ score : ", score)
scores.append(score)
self.train_episode(score != 500)
break # In Policy Gradient, Q function is not available.
# Instead agent uses sample returns for evaluating policy
def discount_rewards(self, rewards, done=True):
discounted_rewards = np.zeros_like(rewards)
running_add = 0
if not done:
running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0]
for t in reversed(range(0, len(rewards))):
running_add = running_add * self.discount_factor + rewards[t]
discounted_rewards[t] = running_add
return discounted_rewards # save <s, a ,r> of each step
# this is used for calculating discounted rewards
def memory(self, state, action, reward):
self.states.append(state)
act = np.zeros(self.action_size)
act[action] = 1
self.actions.append(act)
self.rewards.append(reward) # update policy network and value network every episode
def train_episode(self, done):
discounted_rewards = self.discount_rewards(self.rewards, done) values = self.critic.predict(np.array(self.states))
values = np.reshape(values, len(values)) advantages = discounted_rewards - values self.optimizer[0]([self.states, self.actions, advantages])
self.optimizer[1]([self.states, discounted_rewards])
self.states, self.actions, self.rewards = [], [], [] def get_action(self, state):
policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
return np.random.choice(self.action_size, 1, p=policy)[0] if __name__ == "__main__":
env_name = 'CartPole-v1'
env = gym.make(env_name) state_size = env.observation_space.shape[0]
action_size = env.action_space.n env.close() global_agent = A3CAgent(state_size, action_size, env_name)
global_agent.train()
深度增强学习--A3C的更多相关文章
- 深度增强学习--DPPO
PPO DPPO介绍 PPO实现 代码DPPO
- 深度增强学习--DDPG
DDPG DDPG介绍2 ddpg输出的不是行为的概率, 而是具体的行为, 用于连续动作 (continuous action) 的预测 公式推导 推导 代码实现的gym的pendulum游戏,这个游 ...
- 深度增强学习--DQN的变形
DQN的变形 double DQN prioritised replay dueling DQN
- 深度增强学习--Actor Critic
Actor Critic value-based和policy-based的结合 实例代码 import sys import gym import pylab import numpy as np ...
- 深度增强学习--Policy Gradient
前面都是value based的方法,现在看一种直接预测动作的方法 Policy Based Policy Gradient 一个介绍 karpathy的博客 一个推导 下面的例子实现的REINFOR ...
- 深度增强学习--Deep Q Network
从这里开始换个游戏演示,cartpole游戏 Deep Q Network 实例代码 import sys import gym import pylab import random import n ...
- 常用增强学习实验环境 II (ViZDoom, Roboschool, TensorFlow Agents, ELF, Coach等) (转载)
原文链接:http://blog.csdn.net/jinzhuojun/article/details/78508203 前段时间Nature上发表的升级版Alpha Go - AlphaGo Ze ...
- 马里奥AI实现方式探索 ——神经网络+增强学习
[TOC] 马里奥AI实现方式探索 --神经网络+增强学习 儿时我们都曾有过一个经典游戏的体验,就是马里奥(顶蘑菇^v^),这次里约奥运会闭幕式,日本作为2020年东京奥运会的东道主,安倍最后也已经典 ...
- 增强学习 | AlphaGo背后的秘密
"敢于尝试,才有突破" 2017年5月27日,当今世界排名第一的中国棋手柯洁与AlphaGo 2.0的三局对战落败.该事件标志着最新的人工智能技术在围棋竞技领域超越了人类智能,借此 ...
随机推荐
- .NET对IO的基本操作集合
分享一下对IO的基本使用,很简单的东西,不需要记住,用的时候看一下就可以了. 个人对IO的使用很少,记录日志,保存一些数据,保存文件,其他的基本上很少用到,做商城类的项目应该会用到很多 1.先配置一下 ...
- 【面试题】整理一下2018年java技术要领
整理一下2018年java技术要领 基础篇 基本功 面向对象的特征 final, finally, finalize 的区别 int 和 Integer 有什么区别 重载和重写的区别 抽象类和接口有什 ...
- 2.Spark Streaming运行机制和架构
1 解密Spark Streaming运行机制 上节课我们谈到了技术界的寻龙点穴.这就像过去的风水一样,每个领域都有自己的龙脉,Spark就是龙脉之所在,它的龙穴或者关键点就是SparkStreami ...
- Jsonp方式和httpclient方式有什么区别?
jsonp基于js,解决跨域问题,本质发起ajax情求但是Jsonp只支持get请求. 它不安全,它先解析js,然后发起ajax请求,然后获取到返回值,通过浏览器返回,最后解析. JQuery和Spr ...
- 在SpringMVC中 /* 和 / 的区别
<url-pattern> / </url-pattern>:会匹配到 /springmvc 这样的路径型url,而不会匹配到像 .jsp 这样的后缀型的url. <ur ...
- CodeForces 740C Alyona and mex
构造. 比较骚的构造题.肯定可以构造出$min(R-L+1)$,只要$0$ $1$ $2$ $...$ $R-L$ $0$ $1$ $2$ $...$ $R-L$填数字即可,这样任意一段区间都包含了$ ...
- 第5天:Ansible-Playbook
Ansible中的每个模块专注于某一方面的功能.虽然每个模块实现的功能都比较简单,但是,将各个模块结合起来就可以实现比较复杂的功能.在Ansible中,将各个模块组合起来的文件是一个YAML格式的配置 ...
- 文件还原工具Foremost
文件还原工具Foremost 在数字取证中,通过对设备备份,可以获取磁盘镜像文件.通过分析镜像文件,可以获取磁盘存在的数据.但是很多重要数据往往已被删除.这个时候,就需要还原这些文件.Kali L ...
- VB查询数据库之写入数据库——机房收费系统总结(三)
在机房收费系统中,新注册的用户,更改的密码,上机下级记录,上机收费记录等等都要写入数据库,这样,后面的查询才能生效.像数据库中写入数据,首先,找到你要写入数据的数据库中的表,在表中建立新的行,让后再把 ...
- 1.2(SQL学习笔记)高级数据过滤
一.AND 通过WHERE可以进行条件过滤,但只限于单个条件. 通过AND就可以连接多个条件,AND代表了和,即AND两边的条件全部满足才会通过筛选. 这就类似编程语言中的&&. 以下 ...