2018 年 7 月 31 日 - 作者:Raymond Yuan,软件工程实习生
在本教程中,我们将学习如何使用深度强化学习训练一个能够赢得简单游戏 CartPole 的模型。我们将使用 tf.keras 和 OpenAI 的 gym,通过一种称为异步优势 Actor-Critic (A3C) 的技术来训练智能体。强化学习一直备受关注,但它究竟是什么呢?强化学习是机器学习的一个领域,其中的智能体需要在环境中采取行动,以最大化所获得的某种奖励。
class RandomAgent:
    """Baseline agent that plays the specified game with random actions.

    Arguments:
      env_name: Name of the environment to be played.
      max_eps: Maximum number of episodes to run agent for.
    """

    def __init__(self, env_name, max_eps):
        self.env = gym.make(env_name)
        self.max_episodes = max_eps
        # Threaded through record(); starts at 0 before any episode is played.
        self.global_moving_average_reward = 0
        # Passed to record() on every episode; nothing in this class reads it.
        self.res_queue = Queue()

    def run(self):
        """Play `max_episodes` episodes with sampled actions.

        Returns:
          The mean total reward per episode, or 0.0 when no episodes were
          requested (the original divided by zero in that case).
        """
        reward_avg = 0
        for episode in range(self.max_episodes):
            done = False
            self.env.reset()
            reward_sum = 0.0
            steps = 0
            while not done:
                # Sample randomly from the action space and step
                _, reward, done, _ = self.env.step(
                    self.env.action_space.sample())
                steps += 1
                reward_sum += reward
            # Record statistics. record() is defined elsewhere in the file;
            # its return value becomes the new moving average.
            self.global_moving_average_reward = record(
                episode,
                reward_sum,
                0,
                self.global_moving_average_reward,
                self.res_queue, 0, steps)
            reward_avg += reward_sum

        if self.max_episodes > 0:
            final_avg = reward_avg / float(self.max_episodes)
        else:
            # No episode was played, so there is nothing to average.
            final_avg = 0.0
        print("Average score across {} episodes: {}".format(
            self.max_episodes, final_avg))
        return final_avg
对于游戏 CartPole,我们在 4000 个回合中获得约 20 的平均得分。要运行随机智能体,请运行提供的 py 文件:python a3c_cartpole.py --algorithm=random --max-eps=4000
。class ActorCriticModel(keras.Model):
def __init__(self, state_size, action_size):
    """Create the layers of the two-branch actor-critic network.

    Arguments:
      state_size: Stored on the instance; not used to size any layer here.
      action_size: Number of units in the policy-logits output layer.
    """
    super(ActorCriticModel, self).__init__()
    self.state_size, self.action_size = state_size, action_size

    hidden_units = 100
    # Policy branch: hidden layer followed by one logit per action.
    self.dense1 = layers.Dense(hidden_units, activation='relu')
    self.policy_logits = layers.Dense(action_size)
    # Value branch: its own hidden layer followed by a single output unit.
    self.dense2 = layers.Dense(hidden_units, activation='relu')
    self.values = layers.Dense(1)
def call(self, inputs):
    """Forward pass: feed `inputs` through both branches.

    Returns:
      A `(logits, values)` tuple: the output of the policy branch
      (dense1 -> policy_logits) and of the value branch (dense2 -> values).
      Both branches read `inputs` directly; they share no hidden layer.
    """
    # Tuple elements are evaluated left to right, so the layers are invoked
    # in the order dense1, policy_logits, dense2, values.
    return (
        self.policy_logits(self.dense1(inputs)),
        self.values(self.dense2(inputs)),
    )
从我们的前向传递中可以看出,我们的模型将接收输入并返回策略概率 logits 和值。class MasterAgent():
def __init__(self):
    """Set up the save directory, the shared optimizer and the global model.

    Reads `args.save_dir` and `args.lr` from the module-level `args` object
    (defined elsewhere in the file).
    """
    self.game_name = 'CartPole-v0'
    save_dir = args.save_dir
    self.save_dir = save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # A throwaway environment is created only to read the sizes of the
    # observation and action spaces; close it so it does not linger.
    env = gym.make(self.game_name)
    try:
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
    finally:
        env.close()

    self.opt = tf.train.AdamOptimizer(args.lr, use_locking=True)
    print(self.state_size, self.action_size)

    self.global_model = ActorCriticModel(self.state_size, self.action_size)  # global network
    # Call the model once on a random (1, state_size) input.
    # NOTE(review): presumably this forces Keras to create the layer
    # variables before workers read or update them - confirm.
    self.global_model(tf.convert_to_tensor(np.random.random((1, self.state_size)), dtype=tf.float32))
主代理将运行训练函数来实例化并启动每个代理。主代理负责协调和监督每个代理。这些代理中的每一个都将异步运行。(从技术上讲,这不是真正的异步,因为在 Python 中,由于 GIL(全局解释器锁),单个 Python 进程无法并行运行线程(利用多个内核)。但是,它可以并发运行它们(在 I/O 绑定操作期间进行上下文切换)。为了简单和清晰地说明示例,我们使用线程来实现。)def train(self):
if args.algorithm == 'random':
random_agent = RandomAgent(self.game_name, args.max_eps)
random_agent.run()
return
res_queue = Queue()
workers = [Worker(self.state_size,
self.action_size,
self.global_model,
self.opt, res_queue,
i, game_name=self.game_name,
save_dir=self.save_dir) for i in range(multiprocessing.cpu_count())]
for i, worker in enumerate(workers):
print("Starting worker {}".format(i))
worker.start()
moving_average_rewards = [] # record episode reward to plot
while True:
reward = res_queue.get()
if reward is not None:
moving_average_rewards.append(reward)
else:
break
[w.join() for w in workers]
plt.plot(moving_average_rewards)
plt.ylabel('Moving average ep reward')
plt.xlabel('Step')
plt.savefig(os.path.join(self.save_dir,
'{} Moving Average.png'.format(self.game_name)))
plt.show()
Memory
类。该类将只提供我们跟踪每个步骤发生的行动、奖励和状态的功能。class Memory:
def __init__(self):
self.states = []
self.actions = []
self.rewards = []
def store(self, state, action, reward):
self.states.append(state)
self.actions.append(action)
self.rewards.append(reward)
def clear(self):
self.states = []
self.actions = []
self.rewards = []
现在,我们进入算法的核心:工作智能体。工作智能体继承自 threading.Thread 类,并且我们重写了 Thread 的 run
方法。这将使我们能够实现 A3C 中的第一个 A,即异步。首先,我们将通过实例化一个本地模型并设置特定的训练参数来开始。class Worker(threading.Thread):
# Class-level state shared by every Worker thread; run() reads and writes
# these as Worker.<name>, not as per-instance attributes.
# Set up global variables across different threads
# Incremented each time an episode finishes; run() keeps looping while it is
# below args.max_eps.
global_episode = 0
# Moving average reward
# Overwritten with the return value of record(...) at the end of each episode.
global_moving_average_reward = 0
# Best single-episode reward seen so far; a higher one makes run() save the
# global model's weights.
best_score = 0
# Held by run() while it prints the "Saving best model" message and writes
# the weights file.
save_lock = threading.Lock()
def __init__(self,
             state_size,
             action_size,
             global_model,
             opt,
             result_queue,
             idx,
             game_name='CartPole-v0',
             save_dir='/tmp'):
    """Create a worker thread with its own local model and environment.

    Arguments:
      state_size: Passed to the local ActorCriticModel.
      action_size: Passed to the local ActorCriticModel.
      global_model: Shared model that run() applies gradients to and copies
        weights from.
      opt: Optimizer used to apply gradients to `global_model`.
      result_queue: Queue handed to record() and given a final None by run().
      idx: Index identifying this worker.
      game_name: Name of the gym environment to create.
      save_dir: Directory where the best model weights are written.
    """
    super(Worker, self).__init__()

    # Objects shared with the master agent and the other workers.
    self.global_model = global_model
    self.opt = opt
    self.result_queue = result_queue

    # State private to this worker.
    self.worker_idx = idx
    self.state_size, self.action_size = state_size, action_size
    self.local_model = ActorCriticModel(state_size, action_size)
    self.game_name = game_name
    # `.unwrapped` takes the underlying environment without gym's wrappers.
    self.env = gym.make(game_name).unwrapped
    self.save_dir = save_dir
    self.ep_loss = 0.0
run 函数。这将实际运行我们的算法。我们将让所有线程持续运行,直到达到给定的全局最大回合数。这就是 A3C 中的第三个 A,即 Actor(行动者),发挥作用的地方。我们的智能体将根据策略函数“行动”,成为行动者,而“评论家”——我们的价值函数——则对这些行动作出评判。虽然这部分代码看起来可能很密集,但实际上并没有做太多事情。在每个回合中,代码只是执行以下操作:如果智能体已执行了设定的步数(args.update_freq
)或已达到终止状态(已死亡),则使用从本地模型计算的梯度更新全局模型def run(self):
total_step = 1
mem = Memory()
while Worker.global_episode < args.max_eps:
current_state = self.env.reset()
mem.clear()
ep_reward = 0.
ep_steps = 0
self.ep_loss = 0
time_count = 0
done = False
while not done:
logits, _ = self.local_model(
tf.convert_to_tensor(current_state[None, :],
dtype=tf.float32))
probs = tf.nn.softmax(logits)
action = np.random.choice(self.action_size, p=probs.numpy()[0])
new_state, reward, done, _ = self.env.step(action)
if done:
reward = -1
ep_reward += reward
mem.store(current_state, action, reward)
if time_count == args.update_freq or done:
# Calculate gradient wrt to local model. We do so by tracking the
# variables involved in computing the loss by using tf.GradientTape
with tf.GradientTape() as tape:
total_loss = self.compute_loss(done,
new_state,
mem,
args.gamma)
self.ep_loss += total_loss
# Calculate local gradients
grads = tape.gradient(total_loss, self.local_model.trainable_weights)
# Push local gradients to global model
self.opt.apply_gradients(zip(grads,
self.global_model.trainable_weights))
# Update local model with new weights
self.local_model.set_weights(self.global_model.get_weights())
mem.clear()
time_count = 0
if done: # done and print information
Worker.global_moving_average_reward = \
record(Worker.global_episode, ep_reward, self.worker_idx,
Worker.global_moving_average_reward, self.result_queue,
self.ep_loss, ep_steps)
# We must use a lock to save our model and to print to prevent data races.
if ep_reward > Worker.best_score:
with Worker.save_lock:
print("Saving best model to {}, "
"episode score: {}".format(self.save_dir, ep_reward))
self.global_model.save_weights(
os.path.join(self.save_dir,
'model_{}.h5'.format(self.game_name))
)
Worker.best_score = ep_reward
Worker.global_episode += 1
ep_steps += 1
time_count += 1
current_state = new_state
total_step += 1
self.result_queue.put(None)
def compute_loss(self,
                 done,
                 new_state,
                 memory,
                 gamma=0.99):
    """Compute the actor-critic loss over the steps stored in `memory`.

    Arguments:
      done: Whether the episode ended at the last stored step. If so, the
        return is bootstrapped from 0; otherwise from the local model's
        value estimate for `new_state`.
      new_state: State reached after the last stored step.
      memory: Memory holding the states, actions and rewards of the rollout.
        It must contain at least one step.
      gamma: Discount factor applied to future rewards.

    Returns:
      A scalar tensor: the mean over the N stored steps of
      0.5 * value_loss + policy_loss.

    All per-step quantities are kept at shape (N,). The value head outputs
    (N, 1) while the cross-entropy is (N,); mixing the two shapes, as the
    previous version did, broadcasts the products and sums to (N, N).
    """
    if done:
        reward_sum = 0.  # terminal
    else:
        # [-1] selects the value output; [0, 0] extracts it as a scalar so
        # the discounted returns below stay one-dimensional.
        reward_sum = self.local_model(
            tf.convert_to_tensor(new_state[None, :],
                                 dtype=tf.float32))[-1].numpy()[0, 0]

    # Get discounted rewards
    discounted_rewards = []
    for reward in memory.rewards[::-1]:  # reverse buffer r
        reward_sum = reward + gamma * reward_sum
        discounted_rewards.append(reward_sum)
    discounted_rewards.reverse()

    logits, values = self.local_model(
        tf.convert_to_tensor(np.vstack(memory.states),
                             dtype=tf.float32))
    values = values[:, 0]  # (N, 1) -> (N,)

    # Get our advantages
    returns = tf.convert_to_tensor(
        np.array(discounted_rewards, dtype=np.float32), dtype=tf.float32)
    advantage = returns - values

    # Value loss
    value_loss = advantage ** 2

    # Calculate our policy loss
    actions_one_hot = tf.one_hot(memory.actions, self.action_size,
                                 dtype=tf.float32)
    policy = tf.nn.softmax(logits)
    # Entropy is -sum(p * log p). The previous version omitted the minus
    # sign, so subtracting the term below rewarded low-entropy policies
    # instead of encouraging exploration.
    entropy = -tf.reduce_sum(policy * tf.log(policy + 1e-20), axis=1)

    policy_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=actions_one_hot, logits=logits)
    # The advantage only weights the policy gradient; stop_gradient keeps
    # the policy term from training the value head.
    policy_loss *= tf.stop_gradient(advantage)
    policy_loss -= 0.01 * entropy

    total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
    return total_loss
就是这样!工作智能体将不断重复以下过程:把本地网络的参数重置为全局网络的参数,与环境交互,计算损失,然后将梯度应用于全局网络。通过运行以下命令训练您的算法:python a3c_cartpole.py --train
。 def play(self):
env = gym.make(self.game_name).unwrapped
state = env.reset()
model = self.global_model
model_path = os.path.join(self.save_dir, 'model_{}.h5'.format(self.game_name))
print('Loading model from: {}'.format(model_path))
model.load_weights(model_path)
done = False
step_counter = 0
reward_sum = 0
try:
while not done:
env.render(mode='rgb_array')
policy, value = model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
policy = tf.nn.softmax(policy)
action = np.argmax(policy)
state, reward, done, _ = env.step(action)
reward_sum += reward
print("{}. Reward: {}, action: {}".format(step_counter, reward_sum, action))
step_counter += 1
except KeyboardInterrupt:
print("Received Keyboard Interrupt. Shutting down.")
finally:
env.close()
训练完模型后,您可以使用以下命令运行它:python a3c_cartpole.py。
2018 年 7 月 31 日 — 作者:Raymond Yuan,软件工程实习生
在本教程中,我们将学习如何训练一个能够使用深度强化学习赢得简单游戏 CartPole 的模型。我们将使用 tf.keras 和 OpenAI 的 gym,使用一种称为异步优势行动者评论家 (A3C) 的技术训练智能体。强化学习一直受到极大的关注,但它究竟是什么……