六一送你本ai秘籍,轻松通关童年游戏超级马里奥(文末有福利)-4008云顶国际网站
在蘑菇王国,流传着这样一个故事:
某天,操纵着强力魔法的大乌龟酷霸王一族侵略了蘑菇们居住的和平王国。蘑菇一族都被酷霸王变成了岩石、砖块等形状,蘑菇王国即将灭亡。
只有蘑菇王国的桃花公主,才能解开魔法,让蘑菇们苏醒。
然而,她却被大魔王酷霸王所捉住。
为了打倒乌龟一族、救出桃花公主、给蘑菇王国带回和平,水管工马里奥决定站出来,向酷霸王发起挑战。
是的,这就是童年游戏《超级马里奥》的故事。
你是不是仍旧对马里奥这个游戏记忆犹新,是不是仍旧对过关焦头烂额。
六一儿童节,快来训练一款自己的游戏ai,用代码让马里奥从大反派酷霸王的魔掌里救回桃花公主。
基于华为云一站式AI开发平台ModelArts,利用强化学习中的PPO算法来玩超级马里奥,对于绝大部分关卡,训练出来的AI智能体都可以在1500个episode内学会过关。
modelarts 是面向开发者的一站式 ai 平台,支持海量数据预处理及交互式智能标注、大规模分布式训练、自动化模型生成,及端-边-云模型按需部署能力,可以让ai应用开发到商用部署缩短为分钟级别。
就算不懂代码,也可以按照教程案例,通过简单的调参,一步步实现游戏ai开发,成为超级马里奥闯关王者。
话不多说,先来看看实际的效果:
超级马里奥游戏AI的整体开发流程为:创建马里奥环境->构建PPO算法->训练->推理->可视化效果,目前可以在AI Gallery上免费体验。
ai gallery是在modelarts的基础上构建的开发者生态社区, 支持算法、模型、数据集、notebook案例和技术文章的共享。
下面,童年回忆杀走起。
因为这个游戏ai是基于ppo算法来训练的,所以先简单科普一下强化学习算法。ppo算法有两种主要形式:ppo-penalty和ppo-clip(ppo2)。在这里,我们讨论ppo-clip(openai使用的主要形式)。 ppo的主要特点如下:
-
ppo属于on-policy算法
-
ppo同时适用于离散和连续的动作空间
-
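损失函数 PPO-Clip算法最精髓的地方就是加入了一项比例 $r_t(\theta)=\dfrac{\pi_\theta(a_t\mid s_t)}{\pi_{\theta_{old}}(a_t\mid s_t)}$ 用以描绘新老策略的差异,通过超参数 $\epsilon$ 限制策略的更新步长: $$L^{CLIP}(\theta)=\mathbb{E}_t\left[\min\left(r_t(\theta)\hat{A}_t,\ \mathrm{clip}\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon\right)\hat{A}_t\right)\right]$$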
-
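更新策略: $$\theta_{k+1}=\arg\max_{\theta}\ \mathbb{E}_{s,a\sim\pi_{\theta_k}}\left[L^{CLIP}(\theta)\right]$$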
-
探索策略 ppo采用随机探索策略。
-
优势函数 $A(s,a)$ 表示在状态s下采取动作a,相较于其他动作有多少优势:如果 $A(s,a)>0$,则当前动作比平均动作好,反之,则差。
算法主要流程大致如下:
看起来有点复杂,不用担心,即便你不懂这些算法,有了华为云modelarts,可以跟着下面的步骤轻松实现超级马里奥游戏ai的强化学习。
本案例运行环境为pytorch-1.0.0,且需使用gpu运行,开始之前一定要选择对应的硬件规格。如下图所示,在modelarts jupyter中,只要点击代码前面的箭头,就能自动运行。
1. 程序初始化
第1步:安装基础依赖
!pip install -u pip
!pip install gym==0.19.0
!pip install tqdm==4.48.0
!pip install nes-py==8.1.0
!pip install gym-super-mario-bros==7.3.2
第2步:导入相关的库
import os
import shutil
import subprocess as sp
from collections import deque

import cv2
import gym_super_mario_bros
import matplotlib.pyplot as plt
import moxing as mox
import numpy as np
import torch
import torch.multiprocessing as _mp
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as f
from gym import Wrapper
from gym.spaces import Box
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT, RIGHT_ONLY, SIMPLE_MOVEMENT
from IPython import display
from nes_py.wrappers import JoypadSpace
from torch.distributions import Categorical
2. 训练参数初始化
此处划重点,该部分参数可以自己调整,以训练出更好的效果。
# Training/inference configuration shared by every step of this notebook.
# Tune these values to trade training time against result quality.
opt = {
    "world": 1,  # world to play: 1-8
    "stage": 1,  # stage within the world: 1-4
    "action_type": "simple",  # action set: "simple", "right_only" or "complex"
    'lr': 1e-4,  # learning rate; suggested values: 1e-3, 1e-4, 1e-5, 7e-5
    'gamma': 0.9,  # reward discount factor
    'tau': 1.0,  # GAE parameter
    'beta': 0.01,  # entropy coefficient
    'epsilon': 0.2,  # PPO clip coefficient
    'batch_size': 16,  # number of mini-batches each rollout is split into per epoch
    'max_episode': 10,  # maximum number of training episodes
    'num_epochs': 10,  # how many times each rollout is replayed
    "num_local_steps": 512,  # steps collected per environment in every episode
    "num_processes": 8,  # number of environment processes, usually the machine's core count
    "save_interval": 5,  # save a checkpoint every this many episodes
    "log_path": "./log",  # where logs are written
    "saved_path": "./model",  # where trained checkpoints are written
    # Whether to start from the pretrained weights. Only a 1-1 model is
    # provided; every other level has to be trained from scratch.
    "pretrain_model": True,
    "episode": 5,  # checkpoint episode number that inference loads
}
如果你想选择其他关卡,记得调整参数world和stage,这里默认的是第一关(1-1)。
3. 创建环境
结束标志:
- 胜利:mario到达本关终点
- 失败:mario受到敌人的伤害、坠入悬崖或者时间用完
奖励函数:
- 得分:收集金币、踩扁敌人、结束时夺旗
- 扣分:受到敌人伤害、掉落悬崖、结束时未夺旗
# Build one wrapped Super Mario Bros environment.
def create_train_env(world, stage, actions, output_path=None):
    """Create the training environment for a single level.

    Args:
        world: World number inserted into the environment id.
        stage: Stage number inserted into the environment id.
        actions: Button-combination list handed to ``JoypadSpace``.
        output_path: Accepted for call compatibility; not used by this function.

    Returns:
        The base environment wrapped, in order, by ``JoypadSpace``,
        ``customreward`` and ``customskipframe``.
    """
    # Environment ids are case-sensitive, so the id must be spelled
    # "SuperMarioBros-<world>-<stage>-v0" exactly.
    env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage))
    env = JoypadSpace(env, actions)
    # Project-specific wrappers: reward shaping first, then frame skipping/stacking.
    env = customreward(env, world, stage, monitor=None)
    env = customskipframe(env)
    return env
# Wrapper that reshapes the raw environment's reward and observation to make training easier.
class customreward(Wrapper):
    """Reward-shaping wrapper that also converts frames via ``process_frame``.

    Args:
        env: Environment to wrap.
        world: World number; selects the level-specific rules in ``step``.
        stage: Stage number; selects the level-specific rules in ``step``.
        monitor: Optional object with a ``record(frame)`` method that receives
            every raw frame. Any falsy value disables recording.
    """

    def __init__(self, env=None, world=None, stage=None, monitor=None):
        super(customreward, self).__init__(env)
        self.observation_space = Box(low=0, high=255, shape=(1, 84, 84))
        self.curr_score = 0  # last score seen in `info`
        self.current_x = 40  # last x position seen in `info`
        self.world = world
        self.stage = stage
        self.monitor = monitor if monitor else None

    def step(self, action):
        """Advance one frame and return ``(state, shaped_reward, done, info)``.

        The environment reward is increased by the score gained this frame
        divided by 40. When the episode ends, 50 is added if the flag was
        reached and 50 is subtracted otherwise. The result is divided by 10.
        """
        state, reward, done, info = self.env.step(action)
        if self.monitor:
            self.monitor.record(state)
        state = process_frame(state)
        # Add the score delta on top of the environment's own reward.
        reward += (info["score"] - self.curr_score) / 40.
        self.curr_score = info["score"]
        if done:
            if info["flag_get"]:
                reward += 50
            else:
                reward -= 50
        # Level-specific rules: in 7-4 and 4-4 certain position ranges end the
        # episode with a penalty (presumably the wrong routes through these
        # levels — TODO confirm against the level layout).
        if self.world == 7 and self.stage == 4:
            if (506 <= info["x_pos"] <= 832 and info["y_pos"] > 127) or (
                    832 < info["x_pos"] <= 1064 and info["y_pos"] < 80) or (
                    1113 < info["x_pos"] <= 1464 and info["y_pos"] < 191) or (
                    1579 < info["x_pos"] <= 1943 and info["y_pos"] < 191) or (
                    1946 < info["x_pos"] <= 1964 and info["y_pos"] >= 191) or (
                    1984 < info["x_pos"] <= 2060 and (info["y_pos"] >= 191 or info["y_pos"] < 127)) or (
                    2114 < info["x_pos"] < 2440 and info["y_pos"] < 191) or info["x_pos"] < self.current_x - 500:
                reward -= 50
                done = True
        if self.world == 4 and self.stage == 4:
            if (info["x_pos"] <= 1500 and info["y_pos"] < 127) or (
                    1588 <= info["x_pos"] < 2380 and info["y_pos"] >= 127):
                # Here the reward is replaced outright, not decremented.
                reward = -50
                done = True
        self.current_x = info["x_pos"]
        return state, reward / 10., done, info

    def reset(self):
        """Reset the tracked score/position and return the processed first frame."""
        self.curr_score = 0
        self.current_x = 40
        return process_frame(self.env.reset())
class multipleenvironments:
    """Run several training environments, each served by its own process.

    For every environment a pipe pair is created: the trainer talks through
    ``agent_conns[i]`` and the worker process answers through ``env_conns[i]``.

    Args:
        world: World number forwarded to ``create_train_env``.
        stage: Stage number forwarded to ``create_train_env``.
        action_type: ``"right_only"`` or ``"simple"``; any other value selects
            the complex action set.
        num_envs: Number of environments (and worker processes) to start.
        output_path: Forwarded to ``create_train_env``.

    Attributes set here: ``agent_conns``, ``env_conns``, ``envs``,
    ``num_states`` (first dimension of the observation space) and
    ``num_actions`` (length of the chosen action list).
    """

    def __init__(self, world, stage, action_type, num_envs, output_path=None):
        self.agent_conns, self.env_conns = zip(*[mp.Pipe() for _ in range(num_envs)])
        if action_type == "right_only":
            actions = RIGHT_ONLY
        elif action_type == "simple":
            actions = SIMPLE_MOVEMENT
        else:
            actions = COMPLEX_MOVEMENT
        self.envs = [create_train_env(world, stage, actions, output_path=output_path) for _ in range(num_envs)]
        self.num_states = self.envs[0].observation_space.shape[0]
        self.num_actions = len(actions)
        for index in range(num_envs):
            process = mp.Process(target=self.run, args=(index,))
            process.start()
            # The parent only uses the agent end; drop its copy of the env end.
            self.env_conns[index].close()

    def run(self, index):
        """Worker loop: serve ``("step", action)`` and ``("reset", _)`` requests.

        Replies with the environment's ``step``/``reset`` result on the same
        connection. The loop has no exit request.

        Raises:
            NotImplementedError: If a request other than "step" or "reset" arrives.
        """
        # The worker only uses the env end; drop its copy of the agent end.
        self.agent_conns[index].close()
        conn = self.env_conns[index]
        env = self.envs[index]
        while True:
            request, action = conn.recv()
            if request == "step":
                conn.send(env.step(action.item()))
            elif request == "reset":
                conn.send(env.reset())
            else:
                raise NotImplementedError(request)
def process_frame(frame):
    """Convert an RGB frame to a (1, 84, 84) grayscale array scaled by 1/255.

    Args:
        frame: RGB image array, or ``None``.

    Returns:
        The grayscale, 84x84-resized frame with a leading axis of size 1 and
        values divided by 255; an all-zero (1, 84, 84) array when ``frame``
        is ``None``.
    """
    if frame is None:
        return np.zeros((1, 84, 84))
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # [None, :, :] adds the leading channel axis.
    return cv2.resize(gray, (84, 84))[None, :, :] / 255.
class customskipframe(Wrapper):
    """Repeat each action for ``skip`` frames and keep a stack of ``skip`` frames.

    Args:
        env: Environment to wrap; its observations must have shape (1, 84, 84).
        skip: Number of frames each action is repeated for, and the depth of
            the frame stack.
    """

    def __init__(self, env, skip=4):
        super(customskipframe, self).__init__(env)
        self.observation_space = Box(low=0, high=255, shape=(skip, 84, 84))
        self.skip = skip
        self.states = np.zeros((skip, 84, 84), dtype=np.float32)

    def step(self, action):
        """Apply ``action`` for ``skip`` frames.

        Returns:
            ``(states, total_reward, done, info)`` where ``states`` is the
            float32 frame stack with a leading batch axis and ``total_reward``
            is the sum of the rewards of the frames that were played.
        """
        total_reward = 0
        last_states = []
        for i in range(self.skip):
            state, reward, done, info = self.env.step(action)
            # Accumulate over the repeated frames instead of keeping only the last one.
            total_reward += reward
            if i >= self.skip / 2:
                last_states.append(state)
            if done:
                # The episode ended mid-skip: reset and hand back the fresh stack.
                self.reset()
                return self.states[None, :, :, :].astype(np.float32), total_reward, done, info
        # Element-wise max over the frames from the second half of the window.
        max_state = np.max(np.concatenate(last_states, 0), 0)
        # Shift the stack by one and append the newest frame.
        self.states[:-1] = self.states[1:]
        self.states[-1] = max_state
        return self.states[None, :, :, :].astype(np.float32), total_reward, done, info

    def reset(self):
        """Reset the environment and fill the stack with copies of the first frame."""
        state = self.env.reset()
        self.states = np.concatenate([state for _ in range(self.skip)], 0)
        return self.states[None, :, :, :].astype(np.float32)
4. 定义神经网络
神经网络结构包含四层卷积网络和一层全连接网络,提取的特征输入critic层和actor层,分别输出value值和动作概率分布。
class net(nn.Module):
    """Actor-critic network: four conv layers, one shared linear layer, two heads.

    Args:
        num_inputs: Number of input channels.
        num_actions: Size of the actor head's output.

    The shared linear layer expects 32 * 6 * 6 features, which is what the
    four stride-2 convolutions produce for 84x84 inputs.
    """

    def __init__(self, num_inputs, num_actions):
        super(net, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.linear = nn.Linear(32 * 6 * 6, 512)
        self.critic_linear = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, num_actions)
        self._initialize_weights()

    def _initialize_weights(self):
        """Orthogonal weights (ReLU gain) and zero biases for every conv/linear layer."""
        relu_gain = nn.init.calculate_gain('relu')
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                nn.init.orthogonal_(module.weight, relu_gain)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return ``(action_logits, state_value)`` for a batch of frame stacks."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = f.relu(conv(x))
        # Flatten everything except the batch axis before the shared linear layer.
        x = self.linear(x.view(x.size(0), -1))
        return self.actor_linear(x), self.critic_linear(x)
5. 定义ppo算法
def evaluation(opt, global_model, num_states, num_actions, curr_episode):
    """Play one greedy episode with a copy of ``global_model`` and render it.

    If the flag is reached, the weights are saved under ``opt['saved_path']``
    and ``opt['episode']`` is set to ``curr_episode``.

    Args:
        opt: Configuration dict; reads 'action_type', 'world', 'stage' and
            'saved_path', and may update 'episode'.
        global_model: Model whose weights are copied for the evaluation run.
        num_states: Input channel count for the local model.
        num_actions: Action count for the local model.
        curr_episode: Episode number used in the message and checkpoint name.

    Returns:
        True if the flag was reached, False if the episode ended without it.
    """
    print('start evalution !')
    torch.manual_seed(123)
    # "right_only" is the value the configuration documents and the training
    # environments check for; "right" is still accepted for older configs.
    if opt['action_type'] in ("right", "right_only"):
        actions = RIGHT_ONLY
    elif opt['action_type'] == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    use_cuda = torch.cuda.is_available()
    env = create_train_env(opt['world'], opt['stage'], actions)
    try:
        local_model = net(num_states, num_actions)
        if use_cuda:
            local_model.cuda()
        local_model.eval()
        # Copy the current training weights into the evaluation model.
        local_model.load_state_dict(global_model.state_dict())
        state = torch.from_numpy(env.reset())
        plt.figure(figsize=(10, 10))
        img = plt.imshow(env.render(mode='rgb_array'))
        done = False
        while not done:
            if use_cuda:
                state = state.cuda()
            with torch.no_grad():  # evaluation only; no gradients needed
                logits, value = local_model(state)
            policy = f.softmax(logits, dim=1)
            action = torch.argmax(policy).item()
            state, reward, done, info = env.step(action)
            state = torch.from_numpy(state)
            img.set_data(env.render(mode='rgb_array'))  # just update the data
            display.display(plt.gcf())
            display.clear_output(wait=True)
            if info["flag_get"]:
                print("flag getted in episode:{}!".format(curr_episode))
                torch.save(local_model.state_dict(),
                           "{}/ppo_super_mario_bros_{}_{}_{}".format(opt['saved_path'], opt['world'], opt['stage'],
                                                                     curr_episode))
                opt.update({'episode': curr_episode})
                return True
        return False
    finally:
        # Close the environment on success, failure and errors alike.
        env.close()
def train(opt):
    """Train the actor-critic model with PPO (clipped objective) on parallel environments.

    Each episode collects ``opt['num_local_steps']`` steps from every
    environment, computes GAE-based returns and advantages, then runs
    ``opt['num_epochs']`` passes over the rollout, each split into
    ``opt['batch_size']`` mini-batches. A checkpoint is written every
    ``opt['save_interval']`` episodes. When an environment reports the flag
    on the last rollout step, ``evaluation`` is run and training stops if it
    succeeds.

    Args:
        opt: Configuration dict. Reads 'world', 'stage', 'action_type',
            'num_processes', 'log_path', 'saved_path', 'pretrain_model', 'lr',
            'max_episode', 'save_interval', 'num_local_steps', 'gamma', 'tau',
            'num_epochs', 'batch_size', 'epsilon' and 'beta'.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Start every run with an empty log directory.
    if os.path.isdir(opt['log_path']):
        shutil.rmtree(opt['log_path'])
    os.makedirs(opt['log_path'])
    if not os.path.isdir(opt['saved_path']):
        os.makedirs(opt['saved_path'])
    # Create the environments
    envs = multipleenvironments(opt['world'], opt['stage'], opt['action_type'], opt['num_processes'])
    # Create the model
    model = net(envs.num_states, envs.num_actions)
    if opt['pretrain_model']:
        print('加载预训练模型')
        if not os.path.exists("ppo_super_mario_bros_1_1_0"):
            mox.file.copy_parallel(
                "obs://modelarts-labs-bj4/course/modelarts/zjc_team/reinforcement_learning/ppo_mario/ppo_super_mario_bros_1_1_0",
                "ppo_super_mario_bros_1_1_0")
        if use_cuda:
            model.load_state_dict(torch.load("ppo_super_mario_bros_1_1_0"))
            model.cuda()
        else:
            model.load_state_dict(torch.load("ppo_super_mario_bros_1_1_0", map_location=torch.device('cpu')))
    elif use_cuda:
        # Only move to the GPU when one exists; an unconditional .cuda() fails on CPU-only machines.
        model.cuda()
    model.share_memory()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt['lr'])
    # Reset every environment and collect the initial states
    for agent_conn in envs.agent_conns:
        agent_conn.send(("reset", None))
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if use_cuda:
        curr_states = curr_states.cuda()
    total_samples = opt['num_local_steps'] * opt['num_processes']
    # Float on purpose: slice bounds are truncated with int() below.
    samples_per_batch = total_samples / opt['batch_size']
    curr_episode = 0
    # Train for at most max_episode episodes
    while curr_episode < opt['max_episode']:
        if curr_episode % opt['save_interval'] == 0 and curr_episode > 0:
            torch.save(model.state_dict(),
                       "{}/ppo_super_mario_bros_{}_{}_{}".format(opt['saved_path'], opt['world'], opt['stage'],
                                                                 curr_episode))
        curr_episode += 1
        old_log_policies = []
        actions = []
        step_values = []
        states = []
        rewards = []
        dones = []
        # Collect the rollout
        for _ in range(opt['num_local_steps']):
            states.append(curr_states)
            # Rollout statistics are only ever used detached, so skip building the graph.
            with torch.no_grad():
                logits, value = model(curr_states)
            # squeeze(-1) keeps the per-environment axis even with a single process.
            step_values.append(value.squeeze(-1))
            policy = f.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policies.append(old_m.log_prob(action))
            # Send one action to each environment (.cpu() is a no-op for CPU tensors)
            for agent_conn, act in zip(envs.agent_conns, action.cpu()):
                agent_conn.send(("step", act))
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if use_cuda:
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        with torch.no_grad():
            _, next_value = model(curr_states)
        next_value = next_value.squeeze(-1)
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(step_values).detach()
        states = torch.cat(states)
        # GAE, walking the rollout backwards. Iterate the per-step value
        # tensors, not the flattened `values`, so each step's values line up
        # with that step's rewards and done flags.
        gae = 0
        returns = []
        for value, reward, done in zip(reversed(step_values), reversed(rewards), reversed(dones)):
            gae = gae * opt['gamma'] * opt['tau']
            gae = gae + reward + opt['gamma'] * next_value * (1 - done) - value
            next_value = value
            returns.append(gae + value)
        returns = torch.cat(returns[::-1]).detach()
        advantages = returns - values
        # Policy update
        for _ in range(opt['num_epochs']):
            indice = torch.randperm(total_samples)
            for j in range(opt['batch_size']):
                batch_indices = indice[int(j * samples_per_batch): int((j + 1) * samples_per_batch)]
                logits, value = model(states[batch_indices])
                new_policy = f.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                batch_advantages = advantages[batch_indices]
                clipped_ratio = torch.clamp(ratio, 1.0 - opt['epsilon'], 1.0 + opt['epsilon'])
                actor_loss = -torch.mean(torch.min(ratio * batch_advantages, clipped_ratio * batch_advantages))
                # Prediction first, target second; the loss value is symmetric in its arguments.
                critic_loss = f.smooth_l1_loss(value.squeeze(-1), returns[batch_indices])
                entropy_loss = torch.mean(new_m.entropy())
                # The loss has three parts: actor loss, critic loss and the action-entropy bonus
                total_loss = actor_loss + critic_loss - opt['beta'] * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("episode: {}. total loss: {}".format(curr_episode, total_loss))
        # `info` holds the last rollout step's info dicts, one per environment.
        finish = False
        for env_info in info:
            if env_info["flag_get"]:
                finish = evaluation(opt, model, envs.num_states, envs.num_actions, curr_episode)
                if finish:
                    break
        if finish:
            break
6. 训练模型
训练10 episode,耗时约5分钟
# Run training with the settings in `opt`.
train(opt)
7. 使用模型推理游戏
定义推理函数
def infer(opt):
    """Load a saved checkpoint and play one greedy episode, rendering every step.

    The checkpoint path is built from ``opt['saved_path']``, ``opt['world']``,
    ``opt['stage']`` and ``opt['episode']``. Prints a completion message when
    the flag is reached, or a failure message when the episode ends without it.

    Args:
        opt: Configuration dict; reads 'action_type', 'world', 'stage',
            'saved_path' and 'episode'.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # "right_only" is the value the configuration documents and the training
    # environments check for; "right" is still accepted for older configs.
    if opt['action_type'] in ("right", "right_only"):
        actions = RIGHT_ONLY
    elif opt['action_type'] == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt['world'], opt['stage'], actions)
    try:
        model = net(env.observation_space.shape[0], len(actions))
        checkpoint = "{}/ppo_super_mario_bros_{}_{}_{}".format(opt['saved_path'], opt['world'], opt['stage'],
                                                               opt['episode'])
        if use_cuda:
            model.load_state_dict(torch.load(checkpoint))
            model.cuda()
        else:
            model.load_state_dict(torch.load(checkpoint, map_location=torch.device('cpu')))
        model.eval()
        state = torch.from_numpy(env.reset())
        plt.figure(figsize=(10, 10))
        img = plt.imshow(env.render(mode='rgb_array'))
        while True:
            if use_cuda:
                state = state.cuda()
            with torch.no_grad():  # inference only; no gradients needed
                logits, value = model(state)
            policy = f.softmax(logits, dim=1)
            action = torch.argmax(policy).item()
            state, reward, done, info = env.step(action)
            state = torch.from_numpy(state)
            img.set_data(env.render(mode='rgb_array'))  # just update the data
            display.display(plt.gcf())
            display.clear_output(wait=True)
            if info["flag_get"]:
                print("world {} stage {} completed".format(opt['world'], opt['stage']))
                break
            # Reaching this point means the flag was not obtained, so any
            # `done` is a failure; this exit must not depend on the exact type
            # of info["flag_get"], or a finished episode could keep stepping.
            if done:
                print('game failed')
                break
    finally:
        # Close the environment on every exit path.
        env.close()
# Play the level with the checkpoint selected by `opt['episode']`.
infer(opt)
world 1 stage 1 completed
六一儿童节,快来华为云ai gallery上体验ai闯关超级马里奥,无需考虑计算资源,环境的搭建,在modelarts里运行简单几行代码,手把手带你5分钟速成游戏王。
福利时间到,评论区贴出你训练的超级马里奥闯关截图,有机会赢取华为云社区送上的儿童节礼物,快来留下你的童年游戏回忆。
最后,也祝所有大朋友、小朋友们儿童节快乐,愿童心未泯,所遇皆甜!
- 点赞
- 收藏
- 关注作者
评论(0)