Reinforcement Learning 06: PyTorch PPO2 in Practice (ALE/Breakout-v5)
1. Adjusting the environment
- Episode statistics collection: `RecordEpisodeStatistics`
- Skip the first n frames of each episode: `baseSkipFrame`
- Record the loss of a single life as `done`: `EpisodicLifeEnv`
- Clip the score to 0 or 1: `ClipRewardEnv`
- Frame stacking: `FrameStack`
  - These are the basic preprocessing steps for image-based environments; they make it easier for the CNN to capture the agent's motion (a minimal sketch of how such wrappers can be chained is given after the class below).
- Fix the reset behaviour of the vector environment `gym.vector.SyncVectorEnv`: in the original code each reset is random. The subclass `spSyncVectorEnv` below overrides `step_wait` so that every sub-environment can be reset with the same seed, which helps when training under a fixed seed:
from copy import deepcopy
from typing import Any, Callable, Iterable, Tuple

import gym
import numpy as np
from gym import Env, Space
from gym.vector.utils import concatenate
from numpy.typing import NDArray


class spSyncVectorEnv(gym.vector.SyncVectorEnv):
    """SyncVectorEnv whose step_wait resets a terminated sub-environment with a fixed seed.

    When a sub-environment reports terminated, it is reset either with a random
    seed (random_reset=True) or with the seed passed at construction, so that all
    sub-environments keep running under the same seed.
    """
    def __init__(
        self,
        env_fns: Iterable[Callable[[], Env]],
        observation_space: Space = None,
        action_space: Space = None,
        copy: bool = True,
        random_reset: bool = False,
        seed: int = None
    ):
        super().__init__(env_fns, observation_space, action_space, copy)
        self.random_reset = random_reset
        self.seed = seed

    def step_wait(self) -> Tuple[Any, NDArray[Any], NDArray[Any], NDArray[Any], dict]:
        """Steps through each of the environments returning the batched results.

        Returns:
            The batched environment step results
        """
        observations, infos = [], {}
        for i, (env, action) in enumerate(zip(self.envs, self._actions)):
            (
                observation,
                self._rewards[i],
                self._terminateds[i],
                self._truncateds[i],
                info,
            ) = env.step(action)
            if self._terminateds[i]:
                old_observation, old_info = observation, info
                # Reset the finished sub-environment: with a fresh random seed,
                # with the fixed seed given at construction, or without a seed.
                if self.random_reset:
                    observation, info = env.reset(seed=np.random.randint(0, 999999))
                else:
                    observation, info = env.reset() if self.seed is None else env.reset(seed=self.seed)
                info["final_observation"] = old_observation
                info["final_info"] = old_info
            observations.append(observation)
            infos = self._add_info(infos, info, i)
        self.observations = concatenate(
            self.single_observation_space, observations, self.observations
        )
        return (
            deepcopy(self.observations) if self.copy else self.observations,
            np.copy(self._rewards),
            np.copy(self._terminateds),
            np.copy(self._truncateds),
            infos,
        )
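The wrappers listed above are composed into a single environment factory (the author's `make_atari_env`, used in section 2.1, which is not shown in this excerpt). As a rough, hedged sketch of what such a factory can look like using only gym >= 0.26 built-ins: the names `make_env`/`thunk` and the inline `ClipRewardEnv` are illustrative, `terminal_on_life_loss` covers the `EpisodicLifeEnv` role, and `noop_max` plays a role roughly similar to the author's `baseSkipFrame`.

# Minimal sketch of a make_atari_env-style factory (NOT the author's implementation).
import gym
import numpy as np


class ClipRewardEnv(gym.RewardWrapper):
    """Clip the raw score to its sign, i.e. 0 or 1 for Breakout."""
    def reward(self, reward):
        return float(np.sign(reward))


def make_env(env_name: str = 'ALE/Breakout-v5', skip: int = 4, stack: int = 4):
    def thunk():
        env = gym.make(env_name, frameskip=1)              # let the wrapper do the frame skipping
        env = gym.wrappers.RecordEpisodeStatistics(env)    # episode return / length show up in `info`
        env = gym.wrappers.AtariPreprocessing(
            env,
            noop_max=30,                  # random no-ops at reset (roughly baseSkipFrame's job)
            frame_skip=skip,
            screen_size=84,
            terminal_on_life_loss=True,   # one life lost -> done (EpisodicLifeEnv's job)
            grayscale_obs=True,
        )
        env = ClipRewardEnv(env)                           # score -> 0 / 1
        env = gym.wrappers.FrameStack(env, stack)          # stacked frames let the CNN see motion
        return env
    return thunk

A list of such thunks is then handed to `spSyncVectorEnv`, exactly as the author does with `make_atari_env` in section 2.1.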
2. PyTorch practice
2.1 Building and training the agent
After fixing the vector environment's reset:

- the actor and the critic share a single CNN feature extractor (`PPOSharedCNN`); a rough sketch of such a shared backbone follows the training script below;
- the clip range `eps` is reduced to `eps=0.165`, hoping to keep each policy update within a smaller range (see the loss sketch after this list);
- learning-rate annealing is switched off;
- different values of `ent_coef` are tried: a slightly larger value increases the agent's exploration (see the loss sketch after this list):
  - ent_coef=0.015 & batch_size=256+128: reward drops sharply and recovers slowly
  - ent_coef=0.025 & batch_size=256: drops sharply then recovers, final reward = 311
  - √ ent_coef=0.05 & batch_size=256: final reward = 416 (run PPO2__AtariEnv instance__20241029__2217)
  - ent_coef=0.05 & batch_size=256+128
  - ent_coef=0.1 & batch_size=256: the improvement is too flat
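For reference, here is a minimal sketch of the clipped objective that `eps` and `ent_coef` act on. It is the standard PPO-clip loss, not the author's exact `PPO2` code, which additionally applies value clipping (`clip_vloss`), mini-batch advantage normalisation (`mini_adv_norm`) and gradient clipping (`max_grad_norm`):

# Standard PPO-clip loss with entropy bonus (illustrative, not the author's PPO2 class).
import torch


def ppo_loss(log_prob_new, log_prob_old, advantage, entropy,
             value_pred, value_target,
             eps: float = 0.165, ent_coef: float = 0.05, critic_coef: float = 1.0):
    ratio = (log_prob_new - log_prob_old).exp()
    # eps bounds how far the new policy is allowed to move away from the old one
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantage
    policy_loss = -torch.min(surr1, surr2).mean()
    value_loss = 0.5 * (value_pred - value_target).pow(2).mean()
    # a larger ent_coef rewards higher policy entropy, i.e. more exploration
    return policy_loss + critic_coef * value_loss - ent_coef * entropy.mean()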
# The helpers used below (gym_env_desc, make_atari_env, Config, PPO2,
# ppo2_train, ppo2_play, ply_env) come from the author's accompanying
# utility code and are not shown in this excerpt.
import os
import gym

env_name = 'ALE/Breakout-v5'
env_name_str = env_name.replace('/', '-')
gym_env_desc(env_name)
print("gym.__version__ = ", gym.__version__)
path_ = os.path.dirname(__file__)
num_envs = 12
episod_life = True
clip_reward = True
resize_inner_area = True  # True
env_pool_flag = False  # True
seed = 202404
envs = spSyncVectorEnv(
    [make_atari_env(env_name, skip=4, episod_life=episod_life, clip_reward=clip_reward, ppo_train=True,
                    max_no_reward_count=120, resize_inner_area=resize_inner_area) for _ in range(num_envs)],
    random_reset=False,
    seed=seed
)
dist_type = 'norm'
cfg = Config(
    envs,
    save_path=os.path.join(path_, "test_models", f'PPO2_{env_name_str}-2'),
    seed=seed,
    num_envs=num_envs,
    episod_life=episod_life,
    clip_reward=clip_reward,
    resize_inner_area=resize_inner_area,
    env_pool_flag=env_pool_flag,
    # network parameters: Atari-CNN + MLP
    actor_hidden_layers_dim=[512, 256],
    critic_hidden_layers_dim=[512, 128],
    # agent parameters
    actor_lr=4.5e-4,
    gamma=0.99,
    # training parameters
    num_episode=3600,
    off_buffer_size=128,
    max_episode_steps=128,
    PPO_kwargs={
        'cnn_flag': True,
        'clean_rl_cnn': True,
        'share_cnn_flag': True,
        'continue_action_flag': False,
        'lmbda': 0.95,
        'eps': 0.165,  # 0.165
        'k_epochs': 4,  # update_epochs
        'sgd_batch_size': 512,
        'minibatch_size': 256,
        'act_type': 'relu',
        'dist_type': dist_type,
        'critic_coef': 1.0,  # 1.0
        'ent_coef': 0.05,
        'max_grad_norm': 0.5,
        'clip_vloss': True,
        'mini_adv_norm': True,
        'anneal_lr': False,
        'num_episode': 3600,
    }
)
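# Rollout arithmetic (assuming the rollout length equals off_buffer_size):
# num_envs * off_buffer_size = 12 * 128 = 1536 transitions per collection phase;
# with minibatch_size=256 that is 6 minibatches per epoch, and k_epochs=4
# gives 24 gradient updates per rollout.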
minibatch_size = cfg.PPO_kwargs['minibatch_size']
max_grad_norm = cfg.PPO_kwargs['max_grad_norm']
cfg.trail_desc = f"actor_lr={cfg.actor_lr},minibatch_size={minibatch_size},max_grad_norm={max_grad_norm},hidden_layers={cfg.actor_hidden_layers_dim}"
agent = PPO2(
    state_dim=cfg.state_dim,
    actor_hidden_layers_dim=cfg.actor_hidden_layers_dim,
    critic_hidden_layers_dim=cfg.critic_hidden_layers_dim,
    action_dim=cfg.action_dim,
    actor_lr=cfg.actor_lr,
    critic_lr=cfg.critic_lr,
    gamma=cfg.gamma,
    PPO_kwargs=cfg.PPO_kwargs,
    device=cfg.device,
    reward_func=None
)
agent.train()
ppo2_train(
    envs, agent, cfg,
    wandb_flag=True, wandb_project_name=f"PPO2-{env_name_str}-NEW",
    train_without_seed=False, test_ep_freq=cfg.off_buffer_size * 10,
    online_collect_nums=cfg.off_buffer_size,
    test_episode_count=10,
    add_max_step_reward_flag=False,
    play_func='ppo2_play',
    ply_env=ply_env
)
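Since `share_cnn_flag=True`, the actor head and the critic head sit on top of one shared CNN. Below is a rough sketch of such a shared backbone for 4x84x84 stacked frames (a clean-RL-style Atari CNN; the class name `SharedCNNActorCritic` is illustrative and the author's `PPOSharedCNN` may differ in detail):

# Rough sketch of a shared-CNN actor-critic for 4x84x84 Atari inputs
# (clean-RL-style backbone; not the author's PPOSharedCNN).
import torch
import torch.nn as nn


class SharedCNNActorCritic(nn.Module):
    def __init__(self, action_dim: int):
        super().__init__()
        self.backbone = nn.Sequential(                 # shared feature extractor
            nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
        )
        self.actor_head = nn.Linear(512, action_dim)   # policy logits
        self.critic_head = nn.Linear(512, 1)           # state value

    def forward(self, x: torch.Tensor):
        feat = self.backbone(x / 255.0)                # scale pixels to [0, 1]
        return self.actor_head(feat), self.critic_head(feat)

A categorical distribution over the actor logits gives the discrete Breakout actions, while the critic head produces the values used for GAE (lmbda=0.95).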
2.2 Observing the trained agent
Finally, take the best network obtained during training and watch it play:
env = make_atari_env(env_name, skip=4, episod_life=episod_life, clip_reward=clip_reward, ppo_train=True,
                     max_no_reward_count=120, resize_inner_area=resize_inner_area, render_mode='human')()
ppo2_play(env, agent, cfg, episode_count=2, play_without_seed=False, render=True, ppo_train=True)
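The call above reuses the agent that is still in memory. If the weights need to be restored from the checkpoints saved under `cfg.save_path`, something along these lines should work; note that the attribute name `agent.actor` and the checkpoint file name are assumptions for illustration, since the author's PPO2 save/load interface is not shown in this excerpt.

# Hypothetical reload of a saved checkpoint -- `agent.actor` and the file name
# 'PPO2_best.ckpt' are assumptions, not the author's documented API.
import os
import torch

state_dict = torch.load(os.path.join(cfg.save_path, 'PPO2_best.ckpt'), map_location=cfg.device)
agent.actor.load_state_dict(state_dict)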
Original article: https://blog.csdn.net/Scc_hy/article/details/143608500