# Tianshou/test/modelbased/test_ppo_icm.py

import argparse
import os
import pprint
import gymnasium as gym
import numpy as np
import torch
from gymnasium.spaces import Box
from torch.utils.tensorboard import SummaryWriter
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import ICMPolicy, PPOPolicy
from tianshou.policy.base import BasePolicy
from tianshou.policy.modelfree.ppo import PPOTrainingStats
from tianshou.trainer import OnpolicyTrainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import MLP, ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic, IntrinsicCuriosityModule
from tianshou.utils.space_info import SpaceInfo
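

# Command-line arguments for the PPO + ICM CartPole test; the PPO and ICM
# hyperparameters below can all be overridden from the command line.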
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, default="CartPole-v1")
    parser.add_argument("--reward-threshold", type=float, default=None)
    parser.add_argument("--seed", type=int, default=1626)
    parser.add_argument("--buffer-size", type=int, default=20000)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--epoch", type=int, default=10)
    parser.add_argument("--step-per-epoch", type=int, default=50000)
    parser.add_argument("--step-per-collect", type=int, default=2000)
    parser.add_argument("--repeat-per-collect", type=int, default=10)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64])
    parser.add_argument("--training-num", type=int, default=20)
    parser.add_argument("--test-num", type=int, default=100)
    parser.add_argument("--logdir", type=str, default="log")
    parser.add_argument("--render", type=float, default=0.0)
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
    )
    # ppo special
    parser.add_argument("--vf-coef", type=float, default=0.5)
    parser.add_argument("--ent-coef", type=float, default=0.0)
    parser.add_argument("--eps-clip", type=float, default=0.2)
    parser.add_argument("--max-grad-norm", type=float, default=0.5)
    parser.add_argument("--gae-lambda", type=float, default=0.95)
    parser.add_argument("--rew-norm", type=int, default=0)
    parser.add_argument("--norm-adv", type=int, default=0)
    parser.add_argument("--recompute-adv", type=int, default=0)
    parser.add_argument("--dual-clip", type=float, default=None)
    parser.add_argument("--value-clip", type=int, default=0)
    parser.add_argument(
        "--lr-scale",
        type=float,
        default=1.0,
        help="use intrinsic curiosity module with this lr scale",
    )
    parser.add_argument(
        "--reward-scale",
        type=float,
        default=0.01,
        help="scaling factor for intrinsic curiosity reward",
    )
    parser.add_argument(
        "--forward-loss-weight",
        type=float,
        default=0.2,
        help="weight for the forward model loss in ICM",
    )
    return parser.parse_known_args()[0]
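

# End-to-end training test: PPO wrapped by an intrinsic curiosity module (ICM)
# on CartPole-v1, stopping once the mean test reward reaches the threshold.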
def test_ppo(args: argparse.Namespace = get_args()) -> None:
    env = gym.make(args.task)
    space_info = SpaceInfo.from_env(env)
    args.state_shape = space_info.observation_info.obs_shape
    args.action_shape = space_info.action_info.action_shape
    if args.reward_threshold is None:
        default_reward_threshold = {"CartPole-v1": 195}
        args.reward_threshold = default_reward_threshold.get(
            args.task,
            env.spec.reward_threshold if env.spec else None,
        )
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
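    # (the seeds above keep network initialization and environment resets reproducible)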
    # model
    net = Net(state_shape=args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = Actor(net, args.action_shape, device=args.device).to(args.device)
    critic = Critic(net, device=args.device).to(args.device)
    actor_critic = ActorCritic(actor, critic)
    # orthogonal initialization
    for m in actor_critic.modules():
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)
    dist = torch.distributions.Categorical
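    # PPO policy over a discrete action space: the actor's logits are turned into a
    # Categorical distribution; action_scaling only applies when the space is a Box.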
    policy: PPOPolicy[PPOTrainingStats] = PPOPolicy(
        actor=actor,
        critic=critic,
        optim=optim,
        dist_fn=dist,
        action_scaling=isinstance(env.action_space, Box),
        discount_factor=args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        gae_lambda=args.gae_lambda,
        reward_normalization=args.rew_norm,
        dual_clip=args.dual_clip,
        value_clip=args.value_clip,
        action_space=env.action_space,
        deterministic_eval=True,
        advantage_normalization=args.norm_adv,
        recompute_advantage=args.recompute_adv,
    )
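    # intrinsic curiosity module: a shared feature net feeds the ICM's forward and
    # inverse models; the feature dimension reuses the last hidden layer size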
    feature_dim = args.hidden_sizes[-1]
    feature_net = MLP(
        space_info.observation_info.obs_dim,
        output_dim=feature_dim,
        hidden_sizes=args.hidden_sizes[:-1],
        device=args.device,
    )
    action_dim = space_info.action_info.action_dim
    icm_net = IntrinsicCuriosityModule(
        feature_net,
        feature_dim,
        action_dim,
        hidden_sizes=args.hidden_sizes[-1:],
        device=args.device,
    ).to(args.device)
    icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr)
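    # ICMPolicy wraps the PPO policy and adds a scaled intrinsic curiosity reward on
    # top of the environment reward; lr_scale weights the ICM loss during updates.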
    policy = ICMPolicy(
        policy=policy,
        model=icm_net,
        optim=icm_optim,
        action_space=env.action_space,
        lr_scale=args.lr_scale,
        reward_scale=args.reward_scale,
        forward_loss_weight=args.forward_loss_weight,
    )
    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
    )
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, "ppo_icm")
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)
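
    # trainer callbacks: save_best_fn persists the best policy to disk,
    # stop_fn halts training once the reward threshold is reached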
    def save_best_fn(policy: BasePolicy) -> None:
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def stop_fn(mean_rewards: float) -> bool:
        return mean_rewards >= args.reward_threshold

    # trainer
    result = OnpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=args.epoch,
        step_per_epoch=args.step_per_epoch,
        repeat_per_collect=args.repeat_per_collect,
        episode_per_test=args.test_num,
        batch_size=args.batch_size,
        step_per_collect=args.step_per_collect,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
    ).run()
    assert stop_fn(result.best_reward)
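
    # when run as a script (not under pytest), print the result and render one episode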
    if __name__ == "__main__":
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        collector_stats = collector.collect(n_episode=1, render=args.render, is_eval=True)
        print(collector_stats)


if __name__ == "__main__":
    test_ppo()