Tianshou/test/continuous/test_ppo.py
Daniel Plop 8a0629ded6
Fix mypy issues in tests and examples (#1077)
Closes #952 

- `SamplingConfig` supports `batch_size=None`. #1077
- tests and examples are covered by `mypy`. #1077
- `NetBase` is more used, stricter typing by making it generic. #1077
- `utils.net.common.Recurrent` now receives and returns a
`RecurrentStateBatch` instead of a dict. #1077

---------

Co-authored-by: Michael Panchenko <m.panchenko@appliedai.de>
2024-04-03 18:07:51 +02:00

211 lines
8.1 KiB
Python

import argparse
import os
import pprint
import gymnasium as gym
import numpy as np
import torch
from torch.distributions import Distribution, Independent, Normal
from torch.utils.tensorboard import SummaryWriter
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import PPOPolicy
from tianshou.policy.base import BasePolicy
from tianshou.policy.modelfree.ppo import PPOTrainingStats
from tianshou.trainer import OnpolicyTrainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.continuous import ActorProb, Critic
from tianshou.utils.space_info import SpaceInfo
def get_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, default="Pendulum-v1")
parser.add_argument("--reward-threshold", type=float, default=None)
parser.add_argument("--seed", type=int, default=1)
parser.add_argument("--buffer-size", type=int, default=20000)
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--gamma", type=float, default=0.95)
parser.add_argument("--epoch", type=int, default=5)
parser.add_argument("--step-per-epoch", type=int, default=150000)
parser.add_argument("--episode-per-collect", type=int, default=16)
parser.add_argument("--repeat-per-collect", type=int, default=2)
parser.add_argument("--batch-size", type=int, default=128)
parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64])
parser.add_argument("--training-num", type=int, default=16)
parser.add_argument("--test-num", type=int, default=100)
parser.add_argument("--logdir", type=str, default="log")
parser.add_argument("--render", type=float, default=0.0)
parser.add_argument(
"--device",
type=str,
default="cuda" if torch.cuda.is_available() else "cpu",
)
# ppo special
parser.add_argument("--vf-coef", type=float, default=0.25)
parser.add_argument("--ent-coef", type=float, default=0.0)
parser.add_argument("--eps-clip", type=float, default=0.2)
parser.add_argument("--max-grad-norm", type=float, default=0.5)
parser.add_argument("--gae-lambda", type=float, default=0.95)
parser.add_argument("--rew-norm", type=int, default=1)
parser.add_argument("--dual-clip", type=float, default=None)
parser.add_argument("--value-clip", type=int, default=1)
parser.add_argument("--norm-adv", type=int, default=1)
parser.add_argument("--recompute-adv", type=int, default=0)
parser.add_argument("--resume", action="store_true")
parser.add_argument("--save-interval", type=int, default=4)
return parser.parse_known_args()[0]
def test_ppo(args: argparse.Namespace = get_args()) -> None:
env = gym.make(args.task)
space_info = SpaceInfo.from_env(env)
args.state_shape = space_info.observation_info.obs_shape
args.action_shape = space_info.action_info.action_shape
args.max_action = space_info.action_info.max_action
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task,
env.spec.reward_threshold if env.spec else None,
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.test_num)])
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# model
net = Net(state_shape=args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
actor = ActorProb(net, args.action_shape, unbounded=True, device=args.device).to(args.device)
critic = Critic(
Net(state_shape=args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device),
device=args.device,
).to(args.device)
actor_critic = ActorCritic(actor, critic)
# orthogonal initialization
for m in actor_critic.modules():
if isinstance(m, torch.nn.Linear):
torch.nn.init.orthogonal_(m.weight)
torch.nn.init.zeros_(m.bias)
optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)
# replace DiagGuassian with Independent(Normal) which is equivalent
# pass *logits to be consistent with policy.forward
def dist(loc_scale: tuple[torch.Tensor, torch.Tensor]) -> Distribution:
loc, scale = loc_scale
return Independent(Normal(loc, scale), 1)
policy: PPOPolicy[PPOTrainingStats] = PPOPolicy(
actor=actor,
critic=critic,
optim=optim,
dist_fn=dist,
discount_factor=args.gamma,
max_grad_norm=args.max_grad_norm,
eps_clip=args.eps_clip,
vf_coef=args.vf_coef,
ent_coef=args.ent_coef,
reward_normalization=args.rew_norm,
advantage_normalization=args.norm_adv,
recompute_advantage=args.recompute_adv,
dual_clip=args.dual_clip,
value_clip=args.value_clip,
gae_lambda=args.gae_lambda,
action_space=env.action_space,
)
# collector
train_collector = Collector(
policy,
train_envs,
VectorReplayBuffer(args.buffer_size, len(train_envs)),
)
test_collector = Collector(policy, test_envs)
# log
log_path = os.path.join(args.logdir, args.task, "ppo")
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer, save_interval=args.save_interval)
def save_best_fn(policy: BasePolicy) -> None:
torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))
def stop_fn(mean_rewards: float) -> bool:
return mean_rewards >= args.reward_threshold
def save_checkpoint_fn(epoch: int, env_step: int, gradient_step: int) -> str:
# see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
ckpt_path = os.path.join(log_path, "checkpoint.pth")
# Example: saving by epoch num
# ckpt_path = os.path.join(log_path, f"checkpoint_{epoch}.pth")
torch.save(
{
"model": policy.state_dict(),
"optim": optim.state_dict(),
},
ckpt_path,
)
return ckpt_path
if args.resume:
# load from existing checkpoint
print(f"Loading agent under {log_path}")
ckpt_path = os.path.join(log_path, "checkpoint.pth")
if os.path.exists(ckpt_path):
checkpoint = torch.load(ckpt_path, map_location=args.device)
policy.load_state_dict(checkpoint["model"])
optim.load_state_dict(checkpoint["optim"])
print("Successfully restore policy and optim.")
else:
print("Fail to restore policy and optim.")
# trainer
trainer = OnpolicyTrainer(
policy=policy,
train_collector=train_collector,
test_collector=test_collector,
max_epoch=args.epoch,
step_per_epoch=args.step_per_epoch,
repeat_per_collect=args.repeat_per_collect,
episode_per_test=args.test_num,
batch_size=args.batch_size,
episode_per_collect=args.episode_per_collect,
stop_fn=stop_fn,
save_best_fn=save_best_fn,
logger=logger,
resume_from_log=args.resume,
save_checkpoint_fn=save_checkpoint_fn,
)
for epoch_stat in trainer:
print(f"Epoch: {epoch_stat.epoch}")
print(epoch_stat)
# print(info)
assert stop_fn(epoch_stat.info_stat.best_reward)
if __name__ == "__main__":
pprint.pprint(epoch_stat)
# Let's watch its performance!
env = gym.make(args.task)
policy.eval()
collector = Collector(policy, env)
collector_stats = collector.collect(n_episode=1, render=args.render)
print(collector_stats)
def test_ppo_resume(args: argparse.Namespace = get_args()) -> None:
args.resume = True
test_ppo(args)
if __name__ == "__main__":
test_ppo()