import argparse import os import pickle import pprint import gymnasium as gym import numpy as np import torch from torch.distributions import Distribution, Independent, Normal from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, VectorReplayBuffer from tianshou.env import DummyVectorEnv from tianshou.policy import BasePolicy, GAILPolicy from tianshou.trainer import OnpolicyTrainer from tianshou.utils import TensorboardLogger from tianshou.utils.net.common import ActorCritic, Net from tianshou.utils.net.continuous import ActorProb, Critic from tianshou.utils.space_info import SpaceInfo if __name__ == "__main__": from gather_pendulum_data import expert_file_name, gather_data else: # pytest from test.offline.gather_pendulum_data import expert_file_name, gather_data def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("--task", type=str, default="Pendulum-v1") parser.add_argument("--reward-threshold", type=float, default=None) parser.add_argument("--seed", type=int, default=1) parser.add_argument("--buffer-size", type=int, default=20000) parser.add_argument("--lr", type=float, default=1e-3) parser.add_argument("--disc-lr", type=float, default=5e-4) parser.add_argument("--gamma", type=float, default=0.95) parser.add_argument("--epoch", type=int, default=5) parser.add_argument("--step-per-epoch", type=int, default=150000) parser.add_argument("--episode-per-collect", type=int, default=16) parser.add_argument("--repeat-per-collect", type=int, default=2) parser.add_argument("--disc-update-num", type=int, default=2) parser.add_argument("--batch-size", type=int, default=128) parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64]) parser.add_argument("--training-num", type=int, default=16) parser.add_argument("--test-num", type=int, default=100) parser.add_argument("--logdir", type=str, default="log") parser.add_argument("--render", type=float, default=0.0) parser.add_argument( "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", ) # ppo special parser.add_argument("--vf-coef", type=float, default=0.25) parser.add_argument("--ent-coef", type=float, default=0.0) parser.add_argument("--eps-clip", type=float, default=0.2) parser.add_argument("--max-grad-norm", type=float, default=0.5) parser.add_argument("--gae-lambda", type=float, default=0.95) parser.add_argument("--rew-norm", type=int, default=1) parser.add_argument("--dual-clip", type=float, default=None) parser.add_argument("--value-clip", type=int, default=1) parser.add_argument("--norm-adv", type=int, default=1) parser.add_argument("--recompute-adv", type=int, default=0) parser.add_argument("--resume", action="store_true") parser.add_argument("--save-interval", type=int, default=4) parser.add_argument("--load-buffer-name", type=str, default=expert_file_name()) return parser.parse_known_args()[0] def test_gail(args: argparse.Namespace = get_args()) -> None: if os.path.exists(args.load_buffer_name) and os.path.isfile(args.load_buffer_name): if args.load_buffer_name.endswith(".hdf5"): buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name) else: with open(args.load_buffer_name, "rb") as f: buffer = pickle.load(f) else: buffer = gather_data() env = gym.make(args.task) if args.reward_threshold is None: default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100} args.reward_threshold = default_reward_threshold.get( args.task, env.spec.reward_threshold if env.spec else None, ) space_info = SpaceInfo.from_env(env) args.state_shape = space_info.observation_info.obs_shape args.action_shape = space_info.action_info.action_shape args.max_action = space_info.action_info.max_action # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb(net, args.action_shape, max_action=args.max_action, device=args.device).to( args.device, ) critic = Critic( Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device), device=args.device, ).to(args.device) actor_critic = ActorCritic(actor, critic) # orthogonal initialization for m in actor_critic.modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr) # discriminator disc_net = Critic( Net( args.state_shape, action_shape=args.action_shape, hidden_sizes=args.hidden_sizes, activation=torch.nn.Tanh, device=args.device, concat=True, ), device=args.device, ).to(args.device) for m in disc_net.modules(): if isinstance(m, torch.nn.Linear): # orthogonal initialization torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2)) torch.nn.init.zeros_(m.bias) disc_optim = torch.optim.Adam(disc_net.parameters(), lr=args.disc_lr) # replace DiagGuassian with Independent(Normal) which is equivalent # pass *logits to be consistent with policy.forward def dist(*logits: torch.Tensor) -> Distribution: return Independent(Normal(*logits), 1) policy: BasePolicy = GAILPolicy( actor=actor, critic=critic, optim=optim, dist_fn=dist, expert_buffer=buffer, disc_net=disc_net, disc_optim=disc_optim, disc_update_num=args.disc_update_num, discount_factor=args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, reward_normalization=args.rew_norm, advantage_normalization=args.norm_adv, recompute_advantage=args.recompute_adv, dual_clip=args.dual_clip, value_clip=args.value_clip, gae_lambda=args.gae_lambda, action_space=env.action_space, ) # collector train_collector = Collector( policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs)), ) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, "gail") writer = SummaryWriter(log_path) logger = TensorboardLogger(writer, save_interval=args.save_interval) def save_best_fn(policy: BasePolicy) -> None: torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) def stop_fn(mean_rewards: float) -> bool: return mean_rewards >= args.reward_threshold def save_checkpoint_fn(epoch: int, env_step: int, gradient_step: int) -> str: # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html ckpt_path = os.path.join(log_path, "checkpoint.pth") # Example: saving by epoch num # ckpt_path = os.path.join(log_path, f"checkpoint_{epoch}.pth") torch.save( { "model": policy.state_dict(), "optim": optim.state_dict(), }, ckpt_path, ) return ckpt_path if args.resume: # load from existing checkpoint print(f"Loading agent under {log_path}") ckpt_path = os.path.join(log_path, "checkpoint.pth") if os.path.exists(ckpt_path): checkpoint = torch.load(ckpt_path, map_location=args.device) policy.load_state_dict(checkpoint["model"]) optim.load_state_dict(checkpoint["optim"]) print("Successfully restore policy and optim.") else: print("Fail to restore policy and optim.") # trainer result = OnpolicyTrainer( policy=policy, train_collector=train_collector, test_collector=test_collector, max_epoch=args.epoch, step_per_epoch=args.step_per_epoch, repeat_per_collect=args.repeat_per_collect, episode_per_test=args.test_num, batch_size=args.batch_size, episode_per_collect=args.episode_per_collect, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, resume_from_log=args.resume, save_checkpoint_fn=save_checkpoint_fn, ).run() assert stop_fn(result.best_reward) if __name__ == "__main__": pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) collector_stats = collector.collect(n_episode=1, render=args.render) print(collector_stats) if __name__ == "__main__": test_gail()