A test is not a script and should not be used as such. Also marked the pistonball test as skipped, since it doesn't actually test anything.
224 lines
8.9 KiB
Python
224 lines
8.9 KiB
Python
import argparse
|
|
import os
|
|
import pickle
|
|
from test.offline.gather_pendulum_data import expert_file_name, gather_data
|
|
|
|
import gymnasium as gym
|
|
import numpy as np
|
|
import torch
|
|
from torch.distributions import Distribution, Independent, Normal
|
|
from torch.utils.tensorboard import SummaryWriter
|
|
|
|
from tianshou.data import Collector, VectorReplayBuffer
|
|
from tianshou.env import DummyVectorEnv
|
|
from tianshou.policy import BasePolicy, GAILPolicy
|
|
from tianshou.trainer import OnpolicyTrainer
|
|
from tianshou.utils import TensorboardLogger
|
|
from tianshou.utils.net.common import ActorCritic, Net
|
|
from tianshou.utils.net.continuous import ActorProb, Critic
|
|
from tianshou.utils.space_info import SpaceInfo
|
|
|
|
|
|
def get_args() -> argparse.Namespace:
    """Build the command-line configuration for the GAIL test.

    Options are registered in a fixed order (general training options, then
    the PPO-specific ones, then resume/checkpoint options). Unknown
    command-line arguments are ignored rather than rejected, because the
    parser is run with ``parse_known_args``.

    :return: the namespace holding every recognised option.
    """
    cli = argparse.ArgumentParser()

    def register(options: tuple) -> None:
        # Each entry is (flag, type, default) for a plain typed option.
        for flag, kind, default in options:
            cli.add_argument(flag, type=kind, default=default)

    register(
        (
            ("--task", str, "Pendulum-v1"),
            ("--reward-threshold", float, None),
            ("--seed", int, 1),
            ("--buffer-size", int, 20000),
            ("--lr", float, 1e-3),
            ("--disc-lr", float, 5e-4),
            ("--gamma", float, 0.95),
            ("--epoch", int, 5),
            ("--step-per-epoch", int, 150000),
            ("--episode-per-collect", int, 16),
            ("--repeat-per-collect", int, 2),
            ("--disc-update-num", int, 2),
            ("--batch-size", int, 128),
        ),
    )
    # The only list-valued option: zero or more hidden layer widths.
    cli.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64])
    register(
        (
            ("--training-num", int, 16),
            ("--test-num", int, 100),
            ("--logdir", str, "log"),
            ("--render", float, 0.0),
            ("--device", str, "cuda" if torch.cuda.is_available() else "cpu"),
            # ppo special
            ("--vf-coef", float, 0.25),
            ("--ent-coef", float, 0.0),
            ("--eps-clip", float, 0.2),
            ("--max-grad-norm", float, 0.5),
            ("--gae-lambda", float, 0.95),
            ("--rew-norm", int, 1),
            ("--dual-clip", float, None),
            ("--value-clip", int, 1),
            ("--norm-adv", int, 1),
            ("--recompute-adv", int, 0),
        ),
    )
    cli.add_argument("--resume", action="store_true")
    register(
        (
            ("--save-interval", int, 4),
            ("--load-buffer-name", str, expert_file_name()),
        ),
    )

    known_args, _unrecognised = cli.parse_known_args()
    return known_args
|
|
|
|
|
|
def test_gail(args: argparse.Namespace = get_args()) -> None:
    """Train a GAIL policy and assert that it reaches the reward threshold.

    The expert buffer is loaded from ``args.load_buffer_name`` (HDF5 or
    pickle) when that file exists, and is otherwise produced by
    ``gather_data()``. An actor-critic pair and a discriminator are built,
    trained with ``OnpolicyTrainer``, and the best reward of the run is
    checked against ``args.reward_threshold``.

    :param args: run configuration; ``reward_threshold``, ``state_shape``,
        ``action_shape`` and ``max_action`` are filled in on this namespace.
    :raises AssertionError: if the best reward is below the threshold.
    """
    # expert demonstrations
    if os.path.isfile(args.load_buffer_name):
        if args.load_buffer_name.endswith(".hdf5"):
            buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
        else:
            # NOTE(security): unpickling executes arbitrary code; only point
            # --load-buffer-name at files you produced yourself.
            with open(args.load_buffer_name, "rb") as f:
                buffer = pickle.load(f)
    else:
        buffer = gather_data()

    env = gym.make(args.task)
    if args.reward_threshold is None:
        # Fall back to the environment spec for tasks without a preset value.
        default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100}
        args.reward_threshold = default_reward_threshold.get(
            args.task,
            env.spec.reward_threshold if env.spec else None,
        )
    space_info = SpaceInfo.from_env(env)
    args.state_shape = space_info.observation_info.obs_shape
    args.action_shape = space_info.action_info.action_shape
    args.max_action = space_info.action_info.max_action

    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.test_num)])

    # seed (must happen before any network is created so that the
    # initial weights are reproducible)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    net = Net(state_shape=args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = ActorProb(
        preprocess_net=net,
        action_shape=args.action_shape,
        max_action=args.max_action,
        device=args.device,
    ).to(args.device)
    critic = Critic(
        Net(state_shape=args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device),
        device=args.device,
    ).to(args.device)
    actor_critic = ActorCritic(actor, critic)
    # orthogonal initialization
    for m in actor_critic.modules():
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)

    # discriminator: built from both the state and the action shape
    disc_net = Critic(
        Net(
            state_shape=args.state_shape,
            action_shape=args.action_shape,
            hidden_sizes=args.hidden_sizes,
            activation=torch.nn.Tanh,
            device=args.device,
            concat=True,
        ),
        device=args.device,
    ).to(args.device)
    for m in disc_net.modules():
        if isinstance(m, torch.nn.Linear):
            # orthogonal initialization
            torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
            torch.nn.init.zeros_(m.bias)
    disc_optim = torch.optim.Adam(disc_net.parameters(), lr=args.disc_lr)

    # replace DiagGaussian with Independent(Normal) which is equivalent
    # pass *logits to be consistent with policy.forward
    def dist(loc_scale: tuple[torch.Tensor, torch.Tensor]) -> Distribution:
        loc, scale = loc_scale
        return Independent(Normal(loc, scale), 1)

    policy: GAILPolicy = GAILPolicy(
        actor=actor,
        critic=critic,
        optim=optim,
        dist_fn=dist,
        expert_buffer=buffer,
        disc_net=disc_net,
        disc_optim=disc_optim,
        disc_update_num=args.disc_update_num,
        discount_factor=args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        advantage_normalization=args.norm_adv,
        recompute_advantage=args.recompute_adv,
        dual_clip=args.dual_clip,
        value_clip=args.value_clip,
        gae_lambda=args.gae_lambda,
        action_space=env.action_space,
    )

    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
    )
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, "gail")
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer, save_interval=args.save_interval)

    def save_best_fn(policy: BasePolicy) -> None:
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def stop_fn(mean_rewards: float) -> bool:
        return mean_rewards >= args.reward_threshold

    def save_checkpoint_fn(epoch: int, env_step: int, gradient_step: int) -> str:
        # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
        ckpt_path = os.path.join(log_path, "checkpoint.pth")
        # Example: saving by epoch num
        # ckpt_path = os.path.join(log_path, f"checkpoint_{epoch}.pth")
        torch.save(
            {
                "model": policy.state_dict(),
                "optim": optim.state_dict(),
                # The discriminator has its own optimizer; without its state
                # a resumed run would restart Adam's moment estimates.
                "disc_optim": disc_optim.state_dict(),
            },
            ckpt_path,
        )
        return ckpt_path

    if args.resume:
        # load from existing checkpoint
        print(f"Loading agent under {log_path}")
        ckpt_path = os.path.join(log_path, "checkpoint.pth")
        if os.path.exists(ckpt_path):
            checkpoint = torch.load(ckpt_path, map_location=args.device)
            policy.load_state_dict(checkpoint["model"])
            optim.load_state_dict(checkpoint["optim"])
            # Checkpoints written before "disc_optim" was stored lack the key.
            if "disc_optim" in checkpoint:
                disc_optim.load_state_dict(checkpoint["disc_optim"])
            print("Successfully restore policy and optim.")
        else:
            print("Fail to restore policy and optim.")

    # trainer
    result = OnpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=args.epoch,
        step_per_epoch=args.step_per_epoch,
        repeat_per_collect=args.repeat_per_collect,
        episode_per_test=args.test_num,
        batch_size=args.batch_size,
        episode_per_collect=args.episode_per_collect,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
        resume_from_log=args.resume,
        save_checkpoint_fn=save_checkpoint_fn,
    ).run()
    assert stop_fn(result.best_reward)
|