#!/usr/bin/env python3 import argparse import datetime import os import pprint import gym import numpy as np import torch from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.env import SubprocVectorEnv from tianshou.policy import REDQPolicy from tianshou.trainer import offpolicy_trainer from tianshou.utils import TensorboardLogger from tianshou.utils.net.common import EnsembleLinear, Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='Ant-v3') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=1000000) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) parser.add_argument('--ensemble-size', type=int, default=10) parser.add_argument('--subset-size', type=int, default=2) parser.add_argument('--actor-lr', type=float, default=1e-3) parser.add_argument('--critic-lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--alpha', type=float, default=0.2) parser.add_argument('--auto-alpha', default=False, action='store_true') parser.add_argument('--alpha-lr', type=float, default=3e-4) parser.add_argument("--start-timesteps", type=int, default=10000) parser.add_argument('--epoch', type=int, default=200) parser.add_argument('--step-per-epoch', type=int, default=5000) parser.add_argument('--step-per-collect', type=int, default=1) parser.add_argument('--update-per-step', type=int, default=20) parser.add_argument('--n-step', type=int, default=1) parser.add_argument('--batch-size', type=int, default=256) parser.add_argument( '--target-mode', type=str, choices=('min', 'mean'), default='min' ) parser.add_argument('--training-num', type=int, default=1) parser.add_argument('--test-num', type=int, default=10) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' ) parser.add_argument('--resume-path', type=str, default=None) parser.add_argument( '--watch', default=False, action='store_true', help='watch the play of pre-trained policy only' ) return parser.parse_args() def test_redq(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) # train_envs = gym.make(args.task) if args.training_num > 1: train_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)] ) else: train_envs = gym.make(args.task) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)] ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, device=args.device, unbounded=True, conditioned_sigma=True ).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) def linear(x, y): return EnsembleLinear(args.ensemble_size, x, y) net_c = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, device=args.device, linear_layer=linear, ) critics = Critic( net_c, device=args.device, linear_layer=linear, flatten_input=False, ).to(args.device) critics_optim = torch.optim.Adam(critics.parameters(), lr=args.critic_lr) if args.auto_alpha: target_entropy = -np.prod(env.action_space.shape) log_alpha = torch.zeros(1, requires_grad=True, device=args.device) alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr) args.alpha = (target_entropy, log_alpha, alpha_optim) policy = REDQPolicy( actor, actor_optim, critics, critics_optim, args.ensemble_size, args.subset_size, tau=args.tau, gamma=args.gamma, alpha=args.alpha, estimation_step=args.n_step, actor_delay=args.update_per_step, target_mode=args.target_mode, action_space=env.action_space, ) # load a previous policy if args.resume_path: policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # collector if args.training_num > 1: buffer = VectorReplayBuffer(args.buffer_size, len(train_envs)) else: buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) # log t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_redq' log_path = os.path.join(args.logdir, args.task, 'redq', log_file) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) logger = TensorboardLogger(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) if not args.watch: # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, test_in_train=False ) pprint.pprint(result) # Let's watch its performance! policy.eval() test_envs.seed(args.seed) test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') if __name__ == '__main__': test_redq()