2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import os
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import gym
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import torch
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import pprint
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import argparse
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import numpy as np
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from torch.utils.tensorboard import SummaryWriter
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from tianshou.policy import DQNPolicy
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from tianshou.utils.net.common import Net
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from tianshou.trainer import offpolicy_trainer
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from tianshou.data import Collector, ReplayBuffer
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from tianshou.env import DummyVectorEnv, SubprocVectorEnv
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def get_args():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser = argparse.ArgumentParser()
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # the parameters are found by Optuna
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--task', type=str, default='LunarLander-v2')
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--seed', type=int, default=0)
							 | 
						
					
						
							
								
									
										
										
										
											2020-09-26 16:35:37 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--eps-test', type=float, default=0.01)
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--eps-train', type=float, default=0.73)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--buffer-size', type=int, default=100000)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--lr', type=float, default=0.013)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--gamma', type=float, default=0.99)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--n-step', type=int, default=4)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--target-update-freq', type=int, default=500)
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--epoch', type=int, default=10)
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--step-per-epoch', type=int, default=5000)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--collect-per-step', type=int, default=16)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--batch-size', type=int, default=128)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--layer-num', type=int, default=1)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--training-num', type=int, default=10)
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--test-num', type=int, default=100)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--logdir', type=str, default='log')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument('--render', type=float, default=0.)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parser.add_argument(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        '--device', type=str,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        default='cuda' if torch.cuda.is_available() else 'cpu')
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    return parser.parse_args()
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def test_dqn(args=get_args()):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    env = gym.make(args.task)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    args.state_shape = env.observation_space.shape or env.observation_space.n
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    args.action_shape = env.action_space.shape or env.action_space.n
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # train_envs = gym.make(args.task)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # you can also use tianshou.env.SubprocVectorEnv
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    train_envs = DummyVectorEnv(
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [lambda: gym.make(args.task) for _ in range(args.training_num)])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # test_envs = gym.make(args.task)
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    test_envs = SubprocVectorEnv(
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [lambda: gym.make(args.task) for _ in range(args.test_num)])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # seed
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    np.random.seed(args.seed)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    torch.manual_seed(args.seed)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    train_envs.seed(args.seed)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    test_envs.seed(args.seed)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # model
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    net = Net(args.layer_num, args.state_shape,
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								              args.action_shape, args.device,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								              dueling=(2, 2)).to(args.device)
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    policy = DQNPolicy(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        net, optim, args.gamma, args.n_step,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        target_update_freq=args.target_update_freq)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # collector
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    train_collector = Collector(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        policy, train_envs, ReplayBuffer(args.buffer_size))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    test_collector = Collector(policy, test_envs)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # policy.set_eps(1)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    train_collector.collect(n_step=args.batch_size)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # log
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    log_path = os.path.join(args.logdir, args.task, 'dqn')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    writer = SummaryWriter(log_path)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def save_fn(policy):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-09-26 16:35:37 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    def stop_fn(mean_rewards):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return mean_rewards >= env.spec.reward_threshold
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-09-26 16:35:37 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    def train_fn(epoch, env_step):  # exp decay
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        eps = max(args.eps_train * (1 - 5e-6) ** env_step, args.eps_test)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        policy.set_eps(eps)
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-09-26 16:35:37 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    def test_fn(epoch, env_step):
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        policy.set_eps(args.eps_test)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # trainer
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    result = offpolicy_trainer(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        policy, train_collector, test_collector, args.epoch,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        args.step_per_epoch, args.collect_per_step, args.test_num,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        args.batch_size, train_fn=train_fn, test_fn=test_fn,
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-19 15:00:24 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        stop_fn=stop_fn, save_fn=save_fn, writer=writer,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        test_in_train=False)
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    assert stop_fn(result['best_reward'])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if __name__ == '__main__':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pprint.pprint(result)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # Let's watch its performance!
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-27 12:15:18 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        policy.eval()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        policy.set_eps(args.eps_test)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        test_envs.seed(args.seed)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        test_collector.reset()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        result = test_collector.collect(n_episode=[1] * args.test_num,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                        render=args.render)
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-29 19:44:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if __name__ == '__main__':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    test_dqn(get_args())
							 |