Tianshou/test/continuous/test_ppo.py

import os
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.env import VectorEnv
from tianshou.policy import PPOPolicy
from tianshou.policy.dist import DiagGaussian
from tianshou.trainer import onpolicy_trainer
from tianshou.data import Collector, ReplayBuffer
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic


def get_args():
    """Build the command-line options of the PPO test and parse them.

    Every option is registered with a converter and a default, so the
    returned namespace is fully populated even when no argument is given.
    Arguments the parser does not know are ignored rather than rejected,
    because parsing goes through ``parse_known_args``.

    :return: the ``argparse.Namespace`` holding the recognised options.
    """
    # (flag, converter, default) -- registered in exactly this order.
    option_table = (
        ('--task', str, 'Pendulum-v0'),
        ('--seed', int, 1626),
        ('--buffer-size', int, 20000),
        ('--lr', float, 1e-3),
        ('--gamma', float, 0.99),
        ('--epoch', int, 20),
        ('--step-per-epoch', int, 2400),
        ('--collect-per-step', int, 1),
        ('--repeat-per-collect', int, 2),
        ('--batch-size', int, 128),
        ('--layer-num', int, 1),
        ('--training-num', int, 16),
        ('--test-num', int, 100),
        ('--logdir', str, 'log'),
        ('--render', float, 0.),
        # use the GPU whenever torch reports one
        ('--device', str, 'cuda' if torch.cuda.is_available() else 'cpu'),
        # ppo special
        ('--vf-coef', float, 0.5),
        ('--ent-coef', float, 0.01),
        ('--eps-clip', float, 0.2),
        ('--max-grad-norm', float, 0.5),
        ('--gae-lambda', float, 0.95),
        ('--rew-norm', int, 1),
        ('--dual-clip', float, None),
        ('--value-clip', int, 1),
    )
    cli = argparse.ArgumentParser()
    for flag, converter, fallback in option_table:
        cli.add_argument(flag, type=converter, default=fallback)
    # parse_known_args returns (namespace, leftovers); leftovers are dropped.
    recognised, _leftovers = cli.parse_known_args()
    return recognised


def test_ppo(args=None):
    """Train a PPO policy on ``args.task`` and assert it reaches the goal.

    The function builds ``args.training_num`` training environments and
    ``args.test_num`` test environments, an ``ActorProb`` actor and a
    ``Critic`` (each on its own ``Net`` backbone) optimised jointly by one
    Adam optimiser, and runs ``onpolicy_trainer``.  It then asserts that
    the ``best_reward`` reported by the trainer is at least
    ``env.spec.reward_threshold`` (overridden to -250 for ``Pendulum-v0``).

    When the module is executed as a script, the training result is
    printed and one extra episode is collected with the trained policy.

    :param args: option namespace as produced by ``get_args()``; when
        ``None`` (the default) ``get_args()`` is called.  The namespace is
        modified in place: ``state_shape``, ``action_shape`` and
        ``max_action`` are added to it.
    :raises AssertionError: if the best reward stays below the threshold.
    """
    if args is None:
        # Parse lazily.  A ``get_args()`` default would be evaluated once,
        # when the module is imported, and the single namespace it returns
        # (mutated below) would be shared by every call.
        args = get_args()
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # only the first component of the upper action bound is used
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: actor and critic do not share a backbone
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = ActorProb(
        net, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    critic = Critic(Net(
        args.layer_num, args.state_shape, device=args.device
    ), device=args.device).to(args.device)
    # orthogonal initialization of every linear layer, with zero bias
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = DiagGaussian
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        # dual_clip=args.dual_clip,
        # dual clip causes monotonically increasing log_std :)
        value_clip=args.value_clip,
        # action_range=[env.action_space.low[0], env.action_space.high[0]],)
        # if the action is clipped, ppo would not converge :)
        gae_lambda=args.gae_lambda)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    try:
        result = onpolicy_trainer(
            policy, train_collector, test_collector, args.epoch,
            args.step_per_epoch, args.collect_per_step,
            args.repeat_per_collect, args.test_num, args.batch_size,
            stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    finally:
        # Release the collectors and the event file even if training
        # raises, and before the convergence check below can fail.
        train_collector.close()
        test_collector.close()
        writer.close()
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


# Script entry point: run the PPO test when this file is executed directly
# (the guard keeps it from running on a plain import).
if __name__ == '__main__':
    test_ppo()
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`import os`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`import gym`
			`import torch`
			`import pprint`
			`import argparse`
			`import numpy as np`
			`from torch.utils.tensorboard import SummaryWriter`

add some docs 2020-04-03 21:28:12 +08:00			`from tianshou.env import VectorEnv`
gae 2020-04-14 21:11:06 +08:00			`from tianshou.policy import PPOPolicy`
item3 of #51 2020-05-27 11:02:23 +08:00			`from tianshou.policy.dist import DiagGaussian`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`from tianshou.trainer import onpolicy_trainer`
			`from tianshou.data import Collector, ReplayBuffer`
Remove dummy net code (#123) * remove dummy net; delete two files * split code to have backbone and head * rename class * change torch.float to torch.float32 * use flatten(1) instead of view(batch, -1) * remove dummy net in docs * bugfix for rnn * fix cuda error * minor fix of docs * do not change the example code in dqn tutorial, since it is for demonstration Co-authored-by: Trinkle23897 <463003665@qq.com> 2020-07-09 22:57:01 +08:00			`from tianshou.utils.net.common import Net`
			`from tianshou.utils.net.continuous import ActorProb, Critic`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00

			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--task', type=str, default='Pendulum-v0')`
fix ppo 2020-04-19 14:30:42 +08:00			`parser.add_argument('--seed', type=int, default=1626)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`parser.add_argument('--buffer-size', type=int, default=20000)`
gae 2020-04-14 21:11:06 +08:00			`parser.add_argument('--lr', type=float, default=1e-3)`
			`parser.add_argument('--gamma', type=float, default=0.99)`
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`parser.add_argument('--epoch', type=int, default=20)`
3 fix (#158) - fix 2 warning in doctest - change the minimum version of gym (to be aligned with openai baselines) - change squeeze and reshape to flatten (related to #155). I think flatten is better. 2020-07-23 15:12:02 +08:00			`parser.add_argument('--step-per-epoch', type=int, default=2400)`
gae 2020-04-14 21:11:06 +08:00			`parser.add_argument('--collect-per-step', type=int, default=1)`
			`parser.add_argument('--repeat-per-collect', type=int, default=2)`
fix ppo 2020-04-19 14:30:42 +08:00			`parser.add_argument('--batch-size', type=int, default=128)`
			`parser.add_argument('--layer-num', type=int, default=1)`
			`parser.add_argument('--training-num', type=int, default=16)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`parser.add_argument('--test-num', type=int, default=100)`
			`parser.add_argument('--logdir', type=str, default='log')`
add examples, fix some bugs (#5) * update atari.py * fix setup.py pass the pytest * fix setup.py pass the pytest * add args "render" * change the tensorboard writter * change the tensorboard writter * change device, render, tensorboard log location * change device, render, tensorboard log location * remove some wrong local files * fix some tab mistakes and the envs name in continuous/test_xx.py * add examples and point robot maze environment * fix some bugs during testing examples * add dqn network and fix some args * change back the tensorboard writter's frequency to ensure ppo and a2c can write things normally * add a warning to collector * rm some unrelated files * reformat * fix a bug in test_dqn due to the model wrong selection 2020-03-28 07:27:18 +08:00			`parser.add_argument('--render', type=float, default=0.)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`parser.add_argument(`
			`'--device', type=str,`
			`default='cuda' if torch.cuda.is_available() else 'cpu')`
			`# ppo special`
			`parser.add_argument('--vf-coef', type=float, default=0.5)`
fix ppo 2020-04-19 14:30:42 +08:00			`parser.add_argument('--ent-coef', type=float, default=0.01)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`parser.add_argument('--eps-clip', type=float, default=0.2)`
			`parser.add_argument('--max-grad-norm', type=float, default=0.5)`
gae 2020-04-14 21:11:06 +08:00			`parser.add_argument('--gae-lambda', type=float, default=0.95)`
nstep all (fix #51) 2020-06-03 13:59:47 +08:00			`parser.add_argument('--rew-norm', type=int, default=1)`
			`parser.add_argument('--dual-clip', type=float, default=None)`
			`parser.add_argument('--value-clip', type=int, default=1)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`args = parser.parse_known_args()[0]`
			`return args`


fix ppo 2020-04-19 14:30:42 +08:00			`def test_ppo(args=get_args()):`
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`torch.set_num_threads(1) # we just need only one thread for NN`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`env = gym.make(args.task)`
			`if args.task == 'Pendulum-v0':`
			`env.spec.reward_threshold = -250`
			`args.state_shape = env.observation_space.shape or env.observation_space.n`
			`args.action_shape = env.action_space.shape or env.action_space.n`
			`args.max_action = env.action_space.high[0]`
add some docs 2020-04-03 21:28:12 +08:00			`# you can also use tianshou.env.SubprocVectorEnv`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`# train_envs = gym.make(args.task)`
add some docs 2020-04-03 21:28:12 +08:00			`train_envs = VectorEnv(`
fix collector 2020-03-25 14:08:28 +08:00			`[lambda: gym.make(args.task) for _ in range(args.training_num)])`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`# test_envs = gym.make(args.task)`
add some docs 2020-04-03 21:28:12 +08:00			`test_envs = VectorEnv(`
fix collector 2020-03-25 14:08:28 +08:00			`[lambda: gym.make(args.task) for _ in range(args.test_num)])`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`# seed`
			`np.random.seed(args.seed)`
			`torch.manual_seed(args.seed)`
			`train_envs.seed(args.seed)`
			`test_envs.seed(args.seed)`
			`# model`
Remove dummy net code (#123) * remove dummy net; delete two files * split code to have backbone and head * rename class * change torch.float to torch.float32 * use flatten(1) instead of view(batch, -1) * remove dummy net in docs * bugfix for rnn * fix cuda error * minor fix of docs * do not change the example code in dqn tutorial, since it is for demonstration Co-authored-by: Trinkle23897 <463003665@qq.com> 2020-07-09 22:57:01 +08:00			`net = Net(args.layer_num, args.state_shape, device=args.device)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`actor = ActorProb(`
Remove dummy net code (#123) * remove dummy net; delete two files * split code to have backbone and head * rename class * change torch.float to torch.float32 * use flatten(1) instead of view(batch, -1) * remove dummy net in docs * bugfix for rnn * fix cuda error * minor fix of docs * do not change the example code in dqn tutorial, since it is for demonstration Co-authored-by: Trinkle23897 <463003665@qq.com> 2020-07-09 22:57:01 +08:00			`net, args.action_shape,`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`args.max_action, args.device`
			`).to(args.device)`
Remove dummy net code (#123) * remove dummy net; delete two files * split code to have backbone and head * rename class * change torch.float to torch.float32 * use flatten(1) instead of view(batch, -1) * remove dummy net in docs * bugfix for rnn * fix cuda error * minor fix of docs * do not change the example code in dqn tutorial, since it is for demonstration Co-authored-by: Trinkle23897 <463003665@qq.com> 2020-07-09 22:57:01 +08:00			`critic = Critic(Net(`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`args.layer_num, args.state_shape, device=args.device`
Remove dummy net code (#123) * remove dummy net; delete two files * split code to have backbone and head * rename class * change torch.float to torch.float32 * use flatten(1) instead of view(batch, -1) * remove dummy net in docs * bugfix for rnn * fix cuda error * minor fix of docs * do not change the example code in dqn tutorial, since it is for demonstration Co-authored-by: Trinkle23897 <463003665@qq.com> 2020-07-09 22:57:01 +08:00			`), device=args.device).to(args.device)`
orthogonal init for ppo in test script 2020-05-16 20:27:01 +08:00			`# orthogonal initialization`
			`for m in list(actor.modules()) + list(critic.modules()):`
			`if isinstance(m, torch.nn.Linear):`
			`torch.nn.init.orthogonal_(m.weight)`
oinit with 0 bias 2020-05-17 17:06:20 +08:00			`torch.nn.init.zeros_(m.bias)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`optim = torch.optim.Adam(list(`
			`actor.parameters()) + list(critic.parameters()), lr=args.lr)`
Fix log_prob and PPO dual_clip (#49) * Added DiagGaussian to fix log_probg * Disable PPO dual_clip 2020-05-18 16:23:35 +08:00			`dist = DiagGaussian`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`policy = PPOPolicy(`
			`actor, critic, optim, dist, args.gamma,`
			`max_grad_norm=args.max_grad_norm,`
			`eps_clip=args.eps_clip,`
			`vf_coef=args.vf_coef,`
			`ent_coef=args.ent_coef,`
fix ppo 2020-04-19 14:30:42 +08:00			`reward_normalization=args.rew_norm,`
Fix log_prob and PPO dual_clip (#49) * Added DiagGaussian to fix log_probg * Disable PPO dual_clip 2020-05-18 16:23:35 +08:00			`# dual_clip=args.dual_clip,`
			`# dual clip cause monotonically increasing log_std :)`
fix ppo 2020-04-19 14:30:42 +08:00			`value_clip=args.value_clip,`
			`# action_range=[env.action_space.low[0], env.action_space.high[0]],)`
			`# if clip the action, ppo would not converge :)`
gae 2020-04-14 21:11:06 +08:00			`gae_lambda=args.gae_lambda)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`# collector`
			`train_collector = Collector(`
update readme 2020-03-26 11:42:34 +08:00			`policy, train_envs, ReplayBuffer(args.buffer_size))`
td3 2020-03-23 11:34:52 +08:00			`test_collector = Collector(policy, test_envs)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`# log`
save_fn 2020-04-11 16:54:27 +08:00			`log_path = os.path.join(args.logdir, args.task, 'ppo')`
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`writer = SummaryWriter(log_path)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00
save_fn 2020-04-11 16:54:27 +08:00			`def save_fn(policy):`
			`torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))`

add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`def stop_fn(x):`
			`return x >= env.spec.reward_threshold`

			`# trainer`
			`result = onpolicy_trainer(`
			`policy, train_collector, test_collector, args.epoch,`
			`args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,`
save_fn 2020-04-11 16:54:27 +08:00			`args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,`
			`writer=writer)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`assert stop_fn(result['best_reward'])`
			`train_collector.close()`
			`test_collector.close()`
			`if __name__ == '__main__':`
			`pprint.pprint(result)`
			`# Let's watch its performance!`
			`env = gym.make(args.task)`
			`collector = Collector(policy, env)`
add examples, fix some bugs (#5) * update atari.py * fix setup.py pass the pytest * fix setup.py pass the pytest * add args "render" * change the tensorboard writter * change the tensorboard writter * change device, render, tensorboard log location * change device, render, tensorboard log location * remove some wrong local files * fix some tab mistakes and the envs name in continuous/test_xx.py * add examples and point robot maze environment * fix some bugs during testing examples * add dqn network and fix some args * change back the tensorboard writter's frequency to ensure ppo and a2c can write things normally * add a warning to collector * rm some unrelated files * reformat * fix a bug in test_dqn due to the model wrong selection 2020-03-28 07:27:18 +08:00			`result = collector.collect(n_episode=1, render=args.render)`
add demo of ppo continuous action task 2020-03-21 17:04:42 +08:00			`print(f'Final reward: {result["rew"]}, length: {result["len"]}')`
			`collector.close()`


			`if __name__ == '__main__':`
fix ppo 2020-04-19 14:30:42 +08:00			`test_ppo()`