Tianshou/test/discrete/test_a2c_with_il.py
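"""End-to-end test: train A2C on CartPole-v0 until it reaches the reward
threshold, then distill the trained policy into a fresh network with
ImitationPolicy (behavioral cloning on the expert's collected data)."""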

import os
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.env import DummyVectorEnv
from tianshou.utils.net.common import Net
from tianshou.data import Collector, ReplayBuffer
from tianshou.utils.net.discrete import Actor, Critic
from tianshou.policy import A2CPolicy, ImitationPolicy
from tianshou.trainer import onpolicy_trainer, offpolicy_trainer


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--il-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.9)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--repeat-per-collect', type=int, default=1)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=2)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    # a2c special
    parser.add_argument('--vf-coef', type=float, default=0.5)
    parser.add_argument('--ent-coef', type=float, default=0.0)
    parser.add_argument('--max-grad-norm', type=float, default=None)
    parser.add_argument('--gae-lambda', type=float, default=1.)
    # argparse's type=bool converts any non-empty string (even 'False')
    # to True, so an int flag (0/1) is safer here
    parser.add_argument('--rew-norm', type=int, default=0)
    args = parser.parse_known_args()[0]
    return args


def test_a2c_with_il(args=get_args()):
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
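    # DummyVectorEnv steps its environments sequentially in one process;
    # SubprocVectorEnv (mentioned above) runs them in parallel worker
    # processes, which pays off once per-step computation is heavy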
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda,
        vf_coef=args.vf_coef, ent_coef=args.ent_coef,
        max_grad_norm=args.max_grad_norm, reward_normalization=args.rew_norm)
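    # note: actor and critic share the preprocessing Net above, so a single
    # Adam instance updates the joint parameter list; `dist` is passed as a
    # class and the policy builds the action distribution from the actor's
    # output, roughly: dist(logits).sample()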
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
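    # (the positional args map, in this Tianshou version, to: max_epoch,
    # step_per_epoch, collect_per_step, repeat_per_collect,
    # episode_per_test, batch_size)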
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer)
    assert stop_fn(result['best_reward'])

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')

    policy.eval()
    # the trained A2C policy now acts as the expert: train_collector still
    # wraps it, so the buffer keeps filling with expert trajectories while
    # a fresh single-layer network learns to imitate the expert's actions
    if args.task == 'CartPole-v0':
        env.spec.reward_threshold = 190  # lower the goal for the clone
    net = Net(1, args.state_shape, device=args.device)
    net = Actor(net, args.action_shape).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='discrete')
    il_test_collector = Collector(
        il_policy,
        DummyVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)])
    )
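    # in 'discrete' mode, ImitationPolicy's update is plain supervised
    # classification on the expert's actions, roughly:
    #     logits, _ = net(batch.obs)
    #     loss = F.cross_entropy(logits, batch.act)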
    train_collector.reset()
    result = offpolicy_trainer(
        il_policy, train_collector, il_test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        il_policy.eval()
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')


if __name__ == '__main__':
    test_a2c_with_il()
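
# example invocation (all flags are defined in get_args above), e.g.:
#     python test_a2c_with_il.py --epoch 5 --render 0.03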