# Tianshou/test/modelbased/test_psrl.py

import argparse
import os
import pprint

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv, SubprocVectorEnv
from tianshou.policy import PSRLPolicy
from tianshou.trainer import onpolicy_trainer


def get_args():
    """Parse the command-line options that configure the PSRL test.

    Tokens on the command line that do not match a declared option are ignored
    (``parse_known_args``), so only the namespace of recognised options is
    returned.
    """
    # (flag, converter, default) for every option that takes a value.
    valued_options = (
        ('--task', str, 'NChain-v0'),
        ('--seed', int, 1),
        ('--buffer-size', int, 50000),
        ('--epoch', int, 5),
        ('--step-per-epoch', int, 1000),
        ('--episode-per-collect', int, 1),
        ('--training-num', int, 1),
        ('--test-num', int, 10),
        ('--logdir', str, 'log'),
        ('--render', float, 0.0),
        ('--rew-mean-prior', float, 0.0),
        ('--rew-std-prior', float, 1.0),
        ('--gamma', float, 0.99),
        ('--eps', float, 0.01),
    )
    cli = argparse.ArgumentParser()
    for flag, converter, fallback in valued_options:
        cli.add_argument(flag, type=converter, default=fallback)
    # The only boolean switch: present -> True, absent -> False.
    cli.add_argument('--add-done-loop', action="store_true", default=False)
    known, _ignored = cli.parse_known_args()
    return known


def test_psrl(args=get_args()):
    """Train a PSRL policy on a gym task and check the reward it reaches.

    :param args: option namespace as returned by ``get_args``. The attributes
        ``state_shape`` and ``action_shape`` are added to it by this function.

    When imported (e.g. by a test runner) the function asserts that the
    trainer's ``best_reward`` meets ``env.spec.reward_threshold`` if one is
    set. When the file is run as a script it instead prints the training
    result and evaluates the policy for ``args.test_num`` episodes.

    The summary writer and all environments are closed before returning, also
    when training or the final assertion fails.
    """
    env = gym.make(args.task)
    train_envs = test_envs = None
    try:
        if args.task == "NChain-v0":
            env.spec.reward_threshold = 3400
            # env.spec.reward_threshold = 3647  # described in PSRL paper
        print("reward threshold:", env.spec.reward_threshold)
        # Use the space's shape, falling back to ``.n`` when the shape is falsy.
        # The values are used as array dimensions below, so they must be ints
        # (presumably discrete spaces only; confirm before using other tasks).
        args.state_shape = env.observation_space.shape or env.observation_space.n
        args.action_shape = env.action_space.shape or env.action_space.n
        train_envs = DummyVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)]
        )
        test_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)]
        )
        # seed
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        train_envs.seed(args.seed)
        test_envs.seed(args.seed)
        # model
        n_action = args.action_shape
        n_state = args.state_shape
        trans_count_prior = np.ones((n_state, n_action, n_state))
        rew_mean_prior = np.full((n_state, n_action), args.rew_mean_prior)
        rew_std_prior = np.full((n_state, n_action), args.rew_std_prior)
        policy = PSRLPolicy(
            trans_count_prior, rew_mean_prior, rew_std_prior, args.gamma, args.eps,
            args.add_done_loop
        )
        # collector
        train_collector = Collector(
            policy,
            train_envs,
            VectorReplayBuffer(args.buffer_size, len(train_envs)),
            exploration_noise=True
        )
        test_collector = Collector(policy, test_envs)
        # log
        log_path = os.path.join(args.logdir, args.task, 'psrl')
        writer = SummaryWriter(log_path)
        try:
            writer.add_text("args", str(args))
        finally:
            # The writer is not handed to the trainer, so this is its only use;
            # close it here to flush the event file and release the handle.
            writer.close()

        def stop_fn(mean_rewards):
            if env.spec.reward_threshold:
                return mean_rewards >= env.spec.reward_threshold
            else:
                return False

        # Pre-fill the buffer with random actions before training starts.
        train_collector.collect(n_step=args.buffer_size, random=True)
        # trainer, test it without logger
        # NOTE(review): the three bare positional values after step_per_epoch
        # are presumably repeat_per_collect=1, episode_per_test and
        # batch_size=0; confirm against the onpolicy_trainer signature.
        result = onpolicy_trainer(
            policy,
            train_collector,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            1,
            args.test_num,
            0,
            episode_per_collect=args.episode_per_collect,
            stop_fn=stop_fn,
            # logger=logger,
            test_in_train=False
        )

        if __name__ == '__main__':
            pprint.pprint(result)
            # Let's watch its performance!
            policy.eval()
            test_envs.seed(args.seed)
            test_collector.reset()
            result = test_collector.collect(
                n_episode=args.test_num, render=args.render
            )
            rews, lens = result["rews"], result["lens"]
            print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
        elif env.spec.reward_threshold:
            assert result["best_reward"] >= env.spec.reward_threshold
    finally:
        # Release the vector envs (SubprocVectorEnv owns worker processes) and
        # the probe env, whether the run succeeded or raised.
        for venv in (train_envs, test_envs):
            if venv is not None:
                venv.close()
        env.close()


# Script entry point: run the PSRL test with options taken from the command line.
if __name__ == '__main__':
    test_psrl()
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import argparse`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`import os`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`import pprint`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00
			`import gym`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`import numpy as np`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import torch`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`from torch.utils.tensorboard import SummaryWriter`

Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`from tianshou.data import Collector, VectorReplayBuffer`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`from tianshou.env import DummyVectorEnv, SubprocVectorEnv`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`from tianshou.policy import PSRLPolicy`
			`from tianshou.trainer import onpolicy_trainer`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00

			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--task', type=str, default='NChain-v0')`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--seed', type=int, default=1)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`parser.add_argument('--buffer-size', type=int, default=50000)`
			`parser.add_argument('--epoch', type=int, default=5)`
Trainer refactor : some definition change (#293) This PR focus on some definition change of trainer to make it more friendly to use and be consistent with typical usage in research papers, typically change `collect-per-step` to `step-per-collect`, add `update-per-step` / `episode-per-collect` accordingly, and modify the documentation. 2021-02-21 13:06:02 +08:00			`parser.add_argument('--step-per-epoch', type=int, default=1000)`
			`parser.add_argument('--episode-per-collect', type=int, default=1)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`parser.add_argument('--training-num', type=int, default=1)`
add vizdoom example, bump version to 0.4.2 (#384) 2021-06-26 18:08:41 +08:00			`parser.add_argument('--test-num', type=int, default=10)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`parser.add_argument('--logdir', type=str, default='log')`
			`parser.add_argument('--render', type=float, default=0.0)`
			`parser.add_argument('--rew-mean-prior', type=float, default=0.0)`
			`parser.add_argument('--rew-std-prior', type=float, default=1.0)`
			`parser.add_argument('--gamma', type=float, default=0.99)`
			`parser.add_argument('--eps', type=float, default=0.01)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--add-done-loop', action="store_true", default=False)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`return parser.parse_known_args()[0]`


			`def test_psrl(args=get_args()):`
			`env = gym.make(args.task)`
			`if args.task == "NChain-v0":`
add vizdoom example, bump version to 0.4.2 (#384) 2021-06-26 18:08:41 +08:00			`env.spec.reward_threshold = 3400`
			`# env.spec.reward_threshold = 3647 # described in PSRL paper`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`print("reward threshold:", env.spec.reward_threshold)`
			`args.state_shape = env.observation_space.shape or env.observation_space.n`
			`args.action_shape = env.action_space.shape or env.action_space.n`
			`# train_envs = gym.make(args.task)`
			`train_envs = DummyVectorEnv(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`[lambda: gym.make(args.task) for _ in range(args.training_num)]`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`# test_envs = gym.make(args.task)`
			`test_envs = SubprocVectorEnv(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`[lambda: gym.make(args.task) for _ in range(args.test_num)]`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`# seed`
			`np.random.seed(args.seed)`
			`torch.manual_seed(args.seed)`
			`train_envs.seed(args.seed)`
			`test_envs.seed(args.seed)`
			`# model`
			`n_action = args.action_shape`
			`n_state = args.state_shape`
			`trans_count_prior = np.ones((n_state, n_action, n_state))`
			`rew_mean_prior = np.full((n_state, n_action), args.rew_mean_prior)`
			`rew_std_prior = np.full((n_state, n_action), args.rew_std_prior)`
			`policy = PSRLPolicy(`
			`trans_count_prior, rew_mean_prior, rew_std_prior, args.gamma, args.eps,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`args.add_done_loop`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`# collector`
			`train_collector = Collector(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`train_envs,`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`VectorReplayBuffer(args.buffer_size, len(train_envs)),`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`exploration_noise=True`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`test_collector = Collector(policy, test_envs)`
			`# log`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`log_path = os.path.join(args.logdir, args.task, 'psrl')`
			`writer = SummaryWriter(log_path)`
			`writer.add_text("args", str(args))`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00
change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`def stop_fn(mean_rewards):`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`if env.spec.reward_threshold:`
change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`return mean_rewards >= env.spec.reward_threshold`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`else:`
			`return False`

			`train_collector.collect(n_step=args.buffer_size, random=True)`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`# trainer, test it without logger`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`result = onpolicy_trainer(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`train_collector,`
			`test_collector,`
			`args.epoch,`
			`args.step_per_epoch,`
			`1,`
			`args.test_num,`
			`0,`
			`episode_per_collect=args.episode_per_collect,`
			`stop_fn=stop_fn,`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`# logger=logger,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`test_in_train=False`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00
			`if __name__ == '__main__':`
			`pprint.pprint(result)`
			`# Let's watch its performance!`
			`policy.eval()`
			`test_envs.seed(args.seed)`
			`test_collector.reset()`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`result = test_collector.collect(n_episode=args.test_num, render=args.render)`
			`rews, lens = result["rews"], result["lens"]`
			`print(f"Final reward: {rews.mean()}, length: {lens.mean()}")`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`elif env.spec.reward_threshold:`
			`assert result["best_reward"] >= env.spec.reward_threshold`


			`if __name__ == '__main__':`
			`test_psrl()`