# Tianshou/test/modelbased/test_psrl.py

import argparse
import os
import pprint

import envpool
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.policy import PSRLPolicy
from tianshou.trainer import onpolicy_trainer
from tianshou.utils import LazyLogger, TensorboardLogger, WandbLogger


def get_args():
    """Parse the command-line options understood by the PSRL test.

    Options are read from ``sys.argv``; arguments the parser does not know
    are ignored rather than rejected, because only the recognised part of
    ``parse_known_args`` is returned.

    :return: an ``argparse.Namespace`` holding every option declared below.
    """
    # (flag, converter, default) for every plain "typed value" option.
    typed_options = (
        ('--task', str, 'NChain-v0'),
        ('--reward-threshold', float, None),
        ('--seed', int, 1),
        ('--buffer-size', int, 50000),
        ('--epoch', int, 5),
        ('--step-per-epoch', int, 1000),
        ('--episode-per-collect', int, 1),
        ('--training-num', int, 1),
        ('--test-num', int, 10),
        ('--logdir', str, 'log'),
        ('--render', float, 0.0),
        ('--rew-mean-prior', float, 0.0),
        ('--rew-std-prior', float, 1.0),
        ('--gamma', float, 0.99),
        ('--eps', float, 0.01),
    )

    parser = argparse.ArgumentParser()
    for flag, converter, fallback in typed_options:
        parser.add_argument(flag, type=converter, default=fallback)

    # Boolean switch: present -> True, absent -> False.
    parser.add_argument('--add-done-loop', action="store_true", default=False)
    # Logging backend, restricted to a closed set of names.
    parser.add_argument(
        '--logger',
        type=str,
        default="wandb",
        choices=["wandb", "tensorboard", "none"],
    )

    recognised, _ignored = parser.parse_known_args()
    return recognised


def test_psrl(args=None):
    """Train a PSRL policy on an envpool task and check the reward it reaches.

    The function builds training and test environments with
    ``envpool.make_gym``, constructs a ``PSRLPolicy`` from constant priors,
    pre-fills the replay buffer with randomly chosen actions, and then runs
    ``onpolicy_trainer``.

    When the module is executed as a script the training result is printed
    and the policy is evaluated once more on the test environments.
    Otherwise (e.g. under a test runner) the best reward is asserted against
    ``env.spec.reward_threshold`` whenever that value is truthy.

    :param args: options as produced by ``get_args()``. When ``None`` (the
        default) they are parsed on demand. Resolving the default inside the
        call, instead of in the signature, avoids parsing ``sys.argv`` at
        import time and avoids sharing one Namespace, which this function
        mutates, between calls.
    """
    if args is None:
        args = get_args()

    train_envs = env = envpool.make_gym(
        args.task, num_envs=args.training_num, seed=args.seed
    )
    test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
    if args.reward_threshold is None:
        # Per-task override; any other task falls back to the env's own spec.
        default_reward_threshold = {"NChain-v0": 3400}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    print("reward threshold:", args.reward_threshold)
    # An empty ``shape`` is falsy, in which case the space's ``n`` is used.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # model
    # NOTE(review): the prior arrays below use these values as dimensions, so
    # both are assumed to be integers (discrete spaces) - confirm for new tasks.
    n_action = args.action_shape
    n_state = args.state_shape
    trans_count_prior = np.ones((n_state, n_action, n_state))
    rew_mean_prior = np.full((n_state, n_action), args.rew_mean_prior)
    rew_std_prior = np.full((n_state, n_action), args.rew_std_prior)
    policy = PSRLPolicy(
        trans_count_prior, rew_mean_prior, rew_std_prior, args.gamma, args.eps,
        args.add_done_loop
    )
    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True
    )
    test_collector = Collector(policy, test_envs)
    # Logger
    if args.logger == "wandb":
        # Created before the SummaryWriter, which is attached via load() below.
        logger = WandbLogger(
            save_interval=1, project='psrl', name='wandb_test', config=args
        )
    if args.logger != "none":
        log_path = os.path.join(args.logdir, args.task, 'psrl')
        writer = SummaryWriter(log_path)
        writer.add_text("args", str(args))
        if args.logger == "tensorboard":
            logger = TensorboardLogger(writer)
        else:
            logger.load(writer)
    else:
        logger = LazyLogger()

    def stop_fn(mean_rewards):
        # Stop training as soon as the configured threshold is reached.
        return mean_rewards >= args.reward_threshold

    # Warm-up: fill the buffer using random actions before training starts.
    train_collector.collect(n_step=args.buffer_size, random=True)
    # trainer, using the logger selected above
    result = onpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        1,  # NOTE(review): presumably repeat_per_collect - confirm signature
        args.test_num,
        0,  # NOTE(review): presumably batch_size - confirm signature
        episode_per_collect=args.episode_per_collect,
        stop_fn=stop_fn,
        logger=logger,
        test_in_train=False,
    )

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        policy.eval()
        test_envs.seed(args.seed)
        test_collector.reset()
        result = test_collector.collect(n_episode=args.test_num, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
    # NOTE(review): stop_fn compares against args.reward_threshold, while this
    # check uses env.spec.reward_threshold - confirm the difference is intended.
    elif env.spec.reward_threshold:
        assert result["best_reward"] >= env.spec.reward_threshold


# Script entry point: run the PSRL test directly when executed as a program.
if __name__ == "__main__":
    test_psrl()
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import argparse`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`import os`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`import pprint`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00
upgrade gym version to >=0.21, fix related CI and update examples/atari (#534) Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2022-02-25 07:40:33 +08:00			`import envpool`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`import numpy as np`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import torch`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`from torch.utils.tensorboard import SummaryWriter`

Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`from tianshou.data import Collector, VectorReplayBuffer`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`from tianshou.policy import PSRLPolicy`
			`from tianshou.trainer import onpolicy_trainer`
Improve W&B logger (#441) - rename WandBLogger -> WandbLogger - add save_data and restore_data - allow more input arguments for wandb init - integrate wandb into test/modelbase/test_psrl.py and examples/atari/atari_dqn.py - documentation update 2021-09-24 19:22:23 +05:30			`from tianshou.utils import LazyLogger, TensorboardLogger, WandbLogger`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00

			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--task', type=str, default='NChain-v0')`
Fixed hardcoded reward_treshold (#548) 2022-03-04 03:35:39 +01:00			`parser.add_argument('--reward-threshold', type=float, default=None)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--seed', type=int, default=1)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`parser.add_argument('--buffer-size', type=int, default=50000)`
			`parser.add_argument('--epoch', type=int, default=5)`
Trainer refactor : some definition change (#293) This PR focus on some definition change of trainer to make it more friendly to use and be consistent with typical usage in research papers, typically change `collect-per-step` to `step-per-collect`, add `update-per-step` / `episode-per-collect` accordingly, and modify the documentation. 2021-02-21 13:06:02 +08:00			`parser.add_argument('--step-per-epoch', type=int, default=1000)`
			`parser.add_argument('--episode-per-collect', type=int, default=1)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`parser.add_argument('--training-num', type=int, default=1)`
add vizdoom example, bump version to 0.4.2 (#384) 2021-06-26 18:08:41 +08:00			`parser.add_argument('--test-num', type=int, default=10)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`parser.add_argument('--logdir', type=str, default='log')`
			`parser.add_argument('--render', type=float, default=0.0)`
			`parser.add_argument('--rew-mean-prior', type=float, default=0.0)`
			`parser.add_argument('--rew-std-prior', type=float, default=1.0)`
			`parser.add_argument('--gamma', type=float, default=0.99)`
			`parser.add_argument('--eps', type=float, default=0.01)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--add-done-loop', action="store_true", default=False)`
Improve W&B logger (#441) - rename WandBLogger -> WandbLogger - add save_data and restore_data - allow more input arguments for wandb init - integrate wandb into test/modelbase/test_psrl.py and examples/atari/atari_dqn.py - documentation update 2021-09-24 19:22:23 +05:30			`parser.add_argument(`
			`'--logger',`
			`type=str,`
			`default="wandb",`
			`choices=["wandb", "tensorboard", "none"],`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`return parser.parse_known_args()[0]`


			`def test_psrl(args=get_args()):`
upgrade gym version to >=0.21, fix related CI and update examples/atari (#534) Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2022-02-25 07:40:33 +08:00			`train_envs = env = envpool.make_gym(`
			`args.task, num_envs=args.training_num, seed=args.seed`
			`)`
			`test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)`
Fixed hardcoded reward_treshold (#548) 2022-03-04 03:35:39 +01:00			`if args.reward_threshold is None:`
			`default_reward_threshold = {"NChain-v0": 3400}`
			`args.reward_threshold = default_reward_threshold.get(`
			`args.task, env.spec.reward_threshold`
			`)`
			`print("reward threshold:", args.reward_threshold)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`args.state_shape = env.observation_space.shape or env.observation_space.n`
			`args.action_shape = env.action_space.shape or env.action_space.n`
			`# seed`
			`np.random.seed(args.seed)`
			`torch.manual_seed(args.seed)`
			`# model`
			`n_action = args.action_shape`
			`n_state = args.state_shape`
			`trans_count_prior = np.ones((n_state, n_action, n_state))`
			`rew_mean_prior = np.full((n_state, n_action), args.rew_mean_prior)`
			`rew_std_prior = np.full((n_state, n_action), args.rew_std_prior)`
			`policy = PSRLPolicy(`
			`trans_count_prior, rew_mean_prior, rew_std_prior, args.gamma, args.eps,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`args.add_done_loop`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`# collector`
			`train_collector = Collector(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`train_envs,`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`VectorReplayBuffer(args.buffer_size, len(train_envs)),`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`exploration_noise=True`
			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`test_collector = Collector(policy, test_envs)`
Improve W&B logger (#441) - rename WandBLogger -> WandbLogger - add save_data and restore_data - allow more input arguments for wandb init - integrate wandb into test/modelbase/test_psrl.py and examples/atari/atari_dqn.py - documentation update 2021-09-24 19:22:23 +05:30			`# Logger`
			`if args.logger == "wandb":`
			`logger = WandbLogger(`
			`save_interval=1, project='psrl', name='wandb_test', config=args`
			`)`
Update WandbLogger implementation (#558) * Use `global_step` as the x-axis for wandb * Use Tensorboard SummaryWritter as core with `wandb.init(..., sync_tensorboard=True)` * Update all atari examples with wandb Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2022-03-06 17:40:47 -05:00			`if args.logger != "none":`
Improve W&B logger (#441) - rename WandBLogger -> WandbLogger - add save_data and restore_data - allow more input arguments for wandb init - integrate wandb into test/modelbase/test_psrl.py and examples/atari/atari_dqn.py - documentation update 2021-09-24 19:22:23 +05:30			`log_path = os.path.join(args.logdir, args.task, 'psrl')`
			`writer = SummaryWriter(log_path)`
			`writer.add_text("args", str(args))`
Update WandbLogger implementation (#558) * Use `global_step` as the x-axis for wandb * Use Tensorboard SummaryWritter as core with `wandb.init(..., sync_tensorboard=True)` * Update all atari examples with wandb Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2022-03-06 17:40:47 -05:00			`if args.logger == "tensorboard":`
			`logger = TensorboardLogger(writer)`
			`else:`
			`logger.load(writer)`
Improve W&B logger (#441) - rename WandBLogger -> WandbLogger - add save_data and restore_data - allow more input arguments for wandb init - integrate wandb into test/modelbase/test_psrl.py and examples/atari/atari_dqn.py - documentation update 2021-09-24 19:22:23 +05:30			`else:`
			`logger = LazyLogger()`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00
change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`def stop_fn(mean_rewards):`
Fixed hardcoded reward_treshold (#548) 2022-03-04 03:35:39 +01:00			`return mean_rewards >= args.reward_threshold`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00
			`train_collector.collect(n_step=args.buffer_size, random=True)`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`# trainer, test it without logger`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`result = onpolicy_trainer(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`train_collector,`
			`test_collector,`
			`args.epoch,`
			`args.step_per_epoch,`
			`1,`
			`args.test_num,`
			`0,`
			`episode_per_collect=args.episode_per_collect,`
			`stop_fn=stop_fn,`
Improve W&B logger (#441) - rename WandBLogger -> WandbLogger - add save_data and restore_data - allow more input arguments for wandb init - integrate wandb into test/modelbase/test_psrl.py and examples/atari/atari_dqn.py - documentation update 2021-09-24 19:22:23 +05:30			`logger=logger,`
			`test_in_train=False,`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`)`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00
			`if __name__ == '__main__':`
			`pprint.pprint(result)`
			`# Let's watch its performance!`
			`policy.eval()`
			`test_envs.seed(args.seed)`
			`test_collector.reset()`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`result = test_collector.collect(n_episode=args.test_num, render=args.render)`
			`rews, lens = result["rews"], result["lens"]`
			`print(f"Final reward: {rews.mean()}, length: {lens.mean()}")`
add PSRL policy (#202) Add PSRL policy in tianshou/policy/modelbase/psrl.py. Co-authored-by: n+e <trinkle23897@cmu.edu> 2020-09-23 20:57:33 +08:00			`elif env.spec.reward_threshold:`
			`assert result["best_reward"] >= env.spec.reward_threshold`


			`if __name__ == '__main__':`
			`test_psrl()`