import argparse
import os
import pprint

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import DiscreteSACPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.discrete import Actor, Critic


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='CartPole-v0')
    parser.add_argument('--reward-threshold', type=float, default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--alpha-lr', type=float, default=3e-4)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--alpha', type=float, default=0.05)
    parser.add_argument('--auto-alpha', action="store_true", default=False)
    parser.add_argument('--epoch', type=int, default=5)
    parser.add_argument('--step-per-epoch', type=int, default=10000)
    parser.add_argument('--step-per-collect', type=int, default=10)
    parser.add_argument('--update-per-step', type=float, default=0.1)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64])
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.0)
    parser.add_argument('--rew-norm', action="store_true", default=False)
    parser.add_argument('--n-step', type=int, default=3)
    parser.add_argument(
        '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu'
    )
    args = parser.parse_known_args()[0]
    return args


def test_discrete_sac(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    if args.reward_threshold is None:
        default_reward_threshold = {"CartPole-v0": 170}  # lower the goal
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
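    # DummyVectorEnv runs each environment copy sequentially in a single
    # process: training_num envs gather transitions, test_num envs evaluate.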
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)]
    )
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = Actor(net, args.action_shape, softmax_output=False,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c1 = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    critic1 = Critic(net_c1, last_size=args.action_shape,
                     device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    net_c2 = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    critic2 = Critic(net_c2, last_size=args.action_shape,
                     device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
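    # The actor emits raw logits (softmax_output=False); DiscreteSACPolicy
    # builds the categorical distribution itself. Each critic maps a state to
    # one Q-value per discrete action (last_size), which lets discrete SAC
    # take exact expectations over the policy instead of sampling actions.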

    # better not to use auto alpha in CartPole
    if args.auto_alpha:
        target_entropy = 0.98 * np.log(np.prod(args.action_shape))
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        args.alpha = (target_entropy, log_alpha, alpha_optim)

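    # With --auto-alpha, args.alpha becomes the (target_entropy, log_alpha,
    # alpha_optim) tuple built above and the entropy coefficient is tuned
    # automatically; 0.98 * log(|A|) is 98% of the maximum entropy of a
    # policy over |A| discrete actions.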
    policy = DiscreteSACPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        args.tau,
        args.gamma,
        args.alpha,
        estimation_step=args.n_step,
        reward_normalization=args.rew_norm
    )
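    # estimation_step enables n-step return bootstrapping for the TD targets
    # (--n-step defaults to 3, i.e. 3-step returns).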
    # collector
    train_collector = Collector(
        policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs))
    )
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
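    # Collectors drive the policy/environment interaction; VectorReplayBuffer
    # splits its capacity into one segment per training env so transitions
    # from different envs are never interleaved.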
    # log
    log_path = os.path.join(args.logdir, args.task, 'discrete_sac')
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold
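    # save_best_fn checkpoints the best policy found during evaluation;
    # stop_fn halts training once the mean test reward reaches the threshold.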

    # trainer
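    # Each epoch runs step_per_epoch env steps; every step_per_collect steps
    # the trainer performs update_per_step gradient updates per collected
    # step (0.1 -> one update per 10 env steps).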
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
        update_per_step=args.update_per_step,
        test_in_train=False
    )
    assert stop_fn(result['best_reward'])

    if __name__ == '__main__':
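        # Runs only when the script is executed directly; importing
        # test_discrete_sac as a pytest case skips this rendering block.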
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")


if __name__ == '__main__':
    test_discrete_sac()
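    # e.g. `python test_discrete_sac.py --auto-alpha --epoch 10` (script name
    # assumed) enables automatic entropy tuning and trains longer.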