From d976a5aa913218634e62957f35621f8f43f38f77 Mon Sep 17 00:00:00 2001
From: Anas BELFADIL <56280198+BFAnas@users.noreply.github.com>
Date: Fri, 4 Mar 2022 03:35:39 +0100
Subject: [PATCH] Fixed hardcoded reward_treshold (#548)

---
 test/continuous/test_ddpg.py         | 11 +++++++----
 test/continuous/test_npg.py          | 10 +++++++---
 test/continuous/test_ppo.py          | 10 +++++++---
 test/continuous/test_sac_with_il.py  | 15 +++++++++------
 test/continuous/test_td3.py          | 11 +++++++----
 test/continuous/test_trpo.py         | 10 +++++++---
 test/discrete/test_a2c_with_il.py    |  8 +++++++-
 test/discrete/test_c51.py            |  8 +++++++-
 test/discrete/test_dqn.py            |  8 +++++++-
 test/discrete/test_drqn.py           |  8 +++++++-
 test/discrete/test_fqf.py            |  8 +++++++-
 test/discrete/test_iqn.py            |  8 +++++++-
 test/discrete/test_pg.py             |  8 +++++++-
 test/discrete/test_ppo.py            |  8 +++++++-
 test/discrete/test_qrdqn.py          |  8 +++++++-
 test/discrete/test_rainbow.py        |  8 +++++++-
 test/discrete/test_sac.py            | 11 +++++++----
 test/modelbased/test_dqn_icm.py      |  8 +++++++-
 test/modelbased/test_ppo_icm.py      |  8 +++++++-
 test/modelbased/test_psrl.py         | 18 ++++++++----------
 test/offline/gather_cartpole_data.py | 10 +++++++---
 test/offline/gather_pendulum_data.py | 10 +++++++---
 test/offline/test_bcq.py             | 11 ++++++++---
 test/offline/test_cql.py             | 11 ++++++++---
 test/offline/test_discrete_bcq.py    | 10 +++++++---
 test/offline/test_discrete_cql.py    | 10 +++++++---
 test/offline/test_discrete_crr.py    | 10 +++++++---
 27 files changed, 194 insertions(+), 70 deletions(-)

diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py
index 4f76bb8..84885ff 100644
--- a/test/continuous/test_ddpg.py
+++ b/test/continuous/test_ddpg.py
@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -47,13 +48,15 @@ def get_args():
 
 
 def test_ddpg(args=get_args()):
-    torch.set_num_threads(1)  # we just need only one thread for NN
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -112,7 +115,7 @@ def test_ddpg(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
diff --git a/test/continuous/test_npg.py b/test/continuous/test_npg.py
index d369bfd..4833350 100644
--- a/test/continuous/test_npg.py
+++ b/test/continuous/test_npg.py
@@ -21,6 +21,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=50000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -52,11 +53,14 @@ def get_args():
 
 def test_npg(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -134,7 +138,7 @@ def test_npg(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py
index 4e95e92..9f4af9b 100644
--- a/test/continuous/test_ppo.py
+++ b/test/continuous/test_ppo.py
@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -56,11 +57,14 @@ def get_args():
 
 def test_ppo(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -129,7 +133,7 @@ def test_ppo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def save_checkpoint_fn(epoch, env_step, gradient_step):
         # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py
index 7eaf879..b2287a2 100644
--- a/test/continuous/test_sac_with_il.py
+++ b/test/continuous/test_sac_with_il.py
@@ -23,6 +23,7 @@ from tianshou.utils.net.continuous import Actor, ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -62,12 +63,14 @@ def test_sac_with_il(args=get_args()):
         args.task, num_envs=args.training_num, seed=args.seed
     )
     test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
-    reward_threshold = None
-    if args.task == 'Pendulum-v0':
-        reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # seed
     np.random.seed(args.seed)
@@ -139,7 +142,7 @@ def test_sac_with_il(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
@@ -160,8 +163,8 @@ def test_sac_with_il(args=get_args()):
 
     # here we define an imitation collector with a trivial policy
     policy.eval()
-    if args.task == 'Pendulum-v0':
-        reward_threshold = -300  # lower the goal
+    if args.task.startswith("Pendulum"):
+        args.reward_threshold -= 50  # lower the goal
     net = Actor(
         Net(
             args.state_shape,
diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py
index 023c02a..64df9c4 100644
--- a/test/continuous/test_td3.py
+++ b/test/continuous/test_td3.py
@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -50,13 +51,15 @@ def get_args():
 
 
 def test_td3(args=get_args()):
-    torch.set_num_threads(1)  # we just need only one thread for NN
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -130,7 +133,7 @@ def test_td3(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
diff --git a/test/continuous/test_trpo.py b/test/continuous/test_trpo.py
index 2824e6e..92228b7 100644
--- a/test/continuous/test_trpo.py
+++ b/test/continuous/test_trpo.py
@@ -21,6 +21,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=50000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -55,11 +56,14 @@ def test_trpo(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -138,7 +142,7 @@ def test_trpo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
diff --git a/test/discrete/test_a2c_with_il.py b/test/discrete/test_a2c_with_il.py
index ec74484..977aafa 100644
--- a/test/discrete/test_a2c_with_il.py
+++ b/test/discrete/test_a2c_with_il.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -58,6 +59,11 @@ def test_a2c_with_il(args=get_args()):
     test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # seed
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
@@ -94,7 +100,7 @@ def test_a2c_with_il(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
diff --git a/test/discrete/test_c51.py b/test/discrete/test_c51.py
index 3c74723..fa1db2f 100644
--- a/test/discrete/test_c51.py
+++ b/test/discrete/test_c51.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -58,6 +59,11 @@ def test_c51(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -116,7 +122,7 @@ def test_c51(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/discrete/test_dqn.py b/test/discrete/test_dqn.py
index c028664..d0a9f08 100644
--- a/test/discrete/test_dqn.py
+++ b/test/discrete/test_dqn.py
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -52,6 +53,11 @@ def test_dqn(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -107,7 +113,7 @@ def test_dqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/discrete/test_drqn.py b/test/discrete/test_drqn.py
index 064dbba..496420a 100644
--- a/test/discrete/test_drqn.py
+++ b/test/discrete/test_drqn.py
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Recurrent
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -48,6 +49,11 @@ def test_drqn(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -94,7 +100,7 @@ def test_drqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         policy.set_eps(args.eps_train)
diff --git a/test/discrete/test_fqf.py b/test/discrete/test_fqf.py
index e952294..73cc2bd 100644
--- a/test/discrete/test_fqf.py
+++ b/test/discrete/test_fqf.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import FractionProposalNetwork, FullQuantileFun
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_fqf(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -124,7 +130,7 @@ def test_fqf(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/discrete/test_iqn.py b/test/discrete/test_iqn.py
index c93ddfc..c7e074c 100644
--- a/test/discrete/test_iqn.py
+++ b/test/discrete/test_iqn.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import ImplicitQuantileNetwork
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_iqn(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -118,7 +124,7 @@ def test_iqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/discrete/test_pg.py b/test/discrete/test_pg.py
index fafd7cc..c3210b2 100644
--- a/test/discrete/test_pg.py
+++ b/test/discrete/test_pg.py
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -44,6 +45,11 @@ def test_pg(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -95,7 +101,7 @@ def test_pg(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py
index da74b7a..c0b7e1b 100644
--- a/test/discrete/test_ppo.py
+++ b/test/discrete/test_ppo.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=3e-4)
@@ -55,6 +56,11 @@ def test_ppo(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -120,7 +126,7 @@ def test_ppo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
diff --git a/test/discrete/test_qrdqn.py b/test/discrete/test_qrdqn.py
index 956cb03..eaa4dc0 100644
--- a/test/discrete/test_qrdqn.py
+++ b/test/discrete/test_qrdqn.py
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_qrdqn(args=get_args()):
         env.spec.reward_threshold = 190  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -111,7 +117,7 @@ def test_qrdqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/discrete/test_rainbow.py b/test/discrete/test_rainbow.py
index b226a02..1e4d0a3 100644
--- a/test/discrete/test_rainbow.py
+++ b/test/discrete/test_rainbow.py
@@ -20,6 +20,7 @@ from tianshou.utils.net.discrete import NoisyLinear
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -61,6 +62,11 @@ def test_rainbow(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -130,7 +136,7 @@ def test_rainbow(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annealing, just a demo
diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py
index 118a296..d532682 100644
--- a/test/discrete/test_sac.py
+++ b/test/discrete/test_sac.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -49,11 +50,13 @@ def get_args():
 
 def test_discrete_sac(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 180  # lower the goal
-
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 180}  # lower the goal
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     train_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.training_num)]
     )
@@ -115,7 +118,7 @@ def test_discrete_sac(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
diff --git a/test/modelbased/test_dqn_icm.py b/test/modelbased/test_dqn_icm.py
index 64bcda6..d5d9aaa 100644
--- a/test/modelbased/test_dqn_icm.py
+++ b/test/modelbased/test_dqn_icm.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import IntrinsicCuriosityModule
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -71,6 +72,11 @@ def test_dqn_icm(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -146,7 +152,7 @@ def test_dqn_icm(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/modelbased/test_ppo_icm.py b/test/modelbased/test_ppo_icm.py
index f548197..f87f51a 100644
--- a/test/modelbased/test_ppo_icm.py
+++ b/test/modelbased/test_ppo_icm.py
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic, IntrinsicCuriosityModule
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=3e-4)
@@ -73,6 +74,11 @@ def test_ppo(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -152,7 +158,7 @@ def test_ppo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
diff --git a/test/modelbased/test_psrl.py b/test/modelbased/test_psrl.py
index d650108..a20d3dd 100644
--- a/test/modelbased/test_psrl.py
+++ b/test/modelbased/test_psrl.py
@@ -16,6 +16,7 @@ from tianshou.utils import LazyLogger, TensorboardLogger, WandbLogger
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='NChain-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=50000)
     parser.add_argument('--epoch', type=int, default=5)
@@ -44,12 +45,12 @@ def test_psrl(args=get_args()):
         args.task, num_envs=args.training_num, seed=args.seed
     )
     test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
-    if args.task == "NChain-v0":
-        reward_threshold = 3400
-        # reward_threshold = 3647  # described in PSRL paper
-    else:
-        reward_threshold = None
-    print("reward threshold:", reward_threshold)
+    if args.reward_threshold is None:
+        default_reward_threshold = {"NChain-v0": 3400}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
+    print("reward threshold:", args.reward_threshold)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     # seed
@@ -87,10 +88,7 @@ def test_psrl(args=get_args()):
     logger = LazyLogger()
 
     def stop_fn(mean_rewards):
-        if reward_threshold:
-            return mean_rewards >= reward_threshold
-        else:
-            return False
+        return mean_rewards >= args.reward_threshold
 
     train_collector.collect(n_step=args.buffer_size, random=True)
     # trainer, test it without logger
diff --git a/test/offline/gather_cartpole_data.py b/test/offline/gather_cartpole_data.py
index eeddf1a..12abf8b 100644
--- a/test/offline/gather_cartpole_data.py
+++ b/test/offline/gather_cartpole_data.py
@@ -22,6 +22,7 @@ def expert_file_name():
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -57,10 +58,13 @@ def get_args():
 def gather_data():
     args = get_args()
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 190  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 190}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -117,7 +121,7 @@ def gather_data():
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
diff --git a/test/offline/gather_pendulum_data.py b/test/offline/gather_pendulum_data.py
index 33fac4f..2386458 100644
--- a/test/offline/gather_pendulum_data.py
+++ b/test/offline/gather_pendulum_data.py
@@ -23,6 +23,7 @@ def expert_file_name():
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128])
@@ -65,11 +66,14 @@ def gather_data():
     """Return expert buffer data."""
     args = get_args()
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v0':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -147,7 +151,7 @@ def gather_data():
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     offpolicy_trainer(
diff --git a/test/offline/test_bcq.py b/test/offline/test_bcq.py
index 8a5ec98..ca3c2c9 100644
--- a/test/offline/test_bcq.py
+++ b/test/offline/test_bcq.py
@@ -26,6 +26,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64])
     parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -73,8 +74,12 @@ def test_bcq(args=get_args()):
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]  # float
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -1100  # too low?
+    if args.reward_threshold is None:
+        # too low?
+        default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     args.state_dim = args.state_shape[0]
     args.action_dim = args.action_shape[0]
 
@@ -180,7 +185,7 @@ def test_bcq(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def watch():
         policy.load_state_dict(
diff --git a/test/offline/test_cql.py b/test/offline/test_cql.py
index be43ea2..0969cb9 100644
--- a/test/offline/test_cql.py
+++ b/test/offline/test_cql.py
@@ -26,6 +26,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64])
     parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -78,8 +79,12 @@ def test_cql(args=get_args()):
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]  # float
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -1200  # too low?
+    if args.reward_threshold is None:
+        # too low?
+        default_reward_threshold = {"Pendulum-v0": -1200, "Pendulum-v1": -1200}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     args.state_dim = args.state_shape[0]
     args.action_dim = args.action_shape[0]
 
@@ -177,7 +182,7 @@ def test_cql(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def watch():
         policy.load_state_dict(
diff --git a/test/offline/test_discrete_bcq.py b/test/offline/test_discrete_bcq.py
index e83d13d..1793117 100644
--- a/test/offline/test_discrete_bcq.py
+++ b/test/offline/test_discrete_bcq.py
@@ -25,6 +25,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="CartPole-v0")
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument("--seed", type=int, default=1626)
     parser.add_argument("--eps-test", type=float, default=0.001)
     parser.add_argument("--lr", type=float, default=3e-4)
@@ -55,10 +56,13 @@ def get_args():
 def test_discrete_bcq(args=get_args()):
     # envs
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 190  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 190}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     test_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.test_num)]
     )
@@ -108,7 +112,7 @@ def test_discrete_bcq(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def save_checkpoint_fn(epoch, env_step, gradient_step):
         # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
diff --git a/test/offline/test_discrete_cql.py b/test/offline/test_discrete_cql.py
index eaac481..2f275e8 100644
--- a/test/offline/test_discrete_cql.py
+++ b/test/offline/test_discrete_cql.py
@@ -24,6 +24,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="CartPole-v0")
+    parser.add_argument("--reward-threshold", type=float, default=None)
     parser.add_argument("--seed", type=int, default=1626)
     parser.add_argument("--eps-test", type=float, default=0.001)
     parser.add_argument("--lr", type=float, default=3e-3)
@@ -52,10 +53,13 @@ def get_args():
 def test_discrete_cql(args=get_args()):
     # envs
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 170  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 170}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     test_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.test_num)]
     )
@@ -103,7 +107,7 @@ def test_discrete_cql(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     result = offline_trainer(
         policy,
diff --git a/test/offline/test_discrete_crr.py b/test/offline/test_discrete_crr.py
index 2e8916a..d581780 100644
--- a/test/offline/test_discrete_crr.py
+++ b/test/offline/test_discrete_crr.py
@@ -25,6 +25,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="CartPole-v0")
+    parser.add_argument("--reward-threshold", type=float, default=None)
     parser.add_argument("--seed", type=int, default=1626)
     parser.add_argument("--lr", type=float, default=7e-4)
     parser.add_argument("--gamma", type=float, default=0.99)
@@ -50,10 +51,13 @@ def get_args():
 def test_discrete_crr(args=get_args()):
     # envs
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 180  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 180}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     test_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.test_num)]
     )
@@ -106,7 +110,7 @@ def test_discrete_crr(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     result = offline_trainer(
         policy,
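For reference, the same pattern is applied in every test script above; it can be exercised on its own as in the following sketch. This is not part of the patch: `resolve_reward_threshold` is a hypothetical helper name used only for illustration, and the Pendulum defaults mirror the diff.

import argparse

import gym


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pendulum-v1')
    # None means: look up a per-task default, else fall back to the env spec
    parser.add_argument('--reward-threshold', type=float, default=None)
    return parser.parse_known_args()[0]


def resolve_reward_threshold(args, env):
    """Hypothetical helper mirroring the block added to each test."""
    if args.reward_threshold is None:
        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    return args.reward_threshold


if __name__ == "__main__":
    args = get_args()
    env = gym.make(args.task)
    resolve_reward_threshold(args, env)

    def stop_fn(mean_rewards):
        # the trainers' stop_fn now compares against the resolved threshold
        return mean_rewards >= args.reward_threshold

    print("reward threshold:", args.reward_threshold)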