Fixed hardcoded reward_threshold (#548)
This commit is contained in:
parent c248b4f87e
commit d976a5aa91
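All of the diffs below apply the same change: the tests stop overwriting `env.spec.reward_threshold` in place and instead expose a `--reward-threshold` argument that defaults to `None`; when it is not set, the threshold is resolved from a small per-task default dict, falling back to `env.spec.reward_threshold`, and `stop_fn` compares against `args.reward_threshold`. The following is a minimal standalone sketch of that pattern (not copied from any single file; the helper name `resolve_reward_threshold` is hypothetical and only used here for illustration):

```python
import argparse

import gym


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='CartPole-v0')
    # None means: resolve the threshold later from a per-task default or env.spec
    parser.add_argument('--reward-threshold', type=float, default=None)
    return parser.parse_args()


def resolve_reward_threshold(args, env):
    # hypothetical helper: the patched tests inline this logic instead
    if args.reward_threshold is None:
        default_reward_threshold = {"CartPole-v0": 195}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    return args.reward_threshold


if __name__ == "__main__":
    args = get_args()
    env = gym.make(args.task)
    resolve_reward_threshold(args, env)

    def stop_fn(mean_rewards):
        # the trainer stops once the mean test reward reaches the threshold
        return mean_rewards >= args.reward_threshold
```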
@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -47,13 +48,15 @@ def get_args():
 
 
 def test_ddpg(args=get_args()):
     torch.set_num_threads(1)  # we just need only one thread for NN
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -112,7 +115,7 @@ def test_ddpg(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
@@ -21,6 +21,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=50000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -52,11 +53,14 @@ def get_args():
 
 def test_npg(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -134,7 +138,7 @@ def test_npg(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -56,11 +57,14 @@ def get_args():
 
 def test_ppo(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -129,7 +133,7 @@ def test_ppo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def save_checkpoint_fn(epoch, env_step, gradient_step):
         # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
@@ -23,6 +23,7 @@ from tianshou.utils.net.continuous import Actor, ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -62,12 +63,14 @@ def test_sac_with_il(args=get_args()):
         args.task, num_envs=args.training_num, seed=args.seed
     )
     test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
-    reward_threshold = None
-    if args.task == 'Pendulum-v0':
-        reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # seed
     np.random.seed(args.seed)
@@ -139,7 +142,7 @@ def test_sac_with_il(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
@@ -160,8 +163,8 @@ def test_sac_with_il(args=get_args()):
 
     # here we define an imitation collector with a trivial policy
     policy.eval()
-    if args.task == 'Pendulum-v0':
-        reward_threshold = -300  # lower the goal
+    if args.task.startswith("Pendulum"):
+        args.reward_threshold -= 50  # lower the goal
     net = Actor(
         Net(
             args.state_shape,
@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -50,13 +51,15 @@ def get_args():
 
 
 def test_td3(args=get_args()):
     torch.set_num_threads(1)  # we just need only one thread for NN
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -130,7 +133,7 @@ def test_td3(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
@@ -21,6 +21,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=50000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -55,11 +56,14 @@ def get_args():
 
 def test_trpo(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -138,7 +142,7 @@ def test_trpo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -58,6 +59,11 @@ def test_a2c_with_il(args=get_args()):
     test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # seed
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
@@ -94,7 +100,7 @@ def test_a2c_with_il(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
@@ -19,6 +19,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -58,6 +59,11 @@ def test_c51(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -116,7 +122,7 @@ def test_c51(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -52,6 +53,11 @@ def test_dqn(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -107,7 +113,7 @@ def test_dqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Recurrent
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -48,6 +49,11 @@ def test_drqn(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -94,7 +100,7 @@ def test_drqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         policy.set_eps(args.eps_train)
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import FractionProposalNetwork, FullQuantileFun
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_fqf(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -124,7 +130,7 @@ def test_fqf(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import ImplicitQuantileNetwork
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_iqn(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -118,7 +124,7 @@ def test_iqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=1e-3)
@@ -44,6 +45,11 @@ def test_pg(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -95,7 +101,7 @@ def test_pg(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=3e-4)
@@ -55,6 +56,11 @@ def test_ppo(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -120,7 +126,7 @@ def test_ppo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_qrdqn(args=get_args()):
         env.spec.reward_threshold = 190  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -111,7 +117,7 @@ def test_qrdqn(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -20,6 +20,7 @@ from tianshou.utils.net.discrete import NoisyLinear
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -61,6 +62,11 @@ def test_rainbow(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -130,7 +136,7 @@ def test_rainbow(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annealing, just a demo
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -49,11 +50,13 @@ def get_args():
 
 def test_discrete_sac(args=get_args()):
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 180  # lower the goal
-
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 180}  # lower the goal
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
 
     train_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.training_num)]
@@ -115,7 +118,7 @@ def test_discrete_sac(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = offpolicy_trainer(
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import IntrinsicCuriosityModule
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -71,6 +72,11 @@ def test_dqn_icm(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -146,7 +152,7 @@ def test_dqn_icm(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic, IntrinsicCuriosityModule
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1626)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--lr', type=float, default=3e-4)
@@ -73,6 +74,11 @@ def test_ppo(args=get_args()):
     env = gym.make(args.task)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 195}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -152,7 +158,7 @@ def test_ppo(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     result = onpolicy_trainer(
@@ -16,6 +16,7 @@ from tianshou.utils import LazyLogger, TensorboardLogger, WandbLogger
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='NChain-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--buffer-size', type=int, default=50000)
     parser.add_argument('--epoch', type=int, default=5)
@@ -44,12 +45,12 @@ def test_psrl(args=get_args()):
         args.task, num_envs=args.training_num, seed=args.seed
     )
     test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
-    if args.task == "NChain-v0":
-        reward_threshold = 3400
-        # reward_threshold = 3647  # described in PSRL paper
-    else:
-        reward_threshold = None
-    print("reward threshold:", reward_threshold)
+    if args.reward_threshold is None:
+        default_reward_threshold = {"NChain-v0": 3400}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
+    print("reward threshold:", args.reward_threshold)
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     # seed
@@ -87,10 +88,7 @@ def test_psrl(args=get_args()):
     logger = LazyLogger()
 
     def stop_fn(mean_rewards):
-        if reward_threshold:
-            return mean_rewards >= reward_threshold
-        else:
-            return False
+        return mean_rewards >= args.reward_threshold
 
     train_collector.collect(n_step=args.buffer_size, random=True)
     # trainer, test it without logger
@@ -22,6 +22,7 @@ def expert_file_name():
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
@@ -57,10 +58,13 @@ def get_args():
 def gather_data():
     args = get_args()
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 190  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 190}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # train_envs = gym.make(args.task)
     # you can also use tianshou.env.SubprocVectorEnv
     train_envs = DummyVectorEnv(
@@ -117,7 +121,7 @@ def gather_data():
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def train_fn(epoch, env_step):
         # eps annnealing, just a demo
@@ -23,6 +23,7 @@ def expert_file_name():
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
     parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128])
@@ -65,11 +66,14 @@ def gather_data():
     """Return expert buffer data."""
     args = get_args()
     env = gym.make(args.task)
-    if args.task == 'Pendulum-v0':
-        env.spec.reward_threshold = -250
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]
+    if args.reward_threshold is None:
+        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     # you can also use tianshou.env.SubprocVectorEnv
     # train_envs = gym.make(args.task)
     train_envs = DummyVectorEnv(
@@ -147,7 +151,7 @@ def gather_data():
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     # trainer
     offpolicy_trainer(
@@ -26,6 +26,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64])
    parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -73,8 +74,12 @@ def test_bcq(args=get_args()):
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]  # float
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -1100  # too low?
+    if args.reward_threshold is None:
+        # too low?
+        default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
 
     args.state_dim = args.state_shape[0]
     args.action_dim = args.action_shape[0]
@@ -180,7 +185,7 @@ def test_bcq(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def watch():
         policy.load_state_dict(
@@ -26,6 +26,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='Pendulum-v1')
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64])
     parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -78,8 +79,12 @@ def test_cql(args=get_args()):
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
     args.max_action = env.action_space.high[0]  # float
-    if args.task == 'Pendulum-v1':
-        env.spec.reward_threshold = -1200  # too low?
+    if args.reward_threshold is None:
+        # too low?
+        default_reward_threshold = {"Pendulum-v0": -1200, "Pendulum-v1": -1200}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
 
     args.state_dim = args.state_shape[0]
     args.action_dim = args.action_shape[0]
@@ -177,7 +182,7 @@ def test_cql(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def watch():
         policy.load_state_dict(
@@ -25,6 +25,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="CartPole-v0")
+    parser.add_argument('--reward-threshold', type=float, default=None)
     parser.add_argument("--seed", type=int, default=1626)
     parser.add_argument("--eps-test", type=float, default=0.001)
     parser.add_argument("--lr", type=float, default=3e-4)
@@ -55,10 +56,13 @@ def test_discrete_bcq(args=get_args()):
 def test_discrete_bcq(args=get_args()):
     # envs
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 190  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 190}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     test_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.test_num)]
     )
@@ -108,7 +112,7 @@ def test_discrete_bcq(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     def save_checkpoint_fn(epoch, env_step, gradient_step):
         # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
@@ -24,6 +24,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="CartPole-v0")
+    parser.add_argument("--reward-threshold", type=float, default=None)
     parser.add_argument("--seed", type=int, default=1626)
     parser.add_argument("--eps-test", type=float, default=0.001)
     parser.add_argument("--lr", type=float, default=3e-3)
@@ -52,10 +53,13 @@ def test_discrete_cql(args=get_args()):
 def test_discrete_cql(args=get_args()):
     # envs
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 170  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 170}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     test_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.test_num)]
     )
@@ -103,7 +107,7 @@ def test_discrete_cql(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     result = offline_trainer(
         policy,
@@ -25,6 +25,7 @@ else:  # pytest
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="CartPole-v0")
+    parser.add_argument("--reward-threshold", type=float, default=None)
     parser.add_argument("--seed", type=int, default=1626)
     parser.add_argument("--lr", type=float, default=7e-4)
     parser.add_argument("--gamma", type=float, default=0.99)
@@ -50,10 +51,13 @@ def test_discrete_crr(args=get_args()):
 def test_discrete_crr(args=get_args()):
     # envs
     env = gym.make(args.task)
-    if args.task == 'CartPole-v0':
-        env.spec.reward_threshold = 180  # lower the goal
     args.state_shape = env.observation_space.shape or env.observation_space.n
     args.action_shape = env.action_space.shape or env.action_space.n
+    if args.reward_threshold is None:
+        default_reward_threshold = {"CartPole-v0": 180}
+        args.reward_threshold = default_reward_threshold.get(
+            args.task, env.spec.reward_threshold
+        )
     test_envs = DummyVectorEnv(
         [lambda: gym.make(args.task) for _ in range(args.test_num)]
     )
@@ -106,7 +110,7 @@ def test_discrete_crr(args=get_args()):
         torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
 
     def stop_fn(mean_rewards):
-        return mean_rewards >= env.spec.reward_threshold
+        return mean_rewards >= args.reward_threshold
 
     result = offline_trainer(
         policy,