Fixed hardcoded reward_threshold (#548)

Anas BELFADIL 2022-03-04 03:35:39 +01:00 committed by GitHub
parent c248b4f87e
commit d976a5aa91
27 changed files with 194 additions and 70 deletions
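Every file in this commit applies the same three-part change: add an optional --reward-threshold argument, resolve it from a per-task default (falling back to env.spec.reward_threshold) when it is not given, and compare against args.reward_threshold in stop_fn instead of a hardcoded value. The standalone sketch below assembles those pieces in one place; the resolve_reward_threshold helper and the __main__ block are illustrative scaffolding only and do not appear in the commit, and it assumes a gym version where CartPole-v0 is registered.

# Standalone sketch of the pattern applied in every test script below.
# resolve_reward_threshold and the __main__ block are illustrative only.
import argparse

import gym


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='CartPole-v0')
    # None means: resolve from the per-task default or from env.spec
    parser.add_argument('--reward-threshold', type=float, default=None)
    return parser.parse_known_args()[0]


def resolve_reward_threshold(args, env):
    # an explicit CLI value wins; otherwise use the per-task default,
    # falling back to the threshold registered in the gym env spec
    if args.reward_threshold is None:
        default_reward_threshold = {"CartPole-v0": 195}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    return args.reward_threshold


if __name__ == "__main__":
    args = get_args()
    env = gym.make(args.task)
    threshold = resolve_reward_threshold(args, env)

    def stop_fn(mean_rewards):
        # trainers stop once the mean test reward reaches the threshold
        return mean_rewards >= threshold

    print("reward threshold:", threshold)
    print("stop at mean reward 200:", stop_fn(200))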

View File

@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -47,13 +48,15 @@ def get_args():
def test_ddpg(args=get_args()):
torch.set_num_threads(1) # we just need only one thread for NN
env = gym.make(args.task)
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv(
@@ -112,7 +115,7 @@ def test_ddpg(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = offpolicy_trainer(

View File

@@ -21,6 +21,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=50000)
parser.add_argument('--lr', type=float, default=1e-3)
@@ -52,11 +53,14 @@ def get_args():
def test_npg(args=get_args()):
env = gym.make(args.task)
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv(
@@ -134,7 +138,7 @@ def test_npg(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = onpolicy_trainer(

View File

@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=1e-3)
@@ -56,11 +57,14 @@ def get_args():
def test_ppo(args=get_args()):
env = gym.make(args.task)
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv(
@@ -129,7 +133,7 @@ def test_ppo(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def save_checkpoint_fn(epoch, env_step, gradient_step):
# see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html

View File

@@ -23,6 +23,7 @@ from tianshou.utils.net.continuous import Actor, ActorProb, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -62,12 +63,14 @@ def test_sac_with_il(args=get_args()):
args.task, num_envs=args.training_num, seed=args.seed
)
test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
reward_threshold = None
if args.task == 'Pendulum-v0':
reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# seed
np.random.seed(args.seed)
@@ -139,7 +142,7 @@ def test_sac_with_il(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = offpolicy_trainer(
@@ -160,8 +163,8 @@ def test_sac_with_il(args=get_args()):
# here we define an imitation collector with a trivial policy
policy.eval()
if args.task == 'Pendulum-v0':
reward_threshold = -300 # lower the goal
if args.task.startswith("Pendulum"):
args.reward_threshold -= 50 # lower the goal
net = Actor(
Net(
args.state_shape,

View File

@@ -20,6 +20,7 @@ from tianshou.utils.net.continuous import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -50,13 +51,15 @@ def get_args():
def test_td3(args=get_args()):
torch.set_num_threads(1) # we just need only one thread for NN
env = gym.make(args.task)
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv(
@@ -130,7 +133,7 @@ def test_td3(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = offpolicy_trainer(

View File

@@ -21,6 +21,7 @@ from tianshou.utils.net.continuous import ActorProb, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=50000)
parser.add_argument('--lr', type=float, default=1e-3)
@@ -55,11 +56,14 @@ def get_args():
def test_trpo(args=get_args()):
env = gym.make(args.task)
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv(
@@ -138,7 +142,7 @@ def test_trpo(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = onpolicy_trainer(

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=1e-3)
@@ -58,6 +59,11 @@ def test_a2c_with_il(args=get_args()):
test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
@@ -94,7 +100,7 @@ def test_a2c_with_il(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = onpolicy_trainer(

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.common import Net
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -58,6 +59,11 @@ def test_c51(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -116,7 +122,7 @@ def test_c51(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -52,6 +53,11 @@ def test_dqn(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -107,7 +113,7 @@ def test_dqn(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Recurrent
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -48,6 +49,11 @@ def test_drqn(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -94,7 +100,7 @@ def test_drqn(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
policy.set_eps(args.eps_train)

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import FractionProposalNetwork, FullQuantileFun
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_fqf(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -124,7 +130,7 @@ def test_fqf(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import ImplicitQuantileNetwork
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_iqn(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -118,7 +124,7 @@ def test_iqn(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=1e-3)
@@ -44,6 +45,11 @@ def test_pg(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -95,7 +101,7 @@ def test_pg(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = onpolicy_trainer(

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=3e-4)
@@ -55,6 +56,11 @@ def test_ppo(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -120,7 +126,7 @@ def test_ppo(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = onpolicy_trainer(

View File

@@ -18,6 +18,7 @@ from tianshou.utils.net.common import Net
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -55,6 +56,11 @@ def test_qrdqn(args=get_args()):
env.spec.reward_threshold = 190 # lower the goal
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -111,7 +117,7 @@ def test_qrdqn(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -20,6 +20,7 @@ from tianshou.utils.net.discrete import NoisyLinear
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -61,6 +62,11 @@ def test_rainbow(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -130,7 +136,7 @@ def test_rainbow(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annealing, just a demo

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -49,11 +50,13 @@ def get_args():
def test_discrete_sac(args=get_args()):
env = gym.make(args.task)
if args.task == 'CartPole-v0':
env.spec.reward_threshold = 180 # lower the goal
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 180} # lower the goal
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
train_envs = DummyVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.training_num)]
@@ -115,7 +118,7 @@ def test_discrete_sac(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = offpolicy_trainer(

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import IntrinsicCuriosityModule
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -71,6 +72,11 @@ def test_dqn_icm(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -146,7 +152,7 @@ def test_dqn_icm(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -19,6 +19,7 @@ from tianshou.utils.net.discrete import Actor, Critic, IntrinsicCuriosityModule
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=3e-4)
@@ -73,6 +74,11 @@ def test_ppo(args=get_args()):
env = gym.make(args.task)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 195}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -152,7 +158,7 @@ def test_ppo(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
result = onpolicy_trainer(

View File

@@ -16,6 +16,7 @@ from tianshou.utils import LazyLogger, TensorboardLogger, WandbLogger
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='NChain-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--buffer-size', type=int, default=50000)
parser.add_argument('--epoch', type=int, default=5)
@@ -44,12 +45,12 @@ def test_psrl(args=get_args()):
args.task, num_envs=args.training_num, seed=args.seed
)
test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed)
if args.task == "NChain-v0":
reward_threshold = 3400
# reward_threshold = 3647 # described in PSRL paper
else:
reward_threshold = None
print("reward threshold:", reward_threshold)
if args.reward_threshold is None:
default_reward_threshold = {"NChain-v0": 3400}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
print("reward threshold:", args.reward_threshold)
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
# seed
@@ -87,10 +88,7 @@ def test_psrl(args=get_args()):
logger = LazyLogger()
def stop_fn(mean_rewards):
if reward_threshold:
return mean_rewards >= reward_threshold
else:
return False
return mean_rewards >= args.reward_threshold
train_collector.collect(n_step=args.buffer_size, random=True)
# trainer, test it without logger

View File

@@ -22,6 +22,7 @@ def expert_file_name():
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='CartPole-v0')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--eps-test', type=float, default=0.05)
parser.add_argument('--eps-train', type=float, default=0.1)
@@ -57,10 +58,13 @@ def get_args():
def gather_data():
args = get_args()
env = gym.make(args.task)
if args.task == 'CartPole-v0':
env.spec.reward_threshold = 190 # lower the goal
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 190}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# train_envs = gym.make(args.task)
# you can also use tianshou.env.SubprocVectorEnv
train_envs = DummyVectorEnv(
@@ -117,7 +121,7 @@ def gather_data():
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def train_fn(epoch, env_step):
# eps annnealing, just a demo

View File

@@ -23,6 +23,7 @@ def expert_file_name():
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128])
@@ -65,11 +66,14 @@ def gather_data():
"""Return expert buffer data."""
args = get_args()
env = gym.make(args.task)
if args.task == 'Pendulum-v0':
env.spec.reward_threshold = -250
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]
if args.reward_threshold is None:
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)
train_envs = DummyVectorEnv(
@@ -147,7 +151,7 @@ def gather_data():
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
# trainer
offpolicy_trainer(

View File

@@ -26,6 +26,7 @@ else: # pytest
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64])
parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -73,8 +74,12 @@ def test_bcq(args=get_args()):
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0] # float
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -1100 # too low?
if args.reward_threshold is None:
# too low?
default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
args.state_dim = args.state_shape[0]
args.action_dim = args.action_shape[0]
@@ -180,7 +185,7 @@ def test_bcq(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def watch():
policy.load_state_dict(

View File

@@ -26,6 +26,7 @@ else: # pytest
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v1')
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64])
parser.add_argument('--actor-lr', type=float, default=1e-3)
@@ -78,8 +79,12 @@ def test_cql(args=get_args()):
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0] # float
if args.task == 'Pendulum-v1':
env.spec.reward_threshold = -1200 # too low?
if args.reward_threshold is None:
# too low?
default_reward_threshold = {"Pendulum-v0": -1200, "Pendulum-v1": -1200}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
args.state_dim = args.state_shape[0]
args.action_dim = args.action_shape[0]
@@ -177,7 +182,7 @@ def test_cql(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def watch():
policy.load_state_dict(

View File

@@ -25,6 +25,7 @@ else: # pytest
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, default="CartPole-v0")
parser.add_argument('--reward-threshold', type=float, default=None)
parser.add_argument("--seed", type=int, default=1626)
parser.add_argument("--eps-test", type=float, default=0.001)
parser.add_argument("--lr", type=float, default=3e-4)
@@ -55,10 +56,13 @@ def get_args():
def test_discrete_bcq(args=get_args()):
# envs
env = gym.make(args.task)
if args.task == 'CartPole-v0':
env.spec.reward_threshold = 190 # lower the goal
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 190}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
test_envs = DummyVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)]
)
@@ -108,7 +112,7 @@ def test_discrete_bcq(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
def save_checkpoint_fn(epoch, env_step, gradient_step):
# see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html

View File

@@ -24,6 +24,7 @@ else: # pytest
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, default="CartPole-v0")
parser.add_argument("--reward-threshold", type=float, default=None)
parser.add_argument("--seed", type=int, default=1626)
parser.add_argument("--eps-test", type=float, default=0.001)
parser.add_argument("--lr", type=float, default=3e-3)
@@ -52,10 +53,13 @@ def get_args():
def test_discrete_cql(args=get_args()):
# envs
env = gym.make(args.task)
if args.task == 'CartPole-v0':
env.spec.reward_threshold = 170 # lower the goal
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 170}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
test_envs = DummyVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)]
)
@@ -103,7 +107,7 @@ def test_discrete_cql(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
result = offline_trainer(
policy,

View File

@@ -25,6 +25,7 @@ else: # pytest
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, default="CartPole-v0")
parser.add_argument("--reward-threshold", type=float, default=None)
parser.add_argument("--seed", type=int, default=1626)
parser.add_argument("--lr", type=float, default=7e-4)
parser.add_argument("--gamma", type=float, default=0.99)
@@ -50,10 +51,13 @@ def get_args():
def test_discrete_crr(args=get_args()):
# envs
env = gym.make(args.task)
if args.task == 'CartPole-v0':
env.spec.reward_threshold = 180 # lower the goal
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
if args.reward_threshold is None:
default_reward_threshold = {"CartPole-v0": 180}
args.reward_threshold = default_reward_threshold.get(
args.task, env.spec.reward_threshold
)
test_envs = DummyVectorEnv(
[lambda: gym.make(args.task) for _ in range(args.test_num)]
)
@@ -106,7 +110,7 @@ def test_discrete_crr(args=get_args()):
torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))
def stop_fn(mean_rewards):
return mean_rewards >= env.spec.reward_threshold
return mean_rewards >= args.reward_threshold
result = offline_trainer(
policy,