diff --git a/examples/modelbase/psrl.py b/examples/modelbase/psrl.py
deleted file mode 120000
index 228d259..0000000
--- a/examples/modelbase/psrl.py
+++ /dev/null
@@ -1 +0,0 @@
-../../test/modelbase/test_psrl.py
\ No newline at end of file
diff --git a/examples/modelbase/README.md b/examples/modelbased/README.md
similarity index 100%
rename from examples/modelbase/README.md
rename to examples/modelbased/README.md
diff --git a/examples/modelbased/psrl.py b/examples/modelbased/psrl.py
new file mode 120000
index 0000000..711578b
--- /dev/null
+++ b/examples/modelbased/psrl.py
@@ -0,0 +1 @@
+../../test/modelbased/test_psrl.py
\ No newline at end of file
diff --git a/examples/mujoco/run_experiments.sh b/examples/mujoco/run_experiments.sh
index 3f0f8a9..6b38c2c 100755
--- a/examples/mujoco/run_experiments.sh
+++ b/examples/mujoco/run_experiments.sh
@@ -6,6 +6,6 @@ TASK=$1
 echo "Experiments started."
 for seed in $(seq 0 9)
 do
-  python mujoco_sac.py --task $TASK --epoch 200 --seed $seed --logdir $LOGDIR > ${TASK}_`date '+%m-%d-%H-%M-%S'`_seed_$seed.txt 2>&1
+  python mujoco_sac.py --task $TASK --epoch 200 --seed $seed --logdir $LOGDIR > ${TASK}_`date '+%m-%d-%H-%M-%S'`_seed_$seed.txt 2>&1 &
 done
 echo "Experiments ended."
diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py
index 8fb535a..c064ace 100644
--- a/test/continuous/test_sac_with_il.py
+++ b/test/continuous/test_sac_with_il.py
@@ -20,12 +20,14 @@ def get_args():
     parser.add_argument('--task', type=str, default='Pendulum-v0')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--buffer-size', type=int, default=20000)
-    parser.add_argument('--actor-lr', type=float, default=3e-4)
+    parser.add_argument('--actor-lr', type=float, default=1e-3)
     parser.add_argument('--critic-lr', type=float, default=1e-3)
     parser.add_argument('--il-lr', type=float, default=1e-3)
     parser.add_argument('--gamma', type=float, default=0.99)
     parser.add_argument('--tau', type=float, default=0.005)
     parser.add_argument('--alpha', type=float, default=0.2)
+    parser.add_argument('--auto-alpha', type=int, default=1)
+    parser.add_argument('--alpha-lr', type=float, default=3e-4)
     parser.add_argument('--epoch', type=int, default=5)
     parser.add_argument('--step-per-epoch', type=int, default=24000)
     parser.add_argument('--il-step-per-epoch', type=int, default=500)
@@ -41,7 +43,7 @@ def get_args():
     parser.add_argument('--logdir', type=str, default='log')
     parser.add_argument('--render', type=float, default=0.)
     parser.add_argument('--rew-norm', action="store_true", default=False)
-    parser.add_argument('--n-step', type=int, default=4)
+    parser.add_argument('--n-step', type=int, default=3)
     parser.add_argument(
         '--device', type=str,
         default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -85,6 +87,13 @@ def test_sac_with_il(args=get_args()):
         concat=True, device=args.device)
     critic2 = Critic(net_c2, device=args.device).to(args.device)
     critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
+
+    if args.auto_alpha:
+        target_entropy = -np.prod(env.action_space.shape)
+        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
+        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
+        args.alpha = (target_entropy, log_alpha, alpha_optim)
+
     policy = SACPolicy(
         actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
         tau=args.tau, gamma=args.gamma, alpha=args.alpha,
@@ -135,11 +144,12 @@ def test_sac_with_il(args=get_args()):
         args.action_shape, max_action=args.max_action, device=args.device
     ).to(args.device)
     optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
-    il_policy = ImitationPolicy(net, optim, mode='continuous')
+    il_policy = ImitationPolicy(
+        net, optim, mode='continuous', action_space=env.action_space,
+        action_scaling=True, action_bound_method="clip")
     il_test_collector = Collector(
         il_policy,
-        DummyVectorEnv(
-            [lambda: gym.make(args.task) for _ in range(args.test_num)])
+        DummyVectorEnv([lambda: gym.make(args.task) for _ in range(args.test_num)])
     )
     train_collector.reset()
     result = offpolicy_trainer(
@@ -147,6 +157,7 @@ def test_sac_with_il(args=get_args()):
         args.il_step_per_epoch, args.step_per_collect, args.test_num,
         args.batch_size, stop_fn=stop_fn, save_fn=save_fn, logger=logger)
     assert stop_fn(result['best_reward'])
+
     if __name__ == '__main__':
         pprint.pprint(result)
         # Let's watch its performance!
diff --git a/test/modelbase/__init__.py b/test/modelbased/__init__.py
similarity index 100%
rename from test/modelbase/__init__.py
rename to test/modelbased/__init__.py
diff --git a/test/modelbase/test_psrl.py b/test/modelbased/test_psrl.py
similarity index 100%
rename from test/modelbase/test_psrl.py
rename to test/modelbased/test_psrl.py
diff --git a/tianshou/policy/__init__.py b/tianshou/policy/__init__.py
index a3625ca..0783fde 100644
--- a/tianshou/policy/__init__.py
+++ b/tianshou/policy/__init__.py
@@ -12,7 +12,7 @@ from tianshou.policy.modelfree.sac import SACPolicy
 from tianshou.policy.modelfree.discrete_sac import DiscreteSACPolicy
 from tianshou.policy.imitation.base import ImitationPolicy
 from tianshou.policy.imitation.discrete_bcq import DiscreteBCQPolicy
-from tianshou.policy.modelbase.psrl import PSRLPolicy
+from tianshou.policy.modelbased.psrl import PSRLPolicy
 from tianshou.policy.multiagent.mapolicy import MultiAgentPolicyManager
 
 
diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py
index 238aace..a0dcb33 100644
--- a/tianshou/policy/base.py
+++ b/tianshou/policy/base.py
@@ -12,39 +12,44 @@ from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy
 class BasePolicy(ABC, nn.Module):
     """The base class for any RL policy.
 
-    Tianshou aims to modularizing RL algorithms. It comes into several classes
-    of policies in Tianshou. All of the policy classes must inherit
+    Tianshou aims to modularize RL algorithms. It comes with several classes of
+    policies in Tianshou. All of the policy classes must inherit
     :class:`~tianshou.policy.BasePolicy`.
 
-    A policy class typically has four parts:
+    A policy class typically has the following parts:
 
-    * :meth:`~tianshou.policy.BasePolicy.__init__`: initialize the policy, \
-        including coping the target network and so on;
+    * :meth:`~tianshou.policy.BasePolicy.__init__`: initialize the policy, including \
+        copying the target network and so on;
     * :meth:`~tianshou.policy.BasePolicy.forward`: compute action with given \
        observation;
-    * :meth:`~tianshou.policy.BasePolicy.process_fn`: pre-process data from \
-        the replay buffer (this function can interact with replay buffer);
-    * :meth:`~tianshou.policy.BasePolicy.learn`: update policy with a given \
-        batch of data.
+    * :meth:`~tianshou.policy.BasePolicy.process_fn`: pre-process data from the \
+        replay buffer (this function can interact with replay buffer);
+    * :meth:`~tianshou.policy.BasePolicy.learn`: update policy with a given batch of \
+        data;
+    * :meth:`~tianshou.policy.BasePolicy.post_process_fn`: update the replay buffer \
+        from the learning process (e.g., prioritized replay buffer needs to update \
+        the weight);
+    * :meth:`~tianshou.policy.BasePolicy.update`: the main interface for training, \
+        i.e., `process_fn -> learn -> post_process_fn`.
 
     Most of the policy needs a neural network to predict the action and an
     optimizer to optimize the policy. The rules of self-defined networks are:
 
-    1. Input: observation "obs" (may be a ``numpy.ndarray``, a \
-        ``torch.Tensor``, a dict or any others), hidden state "state" (for RNN \
-        usage), and other information "info" provided by the environment.
-    2. Output: some "logits", the next hidden state "state", and the \
-        intermediate result during policy forwarding procedure "policy". The \
-        "logits" could be a tuple instead of a ``torch.Tensor``. It depends on how\
-        the policy process the network output. For example, in PPO, the return of \
-        the network might be ``(mu, sigma), state`` for Gaussian policy. The \
-        "policy" can be a Batch of torch.Tensor or other things, which will be \
-        stored in the replay buffer, and can be accessed in the policy update \
-        process (e.g. in "policy.learn()", the "batch.policy" is what you need).
+    1. Input: observation "obs" (may be a ``numpy.ndarray``, a ``torch.Tensor``, a \
+        dict or any others), hidden state "state" (for RNN usage), and other \
+        information "info" provided by the environment.
+    2. Output: some "logits", the next hidden state "state", and the intermediate \
+        result during policy forwarding procedure "policy". The "logits" could be a \
+        tuple instead of a ``torch.Tensor``. It depends on how the policy processes \
+        the network output. For example, in PPO, the return of the network might be \
+        ``(mu, sigma), state`` for Gaussian policy. The "policy" can be a Batch of \
+        torch.Tensor or other things, which will be stored in the replay buffer, and \
+        can be accessed in the policy update process (e.g. in "policy.learn()", the \
+        "batch.policy" is what you need).
 
-    Since :class:`~tianshou.policy.BasePolicy` inherits ``torch.nn.Module``,
-    you can use :class:`~tianshou.policy.BasePolicy` almost the same as
-    ``torch.nn.Module``, for instance, loading and saving the model:
+    Since :class:`~tianshou.policy.BasePolicy` inherits ``torch.nn.Module``, you can
+    use :class:`~tianshou.policy.BasePolicy` almost the same as ``torch.nn.Module``,
+    for instance, loading and saving the model:
     ::
 
         torch.save(policy.state_dict(), "policy.pth")
@@ -117,6 +122,15 @@ class BasePolicy(ABC, nn.Module):
             return Batch(..., policy=Batch(log_prob=dist.log_prob(act)))
             # and in the sampled data batch, you can directly use
             # batch.policy.log_prob to get your data.
+
+        .. note::
+
+            In continuous action space, you should do another step "map_action" to get
+            the real action:
+            ::
+
+                act = policy(batch).act  # doesn't map to the target action range
+                act = policy.map_action(act, batch)
         """
         pass
 
diff --git a/tianshou/policy/modelbase/__init__.py b/tianshou/policy/modelbased/__init__.py
similarity index 100%
rename from tianshou/policy/modelbase/__init__.py
rename to tianshou/policy/modelbased/__init__.py
diff --git a/tianshou/policy/modelbase/psrl.py b/tianshou/policy/modelbased/psrl.py
similarity index 100%
rename from tianshou/policy/modelbase/psrl.py
rename to tianshou/policy/modelbased/psrl.py
diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py
index 324467f..e814975 100644
--- a/tianshou/policy/modelfree/ddpg.py
+++ b/tianshou/policy/modelfree/ddpg.py
@@ -27,8 +27,8 @@ class DDPGPolicy(BasePolicy):
     :param bool action_scaling: whether to map actions from range [-1, 1] to range
         [action_spaces.low, action_spaces.high]. Default to True.
     :param str action_bound_method: method to bound action to range [-1, 1], can be
-        either "clip" (for simply clipping the action), "tanh" (for applying tanh
-        squashing) for now, or empty string for no bounding. Default to "clip".
+        either "clip" (for simply clipping the action) or empty string for no bounding.
+        Default to "clip".
     :param Optional[gym.Space] action_space: env's action space, mandatory if you want
         to use option "action_scaling" or "action_bound_method". Default to None.
 
@@ -55,6 +55,9 @@
     ) -> None:
         super().__init__(action_scaling=action_scaling,
                          action_bound_method=action_bound_method, **kwargs)
+        assert action_bound_method != "tanh", "tanh mapping is not supported " \
+            "in policies where action is used as input of critic, because " \
+            "raw action in range (-inf, inf) will cause instability in training"
         if actor is not None and actor_optim is not None:
             self.actor: torch.nn.Module = actor
             self.actor_old = deepcopy(actor)
diff --git a/tianshou/policy/modelfree/sac.py b/tianshou/policy/modelfree/sac.py
index 6fa5911..274cd1c 100644
--- a/tianshou/policy/modelfree/sac.py
+++ b/tianshou/policy/modelfree/sac.py
@@ -37,8 +37,8 @@ class SACPolicy(DDPGPolicy):
     :param bool action_scaling: whether to map actions from range [-1, 1] to range
         [action_spaces.low, action_spaces.high]. Default to True.
     :param str action_bound_method: method to bound action to range [-1, 1], can be
-        either "clip" (for simply clipping the action), "tanh" (for applying tanh
-        squashing) for now, or empty string for no bounding. Default to "tanh".
+        either "clip" (for simply clipping the action) or empty string for no bounding.
+        Default to "clip".
     :param Optional[gym.Space] action_space: env's action space, mandatory if you want
         to use option "action_scaling" or "action_bound_method". Default to None.
 
@@ -63,13 +63,11 @@ class SACPolicy(DDPGPolicy):
         estimation_step: int = 1,
         exploration_noise: Optional[BaseNoise] = None,
         deterministic_eval: bool = True,
-        action_bound_method: str = "tanh",
         **kwargs: Any,
     ) -> None:
         super().__init__(
             None, None, None, None, tau, gamma, exploration_noise,
-            reward_normalization, estimation_step,
-            action_bound_method=action_bound_method, **kwargs)
+            reward_normalization, estimation_step, **kwargs)
         self.actor, self.actor_optim = actor, actor_optim
         self.critic1, self.critic1_old = critic1, deepcopy(critic1)
         self.critic1_old.eval()
@@ -120,20 +118,20 @@ class SACPolicy(DDPGPolicy):
         else:
             act = dist.rsample()
         log_prob = dist.log_prob(act).unsqueeze(-1)
-        if self.action_bound_method == "tanh" and self.action_space is not None:
-            # apply correction for Tanh squashing when computing logprob from Gaussian
-            # You can check out the original SAC paper (arXiv 1801.01290): Eq 21.
-            # in appendix C to get some understanding of this equation.
-            if self.action_scaling:
-                action_scale = to_torch_as(
-                    (self.action_space.high - self.action_space.low) / 2.0, act)
-            else:
-                action_scale = 1.0  # type: ignore
-            squashed_action = torch.tanh(act)
-            log_prob = log_prob - torch.log(
-                action_scale * (1 - squashed_action.pow(2)) + self.__eps
-            ).sum(-1, keepdim=True)
-        return Batch(logits=logits, act=act, state=h, dist=dist, log_prob=log_prob)
+        # apply correction for Tanh squashing when computing logprob from Gaussian
+        # You can check out the original SAC paper (arXiv 1801.01290): Eq 21.
+        # in appendix C to get some understanding of this equation.
+        if self.action_scaling and self.action_space is not None:
+            action_scale = to_torch_as(
+                (self.action_space.high - self.action_space.low) / 2.0, act)
+        else:
+            action_scale = 1.0  # type: ignore
+        squashed_action = torch.tanh(act)
+        log_prob = log_prob - torch.log(
+            action_scale * (1 - squashed_action.pow(2)) + self.__eps
+        ).sum(-1, keepdim=True)
+        return Batch(logits=logits, act=squashed_action,
+                     state=h, dist=dist, log_prob=log_prob)
 
     def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor:
         batch = buffer[indice]  # batch.obs: s_{t+n}
diff --git a/tianshou/policy/modelfree/td3.py b/tianshou/policy/modelfree/td3.py
index 96843a3..b1839cf 100644
--- a/tianshou/policy/modelfree/td3.py
+++ b/tianshou/policy/modelfree/td3.py
@@ -35,8 +35,8 @@ class TD3Policy(DDPGPolicy):
     :param bool action_scaling: whether to map actions from range [-1, 1] to range
         [action_spaces.low, action_spaces.high]. Default to True.
     :param str action_bound_method: method to bound action to range [-1, 1], can be
-        either "clip" (for simply clipping the action), "tanh" (for applying tanh
-        squashing) for now, or empty string for no bounding. Default to "clip".
+        either "clip" (for simply clipping the action) or empty string for no bounding.
+        Default to "clip".
     :param Optional[gym.Space] action_space: env's action space, mandatory if you want
         to use option "action_scaling" or "action_bound_method". Default to None.
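
A note on the sac.py hunk above: forward() now always squashes the sampled Gaussian action with tanh and corrects the log-probability accordingly (SAC paper, arXiv 1801.01290, Eq. 21 in Appendix C), instead of doing so only when action_bound_method == "tanh". The following standalone sketch reproduces that correction outside the library; the shapes, the eps constant, and the scalar action_scale are illustrative assumptions, not Tianshou code.

import torch
from torch.distributions import Independent, Normal

# Standalone sketch (not library code) of the tanh-squashing log-prob
# correction applied in the sac.py hunk above.
eps = torch.finfo(torch.float32).eps  # stand-in for SACPolicy's __eps

mu, sigma = torch.zeros(3), torch.ones(3)   # pretend actor output for a 3-dim action
dist = Independent(Normal(mu, sigma), 1)
u = dist.rsample()                          # pre-squash Gaussian sample
log_prob = dist.log_prob(u).unsqueeze(-1)   # log N(u | mu, sigma)

squashed_action = torch.tanh(u)             # bounded action in (-1, 1)
action_scale = 1.0                          # or (high - low) / 2 when action scaling is on
# change of variables: log pi(a) = log N(u) - sum_i log(scale * (1 - tanh(u_i)^2))
log_prob = log_prob - torch.log(
    action_scale * (1 - squashed_action.pow(2)) + eps
).sum(-1, keepdim=True)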
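
The test change in test_sac_with_il.py passes args.alpha = (target_entropy, log_alpha, alpha_optim) to SACPolicy to enable automatic entropy tuning. How SACPolicy consumes that tuple internally is not shown in this diff; the sketch below illustrates the standard SAC temperature update such a tuple supports, with the batch of log-probabilities faked for illustration.

import numpy as np
import torch

# Sketch of automatic entropy (alpha) tuning with a
# (target_entropy, log_alpha, alpha_optim) tuple, mirroring the setup above.
# The update rule here is the standard one and is an assumption about usage,
# not code taken from this diff.
action_shape = (1,)                          # Pendulum-v0 has a 1-dim action
target_entropy = -np.prod(action_shape)      # common heuristic: -dim(A)
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)

log_prob = torch.randn(256, 1)               # stand-in for the actor's log pi(a|s)
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp().item()               # temperature used in the actor/critic losses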