SAC implementation update (#212)
- replace DiagGaussian with Independent(Normal), which PyTorch already supports
- detach alpha from autograd
- add value/alpha to the result dict (more informative)
- revert #204 to fix #211

Co-authored-by: Trinkle23897 <463003665@qq.com>
This commit is contained in:
parent b86d78766b
commit 16d8e9b051
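The first change above swaps the custom `DiagGaussian` distribution for PyTorch's built-in `Independent(Normal(...), 1)`. A minimal sketch checking that the two are interchangeable (illustrative names only; the one behavioural difference is that the old `DiagGaussian.log_prob` kept a trailing dimension via `keepdim=True`):

```python
import torch
from torch.distributions import Independent, Normal

loc, scale = torch.zeros(4, 3), torch.ones(4, 3)   # batch of 4, 3-dim actions
x = torch.randn(4, 3)

# What the removed DiagGaussian computed: per-dimension Normal log-probs
# summed over the last dimension (it additionally used keepdim=True).
diag_like = Normal(loc, scale).log_prob(x).sum(-1)

# Independent reinterprets the last dimension as an event dimension,
# so log_prob and entropy are summed over it automatically.
dist = Independent(Normal(loc, scale), 1)
print(torch.allclose(diag_like, dist.log_prob(x)))  # True
print(dist.entropy().shape)                         # torch.Size([4])
```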
@@ -1,7 +1,6 @@
 # Bipedal-Hardcore-SAC
 
-- Our default choice: remove the done flag penalty, will soon converge to \~250 reward within 100 epochs (10M env steps, 3~4 hours, see the image below)
+- Our default choice: remove the done flag penalty, will soon converge to \~270 reward within 100 epochs (10M env steps, 3~4 hours, see the image below)
 - If the done penalty is not removed, it converges much slower than before, about 200 epochs (20M env steps) to reach the same performance (\~200 reward)
-- Action noise is only necessary in the beginning. It is a negative impact at the end of the training. Removing it can reach \~255 (our best result under the original env, no done penalty removed).
 
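The "remove the done flag penalty" trick mentioned above is not shown in the hunks below; a minimal sketch of how such a wrapper could look, assuming BipedalWalkerHardcore's usual -100 penalty on falling (illustrative only, not the example's actual `EnvWrapper`):

```python
import gym

class RemoveDonePenalty(gym.Wrapper):
    """Illustrative wrapper: drop the -100 terminal penalty but keep the
    episode termination, so critic targets are not dominated by the penalty."""

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        if done and rew == -100:
            rew = 0.0
        return obs, rew, done, info

# env = RemoveDonePenalty(gym.make("BipedalWalkerHardcore-v3"))
```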
@@ -24,6 +24,8 @@ def get_args():
     parser.add_argument('--gamma', type=float, default=0.99)
     parser.add_argument('--tau', type=float, default=0.005)
     parser.add_argument('--alpha', type=float, default=0.1)
+    parser.add_argument('--auto_alpha', type=int, default=1)
+    parser.add_argument('--alpha_lr', type=float, default=3e-4)
     parser.add_argument('--epoch', type=int, default=100)
     parser.add_argument('--step-per-epoch', type=int, default=10000)
     parser.add_argument('--collect-per-step', type=int, default=10)
@@ -46,7 +48,7 @@ def get_args():
 class EnvWrapper(object):
     """Env wrapper for reward scale, action repeat and action noise"""
 
-    def __init__(self, task, action_repeat=3, reward_scale=5, act_noise=0.3):
+    def __init__(self, task, action_repeat=3, reward_scale=5, act_noise=0.0):
         self._env = gym.make(task)
         self.action_repeat = action_repeat
         self.reward_scale = reward_scale
@@ -109,6 +111,12 @@ def test_sac_bipedal(args=get_args()):
     critic2 = Critic(net_c2, args.device).to(args.device)
     critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
 
+    if args.auto_alpha:
+        target_entropy = -np.prod(env.action_space.shape)
+        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
+        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
+        args.alpha = (target_entropy, log_alpha, alpha_optim)
+
     policy = SACPolicy(
         actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
         args.tau, args.gamma, args.alpha,
Binary image file not shown (size: 40 KiB before, 46 KiB after).
@@ -5,11 +5,11 @@ import pprint
 import argparse
 import numpy as np
 from torch.utils.tensorboard import SummaryWriter
+from torch.distributions import Independent, Normal
 
 from tianshou.policy import PPOPolicy
 from tianshou.env import DummyVectorEnv
 from tianshou.utils.net.common import Net
-from tianshou.policy.dist import DiagGaussian
 from tianshou.trainer import onpolicy_trainer
 from tianshou.data import Collector, ReplayBuffer
 from tianshou.utils.net.continuous import ActorProb, Critic
@@ -84,7 +84,11 @@ def test_ppo(args=get_args()):
             torch.nn.init.zeros_(m.bias)
     optim = torch.optim.Adam(list(
         actor.parameters()) + list(critic.parameters()), lr=args.lr)
-    dist = DiagGaussian
+
+    # replace DiagGaussian with Independent(Normal), which is equivalent
+    # pass *logits to be consistent with policy.forward
+    def dist(*logits):
+        return Independent(Normal(*logits), 1)
     policy = PPOPolicy(
         actor, critic, optim, dist, args.gamma,
         max_grad_norm=args.max_grad_norm,
@@ -1,11 +0,0 @@
-import torch
-
-
-class DiagGaussian(torch.distributions.Normal):
-    """Diagonal Gaussian distribution."""
-
-    def log_prob(self, actions):
-        return super().log_prob(actions).sum(-1, keepdim=True)
-
-    def entropy(self):
-        return super().entropy().sum(-1)
@@ -2,9 +2,9 @@ import torch
 import numpy as np
 from copy import deepcopy
 from typing import Dict, Tuple, Union, Optional
+from torch.distributions import Normal, Independent
 
 from tianshou.policy import DDPGPolicy
-from tianshou.policy.dist import DiagGaussian
 from tianshou.data import Batch, to_torch_as, ReplayBuffer
 from tianshou.exploration import BaseNoise
 
@@ -47,7 +47,8 @@ class SACPolicy(DDPGPolicy):
         explanation.
     """
 
-    def __init__(self,
+    def __init__(
+        self,
         actor: torch.nn.Module,
         actor_optim: torch.optim.Optimizer,
         critic1: torch.nn.Module,
@@ -56,14 +57,16 @@ class SACPolicy(DDPGPolicy):
         critic2_optim: torch.optim.Optimizer,
         tau: float = 0.005,
         gamma: float = 0.99,
-        alpha: Tuple[float, torch.Tensor, torch.optim.Optimizer]
-        or float = 0.2,
+        alpha: Union[
+            float, Tuple[float, torch.Tensor, torch.optim.Optimizer]
+        ] = 0.2,
         action_range: Optional[Tuple[float, float]] = None,
         reward_normalization: bool = False,
         ignore_done: bool = False,
         estimation_step: int = 1,
         exploration_noise: Optional[BaseNoise] = None,
-        **kwargs) -> None:
+        **kwargs
+    ) -> None:
         super().__init__(None, None, None, None, tau, gamma, exploration_noise,
                          action_range, reward_normalization, ignore_done,
                          estimation_step, **kwargs)
@@ -75,14 +78,12 @@ class SACPolicy(DDPGPolicy):
         self.critic2_old.eval()
         self.critic2_optim = critic2_optim
 
-        self._automatic_alpha_tuning = not isinstance(alpha, float)
-        if self._automatic_alpha_tuning:
-            self._target_entropy = alpha[0]
-            assert(alpha[1].shape == torch.Size([1])
-                   and alpha[1].requires_grad)
-            self._log_alpha = alpha[1]
-            self._alpha_optim = alpha[2]
-            self._alpha = self._log_alpha.exp()
+        self._is_auto_alpha = False
+        if isinstance(alpha, tuple):
+            self._is_auto_alpha = True
+            self._target_entropy, self._log_alpha, self._alpha_optim = alpha
+            assert alpha[1].shape == torch.Size([1]) and alpha[1].requires_grad
+            self._alpha = self._log_alpha.detach().exp()
         else:
             self._alpha = alpha
 
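With the new signature, `alpha` may stay a plain float or arrive as the `(target_entropy, log_alpha, alpha_optim)` tuple built in the example script above. A small self-contained sketch that mirrors the `isinstance(alpha, tuple)` branch (the helper name `resolve_alpha` is illustrative, not part of tianshou):

```python
import torch

def resolve_alpha(alpha):
    """Mirror of the branch above: tuple -> auto-tuning, float -> fixed."""
    if isinstance(alpha, tuple):
        target_entropy, log_alpha, alpha_optim = alpha
        assert log_alpha.shape == torch.Size([1]) and log_alpha.requires_grad
        return log_alpha.detach().exp()   # detached, exactly as in __init__
    return alpha

fixed = resolve_alpha(0.2)                               # fixed temperature
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)
auto = resolve_alpha((-4.0, log_alpha, alpha_optim))     # e.g. -dim(action)
print(fixed, auto)                                       # 0.2 tensor([1.])
```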
@@ -111,12 +112,13 @@ class SACPolicy(DDPGPolicy):
         obs = getattr(batch, input)
         logits, h = self.actor(obs, state=state, info=batch.info)
         assert isinstance(logits, tuple)
-        dist = DiagGaussian(*logits)
+        dist = Independent(Normal(*logits), 1)
         x = dist.rsample()
         y = torch.tanh(x)
         act = y * self._action_scale + self._action_bias
         y = self._action_scale * (1 - y.pow(2)) + self.__eps
-        log_prob = dist.log_prob(x) - torch.log(y).sum(-1, keepdim=True)
+        log_prob = dist.log_prob(x).unsqueeze(-1)
+        log_prob = log_prob - torch.log(y).sum(-1, keepdim=True)
         if self._noise is not None and self.training and explorating:
             act += to_torch_as(self._noise(act.shape), act)
         act = act.clamp(self._range[0], self._range[1])
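Since `Independent(Normal(*logits), 1)` already sums the per-dimension log densities, `dist.log_prob(x)` no longer carries the trailing dimension that the old `DiagGaussian` kept with `keepdim=True`; the added `.unsqueeze(-1)` restores it. The subtraction is the usual change-of-variables correction for the tanh squashing, written here to match the code above (s and b are the action scale and bias, ε is `__eps`):

$$a = s\,\tanh(x) + b, \qquad \log \pi(a \mid o) = \sum_i \log \mathcal{N}(x_i \mid \mu_i, \sigma_i) - \sum_i \log\bigl(s\,(1 - \tanh^2(x_i)) + \epsilon\bigr).$$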
@@ -167,13 +169,13 @@ class SACPolicy(DDPGPolicy):
         actor_loss.backward()
         self.actor_optim.step()
 
-        if self._automatic_alpha_tuning:
-            log_prob = (obs_result.log_prob + self._target_entropy).detach()
+        if self._is_auto_alpha:
+            log_prob = obs_result.log_prob.detach() + self._target_entropy
             alpha_loss = -(self._log_alpha * log_prob).mean()
             self._alpha_optim.zero_grad()
             alpha_loss.backward()
             self._alpha_optim.step()
-            self._alpha = self._log_alpha.exp()
+            self._alpha = self._log_alpha.detach().exp()
 
         self.sync_weight()
 
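For reference, the temperature update in this branch is the standard SAC objective, written in terms of log α as in the code:

$$\mathcal{L}(\alpha) = -\,\mathbb{E}_{a \sim \pi}\Bigl[\log\alpha \cdot \bigl(\log\pi(a \mid o) + \mathcal{H}_{\text{target}}\bigr)\Bigr].$$

Here `log_prob` is detached so only `log_alpha` receives a gradient, and the refreshed `self._alpha` is detached before it re-enters the actor and critic losses on the next update, which is what the added `.detach()` calls are for.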
@@ -182,6 +184,7 @@ class SACPolicy(DDPGPolicy):
             'loss/critic1': critic1_loss.item(),
             'loss/critic2': critic2_loss.item(),
         }
-        if self._automatic_alpha_tuning:
+        if self._is_auto_alpha:
             result['loss/alpha'] = alpha_loss.item()
+            result['v/alpha'] = self._alpha.item()
         return result
@@ -77,13 +77,13 @@ def offpolicy_trainer(
     start_time = time.time()
     test_in_train = test_in_train and train_collector.policy == policy
     for epoch in range(1, 1 + max_epoch):
+        # train
+        policy.train()
+        if train_fn:
+            train_fn(epoch)
         with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}',
                        **tqdm_config) as t:
             while t.n < t.total:
-                # collect
-                if train_fn:
-                    train_fn(epoch)
-                policy.eval()
                 result = train_collector.collect(n_step=collect_per_step)
                 data = {}
                 if test_in_train and stop_fn and stop_fn(result['rew']):
@@ -100,10 +100,9 @@ def offpolicy_trainer(
                            start_time, train_collector, test_collector,
                            test_result['rew'])
                     else:
+                        policy.train()
                         if train_fn:
                             train_fn(epoch)
-                # train
-                policy.train()
                 for i in range(update_per_step * min(
                         result['n/st'] // collect_per_step, t.total - t.n)):
                     global_step += collect_per_step
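The trainer hunks above and below (the revert of #204 that fixes #211) move `policy.train()` and `train_fn` to the start of each epoch and restore train mode right after an in-training test, instead of toggling eval/train around every collect step. A self-contained sketch of the restored ordering (all names here are stand-ins, not the actual tianshou trainer API):

```python
import torch

class DummyPolicy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.net = torch.nn.Linear(4, 2)

def collect(policy):           # stand-in for train_collector.collect(...)
    return {"rew": 0.0}

def update(policy):            # stand-in for policy.update(...) / learn(...)
    pass

def evaluate(policy):          # stand-in for the test collector (eval mode)
    policy.eval()
    return {"rew": 0.0}

policy, max_epoch, steps_per_epoch = DummyPolicy(), 2, 3
for epoch in range(1, 1 + max_epoch):
    policy.train()                     # train mode once per epoch
    for _ in range(steps_per_epoch):
        result = collect(policy)       # data collection in train mode
        if result["rew"] > 300:        # optional in-training test
            evaluate(policy)
            policy.train()             # back to train mode before updating
        update(policy)
```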
@@ -77,13 +77,13 @@ def onpolicy_trainer(
     start_time = time.time()
     test_in_train = test_in_train and train_collector.policy == policy
     for epoch in range(1, 1 + max_epoch):
+        # train
+        policy.train()
+        if train_fn:
+            train_fn(epoch)
         with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}',
                        **tqdm_config) as t:
             while t.n < t.total:
-                # collect
-                if train_fn:
-                    train_fn(epoch)
-                policy.eval()
                 result = train_collector.collect(n_episode=collect_per_step)
                 data = {}
                 if test_in_train and stop_fn and stop_fn(result['rew']):
@@ -100,10 +100,9 @@ def onpolicy_trainer(
                            start_time, train_collector, test_collector,
                            test_result['rew'])
                     else:
+                        policy.train()
                         if train_fn:
                             train_fn(epoch)
-                # train
-                policy.train()
                 losses = policy.update(
                     0, train_collector.buffer, batch_size, repeat_per_collect)
                 train_collector.reset_buffer()