diff --git a/README.md b/README.md
index 292f660..72e5267 100644
--- a/README.md
+++ b/README.md
@@ -20,14 +20,14 @@
 - [Policy Gradient (PG)](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf)
 - [Deep Q-Network (DQN)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
 - [Double DQN (DDQN)](https://arxiv.org/pdf/1509.06461.pdf) with n-step returns
-- [Prioritized DQN (PDQN)](https://arxiv.org/pdf/1511.05952.pdf)
 - [Advantage Actor-Critic (A2C)](https://openai.com/blog/baselines-acktr-a2c/)
 - [Deep Deterministic Policy Gradient (DDPG)](https://arxiv.org/pdf/1509.02971.pdf)
 - [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf)
 - [Twin Delayed DDPG (TD3)](https://arxiv.org/pdf/1802.09477.pdf)
 - [Soft Actor-Critic (SAC)](https://arxiv.org/pdf/1812.05905.pdf)
 - Vanilla Imitation Learning
-- [Generalized Advantage Estimation (GAE)](https://arxiv.org/pdf/1506.02438.pdf)
+- [Prioritized Experience Replay (PER)](https://arxiv.org/pdf/1511.05952.pdf)
+- [Generalized Advantage Estimator (GAE)](https://arxiv.org/pdf/1506.02438.pdf)
 
 Tianshou supports parallel workers for all algorithms as well. All of these algorithms are reformatted as replay-buffer based algorithms. Our team is working on supporting more algorithms and more scenarios on Tianshou in this period of development.
diff --git a/docs/index.rst b/docs/index.rst
index 949763f..fb2440a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,14 +11,14 @@ Welcome to Tianshou!
 * :class:`~tianshou.policy.PGPolicy` `Policy Gradient `_
 * :class:`~tianshou.policy.DQNPolicy` `Deep Q-Network `_
 * :class:`~tianshou.policy.DQNPolicy` `Double DQN `_ with n-step returns
-* :class:`~tianshou.policy.DQNPolicy` `Prioritized DQN `_
 * :class:`~tianshou.policy.A2CPolicy` `Advantage Actor-Critic `_
 * :class:`~tianshou.policy.DDPGPolicy` `Deep Deterministic Policy Gradient `_
 * :class:`~tianshou.policy.PPOPolicy` `Proximal Policy Optimization `_
 * :class:`~tianshou.policy.TD3Policy` `Twin Delayed DDPG `_
 * :class:`~tianshou.policy.SACPolicy` `Soft Actor-Critic `_
 * :class:`~tianshou.policy.ImitationPolicy` Imitation Learning
-* :meth:`~tianshou.policy.BasePolicy.compute_episodic_return` `Generalized Advantage Estimation `_
+* :class:`~tianshou.data.PrioritizedReplayBuffer` `Prioritized Experience Replay `_
+* :meth:`~tianshou.policy.BasePolicy.compute_episodic_return` `Generalized Advantage Estimator `_
 
 Tianshou supports parallel workers for all algorithms as well. All of these algorithms are reformatted as replay-buffer based algorithms.
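The README and docs hunks above reclassify prioritized replay: it is no longer listed as a separate "Prioritized DQN" policy but as a data-layer feature backed by `tianshou.data.PrioritizedReplayBuffer`. A minimal usage sketch, assuming the constructor takes a maximum size plus the usual PER prioritization/importance-sampling exponents (argument names and defaults may differ):

```python
from tianshou.data import PrioritizedReplayBuffer

# 20000 is the maximum buffer size; alpha/beta are the usual PER exponents
# (prioritization and importance sampling) -- names assumed, check the signature.
buf = PrioritizedReplayBuffer(20000, alpha=0.6, beta=0.4)
# buf can then be handed to a Collector wherever a plain ReplayBuffer would be used.
```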
diff --git a/test/base/test_batch.py b/test/base/test_batch.py
index 6ef1e95..ab176b1 100644
--- a/test/base/test_batch.py
+++ b/test/base/test_batch.py
@@ -20,5 +20,14 @@ def test_batch():
     print(batch)
 
 
+def test_batch_over_batch():
+    batch = Batch(a=[3, 4, 5], b=[4, 5, 6])
+    batch2 = Batch(b=batch, c=[6, 7, 8])
+    batch2.b.b[-1] = 0
+    print(batch2)
+    assert batch2[-1].b.b == 0
+
+
 if __name__ == '__main__':
     test_batch()
+    test_batch_over_batch()
diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py
index a1c9d2b..8280f7f 100644
--- a/test/continuous/test_ppo.py
+++ b/test/continuous/test_ppo.py
@@ -8,7 +8,7 @@ from torch.utils.tensorboard import SummaryWriter
 
 from tianshou.env import VectorEnv
 from tianshou.policy import PPOPolicy
-from tianshou.policy.utils import DiagGaussian
+from tianshou.policy.dist import DiagGaussian
 from tianshou.trainer import onpolicy_trainer
 from tianshou.data import Collector, ReplayBuffer
diff --git a/tianshou/data/batch.py b/tianshou/data/batch.py
index 8015b86..1cbf772 100644
--- a/tianshou/data/batch.py
+++ b/tianshou/data/batch.py
@@ -70,7 +70,7 @@ class Batch(object):
         super().__init__()
         self._meta = {}
         for k, v in kwargs.items():
-            if (isinstance(v, list) or isinstance(v, np.ndarray)) \
+            if isinstance(v, (list, np.ndarray)) \
                     and len(v) > 0 and isinstance(v[0], dict) and k != 'info':
                 self._meta[k] = list(v[0].keys())
                 for k_ in v[0].keys():
@@ -78,7 +78,7 @@ class Batch(object):
                     self.__dict__[k__] = np.array([
                         v[i][k_] for i in range(len(v))
                     ])
-            elif isinstance(v, dict) or isinstance(v, Batch):
+            elif isinstance(v, dict):
                 self._meta[k] = list(v.keys())
                 for k_ in v.keys():
                     k__ = '_' + k + '@' + k_
diff --git a/tianshou/data/buffer.py b/tianshou/data/buffer.py
index 163e852..181a6b7 100644
--- a/tianshou/data/buffer.py
+++ b/tianshou/data/buffer.py
@@ -151,7 +151,7 @@ class ReplayBuffer(object):
         if self.__dict__.get(name, None) is None:
             if isinstance(inst, np.ndarray):
                 self.__dict__[name] = np.zeros([self._maxsize, *inst.shape])
-            elif isinstance(inst, dict) or isinstance(inst, Batch):
+            elif isinstance(inst, (dict, Batch)):
                 if name == 'info':
                     self.__dict__[name] = np.array(
                         [{} for _ in range(self._maxsize)])
diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index 9a2d604..4753c66 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -192,15 +192,13 @@ class Collector(object):
             return
         if isinstance(self.state, list):
             self.state[id] = None
-        elif isinstance(self.state, dict) or isinstance(self.state, Batch):
+        elif isinstance(self.state, (dict, Batch)):
             for k in self.state.keys():
                 if isinstance(self.state[k], list):
                     self.state[k][id] = None
-                elif isinstance(self.state[k], torch.Tensor) or \
-                        isinstance(self.state[k], np.ndarray):
+                elif isinstance(self.state[k], (torch.Tensor, np.ndarray)):
                     self.state[k][id] = 0
-        elif isinstance(self.state, torch.Tensor) or \
-                isinstance(self.state, np.ndarray):
+        elif isinstance(self.state, (torch.Tensor, np.ndarray)):
             self.state[id] = 0
 
     def _to_numpy(self, x: Union[
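The new `test_batch_over_batch` test above pins down nested-`Batch` behaviour: with `Batch` removed from the dict-flattening branch in `batch.py`, a `Batch` value is kept as a nested `Batch` (only plain dicts are flattened into `_meta` keys), so attribute access and indexing recurse into it. A small sketch of exactly what the test asserts:

```python
from tianshou.data import Batch

inner = Batch(a=[3, 4, 5], b=[4, 5, 6])
outer = Batch(b=inner, c=[6, 7, 8])  # the Batch value stays a nested Batch
outer.b.b[-1] = 0                    # mutate the nested array in place
assert outer[-1].b.b == 0            # indexing the outer Batch indexes the inner one too
```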
diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py
index 737aaf5..75752d7 100644
--- a/tianshou/policy/base.py
+++ b/tianshou/policy/base.py
@@ -102,7 +102,7 @@ class BasePolicy(ABC, nn.Module):
             gamma: float = 0.99,
             gae_lambda: float = 0.95) -> Batch:
         """Compute returns over given full-length episodes, including the
-        implementation of Generalized Advantage Estimation (arXiv:1506.02438).
+        implementation of Generalized Advantage Estimator (arXiv:1506.02438).
 
         :param batch: a data batch which contains several full-episode data
             chronologically.
diff --git a/tianshou/policy/utils.py b/tianshou/policy/dist.py
similarity index 82%
rename from tianshou/policy/utils.py
rename to tianshou/policy/dist.py
index 56aa035..f1792e4 100644
--- a/tianshou/policy/utils.py
+++ b/tianshou/policy/dist.py
@@ -2,9 +2,7 @@ import torch
 
 
 class DiagGaussian(torch.distributions.Normal):
-    """Diagonal Gaussian Distribution
-
-    """
+    """Diagonal Gaussian distribution."""
 
     def log_prob(self, actions):
         return super().log_prob(actions).sum(-1, keepdim=True)
diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py
index e74e19e..8219fef 100644
--- a/tianshou/policy/modelfree/a2c.py
+++ b/tianshou/policy/modelfree/a2c.py
@@ -55,7 +55,6 @@ class A2CPolicy(PGPolicy):
         self._grad_norm = max_grad_norm
         self._batch = 64
         self._rew_norm = reward_normalization
-        self.__eps = np.finfo(np.float32).eps.item()
 
     def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                    indice: np.ndarray) -> Batch:
@@ -99,7+98,7 @@ class A2CPolicy(PGPolicy):
              **kwargs) -> Dict[str, List[float]]:
         self._batch = batch_size
         r = batch.returns
-        if self._rew_norm and r.std() > self.__eps:
+        if self._rew_norm and not np.isclose(r.std(), 0):
             batch.returns = (r - r.mean()) / r.std()
         losses, actor_losses, vf_losses, ent_losses = [], [], [], []
         for _ in range(repeat):
diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py
index 0a4aa9c..c4cd465 100644
--- a/tianshou/policy/modelfree/ddpg.py
+++ b/tianshou/policy/modelfree/ddpg.py
@@ -71,7 +71,6 @@ class DDPGPolicy(BasePolicy):
         # self.noise = OUNoise()
         self._rm_done = ignore_done
         self._rew_norm = reward_normalization
-        self.__eps = np.finfo(np.float32).eps.item()
 
     def set_eps(self, eps: float) -> None:
         """Set the eps for exploration."""
@@ -102,7 +101,7 @@ class DDPGPolicy(BasePolicy):
         if self._rew_norm:
             bfr = buffer.rew[:min(len(buffer), 1000)]  # avoid large buffer
             mean, std = bfr.mean(), bfr.std()
-            if std > self.__eps:
+            if not np.isclose(std, 0):
                 batch.rew = (batch.rew - mean) / std
         if self._rm_done:
             batch.done = batch.done * 0.
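The policy changes above (A2C and DDPG here, PG and PPO below) drop the private `self.__eps = np.finfo(np.float32).eps.item()` member and guard reward/return normalization with `np.isclose` instead: the division is skipped whenever the standard deviation is numerically zero. A standalone sketch of the pattern in plain NumPy:

```python
import numpy as np

returns = np.array([1.0, 1.0, 1.0])
# Normalize only when the spread is not (numerically) zero; otherwise leave as-is,
# which avoids dividing by zero for constant rewards/returns.
if not np.isclose(returns.std(), 0):
    returns = (returns - returns.mean()) / returns.std()
```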
diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py
index 58df054..8f87d10 100644
--- a/tianshou/policy/modelfree/dqn.py
+++ b/tianshou/policy/modelfree/dqn.py
@@ -10,6 +10,7 @@ from tianshou.data import Batch, ReplayBuffer, PrioritizedReplayBuffer
 
 class DQNPolicy(BasePolicy):
     """Implementation of Deep Q Network. arXiv:1312.5602
+    Implementation of Double Q-Learning. arXiv:1509.06461
 
     :param torch.nn.Module model: a model following the rules in
         :class:`~tianshou.policy.BasePolicy`. (s -> logits)
diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py
index cd0e20a..fd05adb 100644
--- a/tianshou/policy/modelfree/pg.py
+++ b/tianshou/policy/modelfree/pg.py
@@ -36,7 +36,6 @@ class PGPolicy(BasePolicy):
         assert 0 <= discount_factor <= 1, 'discount factor should in [0, 1]'
         self._gamma = discount_factor
         self._rew_norm = reward_normalization
-        self.__eps = np.finfo(np.float32).eps.item()
 
     def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                    indice: np.ndarray) -> Batch:
@@ -83,7 +82,7 @@ class PGPolicy(BasePolicy):
              **kwargs) -> Dict[str, List[float]]:
         losses = []
         r = batch.returns
-        if self._rew_norm and r.std() > self.__eps:
+        if self._rew_norm and not np.isclose(r.std(), 0):
             batch.returns = (r - r.mean()) / r.std()
         for _ in range(repeat):
             for b in batch.split(batch_size):
diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py
index 3f9b5e9..8631dc9 100644
--- a/tianshou/policy/modelfree/ppo.py
+++ b/tianshou/policy/modelfree/ppo.py
@@ -53,7 +53,7 @@ class PPOPolicy(PGPolicy):
                  ent_coef: float = .01,
                  action_range: Optional[Tuple[float, float]] = None,
                  gae_lambda: float = 0.95,
-                 dual_clip: float = None,
+                 dual_clip: Optional[float] = None,
                  value_clip: bool = True,
                  reward_normalization: bool = True,
                  **kwargs) -> None:
@@ -74,13 +74,12 @@ class PPOPolicy(PGPolicy):
         self._dual_clip = dual_clip
         self._value_clip = value_clip
         self._rew_norm = reward_normalization
-        self.__eps = np.finfo(np.float32).eps.item()
 
     def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                    indice: np.ndarray) -> Batch:
         if self._rew_norm:
             mean, std = batch.rew.mean(), batch.rew.std()
-            if std > self.__eps:
+            if not np.isclose(std, 0):
                 batch.rew = (batch.rew - mean) / std
         if self._lambda in [0, 1]:
             return self.compute_episodic_return(
@@ -140,12 +139,12 @@ class PPOPolicy(PGPolicy):
         ).reshape(batch.v.shape)
         if self._rew_norm:
             mean, std = batch.returns.mean(), batch.returns.std()
-            if std > self.__eps:
+            if not np.isclose(std.item(), 0):
                 batch.returns = (batch.returns - mean) / std
         batch.adv = batch.returns - batch.v
         if self._rew_norm:
             mean, std = batch.adv.mean(), batch.adv.std()
-            if std > self.__eps:
+            if not np.isclose(std.item(), 0):
                 batch.adv = (batch.adv - mean) / std
         for _ in range(repeat):
             for b in batch.split(batch_size):
diff --git a/tianshou/policy/modelfree/sac.py b/tianshou/policy/modelfree/sac.py
index 6e1a8f5..e409438 100644
--- a/tianshou/policy/modelfree/sac.py
+++ b/tianshou/policy/modelfree/sac.py
@@ -6,7 +6,7 @@ from typing import Dict, Tuple, Union, Optional
 
 from tianshou.data import Batch
 from tianshou.policy import DDPGPolicy
-from tianshou.policy.utils import DiagGaussian
+from tianshou.policy.dist import DiagGaussian
 
 
 class SACPolicy(DDPGPolicy):
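`DiagGaussian` now lives in `tianshou.policy.dist` (renamed from `tianshou.policy.utils`), and both `test_ppo.py` and `sac.py` import it from there. A minimal sketch of what the class provides, based on the `log_prob` override shown in the rename hunk:

```python
import torch
from tianshou.policy.dist import DiagGaussian  # moved here from tianshou.policy.utils

# DiagGaussian subclasses torch.distributions.Normal and sums per-dimension
# log-probabilities over the last (action) dimension, keeping that dim with size 1.
dist = DiagGaussian(loc=torch.zeros(3), scale=torch.ones(3))
action = dist.sample()        # shape: (3,)
logp = dist.log_prob(action)  # shape: (1,) -- one joint log-likelihood per action
```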