Tianshou/tianshou/data/buffer.py

import numpy as np
from tianshou.data.batch import Batch


class ReplayBuffer(object):
    """:class:`~tianshou.data.ReplayBuffer` stores data generated from
    interaction between the policy and environment. It stores basically 6 types
    of data, as mentioned in :class:`~tianshou.data.Batch`, based on
    ``numpy.ndarray``. Here is the usage:
    ::

        >>> import numpy as np
        >>> from tianshou.data import ReplayBuffer
        >>> buf = ReplayBuffer(size=20)
        >>> for i in range(3):
        ...     buf.add(obs=i, act=i, rew=i, done=i, obs_next=i + 1, info={})
        >>> len(buf)
        3
        >>> buf.obs
        # since we set size = 20, len(buf.obs) == 20.
        array([0., 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
               0., 0., 0., 0.])

        >>> buf2 = ReplayBuffer(size=10)
        >>> for i in range(15):
        ...     buf2.add(obs=i, act=i, rew=i, done=i, obs_next=i + 1, info={})
        >>> len(buf2)
        10
        >>> buf2.obs
        # since its size = 10, it only stores the last 10 steps' result.
        array([10., 11., 12., 13., 14.,  5.,  6.,  7.,  8.,  9.])

        >>> # copy buf2's result into buf (meanwhile keep it chronologically)
        >>> buf.update(buf2)
        >>> buf.obs
        array([ 0.,  1.,  2.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
                0.,  0.,  0.,  0.,  0.,  0.,  0.])

        >>> # get a random sample from buffer
        >>> # the batch_data is equal to buf[indice].
        >>> batch_data, indice = buf.sample(batch_size=4)
        >>> batch_data.obs == buf[indice].obs
        array([ True,  True,  True,  True])

    Since version v0.2.2, :class:`~tianshou.data.ReplayBuffer` supports
    frame_stack sampling (typically for RNN usage) and ignoring storing the
    next observation (save memory):
    ::

        >>> buf = ReplayBuffer(size=9, stack_num=4, ignore_obs_next=True)
        >>> for i in range(16):
        ...     done = i % 5 == 0
        ...     buf.add(obs=i, act=i, rew=i, done=done, obs_next=i + 1)
        >>> print(buf)
        ReplayBuffer(
            obs: [ 9. 10. 11. 12. 13. 14. 15.  7.  8.],
            act: [ 9. 10. 11. 12. 13. 14. 15.  7.  8.],
            rew: [ 9. 10. 11. 12. 13. 14. 15.  7.  8.],
            done: [0. 1. 0. 0. 0. 0. 1. 0. 0.],
            info: [{} {} {} {} {} {} {} {} {}],
        )
        >>> index = np.arange(len(buf))
        >>> print(buf.get(index, 'obs'))
        [[ 7.  7.  8.  9.]
         [ 7.  8.  9. 10.]
         [11. 11. 11. 11.]
         [11. 11. 11. 12.]
         [11. 11. 12. 13.]
         [11. 12. 13. 14.]
         [12. 13. 14. 15.]
         [ 7.  7.  7.  7.]
         [ 7.  7.  7.  8.]]
        >>> # here is another way to get the stacked data
        >>> # (stack only for obs and obs_next)
        >>> abs(buf.get(index, 'obs') - buf[index].obs).sum().sum()
        0.0
        >>> # we can get obs_next through __getitem__, even if it doesn't store
        >>> print(buf[:].obs_next)
        [[ 7.  8.  9. 10.]
         [ 7.  8.  9. 10.]
         [11. 11. 11. 12.]
         [11. 11. 12. 13.]
         [11. 12. 13. 14.]
         [12. 13. 14. 15.]
         [12. 13. 14. 15.]
         [ 7.  7.  7.  8.]
         [ 7.  7.  8.  9.]]
    """

    def __init__(self, size, stack_num=0, ignore_obs_next=False, **kwargs):
        """Create an empty buffer.

        :param size: maximum number of transitions kept; once the buffer is
            full the write position wraps around and overwrites old entries.
        :param stack_num: how many consecutive frames :meth:`get` stacks for
            ``obs`` / ``obs_next``; ``0`` disables stacking.
        :param ignore_obs_next: if ``True``, ``obs_next`` is not stored and is
            rebuilt from ``obs`` when it is requested.
        :param kwargs: accepted and ignored.
        """
        super().__init__()
        self._maxsize = size
        self._stack = stack_num
        self._save_s_ = not ignore_obs_next
        self.reset()

    def __len__(self):
        """Return len(self)."""
        return self._size

    def __repr__(self):
        """Return str(self)."""
        s = self.__class__.__name__ + '(\n'
        flag = False
        # Only public, already-allocated storage arrays are shown.
        for k, v in self.__dict__.items():
            if k[0] != '_' and v is not None:
                # Align continuation lines of multi-line arrays under the
                # first value.
                rpl = '\n' + ' ' * (6 + len(k))
                obj = str(v).replace('\n', rpl)
                s += f'    {k}: {obj},\n'
                flag = True
        if flag:
            s += ')\n'
        else:
            s = self.__class__.__name__ + '()\n'
        return s

    def _add_to_buffer(self, name, inst):
        """Write ``inst`` into the storage array ``name`` at the current
        write position, allocating the array on first use.

        ``None`` values are not stored; the attribute is merely created (as
        ``None``) if it does not exist yet.
        """
        if inst is None:
            if getattr(self, name, None) is None:
                self.__dict__[name] = None
            return
        if self.__dict__.get(name, None) is None:
            if isinstance(inst, np.ndarray):
                self.__dict__[name] = np.zeros([self._maxsize, *inst.shape])
            elif isinstance(inst, dict):
                self.__dict__[name] = np.array(
                    [{} for _ in range(self._maxsize)])
            else:  # assume `inst` is a number
                self.__dict__[name] = np.zeros([self._maxsize])
        # A differently shaped array replaces the storage with a fresh,
        # zero-filled one of the new shape.
        if isinstance(inst, np.ndarray) and \
                self.__dict__[name].shape[1:] != inst.shape:
            self.__dict__[name] = np.zeros([self._maxsize, *inst.shape])
        self.__dict__[name][self._index] = inst

    def update(self, buffer):
        """Copy the data from the given buffer into self, oldest entry first.

        The given buffer is left unchanged; an empty buffer is a no-op.
        """
        if len(buffer) == 0:
            # Nothing to copy (and the modulo below would divide by zero).
            return
        i = begin = buffer._index % len(buffer)
        while True:
            self.add(
                buffer.obs[i], buffer.act[i], buffer.rew[i], buffer.done[i],
                buffer.obs_next[i] if self._save_s_ else None,
                buffer.info[i])
            i = (i + 1) % len(buffer)
            if i == begin:
                break

    def add(self, obs, act, rew, done, obs_next=None, info=None, weight=None):
        """Add one transition into the replay buffer.

        :param info: a dict (typically the last return value of
            ``env.step()``); a fresh empty dict is used when omitted.
        :param weight: not used by this class.
        """
        if info is None:
            # A new dict per call, so stored entries never alias each other.
            info = {}
        assert isinstance(info, dict), \
            'You should return a dict in the last argument of env.step().'
        self._add_to_buffer('obs', obs)
        self._add_to_buffer('act', act)
        self._add_to_buffer('rew', rew)
        self._add_to_buffer('done', done)
        if self._save_s_:
            self._add_to_buffer('obs_next', obs_next)
        self._add_to_buffer('info', info)
        if self._maxsize > 0:
            self._size = min(self._size + 1, self._maxsize)
            self._index = (self._index + 1) % self._maxsize
        else:
            # Unbounded storage: size and write position grow together.
            self._size = self._index = self._index + 1

    def reset(self):
        """Clear all the data in replay buffer."""
        self._index = self._size = 0

    def sample(self, batch_size):
        """Get a random sample from buffer with size equal to batch_size. \
        Return all the data in the buffer if batch_size is ``0``.

        :return: Sample data and its corresponding index inside the buffer.
        """
        if batch_size > 0:
            indice = np.random.choice(self._size, batch_size)
        else:
            # Everything stored, starting from the current write position.
            indice = np.concatenate([
                np.arange(self._index, self._size),
                np.arange(0, self._index),
            ])
        return self[indice], indice

    def get(self, indice, key):
        """Return the stacked result, e.g. [s_{t-3}, s_{t-2}, s_{t-1}, s_t],
        where s is self.key, t is indice. The stack_num (here equals to 4) is
        given from buffer initialization procedure.

        :param indice: a scalar, slice, sequence or array of positions. It is
            never modified; the method works on its own copy.
        :param key: name of the stored field. ``'obs_next'`` is rebuilt from
            ``obs`` when the buffer does not store it.
        :return: ``self.key[indice]`` if stack_num is ``0``, otherwise the
            frames stacked along axis 1.
        """
        if isinstance(indice, slice):
            indice = np.arange(
                0 if indice.start is None else indice.start,
                self._size if indice.stop is None else indice.stop,
                1 if indice.step is None else indice.step)
        else:
            # np.array copies, so the caller's index array stays untouched
            # (sample() returns it, and __getitem__ reuses it afterwards).
            indice = np.array(indice)
        # Treat the most recently written frame as an episode end while
        # indexing, so neither the next-frame shift nor the stacking walks
        # across the write position.
        last_index = (self._index - 1 + self._size) % self._size
        last_done = self.done[last_index]
        self.done[last_index] = True
        try:
            if key == 'obs_next' and not self._save_s_:
                # The next observation is the following frame's obs, except
                # at an episode end, where the frame itself is used.
                indice = indice + 1 - self.done[indice].astype(int)
                indice = np.where(indice == self._size, 0, indice)
                key = 'obs'
            if self._stack == 0:
                return self.__dict__[key][indice]
            stack = []
            for _ in range(self._stack):
                stack.insert(0, self.__dict__[key][indice])
                # Step one frame back (wrapping around), but stay put when
                # the previous frame ended an episode.
                pre_indice = indice - 1
                pre_indice = np.where(
                    pre_indice == -1, self._size - 1, pre_indice)
                indice = pre_indice + self.done[pre_indice].astype(int)
                indice = np.where(indice == self._size, 0, indice)
            return np.stack(stack, axis=1)
        finally:
            # Always put the real done flag back, even if indexing failed.
            self.done[last_index] = last_done

    def __getitem__(self, index):
        """Return a data batch: self[index]. If stack_num is set to be > 0,
        return the stacked obs and obs_next with shape [batch, len, ...].
        """
        return Batch(
            obs=self.get(index, 'obs'),
            act=self.act[index],
            rew=self.rew[index],
            done=self.done[index],
            obs_next=self.get(index, 'obs_next'),
            info=self.info[index]
        )


class ListReplayBuffer(ReplayBuffer):
    """A variant of :class:`~tianshou.data.ReplayBuffer` whose storage is
    plain Python ``list`` objects instead of ``numpy.ndarray``, so it grows
    without a fixed capacity. Apart from that it behaves almost the same.

    .. seealso::

        Please refer to :class:`~tianshou.data.ReplayBuffer` for more
        detailed explanation.
    """

    def __init__(self, **kwargs):
        # size=0 selects the unbounded bookkeeping in ReplayBuffer.add.
        super().__init__(size=0, ignore_obs_next=False, **kwargs)

    def _add_to_buffer(self, name, inst):
        """Append ``inst`` to the list ``name``; ``None`` is skipped."""
        if inst is not None:
            if self.__dict__.get(name, None) is None:
                self.__dict__[name] = []
            self.__dict__[name].append(inst)

    def reset(self):
        """Clear the buffer by replacing every public attribute with an
        empty list and zeroing the size and write position.
        """
        self._index = 0
        self._size = 0
        # Collect the names first so the dict is not changed while iterating.
        public_names = [k for k in self.__dict__ if not k.startswith('_')]
        for attr in public_names:
            self.__dict__[attr] = []


class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to a priority.

    Every transition carries a priority ``|weight| ** alpha`` kept in
    ``self.weight``. :meth:`sample` draws indices without replacement with
    probability proportional to that priority and, on request, attaches the
    importance-sampling correction ``1 / (len(self) * p) ** beta`` as
    ``impt_weight``. Priorities are refreshed with :meth:`update_weight`.

    :param size: the maximum number of transitions stored.
    :param alpha: the prioritization exponent.
    :param beta: the importance sample soft coefficient.
    :param mode: only ``'weight'`` is implemented; anything else raises
        ``NotImplementedError``.
    """

    def __init__(self, size, alpha: float, beta: float,
                 mode: str = 'weight', **kwargs):
        if mode != 'weight':
            raise NotImplementedError
        super().__init__(size, **kwargs)
        self._alpha = alpha  # prioritization exponent
        self._beta = beta  # importance sample soft coefficient
        # Running total of self.weight, re-synchronised periodically by
        # _check_weight_sum to limit floating point drift.
        self._weight_sum = 0.0
        self.weight = np.zeros(size, dtype=np.float64)
        self._amortization_freq = 50
        self._amortization_counter = 0

    def add(self, obs, act, rew, done, obs_next=0, info=None, weight=1.0):
        """Add one transition with the given priority weight.

        :param info: a dict; a fresh empty dict is used when omitted.
        :param weight: raw priority; ``|weight| ** alpha`` is stored.
        """
        if info is None:
            # A new dict per call, so stored entries never alias each other.
            info = {}
        priority = np.abs(weight)**self._alpha
        # The slot at the write position is about to be overwritten, so its
        # old priority leaves the running sum.
        self._weight_sum += priority - self.weight[self._index]
        # we have to sacrifice some convenience for speed :(
        self._add_to_buffer('weight', priority)
        super().add(obs, act, rew, done, obs_next, info)
        self._check_weight_sum()

    def sample(self, batch_size: int = 0, importance_sample: bool = True):
        """ Get a random sample from buffer with priority probability. \
        Return all the data in the buffer if batch_size is ``0``.

        :param batch_size: number of distinct transitions to draw; must not
            exceed ``len(self)``.
        :param importance_sample: whether to attach ``impt_weight`` to the
            returned batch.
        :return: Sample data and its corresponding index inside the buffer.
        :raises ValueError: if batch_size is negative or larger than
            ``len(self)``.
        """
        if 0 < batch_size <= self._size:
            # Multiple sampling of the same sample
            # will cause weight update conflict
            # Normalise over the stored entries only, so the probabilities
            # sum to one whatever the unused slots contain. The exact sum is
            # used because self._weight_sum is not accurate enough here.
            priorities = self.weight[:self._size]
            indice = np.random.choice(
                self._size, batch_size,
                p=priorities / priorities.sum(), replace=False)
        elif batch_size == 0:
            indice = np.concatenate([
                np.arange(self._index, self._size),
                np.arange(0, self._index),
            ])
        else:
            # if batch_size larger than len(self),
            # it will lead to a bug in update weight
            raise ValueError("batch_size should be less than len(self)")
        batch = self[indice]
        if importance_sample:
            impt_weight = Batch(
                impt_weight=1/np.power(
                    self._size*(batch.weight/self._weight_sum), self._beta))
            batch.append(impt_weight)
        self._check_weight_sum()
        return batch, indice

    def reset(self):
        """Clear all the data in replay buffer, including the priorities."""
        self._amortization_counter = 0
        self._weight_sum = 0.0
        # The parent constructor calls reset() before self.weight exists.
        weight = self.__dict__.get('weight')
        if weight is not None:
            # Stale priorities would otherwise skew the sums used for
            # sampling and importance weights after the reset.
            weight.fill(0.0)
        super().reset()

    def update_weight(self, indice, new_weight: np.ndarray):
        """update priority weight by indice in this buffer

        :param indice: indice you want to update weight
        :param new_weight: new priority weight you want to update
        """
        priority = np.power(np.abs(new_weight), self._alpha)
        self._weight_sum += priority.sum() - self.weight[indice].sum()
        self.weight[indice] = priority

    def __getitem__(self, index):
        """Return a data batch: self[index], including the stored priority
        as ``weight``.
        """
        # Give get() its own copy of an index array, so that any in-place
        # adjustment it makes cannot leak into the lookups below or into the
        # caller's indices.
        is_array = isinstance(index, np.ndarray)
        return Batch(
            obs=self.get(index.copy() if is_array else index, 'obs'),
            act=self.act[index],
            rew=self.rew[index],
            done=self.done[index],
            obs_next=self.get(
                index.copy() if is_array else index, 'obs_next'),
            info=self.info[index],
            weight=self.weight[index]
        )

    def _check_weight_sum(self):
        # keep an accurate _weight_sum: recompute it from scratch on every
        # _amortization_freq-th call
        self._amortization_counter += 1
        if self._amortization_counter % self._amortization_freq == 0:
            self._weight_sum = np.sum(self.weight)
            self._amortization_counter = 0
env and data 2020-03-11 09:09:56 +08:00			`import numpy as np`
			`from tianshou.data.batch import Batch`


			`class ReplayBuffer(object):`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			""":class:`~tianshou.data.ReplayBuffer` stores data generated from
			`interaction between the policy and environment. It stores basically 6 types`
			of data, as mentioned in :class:`~tianshou.data.Batch`, based on
			``numpy.ndarray``. Here is the usage:
add some docs 2020-04-03 21:28:12 +08:00			`::`

seealso and change policy dir structure 2020-04-09 21:36:53 +08:00			`>>> import numpy as np`
add some docs 2020-04-03 21:28:12 +08:00			`>>> from tianshou.data import ReplayBuffer`
			`>>> buf = ReplayBuffer(size=20)`
			`>>> for i in range(3):`
			`... buf.add(obs=i, act=i, rew=i, done=i, obs_next=i + 1, info={})`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			`>>> len(buf)`
			`3`
add some docs 2020-04-03 21:28:12 +08:00			`>>> buf.obs`
			`# since we set size = 20, len(buf.obs) == 20.`
			`array([0., 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,`
			`0., 0., 0., 0.])`

			`>>> buf2 = ReplayBuffer(size=10)`
			`>>> for i in range(15):`
			`... buf2.add(obs=i, act=i, rew=i, done=i, obs_next=i + 1, info={})`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			`>>> len(buf2)`
			`10`
add some docs 2020-04-03 21:28:12 +08:00			`>>> buf2.obs`
			`# since its size = 10, it only stores the last 10 steps' result.`
			`array([10., 11., 12., 13., 14., 5., 6., 7., 8., 9.])`

add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			`>>> # move buf2's result into buf (meanwhile keep it chronologically)`
add some docs 2020-04-03 21:28:12 +08:00			`>>> buf.update(buf2)`
			`array([ 0., 1., 2., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,`
			`0., 0., 0., 0., 0., 0., 0.])`

			`>>> # get a random sample from buffer`
			`>>> # the batch_data is equal to buf[incide].`
			`>>> batch_data, indice = buf.sample(batch_size=4)`
			`>>> batch_data.obs == buf[indice].obs`
			`array([ True, True, True, True])`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			Since version v0.2.2, :class:`~tianshou.data.ReplayBuffer` supports
			`frame_stack sampling (typically for RNN usage) and ignoring storing the`
			`next observation (save memory):`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`::`

add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`>>> buf = ReplayBuffer(size=9, stack_num=4, ignore_obs_next=True)`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`>>> for i in range(16):`
			`... done = i % 5 == 0`
save_fn 2020-04-11 16:54:27 +08:00			`... buf.add(obs=i, act=i, rew=i, done=done, obs_next=i + 1)`
seealso and change policy dir structure 2020-04-09 21:36:53 +08:00			`>>> print(buf)`
			`ReplayBuffer(`
			`obs: [ 9. 10. 11. 12. 13. 14. 15. 7. 8.],`
			`act: [ 9. 10. 11. 12. 13. 14. 15. 7. 8.],`
			`rew: [ 9. 10. 11. 12. 13. 14. 15. 7. 8.],`
			`done: [0. 1. 0. 0. 0. 0. 1. 0. 0.],`
			`obs_next: [0. 0. 0. 0. 0. 0. 0. 0. 0.],`
			`info: [{} {} {} {} {} {} {} {} {}],`
			`)`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`>>> index = np.arange(len(buf))`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`>>> print(buf.get(index, 'obs'))`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`[[ 7. 7. 8. 9.]`
			`[ 7. 8. 9. 10.]`
			`[11. 11. 11. 11.]`
			`[11. 11. 11. 12.]`
			`[11. 11. 12. 13.]`
			`[11. 12. 13. 14.]`
			`[12. 13. 14. 15.]`
			`[ 7. 7. 7. 7.]`
			`[ 7. 7. 7. 8.]]`
			`>>> # here is another way to get the stacked data`
			`>>> # (stack only for obs and obs_next)`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`>>> abs(buf.get(index, 'obs') - buf[index].obs).sum().sum()`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`0.0`
save_fn 2020-04-11 16:54:27 +08:00			`>>> # we can get obs_next through __getitem__, even if it doesn't store`
			`>>> print(buf[:].obs_next)`
			`[[ 7. 8. 9. 10.]`
			`[ 7. 8. 9. 10.]`
			`[11. 11. 11. 12.]`
			`[11. 11. 12. 13.]`
			`[11. 12. 13. 14.]`
			`[12. 13. 14. 15.]`
			`[12. 13. 14. 15.]`
			`[ 7. 7. 7. 8.]`
			`[ 7. 7. 8. 9.]]`
add some docs 2020-04-03 21:28:12 +08:00			`"""`
maybe finished collector? 2020-03-13 17:49:22 +08:00
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`def __init__(self, size, stack_num=0, ignore_obs_next=False, **kwargs):`
env and data 2020-03-11 09:09:56 +08:00			`super().__init__()`
			`self._maxsize = size`
maybe finished rnn? 2020-04-08 21:13:15 +08:00			`self._stack = stack_num`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`self._save_s_ = not ignore_obs_next`
add test_buffer 2020-03-11 17:28:51 +08:00			`self.reset()`
env and data 2020-03-11 09:09:56 +08:00
			`def __len__(self):`
docs for env 2020-04-04 21:02:06 +08:00			`"""Return len(self)."""`
env and data 2020-03-11 09:09:56 +08:00			`return self._size`

fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`def __repr__(self):`
			`"""Return str(self)."""`
			`s = self.__class__.__name__ + '(\n'`
			`flag = False`
			`for k in self.__dict__.keys():`
			`if k[0] != '_' and self.__dict__[k] is not None:`
			`rpl = '\n' + ' ' * (6 + len(k))`
			`obj = str(self.__dict__[k]).replace('\n', rpl)`
			`s += f' {k}: {obj},\n'`
			`flag = True`
			`if flag:`
			`s += ')\n'`
			`else:`
			`s = self.__class__.__name__ + '()\n'`
			`return s`

env and data 2020-03-11 09:09:56 +08:00			`def _add_to_buffer(self, name, inst):`
			`if inst is None:`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`if getattr(self, name, None) is None:`
			`self.__dict__[name] = None`
env and data 2020-03-11 09:09:56 +08:00			`return`
			`if self.__dict__.get(name, None) is None:`
			`if isinstance(inst, np.ndarray):`
			`self.__dict__[name] = np.zeros([self._maxsize, *inst.shape])`
			`elif isinstance(inst, dict):`
maybe finished collector? 2020-03-13 17:49:22 +08:00			`self.__dict__[name] = np.array(`
			`[{} for _ in range(self._maxsize)])`
flake8 fix 2020-03-11 09:38:14 +08:00			else: # assume `inst` is a number
env and data 2020-03-11 09:09:56 +08:00			`self.__dict__[name] = np.zeros([self._maxsize])`
ddpg 2020-03-18 21:45:41 +08:00			`if isinstance(inst, np.ndarray) and \`
			`self.__dict__[name].shape[1:] != inst.shape:`
			`self.__dict__[name] = np.zeros([self._maxsize, *inst.shape])`
env and data 2020-03-11 09:09:56 +08:00			`self.__dict__[name][self._index] = inst`

add cache buf in collector 2020-03-14 21:48:31 +08:00			`def update(self, buffer):`
docs for env 2020-04-04 21:02:06 +08:00			`"""Move the data from the given buffer to self."""`
fix some bugs 2020-03-16 11:11:29 +08:00			`i = begin = buffer._index % len(buffer)`
			`while True:`
add cache buf in collector 2020-03-14 21:48:31 +08:00			`self.add(`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`buffer.obs[i], buffer.act[i], buffer.rew[i], buffer.done[i],`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`buffer.obs_next[i] if self._save_s_ else None,`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`buffer.info[i])`
fix some bugs 2020-03-16 11:11:29 +08:00			`i = (i + 1) % len(buffer)`
			`if i == begin:`
			`break`
add cache buf in collector 2020-03-14 21:48:31 +08:00
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`def add(self, obs, act, rew, done, obs_next=None, info={}, weight=None):`
docs for env 2020-04-04 21:02:06 +08:00			`"""Add a batch of data into replay buffer."""`
minor reformat (#2) * update atari.py * fix setup.py pass the pytest * fix setup.py pass the pytest 2020-03-26 09:01:20 +08:00			`assert isinstance(info, dict), \`
maybe finished collector? 2020-03-13 17:49:22 +08:00			`'You should return a dict in the last argument of env.step().'`
env and data 2020-03-11 09:09:56 +08:00			`self._add_to_buffer('obs', obs)`
			`self._add_to_buffer('act', act)`
			`self._add_to_buffer('rew', rew)`
			`self._add_to_buffer('done', done)`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`if self._save_s_:`
			`self._add_to_buffer('obs_next', obs_next)`
env and data 2020-03-11 09:09:56 +08:00			`self._add_to_buffer('info', info)`
add ListReplayBuffer 2020-03-28 15:14:41 +08:00			`if self._maxsize > 0:`
			`self._size = min(self._size + 1, self._maxsize)`
			`self._index = (self._index + 1) % self._maxsize`
			`else:`
			`self._size = self._index = self._index + 1`
env and data 2020-03-11 09:09:56 +08:00
			`def reset(self):`
docs for env 2020-04-04 21:02:06 +08:00			`"""Clear all the data in replay buffer."""`
env and data 2020-03-11 09:09:56 +08:00			`self._index = self._size = 0`

maybe finished collector? 2020-03-13 17:49:22 +08:00			`def sample(self, batch_size):`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			`"""Get a random sample from buffer with size equal to batch_size. \`
			Return all the data in the buffer if batch_size is ``0``.
add some docs 2020-04-03 21:28:12 +08:00
			`:return: Sample data and its corresponding index inside the buffer.`
			`"""`
half of collector 2020-03-12 22:20:33 +08:00			`if batch_size > 0:`
maybe finished collector? 2020-03-13 17:49:22 +08:00			`indice = np.random.choice(self._size, batch_size)`
half of collector 2020-03-12 22:20:33 +08:00			`else:`
finish pg 2020-03-17 11:37:31 +08:00			`indice = np.concatenate([`
			`np.arange(self._index, self._size),`
			`np.arange(0, self._index),`
			`])`
update some tutorial 2020-03-30 22:52:25 +08:00			`return self[indice], indice`
env and data 2020-03-11 09:09:56 +08:00
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`def get(self, indice, key):`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`"""Return the stacked result, e.g. [s_{t-3}, s_{t-2}, s_{t-1}, s_t],`
			`where s is self.key, t is indice. The stack_num (here equals to 4) is`
			`given from buffer initialization procedure.`
			`"""`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`if not isinstance(indice, np.ndarray):`
			`if np.isscalar(indice):`
			`indice = np.array(indice)`
			`elif isinstance(indice, slice):`
			`indice = np.arange(`
			`0 if indice.start is None else indice.start,`
			`self._size if indice.stop is None else indice.stop,`
			`1 if indice.step is None else indice.step)`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`# set last frame done to True`
			`last_index = (self._index - 1 + self._size) % self._size`
			`last_done, self.done[last_index] = self.done[last_index], True`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`if key == 'obs_next' and not self._save_s_:`
			`indice += 1 - self.done[indice].astype(np.int)`
			`indice[indice == self._size] = 0`
			`key = 'obs'`
			`if self._stack == 0:`
			`self.done[last_index] = last_done`
			`return self.__dict__[key][indice]`
			`stack = []`
maybe finished rnn? 2020-04-08 21:13:15 +08:00			`for i in range(self._stack):`
			`stack = [self.__dict__[key][indice]] + stack`
fix rnn (#19), add __repr__, and fix #26 2020-04-09 19:53:45 +08:00			`pre_indice = indice - 1`
			`pre_indice[pre_indice == -1] = self._size - 1`
			`indice = pre_indice + self.done[pre_indice].astype(np.int)`
			`indice[indice == self._size] = 0`
			`self.done[last_index] = last_done`
maybe finished rnn? 2020-04-08 21:13:15 +08:00			`return np.stack(stack, axis=1)`

finish dqn 2020-03-15 17:41:00 +08:00			`def __getitem__(self, index):`
maybe finished rnn? 2020-04-08 21:13:15 +08:00			`"""Return a data batch: self[index]. If stack_num is set to be > 0,`
			`return the stacked obs and obs_next with shape [batch, len, ...].`
			`"""`
finish dqn 2020-03-15 17:41:00 +08:00			`return Batch(`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`obs=self.get(index, 'obs'),`
finish dqn 2020-03-15 17:41:00 +08:00			`act=self.act[index],`
			`rew=self.rew[index],`
			`done=self.done[index],`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`obs_next=self.get(index, 'obs_next'),`
finish dqn 2020-03-15 17:41:00 +08:00			`info=self.info[index]`
			`)`

env and data 2020-03-11 09:09:56 +08:00
add ListReplayBuffer 2020-03-28 15:14:41 +08:00			`class ListReplayBuffer(ReplayBuffer):`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			"""The function of :class:`~tianshou.data.ListReplayBuffer` is almost the
			same as :class:`~tianshou.data.ReplayBuffer`. The only difference is that
add some docs 2020-04-03 21:28:12 +08:00			:class:`~tianshou.data.ListReplayBuffer` is based on ``list``.
seealso and change policy dir structure 2020-04-09 21:36:53 +08:00
			`.. seealso::`

			Please refer to :class:`~tianshou.data.ListReplayBuffer` for more
			`detailed explanation.`
add some docs 2020-04-03 21:28:12 +08:00			`"""`

add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`def __init__(self, **kwargs):`
			`super().__init__(size=0, ignore_obs_next=False, **kwargs)`
add ListReplayBuffer 2020-03-28 15:14:41 +08:00
			`def _add_to_buffer(self, name, inst):`
			`if inst is None:`
			`return`
			`if self.__dict__.get(name, None) is None:`
			`self.__dict__[name] = []`
			`self.__dict__[name].append(inst)`

			`def reset(self):`
			`self._index = self._size = 0`
			`for k in list(self.__dict__.keys()):`
			`if not k.startswith('_'):`
			`self.__dict__[k] = []`


env and data 2020-03-11 09:09:56 +08:00			`class PrioritizedReplayBuffer(ReplayBuffer):`
			`"""docstring for PrioritizedReplayBuffer"""`
maybe finished collector? 2020-03-13 17:49:22 +08:00
Prioritized DQN (#30) * add sum_tree.py * add prioritized replay buffer * del sum_tree.py * fix some format issues * fix weight_update bug * simply replace replaybuffer in test_dqn without weight update * weight default set to 1 * fix sampling bug when buffer is not full * rename parameter * fix formula error, add accuracy check * add PrioritizedDQN test * add test_pdqn.py * add update_weight() doc * add ref of prio dqn in readme.md and index.rst * restore test_dqn.py, fix args of test_pdqn.py 2020-04-26 12:05:58 +08:00			`def __init__(self, size, alpha: float, beta: float,`
			`mode: str = 'weight', **kwargs):`
			`if mode != 'weight':`
			`raise NotImplementedError`
add ignore_obs_next in buffer 2020-04-10 09:01:17 +08:00			`super().__init__(size, **kwargs)`
Prioritized DQN (#30) * add sum_tree.py * add prioritized replay buffer * del sum_tree.py * fix some format issues * fix weight_update bug * simply replace replaybuffer in test_dqn without weight update * weight default set to 1 * fix sampling bug when buffer is not full * rename parameter * fix formula error, add accuracy check * add PrioritizedDQN test * add test_pdqn.py * add update_weight() doc * add ref of prio dqn in readme.md and index.rst * restore test_dqn.py, fix args of test_pdqn.py 2020-04-26 12:05:58 +08:00			`self._alpha = alpha # prioritization exponent`
			`self._beta = beta # importance sample soft coefficient`
			`self._weight_sum = 0.0`
			`self.weight = np.zeros(size, dtype=np.float64)`
			`self._amortization_freq = 50`
			`self._amortization_counter = 0`

			`def add(self, obs, act, rew, done, obs_next=0, info={}, weight=1.0):`
			`"""Add a batch of data into replay buffer."""`
			`self._weight_sum += np.abs(weight)**self._alpha - \`
			`self.weight[self._index]`
			`# we have to sacrifice some convenience for speed :(`
			`self._add_to_buffer('weight', np.abs(weight)**self._alpha)`
			`super().add(obs, act, rew, done, obs_next, info)`
			`self._check_weight_sum()`

			`def sample(self, batch_size: int = 0, importance_sample: bool = True):`
			`""" Get a random sample from buffer with priority probability. \`
			Return all the data in the buffer if batch_size is ``0``.
flake8 fix 2020-03-11 09:38:14 +08:00
Prioritized DQN (#30) * add sum_tree.py * add prioritized replay buffer * del sum_tree.py * fix some format issues * fix weight_update bug * simply replace replaybuffer in test_dqn without weight update * weight default set to 1 * fix sampling bug when buffer is not full * rename parameter * fix formula error, add accuracy check * add PrioritizedDQN test * add test_pdqn.py * add update_weight() doc * add ref of prio dqn in readme.md and index.rst * restore test_dqn.py, fix args of test_pdqn.py 2020-04-26 12:05:58 +08:00			`:return: Sample data and its corresponding index inside the buffer.`
			`"""`
			`if batch_size > 0 and batch_size <= self._size:`
			`# Multiple sampling of the same sample`
			`# will cause weight update conflict`
			`indice = np.random.choice(`
			`self._size, batch_size,`
			`p=(self.weight/self.weight.sum())[:self._size], replace=False)`
			`# self._weight_sum is not work for the accuracy issue`
			`# p=(self.weight/self._weight_sum)[:self._size], replace=False)`
			`elif batch_size == 0:`
			`indice = np.concatenate([`
			`np.arange(self._index, self._size),`
			`np.arange(0, self._index),`
			`])`
			`else:`
			`# if batch_size larger than len(self),`
			`# it will lead to a bug in update weight`
			`raise ValueError("batch_size should be less than len(self)")`
			`batch = self[indice]`
			`if importance_sample:`
			`impt_weight = Batch(`
			`impt_weight=1/np.power(`
			`self._size*(batch.weight/self._weight_sum), self._beta))`
			`batch.append(impt_weight)`
			`self._check_weight_sum()`
			`return batch, indice`
env and data 2020-03-11 09:09:56 +08:00
update some tutorial 2020-03-30 22:52:25 +08:00			`def reset(self):`
Prioritized DQN (#30) * add sum_tree.py * add prioritized replay buffer * del sum_tree.py * fix some format issues * fix weight_update bug * simply replace replaybuffer in test_dqn without weight update * weight default set to 1 * fix sampling bug when buffer is not full * rename parameter * fix formula error, add accuracy check * add PrioritizedDQN test * add test_pdqn.py * add update_weight() doc * add ref of prio dqn in readme.md and index.rst * restore test_dqn.py, fix args of test_pdqn.py 2020-04-26 12:05:58 +08:00			`self._amortization_counter = 0`
			`super().reset()`

			`def update_weight(self, indice, new_weight: np.ndarray):`
			`"""update priority weight by indice in this buffer`

			`:param indice: indice you want to update weight`
			`:param new_weight: new priority weight you wangt to update`
			`"""`
			`self._weight_sum += np.power(np.abs(new_weight), self._alpha).sum() \`
			`- self.weight[indice].sum()`
			`self.weight[indice] = np.power(np.abs(new_weight), self._alpha)`

			`def __getitem__(self, index):`
			`return Batch(`
			`obs=self.get(index, 'obs'),`
			`act=self.act[index],`
			`rew=self.rew[index],`
			`done=self.done[index],`
			`obs_next=self.get(index, 'obs_next'),`
			`info=self.info[index],`
			`weight=self.weight[index]`
			`)`

			`def _check_weight_sum(self):`
			`# keep a accurate _weight_sum`
			`self._amortization_counter += 1`
			`if self._amortization_counter % self._amortization_freq == 0:`
			`self._weight_sum = np.sum(self.weight)`
			`self._amortization_counter = 0`