import tianshou.data.replay_buffer.naive as naive
import tianshou.data.replay_buffer.rank_based as rank_based
import tianshou.data.replay_buffer.proportional as proportional
import numpy as np
import tensorflow as tf
from tianshou.data import utils
import logging
import math


class Replay(object):
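    """
    Replay-memory-based data collector.

    It interacts with ``env`` using policy ``pi``, stores the resulting
    transitions in ``replay_memory``, and builds feed dicts for the
    placeholders managed by ``networks``, updating sample priorities with
    the absolute TD error when batches are drawn.
    """
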
    def __init__(self, replay_memory, env, pi, reward_processors, networks):
        self._replay_memory = replay_memory
        self._env = env
        self._pi = pi
        self._reward_processors = reward_processors
        self._networks = networks

        # gather every placeholder the managed networks expect to be fed
        self._required_placeholders = {}
        for net in self._networks:
            self._required_placeholders.update(net.managed_placeholders)
        self._require_advantage = 'advantage' in self._required_placeholders
        self._collected_data = list()

        self._is_first_collect = True

    def _begin_act(self, exploration):
        # (Re)start an episode: reset the environment and take one step so
        # that the `previous_*` fields of the next transition are defined.
        # If the episode terminates immediately, reset and try again.
        done = True
        while done:
            self._observation = self._env.reset()
            self._action = self._pi.act(self._observation, exploration)
            self._observation, reward, done, _ = self._env.step(self._action)
        self._is_first_collect = False

    def collect(self, nums, exploration=None):
        """
        Interact with the environment for `nums` steps and store the resulting
        transitions in the replay memory, keeping a local copy for statistics.
        Each transition records the previous observation, the previous action,
        the current action, the current observation, the reward and the
        end-of-episode flag.
        """
        sess = tf.get_default_session()
        self._collected_data = list()

        for _ in range(nums):
            if self._is_first_collect:
                self._begin_act(exploration)

            current_data = dict()
            current_data['previous_action'] = self._action
            current_data['previous_observation'] = self._observation
            self._action = self._pi.act(self._observation, exploration)
            self._observation, reward, done, _ = self._env.step(self._action)
            current_data['action'] = self._action
            current_data['observation'] = self._observation
            current_data['reward'] = reward
            current_data['end_flag'] = done
            self._replay_memory.add(current_data)
            self._collected_data.append(current_data)
            if done:
                self._begin_act(exploration)

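    # Illustrative call (object names hypothetical): `collector.collect(100)`
    # runs the policy for 100 environment steps and pushes one dict per step
    # into the replay memory, with keys 'previous_observation',
    # 'previous_action', 'action', 'observation', 'reward' and 'end_flag'.
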
    # TODO: decide which statistics the replay memory itself should expose;
    # it only stores individual transitions, so episode statistics are
    # computed here from the most recently collected data.
    def statistics(self):
        """
        Compute and print statistics (returns and episode lengths) of the
        paths collected by the most recent call to ``collect``.
        """
        # transpose the list of transition dicts into a dict of lists
        raw_data = dict(zip(self._collected_data[0], zip(*[d.values() for d in self._collected_data])))
        rewards = np.array(raw_data['reward'])
        episode_end_flags = np.array(raw_data['end_flag'])
        num_timesteps = rewards.shape[0]

        returns = []
        episode_lengths = []
        max_return = 0
        num_episodes = 1
        episode_start_idx = 0
        for i in range(1, num_timesteps):
            if episode_end_flags[i] or (
                    i == num_timesteps - 1):  # found the end of an episode or the end of all collected data
                if episode_end_flags[i]:
                    num_episodes += 1
                if i < rewards.shape[0] - 1:
                    t = i - 1
                else:
                    t = i
                Gt = 0  # undiscounted return of this episode
                episode_lengths.append(t - episode_start_idx)
                while t >= episode_start_idx:
                    Gt += rewards[t]
                    t -= 1

                returns.append(Gt)
                if Gt > max_return:
                    max_return = Gt
                episode_start_idx = i

        print('AverageReturn: {}'.format(np.mean(returns)))
        print('StdReturn    : {}'.format(np.std(returns)))
        print('NumEpisodes  : {}'.format(num_episodes))
        print('MinMaxReturns: {}..., {}'.format(np.sort(returns)[:3], np.sort(returns)[-3:]))
        print('AverageLength: {}'.format(np.mean(episode_lengths)))
        print('MinMaxLengths: {}..., {}'.format(np.sort(episode_lengths)[:3], np.sort(episode_lengths)[-3:]))

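    # What `next_batch` returns, sketched for a DQN-style setup (placeholder
    # names below are hypothetical; the actual keys come from each network's
    # `managed_placeholders`): every required placeholder is mapped to a
    # batch of values stacked along axis 0, e.g.
    #
    #     {observation_ph: array of shape (batch_size,) + obs_shape,
    #      action_ph:      array of shape (batch_size,)}
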
    def next_batch(self, batch_size, global_step=0, standardize_advantage=True):
        """
        Sample a batch of data from the replay buffer, update the priorities
        of the sampled transitions, and build the feed dict needed to update
        the Q-value network.

        :param batch_size: int. Number of transitions to sample.
        :param global_step: int. Current training step, forwarded to the
            replay buffer's sampling routine.
        :param standardize_advantage: bool. If True and an 'advantage'
            placeholder is required, standardize the advantage values.
        :return: dict. A feed dict mapping each required placeholder to a
            batch of values.
        """
        feed_dict = {}
        is_first = True

        for _ in range(batch_size):
            current_datas, current_wis, current_indices = \
                self._replay_memory.sample(
                    {'batch_size': 1, 'global_step': global_step})
            current_data = current_datas[0]
            current_wi = current_wis[0]  # importance-sampling weight (currently unused)
            current_index = current_indices[0]
            current_processed_data = {}
            for processor in self._reward_processors:
                current_processed_data.update(processor(current_data))

            # fill each required placeholder from the raw or processed data
            for key, placeholder in self._required_placeholders.items():
                found, data_key = utils.internal_key_match(key, current_data.keys())
                if found:
                    if is_first:
                        feed_dict[placeholder] = np.array([current_data[data_key]])
                    else:
                        feed_dict[placeholder] = np.append(feed_dict[placeholder],
                                                           np.array([current_data[data_key]]), 0)
                else:
                    found, data_key = utils.internal_key_match(key, current_processed_data.keys())
                    if found:
                        if is_first:
                            feed_dict[placeholder] = np.array(current_processed_data[data_key])
                        else:
                            feed_dict[placeholder] = np.append(feed_dict[placeholder],
                                                               np.array(current_processed_data[data_key]), 0)
                    else:
                        raise TypeError('Placeholder {} has no value to feed!'.format(str(placeholder.name)))

            # priority update: use the absolute TD error of the sampled
            # transition (note that no discount factor is applied here)
            next_max_qvalue = np.max(self._networks[-1].eval_value_all_actions(
                current_data['observation'].reshape((1,) + current_data['observation'].shape)))
            current_qvalue = self._networks[-1].eval_value_all_actions(
                current_data['previous_observation']
                .reshape((1,) + current_data['previous_observation'].shape))[0, current_data['previous_action']]
            td_error = current_data['reward'] + next_max_qvalue - current_qvalue
            self._replay_memory.update_priority([current_index], [math.fabs(td_error)])
            if is_first:
                is_first = False

        if standardize_advantage and self._require_advantage:
            advantage_value = feed_dict[self._required_placeholders['advantage']]
            advantage_mean = np.mean(advantage_value)
            advantage_std = np.std(advantage_value)
            if advantage_std < 1e-3:
                logging.warning(
                    'advantage_std too small (< 1e-3) for advantage standardization. may cause numerical issues')
            feed_dict[self._required_placeholders['advantage']] = (advantage_value - advantage_mean) / advantage_std
        return feed_dict
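

# Minimal usage sketch (hypothetical names: `env`, `policy`, `q_net`, `sess`,
# `train_op`, `total_steps`; the exact replay-buffer and network constructors
# live elsewhere in tianshou and are not shown here):
#
#     buffer = ...  # an instance from tianshou.data.replay_buffer
#                   # (naive / rank_based / proportional)
#     collector = Replay(buffer, env, policy,
#                        reward_processors=[], networks=[q_net])
#     collector.collect(nums=1000)                    # warm up the buffer
#     for step in range(total_steps):
#         collector.collect(nums=1)                   # one environment step
#         feed_dict = collector.next_batch(32, global_step=step)
#         sess.run(train_op, feed_dict=feed_dict)     # one gradient step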