an initial version of untested replaymemory qreturn
parent 528c4be93c
commit 0cf2fd6c53
@@ -1,5 +1,6 @@
 import logging
+import tensorflow as tf
+import numpy as np
 
 STATE = 0
 ACTION = 1
@@ -100,7 +101,7 @@ class ddpg_return:
         pass
 
 
-class nstep_q_return:
+class ReplayMemoryQReturn:
     """
    compute the n-step return for Q-learning targets
    """
@@ -109,11 +110,49 @@ class nstep_q_return:
         self.action_value = action_value
         self.use_target_network = use_target_network
 
-    def __call__(self, buffer, index=None):
+    # TODO : we should transfer the tf -> numpy/python -> tf into a monolithic compute graph in tf
+    def __call__(self, buffer, indexes=None):
         """
         :param buffer: buffer with property index and data. index determines the current content in `buffer`.
         :param index: (sampled) index to be computed. Defaults to all the data in `buffer`. Not necessarily in order within
             each episode.
         :return: dict with key 'return' and value the computed returns corresponding to `index`.
         """
-        pass
+        qvalue = self.action_value._value_tensor_all_actions
+        indexes = indexes or buffer.index
+        episodes = buffer.data
+        discount_factor = 0.99
+        returns = []
+
+        config = tf.ConfigProto()
+        config.gpu_options.allow_growth = True
+        with tf.Session(config=config) as sess:
+            sess.run(tf.global_variables_initializer())
+            for episode_index in range(len(indexes)):
+                index = indexes[episode_index]
+                if index:
+                    episode = episodes[episode_index]
+                    episode_q = []
+                    if not episode[-1][DONE]:
+                        logging.warning('Computing Q return on episode {} with no terminal state.'.format(episode_index))
+
+                    for i in index:
+                        current_discount_factor = 1
+                        last_frame_index = i
+                        target_q = episode[i][REWARD]
+                        for lfi in range(i, min(len(episode), i + self.n + 1)):
+                            if episode[lfi][DONE]:
+                                break
+                            target_q += current_discount_factor * episode[lfi][REWARD]
+                            current_discount_factor *= discount_factor
+                            last_frame_index = lfi
+                        if last_frame_index > i:
+                            target_q += current_discount_factor * \
+                                max(sess.run(qvalue, feed_dict={self.action_value.managed_placeholders['observation']:
+                                                                episode[last_frame_index][STATE]}))
+                        episode_q.append(target_q)
+
+                    returns.append(episode_q)
+                else:
+                    returns.append([])
+        return {'TD-lambda': returns}
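For reference, below is a minimal standalone sketch of the n-step Q-learning target that this commit introduces, written against plain Python lists rather than the project's buffer and action_value objects. The helper name nstep_q_target, the q_values callable, and the REWARD/DONE field positions are assumptions for illustration, not the repository's API. The sketch also sidesteps two details that look off in the committed version: the reward at the starting index appears to be added twice (once when target_q is initialised and again on the first loop iteration), and the result is returned under the key 'TD-lambda' although the docstring says 'return'.

import numpy as np

# Field positions inside a transition tuple; STATE and ACTION mirror the
# constants in the diff, REWARD and DONE are assumed to follow them.
STATE, ACTION, REWARD, DONE = 0, 1, 2, 3


def nstep_q_target(episode, i, n, q_values, gamma=0.99):
    """Hypothetical helper: n-step return target for the transition at index i.

    episode  -- list of (state, action, reward, done) tuples for one episode
    q_values -- callable mapping a state to a vector of per-action Q-values
                (stand-in for the sess.run(qvalue, ...) call in the commit)

    Computes sum_{k=0}^{n-1} gamma^k * r_{i+k} + gamma^n * max_a Q(s_{i+n}, a),
    dropping the bootstrap term when the episode terminates or is truncated.
    """
    target = 0.0
    discount = 1.0
    for k in range(i, min(len(episode), i + n)):
        target += discount * episode[k][REWARD]
        discount *= gamma
        if episode[k][DONE] or k + 1 == len(episode):
            # Terminal or truncated episode: nothing left to bootstrap from.
            return target
    # All n rewards were accumulated and episode[i + n] exists: bootstrap from it.
    return target + discount * np.max(q_values(episode[i + n][STATE]))


# Tiny usage example with a dummy Q-function.
if __name__ == '__main__':
    episode = [
        (np.zeros(4), 0, 1.0, False),
        (np.ones(4), 1, 0.5, False),
        (np.full(4, 2.0), 0, 2.0, True),
    ]
    dummy_q = lambda state: np.array([0.3, 0.7])
    # expected: 1.0 + 0.99 * 0.5 + 0.99**2 * 0.7
    print(nstep_q_target(episode, 0, n=2, q_values=dummy_q))

Looping this helper over the sampled indexes of each episode and collecting the results into a dict keyed 'return' would reproduce what ReplayMemoryQReturn.__call__ appears to aim for, without opening a new TensorFlow session on every call.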