From e68dcd3c649cdc34b69096b31562d4afe712c6ae Mon Sep 17 00:00:00 2001
From: haoshengzou
Date: Thu, 8 Mar 2018 16:51:12 +0800
Subject: [PATCH] Working on off-policy test. Other parts of dqn_replay are
 runnable, but performance is not tested.

---
 examples/dqn_replay.py                       | 42 ++++++-------
 tianshou/core/losses.py                      | 11 ----
 tianshou/core/policy/dqn.py                  | 22 ++++++-
 tianshou/core/value_function/action_value.py |  2 +
 tianshou/data/advantage_estimation.py        | 65 ++++++++++----------
 tianshou/data/data_collector.py              | 17 +++--
 tianshou/data/tester.py                      |  8 +++
 7 files changed, 92 insertions(+), 75 deletions(-)
 create mode 100644 tianshou/data/tester.py

diff --git a/examples/dqn_replay.py b/examples/dqn_replay.py
index 127ea00..b9a5614 100644
--- a/examples/dqn_replay.py
+++ b/examples/dqn_replay.py
@@ -1,6 +1,4 @@
 #!/usr/bin/env python
-from __future__ import absolute_import
-
 import tensorflow as tf
 import gym
 import numpy as np
@@ -10,11 +8,9 @@ import time
 import sys
 sys.path.append('..')
 from tianshou.core import losses
-# from tianshou.data.batch import Batch
 import tianshou.data.advantage_estimation as advantage_estimation
-import tianshou.core.policy.dqn as policy # TODO: fix imports as zhusuan so that only need to import to policy
+import tianshou.core.policy.dqn as policy
 import tianshou.core.value_function.action_value as value_function
-import sys
 from tianshou.data.replay_buffer.vanilla import VanillaReplayBuffer
 from tianshou.data.data_collector import DataCollector
 
@@ -25,14 +21,6 @@ if __name__ == '__main__':
     observation_dim = env.observation_space.shape
     action_dim = env.action_space.n
 
-    clip_param = 0.2
-    num_batches = 10
-    batch_size = 512
-
-    seed = 0
-    np.random.seed(seed)
-    tf.set_random_seed(seed)
-
     ### 1. build network with pure tf
     observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)
 
@@ -45,7 +33,7 @@ if __name__ == '__main__':
         return None, action_values # no policy head
 
     ### 2. build policy, loss, optimizer
-    dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, weight_update=100)
+    dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, weight_update=200)
     pi = policy.DQN(dqn)
 
     dqn_loss = losses.qlearning(dqn)
@@ -69,6 +57,17 @@ if __name__ == '__main__':
     )
 
     ### 4. start training
+    # hyper-parameters
+    batch_size = 256
+    replay_buffer_warmup = 1000
+    epsilon_decay_interval = 200
+    epsilon = 0.3
+    test_interval = 1000
+
+    seed = 0
+    np.random.seed(seed)
+    tf.set_random_seed(seed)
+
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     with tf.Session(config=config) as sess:
@@ -78,12 +77,11 @@ if __name__ == '__main__':
         pi.sync_weights() # TODO: automate this for policies with target network
         start_time = time.time()
 
-        epsilon = 0.5
         pi.set_epsilon_train(epsilon)
-        data_collector.collect(num_timesteps=int(1e3)) # warm-up
+        data_collector.collect(num_timesteps=replay_buffer_warmup) # warm-up
         for i in range(int(1e8)): # number of training steps
             # anneal epsilon step-wise
-            if (i + 1) % 1e4 == 0 and epsilon > 0.1:
+            if (i + 1) % epsilon_decay_interval == 0 and epsilon > 0.1:
                 epsilon -= 0.1
                 pi.set_epsilon_train(epsilon)
 
@@ -91,15 +89,13 @@ if __name__ == '__main__':
             data_collector.collect()
 
             # update network
-            for _ in range(num_batches):
-                feed_dict = data_collector.next_batch(batch_size)
-                sess.run(train_op, feed_dict=feed_dict)
-
-            print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
+            feed_dict = data_collector.next_batch(batch_size)
+            sess.run(train_op, feed_dict=feed_dict)
 
             # test every 1000 training steps
             # tester could share some code with batch!
-            if i % 1000 == 0:
+            if i % test_interval == 0:
+                print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
                 # epsilon 0.05 as in nature paper
                 pi.set_epsilon_test(0.05)
                 #test(env, pi) # go for act_test of pi, not act
diff --git a/tianshou/core/losses.py b/tianshou/core/losses.py
index 396054a..3b30e51 100644
--- a/tianshou/core/losses.py
+++ b/tianshou/core/losses.py
@@ -69,14 +69,3 @@ def qlearning(action_value_function):
     q_value = action_value_function.value_tensor
 
     return tf.losses.mean_squared_error(target_value_ph, q_value)
-
-
-def deterministic_policy_gradient(sampled_state, critic):
-    """
-    deterministic policy gradient:
-
-    :param sampled_action: placeholder of sampled actions during the interaction with the environment
-    :param critic: current `value` function
-    :return:
-    """
-    return tf.reduce_mean(critic.get_value(sampled_state))
\ No newline at end of file
diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py
index b93f1af..1bc91c7 100644
--- a/tianshou/core/policy/dqn.py
+++ b/tianshou/core/policy/dqn.py
@@ -30,8 +30,10 @@ class DQN(PolicyBase):
         feed_dict = {self.action_value._observation_placeholder: observation[None]}
         feed_dict.update(my_feed_dict)
         action = sess.run(self._argmax_action, feed_dict=feed_dict)
+
+        # epsilon_greedy
         if np.random.rand() < self.epsilon_train:
-            pass
+            action = np.random.randint(self.action_value.num_actions)
 
         if self.weight_update > 0:
             self.interaction_count += 1
@@ -39,7 +41,23 @@ class DQN(PolicyBase):
         return np.squeeze(action)
 
     def act_test(self, observation, my_feed_dict={}):
-        pass
+        sess = tf.get_default_session()
+        if self.weight_update > 1:
+            if self.interaction_count % self.weight_update == 0:
+                self.update_weights()
+
+        feed_dict = {self.action_value._observation_placeholder: observation[None]}
+        feed_dict.update(my_feed_dict)
+        action = sess.run(self._argmax_action, feed_dict=feed_dict)
+
+        # epsilon_greedy
+        if np.random.rand() < self.epsilon_test:
+            action = np.random.randint(self.action_value.num_actions)
+
+        if self.weight_update > 0:
+            self.interaction_count += 1
+
+        return np.squeeze(action)
 
     @property
     def q_net(self):
diff --git a/tianshou/core/value_function/action_value.py b/tianshou/core/value_function/action_value.py
index 2773687..dbed5e9 100644
--- a/tianshou/core/value_function/action_value.py
+++ b/tianshou/core/value_function/action_value.py
@@ -114,6 +114,8 @@ class DQN(ValueFunctionBase):
 
         self._value_tensor_all_actions = value_tensor
 
+        self.num_actions = value_tensor.shape.as_list()[-1]
+
         batch_size = tf.shape(value_tensor)[0]
         batch_dim_index = tf.range(batch_size)
         indices = tf.stack([batch_dim_index, action_placeholder], axis=1)
diff --git a/tianshou/data/advantage_estimation.py b/tianshou/data/advantage_estimation.py
index f86fe8c..151b260 100644
--- a/tianshou/data/advantage_estimation.py
+++ b/tianshou/data/advantage_estimation.py
@@ -1,5 +1,4 @@
 import logging
-import tensorflow as tf
 import numpy as np
 
 STATE = 0
@@ -105,12 +104,12 @@ class nstep_q_return:
     """
     compute the n-step return for Q-learning targets
    """
-    def __init__(self, n, action_value, use_target_network=True):
+    def __init__(self, n, action_value, use_target_network=True, discount_factor=0.99):
        self.n = n
        self.action_value = action_value
        self.use_target_network = use_target_network
+        self.discount_factor = discount_factor
 
-    # TODO : we should transfer the tf -> numpy/python -> tf into a monolithic compute graph in tf
     def __call__(self, buffer, indexes=None):
         """
         :param buffer: buffer with property index and data. index determines the current content in `buffer`.
@@ -118,41 +117,39 @@ class nstep_q_return:
             each episode.
         :return: dict with key 'return' and value the computed returns corresponding to `index`.
         """
-        qvalue = self.action_value._value_tensor_all_actions
         indexes = indexes or buffer.index
         episodes = buffer.data
-        discount_factor = 0.99
         returns = []
 
-        config = tf.ConfigProto()
-        config.gpu_options.allow_growth = True
-        with tf.Session(config=config) as sess:
-            sess.run(tf.global_variables_initializer())
-            for episode_index in range(len(indexes)):
-                index = indexes[episode_index]
-                if index:
-                    episode = episodes[episode_index]
-                    episode_q = []
+        for episode_index in range(len(indexes)):
+            index = indexes[episode_index]
+            if index:
+                episode = episodes[episode_index]
+                episode_q = []
 
-                    for i in index:
-                        current_discount_factor = 1
-                        last_frame_index = i
-                        target_q = episode[i][REWARD]
-                        for lfi in range(i, min(len(episode), i + self.n + 1)):
-                            if episode[lfi][DONE]:
-                                break
-                            target_q += current_discount_factor * episode[lfi][REWARD]
-                            current_discount_factor *= discount_factor
-                            last_frame_index = lfi
-                        if last_frame_index > i:
-                            state = episode[last_frame_index][STATE]
-                            # the shape of qpredict is [batch_size, action_dimension]
-                            qpredict = sess.run(qvalue, feed_dict={self.action_value.managed_placeholders['observation']:
-                                                                       state.reshape(1, state.shape[0])})
-                            target_q += current_discount_factor * max(qpredict[0])
-                        episode_q.append(target_q)
+                for i in index:
+                    current_discount_factor = 1
+                    last_frame_index = i
+                    target_q = episode[i][REWARD]
+                    for lfi in range(i, min(len(episode), i + self.n + 1)):
+                        if episode[lfi][DONE]:
+                            break
+                        target_q += current_discount_factor * episode[lfi][REWARD]
+                        current_discount_factor *= self.discount_factor
+                        last_frame_index = lfi
+                    if last_frame_index > i:
+                        state = episode[last_frame_index][STATE]
+
+                        if self.use_target_network:
+                            # [None] adds one dimension to the beginning
+                            qpredict = self.action_value.eval_value_all_actions_old(state[None])
+                        else:
+                            qpredict = self.action_value.eval_value_all_actions(state[None])
+                        target_q += current_discount_factor * max(qpredict[0])
+                    episode_q.append(target_q)
+
+                returns.append(episode_q)
+            else:
+                returns.append([])
-                    returns.append(episode_q)
-                else:
-                    returns.append([])
 
         return {'return': returns}
diff --git a/tianshou/data/data_collector.py b/tianshou/data/data_collector.py
index 610d0f3..9834761 100644
--- a/tianshou/data/data_collector.py
+++ b/tianshou/data/data_collector.py
@@ -36,14 +36,20 @@ class DataCollector(object):
             "One and only one collection number specification permitted!"
 
         if num_timesteps > 0:
-            for _ in range(num_timesteps):
+            num_timesteps_ = int(num_timesteps)
+            for _ in range(num_timesteps_):
                 action = self.policy.act(self.current_observation, my_feed_dict=my_feed_dict)
                 next_observation, reward, done, _ = self.env.step(action)
                 self.data_buffer.add((self.current_observation, action, reward, done))
-                self.current_observation = next_observation
+
+                if done:
+                    self.current_observation = self.env.reset()
+                else:
+                    self.current_observation = next_observation
 
         if num_episodes > 0:
-            for _ in range(num_episodes):
+            num_episodes_ = int(num_episodes)
+            for _ in range(num_episodes_):
                 observation = self.env.reset()
                 done = False
                 while not done:
@@ -56,7 +62,7 @@ class DataCollector(object):
             for processor in self.process_functions:
                 self.data.update(processor(self.data_buffer))
 
-    def next_batch(self, batch_size, standardize_advantage=True):
+    def next_batch(self, batch_size, standardize_advantage=None):
         sampled_index = self.data_buffer.sample(batch_size)
         if self.process_mode == 'sample':
             for processor in self.process_functions:
@@ -87,7 +93,8 @@ class DataCollector(object):
                 else:
                     raise TypeError('Placeholder {} has no value to feed!'.format(str(placeholder.name)))
 
-        if standardize_advantage:
+        auto_standardize = (standardize_advantage is None) and self.require_advantage
+        if standardize_advantage or auto_standardize:
             if self.require_advantage:
                 advantage_value = feed_dict[self.required_placeholders['advantage']]
                 advantage_mean = np.mean(advantage_value)
diff --git a/tianshou/data/tester.py b/tianshou/data/tester.py
new file mode 100644
index 0000000..2a2e407
--- /dev/null
+++ b/tianshou/data/tester.py
@@ -0,0 +1,8 @@
+from __future__ import absolute_import
+
+
+def test_policy_in_env(policy, env):
+    # make another env as the original is for training data collection
+    env_ = env
+
+    pass
\ No newline at end of file
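
Note: the new tianshou/data/tester.py above only stubs out test_policy_in_env. Below is a minimal sketch of how it might be fleshed out, assuming a gym-style environment created with gym.make (so env.spec.id is available) and the act_test() method added to the DQN policy in this patch; the num_episodes argument, the fresh-env construction, and the printed summary are illustrative assumptions, not part of the patch.

# Sketch only -- not part of this patch. Assumes a gym-style env and the
# policy.act_test() interface introduced above; num_episodes and the printed
# summary are illustrative choices.
import gym
import numpy as np


def test_policy_in_env(policy, env, num_episodes=10):
    # make another env so the original one keeps its state for training data collection
    env_ = gym.make(env.spec.id)

    episode_returns = []
    for _ in range(num_episodes):
        observation = env_.reset()
        done = False
        episode_return = 0.
        while not done:
            # act_test() uses epsilon_test (set via pi.set_epsilon_test), not epsilon_train
            action = policy.act_test(observation)
            observation, reward, done, _ = env_.step(action)
            episode_return += reward
        episode_returns.append(episode_return)

    mean_return = np.mean(episode_returns)
    print('Mean return over {} test episodes: {:.2f}'.format(num_episodes, mean_return))
    return mean_return

In examples/dqn_replay.py such a helper would slot in where the commented-out test(env, pi) call sits, right after pi.set_epsilon_test(0.05) inside the test_interval branch.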