working on off-policy test. other parts of dqn_replay are runnable, but performance is not tested.

haoshengzou 2018-03-09 15:07:14 +08:00
parent e68dcd3c64
commit 92894d3853
10 changed files with 142 additions and 28 deletions

View File

@@ -3,6 +3,8 @@ import tensorflow as tf
 import gym
 import numpy as np
 import time
+import logging
+logging.basicConfig(level=logging.INFO)

 # our lib imports here! It's ok to append path in examples
 import sys
@@ -12,8 +14,9 @@ import tianshou.data.advantage_estimation as advantage_estimation
 import tianshou.core.policy.dqn as policy
 import tianshou.core.value_function.action_value as value_function

-from tianshou.data.replay_buffer.vanilla import VanillaReplayBuffer
+from tianshou.data.data_buffer.vanilla import VanillaReplayBuffer
 from tianshou.data.data_collector import DataCollector
+from tianshou.data.tester import test_policy_in_env


 if __name__ == '__main__':
@@ -33,7 +36,7 @@ if __name__ == '__main__':
         return None, action_values  # no policy head

     ### 2. build policy, loss, optimizer
-    dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, weight_update=200)
+    dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, weight_update=800)
     pi = policy.DQN(dqn)

     dqn_loss = losses.qlearning(dqn)
@@ -43,7 +46,7 @@ if __name__ == '__main__':
     train_op = optimizer.minimize(total_loss, var_list=dqn.trainable_variables)

     ### 3. define data collection
-    replay_buffer = VanillaReplayBuffer(capacity=1e5, nstep=1)
+    replay_buffer = VanillaReplayBuffer(capacity=2e4, nstep=1)

     process_functions = [advantage_estimation.nstep_q_return(1, dqn)]
     managed_networks = [dqn]
@@ -58,11 +61,11 @@ if __name__ == '__main__':
     ### 4. start training
     # hyper-parameters
-    batch_size = 256
+    batch_size = 128
     replay_buffer_warmup = 1000
-    epsilon_decay_interval = 200
-    epsilon = 0.3
-    test_interval = 1000
+    epsilon_decay_interval = 500
+    epsilon = 0.6
+    test_interval = 5000

     seed = 0
     np.random.seed(seed)
@@ -74,11 +77,11 @@ if __name__ == '__main__':
         sess.run(tf.global_variables_initializer())

         # assign actor to pi_old
-        pi.sync_weights()  # TODO: automate this for policies with target network
+        pi.sync_weights()  # TODO: rethink and redesign target network interface

         start_time = time.time()
         pi.set_epsilon_train(epsilon)
-        data_collector.collect(num_timesteps=replay_buffer_warmup)  # warm-up
+        data_collector.collect(num_timesteps=replay_buffer_warmup)  # TODO: uniform random warm-up
         for i in range(int(1e8)):  # number of training steps
             # anneal epsilon step-wise
             if (i + 1) % epsilon_decay_interval == 0 and epsilon > 0.1:
@@ -86,7 +89,7 @@ if __name__ == '__main__':
                 pi.set_epsilon_train(epsilon)

             # collect data
-            data_collector.collect()
+            data_collector.collect(num_timesteps=4)

             # update network
             feed_dict = data_collector.next_batch(batch_size)
@@ -95,7 +98,7 @@ if __name__ == '__main__':
             # test every 1000 training steps
             # tester could share some code with batch!
             if i % test_interval == 0:
-                print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
+                print('Step {}, elapsed time: {:.1f} min'.format(i, (time.time() - start_time) / 60))
                 # epsilon 0.05 as in nature paper
                 pi.set_epsilon_test(0.05)
-                #test(env, pi)  # go for act_test of pi, not act
+                test_policy_in_env(pi, env, num_timesteps=1000)
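For reference, the 1-step return produced by advantage_estimation.nstep_q_return(1, dqn) in this example corresponds to the standard Q-learning target r_t + gamma * max_a Q_target(s_{t+1}, a), computed with the target network controlled by the weight_update argument. A minimal illustrative sketch of that target (the helper below is hypothetical, not the library implementation):

import numpy as np

def one_step_q_target(reward, done, next_q_values, gamma=0.99):
    # next_q_values: Q_target(s_{t+1}, a) for every action, shape [num_actions]
    # terminal transitions do not bootstrap from the next state
    return reward + gamma * (1. - float(done)) * np.max(next_q_values)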

View File

@@ -11,15 +11,18 @@ import argparse
 import sys
 sys.path.append('..')
 from tianshou.core import losses
+from tianshou.data.batch import Batch
 import tianshou.data.advantage_estimation as advantage_estimation
-import tianshou.core.policy.stochastic as policy  # TODO: fix imports as zhusuan so that only need to import to policy
+import tianshou.core.policy.stochastic as policy
+from tianshou.data.data_buffer.vanilla import VanillaReplayBuffer
+from tianshou.data.data_collector import DataCollector


 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--render", action="store_true", default=False)
     args = parser.parse_args()

     env = gym.make('CartPole-v0')
     observation_dim = env.observation_space.shape
     action_dim = env.action_space.n
@@ -59,7 +62,7 @@ if __name__ == '__main__':
     train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

     ### 3. define data collection
-    training_data = Batch(env, pi, [advantage_estimation.full_return], [pi], render=args.render)
+    data_collector = Batch(env, pi, [advantage_estimation.full_return], [pi], render=args.render)

     ### 4. start training
     config = tf.ConfigProto()
@@ -73,15 +76,15 @@ if __name__ == '__main__':
         start_time = time.time()
         for i in range(100):
             # collect data
-            training_data.collect(num_episodes=50)
+            data_collector.collect(num_episodes=50)

             # print current return
             print('Epoch {}:'.format(i))
-            training_data.statistics()
+            data_collector.statistics()

             # update network
             for _ in range(num_batches):
-                feed_dict = training_data.next_batch(batch_size)
+                feed_dict = data_collector.next_batch(batch_size)
                 sess.run(train_op, feed_dict=feed_dict)

             # assigning actor to pi_old
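The new imports suggest this on-policy example is meant to move from Batch to the same DataCollector pipeline used in the DQN example, with a BatchSet as the data buffer. A hypothetical sketch of that wiring (the BatchSet import and the constructor keyword names are assumptions; they are not shown in this diff):

from tianshou.data.data_buffer.batch_set import BatchSet

batch_set = BatchSet()
data_collector = DataCollector(
    env=env,                      # keyword names assumed, not taken from this commit
    policy=pi,
    data_buffer=batch_set,
    process_functions=[advantage_estimation.full_return],
    managed_networks=[pi],
)
data_collector.collect(num_episodes=50)            # BatchSet is cleared on each call by default
feed_dict = data_collector.next_batch(batch_size)  # then feed a batch to the training op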

View File

@@ -1,13 +1,13 @@
-class ReplayBufferBase(object):
+class DataBufferBase(object):
     """
-    base class for replay buffer.
+    base class for data buffer, including replay buffer as in DQN and batched dataset as in on-policy algos
     """
     def add(self, frame):
         raise NotImplementedError()

-    def remove(self):
+    def clear(self):
         raise NotImplementedError()

     def sample(self, batch_size):

View File

@@ -0,0 +1,24 @@
+from .base import DataBufferBase
+
+
+class BatchSet(DataBufferBase):
+    """
+    class for batched dataset as used in on-policy algos
+    """
+    def __init__(self):
+        self.data = [[]]
+        self.index = [[]]
+        self.candidate_index = 0
+        self.size = 0  # number of valid data points (not frames)
+
+        self.index_lengths = [0]  # for sampling
+
+    def add(self, frame):
+        self.data[-1].append(frame)
+
+    def clear(self):
+        pass
+
+    def sample(self, batch_size):
+        pass
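clear() and sample() are still stubs in this commit. A hypothetical completion, only to illustrate how the fields initialized above could be used; this is not the committed implementation, and it assumes add() also maintains index, size, and index_lengths:

import numpy as np

class BatchSetSketch(object):
    """Illustrative only; mirrors the fields of the committed BatchSet."""
    def __init__(self):
        self.data = [[]]            # one inner list of frames per episode
        self.index = [[]]           # valid frame indices per episode
        self.candidate_index = 0
        self.size = 0               # number of valid data points
        self.index_lengths = [0]    # len(self.index[i]) for each episode

    def add(self, frame):
        # here every added frame is treated as immediately valid,
        # which the real class may not do
        self.data[-1].append(frame)
        self.index[-1].append(len(self.data[-1]) - 1)
        self.index_lengths[-1] += 1
        self.size += 1

    def clear(self):
        # reset to the initial empty state
        self.__init__()

    def sample(self, batch_size):
        # draw (episode, step) pairs uniformly over all valid data points;
        # assumes at least one data point has been added
        probs = np.array(self.index_lengths, dtype=float) / self.size
        sampled_index = [[] for _ in self.data]
        for _ in range(batch_size):
            episode = np.random.choice(len(self.data), p=probs)
            step = np.random.randint(self.index_lengths[episode])
            sampled_index[episode].append(self.index[episode][step])
        return sampled_index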

View File

@@ -0,0 +1,12 @@
+from .base import DataBufferBase
+
+class ReplayBufferBase(DataBufferBase):
+    """
+    base class for replay buffer.
+    """
+    def remove(self):
+        """
+        when size exceeds capacity, removes extra data points
+        :return:
+        """
+        raise NotImplementedError()

View File

@@ -1,13 +1,14 @@
 import logging
 import numpy as np

-from .base import ReplayBufferBase
+from .replay_buffer_base import ReplayBufferBase

 STATE = 0
 ACTION = 1
 REWARD = 2
 DONE = 3

+# TODO: valid data points could be less than `nstep` timesteps
 class VanillaReplayBuffer(ReplayBufferBase):
     """
     vanilla replay buffer as used in (Mnih, et al., 2015).

View File

@@ -3,7 +3,8 @@ import logging
 import itertools
 import sys

-from .replay_buffer.base import ReplayBufferBase
+from .data_buffer.replay_buffer_base import ReplayBufferBase
+from .data_buffer.batch_set import BatchSet

 class DataCollector(object):
     """
@@ -31,10 +32,13 @@ class DataCollector(object):
         self.current_observation = self.env.reset()

-    def collect(self, num_timesteps=1, num_episodes=0, my_feed_dict={}):
+    def collect(self, num_timesteps=1, num_episodes=0, my_feed_dict={}, auto_clear=True):
         assert sum([num_timesteps > 0, num_episodes > 0]) == 1,\
             "One and only one collection number specification permitted!"

+        if isinstance(self.data_buffer, BatchSet) and auto_clear:
+            self.data_buffer.clear()
+
         if num_timesteps > 0:
             num_timesteps_ = int(num_timesteps)
             for _ in range(num_timesteps_):
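The new auto_clear flag only affects BatchSet buffers: with the default auto_clear=True, each collect() call on an on-policy buffer first discards the previously collected batch, while replay buffers are never cleared here. A short usage sketch, assuming a data_collector whose data_buffer is a BatchSet:

# previous batch is cleared before fresh on-policy data is gathered
data_collector.collect(num_episodes=50)
# keep the previous batch and accumulate several collection calls instead
data_collector.collect(num_episodes=50, auto_clear=False)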

View File

@@ -1,6 +1,6 @@
 import numpy as np

-from replay_buffer.vanilla import VanillaReplayBuffer
+from data_buffer.vanilla import VanillaReplayBuffer

 capacity = 12
 nstep = 3

View File

@@ -1,8 +1,75 @@
 from __future__ import absolute_import

+import gym
+import logging
+import numpy as np


-def test_policy_in_env(policy, env):
+def test_policy_in_env(policy, env, num_timesteps=0, num_episodes=0, discount_factor=0.99):
+    assert sum([num_episodes > 0, num_timesteps > 0]) == 1, \
+        'One and only one collection number specification permitted!'
+
     # make another env as the original is for training data collection
-    env_ = env
+    env_id = env.spec.id
+    env_ = gym.make(env_id)

-    pass
+    # test policy
+    returns = []
+    undiscounted_returns = []
+    current_return = 0.
+    current_undiscounted_return = 0.
+
+    if num_episodes > 0:
+        returns = [0.] * num_episodes
+        undiscounted_returns = [0.] * num_episodes
+        for i in range(num_episodes):
+            current_return = 0.
+            current_undiscounted_return = 0.
+            current_discount = 1.
+            observation = env_.reset()
+            done = False
+            while not done:
+                action = policy.act_test(observation)
+                observation, reward, done, _ = env_.step(action)
+                current_return += reward * current_discount
+                current_undiscounted_return += reward
+                current_discount *= discount_factor
+            returns[i] = current_return
+            undiscounted_returns[i] = current_undiscounted_return
+
+    # run for a fixed number of timesteps; only the first episode and finished episodes
+    # matter when calculating the average return
+    if num_timesteps > 0:
+        current_discount = 1.
+        observation = env_.reset()
+        for _ in range(num_timesteps):
+            action = policy.act_test(observation)
+            observation, reward, done, _ = env_.step(action)
+            current_return += reward * current_discount
+            current_undiscounted_return += reward
+            current_discount *= discount_factor
+            if done:
+                returns.append(current_return)
+                undiscounted_returns.append(current_undiscounted_return)
+                current_return = 0.
+                current_undiscounted_return = 0.
+                current_discount = 1.
+                observation = env_.reset()
+
+    # log
+    if returns:  # has at least one finished episode
+        mean_return = np.mean(returns)
+        mean_undiscounted_return = np.mean(undiscounted_returns)
+    else:  # the first episode is too long to finish
+        logging.warning('The first test episode is still not finished after {} timesteps. '
+                        'Logging its return anyway.'.format(num_timesteps))
+        mean_return = current_return
+        mean_undiscounted_return = current_undiscounted_return
+
+    logging.info('Mean return: {}'.format(mean_return))
+    logging.info('Mean undiscounted return: {}'.format(mean_undiscounted_return))
+
+    # clear scene
+    env_.close()
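Typical usage of the new tester, matching how the DQN example above calls it; logging must be configured at INFO level (as that example now does) for the mean returns to appear. The policy is assumed to expose act_test(observation), as pi does in the examples:

import logging
import gym
logging.basicConfig(level=logging.INFO)

env = gym.make('CartPole-v0')
# `pi` is the policy built in the example above (exposes act_test)
test_policy_in_env(pi, env, num_timesteps=1000)                     # run a fixed number of timesteps
test_policy_in_env(pi, env, num_episodes=10, discount_factor=0.99)  # or a fixed number of episodes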