Fixed the bugs on Jan 14, which gives inferior or even no improvement; mistook group_ndims. The policy module will soon need refactoring.
parent d599506dc9
commit ed25bf7586

24 README.md
@@ -15,6 +15,30 @@ Tianshou(天授) is a reinforcement learning platform. The following image illus
Specific network architectures in original paper of DQN, TRPO, A3C, etc. Policy-Value Network of AlphaGo Zero

#### Brief intro of the current implementation

How to write your own network:

- define the observation placeholder yourself and pass it to `observation_placeholder` when initializing a policy instance
- pass a callable when initializing a policy instance (see the sketch below). The callable has to satisfy only three conditions:
    - it accepts no parameters
    - it does not create any new placeholders
    - it returns `action-related tensors, value_head`

Our lib will take care of your observation placeholder from then on, as well as
all the placeholders created by our lib itself.

The other placeholders, such as `keep_prob` in dropout and `clip_param` in the PPO loss,
should be managed by yourself (see examples/ppo_cartpole_alternative.py).
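
A minimal sketch of this workflow, mirroring examples/actor_critic_cartpole.py added in this commit. The dims, the dropout layer and the `keep_prob_ph` placeholder are illustrative assumptions, not library API:

import tensorflow as tf
import tianshou.core.policy.stochastic as policy

observation_dim, action_dim = (4,), 1  # e.g. CartPole; use your own env's dims

# define the observation placeholder yourself
observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)
# a placeholder you manage yourself, e.g. dropout keep_prob
keep_prob_ph = tf.placeholder(tf.float32, shape=())

def my_network():
    # accepts no parameters and creates no new placeholders
    net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
    net = tf.layers.dropout(net, rate=1 - keep_prob_ph)
    action_mean = tf.layers.dense(net, action_dim, activation=None)
    action_logstd = tf.get_variable('action_logstd', shape=(action_dim,))
    value = tf.layers.dense(net, 1, activation=None)
    # returns action-related tensors first, then the value head
    return action_mean, action_logstd, value

# pass the callable and the observation placeholder to the policy
pi = policy.Normal(my_network, observation_placeholder=observation_ph, weight_update=1)
# keep_prob_ph stays yours to feed, e.g. feed_dict.update({keep_prob_ph: 0.8}) at training time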

The `weight_update` parameter (see the sketch below):

- 0 means you update the target network manually
- 1 means no target network (the "target" is simply updated every minibatch)
- a value in (0, 1) means a soft-updated target network, as used in DDPG
- a value greater than 1 means a target network that is only synced periodically, as used in DQN
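
How the examples in this commit set it (taken from the example scripts below; the exact syncing behaviour is still subject to the planned refactor):

# no target network, e.g. the actor-critic examples
actor = policy.Normal(my_network, observation_placeholder=observation_ph, weight_update=1)

# manual updates, e.g. the ppo examples, which call pi.sync_weights() / pi.update_weights() themselves
pi = policy.Normal(my_policy, observation_placeholder=observation_ph, weight_update=0)

# periodically synced target network, e.g. the dqn example
pi = policy.DQN(my_policy, observation_placeholder=observation_ph, weight_update=10)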

Other comments are in the python files in examples/ and in the lib code.
A refactor is definitely needed, so don't dwell too much on annoying details...

### Algorithm

#### losses
94 examples/actor_critic_cartpole.py (executable file)
@@ -0,0 +1,94 @@
#!/usr/bin/env python
from __future__ import absolute_import

import tensorflow as tf
import time
import numpy as np

# our lib imports here! It's ok to append path in examples
import sys
sys.path.append('..')
from tianshou.core import losses
from tianshou.data.batch import Batch
import tianshou.data.advantage_estimation as advantage_estimation
import tianshou.core.policy.stochastic as policy  # TODO: fix imports as zhusuan so that only need to import to policy
import tianshou.core.value_function.state_value as value_function

from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize


# for tutorial purpose, placeholders are explicitly appended with '_ph' suffix

if __name__ == '__main__':
    env = normalize(CartpoleEnv())
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.flat_dim

    clip_param = 0.2
    num_batches = 10
    batch_size = 128

    seed = 10
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### 1. build network with pure tf
    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)

    def my_network():
        # placeholders defined in this function would be very difficult to manage
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)

        action_mean = tf.layers.dense(net, action_dim, activation=None)
        action_logstd = tf.get_variable('action_logstd', shape=(action_dim, ))
        value = tf.layers.dense(net, 1, activation=None)

        return action_mean, action_logstd, value
    # TODO: overriding seems not able to handle shared layers, unless a new class `SharedPolicyValue`
    # maybe the most desired thing is to freely build policy and value function from any tensor?
    # but for now, only the outputs of the network matters

    ### 2. build policy, critic, loss, optimizer
    actor = policy.Normal(my_network, observation_placeholder=observation_ph, weight_update=1)
    critic = value_function.StateValue(my_network, observation_placeholder=observation_ph)

    actor_loss = losses.REINFORCE(actor)
    critic_loss = losses.state_value_mse(critic)
    total_loss = actor_loss + critic_loss

    optimizer = tf.train.AdamOptimizer(1e-4)

    # this hack would be unnecessary if we have a `SharedPolicyValue` class, or hack the trainable_variables management
    var_list = list(set(actor.trainable_variables + critic.trainable_variables))

    train_op = optimizer.minimize(total_loss, var_list=var_list)

    ### 3. define data collection
    data_collector = Batch(env, actor,
                           [advantage_estimation.gae_lambda(1, critic), advantage_estimation.nstep_return(1, critic)],
                           [actor, critic])
    # TODO: refactor this, data_collector should be just the top-level abstraction

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        start_time = time.time()
        for i in range(100):
            # collect data
            data_collector.collect(num_episodes=20)

            # print current return
            print('Epoch {}:'.format(i))
            data_collector.statistics()

            # update network
            for _ in range(num_batches):
                feed_dict = data_collector.next_batch(batch_size)
                sess.run(train_op, feed_dict=feed_dict)

        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
98 examples/actor_critic_fail_cartpole.py (executable file)
@@ -0,0 +1,98 @@
#!/usr/bin/env python
from __future__ import absolute_import

import tensorflow as tf
import time
import numpy as np

# our lib imports here! It's ok to append path in examples
import sys
sys.path.append('..')
from tianshou.core import losses
from tianshou.data.batch import Batch
import tianshou.data.advantage_estimation as advantage_estimation
import tianshou.core.policy.stochastic as policy  # TODO: fix imports as zhusuan so that only need to import to policy
import tianshou.core.value_function.state_value as value_function

from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize


# for tutorial purpose, placeholders are explicitly appended with '_ph' suffix

if __name__ == '__main__':
    env = normalize(CartpoleEnv())
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.flat_dim

    clip_param = 0.2
    num_batches = 10
    batch_size = 128

    seed = 10
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### 1. build network with pure tf
    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)

    def my_actor():
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)

        action_mean = tf.layers.dense(net, action_dim, activation=None)
        action_logstd = tf.get_variable('action_logstd', shape=(action_dim, ))

        return action_mean, action_logstd, None

    def my_critic():
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)
        value = tf.layers.dense(net, 1, activation=None)

        return None, value

    ### 2. build policy, critic, loss, optimizer
    actor = policy.Normal(my_actor, observation_placeholder=observation_ph, weight_update=1)
    critic = value_function.StateValue(my_critic, observation_placeholder=observation_ph)

    print('actor and critic will share variables in this case')
    sys.exit()

    actor_loss = losses.vanilla_policy_gradient(actor)
    critic_loss = losses.state_value_mse(critic)
    total_loss = actor_loss + critic_loss

    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=actor.trainable_variables)

    ### 3. define data collection
    training_data = Batch(env, actor, advantage_estimation.full_return)

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign actor to pi_old
        actor.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
        for i in range(100):
            # collect data
            training_data.collect(num_episodes=20)

            # print current return
            print('Epoch {}:'.format(i))
            training_data.statistics()

            # update network
            for _ in range(num_batches):
                feed_dict = training_data.next_batch(batch_size)
                sess.run(train_op, feed_dict=feed_dict)

            # assigning actor to pi_old
            actor.update_weights()

        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
90 examples/actor_critic_separate_cartpole.py (executable file)
@@ -0,0 +1,90 @@
#!/usr/bin/env python
from __future__ import absolute_import

import tensorflow as tf
import time
import numpy as np

# our lib imports here! It's ok to append path in examples
import sys
sys.path.append('..')
from tianshou.core import losses
from tianshou.data.batch import Batch
import tianshou.data.advantage_estimation as advantage_estimation
import tianshou.core.policy.stochastic as policy  # TODO: fix imports as zhusuan so that only need to import to policy
import tianshou.core.value_function.state_value as value_function

from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize


# for tutorial purpose, placeholders are explicitly appended with '_ph' suffix

if __name__ == '__main__':
    env = normalize(CartpoleEnv())
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.flat_dim

    clip_param = 0.2
    num_batches = 10
    batch_size = 128

    seed = 10
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### 1. build network with pure tf
    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)

    def my_network():
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)

        action_mean = tf.layers.dense(net, action_dim, activation=None)
        action_logstd = tf.get_variable('action_logstd', shape=(action_dim, ))

        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)
        value = tf.layers.dense(net, 1, activation=None)

        return action_mean, action_logstd, value

    ### 2. build policy, critic, loss, optimizer
    actor = policy.Normal(my_network, observation_placeholder=observation_ph, weight_update=1)
    critic = value_function.StateValue(my_network, observation_placeholder=observation_ph)

    actor_loss = losses.REINFORCE(actor)
    critic_loss = losses.state_value_mse(critic)

    actor_optimizer = tf.train.AdamOptimizer(1e-4)
    actor_train_op = actor_optimizer.minimize(actor_loss, var_list=actor.trainable_variables)

    critic_optimizer = tf.train.RMSPropOptimizer(1e-4)
    critic_train_op = critic_optimizer.minimize(critic_loss, var_list=critic.trainable_variables)

    ### 3. define data collection
    data_collector = Batch(env, actor,
                           [advantage_estimation.gae_lambda(1, critic), advantage_estimation.nstep_return(1, critic)],
                           [actor, critic])

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        start_time = time.time()
        for i in range(100):
            # collect data
            data_collector.collect(num_episodes=20)

            # print current return
            print('Epoch {}:'.format(i))
            data_collector.statistics()

            # update network
            for _ in range(num_batches):
                feed_dict = data_collector.next_batch(batch_size)
                sess.run([actor_train_op, critic_train_op], feed_dict=feed_dict)

        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
95 examples/contrib_dqn_example.py (normal file)
@@ -0,0 +1,95 @@
#!/usr/bin/env python

import tensorflow as tf
import gym

# our lib imports here!
import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.data.replay_buffer.utils import get_replay_buffer
import tianshou.core.policy.dqn as policy


# THIS EXAMPLE IS NOT FINISHED YET!!!


def policy_net(observation, action_dim):
    """
    Constructs the policy network. NOT NEEDED IN THE LIBRARY! this is pure tf

    :param observation: Placeholder for the observation. A tensor of shape (bs, x, y, channels)
    :param action_dim: int. The number of actions.
    :param scope: str. Specifying the scope of the variables.
    """
    net = tf.layers.conv2d(observation, 16, 8, 4, 'valid', activation=tf.nn.relu)
    net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu)
    net = tf.layers.flatten(net)
    net = tf.layers.dense(net, 256, activation=tf.nn.relu)

    q_values = tf.layers.dense(net, action_dim)

    return q_values


if __name__ == '__main__':
    env = gym.make('PongNoFrameskip-v4')
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.n

    # 1. build network with pure tf
    # TODO:
    # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer
    # access this observation variable.
    observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation")  # network input
    action = tf.placeholder(dtype=tf.int32, shape=(None,))  # batch of integer actions

    with tf.variable_scope('q_net'):
        q_values = policy_net(observation, action_dim)
    with tf.variable_scope('target_net'):
        q_values_target = policy_net(observation, action_dim)

    # 2. build losses, optimizers
    q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action)  # YongRen: policy.DQN
    target_net = policy.DQNRefactor(q_values_target, observation_placeholder=observation, action_placeholder=action)

    target = tf.placeholder(dtype=tf.float32, shape=[None])  # target value for DQN

    dqn_loss = losses.dqn_loss(action, target, q_net)  # TongzhengRen
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES)  # TODO: better management of TRAINABLE_VARIABLES
    total_loss = dqn_loss
    optimizer = tf.train.AdamOptimizer(1e-3)
    train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step())
    # 3. define data collection
    # configuration should be given as parameters, different replay buffer has different parameters.
    replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net,
                                      {'size': 1000, 'batch_size': 64, 'learn_start': 20})
    # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN
    # maybe a dict to manage the elements to be collected

    # 4. start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        minibatch_count = 0
        collection_count = 0
        # need to first collect some then sample, collect_freq must be larger than batch_size
        collect_freq = 100
        while True:  # until some stopping criterion met...
            # collect data
            for i in range(0, collect_freq):
                replay_memory.collect()  # ShihongSong
                collection_count += 1
                print('Collected {} times.'.format(collection_count))

            # update network
            data = replay_memory.next_batch(10)  # YouQiaoben, ShihongSong
            # TODO: auto managing of the placeholders? or add this to params of data.Batch
            sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], target: data['target']})
            minibatch_count += 1
            print('Trained {} minibatches.'.format(minibatch_count))

    # TODO: assigning pi to pi_old is not implemented yet
@@ -1,95 +1,83 @@
#!/usr/bin/env python
from __future__ import absolute_import

import tensorflow as tf
import gym
import numpy as np
import time

# our lib imports here!
# our lib imports here! It's ok to append path in examples
import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.data.replay_buffer.utils import get_replay_buffer
import tianshou.core.policy.dqn as policy


# THIS EXAMPLE IS NOT FINISHED YET!!!


def policy_net(observation, action_dim):
    """
    Constructs the policy network. NOT NEEDED IN THE LIBRARY! this is pure tf

    :param observation: Placeholder for the observation. A tensor of shape (bs, x, y, channels)
    :param action_dim: int. The number of actions.
    :param scope: str. Specifying the scope of the variables.
    """
    net = tf.layers.conv2d(observation, 16, 8, 4, 'valid', activation=tf.nn.relu)
    net = tf.layers.conv2d(net, 32, 4, 2, 'valid', activation=tf.nn.relu)
    net = tf.layers.flatten(net)
    net = tf.layers.dense(net, 256, activation=tf.nn.relu)

    q_values = tf.layers.dense(net, action_dim)

    return q_values
from tianshou.core import losses
from tianshou.data.batch import Batch
import tianshou.data.advantage_estimation as advantage_estimation
import tianshou.core.policy.dqn as policy  # TODO: fix imports as zhusuan so that only need to import to policy


if __name__ == '__main__':
    env = gym.make('PongNoFrameskip-v4')
    env = gym.make('CartPole-v0')
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.n

    # 1. build network with pure tf
    # TODO:
    # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer
    # access this observation variable.
    observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation")  # network input
    action = tf.placeholder(dtype=tf.int32, shape=(None,))  # batch of integer actions
    clip_param = 0.2
    num_batches = 10
    batch_size = 512

    seed = 0
    np.random.seed(seed)
    tf.set_random_seed(seed)

    with tf.variable_scope('q_net'):
        q_values = policy_net(observation, action_dim)
    with tf.variable_scope('target_net'):
        q_values_target = policy_net(observation, action_dim)
    ### 1. build network with pure tf
    observation_ph = tf.placeholder(tf.float32, shape=(None,) + observation_dim)

    # 2. build losses, optimizers
    q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action)  # YongRen: policy.DQN
    target_net = policy.DQNRefactor(q_values_target, observation_placeholder=observation, action_placeholder=action)
    def my_policy():
        net = tf.layers.dense(observation_ph, 32, activation=tf.nn.tanh)
        net = tf.layers.dense(net, 32, activation=tf.nn.tanh)

    target = tf.placeholder(dtype=tf.float32, shape=[None])  # target value for DQN
        action_values = tf.layers.dense(net, action_dim, activation=None)

        return action_values, None  # None value head

    # TODO: current implementation of passing function or overriding function has to return a value head
    # to allow network sharing between policy and value networks. This makes 'policy' and 'value_function'
    # imbalanced semantically (though they are naturally imbalanced since 'policy' is required to interact
    # with the environment and 'value_function' is not). I have an idea to solve this imbalance, which is
    # not based on passing function or overriding function.

    ### 2. build policy, loss, optimizer
    pi = policy.DQN(my_policy, observation_placeholder=observation_ph, weight_update=10)

    dqn_loss = losses.qlearning(pi)

    dqn_loss = losses.dqn_loss(action, target, q_net)  # TongzhengRen
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES)  # TODO: better management of TRAINABLE_VARIABLES
    total_loss = dqn_loss
    optimizer = tf.train.AdamOptimizer(1e-3)
    train_op = optimizer.minimize(total_loss, var_list=train_var_list, global_step=tf.train.get_global_step())
    # 3. define data collection
    # configuration should be given as parameters, different replay buffer has different parameters.
    replay_memory = get_replay_buffer('rank_based', env, q_values, q_net, target_net,
                                      {'size': 1000, 'batch_size': 64, 'learn_start': 20})
    # ShihongSong: Replay(env, q_net, advantage_estimation.qlearning_target(target_network)), use your ReplayMemory, interact as follows. Simplify your advantage_estimation.dqn to run before YongRen's DQN
    # maybe a dict to manage the elements to be collected
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

    # 4. start training
    with tf.Session() as sess:
    ### 3. define data collection
    data_collector = Batch(env, pi, [advantage_estimation.nstep_q_return(1, pi.target_network)], [pi])

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        minibatch_count = 0
        collection_count = 0
        # need to first collect some then sample, collect_freq must be larger than batch_size
        collect_freq = 100
        while True:  # until some stopping criterion met...
        # assign actor to pi_old
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
        for i in range(100):
            # collect data
            for i in range(0, collect_freq):
                replay_memory.collect()  # ShihongSong
                collection_count += 1
                print('Collected {} times.'.format(collection_count))
            data_collector.collect(num_episodes=50)

            # print current return
            print('Epoch {}:'.format(i))
            data_collector.statistics()

            # update network
            data = replay_memory.next_batch(10)  # YouQiaoben, ShihongSong
            # TODO: auto managing of the placeholders? or add this to params of data.Batch
            sess.run(train_op, feed_dict={observation: data['observations'], action: data['actions'], target: data['target']})
            minibatch_count += 1
            print('Trained {} minibatches.'.format(minibatch_count))
            for _ in range(num_batches):
                feed_dict = data_collector.next_batch(batch_size)
                sess.run(train_op, feed_dict=feed_dict)

    # TODO: assigning pi to pi_old is not implemented yet
        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
@@ -61,7 +61,7 @@ if __name__ == '__main__':
    train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

    ### 3. define data collection
    training_data = Batch(env, pi, advantage_estimation.full_return)
    training_data = Batch(env, pi, [advantage_estimation.full_return], [pi])

    ### 4. start training
    config = tf.ConfigProto()
@@ -69,7 +69,7 @@ if __name__ == '__main__':
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign pi to pi_old
        # assign actor to pi_old
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
@@ -86,7 +86,7 @@ if __name__ == '__main__':
                feed_dict = training_data.next_batch(batch_size)
                sess.run(train_op, feed_dict=feed_dict)

            # assigning pi to pi_old
            # assigning actor to pi_old
            pi.update_weights()

        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
@@ -47,7 +47,7 @@ if __name__ == '__main__':
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.flat_dim

    clip_param = 0.2
    # clip_param = 0.2
    num_batches = 10
    batch_size = 128

@@ -65,6 +65,7 @@ if __name__ == '__main__':
    ### 2. build policy, loss, optimizer
    pi = policy.Normal(my_policy, observation_placeholder=observation_ph, weight_update=0)

    clip_param = tf.placeholder(tf.float32, shape=(), name='ppo_loss_clip_param')
    ppo_loss_clip = losses.ppo_clip(pi, clip_param)

    total_loss = ppo_loss_clip
@@ -72,7 +73,7 @@ if __name__ == '__main__':
    train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

    ### 3. define data collection
    training_data = Batch(env, pi, advantage_estimation.full_return)
    training_data = Batch(env, pi, [advantage_estimation.full_return], [pi])

    ### 4. start training
    feed_dict_train = {is_training_ph: True, keep_prob_ph: 0.8}
@@ -83,7 +84,7 @@ if __name__ == '__main__':
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign pi to pi_old
        # assign actor to pi_old
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
@@ -95,13 +96,19 @@ if __name__ == '__main__':
            print('Epoch {}:'.format(i))
            training_data.statistics()

            # manipulate decay_param
            if i < 30:
                feed_dict_train[clip_param] = 0.2
            else:
                feed_dict_train[clip_param] = 0.1

            # update network
            for _ in range(num_batches):
                feed_dict = training_data.next_batch(batch_size)
                feed_dict.update(feed_dict_train)
                sess.run(train_op, feed_dict=feed_dict)

            # assigning pi to pi_old
            # assigning actor to pi_old
            pi.update_weights()

            # approximate test mode
@@ -55,7 +55,7 @@ if __name__ == '__main__':
    train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

    ### 3. define data collection
    training_data = Batch(env, pi, advantage_estimation.full_return)
    training_data = Batch(env, pi, [advantage_estimation.full_return], [pi])

    ### 4. start training
    config = tf.ConfigProto()
@@ -63,7 +63,7 @@ if __name__ == '__main__':
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign pi to pi_old
        # assign actor to pi_old
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
@@ -80,7 +80,7 @@ if __name__ == '__main__':
                feed_dict = training_data.next_batch(batch_size)
                sess.run(train_op, feed_dict=feed_dict)

            # assigning pi to pi_old
            # assigning actor to pi_old
            pi.update_weights()

        print('Elapsed time: {:.1f} min'.format((time.time() - start_time) / 60))
15 internal_keys.md (normal file)
@@ -0,0 +1,15 @@
network.managed_placeholders.keys()

data_collector.raw_data.keys()

data_collector.data.keys()

['observation']

['action']

['reward']

['start_flag']

['advantage'] > ['return']  # they may appear simultaneously
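
A rough illustration of how these keys line up, based on the losses and Batch code later in this commit (the literal dict contents here are assumptions for illustration only):

# placeholders registered on a network by the loss / value-function constructors
network.managed_placeholders  # e.g. {'observation': obs_ph, 'action': action_ph, 'advantage': advantage_ph}

# raw trajectories stored by the data collector
data_collector.raw_data       # {'observation': ..., 'action': ..., 'reward': ..., 'end_flag': ...}

# quantities produced by the reward processors
data_collector.data           # {'advantage': ...} and/or {'return': ...}

# next_batch() matches each managed placeholder key against raw_data first, then data,
# to build the feed_dict automatically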
@@ -14,7 +14,7 @@ def ppo_clip(policy, clip_param):
    action_ph = tf.placeholder(policy.act_dtype, shape=(None,) + policy.action_shape, name='ppo_clip_loss/action_placeholder')
    advantage_ph = tf.placeholder(tf.float32, shape=(None,), name='ppo_clip_loss/advantage_placeholder')
    policy.managed_placeholders['action'] = action_ph
    policy.managed_placeholders['processed_reward'] = advantage_ph
    policy.managed_placeholders['advantage'] = advantage_ph

    log_pi_act = policy.log_prob(action_ph)
    log_pi_old_act = policy.log_prob_old(action_ph)
@@ -24,7 +24,7 @@ def ppo_clip(policy, clip_param):
    return ppo_clip_loss


def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"):
def REINFORCE(policy):
    """
    vanilla policy gradient

@@ -34,10 +34,29 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"):
    :param baseline: the baseline method used to reduce the variance, default is 'None'
    :return:
    """
    log_pi_act = pi.log_prob(sampled_action)
    vanilla_policy_gradient_loss = tf.reduce_mean(reward * log_pi_act)
    # TODO: Different baseline methods like REINFORCE, etc.
    return vanilla_policy_gradient_loss
    action_ph = tf.placeholder(policy.act_dtype, shape=(None,) + policy.action_shape,
                               name='REINFORCE/action_placeholder')
    advantage_ph = tf.placeholder(tf.float32, shape=(None,), name='REINFORCE/advantage_placeholder')
    policy.managed_placeholders['action'] = action_ph
    policy.managed_placeholders['advantage'] = advantage_ph

    log_pi_act = policy.log_prob(action_ph)
    REINFORCE_loss = -tf.reduce_mean(advantage_ph * log_pi_act)
    return REINFORCE_loss


def state_value_mse(state_value_function):
    """
    L2 loss of state value
    :param state_value_function: instance of StateValue
    :return: tensor of the mse loss
    """
    state_value_ph = tf.placeholder(tf.float32, shape=(None,), name='state_value_mse/state_value_placeholder')
    state_value_function.managed_placeholders['return'] = state_value_ph

    state_value = state_value_function.value_tensor
    return tf.losses.mean_squared_error(state_value_ph, state_value)


def dqn_loss(sampled_action, sampled_target, policy):
    """
@@ -44,7 +44,8 @@ class OnehotCategorical(StochasticPolicy):
        self.weight_update = weight_update
        self.interaction_count = -1  # defaults to -1. only useful if weight_update > 1.

        with tf.variable_scope('network'):
        # build network, action and value
        with tf.variable_scope('network', reuse=tf.AUTO_REUSE):
            logits, value_head = policy_callable()
            self._logits = tf.convert_to_tensor(logits, dtype=tf.float32)
            self._action = tf.multinomial(self._logits, num_samples=1)
@@ -55,11 +56,12 @@ class OnehotCategorical(StochasticPolicy):

        self.trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # deal with target network
        if self.weight_update == 1:
            self.weight_update_ops = None
            self.sync_weights_ops = None
        else:  # then we need to build another tf graph as target network
            with tf.variable_scope('net_old'):
            with tf.variable_scope('net_old', reuse=tf.AUTO_REUSE):
                logits, value_head = policy_callable()
                self._logits_old = tf.convert_to_tensor(logits, dtype=tf.float32)

@@ -173,7 +175,8 @@ class Normal(StochasticPolicy):
        self.weight_update = weight_update
        self.interaction_count = -1  # defaults to -1. only useful if weight_update > 1.

        with tf.variable_scope('network'):
        # build network, action and value
        with tf.variable_scope('network', reuse=tf.AUTO_REUSE):
            mean, logstd, value_head = policy_callable()
            self._mean = tf.convert_to_tensor(mean, dtype=tf.float32)
            self._logstd = tf.convert_to_tensor(logstd, dtype=tf.float32)
@@ -188,11 +191,12 @@ class Normal(StochasticPolicy):

        self.trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # deal with target network
        if self.weight_update == 1:
            self.weight_update_ops = None
            self.sync_weights_ops = None
        else:  # then we need to build another tf graph as target network
            with tf.variable_scope('net_old'):
            with tf.variable_scope('net_old', reuse=tf.AUTO_REUSE):
                mean, logstd, value_head = policy_callable()
                self._mean_old = tf.convert_to_tensor(mean, dtype=tf.float32)
                self._logstd_old = tf.convert_to_tensor(logstd, dtype=tf.float32)
@@ -8,7 +8,12 @@ class StateValue(ValueFunctionBase):
    """
    class of state values V(s).
    """
    def __init__(self, value_tensor, observation_placeholder):
    def __init__(self, policy_callable, observation_placeholder):
        self.managed_placeholders = {'observation': observation_placeholder}
        with tf.variable_scope('network', reuse=tf.AUTO_REUSE):
            value_tensor = policy_callable()[-1]
        self.trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        super(StateValue, self).__init__(
            value_tensor=value_tensor,
            observation_placeholder=observation_placeholder
@@ -6,15 +6,13 @@ def full_return(raw_data):
    naively compute full return
    :param raw_data: dict of specified keys and values.
    """
    observations = raw_data['observations']
    actions = raw_data['actions']
    rewards = raw_data['rewards']
    episode_start_flags = raw_data['episode_start_flags']
    observations = raw_data['observation']
    actions = raw_data['action']
    rewards = raw_data['reward']
    episode_start_flags = raw_data['end_flag']
    num_timesteps = rewards.shape[0]

    data = {}
    data['observations'] = observations
    data['actions'] = actions

    returns = rewards.copy()
    episode_start_idx = 0
@@ -33,11 +31,39 @@ def full_return(raw_data):

            episode_start_idx = i

    data['returns'] = returns
    data['return'] = returns

    return data


class gae_lambda:
    """
    Generalized Advantage Estimation (Schulman, 15) to compute advantage
    """
    def __init__(self, T, value_function):
        self.T = T
        self.value_function = value_function

    def __call__(self, raw_data):
        reward = raw_data['reward']

        return {'advantage': reward}


class nstep_return:
    """
    compute the n-step return from n-step rewards and bootstrapped value function
    """
    def __init__(self, n, value_function):
        self.n = n
        self.value_function = value_function

    def __call__(self, raw_data):
        reward = raw_data['reward']

        return {'return': reward}


class QLearningTarget:
    def __init__(self, policy, gamma):
        self._policy = policy
@@ -68,3 +94,4 @@ class QLearningTarget:
        data['rewards'] = np.array(rewards)

        return data
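
Note that the gae_lambda and nstep_return classes above are still stubs that pass the raw reward through. A rough standalone sketch of what the full computations might look like, assuming a NumPy `values` array of bootstrapped V(s_t) with one extra final entry and a single episode without termination handling; this is not the committed implementation:

import numpy as np

def gae_advantage(rewards, values, gamma=0.99, lambda_=0.95):
    # values[t] ~ V(s_t), with len(values) == len(rewards) + 1 (last entry is the bootstrap)
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # TD residual
        gae = delta + gamma * lambda_ * gae
        advantages[t] = gae
    return advantages

def nstep_return_targets(rewards, values, n=1, gamma=0.99):
    # n-step return: n discounted rewards plus a bootstrapped value at step t + n
    T = len(rewards)
    targets = np.zeros(T, dtype=np.float32)
    for t in range(T):
        end = min(t + n, T)
        ret = sum(gamma ** (k - t) * rewards[k] for k in range(t, end))
        ret += gamma ** (end - t) * values[end]  # values[T] should be 0 for a terminal state
        targets[t] = ret
    return targets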
@@ -1,6 +1,7 @@
import numpy as np
import gc

import logging
from . import utils

# TODO: Refactor with tf.train.slice_input_producer, tf.train.Coordinator, tf.train.QueueRunner
class Batch(object):
@@ -8,14 +9,31 @@ class Batch(object):
    class for batch datasets. Collect multiple observations (actions, rewards, etc.) on-policy.
    """

    def __init__(self, env, pi, advantage_estimation_function):  # how to name the function?
    def __init__(self, env, pi, reward_processors, networks):  # how to name the function?
        """
        constructor
        :param env:
        :param pi:
        :param reward_processors: list of functions to process reward
        :param networks: list of networks to be optimized, so as to match data in feed_dict
        """
        self._env = env
        self._pi = pi
        self._advantage_estimation_function = advantage_estimation_function
        self.raw_data = {}
        self.data = {}

        self.reward_processors = reward_processors
        self.networks = networks

        self.required_placeholders = {}
        for net in self.networks:
            self.required_placeholders.update(net.managed_placeholders)
        self.require_advantage = 'advantage' in self.required_placeholders.keys()

        self._is_first_collect = True

    def collect(self, num_timesteps=0, num_episodes=0, my_feed_dict={},
                apply_function=True):  # specify how many data to collect here, or fix it in __init__()
                process_reward=True):  # specify how many data to collect here, or fix it in __init__()
        assert sum(
            [num_timesteps > 0, num_episodes > 0]) == 1, "One and only one collection number specification permitted!"

@@ -98,6 +116,7 @@ class Batch(object):
                        break

                if done:  # end of episode, discard s_T
                    # TODO: for num_timesteps collection, has to store terminal flag instead of start flag!
                    break
                else:
                    observations.append(ob)
@@ -113,33 +132,48 @@ class Batch(object):
            del rewards
            del episode_start_flags

            self.raw_data = {'observations': self.observations, 'actions': self.actions, 'rewards': self.rewards,
                             'episode_start_flags': self.episode_start_flags}
            self.raw_data = {'observation': self.observations, 'action': self.actions, 'reward': self.rewards,
                             'end_flag': self.episode_start_flags}

            self._is_first_collect = False

        if apply_function:
        if process_reward:
            self.apply_advantage_estimation_function()

        gc.collect()

    def apply_advantage_estimation_function(self):
        self.data = self._advantage_estimation_function(self.raw_data)
        for processor in self.reward_processors:
            self.data.update(processor(self.raw_data))

    def next_batch(self, batch_size, standardize_advantage=True):  # YouQiaoben: referencing other iterate over batches
        rand_idx = np.random.choice(self.data['observations'].shape[0], batch_size)
        current_batch = {key: value[rand_idx] for key, value in self.data.items()}

        if standardize_advantage:
            advantage_mean = np.mean(current_batch['returns'])
            advantage_std = np.std(current_batch['returns'])
            current_batch['returns'] = (current_batch['returns'] - advantage_mean) / advantage_std
    def next_batch(self, batch_size, standardize_advantage=True):
        rand_idx = np.random.choice(self.raw_data['observation'].shape[0], batch_size)

        feed_dict = {}
        feed_dict[self._pi.managed_placeholders['observation']] = current_batch['observations']
        feed_dict[self._pi.managed_placeholders['action']] = current_batch['actions']
        feed_dict[self._pi.managed_placeholders['processed_reward']] = current_batch['returns']
        # TODO: should use the keys in pi.managed_placeholders to find values in self.data and self.raw_data
        for key, placeholder in self.required_placeholders.items():
            found, data_key = utils.internal_key_match(key, self.raw_data.keys())
            if found:
                feed_dict[placeholder] = self.raw_data[data_key][rand_idx]
            else:
                found, data_key = utils.internal_key_match(key, self.data.keys())
                if found:
                    feed_dict[placeholder] = self.data[data_key][rand_idx]

            if not found:
                raise TypeError('Placeholder {} has no value to feed!'.format(str(placeholder.name)))

        if standardize_advantage:
            if self.require_advantage:
                advantage_value = feed_dict[self.required_placeholders['advantage']]
                advantage_mean = np.mean(advantage_value)
                advantage_std = np.std(advantage_value)
                if advantage_std < 1e-3:
                    logging.warning('advantage_std too small (< 1e-3) for advantage standardization. may cause numerical issues')
                feed_dict[self.required_placeholders['advantage']] = (advantage_value - advantage_mean) / advantage_std

        # TODO: maybe move all advantage estimation functions to tf, as in tensorforce (though haven't
        # understood tensorforce after reading) maybe tf.stop_gradient for targets/advantages
        # this will simplify data collector as it only needs to collect raw data, (s, a, r, done) only

        return feed_dict

@@ -149,8 +183,8 @@ class Batch(object):
        compute the statistics of the current sampled paths
        :return:
        """
        rewards = self.raw_data['rewards']
        episode_start_flags = self.raw_data['episode_start_flags']
        rewards = self.raw_data['reward']
        episode_start_flags = self.raw_data['end_flag']
        num_timesteps = rewards.shape[0]

        returns = []