implement dqn loss and dpg loss, add TODO for separate actor and critic

rtz19970824 2017-12-15 14:24:08 +08:00
parent 039c8140e2
commit e5bf7a9270
5 changed files with 35 additions and 12 deletions

View File

@@ -53,7 +53,7 @@ if __name__ == '__main__':
     action = tf.placeholder(dtype=tf.int32, shape=[None])  # batch of integer actions
     target = tf.placeholder(dtype=tf.float32, shape=[None])  # target value for DQN
-    dqn_loss = losses.dqn_loss(action, target, pi)  # TongzhengRen
+    dqn_loss = losses.dqn_loss(action, target, q_net)  # TongzhengRen
     total_loss = dqn_loss
     optimizer = tf.train.AdamOptimizer(1e-3)
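The `target` placeholder above is typically fed with the one-step bootstrapped Q-learning return computed outside the graph. A minimal sketch, assuming a separate (frozen) target network supplies `q_target_values`; all names below are illustrative, not part of this commit:

```python
import numpy as np

def build_dqn_target(rewards, dones, q_target_values, gamma=0.99):
    # Q-learning target: r + gamma * max_a' Q_target(s', a'), cut off at episode end.
    # `q_target_values` has shape [batch, action_num] and would come from a frozen target network.
    return rewards + gamma * (1.0 - dones) * np.max(q_target_values, axis=1)
```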

View File

@@ -1,3 +1,7 @@
+# TODO:
+Separate actor and critic. (Important: we need to focus on this soon.)
+
 # policy
 YongRen

View File

@@ -26,7 +26,7 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"):
     :param sampled_action: placeholder of sampled actions during interaction with the environment
     :param reward: placeholder of reward the 'sampled_action' get
-    :param pi: current 'policy' to be optimized
+    :param pi: current `policy` to be optimized
     :param baseline: the baseline method used to reduce the variance, default is 'None'
     :return:
     """
@@ -35,8 +35,25 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"):
     # TODO: Different baseline methods like REINFORCE, etc.
     return vanilla_policy_gradient_loss
 
-def temporal_difference_loss():
-    pass
-
-def deterministic_policy_gradient():
-    pass
+def dqn_loss(sampled_action, sampled_target, q_net):
+    """
+    deep Q-network loss
+    :param sampled_action: placeholder of sampled actions during the interaction with the environment
+    :param sampled_target: estimated Q(s, a), i.e. the regression target
+    :param q_net: current Q-value network to be optimized
+    :return: mean squared error between the sampled Q-values and the targets
+    """
+    action_num = q_net.get_values().shape[1]
+    sampled_q = tf.reduce_sum(q_net.get_values() * tf.one_hot(sampled_action, action_num), axis=1)
+    return tf.reduce_mean(tf.square(sampled_target - sampled_q))
+
+
+def deterministic_policy_gradient(sampled_state, critic):
+    """
+    deterministic policy gradient
+    :param sampled_state: placeholder of sampled states during the interaction with the environment
+    :param critic: current `value` function (critic) evaluating the actor's action
+    :return: mean critic value, to be maximized with respect to the actor's parameters
+    """
+    return tf.reduce_mean(critic.get_value(sampled_state))
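To make the shape manipulation in `dqn_loss` concrete, here is a self-contained sketch of the same computation with a plain Q-value tensor standing in for `q_net.get_values()`; the 4-action space and the placeholder names are assumptions for illustration:

```python
import tensorflow as tf

q_values = tf.placeholder(tf.float32, shape=[None, 4])     # Q(s, .) over 4 actions (stand-in for q_net.get_values())
sampled_action = tf.placeholder(tf.int32, shape=[None])    # actions actually taken
sampled_target = tf.placeholder(tf.float32, shape=[None])  # bootstrapped targets

# Pick Q(s, a) of the taken action via a one-hot mask, then regress it onto the target.
sampled_q = tf.reduce_sum(q_values * tf.one_hot(sampled_action, 4), axis=1)
dqn_loss = tf.reduce_mean(tf.square(sampled_target - sampled_q))
train_op = tf.train.AdamOptimizer(1e-3).minimize(dqn_loss)
```

Note that `deterministic_policy_gradient` returns a value the actor should maximize, so a training op would minimize its negation and restrict the update to the actor's variables; that is one reason the TODO asks for a clean actor/critic separation.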

View File

@@ -14,12 +14,14 @@ __all__ = [
     'StochasticPolicy',
 ]
 
+#TODO: separate actor and critic, we should focus on it once we finish the basic module.
+
 class QValuePolicy(object):
     """
     The policy as in DQN
     """
-    def __init__(self, value_tensor):
-        pass
+    def __init__(self, observation_placeholder):
+        self.observation_placeholder = observation_placeholder
 
     def act(self, observation, exploration=None):  # first implement no exploration
         """
@@ -222,7 +224,3 @@ class StochasticPolicy(object):
         Private method for subclasses to rewrite the :meth:`prob` method.
         """
         raise NotImplementedError()
-
-
-class QValuePolicy(object):
-    pass
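For the `act(self, observation, exploration=None)` stub above ("first implement no exploration"), a greedy implementation might look like the following sketch. The `self.session` and `self.values` attributes are assumptions, since this commit only stores the observation placeholder:

```python
import numpy as np

def act(self, observation, exploration=None):
    # Evaluate Q(observation, .) and take the argmax action; no exploration yet,
    # as the inline comment suggests. `self.values` and `self.session` are hypothetical.
    q = self.session.run(self.values,
                         feed_dict={self.observation_placeholder: observation[None]})
    return int(np.argmax(q, axis=1)[0])
```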

View File

@@ -1,3 +1,7 @@
+# TODO:
+Notice that we will separate actor and critic, and batch will collect data for optimizing policy while replay will collect data for optimizing critic.
+
 # Batch
 YouQiaoben
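A minimal sketch of the replay side described in this TODO; the class name and API are assumptions, and the on-policy `Batch` would instead discard its data after each policy update:

```python
import random
from collections import deque

class ReplayBuffer:
    """Hypothetical off-policy transition store used for critic updates."""
    def __init__(self, capacity=100000):
        self.storage = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample past transitions; the critic can learn off-policy from them.
        return random.sample(self.storage, batch_size)
```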