towards policy/value refactor

This commit is contained in:
haoshengzou 2017-12-23 17:25:16 +08:00
parent 86bf94fde1
commit b33a141373
9 changed files with 41 additions and 42 deletions

View File

@@ -1,8 +1,6 @@
#!/usr/bin/env python
import tensorflow as tf
-import numpy as np
-import time
import gym

# our lib imports here!
@@ -10,7 +8,7 @@ import sys
sys.path.append('..')
import tianshou.core.losses as losses
from tianshou.data.replay_buffer.utils import get_replay_buffer
-import tianshou.core.policy as policy
+import tianshou.core.policy.dqn as policy


def policy_net(observation, action_dim):
@@ -41,6 +39,8 @@ if __name__ == '__main__':
    # pass the observation variable to the replay buffer or find a more reasonable way to help replay buffer
    # access this observation variable.
    observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation") # network input
+    action = tf.placeholder(dtype=tf.int32, shape=(None,)) # batch of integer actions

    with tf.variable_scope('q_net'):
        q_values = policy_net(observation, action_dim)
@@ -48,10 +48,9 @@ if __name__ == '__main__':
        q_values_target = policy_net(observation, action_dim)

    # 2. build losses, optimizers
-    q_net = policy.DQN(q_values, observation_placeholder=observation) # YongRen: policy.DQN
-    target_net = policy.DQN(q_values_target, observation_placeholder=observation)
-    action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions
+    q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action) # YongRen: policy.DQN
+    target_net = policy.DQNRefactor(q_values_target, observation_placeholder=observation, action_placeholder=action)
    target = tf.placeholder(dtype=tf.float32, shape=[None]) # target value for DQN
    dqn_loss = losses.dqn_loss(action, target, q_net) # TongzhengRen
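
For orientation only (not part of this diff): a minimal, self-contained sketch of how the refactored example wires the graph, showing that the integer action placeholder must now exist before the policy objects are built, since DQNRefactor takes it as a constructor argument. The one-hidden-layer policy_net and the Adam optimizer are illustrative assumptions, not the example's actual code.

import tensorflow as tf
import gym

import tianshou.core.losses as losses
import tianshou.core.policy.dqn as policy  # as imported in the example above


def policy_net(observation, action_dim):
    # hypothetical stand-in for the example's network builder
    net = tf.layers.dense(observation, 32, activation=tf.nn.relu)
    return tf.layers.dense(net, action_dim)


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    observation_dim = env.observation_space.shape
    action_dim = env.action_space.n

    # placeholders come first, including the action placeholder the policy now needs
    observation = tf.placeholder(tf.float32, shape=(None,) + observation_dim, name="dqn_observation")
    action = tf.placeholder(dtype=tf.int32, shape=(None,))
    target = tf.placeholder(dtype=tf.float32, shape=(None,))

    with tf.variable_scope('q_net'):
        q_values = policy_net(observation, action_dim)

    q_net = policy.DQNRefactor(q_values, observation_placeholder=observation, action_placeholder=action)
    dqn_loss = losses.dqn_loss(action, target, q_net)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(dqn_loss)  # optimizer choice is an assumption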

View File

@@ -22,3 +22,7 @@ referencing QValuePolicy in base.py, should have at least the listed methods.
TongzhengRen
seems to be direct python functions. Though the management of placeholders may require some discussion. also may write it in a functional form.
+
+# policy, value_function
+
+naming should be reconsidered. Perhaps use plural forms for all nouns

View File

@@ -35,17 +35,16 @@ def vanilla_policy_gradient(sampled_action, reward, pi, baseline="None"):
    # TODO: Different baseline methods like REINFORCE, etc.
    return vanilla_policy_gradient_loss


-def dqn_loss(sampled_action, sampled_target, q_net):
+def dqn_loss(sampled_action, sampled_target, policy):
    """
    deep q-network
    :param sampled_action: placeholder of sampled actions during the interaction with the environment
    :param sampled_target: estimated Q(s,a)
-    :param q_net: current `policy` to be optimized
+    :param policy: current `policy` to be optimized
    :return:
    """
-    action_num = q_net.values_tensor().get_shape()[1]
-    sampled_q = tf.reduce_sum(q_net.values_tensor() * tf.one_hot(sampled_action, action_num), axis=1)
+    sampled_q = policy.q_net.value_tensor
    return tf.reduce_mean(tf.square(sampled_target - sampled_q))


def deterministic_policy_gradient(sampled_state, critic):
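
Reading of the change above: the one-hot reduction that used to live in dqn_loss is now expected to happen on the value-function side, so that policy.q_net.value_tensor is already Q(s, a) for the fed actions, of shape (batchsize,). A sketch of that reduction with hypothetical names, assuming it ends up inside the DQN value function rather than the loss (this commit does not show that code):

# hypothetical placement inside the DQN value function; names are illustrative
q_all = value_tensor_all_actions                      # Q(s, *), shape (batchsize, num_actions)
action_num = q_all.get_shape()[1]
value_tensor = tf.reduce_sum(
    q_all * tf.one_hot(action_placeholder, action_num),
    axis=1)                                           # Q(s, a), shape (batchsize,)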

View File

@@ -3,19 +3,12 @@
from __future__ import absolute_import
from __future__ import division

-import warnings
import tensorflow as tf

# from zhusuan.utils import add_name_scope

-__all__ = [
-    'StochasticPolicy',
-    'QValuePolicy',
-    'PolicyBase'
-]

# TODO: a even more "base" class for policy
@@ -23,8 +16,8 @@ class PolicyBase(object):
    """
    base class for policy. only provides `act` method with exploration
    """
-    def __init__(self):
-        pass
+    def __init__(self, observation_placeholder):
+        self._observation_placeholder = observation_placeholder

    def act(self, observation, exploration):
        raise NotImplementedError()
@@ -37,14 +30,14 @@ class QValuePolicy(object):
    def __init__(self, observation_placeholder):
        self._observation_placeholder = observation_placeholder

    def act(self, observation, exploration=None): # first implement no exploration
        """
        return the action (int) to be executed.
        no exploration when exploration=None.
        """
        self._act(observation, exploration)

-    def _act(self, observation, exploration = None):
+    def _act(self, observation, exploration=None):
        raise NotImplementedError()

    def values(self, observation):
@@ -60,7 +53,6 @@ class QValuePolicy(object):
        pass


class StochasticPolicy(object):
    """
    The :class:`Distribution` class is the base class for various probabilistic
@@ -130,7 +122,7 @@ class StochasticPolicy(object):
                 param_dtype,
                 is_continuous,
                 observation_placeholder,
                 group_ndims=0, # maybe useful for repeat_action
                 **kwargs):
        self._act_dtype = act_dtype
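
With the placeholder now stored by PolicyBase, a subclass is expected to pass observation_placeholder up through super().__init__ and implement act itself. A minimal sketch with a hypothetical subclass, mirroring the pattern DQNRefactor uses in the next file:

class GreedyPolicy(PolicyBase):
    def __init__(self, action_tensor, observation_placeholder):
        self._action_tensor = action_tensor
        super(GreedyPolicy, self).__init__(observation_placeholder=observation_placeholder)

    def act(self, observation, exploration=None):
        # relies on a default session being active
        sess = tf.get_default_session()
        return sess.run(self._action_tensor, feed_dict={self._observation_placeholder: observation})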

View File

@@ -10,16 +10,25 @@ class DQNRefactor(PolicyBase):
    use DQN from value_function as a member
    """
    def __init__(self, value_tensor, observation_placeholder, action_placeholder):
-        self._network = DQN(value_tensor, observation_placeholder, action_placeholder)
+        self._q_net = DQN(value_tensor, observation_placeholder, action_placeholder)
        self._argmax_action = tf.argmax(value_tensor, axis=1)

-    def act(self, observation, exploration):
+        super(DQNRefactor, self).__init__(observation_placeholder=observation_placeholder)
+
+    def act(self, observation, exploration=None):
        sess = tf.get_default_session()
        if not exploration: # no exploration
-            action = sess.run(self._argmax_action, feed_dict={})
+            action = sess.run(self._argmax_action, feed_dict={self._observation_placeholder: observation})

-class DQN(QValuePolicy):
+        return action
+
+    @property
+    def q_net(self):
+        return self._q_net
+
+
+class DQNOld(QValuePolicy):
    """
    The policy as in DQN
    """

View File

@@ -10,12 +10,6 @@ import tensorflow as tf
from .base import StochasticPolicy

-__all__ = [
-    'OnehotCategorical',
-    'OnehotDiscrete',
-]


class OnehotCategorical(StochasticPolicy):
    """
    The class of one-hot Categorical distribution.

View File

@@ -15,7 +15,7 @@ class ActionValue(ValueFunctionBase):
            observation_placeholder=observation_placeholder
        )

-    def get_value(self, observation, action):
+    def eval_value(self, observation, action):
        """
        :param observation: numpy array of observations, of shape (batchsize, observation_dim).
        :param action: numpy array of actions, of shape (batchsize, action_dim)
@@ -24,7 +24,7 @@ class ActionValue(ValueFunctionBase):
        # TODO: dealing with the last dim of 1 in V(s) and Q(s, a)
        """
        sess = tf.get_default_session()
-        return sess.run(self.get_value_tensor(), feed_dict=
+        return sess.run(self.value_tensor, feed_dict=
                        {self._observation_placeholder: observation, self._action_placeholder: action})
@@ -50,7 +50,7 @@ class DQN(ActionValue):
            observation_placeholder=observation_placeholder,
            action_placeholder=action_placeholder)

-    def get_value_all_actions(self, observation):
+    def eval_value_all_actions(self, observation):
        """
        :param observation:
        :return: numpy array of Q(s, *) given s, of shape (batchsize, num_actions)
@@ -58,5 +58,6 @@ class DQN(ActionValue):
        sess = tf.get_default_session()
        return sess.run(self._value_tensor_all_actions, feed_dict={self._observation_placeholder: observation})

-    def get_value_tensor_all_actions(self):
+    @property
+    def value_tensor_all_actions(self):
        return self._value_tensor_all_actions
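
After the rename, the graph-side accessors are properties and the numpy-side evaluators are eval_* methods. A short sketch with hypothetical variable names (dqn_value for the DQN value function, obs_batch/act_batch for numpy batches), assuming a default session is active; per the new dqn_loss above, value_tensor is expected to already be Q(s, a) for the action placeholder:

q_selected_tensor = dqn_value.value_tensor             # tensor: Q(s, a), shape (batchsize,)
q_all_tensor = dqn_value.value_tensor_all_actions      # tensor: Q(s, *), shape (batchsize, num_actions)

q_sa = dqn_value.eval_value(obs_batch, act_batch)      # numpy, shape (batchsize,)
q_s_all = dqn_value.eval_value_all_actions(obs_batch)  # numpy, shape (batchsize, num_actions)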

View File

@@ -11,14 +11,15 @@ class ValueFunctionBase(object):
        self._observation_placeholder = observation_placeholder
        self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, )

-    def get_value(self, **kwargs):
+    def eval_value(self, **kwargs):
        """
        :return: batch of corresponding values in numpy array
        """
        raise NotImplementedError()

-    def get_value_tensor(self):
+    @property
+    def value_tensor(self):
        """
        :return: tensor of the corresponding values

View File

@@ -14,7 +14,7 @@ class StateValue(ValueFunctionBase):
            observation_placeholder=observation_placeholder
        )

-    def get_value(self, observation):
+    def eval_value(self, observation):
        """
        :param observation: numpy array of observations, of shape (batchsize, observation_dim).
@@ -22,4 +22,4 @@ class StateValue(ValueFunctionBase):
        # TODO: dealing with the last dim of 1 in V(s) and Q(s, a), this should rely on the action shape returned by env
        """
        sess = tf.get_default_session()
-        return sess.run(self.get_value_tensor(), feed_dict={self._observation_placeholder: observation})
+        return sess.run(self.value_tensor, feed_dict={self._observation_placeholder: observation})