From 2a3bc3ef35e95bfb686a30c33a5ff815010a592f Mon Sep 17 00:00:00 2001
From: haoshengzou
Date: Thu, 12 Apr 2018 21:10:50 +0800
Subject: [PATCH] part of API doc

---
 examples/dqn.py                        |  2 +-
 tianshou/core/policy/base.py           | 42 +++++++++++++--
 tianshou/core/policy/deterministic.py  | 73 ++++++++++++++++++++++----
 tianshou/core/policy/distributional.py | 41 +++++++++++++--
 tianshou/core/policy/dqn.py            | 46 +++++++++++++++-
 tianshou/core/value_function/base.py   | 18 ++++---
 6 files changed, 194 insertions(+), 28 deletions(-)

diff --git a/examples/dqn.py b/examples/dqn.py
index 42fa5db..39fc688 100644
--- a/examples/dqn.py
+++ b/examples/dqn.py
@@ -68,7 +68,7 @@ if __name__ == '__main__':
     test_interval = 5000
     target_network_update_interval = 800
 
-    seed = 0
+    seed = 123
     np.random.seed(seed)
     tf.set_random_seed(seed)
 
diff --git a/tianshou/core/policy/base.py b/tianshou/core/policy/base.py
index 86653fb..42e8f30 100644
--- a/tianshou/core/policy/base.py
+++ b/tianshou/core/policy/base.py
@@ -6,14 +6,50 @@ import tensorflow as tf
 
 class PolicyBase(object):
     """
-    base class for policy. only provides `act` method with exploration
+    Base class for policy. Mandatory methods for a policy class are:
+
+    - :func:`act`. It's used when interacting with the environment during training, \
+    so exploration noise should be added in this method.
+
+    - :func:`act_test`. Since RL usually adds additional exploration noise during training, a different method \
+    for testing the policy should be defined, with a different exploration specification. \
+    For example, DQN uses a different :math:`\epsilon` in :math:`\epsilon`-greedy and \
+    DDPG removes exploration noise during test.
+
+    - :func:`reset`. It's mainly used to reset the states of the exploration random process, or any other \
+    internal states of the policy that should be reset at the beginning of each new episode. Otherwise, this \
+    method does nothing.
     """
     def act(self, observation, my_feed_dict):
+        """
+        Return action given observation, when interacting with the environment during training.
+
+        :param observation: An array-like with rank the same as a single observation of the environment.
+            Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension
+            of "batch_size" to the first dimension.
+        :param my_feed_dict: A dict. Specifies placeholders other than observation, such as dropout and batch_norm.
+
+        :return: A numpy array. Action given the single observation. Its "batch_size" is 1,
+            but should not be explicitly set.
+        """
         raise NotImplementedError()
 
+    def act_test(self, observation, my_feed_dict):
+        """
+        Return action given observation, when interacting with the environment during test.
+
+        :param observation: An array-like with rank the same as a single observation of the environment.
+            Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension
+            of "batch_size" to the first dimension.
+        :param my_feed_dict: A dict. Specifies placeholders other than observation, such as dropout and batch_norm.
+
+        :return: A numpy array. Action given the single observation. Its "batch_size" is 1,
+            but should not be explicitly set.
+        """
+        raise NotImplementedError()
+
     def reset(self):
         """
-        for temporal correlated random process exploration, as in DDPG
-        :return:
+        Reset the internal states of the policy. Does nothing by default.
""" pass diff --git a/tianshou/core/policy/deterministic.py b/tianshou/core/policy/deterministic.py index c71caf0..793fcac 100644 --- a/tianshou/core/policy/deterministic.py +++ b/tianshou/core/policy/deterministic.py @@ -8,7 +8,18 @@ from ..utils import identify_dependent_variables class Deterministic(PolicyBase): """ - deterministic policy as used in deterministic policy gradient (DDPG) methods + Deterministic policy as used in deterministic policy gradient (DDPG) methods. It can only be used with + continuous action space. The output of the policy network is directly the action. + + :param network_callable: A Python callable returning (action head, value head). When called it builds the tf graph and returns a Tensor + of the action on the action head. + :param observation_placeholder: A :class:`tf.placeholder`. The observation placeholder of the network graph. + :param has_old_net: A bool defaulting to ``False``. If true this class will create another graph with another + set of :class:`tf.Variable` s to be the "old net". The "old net" could be the target networks as in DQN + and DDPG, or just an old net to help optimization as in PPO. + :param random_process: Optional. A :class:`RandomProcess`. The additional random process for exploration. + Defaults to an :class:`OrnsteinUhlenbeckProcess` with :math:`\\theta=0.15` and :math:`\sigma=0.3` if not + set explicitly. """ def __init__(self, network_callable, observation_placeholder, has_old_net=False, random_process=None): self.observation_placeholder = observation_placeholder @@ -54,9 +65,25 @@ class Deterministic(PolicyBase): @property def trainable_variables(self): + """ + The trainable variables of the policy in a Python **set**. It contains only the :class:`tf.Variable` s + that affect the action. + """ return set(self._trainable_variables) def act(self, observation, my_feed_dict={}): + """ + Return action given observation, adding the exploration noise sampled from ``self.random_process``. + + :param observation: An array-like with rank the same as a single observation of the environment. + Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension + of "batch_size" to the first dimension. + :param my_feed_dict: Optional. A dict defaulting to empty. + Specifies placeholders such as dropout and batch_norm except observation. + + :return: A numpy array. + Action given the single observation. Its "batch_size" is 1, but should not be explicitly set. + """ sess = tf.get_default_session() # observation[None] adds one dimension at the beginning @@ -69,9 +96,24 @@ class Deterministic(PolicyBase): return sampled_action def reset(self): + """ + Reset the internal states of ``self.random_process``. + """ self.random_process.reset_states() def act_test(self, observation, my_feed_dict={}): + """ + Return action given observation, removing the exploration noise. + + :param observation: An array-like with rank the same as a single observation of the environment. + Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension + of "batch_size" to the first dimension. + :param my_feed_dict: Optional. A dict defaulting to empty. + Specifies placeholders such as dropout and batch_norm except observation. + + :return: A numpy array. + Action given the single observation. Its "batch_size" is 1, but should not be explicitly set. 
+ """ sess = tf.get_default_session() # observation[None] adds one dimension at the beginning @@ -85,18 +127,22 @@ class Deterministic(PolicyBase): def sync_weights(self): """ - sync the weights of network_old. Direct copy the weights of network. - :return: + Sync the variables of the "old net" to be the same as the current network. """ if self.sync_weights_ops is not None: sess = tf.get_default_session() sess.run(self.sync_weights_ops) - def eval_action(self, observation): + def eval_action(self, observation, my_feed_dict={}): """ - evaluate action in minibatch - :param observation: - :return: 2-D numpy array + Evaluate action in minibatch using the current network. + + :param observation: An array-like. Contrary to :func:`act` and :func:`act_test`, it has the dimension + of batch_size. + :param my_feed_dict: Optional. A dict defaulting to empty. + Specifies placeholders such as dropout and batch_norm except observation. + + :return: A numpy array with the batch_size dimension and same batch_size as ``observation``. """ sess = tf.get_default_session() @@ -105,11 +151,16 @@ class Deterministic(PolicyBase): return action - def eval_action_old(self, observation): + def eval_action_old(self, observation, my_feed_dict={}): """ - evaluate action in minibatch - :param observation: - :return: 2-D numpy array + Evaluate action in minibatch using the old net. + + :param observation: An array-like. Contrary to :func:`act` and :func:`act_test`, it has the dimension + of batch_size. + :param my_feed_dict: Optional. A dict defaulting to empty. + Specifies placeholders such as dropout and batch_norm except observation. + + :return: A numpy array with the batch_size dimension and same batch_size as ``observation``. """ sess = tf.get_default_session() diff --git a/tianshou/core/policy/distributional.py b/tianshou/core/policy/distributional.py index 10eba03..dc2dcc5 100644 --- a/tianshou/core/policy/distributional.py +++ b/tianshou/core/policy/distributional.py @@ -6,7 +6,15 @@ from ..utils import identify_dependent_variables class Distributional(PolicyBase): """ - policy class where action is specified by a probability distribution + Policy class where action is specified by a probability distribution. Depending on the distribution, + it can be applied to both continuous and discrete action spaces. + + :param network_callable: A Python callable returning (action head, value head). When called it builds the tf graph and returns a + :class:`tf.distributions.Distribution` on the action space on the action head. + :param observation_placeholder: A :class:`tf.placeholder`. The observation placeholder of the network graph. + :param has_old_net: A bool defaulting to ``False``. If true this class will create another graph with another + set of :class:`tf.Variable` s to be the "old net". The "old net" could be the target networks as in DQN + and DDPG, or just an old net to help optimization as in PPO. """ def __init__(self, network_callable, observation_placeholder, has_old_net=False): self.observation_placeholder = observation_placeholder @@ -50,9 +58,25 @@ class Distributional(PolicyBase): @property def trainable_variables(self): + """ + The trainable variables of the policy in a Python **set**. It contains only the :class:`tf.Variable` s + that affect the action. + """ return set(self._trainable_variables) def act(self, observation, my_feed_dict={}): + """ + Return action given observation, directly sampling from the action distribution. 
+
+        :param observation: An array-like with rank the same as a single observation of the environment.
+            Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension
+            of "batch_size" to the first dimension.
+        :param my_feed_dict: Optional. A dict defaulting to empty.
+            Specifies placeholders other than observation, such as dropout and batch_norm.
+
+        :return: A numpy array.
+            Action given the single observation. Its "batch_size" is 1, but should not be explicitly set.
+        """
         sess = tf.get_default_session()
 
         # observation[None] adds one dimension at the beginning
@@ -64,12 +88,23 @@
         return sampled_action
 
     def act_test(self, observation, my_feed_dict={}):
+        """
+        Return action given observation, directly sampling from the action distribution; identical to :func:`act`.
+
+        :param observation: An array-like with rank the same as a single observation of the environment.
+            Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension
+            of "batch_size" to the first dimension.
+        :param my_feed_dict: Optional. A dict defaulting to empty.
+            Specifies placeholders other than observation, such as dropout and batch_norm.
+
+        :return: A numpy array.
+            Action given the single observation. Its "batch_size" is 1, but should not be explicitly set.
+        """
         return self.act(observation, my_feed_dict)
 
     def sync_weights(self):
         """
-        sync the weights of network_old. Direct copy the weights of network.
-        :return:
+        Sync the variables of the "old net" to be the same as the current network.
         """
         if self.sync_weights_ops is not None:
             sess = tf.get_default_session()
diff --git a/tianshou/core/policy/dqn.py b/tianshou/core/policy/dqn.py
index 66e5609..70f4a56 100644
--- a/tianshou/core/policy/dqn.py
+++ b/tianshou/core/policy/dqn.py
@@ -8,6 +8,16 @@ import numpy as np
 
 class DQN(PolicyBase):
     """
     use DQN from value_function as a member
+
+    Policy derived from a Deep Q-Network (DQN). It should be constructed from a :class:`tianshou.core.value_function.DQN`.
+    Action is the argmax of the Q-values (usually with further :math:`\epsilon`-greedy).
+    It can only be applied to discrete action spaces.
+
+    :param dqn: A :class:`tianshou.core.value_function.DQN`. The Q-value network from which to derive this policy.
+    :param epsilon_train: A float in range :math:`[0, 1]`. The :math:`\epsilon` used in :math:`\epsilon`-greedy
+        during training while interacting with the environment.
+    :param epsilon_test: A float in range :math:`[0, 1]`. The :math:`\epsilon` used in :math:`\epsilon`-greedy
+        during test while interacting with the environment.
     """
     def __init__(self, dqn, epsilon_train=0.1, epsilon_test=0.05):
         self.action_value = dqn
@@ -17,6 +27,18 @@ class DQN(PolicyBase):
         self.epsilon_test = epsilon_test
 
     def act(self, observation, my_feed_dict={}):
+        """
+        Return action given observation, with :math:`\epsilon`-greedy using ``self.epsilon_train``.
+
+        :param observation: An array-like with rank the same as a single observation of the environment.
+            Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension
+            of "batch_size" to the first dimension.
+        :param my_feed_dict: Optional. A dict defaulting to empty.
+            Specifies placeholders other than observation, such as dropout and batch_norm.
+
+        :return: A numpy array.
+            Action given the single observation. Its "batch_size" is 1, but should not be explicitly set.
+ """ sess = tf.get_default_session() feed_dict = {self.action_value.observation_placeholder: observation[None]} @@ -30,6 +52,18 @@ class DQN(PolicyBase): return np.squeeze(action) def act_test(self, observation, my_feed_dict={}): + """ + Return action given observation, with :math:`\epsilon`-greedy using ``self.epsilon_test``. + + :param observation: An array-like with rank the same as a single observation of the environment. + Its "batch_size" is 1, but should not be explicitly set. This method will add the dimension + of "batch_size" to the first dimension. + :param my_feed_dict: Optional. A dict defaulting to empty. + Specifies placeholders such as dropout and batch_norm except observation. + + :return: A numpy array. + Action given the single observation. Its "batch_size" is 1, but should not be explicitly set. + """ sess = tf.get_default_session() feed_dict = {self.action_value.observation_placeholder: observation[None]} @@ -44,18 +78,26 @@ class DQN(PolicyBase): @property def q_net(self): + """The DQN (:class:`tianshou.core.value_function.DQN`) this policy based on.""" return self.action_value def sync_weights(self): """ - sync the weights of network_old. Direct copy the weights of network. - :return: + Sync the variables of the "old net" to be the same as the current network. """ if self.action_value.sync_weights_ops is not None: self.action_value.sync_weights() def set_epsilon_train(self, epsilon): + """ + Set the :math:`\epsilon` in :math:`\epsilon`-greedy during training. + :param epsilon: A float in range :math:`[0, 1]`. + """ self.epsilon_train = epsilon def set_epsilon_test(self, epsilon): + """ + Set the :math:`\epsilon` in :math:`\epsilon`-greedy during training. + :param epsilon: A float in range :math:`[0, 1]`. + """ self.epsilon_test = epsilon diff --git a/tianshou/core/value_function/base.py b/tianshou/core/value_function/base.py index a10d006..7c4ce88 100644 --- a/tianshou/core/value_function/base.py +++ b/tianshou/core/value_function/base.py @@ -4,23 +4,25 @@ import tensorflow as tf class ValueFunctionBase(object): """ - base class of value functions. Children include state values V(s) and action values Q(s, a) + Base class for value functions, including S-values and Q-values. The only + mandatory method for a value function class is: + + :func:`eval_value`, which runs the graph and evaluates the corresponding value. + + :param value_tensor: a Tensor. The tensor of V(s) or Q(s, a). + :param observation_placeholder: a :class:`tf.placeholder`. The observation placeholder of the network graph. """ def __init__(self, value_tensor, observation_placeholder): self.observation_placeholder = observation_placeholder - self._value_tensor = tf.squeeze(value_tensor) # canonical values has shape (batchsize, ) + self._value_tensor = tf.squeeze(value_tensor) # canonical value has shape (batchsize, ) def eval_value(self, **kwargs): """ - - :return: batch of corresponding values in numpy array + Runs the graph and evaluates the corresponding value. """ raise NotImplementedError() @property def value_tensor(self): - """ - - :return: tensor of the corresponding values - """ + """Tensor of the corresponding value""" return self._value_tensor