import tensorflow as tf

__all__ = [
    'ppo_clip',
    'REINFORCE',
    'value_mse'
]


def ppo_clip(policy, clip_param):
    """
    Builds the graph of the clipped loss :math:`L^{CLIP}` as in the
    `PPO paper <https://arxiv.org/pdf/1707.06347.pdf>`_, which is basically
    :math:`-\min(r_t(\\theta)A_t, \mathrm{clip}(r_t(\\theta), 1 - \epsilon, 1 + \epsilon)A_t)`.
    We minimize the objective instead of maximizing, hence the leading negative sign.
    It creates an action placeholder and an advantage placeholder and adds them into the
    ``managed_placeholders`` of the ``policy``.

    :param policy: A :class:`tianshou.core.policy` to be optimized.
    :param clip_param: A float or Tensor of type float. The :math:`\epsilon` in the loss equation.

    :return: A scalar float Tensor of the loss.
    """
    action_ph = tf.placeholder(policy.action.dtype, shape=policy.action.shape,
                               name='ppo_clip_loss/action_placeholder')
    advantage_ph = tf.placeholder(tf.float32, shape=(None,), name='ppo_clip_loss/advantage_placeholder')
    policy.managed_placeholders['action'] = action_ph
    policy.managed_placeholders['advantage'] = advantage_ph

    log_pi_act = policy.action_dist.log_prob(action_ph)
    log_pi_old_act = policy.action_dist_old.log_prob(action_ph)
    # probability ratio r_t(theta) between the current policy and the old (sampling) policy
    ratio = tf.exp(log_pi_act - log_pi_old_act)
    clipped_ratio = tf.clip_by_value(ratio, 1. - clip_param, 1. + clip_param)
    # pessimistic bound: elementwise minimum of the unclipped and clipped surrogates
    ppo_clip_loss = -tf.reduce_mean(tf.minimum(ratio * advantage_ph, clipped_ratio * advantage_ph))
    return ppo_clip_loss
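

# A minimal usage sketch (not part of the library API): it assumes ``policy`` is an
# already-built tianshou.core.policy-style object exposing the attributes used by
# ppo_clip() above; the optimizer choice and hyperparameter values are arbitrary.
def _ppo_clip_train_op_sketch(policy, clip_param=0.2, learning_rate=1e-4):
    """Illustrative only: wire the clipped loss into a gradient-descent train op."""
    loss = ppo_clip(policy, clip_param)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # the placeholders registered by ppo_clip() are fed with sampled actions and
    # estimated advantages whenever this op is run in a session
    return optimizer.minimize(loss)

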
def REINFORCE(policy):
    """
    Builds the graph of the loss function as used in vanilla policy gradient algorithms, i.e., REINFORCE.
    The loss is basically :math:`-\log \pi(a|s) A_t`.
    We minimize the objective instead of maximizing, hence the leading negative sign.
    It creates an action placeholder and an advantage placeholder and adds them into the
    ``managed_placeholders`` of the ``policy``.

    :param policy: A :class:`tianshou.core.policy` to be optimized.

    :return: A scalar float Tensor of the loss.
    """
    action_ph = tf.placeholder(policy.action.dtype, shape=policy.action.shape,
                               name='REINFORCE/action_placeholder')
    advantage_ph = tf.placeholder(tf.float32, shape=(None,), name='REINFORCE/advantage_placeholder')
    policy.managed_placeholders['action'] = action_ph
    policy.managed_placeholders['advantage'] = advantage_ph

    log_pi_act = policy.action_dist.log_prob(action_ph)
    # policy gradient loss: negative advantage-weighted log-likelihood of the taken actions
    REINFORCE_loss = -tf.reduce_mean(advantage_ph * log_pi_act)
    return REINFORCE_loss
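

# A feeding sketch (illustrative; the argument names and data layout are assumptions):
# the placeholders that REINFORCE() registers in ``policy.managed_placeholders`` are
# filled with a sampled batch of actions and their estimated advantages
# (e.g., discounted returns minus a baseline) before running the training op.
def _reinforce_feed_dict_sketch(policy, batch_actions, batch_advantages):
    """Illustrative only: build the feed_dict for one REINFORCE update."""
    return {
        policy.managed_placeholders['action']: batch_actions,
        policy.managed_placeholders['advantage']: batch_advantages,
    }

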
def value_mse(value_function):
    """
    Builds the graph of the L2 loss on value functions for, e.g., training critics or DQN.
    It creates a placeholder for the target value and adds it into the ``managed_placeholders``
    of the ``value_function``.

    :param value_function: A :class:`tianshou.core.value_function` to be optimized.

    :return: A scalar float Tensor of the loss.
    """
    target_value_ph = tf.placeholder(tf.float32, shape=(None,), name='value_mse/return_placeholder')
    value_function.managed_placeholders['return'] = target_value_ph

    # regress the predicted state value towards the target value (e.g., the observed return)
    state_value = value_function.value_tensor
    return tf.losses.mean_squared_error(target_value_ph, state_value)
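

# A target-construction sketch (illustrative; this discounting scheme is an assumption,
# not the library's data pipeline): a common regression target for value_mse() is the
# discounted Monte Carlo return of each state in a finished episode.
def _discounted_returns_sketch(rewards, gamma=0.99):
    """Illustrative only: returns-to-go G_t = r_t + gamma * G_{t+1} for one episode."""
    returns, running_return = [], 0.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    return list(reversed(returns))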