Tianshou/tianshou/core/losses.py

import tensorflow as tf


def ppo_clip(policy, clip_param):
    """
    The clip loss in the PPO paper, Eqn. (7). Placeholders for the sampled
    actions and the estimated advantages are created internally and
    registered in ``policy.managed_placeholders``.

    :param policy: current `policy` to be optimized; ``policy.log_prob_old``
        gives log-probabilities under the old policy.
    :param clip_param: float or Tensor of type float, the clipping range.
    :return: tensor of the PPO clip loss.
    """
    action_ph = tf.placeholder(policy.act_dtype, shape=(None,) + policy.action_shape,
                               name='ppo_clip_loss/action_placeholder')
    advantage_ph = tf.placeholder(tf.float32, shape=(None,), name='ppo_clip_loss/advantage_placeholder')
    policy.managed_placeholders['action'] = action_ph
    policy.managed_placeholders['advantage'] = advantage_ph

    # probability ratio pi(a|s) / pi_old(a|s), computed in log space for numerical stability
    log_pi_act = policy.log_prob(action_ph)
    log_pi_old_act = policy.log_prob_old(action_ph)
    ratio = tf.exp(log_pi_act - log_pi_old_act)
    clipped_ratio = tf.clip_by_value(ratio, 1. - clip_param, 1. + clip_param)
    ppo_clip_loss = -tf.reduce_mean(tf.minimum(ratio * advantage_ph, clipped_ratio * advantage_ph))
    return ppo_clip_loss
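

# Illustrative usage sketch, not part of the library API: wires the PPO clip
# loss above to an optimizer. The `policy` object is assumed to follow this
# module's interface (`log_prob`, `log_prob_old`, `managed_placeholders`);
# the learning rate is an arbitrary choice.
def _example_ppo_clip_usage(policy):
    loss = ppo_clip(policy, clip_param=0.2)  # 0.2 as used in the PPO paper
    return tf.train.AdamOptimizer(1e-4).minimize(loss)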


def REINFORCE(policy):
    """
    Vanilla policy gradient loss. Placeholders for the sampled actions and
    the estimated advantages are created internally and registered in
    ``policy.managed_placeholders``; the advantage placeholder plays the
    role of reward minus baseline.

    :param policy: current `policy` to be optimized.
    :return: tensor of the REINFORCE loss, the negative advantage-weighted
        log-likelihood of the sampled actions.
    """
    action_ph = tf.placeholder(policy.act_dtype, shape=(None,) + policy.action_shape,
                               name='REINFORCE/action_placeholder')
    advantage_ph = tf.placeholder(tf.float32, shape=(None,), name='REINFORCE/advantage_placeholder')
    policy.managed_placeholders['action'] = action_ph
    policy.managed_placeholders['advantage'] = advantage_ph
    log_pi_act = policy.log_prob(action_ph)
    REINFORCE_loss = -tf.reduce_mean(advantage_ph * log_pi_act)
    return REINFORCE_loss
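

# Illustrative usage sketch, not part of the library API: the REINFORCE loss
# is the negative advantage-weighted log-likelihood, so plain gradient
# descent on it performs vanilla policy-gradient ascent on expected return.
# The learning rate is an arbitrary choice.
def _example_reinforce_usage(policy):
    loss = REINFORCE(policy)
    return tf.train.GradientDescentOptimizer(1e-3).minimize(loss)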


def state_value_mse(state_value_function):
    """
    L2 loss of the state value.

    :param state_value_function: instance of StateValue.
    :return: tensor of the MSE loss.
    """
    state_value_ph = tf.placeholder(tf.float32, shape=(None,), name='state_value_mse/state_value_placeholder')
    state_value_function.managed_placeholders['return'] = state_value_ph
    state_value = state_value_function.value_tensor
    return tf.losses.mean_squared_error(state_value_ph, state_value)
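

# Illustrative usage sketch, not part of the library API: regresses the
# critic toward empirical returns fed through the 'return' placeholder
# registered above. The value-function object and learning rate are
# assumptions.
def _example_state_value_mse_usage(state_value_function):
    loss = state_value_mse(state_value_function)
    return tf.train.AdamOptimizer(1e-3).minimize(loss)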


def dqn_loss(sampled_action, sampled_target, policy):
    """
    Deep Q-network TD loss.

    :param sampled_action: placeholder of sampled actions during the
        interaction with the environment.
    :param sampled_target: placeholder of the TD target for Q(s, a).
    :param policy: current `policy` to be optimized.
    :return: tensor of the mean squared TD error.
    """
    # `value_tensor` is assumed to already give Q(s, a) at the sampled
    # actions; `sampled_action` is kept for interface consistency.
    sampled_q = policy.q_net.value_tensor
    return tf.reduce_mean(tf.square(sampled_target - sampled_q))
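

# Illustrative usage sketch, not part of the library API: builds the TD loss
# against externally computed targets. The placeholder shapes and dtypes are
# assumptions for a discrete-action Q-network.
def _example_dqn_loss_usage(policy):
    action_ph = tf.placeholder(tf.int32, shape=(None,), name='example/sampled_action')
    target_ph = tf.placeholder(tf.float32, shape=(None,), name='example/q_target')
    loss = dqn_loss(action_ph, target_ph, policy)
    return tf.train.AdamOptimizer(1e-3).minimize(loss)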


def deterministic_policy_gradient(sampled_state, critic):
    """
    Deterministic policy gradient objective.

    :param sampled_state: placeholder of sampled states during the
        interaction with the environment.
    :param critic: current `value` function, giving Q(s, mu(s)).
    :return: tensor of the mean critic value, i.e. the objective the actor
        should maximize (negate it to use as a loss).
    """
    return tf.reduce_mean(critic.get_value(sampled_state))
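

# Illustrative usage sketch, not part of the library API: a DDPG-style actor
# update. Since the function above returns the mean critic value, i.e. an
# objective to maximize, the optimizer minimizes its negation. The critic
# interface and learning rate are assumptions.
def _example_dpg_usage(sampled_state, critic):
    objective = deterministic_policy_gradient(sampled_state, critic)
    return tf.train.AdamOptimizer(1e-4).minimize(-objective)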