# TrulyPPO/baselines/common/distributions.py
import tensorflow as tf
import numpy as np
import baselines.common.tf_util as U
from baselines.a2c.utils import fc
from tensorflow.python.ops import math_ops
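
# Probability distributions over action spaces, plus the PdType factories that
# build them from flat parameter tensors or network latents. This is the
# TensorFlow 1.x `distributions` module from OpenAI Baselines, as adapted in
# TrulyPPO.
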
class Pd(object):
    """
    A particular probability distribution
    """
    def flatparam(self):
        raise NotImplementedError
    def mode(self):
        raise NotImplementedError
    def neglogp(self, x):
        # Usually it's easier to define the negative logprob
        raise NotImplementedError
    def kl(self, other):
        raise NotImplementedError
    def entropy(self):
        raise NotImplementedError
    def sample(self):
        raise NotImplementedError
    def logp(self, x):
        return -self.neglogp(x)
    def p(self, x):
        return tf.exp(self.logp(x))

class PdType(object):
    """
    Parametrized family of probability distributions
    """
    def pdclass(self):
        raise NotImplementedError
    def pdfromflat(self, flat):
        return self.pdclass()(flat)
    def pdfromlatent(self, latent_vector, **kwargs):
        raise NotImplementedError
    def param_shape(self):
        raise NotImplementedError
    def sample_shape(self):
        raise NotImplementedError
    def sample_dtype(self):
        raise NotImplementedError
    def param_placeholder(self, prepend_shape, name=None):
        return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name)
    def sample_placeholder(self, prepend_shape, name=None, default_value=None):
        if default_value is None:
            return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(), name=name)
        else:
            return tf.placeholder_with_default(default_value, shape=prepend_shape + self.sample_shape(), name=name)

class CategoricalPdType(PdType):
    def __init__(self, ncat):
        self.ncat = ncat
    def pdclass(self):
        return CategoricalPd
    def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0, **kwargs):
        pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
        return self.pdfromflat(pdparam), pdparam
    def param_shape(self):
        return [self.ncat]
    def sample_shape(self):
        return []
    def sample_dtype(self):
        return tf.int32

class MultiCategoricalPdType(PdType):
    def __init__(self, nvec):
        self.ncats = nvec
    def pdclass(self):
        return MultiCategoricalPd
    def pdfromflat(self, flat):
        return MultiCategoricalPd(self.ncats, flat)
    def param_shape(self):
        return [sum(self.ncats)]
    def sample_shape(self):
        return [len(self.ncats)]
    def sample_dtype(self):
        return tf.int32

class DiagGaussianPdType(PdType):
    def __init__(self, size):
        self.size = size
    def pdclass(self):
        return DiagGaussianPd
    def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0, latent_logstd=None, logstd_initial=0.):
        # TODO: Add **kwargs to other PdType
        mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
        if latent_logstd is not None:
            logstd = fc(latent_logstd, 'logstd', self.size, init_scale=0.0, init_bias=logstd_initial)
        else:
            logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.constant_initializer(logstd_initial))
        self.logstd = logstd
        # `mean * 0.0 + logstd` broadcasts the [1, size] logstd to the batch shape of `mean`.
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        return self.pdfromflat(pdparam), mean
    def param_shape(self):
        return [2 * self.size]
    def sample_shape(self):
        return [self.size]
    def sample_dtype(self):
        return tf.float32

class BetaPdType(PdType):
    def __init__(self, size):
        self.size = size
    def pdclass(self):
        return BetaPd
    # def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    #     mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
    #     logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
    #     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    #     return self.pdfromflat(pdparam), mean
    def param_shape(self):
        return [2 * self.size]
    def sample_shape(self):
        return [self.size]
    def sample_dtype(self):
        return tf.float32

class MultimodalPdType(PdType):
    def __init__(self, size):
        self.size = size
    def pdclass(self):
        return MultimodalPd
    # def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    #     mean = fc(latent_vector, 'pi', 2 * self.size, init_scale=init_scale, init_bias=init_bias)
    #     logstd = tf.get_variable(name='logstd', shape=[1, 2 * self.size], initializer=tf.zeros_initializer())
    #     alpha = tf.get_variable(name='alpha', shape=[1, self.size], initializer=tf.random_normal_initializer())
    #     pdparam = tf.concat([mean, mean * 0.0 + logstd, alpha], axis=1)
    #     return self.pdfromflat(pdparam), mean
    def param_shape(self):
        return [5 * self.size]
    def sample_shape(self):
        return [self.size]
    def sample_dtype(self):
        return tf.float32

class BernoulliPdType(PdType):
    def __init__(self, size):
        self.size = size
    def pdclass(self):
        return BernoulliPd
    def param_shape(self):
        return [self.size]
    def sample_shape(self):
        return [self.size]
    def sample_dtype(self):
        return tf.int32

# WRONG SECOND DERIVATIVES
# class CategoricalPd(Pd):
#     def __init__(self, logits):
#         self.logits = logits
#         self.ps = tf.nn.softmax(logits)
#     @classmethod
#     def fromflat(cls, flat):
#         return cls(flat)
#     def flatparam(self):
#         return self.logits
#     def mode(self):
#         return U.argmax(self.logits, axis=-1)
#     def logp(self, x):
#         return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
#     def kl(self, other):
#         return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
#             - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
#     def entropy(self):
#         return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
#     def sample(self):
#         u = tf.random_uniform(tf.shape(self.logits))
#         return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)

class CategoricalPd(Pd):
    def __init__(self, logits):
        self.logits = logits
    def flatparam(self):
        return self.logits
    def mode(self):
        return tf.argmax(self.logits, axis=-1)
    def neglogp(self, x):
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)
    def kl(self, other):
        a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
        a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
        z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
    def entropy(self):
        a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
        ea0 = tf.exp(a0)
        z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
    def sample(self, sample_shape=None):
        shape_new = tf.shape(self.logits)
        if sample_shape is not None:
            if not isinstance(sample_shape, (list, tuple)):
                sample_shape = [sample_shape]
            shape_new = tf.concat((sample_shape, shape_new), axis=0)
        # Gumbel-max trick: adding Gumbel noise -log(-log(u)) to the logits and
        # taking the argmax draws a sample from the softmax distribution.
        u = tf.random_uniform(shape_new)
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)

class MultiCategoricalPd(Pd):
    def __init__(self, nvec, flat):
        self.flat = flat
        self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1)))
    def flatparam(self):
        return self.flat
    def mode(self):
        return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
    def neglogp(self, x):
        return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))])
    def kl(self, other):
        return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)])
    def entropy(self):
        return tf.add_n([p.entropy() for p in self.categoricals])
    def sample(self):
        return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
    @classmethod
    def fromflat(cls, flat):
        raise NotImplementedError

class DiagGaussianPd(Pd):
    def __init__(self, flat, var_type='logstd'):
        self.flat = flat
        mean, var_term = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat)
        self.mean = mean
        '''
        std: standard deviation
        var: variance, var = std^2
        logstd: log(std)
        logvar: log(var) = 2 * logstd
        '''
        if var_type == 'logstd':
            logstd = var_term
            std = tf.exp(var_term)
        elif var_type == 'std':
            std = var_term
            logstd = tf.log(std)
        elif var_type == 'logvar':
            logstd = var_term / 2
            std = tf.exp(logstd)
        elif var_type == 'var':
            std = tf.sqrt(var_term)
            logstd = tf.log(std)
        else:
            raise NotImplementedError
        self.logstd = logstd
        self.std = std
    def flatparam(self):
        return self.flat
    def mode(self):
        return self.mean
    def neglogp(self, x):
        return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \
            + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
            + tf.reduce_sum(self.logstd, axis=-1)
    def kl(self, other):
        assert isinstance(other, DiagGaussianPd)
        return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
    def wasserstein(self, other):
        # Squared 2-Wasserstein distance between diagonal Gaussians.
        assert isinstance(other, DiagGaussianPd)
        return tf.reduce_sum(tf.square(self.mean - other.mean) + tf.square(self.std - other.std), axis=-1)
    def entropy(self):
        # H(N(mean, diag(std^2))) = sum(logstd) + 0.5 * k * log(2 * pi * e)
        return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
    def sample(self, sample_shape=None):
        shape_new = tf.shape(self.mean)
        if sample_shape is not None:
            if not isinstance(sample_shape, (list, tuple)):
                sample_shape = [sample_shape]
            shape_new = tf.concat((sample_shape, shape_new), axis=0)
        return self.mean + self.std * tf.random_normal(shape_new)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)

class BetaPd(Pd):
    '''
    Beta distribution whose (0, 1) support is affinely mapped to (low, high).
    `flat` concatenates the alpha and beta concentration parameters.
    '''
    def __init__(self, flat, std_type='log'):
        self.flat = flat
        alpha, beta = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat)
        self.alpha = alpha
        tfd = tf.contrib.distributions  # tf.contrib exists only in TF 1.x
        self._pd = tfd.Beta(alpha, beta)
        self.low, self.high = 0., 5.  # map the beta dist support (0, 1) to (low, high)
    def flatparam(self):
        return self.flat
    def neglogp(self, x):
        # Change of variables: the density on (low, high) is the Beta density
        # at the rescaled point, divided by (high - low).
        return -tf.log(self._pd.prob(self._inverse_map_to_support(x)) / (self.high - self.low))
    def sample(self, sample_shape=None):
        return tf.expand_dims(self._map_to_support(self._pd.sample()), 0)
    def _map_to_support(self, x):
        # map the beta dist support (0, 1) to (low, high)
        return self.low + x * (self.high - self.low)
    def _inverse_map_to_support(self, x):
        return (x - self.low) / (self.high - self.low)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)

class MultimodalPd(Pd):
    def __init__(self, flat, std_type='log'):
        self.flat = flat
        self.mean1, self.mean2, self.logstd1, self.logstd2, self.alpha = tf.split(
            axis=len(flat.shape) - 1, num_or_size_splits=5, value=flat)
        self.std1 = tf.exp(self.logstd1)
        self.std2 = tf.exp(self.logstd2)
        self.alpha = tf.sigmoid(self.alpha)
        self.alpha = tf.squeeze(self.alpha, 0)
        # size = per-dimension width of each Gaussian component (the last axis)
        self._pdtype = DiagGaussianPdType(self.mean1.shape[-1])
        self._pd1 = self._pdtype.pdfromflat(tf.concat([self.mean1, self.logstd1], axis=-1))
        self._pd2 = self._pdtype.pdfromflat(tf.concat([self.mean2, self.logstd2], axis=-1))
    def flatparam(self):
        return self.flat
    def mode(self):
        return self.mean1, self.mean2
    def neglogp(self, x):
        # Mixture density: alpha * p1(x) + (1 - alpha) * p2(x)
        return -tf.log(self.alpha * self._pd1.p(x) + (1 - self.alpha) * self._pd2.p(x))
    def sample(self, sample_shape=None):
        # Note: this returns the alpha-weighted average of one sample from each
        # component, not an exact draw from the mixture (which would first pick
        # a component at random).
        return self.alpha * self._pd1.sample() + (1 - self.alpha) * self._pd2.sample()
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)

class BernoulliPd(Pd):
    def __init__(self, logits):
        self.logits = logits
        self.ps = tf.sigmoid(logits)
    def flatparam(self):
        return self.logits
    def mode(self):
        return tf.round(self.ps)
    def neglogp(self, x):
        return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
    def kl(self, other):
        return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def entropy(self):
        return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def sample(self):
        u = tf.random_uniform(tf.shape(self.ps))
        return tf.to_float(math_ops.less(u, self.ps))
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)

def make_pdtype(ac_space):
    from gym import spaces
    if isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError

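# A minimal usage sketch (assuming `ac_space` is a gym.spaces.Box of shape (2,)):
#
#     pdtype = make_pdtype(ac_space)            # -> DiagGaussianPdType(2)
#     flat = pdtype.param_placeholder([None])   # [batch, 2 * size]: means ++ logstds
#     pd = pdtype.pdfromflat(flat)
#     ac = pd.sample()
#     neglogp = pd.neglogp(ac)
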
def make_pdtype_bandit(ac_space, policy_type):
    from gym import spaces
    if isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        if policy_type == 'Gaussian':
            return DiagGaussianPdType(ac_space.shape[0])
        elif policy_type == 'Beta':
            return BetaPdType(ac_space.shape[0])
        elif policy_type == 'Multimodal':
            return MultimodalPdType(ac_space.shape[0])
        else:
            raise NotImplementedError
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError

def shape_el(v, i):
    maybe = v.get_shape()[i]
    if maybe is not None:
        return maybe
    else:
        return tf.shape(v)[i]

@U.in_session
def test_probtypes():
    np.random.seed(0)

    pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
    diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101
    validate_probtype(diag_gauss, pdparam_diag_gauss)

    pdparam_categorical = np.array([-.2, .3, .5])
    categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101
    validate_probtype(categorical, pdparam_categorical)

    nvec = [1, 2, 3]
    pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1])
    multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101
    validate_probtype(multicategorical, pdparam_multicategorical)

    pdparam_bernoulli = np.array([-.2, .3, .5])
    bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101
    validate_probtype(bernoulli, pdparam_bernoulli)

def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean() #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    entval = calcent(Mval).mean() #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
    print('ok on', probtype, pdparam)
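
if __name__ == '__main__':
    # Run the distribution self-checks above as a script.
    test_probtypes()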