import os
import gym
import numpy as np
import tensorflow as tf
from gym import spaces
from collections import deque

def sample(logits):
    # Gumbel-max trick: draws one sample per row from the categorical
    # distribution defined by the (unnormalized) logits.
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)

def cat_entropy(logits):
    # Entropy of a categorical distribution given logits, computed in a
    # numerically stable way (max subtracted before exponentiation).
    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)

def cat_entropy_softmax(p0):
    # Entropy when the input is already a probability distribution (softmax output).
    return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis=1)

def mse(pred, target):
    # Elementwise half squared error (not reduced to a mean, despite the name).
    return tf.square(pred - target) / 2.

def ortho_init(scale=1.0):
    def _ortho_init(shape, dtype, partition_info=None):
        #lasagne ortho init for tf
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4: # assumes NHWC
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v # pick the one with the correct shape
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init

def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False):
    # 2-D convolution with orthogonally initialized weights: nf filters of size rf x rf.
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride, stride, 1]
        bshape = [1, 1, 1, nf]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride, stride]
        bshape = [1, nf, 1, 1]
    else:
        raise NotImplementedError
    bias_var_shape = [nf] if one_dim_bias else [1, nf, 1, 1]
    nin = x.get_shape()[channel_ax].value
    wshape = [rf, rf, nin, nf]
    with tf.variable_scope(scope):
        w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
        b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0))
        if not one_dim_bias and data_format == 'NHWC':
            b = tf.reshape(b, bshape)
        return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)

def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
    # Fully connected layer with nh output units.
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
        return tf.matmul(x, w) + b

def batch_to_seq(h, nbatch, nsteps, flat=False):
    # Splits a flat batch of nbatch*nsteps rows into a list of nsteps tensors,
    # one per timestep, each of shape [nbatch, ...].
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]

def seq_to_batch(h, flat=False):
    # Inverse of batch_to_seq: stacks a list of per-timestep tensors back into
    # a single [nbatch*nsteps, ...] batch.
    shape = h[0].get_shape().as_list()
    if not flat:
        assert(len(shape) > 1)
        nh = h[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    else:
        return tf.reshape(tf.stack(values=h, axis=1), [-1])

def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    # Standard LSTM over a list of per-timestep inputs xs; ms are done masks
    # used to reset the recurrent state at episode boundaries.
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(c)
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s

def _ln(x, g, b, e=1e-5, axes=[1]):
    # Layer normalization with learned gain g and bias b.
    u, s = tf.nn.moments(x, axes=axes, keep_dims=True)
    x = (x-u)/tf.sqrt(s+e)
    x = x*g+b
    return x

def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
    # LSTM variant with layer normalization applied to the input, recurrent,
    # and cell activations.
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
        bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0))

        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0))
        bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0))

        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

        gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0))
        bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(_ln(c, gc, bc))
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s

def conv_to_fc(x):
    # Flattens a convolutional feature map to [batch, features].
    nh = np.prod([v.value for v in x.get_shape()[1:]])
    x = tf.reshape(x, [-1, nh])
    return x

def discount_with_dones(rewards, dones, gamma):
    # Discounted returns over a trajectory; the running return is reset where
    # done=1 so rewards do not bleed across episode boundaries.
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done) # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]

def find_trainable_variables(key):
    # Returns the trainable variables created under the given scope. Merely
    # entering a variable_scope does not filter tf.trainable_variables(), so
    # the collection is queried with an explicit scope instead.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)

def make_path(f):
    return os.makedirs(f, exist_ok=True)

# Learning-rate schedules: each maps training progress p in [0, 1] to a
# multiplier applied to the initial value.
def constant(p):
    return 1

def linear(p):
    return 1-p

def middle_drop(p):
    eps = 0.75
    if 1-p < eps:
        return eps*0.1
    return 1-p

def double_linear_con(p):
    p *= 2
    eps = 0.125
    if 1-p < eps:
        return eps
    return 1-p

def double_middle_drop(p):
    eps1 = 0.75
    eps2 = 0.25
    if 1-p < eps1:
        if 1-p < eps2:
            return eps2*0.5
        return eps1*0.1
    return 1-p

schedules = {
    'linear': linear,
    'constant': constant,
    'double_linear_con': double_linear_con,
    'middle_drop': middle_drop,
    'double_middle_drop': double_middle_drop
}

class Scheduler(object):
    # Anneals an initial value v over nvalues steps using one of the schedules above.

    def __init__(self, v, nvalues, schedule):
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        current_value = self.v*self.schedule(self.n/self.nvalues)
        self.n += 1.
        return current_value

    def value_steps(self, steps):
        return self.v*self.schedule(steps/self.nvalues)

class EpisodeStats:
    def __init__(self, nsteps, nenvs):
        self.episode_rewards = []
        for i in range(nenvs):
            self.episode_rewards.append([])
        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
        self.nsteps = nsteps
        self.nenvs = nenvs

    def feed(self, rewards, masks):
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for i in range(0, self.nenvs):
            for j in range(0, self.nsteps):
                self.episode_rewards[i].append(rewards[i][j])
                if masks[i][j]:
                    l = len(self.episode_rewards[i])
                    s = sum(self.episode_rewards[i])
                    self.lenbuffer.append(l)
                    self.rewbuffer.append(s)
                    self.episode_rewards[i] = []

    def mean_length(self):
        if self.lenbuffer:
            return np.mean(self.lenbuffer)
        else:
            return 0  # on the first params dump, no episodes are finished

    def mean_reward(self):
        if self.rewbuffer:
            return np.mean(self.rewbuffer)
        else:
            return 0

# For ACER
def get_by_index(x, idx):
    assert(len(x.get_shape()) == 2)
    assert(len(idx.get_shape()) == 1)
    idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
    y = tf.gather(tf.reshape(x, [-1]),  # flatten input
                  idx_flattened)  # use flattened indices
    return y

def check_shape(ts, shapes):
    i = 0
    for (t, shape) in zip(ts, shapes):
        assert t.get_shape().as_list() == shape, "id " + str(i) + " shape " + str(t.get_shape()) + str(shape)
        i += 1

def avg_norm(t):
    # Mean L2 norm of t along its last axis.
    return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))

def gradient_add(g1, g2, param):
    # Sums two gradients for the same parameter; either (but not both) may be None.
    print([g1, g2, param.name])
    assert (not (g1 is None and g2 is None)), param.name
    if g1 is None:
        return g2
    elif g2 is None:
        return g1
    else:
        return g1 + g2

def q_explained_variance(qpred, q):
    # Explained variance of the Q-value predictions: 1 - Var[q - qpred] / Var[q].
    _, vary = tf.nn.moments(q, axes=[0, 1])
    _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
    check_shape([vary, varpred], [[]] * 2)
    return 1.0 - (varpred / vary)
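

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module; the values are
# arbitrary examples): exercises the pure-Python helpers defined above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Discounted returns that stop bootstrapping at an episode boundary.
    print(discount_with_dones([1.0, 1.0, 1.0], [0, 0, 1], gamma=0.9))
    # -> approximately [2.71, 1.9, 1.0]

    # Linearly annealed value over 4 calls.
    lr = Scheduler(v=7e-4, nvalues=4, schedule='linear')
    print([lr.value() for _ in range(4)])
    # -> approximately [0.0007, 0.000525, 0.00035, 0.000175]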