import os
import os.path as osp
import time
from collections import deque
import joblib
import numpy as np
import tensorflow as tf
import gym
from baselines import logger
from baselines.common import explained_variance
from baselines.common.runners import AbstractEnvRunner
from baselines.common.vec_env.vec_normalize import VecNormalize
from toolsm import tools
save_env = None
load_env = None
def set_save_load_env(env):
global save_env, load_env
save_env = save_env_fn(env)
load_env = load_env_fn(env)
def save_env_fn(env):
def save_env(save_path):
if isinstance(env, VecNormalize):
env.save(save_path + '.env')
return save_env
def load_env_fn(env):
def load_env(load_path):
if isinstance(env, VecNormalize):
env.load(load_path + '.env')
return load_env
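# Note: save_env/load_env persist the VecNormalize running statistics alongside the model
# weights (written as '<save_path>.env'), so a reloaded policy sees identically normalized observations.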
import baselines.common.tf_util as U
class Model(object):
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm, cliptype, args):
sess = tf.get_default_session()
# if args.policy_type == 'MlpPolicyMy':
# additional_keys = ['hidden_sizes','num_sharing_layers','ac_fn', 'seperate_logstd']
# elif args.policy_type == 'MlpPolicy':
# additional_keys = ['seperate_logstd']
# else:
# additional_keys = []
#
# additional_args = {}
# for k in additional_keys:
# additional_args[k] = getattr( args, k)
additional_args = {}
if args.policy_type == 'MlpPolicyExt':
additional_args = dict(args=args)
# Two policy instances with shared parameters (reuse=True): act_model steps the environments
# one step at a time, train_model consumes training minibatches; they differ only in batch size.
act_model = policy(sess, ob_space, ac_space, nbatch_act, nsteps=1, reuse=False, **additional_args)
train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps=nsteps, reuse=True, **additional_args)
if args.n_eval_epsiodes > 0:
self.eval_policy = policy(sess, ob_space, ac_space, args.n_eval_epsiodes, nsteps=1, reuse=True, **additional_args)
# ------- For copy Policy
# self.eval_policy = eval_policy = policy(sess, ob_space, ac_space, args.n_eval_epsiodes, nsteps=1, reuse=False, name='policy_eval', **additional_args)
# params_eval = eval_policy.variables_trainable
# placeholders_params= [ tf.placeholder( dtype=v.dtype, shape=v.shape ) for v in params_eval ]
# self.assign_eval_policy = U.function(inputs=placeholders_params,outputs=[], updates=[tf.assign(v, v_input) for (v, v_input) in zip(params_eval, placeholders_params)])
# self.get_params_train = U.function( inputs=[], outputs=act_model.variables_trainable )
# self.get_params_eval = U.function(inputs=[], outputs=eval_policy.variables_trainable )
# ------ End------
# for p in params:
# print( p.name )
# for p in eval_policy.variables_all:
# print(p.name)
# exit()
# tools.save_vars( '/media/d/e/baselines_workingon/baselines/ppo2_AdaClip/t/a.pkl', params )
# exit()
self.OBS = OBS = train_model.X
self.ACTIONS = ACTIONS = train_model.pdtype.sample_placeholder([None])
self.RETURNS = RETURNS = tf.placeholder(tf.float32, [None])
self.NEGLOGPACS_OLD = NEGLOGPACS_OLD = tf.placeholder(tf.float32, [None])
self.VALUES_OLD = VALUES_OLD = tf.placeholder(tf.float32, [None])
self.LR = LR = tf.placeholder(tf.float32, [])
self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
self.KLRANGE = KLRANGE = tf.placeholder(tf.float32, [])
self.CLIPRANGE_LOWER = CLIPRANGE_LOWER = tf.placeholder(tf.float32, [None])
self.CLIPRANGE_UPPER = CLIPRANGE_UPPER = tf.placeholder(tf.float32, [None])
self.KL_COEF = KL_COEF = tf.placeholder(tf.float32, [])
self.RANGE = RANGE = tf.placeholder(tf.float32, [])
neglogpac = train_model.pd.neglogp(ACTIONS)
entropy = tf.reduce_mean(train_model.pd.entropy())
vpred = train_model.vf
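# Clipped value loss (as in baselines PPO2): the new value prediction is kept within CLIPRANGE
# of the old value estimate, and the larger of the clipped / unclipped squared errors is used,
# which discourages overly large value-function updates.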
vpredclipped = VALUES_OLD + tf.clip_by_value(train_model.vf - VALUES_OLD, - CLIPRANGE, CLIPRANGE)
vf_losses1 = tf.square(vpred - RETURNS)
vf_losses2 = tf.square(vpredclipped - RETURNS)
vf_loss = tf.maximum(vf_losses1, vf_losses2)
vf_loss = .5 * tf.reduce_mean(vf_loss)
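# Importance-sampling ratio r = pi_new(a|s) / pi_old(a|s), computed from the stored
# negative log-probabilities: r = exp(neglogp_old - neglogp_new).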
ratio = tf.exp(NEGLOGPACS_OLD - neglogpac)
pg_loss = None
self.ADVS = ADVS = tf.placeholder(tf.float32, [None])
new_pd = train_model.pd
flat_shape = new_pd.flatparam().shape
self.POLICYFLATS_OLD = POLICYFLATS_OLD = tf.placeholder(tf.float32, shape=flat_shape, name='old_policyflat')
old_pd = train_model.pdtype.pdfromflat(POLICYFLATS_OLD)
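# The behaviour (old) policy distribution is rebuilt from the flat parameters recorded at
# rollout time, so the exact per-sample KL to the behaviour policy (and Wasserstein / total
# variation where the distribution type provides them) can be computed; the KL is used by the
# KL-based clipping objectives below.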
kl = old_pd.kl(new_pd)
if hasattr(old_pd, 'wasserstein'):
wasserstein = old_pd.wasserstein(new_pd)
if hasattr(old_pd, 'totalvariation'):
totalvariation = old_pd.totalvariation(new_pd)
if cliptype == ClipType.ratio:
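# Standard PPO clipped surrogate:
#   L = E[ min( r * A, clip(r, 1 - eps, 1 + eps) * A ) ]
# implemented below as the mean of the element-wise maximum of the two negated terms.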
pg_losses = -ADVS * ratio
pg_losses2 = -ADVS * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.ratio_rollback:
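# Rollback variant: outside [1 - CLIPRANGE, 1 + CLIPRANGE] the clipped (flat) objective is
# replaced by a linear branch with slope `slope_rollback` (typically chosen so that the gradient
# pushes the ratio back toward the clipping region); the offset term keeps the objective
# continuous at the boundary.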
slope = args.clipargs.slope_rollback
pg_targets = tf.where(
ADVS >= 0,
tf.where( ratio <= 1 + CLIPRANGE,
ratio,
slope * ratio + (1 - slope) * (1 + CLIPRANGE) ), # at ratio = 1 + CLIPRANGE this branch also equals 1 + CLIPRANGE, so the objective stays continuous
tf.where( ratio >= 1 - CLIPRANGE,
ratio,
slope * ratio + (1 - slope) * (1 - CLIPRANGE))
) * ADVS
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.ratio_rollback_constant:
slope = args.clipargs.slope_rollback
pg_targets = tf.where(
ADVS >= 0,
tf.where( ratio <= 1 + CLIPRANGE,
ratio*ADVS,
slope * ratio ), # rollback with a constant slope; unlike ratio_rollback, no offset is added to keep the objective continuous at the boundary
tf.where( ratio >= 1 - CLIPRANGE,
ratio*ADVS,
-slope * ratio )
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.ratio_strict:
pg_losses2 = -ADVS * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
pg_loss = tf.reduce_mean(pg_losses2)
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.a2c:
pg_losses = -ADVS * ratio
pg_loss = tf.reduce_mean(pg_losses)
clipfrac = tf.constant(0)
elif cliptype == ClipType.kl:
# version by hugo
# pg_losses = ADV * ratio
# pg_losses = tf.where(kl <= KLRANGE, pg_losses, tf.where(
# tf.logical_or(tf.logical_and(ratio > 1., ADV > 0), tf.logical_and(ratio < 1., ADV < 0)),
# tf.stop_gradient(pg_losses, name='pg_losses_notrain'), pg_losses))
# clipfrac = tf.reduce_mean(tf.to_float(
# tf.logical_or(tf.logical_and(tf.greater(kl, KLRANGE), tf.logical_and(ADV > 0, ratio > 1.)),
# tf.logical_and(tf.greater(kl, KLRANGE), tf.logical_and(ADV < 0., ratio < 1.)))))
# version by siuming
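# Gradient stopping based on KL: when the per-sample KL to the behaviour policy exceeds KLRANGE
# and the update would push the ratio further in the advantage direction
# (ratio * A > A, i.e. r > 1 for A > 0 or r < 1 for A < 0), the gradient of that sample is stopped.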
pg_losses = -ADVS * ratio
pg_losses = tf.where(
tf.logical_and( kl >= KLRANGE, ratio*ADVS > 1*ADVS ),
tf.stop_gradient(pg_losses, name='pg_losses_notrain'),
pg_losses
)
pg_loss = tf.reduce_mean(pg_losses)
clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_ratiorollback:
# The slope of the objective is switched once the KL exceeds KLRANGE.
slope = args.clipargs.slope_rollback
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
slope * ratio + tf.stop_gradient((1-slope)*ratio), # The bias term is set to maintain continuity
ratio
) * ADVS
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_klrollback_constant_withratio:
# The slope of the objective is switched once the KL exceeds KLRANGE.
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
args.clipargs.slope_likelihood * ratio * ADVS + args.clipargs.slope_rollback * kl,
ratio * ADVS
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_klrollback_constant:
# The slope of the objective is switched once the KL exceeds KLRANGE.
slope = args.clipargs.slope_rollback
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
slope * kl,
ratio * ADVS
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_klrollback:
# The slope of the objective is switched once the KL exceeds KLRANGE.
slope = args.clipargs.slope_rollback
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
slope * kl * tf.abs(ADVS),
ratio * ADVS
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_strict:
pg_losses = -ADVS * ratio
pg_losses = tf.where(
kl >= KLRANGE,
tf.stop_gradient(pg_losses, name='pg_losses_notrain'),
pg_losses
)
pg_loss = tf.reduce_mean(pg_losses)
clipfrac = tf.reduce_mean(tf.to_float( kl >= KLRANGE ))
elif cliptype == ClipType.adaptivekl:
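# Adaptive-KL penalty objective: plain policy-gradient surrogate plus KL_COEF * mean KL.
# KL_COEF is the adaptive coefficient (see the update at the end of each iteration in learn())
# and is expected to be fed to train() via the kl_coef keyword.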
pg_loss = tf.reduce_mean(-ADVS * ratio) + tf.reduce_mean(kl) * KL_COEF
clipfrac = tf.constant(0.)
else:
raise NotImplementedError
# approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
approxkl = tf.reduce_mean(kl)
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
with tf.variable_scope('model'):
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if max_grad_norm is not None:
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads = list(zip(grads, params))
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
_train = trainer.apply_gradients(grads)
def train( **kwargs):
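# Keyword arguments are mapped to placeholders by name (key.upper()): e.g.
# train(lr=..., cliprange=..., obs=..., actions=..., advs=..., returns=...,
#       values_old=..., neglogpacs_old=..., policyflats_old=..., klrange=...)
# feeds self.LR, self.CLIPRANGE, self.OBS, and so on.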
feed_dict = dict()
for key, value in kwargs.items():
feed_dict.update({ getattr(self,key.upper()): value})
# td_map = {
# CLIPRANGE: cliprange,
# OBS: obs, ACTIONS: actions, ADV: advs, RETURNS: returns, LR: lr,
# NEGLOGPACS_OLD: neglogpacs, VALUES_OLD: values,
# POLICYFALTS_OLD: policyflats
# }
#
# if cliptype == ClipType.kl2clip:
# assert cliprange_lower is not None and cliprange_upper is not None
# td_map.update({CLIPRANGE_LOWER: cliprange_lower, CLIPRANGE_UPPER: cliprange_upper})
# elif cliptype == ClipType.adaptivekl:
# assert kl_coef is not None
# td_map.update({KL_COEF: kl_coef})
# TODO: train_model.S .modify to STATES
# recurrent version
# if states is not None:
# td_map[train_model.S] = states
# td_map[train_model.M] = masks
return sess.run(
[pg_loss, vf_loss, entropy, approxkl, clipfrac, kl, ratio, _train],
feed_dict
)[:-1]
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
# def restore_summary_writer(graph_dir: str) -> tf.summary.FileWriter:
# return tf.summary.FileWriter.reopen()
def save(save_path):
ps = sess.run(params)
joblib.dump(ps, save_path)
save_env(save_path)
def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
sess.run(restores)
load_env(load_path)
# If you want to load weights, also save/load observation scaling inside VecNormalize
self.train = train
self.train_model = train_model
self.act_model = act_model
# self.step = act_model.step
self.step = act_model.step_policyflat
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = save
self.load = load
tf.global_variables_initializer().run(session=sess) # pylint: disable=E1101
class Runner(AbstractEnvRunner):
def __init__(self, *, env, model, nsteps, gamma, lam):
super().__init__(env=env, model=model, nsteps=nsteps)
self.lam = lam
self.gamma = gamma
def run(self):
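# Collect nsteps transitions from the vectorized env. Besides the usual PPO quantities,
# the flattened parameters of the behaviour policy (policyflats) are stored so that exact
# KL terms can be computed during training.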
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs, mb_policyflats = [], [], [], [], [], [], []
mb_states = self.states
epinfos = []
# mb_obs_next = []
mb_obs_next= None
for _ in range(self.nsteps):
actions, values, self.states, neglogpacs, policyflats = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
mb_neglogpacs.append(neglogpacs)
mb_policyflats.append(policyflats)
mb_dones.append(self.dones)
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
# mb_obs_next.append( self.env._obs_real.copy() )
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
mb_rewards.append(rewards)
# batch of steps to batch of rollouts
# mb_obs_next = np.asarray(mb_obs_next, dtype=np.float32)
# --- End xiaoming
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
mb_actions = np.asarray(mb_actions)
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_policyflats = np.asarray(mb_policyflats, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
last_values = self.model.value(self.obs, self.states, self.dones)
# discount/bootstrap off value fn
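# Generalized Advantage Estimation (GAE), computed backwards over the rollout:
#   delta_t = r_t + gamma * V(s_{t+1}) * nextnonterminal - V(s_t)
#   A_t     = delta_t + gamma * lam * nextnonterminal * A_{t+1}
# where nextnonterminal masks out bootstrapping across episode boundaries;
# returns = advantages + values are used as the value-function targets.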
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0
for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1: # Last Timestep
nextnonterminal = 1.0 - self.dones
nextvalues = last_values
else:
nextnonterminal = 1.0 - mb_dones[t + 1]
nextvalues = mb_values[t + 1]
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, mb_policyflats)),
mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, policyflats, states, epinfos = runner.run()
def sf01(arr):
"""
Swap axes 0 and 1 and then flatten them: (nsteps, nenvs, ...) -> (nenvs * nsteps, ...).
"""
s = arr.shape
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
def constfn(val):
def f(_):
return val
return f
# gradient_rectify = 7
from baselines.ppo2_AdaClip.algs import *
from toolsm import tools
def learn(*, policy, env, env_eval, n_steps, total_timesteps, ent_coef, lr,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=1, nminibatches=4, n_opt_epochs=4,
save_interval=10, load_path=None, cliptype=None, args=None):
if isinstance(lr, float):
lr = constfn(lr)
else:
assert callable(lr)
total_timesteps = int(total_timesteps)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
nbatch = nenvs * n_steps
nbatch_train = nbatch // nminibatches
set_save_load_env(env)
make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
nbatch_train=nbatch_train,
nsteps=n_steps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, cliptype=cliptype, args=args)
if save_interval:
# only save make_model, the function to make a model with its closure/args
import cloudpickle
with open(osp.join(args.log_dir, 'make_model.pkl'), 'wb') as fh:
fh.write(cloudpickle.dumps(make_model))
model = make_model()
if load_path is not None:
model.load(load_path)
runner = Runner(env=env, model=model, nsteps=n_steps, gamma=gamma, lam=lam)
if args.n_eval_epsiodes > 0:
evaluator = Evaluator()
# tfwriter = tf.summary.FileWriter(args.log_dir)
if cliptype in [ClipType.kl2clip, ClipType.kl2clip_rollback]:
# from baselines.ppo2_AdaClip.KL2Clip.KL2Clip_opt_tf import get_clip_new
if isinstance(env.action_space, gym.spaces.box.Box):
from baselines.ppo2_AdaClip.KL2Clip_reduce_v3.KL2Clip_reduce import KL2Clip, Adjust_Type
kl2clip = KL2Clip(opt1Dkind=args.clipargs.kl2clip_opttype)
args.clipargs.adjusttype = Adjust_Type[args.clipargs.adjusttype]
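# KL2Clip converts between a KL radius (delta / klrange) and an equivalent ratio clipping range
# via delta2cliprange / cliprange2delta; the conversion depends on the action dimension and the
# chosen adjusttype.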
if '2constant' in args.clipargs.adaptive_range: # TODO: alg args
dim = ac_space.shape[0]
if args.clipargs.adjusttype == Adjust_Type.origin:
delta = args.clipargs.delta
else:
delta = kl2clip.cliprange2delta(args.clipargs.cliprange, ac_space.shape[0],
args.clipargs.adjusttype)
cliprange_upper_min = 1 + kl2clip.delta2cliprange(delta, dim=ac_space.shape[0],
adjusttype='base_clip_upper')
cliprange_lower_max = 1 - kl2clip.delta2cliprange(delta, dim=ac_space.shape[0],
adjusttype='base_clip_lower')
if args.clipargs.cliprange is None:
assert isinstance(env.action_space, gym.spaces.box.Box)
args.clipargs.cliprange = kl2clip.delta2cliprange(args.clipargs.klrange, dim=ac_space.shape[0],
adjusttype=args.clipargs.adjusttype)
tools.print_(
f'The provided cliprange is None. Set cliprange={args.clipargs.cliprange} by klrange={args.clipargs.klrange}, dim={ac_space.shape[0]}',
color='magenta')
elif isinstance(env.action_space, gym.spaces.discrete.Discrete):
# TODO: on Atari the cliprange keeps decreasing during training, so delta should also decrease accordingly
# raise NotImplementedError('Please review the code.....')
from baselines.ppo2_AdaClip.KL2Clip_discrete.KL2Clip_discrete import KL2Clip
kl2clip = KL2Clip(opt1Dkind=args.clipargs.kl2clip_opttype)
else:
raise NotImplementedError('Only Box (MuJoCo) and Discrete (Atari) action spaces are supported!')
elif cliptype == ClipType.adaptivekl:
kl_coef = 1.
kl_targ = args.clipargs.klrange
if args.envtype == MUJOCO:
cliprange = args.clipargs.cliprange
elif args.envtype == ATARI:
cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
if isinstance(cliprange, float) or cliprange is None:
cliprange = constfn(cliprange)
else:
assert callable(cliprange)
# alphas_kl2clip_decay = np.zeros(nupdates, dtype=np.float32)
# alphas_kl2clip_decay[0:nupdates // 3] = 1
# alphas_kl2clip_decay[nupdates // 3:] = np.linspace(1, -0.5, nupdates - nupdates // 3)
from toolsm.logger import Logger
logformats = ['csv','tensorflow', 'log']
if not args.is_multiprocess:
logformats.append('stdout')
else:
logger_multiprocess = Logger( ['stdout'])
logger = Logger( logformats , path=args.log_dir, file_basename='process' )
epinfobuf = deque(maxlen=100)
nupdates = total_timesteps // nbatch
performance_max = -np.inf
print(f'nupdates:{nupdates},eval_interval:{args.eval_interval}')
tstart_ini = time.time()
for update in range(1, nupdates + 1):
tstart = time.time()
assert nbatch % nminibatches == 0
debugs = dict( iteration=update )
nbatch_train = nbatch // nminibatches
frac = (update-1) * 1. / nupdates
# frac_remain = 1.0 - (update - 1.0) / nupdates
frac_remain = 1.0 - frac
lrnow = lr(frac_remain)
# ---------------- Sample data
if args.lam_decay:
runner.lam = lam - (lam - 0.5) * frac
print(f'lam decay to {runner.lam}')
# ----- explore by setting policy std
# if nbatch * update <= args.explore_timesteps:
# model.train_model.set_logstd(model.train_model.get_logstd() * 0)
# if args.explore_additive_threshold is not None and update * 1. / nupdates > args.explore_additive_threshold:
# logger.log(f'add additive explore: {args.explore_additive_rate}')
# model.train_model.set_logstd(model.train_model.get_logstd() - args.explore_additive_rate)
# with tools.timed('sample'):
obs, returns, masks, actions, values, neglogpacs, policyflats, states, epinfos = runner.run() # pylint: disable=E0632
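# Advantage estimates are recovered as returns - values and normalized to zero mean and unit
# variance, the usual PPO trick for a better-conditioned policy-gradient step.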
advs = returns - values
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
debugs['advs'] = advs
if isinstance(env.action_space, gym.spaces.Box):
epinfobuf.clear()
epinfobuf.extend(epinfos)
# ----------------- Prepare for training: update clipping range, etc.
cliprangenow = cliprange(frac_remain)
# old version (meaning unclear); TODO: figure out why Box and Discrete action spaces were handled differently
# if isinstance(env.action_space, gym.spaces.Box):
# cliprangenow = cliprange(frac)
# elif isinstance(env.action_space, gym.spaces.Discrete):
# cliprangenow = (lambda _: cliprange(None) * _)(frac)
kwargs_in_scalar = dict(lr=lrnow, cliprange=cliprangenow)
kwargs_in_arr = dict(
obs=obs, returns=returns,
actions=actions, values_old=values, neglogpacs_old=neglogpacs, advs=advs,
policyflats_old=policyflats
)
if cliptype in [ClipType.kl, ClipType.kl_ratiorollback, ClipType.kl_klrollback_constant, ClipType.kl_klrollback, ClipType.kl_strict, ClipType.kl_klrollback_constant_withratio]:
klrange = args.clipargs.klrange
if 'decay_threshold' in args.clipargs.keys():
decay_threshold = args.clipargs.decay_threshold
if frac >= decay_threshold:
coef_ = frac_remain/(1-decay_threshold)
klrange *= coef_
kwargs_in_scalar.update( klrange = klrange )
# print(kwargs_in_scalar)
# ----------------- Train the model
mblossvals = []
if states is None: # nonrecurrent version
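# Minibatch optimization: the flattened rollout of nbatch samples is reshuffled every epoch and
# split into minibatches of nbatch_train samples; each minibatch performs one Adam step.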
kls = []
ratios = []
# totalvariations = []
inds = np.arange(nbatch)
for ind_epoch in range(n_opt_epochs):
np.random.shuffle(inds)
for start in range(0, nbatch, nbatch_train):
mbinds = inds[start: start + nbatch_train]
kwargs_in_batch = dict()
for key in kwargs_in_arr.keys():
kwargs_in_batch[key] = kwargs_in_arr[key][mbinds]
*lossvals, kl, ratio = model.train(**kwargs_in_scalar,**kwargs_in_batch)
mblossvals.append(lossvals)
if ind_epoch == n_opt_epochs - 1:  # only record kl and ratio during the last optimization epoch
kls.append(kl)
ratios.append(ratio)
# totalvariations.append(totalvariation)
# --- restore the order of kls and ratios to the original order
# kls and ratios are list of kl_batch
# TODO: instead of un-shuffling here, run one final unshuffled pass; that would make it easier to add more debug variables!
inds2position = {}
for position, ind in enumerate(inds):
inds2position[ind] = position
inds_reverse = [inds2position[ind] for ind in range(len(inds))]
kls, ratios = (np.concatenate(arr, axis=0)[inds_reverse] for arr in (kls, ratios))
debugs['kls'] = kls
debugs['ratios'] = ratios
# debugs['totalvariations'] = totalvariations
# print(kls.mean(), totalvariations.mean())
else: # recurrent version
raise NotImplementedError('Not implemented!')
assert nenvs % nminibatches == 0
envsperbatch = nenvs // nminibatches
envinds = np.arange(nenvs)
flatinds = np.arange(nenvs * n_steps).reshape(nenvs, n_steps)
envsperbatch = nbatch_train // n_steps
# TODO: states: masks
for _ in range(n_opt_epochs):
np.random.shuffle(envinds)
for start in range(0, nenvs, envsperbatch):
end = start + envsperbatch
mbenvinds = envinds[start:end]
mbflatinds = flatinds[mbenvinds].ravel()
slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
mbstates = states[mbenvinds]
mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
if args.save_debug:
tools.save_vars( osp.join(args.log_dir, 'debugs.pkl'), debugs, append=update > 1 )
# -------------- Log the result
lossvals = np.mean(mblossvals, axis=0)
is_eprewmean_better = False
eprewmean_eval = None
eplenmean_eval = None
if args.n_eval_epsiodes > 0 and \
(
(args.eval_interval >0 and ( (update - 1) % args.eval_interval == 0) )
or
(args.eval_interval<0 and update == nupdates)
):
result_eval = evaluator.eval( env_eval, env, model, args, update )
eprewmean_eval = safemean([epinfo['r'] for epinfo in result_eval])
eplenmean_eval = safemean([epinfo['l'] for epinfo in result_eval])
if eprewmean_eval > performance_max:
is_eprewmean_better = True
performance_max = eprewmean_eval
if save_interval and ( (update -1 ) % save_interval == 0 or update == nupdates or is_eprewmean_better
):
savepath = osp.join(args.model_dir, '%.5i' % update)
# print('Saving to', savepath)
model.save(savepath)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, returns)
eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
eplenmean = safemean([epinfo['l'] for epinfo in epinfobuf])
timesteps = update * nbatch
tnow = time.time()
time_oneupdate = tnow - tstart
fps = int(nbatch / time_oneupdate)
if args.is_multiprocess: #and ( (update-1) % (args.eval_interval*2) == 0)
# TODO: change this to proper logging; print reward_eval
logger_multiprocess.log_keyvalues(
update=update,
env=args.env,
timesteps=timesteps,
time_oneupdate=time_oneupdate,
eprewmean=eprewmean,
eprewmean_eval=eprewmean_eval,
time = tools.time_now_str('%H:%M:%S')
)
# tools.print_( f'timesteps:{timesteps},time_oneupdate:{time_oneupdate},eprewmean:{eprewmean},eprewmean_eval:{eprewmean_eval}', color='magenta' )
logger.log_keyvalue(
global_step = timesteps,
nupdates = update,
fps = fps,
eprewmean = eprewmean,
eplenmean = eplenmean,
eprewmean_eval = eprewmean_eval,
eplenmean_eval=eplenmean_eval,
explained_variance = float(ev),
time_elapsed = tnow - tstart_ini,
time_oneupdate = time_oneupdate,
serial_timesteps =update * n_steps,
total_timesteps = timesteps,
)
for (lossval, lossname) in zip(lossvals, model.loss_names):
logger.log_keyvalue( **{lossname:lossval} )
logger.dump_keyvalues()
# if args.debug_halfcheetah and args.env_pure == 'HalfCheetah':
# if frac > 0.6 and performance_max <= 1500:
# print( tools.colorize('HalfCheetah did not achieve the threshold! Forcing it to stop!') )
# break
if cliptype == ClipType.adaptivekl:
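# Adaptive-KL coefficient update (heuristic from the PPO paper): halve the penalty coefficient
# when the mean KL is well below the target (< kl_targ / 1.5) and double it when it is well
# above (> kl_targ * 1.5).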
kl_mean = np.mean(kls)
if kl_mean < kl_targ / 1.5:
kl_coef /= 2
elif kl_mean > kl_targ * 1.5:
kl_coef *= 2
if args.model_dir is not None:
import shutil
for d in args.zip_dirs:
d_path = args[f'{d}_dir']
if len(tools.get_files( d_path ))> 2:
shutil.make_archive( base_name=d_path, format='zip', root_dir=d_path )
tools.safe_delete( d_path , confirm=False, require_not_containsub=False )
env.close()
return model
def safemean(xs):
return np.nan if len(xs) == 0 else np.mean(xs)
class Evaluator():
def __init__(self):
pass
# # ---- Clone env
# self.env_eval = env_eval
# # ----- copy pi
# self.model = model
# self.assign_policy = model.assign_policy_eval_eq_train
# self.policy = model.eval_policy
def eval(self, env_eval, env_train, model, args, update):
# from copy import deepcopy
from copy import copy
# Sync the evaluation env with the training env: for MuJoCo, copy the observation-normalization statistics (ob_rms) so evaluation uses the same normalization
if args.envtype == MUJOCO:
env_eval.ob_rms = copy(env_train.ob_rms)
else:
pass
# assign pi
# params = model.get_params_train()
# model.assign_eval_policy( *params )
eval_policy = model.eval_policy
# start process
epinfos = rollout( env_eval, eval_policy, evaluate_times=args.n_eval_epsiodes, deterministic=True, verbose=(args.envtype == ATARI) )
# print(f'update:{update}, epinfos:{epinfos}')
# rewards_epi = [ epinfo['r'] for epinfo in epinfos ]
return epinfos
def rollout(env, policy, evaluate_times=1, deterministic=True, verbose=0):
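# Roll out the (vectorized) policy until `evaluate_times` episodes have finished.
# deterministic=True uses policy.step_test (deterministic actions), otherwise the stochastic
# policy.step. Returns the per-episode info dicts, e.g. {'r': episode_return, 'l': episode_length}.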
import itertools
epinfos = []
obs = env.reset()
if verbose:
tools.warn_(f'Evaluate Policy...')
# rewards_episode = []
for t in itertools.count():
# tools.print_refresh(t)
if not deterministic:
actions, *_ = policy.step(obs)
else:
actions, *_ = policy.step_test(obs)
obs[:], reward, dones, infos = env.step(actions)
# env.render()
# cnt_dones += dones.sum()
# When an episode finishes, its info dict contains an 'episode' key, e.g.:
# {'episode': {'r': 118.048395, 'l': 64, 't': 0.67222}}
# print(infos, f't={t},done={dones}')
# if dones[0]:
# print('done')
# exit()
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo:
epinfos.append(maybeepinfo)
if len(epinfos) >= evaluate_times:
break
return epinfos