import os
import os.path as osp
import time
from collections import deque

import joblib
import numpy as np
import tensorflow as tf
import gym

from baselines import logger
from baselines.common import explained_variance
from baselines.common.runners import AbstractEnvRunner
from baselines.common.vec_env.vec_normalize import VecNormalize
import baselines.common.tf_util as U
from toolsm import tools

save_env = None
load_env = None


def set_save_load_env(env):
    global save_env, load_env
    save_env = save_env_fn(env)
    load_env = load_env_fn(env)


def save_env_fn(env):
    def save_env(save_path):
        if isinstance(env, VecNormalize):
            env.save(save_path + '.env')
    return save_env


def load_env_fn(env):
    def load_env(load_path):
        if isinstance(env, VecNormalize):
            env.load(load_path + '.env')
    return load_env


class Model(object):
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, cliptype, args):
        sess = tf.get_default_session()

        # if args.policy_type == 'MlpPolicyMy':
        #     additional_keys = ['hidden_sizes', 'num_sharing_layers', 'ac_fn', 'seperate_logstd']
        # elif args.policy_type == 'MlpPolicy':
        #     additional_keys = ['seperate_logstd']
        # else:
        #     additional_keys = []
        # additional_args = {}
        # for k in additional_keys:
        #     additional_args[k] = getattr(args, k)
        additional_args = {}
        if args.policy_type == 'MlpPolicyExt':
            additional_args = dict(args=args)

        # The two models share parameters and differ only in batch size:
        # act_model steps the environment, train_model consumes training minibatches.
        act_model = policy(sess, ob_space, ac_space, nbatch_act, nsteps=1, reuse=False, **additional_args)
        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps=nsteps, reuse=True, **additional_args)
        if args.n_eval_epsiodes > 0:
            self.eval_policy = policy(sess, ob_space, ac_space, args.n_eval_epsiodes, nsteps=1, reuse=True,
                                      **additional_args)

        # ------- For copying the policy into a separate evaluation network (unused) -------
        # self.eval_policy = eval_policy = policy(sess, ob_space, ac_space, args.n_eval_epsiodes, nsteps=1,
        #                                         reuse=False, name='policy_eval', **additional_args)
        # params_eval = eval_policy.variables_trainable
        # placeholders_params = [tf.placeholder(dtype=v.dtype, shape=v.shape) for v in params_eval]
        # self.assign_eval_policy = U.function(
        #     inputs=placeholders_params, outputs=[],
        #     updates=[tf.assign(v, v_input) for (v, v_input) in zip(params_eval, placeholders_params)])
        # self.get_params_train = U.function(inputs=[], outputs=act_model.variables_trainable)
        # self.get_params_eval = U.function(inputs=[], outputs=eval_policy.variables_trainable)
        # ------ End ------
        # for p in params:
        #     print(p.name)
        # for p in eval_policy.variables_all:
        #     print(p.name)
        # exit()
        # tools.save_vars('/media/d/e/baselines_workingon/baselines/ppo2_AdaClip/t/a.pkl', params)
        # exit()

        # Placeholders. The upper-case attribute names matter: train(**kwargs) builds its
        # feed_dict via getattr(self, key.upper()).
        self.OBS = OBS = train_model.X
        self.ACTIONS = ACTIONS = train_model.pdtype.sample_placeholder([None])
        self.RETURNS = RETURNS = tf.placeholder(tf.float32, [None])
        self.NEGLOGPACS_OLD = NEGLOGPACS_OLD = tf.placeholder(tf.float32, [None])
        self.VALUES_OLD = VALUES_OLD = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
        self.KLRANGE = KLRANGE = tf.placeholder(tf.float32, [])
        self.CLIPRANGE_LOWER = CLIPRANGE_LOWER = tf.placeholder(tf.float32, [None])
        self.CLIPRANGE_UPPER = CLIPRANGE_UPPER = tf.placeholder(tf.float32, [None])
        self.KL_COEF = KL_COEF = tf.placeholder(tf.float32, [])
        self.RANGE = RANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(ACTIONS)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
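        # Clipped value loss (standard PPO2): vpredclipped keeps the new value prediction within
        # +/- CLIPRANGE of the value recorded at rollout time, and the element-wise maximum of the
        # clipped and unclipped squared errors is minimised, penalising large value updates.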
        vpredclipped = VALUES_OLD + tf.clip_by_value(train_model.vf - VALUES_OLD, -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - RETURNS)
        vf_losses2 = tf.square(vpredclipped - RETURNS)
        vf_loss = tf.maximum(vf_losses1, vf_losses2)
        vf_loss = .5 * tf.reduce_mean(vf_loss)

        # Probability ratio r = pi_new(a|s) / pi_old(a|s), computed from the stored negative log-probs.
        ratio = tf.exp(NEGLOGPACS_OLD - neglogpac)
        pg_loss = None

        self.ADVS = ADVS = tf.placeholder(tf.float32, [None])

        new_pd = train_model.pd
        flat_shape = new_pd.flatparam().shape
        self.POLICYFLATS_OLD = POLICYFLATS_OLD = tf.placeholder(tf.float32, shape=flat_shape, name='old_policyflat')
        old_pd = train_model.pdtype.pdfromflat(POLICYFLATS_OLD)
        kl = old_pd.kl(new_pd)
        if hasattr(old_pd, 'wasserstein'):
            wasserstein = old_pd.wasserstein(new_pd)
        if hasattr(old_pd, 'totalvariation'):
            totalvariation = old_pd.totalvariation(new_pd)

        if cliptype == ClipType.ratio:
            # Standard PPO clipped surrogate objective.
            pg_losses = -ADVS * ratio
            pg_losses2 = -ADVS * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
            clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        elif cliptype == ClipType.ratio_rollback:
            slope = args.clipargs.slope_rollback
            pg_targets = tf.where(
                ADVS >= 0,
                # At ratio = 1 + CLIPRANGE the rollback branch also evaluates to 1 + CLIPRANGE,
                # which keeps the objective continuous at the switching point.
                tf.where(ratio <= 1 + CLIPRANGE, ratio,
                         slope * ratio + (1 - slope) * (1 + CLIPRANGE)),
                tf.where(ratio >= 1 - CLIPRANGE, ratio,
                         slope * ratio + (1 - slope) * (1 - CLIPRANGE))
            ) * ADVS
            pg_loss = -tf.reduce_mean(pg_targets)
            clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        elif cliptype == ClipType.ratio_rollback_constant:
            slope = args.clipargs.slope_rollback
            pg_targets = tf.where(
                ADVS >= 0,
                tf.where(ratio <= 1 + CLIPRANGE, ratio * ADVS, slope * ratio),
                tf.where(ratio >= 1 - CLIPRANGE, ratio * ADVS, -slope * ratio)
            )
            pg_loss = -tf.reduce_mean(pg_targets)
            clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        elif cliptype == ClipType.ratio_strict:
            pg_losses2 = -ADVS * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
            pg_loss = tf.reduce_mean(pg_losses2)
            clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        elif cliptype == ClipType.a2c:
            pg_losses = -ADVS * ratio
            pg_loss = tf.reduce_mean(pg_losses)
            clipfrac = tf.constant(0.)
        elif cliptype == ClipType.kl:
            # version by hugo
            # pg_losses = ADV * ratio
            # pg_losses = tf.where(kl <= KLRANGE, pg_losses, tf.where(
            #     tf.logical_or(tf.logical_and(ratio > 1., ADV > 0), tf.logical_and(ratio < 1., ADV < 0)),
            #     tf.stop_gradient(pg_losses, name='pg_losses_notrain'), pg_losses))
            # clipfrac = tf.reduce_mean(tf.to_float(
            #     tf.logical_or(tf.logical_and(tf.greater(kl, KLRANGE), tf.logical_and(ADV > 0, ratio > 1.)),
            #                   tf.logical_and(tf.greater(kl, KLRANGE), tf.logical_and(ADV < 0., ratio < 1.)))))
            # version by siuming
            pg_losses = -ADVS * ratio
            pg_losses = tf.where(
                tf.logical_and(kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
                tf.stop_gradient(pg_losses, name='pg_losses_notrain'),
                pg_losses
            )
            pg_loss = tf.reduce_mean(pg_losses)
            clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio * ADVS > ADVS)))
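        # The *_rollback variants below differ from plain clipping in that, once a sample has left
        # the trust region (KL above KLRANGE while the ratio still improves the surrogate), the
        # objective takes on a negative slope (slope_rollback), so such samples actively push the
        # policy back instead of merely receiving zero gradient.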
        elif cliptype == ClipType.kl_ratiorollback:
            # The slope of the objective is switched once the KL exceeds KLRANGE.
            slope = args.clipargs.slope_rollback
            # version by hugo
            # pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
            #                      tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
            #                               tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
            # version by siuming
            pg_targets = tf.where(
                tf.logical_and(kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
                # The stop_gradient bias term keeps the objective continuous at the switching point.
                slope * ratio + tf.stop_gradient((1 - slope) * ratio),
                ratio
            ) * ADVS
            pg_loss = -tf.reduce_mean(pg_targets)
            clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio * ADVS > ADVS)))
        elif cliptype == ClipType.kl_klrollback_constant_withratio:
            # The slope of the objective is switched once the KL exceeds KLRANGE.
            # version by hugo
            # pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
            #                      tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
            #                               tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
            # version by siuming
            pg_targets = tf.where(
                tf.logical_and(kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
                args.clipargs.slope_likelihood * ratio * ADVS + args.clipargs.slope_rollback * kl,
                ratio * ADVS
            )
            pg_loss = -tf.reduce_mean(pg_targets)
            clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio * ADVS > ADVS)))
        elif cliptype == ClipType.kl_klrollback_constant:
            # The slope of the objective is switched once the KL exceeds KLRANGE.
            slope = args.clipargs.slope_rollback
            # version by hugo: same commented-out formulation as in the kl_ratiorollback branch above.
            # version by siuming
            pg_targets = tf.where(
                tf.logical_and(kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
                slope * kl,
                ratio * ADVS
            )
            pg_loss = -tf.reduce_mean(pg_targets)
            clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio * ADVS > ADVS)))
        elif cliptype == ClipType.kl_klrollback:
            # The slope of the objective is switched once the KL exceeds KLRANGE.
            slope = args.clipargs.slope_rollback
            # version by hugo: same commented-out formulation as in the kl_ratiorollback branch above.
            # version by siuming
            pg_targets = tf.where(
                tf.logical_and(kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
                slope * kl * tf.abs(ADVS),
                ratio * ADVS
            )
            pg_loss = -tf.reduce_mean(pg_targets)
            clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio * ADVS > ADVS)))
        elif cliptype == ClipType.kl_strict:
            pg_losses = -ADVS * ratio
            pg_losses = tf.where(
                kl >= KLRANGE,
                tf.stop_gradient(pg_losses, name='pg_losses_notrain'),
                pg_losses
            )
            pg_loss = tf.reduce_mean(pg_losses)
            clipfrac = tf.reduce_mean(tf.to_float(kl >= KLRANGE))
        elif cliptype == ClipType.adaptivekl:
            pg_loss = tf.reduce_mean(-ADVS * ratio) + tf.reduce_mean(kl) * KL_COEF
            clipfrac = tf.constant(0.)
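        # For ClipType.adaptivekl the coefficient KL_COEF is not a graph variable; it is adjusted
        # between updates in learn() whenever the measured mean KL leaves the target band.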
        else:
            raise NotImplementedError

        # approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - NEGLOGPACS_OLD))
        approxkl = tf.reduce_mean(kl)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(**kwargs):
            # Each keyword maps to the placeholder stored under the upper-cased attribute name,
            # e.g. train(obs=..., advs=...) feeds self.OBS and self.ADVS.
            feed_dict = dict()
            for key, value in kwargs.items():
                feed_dict.update({getattr(self, key.upper()): value})
            # td_map = {
            #     CLIPRANGE: cliprange,
            #     OBS: obs, ACTIONS: actions, ADVS: advs, RETURNS: returns, LR: lr,
            #     NEGLOGPACS_OLD: neglogpacs, VALUES_OLD: values,
            #     POLICYFLATS_OLD: policyflats
            # }
            # if cliptype == ClipType.kl2clip:
            #     assert cliprange_lower is not None and cliprange_upper is not None
            #     td_map.update({CLIPRANGE_LOWER: cliprange_lower, CLIPRANGE_UPPER: cliprange_upper})
            # elif cliptype == ClipType.adaptivekl:
            #     assert kl_coef is not None
            #     td_map.update({KL_COEF: kl_coef})
            # TODO: recurrent version (train_model.S, rename to STATES):
            # if states is not None:
            #     td_map[train_model.S] = states
            #     td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, kl, ratio, _train],
                feed_dict
            )[:-1]

        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        # def restore_summary_writer(graph_dir: str) -> tf.summary.FileWriter:
        #     return tf.summary.FileWriter.reopen()

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)
            save_env(save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            load_env(load_path)
            # If you want to load weights, also save/load the observation scaling inside VecNormalize.

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        # self.step = act_model.step
        self.step = act_model.step_policyflat
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101


class Runner(AbstractEnvRunner):
    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        self.lam = lam
        self.gamma = gamma

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs, mb_policyflats = [], [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        # mb_obs_next = []
        mb_obs_next = None
        for _ in range(self.nsteps):
            actions, values, self.states, neglogpacs, policyflats = self.model.step(self.obs, self.states, self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_policyflats.append(policyflats)
            mb_dones.append(self.dones)
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            # mb_obs_next.append(self.env._obs_real.copy())
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        # mb_obs_next = np.asarray(mb_obs_next, dtype=np.float32)
        # --- End xiaoming
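        # Each mb_* list holds one entry per environment step, so after np.asarray the arrays have
        # shape (nsteps, nenvs, ...); sf01 below swaps and flattens these two axes into a single
        # batch dimension of nsteps * nenvs before the data reaches the optimizer.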
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_policyflats = np.asarray(mb_policyflats, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        last_values = self.model.value(self.obs, self.states, self.dones)

        # Discount/bootstrap off the value function: generalized advantage estimation (GAE).
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:  # last timestep: bootstrap with the value of the final observation
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, mb_policyflats)),
                mb_states, epinfos)


# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
    """Swap and then flatten axes 0 and 1."""
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])


def constfn(val):
    def f(_):
        return val
    return f


# gradient_rectify = 7
from baselines.ppo2_AdaClip.algs import *
from toolsm import tools


def learn(*, policy, env, env_eval, n_steps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5,
          gamma=0.99, lam=0.95, log_interval=1, nminibatches=4, n_opt_epochs=4, save_interval=10,
          load_path=None, cliptype=None, args=None):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * n_steps
    nbatch_train = nbatch // nminibatches

    set_save_load_env(env)
    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                               nbatch_train=nbatch_train, nsteps=n_steps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm, cliptype=cliptype, args=args)
    if save_interval:
        # Only save make_model, the function that builds a model together with its closure/args.
        import cloudpickle
        with open(osp.join(args.log_dir, 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env=env, model=model, nsteps=n_steps, gamma=gamma, lam=lam)
    if args.n_eval_epsiodes > 0:
        evaluator = Evaluator()
    # tfwriter = tf.summary.FileWriter(args.log_dir)

    if cliptype in [ClipType.kl2clip, ClipType.kl2clip_rollback]:
        # from baselines.ppo2_AdaClip.KL2Clip.KL2Clip_opt_tf import get_clip_new
        if isinstance(env.action_space, gym.spaces.box.Box):
            from baselines.ppo2_AdaClip.KL2Clip_reduce_v3.KL2Clip_reduce import KL2Clip, Adjust_Type
            kl2clip = KL2Clip(opt1Dkind=args.clipargs.kl2clip_opttype)
            args.clipargs.adjusttype = Adjust_Type[args.clipargs.adjusttype]
            if '2constant' in args.clipargs.adaptive_range:  # TODO: alg args
                dim = ac_space.shape[0]
                if args.clipargs.adjusttype == Adjust_Type.origin:
                    delta = args.clipargs.delta
                else:
                    delta = kl2clip.cliprange2delta(args.clipargs.cliprange, ac_space.shape[0],
                                                    args.clipargs.adjusttype)
                cliprange_upper_min = 1 + kl2clip.delta2cliprange(delta, dim=ac_space.shape[0],
                                                                  adjusttype='base_clip_upper')
                cliprange_lower_max = 1 - kl2clip.delta2cliprange(delta, dim=ac_space.shape[0],
                                                                  adjusttype='base_clip_lower')
            if args.clipargs.cliprange is None:
                assert isinstance(env.action_space, gym.spaces.box.Box)
                args.clipargs.cliprange = kl2clip.delta2cliprange(args.clipargs.klrange, dim=ac_space.shape[0],
                                                                  adjusttype=args.clipargs.adjusttype)
                tools.print_(
                    f'The provided cliprange is None. Set cliprange={args.clipargs.cliprange} by '
                    f'klrange={args.clipargs.klrange}, dim={ac_space.shape[0]}',
                    color='magenta')
        elif isinstance(env.action_space, gym.spaces.discrete.Discrete):
            # TODO: On Atari the cliprange keeps decreasing, so delta should decrease as well.
            # raise NotImplementedError('Please review the code.....')
            from baselines.ppo2_AdaClip.KL2Clip_discrete.KL2Clip_discrete import KL2Clip
            kl2clip = KL2Clip(opt1Dkind=args.clipargs.kl2clip_opttype)
        else:
            raise NotImplementedError('Please run atari or mujoco!')
    elif cliptype == ClipType.adaptivekl:
        kl_coef = 1.
        kl_targ = args.clipargs.klrange

    if args.envtype == MUJOCO:
        cliprange = args.clipargs.cliprange
    elif args.envtype == ATARI:
        cliprange = lambda f: f * args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    if isinstance(cliprange, float) or cliprange is None:
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)

    # alphas_kl2clip_decay = np.zeros(nupdates, dtype=np.float32)
    # alphas_kl2clip_decay[0:nupdates // 3] = 1
    # alphas_kl2clip_decay[nupdates // 3:] = np.linspace(1, -0.5, nupdates - nupdates // 3)

    from toolsm.logger import Logger
    logformats = ['csv', 'tensorflow', 'log']
    if not args.is_multiprocess:
        logformats.append('stdout')
    else:
        logger_multiprocess = Logger(['stdout'])
    logger = Logger(logformats, path=args.log_dir, file_basename='process')

    epinfobuf = deque(maxlen=100)
    nupdates = total_timesteps // nbatch
    performance_max = -np.inf
    print(f'nupdates:{nupdates},eval_interval:{args.eval_interval}')

    tstart_ini = time.time()
    for update in range(1, nupdates + 1):
        tstart = time.time()
        assert nbatch % nminibatches == 0
        debugs = dict(iteration=update)
        nbatch_train = nbatch // nminibatches
        frac = (update - 1) * 1. / nupdates
        # frac_remain = 1.0 - (update - 1.0) / nupdates
        frac_remain = 1.0 - frac
        lrnow = lr(frac_remain)

        # ---------------- Sample data
        if args.lam_decay:
            runner.lam = lam - (lam - 0.5) * frac
            print(f'lam decay to {runner.lam}')
        # ----- explore by setting the policy std
        # if nbatch * update <= args.explore_timesteps:
        #     model.train_model.set_logstd(model.train_model.get_logstd() * 0)
        # if args.explore_additive_threshold is not None and update * 1. / nupdates > args.explore_additive_threshold:
        #     logger.log(f'add additive explore: {args.explore_additive_rate}')
        #     model.train_model.set_logstd(model.train_model.get_logstd() - args.explore_additive_rate)
        # with tools.timed('sample'):
        obs, returns, masks, actions, values, neglogpacs, policyflats, states, epinfos = runner.run()  # pylint: disable=E0632

        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        debugs['advs'] = advs

        if isinstance(env.action_space, gym.spaces.Box):
            epinfobuf.clear()
        epinfobuf.extend(epinfos)

        # ----------------- Prepare for training: update clipping range, etc.
        cliprangenow = cliprange(frac_remain)
        # old version, unknown meaning... TODO: figure out the meaning
        # if isinstance(env.action_space, gym.spaces.Box):
        #     cliprangenow = cliprange(frac)
        # elif isinstance(env.action_space, gym.spaces.Discrete):
        #     cliprangenow = (lambda _: cliprange(None) * _)(frac)

        kwargs_in_scalar = dict(lr=lrnow, cliprange=cliprangenow)
        kwargs_in_arr = dict(
            obs=obs, returns=returns, actions=actions, values_old=values,
            neglogpacs_old=neglogpacs, advs=advs, policyflats_old=policyflats
        )
        if cliptype in [ClipType.kl, ClipType.kl_ratiorollback, ClipType.kl_klrollback_constant,
                        ClipType.kl_klrollback, ClipType.kl_strict, ClipType.kl_klrollback_constant_withratio]:
            klrange = args.clipargs.klrange
            if 'decay_threshold' in args.clipargs.keys():
                decay_threshold = args.clipargs.decay_threshold
                if frac >= decay_threshold:
                    coef_ = frac_remain / (1 - decay_threshold)
                    klrange *= coef_
            kwargs_in_scalar.update(klrange=klrange)
        elif cliptype == ClipType.adaptivekl:
            # Feed the current KL penalty coefficient (adjusted after each update, see below);
            # otherwise the KL_COEF placeholder in the adaptivekl loss would never be fed.
            kwargs_in_scalar.update(kl_coef=kl_coef)
        # print(kwargs_in_scalar)

        # ----------------- Train the model
        mblossvals = []
        if states is None:  # nonrecurrent version
            kls = []
            ratios = []
            # totalvariations = []
            inds = np.arange(nbatch)
            for ind_epoch in range(n_opt_epochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    mbinds = inds[start: start + nbatch_train]
                    kwargs_in_batch = dict()
                    for key in kwargs_in_arr.keys():
                        kwargs_in_batch[key] = kwargs_in_arr[key][mbinds]
                    *lossvals, kl, ratio = model.train(**kwargs_in_scalar, **kwargs_in_batch)
                    mblossvals.append(lossvals)
                    if ind_epoch == n_opt_epochs - 1:  # only collect diagnostics in the last opt epoch
                        kls.append(kl)
                        ratios.append(ratio)
                        # totalvariations.append(totalvariation)
            # --- Restore kls and ratios to the original sample order.
            # kls and ratios are lists of per-minibatch arrays.
            # TODO: do not shuffle in the last epoch and run it last; that would make it
            # easier to add more debug variables.
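            # Example: if the last shuffle produced inds = [2, 0, 1], then inds_reverse = [1, 2, 0]
            # and np.concatenate(kls)[inds_reverse] returns the per-sample KLs in rollout order.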
            inds2position = {}
            for position, ind in enumerate(inds):
                inds2position[ind] = position
            inds_reverse = [inds2position[ind] for ind in range(len(inds))]
            kls, ratios = (np.concatenate(arr, axis=0)[inds_reverse] for arr in (kls, ratios))
            debugs['kls'] = kls
            debugs['ratios'] = ratios
            # debugs['totalvariations'] = totalvariations
            # print(kls.mean(), totalvariations.mean())
        else:  # recurrent version
            raise NotImplementedError('Not implemented!')
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * n_steps).reshape(nenvs, n_steps)
            envsperbatch = nbatch_train // n_steps
            # TODO: handle states / masks
            for _ in range(n_opt_epochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        if args.save_debug:
            tools.save_vars(osp.join(args.log_dir, 'debugs.pkl'), debugs, append=update > 1)

        # -------------- Log the result
        lossvals = np.mean(mblossvals, axis=0)
        is_eprewmean_better = False
        eprewmean_eval = None
        eplenmean_eval = None
        if args.n_eval_epsiodes > 0 and \
                ((args.eval_interval > 0 and (update - 1) % args.eval_interval == 0)
                 or (args.eval_interval < 0 and update == nupdates)):
            result_eval = evaluator.eval(env_eval, env, model, args, update)
            eprewmean_eval = safemean([epinfo['r'] for epinfo in result_eval])
            eplenmean_eval = safemean([epinfo['l'] for epinfo in result_eval])
            if eprewmean_eval > performance_max:
                is_eprewmean_better = True
                performance_max = eprewmean_eval

        if save_interval and ((update - 1) % save_interval == 0 or update == nupdates or is_eprewmean_better):
            savepath = osp.join(args.model_dir, '%.5i' % update)
            # print('Saving to', savepath)
            model.save(savepath)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
            eplenmean = safemean([epinfo['l'] for epinfo in epinfobuf])
            timesteps = update * nbatch
            tnow = time.time()
            time_oneupdate = tnow - tstart
            fps = int(nbatch / time_oneupdate)
            if args.is_multiprocess:  # and (update - 1) % (args.eval_interval * 2) == 0
                # TODO: switch to log; print reward_eval
                logger_multiprocess.log_keyvalues(
                    update=update, env=args.env, timesteps=timesteps, time_oneupdate=time_oneupdate,
                    eprewmean=eprewmean, eprewmean_eval=eprewmean_eval,
                    time=tools.time_now_str('%H:%M:%S')
                )
                # tools.print_(f'timesteps:{timesteps},time_oneupdate:{time_oneupdate},'
                #              f'eprewmean:{eprewmean},eprewmean_eval:{eprewmean_eval}', color='magenta')
            logger.log_keyvalue(
                global_step=timesteps,
                nupdates=update,
                fps=fps,
                eprewmean=eprewmean,
                eplenmean=eplenmean,
                eprewmean_eval=eprewmean_eval,
                eplenmean_eval=eplenmean_eval,
                explained_variance=float(ev),
                time_elapsed=tnow - tstart_ini,
                time_oneupdate=time_oneupdate,
                serial_timesteps=update * n_steps,
                total_timesteps=timesteps,
            )
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.log_keyvalue(**{lossname: lossval})
            logger.dump_keyvalues()

        # if args.debug_halfcheetah and args.env_pure == 'HalfCheetah':
        #     if frac > 0.6 and performance_max <= 1500:
        #         print(tools.colorize('HalfCheetah does not achieve the threshold! Force it to stop!'))
        #         break
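        # Adaptive-KL schedule: halve the penalty coefficient when the mean KL of the last epoch
        # falls below kl_targ / 1.5 and double it when it exceeds kl_targ * 1.5 (the same heuristic
        # as the KL-penalty variant in the PPO paper).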
        if cliptype == ClipType.adaptivekl:
            kl_mean = np.mean(kls)
            if kl_mean < kl_targ / 1.5:
                kl_coef /= 2
            elif kl_mean > kl_targ * 1.5:
                kl_coef *= 2

    if args.model_dir is not None:
        import shutil
        for d in args.zip_dirs:
            d_path = args[f'{d}_dir']
            if len(tools.get_files(d_path)) > 2:
                shutil.make_archive(base_name=d_path, format='zip', root_dir=d_path)
                tools.safe_delete(d_path, confirm=False, require_not_containsub=False)

    env.close()
    return model


def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)


class Evaluator():
    def __init__(self):
        pass
        # # ---- Clone env
        # self.env_eval = env_eval
        # # ----- copy pi
        # self.model = model
        # self.assign_policy = model.assign_policy_eval_eq_train
        # self.policy = model.eval_policy

    def eval(self, env_eval, env_train, model, args, update):
        # from copy import deepcopy
        from copy import copy
        # Sync the observation normalization of the evaluation env with the training env.
        if args.envtype == MUJOCO:
            env_eval.ob_rms = copy(env_train.ob_rms)
        else:
            pass
        # assign pi
        # params = model.get_params_train()
        # model.assign_eval_policy(*params)
        eval_policy = model.eval_policy
        # start the evaluation rollouts
        epinfos = rollout(env_eval, eval_policy, evaluate_times=args.n_eval_epsiodes, deterministic=True,
                          verbose=(args.envtype == ATARI))
        # print(f'update:{update}, epinfos:{epinfos}')
        # rewards_epi = [epinfo['r'] for epinfo in epinfos]
        return epinfos


def rollout(env, policy, evaluate_times=1, deterministic=True, verbose=0):
    import itertools
    epinfos = []
    obs = env.reset()
    if verbose:
        tools.warn_('Evaluate Policy...')
    # rewards_episode = []
    for t in itertools.count():
        # tools.print_refresh(t)
        if not deterministic:
            actions, *_ = policy.step(obs)
        else:
            actions, *_ = policy.step_test(obs)
        obs[:], reward, dones, infos = env.step(actions)
        # env.render()
        # cnt_dones += dones.sum()
        # When an episode finishes, info contains a key 'episode',
        # e.g. {'episode': {'r': 118.048395, 'l': 64, 't': 0.67222}}.
        # print(infos, f't={t},done={dones}')
        # if dones[0]:
        #     print('done')
        #     exit()
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        if len(epinfos) >= evaluate_times:
            break
    return epinfos