import os
import os.path as osp
import time
from collections import deque
import joblib
import numpy as np
import tensorflow as tf
import gym
from baselines import logger
from baselines.common import explained_variance
from baselines.common.runners import AbstractEnvRunner
from baselines.common.vec_env.vec_normalize import VecNormalize
from toolsm import tools
save_env = None
load_env = None
def set_save_load_env(env):
global save_env, load_env
save_env = save_env_fn(env)
load_env = load_env_fn(env)
def save_env_fn(env):
def save_env(save_path):
if isinstance(env, VecNormalize):
env.save(save_path + '.env')
return save_env
def load_env_fn(env):
def load_env(load_path):
if isinstance(env, VecNormalize):
env.load(load_path + '.env')
return load_env
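# Note: save_env/load_env persist the VecNormalize running statistics alongside the model
# weights (written as '<save_path>.env'), so a reloaded policy sees identically normalized observations.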
import baselines.common.tf_util as U
class Model(object):
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm, cliptype, args):
sess = tf.get_default_session()
# if args.policy_type == 'MlpPolicyMy':
# additional_keys = ['hidden_sizes','num_sharing_layers','ac_fn', 'seperate_logstd']
# elif args.policy_type == 'MlpPolicy':
# additional_keys = ['seperate_logstd']
# else:
# additional_keys = []
#
# additional_args = {}
# for k in additional_keys:
# additional_args[k] = getattr( args, k)
additional_args = {}
if args.policy_type == 'MlpPolicyExt':
additional_args = dict(args=args)
# Two policy instances with shared parameters (reuse=True): act_model steps the environments
# one step at a time, train_model consumes training minibatches; they differ only in batch size.
act_model = policy(sess, ob_space, ac_space, nbatch_act, nsteps=1, reuse=False, **additional_args)
train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps=nsteps, reuse=True, **additional_args)
if args.n_eval_epsiodes > 0:
self.eval_policy = policy(sess, ob_space, ac_space, args.n_eval_epsiodes, nsteps=1, reuse=True, **additional_args)
# ------- For copy Policy
# self.eval_policy = eval_policy = policy(sess, ob_space, ac_space, args.n_eval_epsiodes, nsteps=1, reuse=False, name='policy_eval', **additional_args)
# params_eval = eval_policy.variables_trainable
# placeholders_params= [ tf.placeholder( dtype=v.dtype, shape=v.shape ) for v in params_eval ]
# self.assign_eval_policy = U.function(inputs=placeholders_params,outputs=[], updates=[tf.assign(v, v_input) for (v, v_input) in zip(params_eval, placeholders_params)])
# self.get_params_train = U.function( inputs=[], outputs=act_model.variables_trainable )
# self.get_params_eval = U.function(inputs=[], outputs=eval_policy.variables_trainable )
# ------ End------
# for p in params:
# print( p.name )
# for p in eval_policy.variables_all:
# print(p.name)
# exit()
# tools.save_vars( '/media/d/e/baselines_workingon/baselines/ppo2_AdaClip/t/a.pkl', params )
# exit()
self.OBS = OBS = train_model.X
self.ACTIONS = ACTIONS = train_model.pdtype.sample_placeholder([None])
self.RETURNS = RETURNS = tf.placeholder(tf.float32, [None])
self.NEGLOGPACS_OLD = NEGLOGPACS_OLD = tf.placeholder(tf.float32, [None])
self.VALUES_OLD = VALUES_OLD = tf.placeholder(tf.float32, [None])
self.LR = LR = tf.placeholder(tf.float32, [])
self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
self.KLRANGE = KLRANGE = tf.placeholder(tf.float32, [])
self.CLIPRANGE_LOWER = CLIPRANGE_LOWER = tf.placeholder(tf.float32, [None])
self.CLIPRANGE_UPPER = CLIPRANGE_UPPER = tf.placeholder(tf.float32, [None])
self.KL_COEF = KL_COEF = tf.placeholder(tf.float32, [])
self.RANGE = RANGE = tf.placeholder(tf.float32, [])
neglogpac = train_model.pd.neglogp(ACTIONS)
entropy = tf.reduce_mean(train_model.pd.entropy())
vpred = train_model.vf
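# Clipped value loss (as in baselines PPO2): the new value prediction is kept within CLIPRANGE
# of the old value estimate, and the larger of the clipped / unclipped squared errors is used,
# which discourages overly large value-function updates.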
vpredclipped = VALUES_OLD + tf.clip_by_value(train_model.vf - VALUES_OLD, - CLIPRANGE, CLIPRANGE)
vf_losses1 = tf.square(vpred - RETURNS)
vf_losses2 = tf.square(vpredclipped - RETURNS)
vf_loss = tf.maximum(vf_losses1, vf_losses2)
vf_loss = .5 * tf.reduce_mean(vf_loss)
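# Importance-sampling ratio r = pi_new(a|s) / pi_old(a|s), computed from the stored
# negative log-probabilities: r = exp(neglogp_old - neglogp_new).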
ratio = tf.exp(NEGLOGPACS_OLD - neglogpac)
pg_loss = None
self.ADVS = ADVS = tf.placeholder(tf.float32, [None])
new_pd = train_model.pd
flat_shape = new_pd.flatparam().shape
self.POLICYFLATS_OLD = POLICYFLATS_OLD = tf.placeholder(tf.float32, shape=flat_shape, name='old_policyflat')
old_pd = train_model.pdtype.pdfromflat(POLICYFLATS_OLD)
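# The behaviour (old) policy distribution is rebuilt from the flat parameters recorded at
# rollout time, so the exact per-sample KL to the behaviour policy (and Wasserstein / total
# variation where the distribution type provides them) can be computed; the KL is used by the
# KL-based clipping objectives below.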
kl = old_pd.kl(new_pd)
if hasattr(old_pd, 'wasserstein'):
wasserstein = old_pd.wasserstein(new_pd)
if hasattr(old_pd, 'totalvariation'):
totalvariation = old_pd.totalvariation(new_pd)
if cliptype == ClipType.ratio:
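# Standard PPO clipped surrogate:
#   L = E[ min( r * A, clip(r, 1 - eps, 1 + eps) * A ) ]
# implemented below as the mean of the element-wise maximum of the two negated terms.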
pg_losses = -ADVS * ratio
pg_losses2 = -ADVS * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.ratio_rollback:
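# Rollback variant: outside [1 - CLIPRANGE, 1 + CLIPRANGE] the clipped (flat) objective is
# replaced by a linear branch with slope `slope_rollback` (typically chosen so that the gradient
# pushes the ratio back toward the clipping region); the offset term keeps the objective
# continuous at the boundary.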
slope = args.clipargs.slope_rollback
pg_targets = tf.where(
ADVS >= 0,
tf.where( ratio <= 1 + CLIPRANGE,
ratio,
slope * ratio + (1 - slope) * (1 + CLIPRANGE) ), # at ratio = 1 + CLIPRANGE this branch also equals 1 + CLIPRANGE, so the objective stays continuous
tf.where( ratio >= 1 - CLIPRANGE,
ratio,
slope * ratio + (1 - slope) * (1 - CLIPRANGE))
) * ADVS
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.ratio_rollback_constant:
slope = args.clipargs.slope_rollback
pg_targets = tf.where(
ADVS >= 0,
tf.where( ratio <= 1 + CLIPRANGE,
ratio*ADVS,
slope * ratio ), # rollback with a constant slope; unlike ratio_rollback, no offset is added to keep the objective continuous at the boundary
tf.where( ratio >= 1 - CLIPRANGE,
ratio*ADVS,
-slope * ratio )
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.ratio_strict:
pg_losses2 = -ADVS * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
pg_loss = tf.reduce_mean(pg_losses2)
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
elif cliptype == ClipType.a2c:
pg_losses = -ADVS * ratio
pg_loss = tf.reduce_mean(pg_losses)
clipfrac = tf.constant(0)
elif cliptype == ClipType.kl:
# version by hugo
# pg_losses = ADV * ratio
# pg_losses = tf.where(kl <= KLRANGE, pg_losses, tf.where(
# tf.logical_or(tf.logical_and(ratio > 1., ADV > 0), tf.logical_and(ratio < 1., ADV < 0)),
# tf.stop_gradient(pg_losses, name='pg_losses_notrain'), pg_losses))
# clipfrac = tf.reduce_mean(tf.to_float(
# tf.logical_or(tf.logical_and(tf.greater(kl, KLRANGE), tf.logical_and(ADV > 0, ratio > 1.)),
# tf.logical_and(tf.greater(kl, KLRANGE), tf.logical_and(ADV < 0., ratio < 1.)))))
# version by siuming
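# Gradient stopping based on KL: when the per-sample KL to the behaviour policy exceeds KLRANGE
# and the update would push the ratio further in the advantage direction
# (ratio * A > A, i.e. r > 1 for A > 0 or r < 1 for A < 0), the gradient of that sample is stopped.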
pg_losses = -ADVS * ratio
pg_losses = tf.where(
tf.logical_and( kl >= KLRANGE, ratio*ADVS > 1*ADVS ),
tf.stop_gradient(pg_losses, name='pg_losses_notrain'),
pg_losses
)
pg_loss = tf.reduce_mean(pg_losses)
clipfrac = tf.reduce_mean(tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_ratiorollback:
# The slope of the objective is switched once the KL exceeds KLRANGE.
slope = args.clipargs.slope_rollback
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
slope * ratio + tf.stop_gradient((1-slope)*ratio), # The bias term is set to maintain continuity
ratio
) * ADVS
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_klrollback_constant_withratio:
# The slope of the objective is switched once the KL exceeds KLRANGE.
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
args.clipargs.slope_likelihood * ratio * ADVS + args.clipargs.slope_rollback * kl,
ratio * ADVS
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_klrollback_constant:
# The slope of the objective is switched once the KL exceeds KLRANGE.
slope = args.clipargs.slope_rollback
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
slope * kl,
ratio * ADVS
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_klrollback:
# The slope of the objective is switched once the KL exceeds KLRANGE.
slope = args.clipargs.slope_rollback
# version by hugo
# pg_losses = tf.where(kl <= KLRANGE, ADV * ratio,
# tf.where(tf.logical_and(ratio > 1., ADV > 0), slope * ratio * ADV,
# tf.where(tf.logical_and(ratio < 1., ADV < 0.), slope * ratio * ADV, ADV * ratio)))
# version by siuming
pg_targets = tf.where(
tf.logical_and( kl >= KLRANGE, ratio * ADVS > 1 * ADVS),
slope * kl * tf.abs(ADVS),
ratio * ADVS
)
pg_loss = -tf.reduce_mean(pg_targets)
clipfrac = tf.reduce_mean(
tf.to_float(tf.logical_and(kl >= KLRANGE, ratio*ADVS>ADVS)))
elif cliptype == ClipType.kl_strict:
pg_losses = -ADVS * ratio
pg_losses = tf.where(
kl >= KLRANGE,
tf.stop_gradient(pg_losses, name='pg_losses_notrain'),
pg_losses
)
pg_loss = tf.reduce_mean(pg_losses)
clipfrac = tf.reduce_mean(tf.to_float( kl >= KLRANGE ))
elif cliptype == ClipType.adaptivekl:
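# Adaptive-KL penalty objective: plain policy-gradient surrogate plus KL_COEF * mean KL.
# KL_COEF is the adaptive coefficient (see the update at the end of each iteration in learn())
# and is expected to be fed to train() via the kl_coef keyword.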
pg_loss = tf.reduce_mean(-ADVS * ratio) + tf.reduce_mean(kl) * KL_COEF
clipfrac = tf.constant(0.)
else:
raise NotImplementedError
# approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
approxkl = tf.reduce_mean(kl)
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
with tf.variable_scope('model'):
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if max_grad_norm is not None:
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads = list(zip(grads, params))
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
_train = trainer.apply_gradients(grads)
def train( **kwargs):
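# Keyword arguments are mapped to placeholders by name (key.upper()): e.g.
# train(lr=..., cliprange=..., obs=..., actions=..., advs=..., returns=...,
#       values_old=..., neglogpacs_old=..., policyflats_old=..., klrange=...)
# feeds self.LR, self.CLIPRANGE, self.OBS, and so on.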
feed_dict = dict()
for key, value in kwargs.items():
feed_dict.update({ getattr(self,key.upper()): value})
# td_map = {
# CLIPRANGE: cliprange,
# OBS: obs, ACTIONS: actions, ADV: advs, RETURNS: returns, LR: lr,
# NEGLOGPACS_OLD: neglogpacs, VALUES_OLD: values,
# POLICYFALTS_OLD: policyflats
# }
#
# if cliptype == ClipType.kl2clip:
# assert cliprange_lower is not None and cliprange_upper is not None
# td_map.update({CLIPRANGE_LOWER: cliprange_lower, CLIPRANGE_UPPER: cliprange_upper})
# elif cliptype == ClipType.adaptivekl:
# assert kl_coef is not None
# td_map.update({KL_COEF: kl_coef})
# TODO: train_model.S .modify to STATES
# recurrent version
# if states is not None:
# td_map[train_model.S] = states
# td_map[train_model.M] = masks
return sess.run(
[pg_loss, vf_loss, entropy, approxkl, clipfrac, kl, ratio, _train],
feed_dict
)[:-1]
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
# def restore_summary_writer(graph_dir: str) -> tf.summary.FileWriter:
# return tf.summary.FileWriter.reopen()
def save(save_path):
ps = sess.run(params)
joblib.dump(ps, save_path)
save_env(save_path)
def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
sess.run(restores)
load_env(load_path)
# If you want to load weights, also save/load observation scaling inside VecNormalize
self.train = train
self.train_model = train_model
self.act_model = act_model
# self.step = act_model.step
self.step = act_model.step_policyflat
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = save
self.load = load
tf.global_variables_initializer().run(session=sess) # pylint: disable=E1101
class Runner(AbstractEnvRunner):
def __init__(self, *, env, model, nsteps, gamma, lam):
super().__init__(env=env, model=model, nsteps=nsteps)
self.lam = lam
self.gamma = gamma
def run(self):
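# Collect nsteps transitions from the vectorized env. Besides the usual PPO quantities,
# the flattened parameters of the behaviour policy (policyflats) are stored so that exact
# KL terms can be computed during training.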
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs, mb_policyflats = [], [], [], [], [], [], []
mb_states = self.states
epinfos = []
# mb_obs_next = []
mb_obs_next= None
for _ in range(self.nsteps):
actions, values, self.states, neglogpacs, policyflats = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
mb_neglogpacs.append(neglogpacs)
mb_policyflats.append(policyflats)
mb_dones.append(self.dones)
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
# mb_obs_next.append( self.env._obs_real.copy() )
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
mb_rewards.append(rewards)
# batch of steps to batch of rollouts
# mb_obs_next = np.asarray(mb_obs_next, dtype=np.float32)
# --- End xiaoming
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
mb_actions = np.asarray(mb_actions)
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_policyflats = np.asarray(mb_policyflats, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
last_values = self.model.value(self.obs, self.states, self.dones)
# discount/bootstrap off value fn
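# Generalized Advantage Estimation (GAE), computed backwards over the rollout:
#   delta_t = r_t + gamma * V(s_{t+1}) * nextnonterminal - V(s_t)
#   A_t     = delta_t + gamma * lam * nextnonterminal * A_{t+1}
# where nextnonterminal masks out bootstrapping across episode boundaries;
# returns = advantages + values are used as the value-function targets.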
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0
for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1: # Last Timestep
nextnonterminal = 1.0 - self.dones
nextvalues = last_values
else:
nextnonterminal = 1.0 - mb_dones[t + 1]
nextvalues = mb_values[t + 1]
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, mb_policyflats)),
mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, policyflats, states, epinfos = runner.run()
def sf01(arr):
"""
Swap axes 0 and 1 and then flatten them: (nsteps, nenvs, ...) -> (nenvs * nsteps, ...).
"""
s = arr.shape
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
def constfn(val):
def f(_):
return val
return f
# gradient_rectify = 7
from baselines.ppo2_AdaClip.algs import *
from toolsm import tools
def learn(*, policy, env, env_eval, n_steps, total_timesteps, ent_coef, lr,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=1, nminibatches=4, n_opt_epochs=4,
save_interval=10, load_path=None, cliptype=None, args=None):
if isinstance(lr, float):
lr = constfn(lr)
else:
assert callable(lr)
total_timesteps = int(total_timesteps)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
nbatch = nenvs * n_steps
nbatch_train = nbatch // nminibatches
set_save_load_env(env)
make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
nbatch_train=nbatch_train,
nsteps=n_steps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, cliptype=cliptype, args=args)
if save_interval:
# only save make_model, the function to make a model with its closure/args
import cloudpickle
with open(osp.join(args.log_dir, 'make_model.pkl'), 'wb') as fh:
fh.write(cloudpickle.dumps(make_model))
model = make_model()
if load_path is not None:
model.load(load_path)
runner = Runner(env=env, model=model, nsteps=n_steps, gamma=gamma, lam=lam)
if args.n_eval_epsiodes > 0:
evaluator = Evaluator()
# tfwriter = tf.summary.FileWriter(args.log_dir)
if cliptype in [ClipType.kl2clip, ClipType.kl2clip_rollback]:
# from baselines.ppo2_AdaClip.KL2Clip.KL2Clip_opt_tf import get_clip_new
if isinstance(env.action_space, gym.spaces.box.Box):
from baselines.ppo2_AdaClip.KL2Clip_reduce_v3.KL2Clip_reduce import KL2Clip, Adjust_Type
kl2clip = KL2Clip(opt1Dkind=args.clipargs.kl2clip_opttype)
args.clipargs.adjusttype = Adjust_Type[args.clipargs.adjusttype]
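# KL2Clip converts between a KL radius (delta / klrange) and an equivalent ratio clipping range
# via delta2cliprange / cliprange2delta; the conversion depends on the action dimension and the
# chosen adjusttype.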
if '2constant' in args.clipargs.adaptive_range: # TODO: alg args
dim = ac_space.shape[0]
if args.clipargs.adjusttype == Adjust_Type.origin:
delta = args.clipargs.delta
else:
delta = kl2clip.cliprange2delta(args.clipargs.cliprange, ac_space.shape[0],
args.clipargs.adjusttype)
cliprange_upper_min = 1 + kl2clip.delta2cliprange(delta, dim=ac_space.shape[0],
adjusttype='base_clip_upper')
cliprange_lower_max = 1 - kl2clip.delta2cliprange(delta, dim=ac_space.shape[0],
adjusttype='base_clip_lower')
if args.clipargs.cliprange is None:
assert isinstance(env.action_space, gym.spaces.box.Box)
args.clipargs.cliprange = kl2clip.delta2cliprange(args.clipargs.klrange, dim=ac_space.shape[0],
adjusttype=args.clipargs.adjusttype)
tools.print_(
f'The provided cliprange is None. Set cliprange={args.clipargs.cliprange} by klrange={args.clipargs.klrange}, dim={ac_space.shape[0]}',
color='magenta')
elif isinstance(env.action_space, gym.spaces.discrete.Discrete):
# TODO: on Atari the cliprange keeps decreasing during training, so delta should also decrease accordingly
# raise NotImplementedError('Please review the code.....')
from baselines.ppo2_AdaClip.KL2Clip_discrete.KL2Clip_discrete import KL2Clip
kl2clip = KL2Clip(opt1Dkind=args.clipargs.kl2clip_opttype)
else:
raise NotImplementedError('Only Box (MuJoCo) and Discrete (Atari) action spaces are supported!')
elif cliptype == ClipType.adaptivekl:
kl_coef = 1.
kl_targ = args.clipargs.klrange
if args.envtype == MUJOCO:
cliprange = args.clipargs.cliprange
elif args.envtype == ATARI:
cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
if isinstance(cliprange, float) or cliprange is None:
cliprange = constfn(cliprange)
else:
assert callable(cliprange)
# alphas_kl2clip_decay = np.zeros(nupdates, dtype=np.float32)
# alphas_kl2clip_decay[0:nupdates // 3] = 1
# alphas_kl2clip_decay[nupdates // 3:] = np.linspace(1, -0.5, nupdates - nupdates // 3)
from toolsm.logger import Logger
logformats = ['csv','tensorflow', 'log']
if not args.is_multiprocess:
logformats.append('stdout')
else:
logger_multiprocess = Logger( ['stdout'])
logger = Logger( logformats , path=args.log_dir, file_basename='process' )
epinfobuf = deque(maxlen=100)
nupdates = total_timesteps // nbatch
performance_max = -np.inf
print(f'nupdates:{nupdates},eval_interval:{args.eval_interval}')
tstart_ini = time.time()
for update in range(1, nupdates + 1):
tstart = time.time()
assert nbatch % nminibatches == 0
debugs = dict( iteration=update )
nbatch_train = nbatch // nminibatches
frac = (update-1) * 1. / nupdates
# frac_remain = 1.0 - (update - 1.0) / nupdates
frac_remain = 1.0 - frac
lrnow = lr(frac_remain)
# ---------------- Sample data
if args.lam_decay:
runner.lam = lam - (lam - 0.5) * frac
print(f'lam decay to {runner.lam}')
# ----- explore by setting policy std
# if nbatch * update <= args.explore_timesteps:
# model.train_model.set_logstd(model.train_model.get_logstd() * 0)
# if args.explore_additive_threshold is not None and update * 1. / nupdates > args.explore_additive_threshold:
# logger.log(f'add additive explore: {args.explore_additive_rate}')
# model.train_model.set_logstd(model.train_model.get_logstd() - args.explore_additive_rate)
# with tools.timed('sample'):
obs, returns, masks, actions, values, neglogpacs, policyflats, states, epinfos = runner.run() # pylint: disable=E0632
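# Advantage estimates are recovered as returns - values and normalized to zero mean and unit
# variance, the usual PPO trick for a better-conditioned policy-gradient step.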
advs = returns - values
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
debugs['advs'] = advs
if isinstance(env.action_space, gym.spaces.Box):
epinfobuf.clear()
epinfobuf.extend(epinfos)
# ----------------- Prepare for training: update clipping range, etc.
cliprangenow = cliprange(frac_remain)
# old version (meaning unclear); TODO: figure out why Box and Discrete action spaces were handled differently
# if isinstance(env.action_space, gym.spaces.Box):
# cliprangenow = cliprange(frac)
# elif isinstance(env.action_space, gym.spaces.Discrete):
# cliprangenow = (lambda _: cliprange(None) * _)(frac)
kwargs_in_scalar = dict(lr=lrnow, cliprange=cliprangenow)
kwargs_in_arr = dict(
obs=obs, returns=returns,
actions=actions, values_old=values, neglogpacs_old=neglogpacs, advs=advs,
policyflats_old=policyflats
)
if cliptype in [ClipType.kl, ClipType.kl_ratiorollback, ClipType.kl_klrollback_constant, ClipType.kl_klrollback, ClipType.kl_strict, ClipType.kl_klrollback_constant_withratio]:
klrange = args.clipargs.klrange
if 'decay_threshold' in args.clipargs.keys():
decay_threshold = args.clipargs.decay_threshold
if frac >= decay_threshold:
coef_ = frac_remain/(1-decay_threshold)
klrange *= coef_
kwargs_in_scalar.update( klrange = klrange )
# print(kwargs_in_scalar)
# ----------------- Train the model
mblossvals = []
if states is None: # nonrecurrent version
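# Minibatch optimization: the flattened rollout of nbatch samples is reshuffled every epoch and
# split into minibatches of nbatch_train samples; each minibatch performs one Adam step.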
kls = []
ratios = []
# totalvariations = []
inds = np.arange(nbatch)
for ind_epoch in range(n_opt_epochs):
np.random.shuffle(inds)
for start in range(0, nbatch, nbatch_train):
mbinds = inds[start: start + nbatch_train]
kwargs_in_batch = dict()
for key in kwargs_in_arr.keys():
kwargs_in_batch[key] = kwargs_in_arr[key][mbinds]
*lossvals, kl, ratio = model.train(**kwargs_in_scalar,**kwargs_in_batch)
mblossvals.append(lossvals)
if ind_epoch == n_opt_epochs - 1:  # only record kl and ratio during the last optimization epoch
kls.append(kl)
ratios.append(ratio)
# totalvariations.append(totalvariation)
# --- restore the order of kls and ratios to the original order
# kls and ratios are list of kl_batch
# TODO: instead of un-shuffling here, run one final unshuffled pass; that would make it easier to add more debug variables!
inds2position = {}
for position, ind in enumerate(inds):
inds2position[ind] = position
inds_reverse = [inds2position[ind] for ind in range(len(inds))]
kls, ratios = (np.concatenate(arr, axis=0)[inds_reverse] for arr in (kls, ratios))
debugs['kls'] = kls
debugs['ratios'] = ratios
# debugs['totalvariations'] = totalvariations
# print(kls.mean(), totalvariations.mean())
else: # recurrent version
raise NotImplementedError('Not implemented!')
assert nenvs % nminibatches == 0
envsperbatch = nenvs // nminibatches
envinds = np.arange(nenvs)
flatinds = np.arange(nenvs * n_steps).reshape(nenvs, n_steps)
envsperbatch = nbatch_train // n_steps
# TODO: states: masks
for _ in range(n_opt_epochs):
np.random.shuffle(envinds)
for start in range(0, nenvs, envsperbatch):
end = start + envsperbatch
mbenvinds = envinds[start:end]
mbflatinds = flatinds[mbenvinds].ravel()
slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
mbstates = states[mbenvinds]
mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
if args.save_debug:
tools.save_vars( osp.join(args.log_dir, 'debugs.pkl'), debugs, append=update > 1 )
# -------------- Log the result
lossvals = np.mean(mblossvals, axis=0)
is_eprewmean_better = False
eprewmean_eval = None
eplenmean_eval = None
if args.n_eval_epsiodes > 0 and \
(
(args.eval_interval >0 and ( (update - 1) % args.eval_interval == 0) )
or
(args.eval_interval<0 and update == nupdates)
):
result_eval = evaluator.eval( env_eval, env, model, args, update )
eprewmean_eval = safemean([epinfo['r'] for epinfo in result_eval])
eplenmean_eval = safemean([epinfo['l'] for epinfo in result_eval])
if eprewmean_eval > performance_max:
is_eprewmean_better = True
performance_max = eprewmean_eval
if save_interval and ( (update -1 ) % save_interval == 0 or update == nupdates or is_eprewmean_better
):
savepath = osp.join(args.model_dir, '%.5i' % update)
# print('Saving to', savepath)
model.save(savepath)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, returns)
eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
eplenmean = safemean([epinfo['l'] for epinfo in epinfobuf])
timesteps = update * nbatch
tnow = time.time()
time_oneupdate = tnow - tstart
fps = int(nbatch / time_oneupdate)
if args.is_multiprocess: #and ( (update-1) % (args.eval_interval*2) == 0)
# TODO: change this to proper logging; print reward_eval
logger_multiprocess.log_keyvalues(
update=update,
env=args.env,
timesteps=timesteps,
time_oneupdate=time_oneupdate,
eprewmean=eprewmean,
eprewmean_eval=eprewmean_eval,
time = tools.time_now_str('%H:%M:%S')
)
# tools.print_( f'timesteps:{timesteps},time_oneupdate:{time_oneupdate},eprewmean:{eprewmean},eprewmean_eval:{eprewmean_eval}', color='magenta' )
logger.log_keyvalue(
global_step = timesteps,
nupdates = update,
fps = fps,
eprewmean = eprewmean,
eplenmean = eplenmean,
eprewmean_eval = eprewmean_eval,
eplenmean_eval=eplenmean_eval,
explained_variance = float(ev),
time_elapsed = tnow - tstart_ini,
time_oneupdate = time_oneupdate,
serial_timesteps =update * n_steps,
total_timesteps = timesteps,
)
for (lossval, lossname) in zip(lossvals, model.loss_names):
logger.log_keyvalue( **{lossname:lossval} )
logger.dump_keyvalues()
# if args.debug_halfcheetah and args.env_pure == 'HalfCheetah':
# if frac > 0.6 and performance_max <= 1500:
# print( tools.colorize('HalfCheetah did not achieve the threshold! Forcing it to stop!') )
# break
if cliptype == ClipType.adaptivekl:
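# Adaptive-KL coefficient update (heuristic from the PPO paper): halve the penalty coefficient
# when the mean KL is well below the target (< kl_targ / 1.5) and double it when it is well
# above (> kl_targ * 1.5).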
kl_mean = np.mean(kls)
if kl_mean < kl_targ / 1.5:
kl_coef /= 2
elif kl_mean > kl_targ * 1.5:
kl_coef *= 2
if args.model_dir is not None:
import shutil
for d in args.zip_dirs:
d_path = args[f'{d}_dir']
if len(tools.get_files( d_path ))> 2:
shutil.make_archive( base_name=d_path, format='zip', root_dir=d_path )
tools.safe_delete( d_path , confirm=False, require_not_containsub=False )
env.close()
return model
def safemean(xs):
return np.nan if len(xs) == 0 else np.mean(xs)
class Evaluator():
def __init__(self):
pass
# # ---- Clone env
# self.env_eval = env_eval
# # ----- copy pi
# self.model = model
# self.assign_policy = model.assign_policy_eval_eq_train
# self.policy = model.eval_policy
def eval(self, env_eval, env_train, model, args, update):
# from copy import deepcopy
from copy import copy
# Sync the evaluation env with the training env: for MuJoCo, copy the observation-normalization statistics (ob_rms) so evaluation uses the same normalization
if args.envtype == MUJOCO:
env_eval.ob_rms = copy(env_train.ob_rms)
else:
pass
# assign pi
# params = model.get_params_train()
# model.assign_eval_policy( *params )
eval_policy = model.eval_policy
# start process
epinfos = rollout( env_eval, eval_policy, evaluate_times=args.n_eval_epsiodes, deterministic=True, verbose=(args.envtype == ATARI) )
# print(f'update:{update}, epinfos:{epinfos}')
# rewards_epi = [ epinfo['r'] for epinfo in epinfos ]
return epinfos
def rollout(env, policy, evaluate_times=1, deterministic=True, verbose=0):
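# Roll out the (vectorized) policy until `evaluate_times` episodes have finished.
# deterministic=True uses policy.step_test (deterministic actions), otherwise the stochastic
# policy.step. Returns the per-episode info dicts, e.g. {'r': episode_return, 'l': episode_length}.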
import itertools
epinfos = []
obs = env.reset()
if verbose:
tools.warn_(f'Evaluate Policy...')
# rewards_episode = []
for t in itertools.count():
# tools.print_refresh(t)
if not deterministic:
actions, *_ = policy.step(obs)
else:
actions, *_ = policy.step_test(obs)
obs[:], reward, dones, infos = env.step(actions)
# env.render()
# cnt_dones += dones.sum()
# When an episode finishes, its info dict contains an 'episode' key, e.g.:
# {'episode': {'r': 118.048395, 'l': 64, 't': 0.67222}}
# print(infos, f't={t},done={dones}')
# if dones[0]:
# print('done')
# exit()
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo:
epinfos.append(maybeepinfo)
if len(epinfos) >= evaluate_times:
break
return epinfos