# EfficientZeroV2/ez/config/exp/dmc_state.yaml
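# Experiment config for EfficientZeroV2 with state-based (proprioceptive)
# observations on the DeepMind Control Suite; the defaults below target the
# hopper_hop task.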
# agent
agent_name: dmc_state_agent
# env setting
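# Proprioceptive DMC setup: `obs_shape` is the flat observation dimension,
# `n_skip` the action repeat (each agent action is applied for 2 env steps),
# and `n_stack` the number of stacked observations. Rewards are left
# unclipped, matching DMC's bounded per-step rewards.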
env:
  env: DMC
  game: hopper_hop
  base_seed: 0
  n_skip: 2
  n_stack: 1
  max_episode_steps: 1000
  gray_scale: False
  image_based: False
  clip_reward: False
  obs_shape: 10
  episodic: False
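# Core return targets: `discount` is the discount factor, `unroll_steps` the
# model unroll length during training, `td_steps` the n-step return horizon,
# and `td_lambda` the lambda for TD(lambda)-style value targets.
# `auto_td_steps` appears to schedule EfficientZero's off-policy correction,
# which shortens the TD horizon for stale replay data.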
rl:
  discount: 0.997 # prev 0.99
  unroll_steps: 5 # prev 5
  td_steps: 5 # prev 5
  auto_td_steps: 60000 # prev 30000 for 100k env steps
  td_lambda: 0.95
# optimizer
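# Adam with an effectively constant learning rate: `lr_decay_type: none`
# leaves `lr_decay_rate` / `lr_decay_steps` inert. `lr_warm_up` is presumably
# the warm-up fraction of training steps, and `momentum` is presumably only
# read if the optimizer type is switched to SGD.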
optimizer:
  type: Adam
  lr: 3e-4
  lr_warm_up: 0.01
  lr_decay_type: none
  lr_decay_rate: 0.1
  lr_decay_steps: 300000
  weight_decay: 2e-5
  momentum: 0.9
# priority of data
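# Prioritized experience replay (Schaul et al., 2016): `priority_prob_alpha`
# is the prioritization exponent and `priority_prob_beta` the importance-
# sampling correction exponent; 1.0 / 1.0 means full prioritization with full
# bias correction. `min_prior` keeps every transition's priority above zero.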
priority:
  use_priority: True
  priority_prob_alpha: 1.0 # prev 0.6
  priority_prob_beta: 1.0 # prev 0.4
  min_prior: 0.000001
# training
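# MuZero-style joint training: reward, value, and policy losses are weighted
# by the *_coeff values, plus EfficientZero's self-supervised consistency
# loss (`consistency_coeff`). `reanalyze_ratio: 1.0` recomputes targets for
# all sampled data with the latest model. `value_target: 'mixed'` selects
# EfficientZeroV2's mixed value target; `start_use_mix_training_steps` and
# `mixed_value_threshold` appear to gate when the mixing kicks in.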
train:
  load_model_path: ''
  batch_size: 256
  training_steps: 100000 # 100 * 1000
  offline_training_steps: 20000 # 20 * 1000
  start_transitions: 2000 # 2 * 1000
  eval_n_episode: 10 # prev 10
  eval_interval: 5000
  self_play_update_interval: 100 # prev 100
  reanalyze_update_interval: 200 # prev 200
  save_ckpt_interval: 10000
  mini_batch_size: 256
  reanalyze_ratio: 1.0
  reward_loss_coeff: 1.0
  value_loss_coeff: 0.5 # prev 0.25
  policy_loss_coeff: 1.0
  consistency_coeff: 2.0
  decorrelation_coeff: 0.01
  off_diag_coeff: 5e-3
  entropy_coeff: 5e-2 # prev 5e-3
  max_grad_norm: 5
  change_temperature: True
  periodic_reset: False
  value_reanalyze: False
  path_consistency: False
  use_decorrelation: False
  value_policy_detach: False
  optimal_Q: False
  v_num: 1
  value_target: 'mixed' # sarsa or search or mixed or max
  use_IQL: False
  IQL_weight: 0.5
  start_use_mix_training_steps: 4e4 # prev 4e4
  mixed_value_threshold: 2e4 # prev 2e4
# self-play data collection
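# Self-play collection: `num_envs` parallel environments write trajectories
# of up to `trajectory_size` transitions into a replay buffer capped at
# `buffer_size`; `total_transitions` is the environment-interaction budget
# for the whole run.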
data:
  num_envs: 4
  buffer_size: 100000 # 100 * 1000
  total_transitions: 100000 # 100k
  top_transitions: 2e5
  trajectory_size: 100 # prev 500
  save_video: False
  save_as_dataset: False
# MCTS
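# Search settings in the style of Gumbel MuZero / Sampled MuZero: each move
# runs `num_simulations` simulations, with sequential halving over the
# `num_top_actions` best of `num_sampled_actions` actions sampled from the
# policy. `c_visit` and `c_scale` parameterize Gumbel MuZero's value
# transform sigma(q) = (c_visit + max_b N(b)) * c_scale * q.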
mcts:
  language: cython
  num_simulations: 32 # prev 8
  num_top_actions: 16 # prev 4
  num_sampled_actions: 16 # same as Sampled MuZero
  c_visit: 50
  c_scale: 0.1 # prev 0.1
  value_minmax_delta: 0.01
  mpc_horizon: 1
  vis: ['print']
  use_mppi: False
  std_magnification: 3
# model architecture
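# For state inputs the representation and dynamics nets are MLP-based, sized
# by `rep_net_shape` / `dyn_shape`; image-specific flags such as
# `down_sample` are presumably ignored. `value_prefix: False` predicts
# per-step rewards instead of EfficientZero's LSTM value prefix, so the
# lstm_* settings are presumably unused here. The proj_*/pred_* shapes define
# the SimSiam-style projector/predictor for the consistency loss, and
# reward_support / value_support define categorical prediction heads with
# `bins` atoms over the given range.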
model:
  noisy_net: False
  action_embedding: True
  block_type: resnet # resnet, convnext
  down_sample: True
  state_norm: False
  value_prefix: False
  value_target: bootstrapped # bootstrapped or GAE
  GAE_max_steps: 15 # 10 or 15 or 20
  dynamic_type: None # RNN or Transformer or None
  init_zero: True
  use_bn: True
  use_p_norm: False
  num_blocks: 2 # prev 3
  hidden_shape: 128 # prev 128
  rep_net_shape: 256
  dyn_shape: 256
  act_embed_shape: 64
  rew_net_shape: [ 256, 256 ]
  val_net_shape: [ 256, 256 ]
  pi_net_shape: [ 256, 256 ]
  proj_hid_shape: 512
  pred_hid_shape: 512
  proj_shape: 128
  pred_shape: 128
  fc_layers: [32]
  lstm_hidden_size: 512
  lstm_horizon_len: 5
  value_ensumble: 1
  policy_distribution: squashed_gaussian # beta or squashed_gaussian or truncated_gaussian
  policy_loss_type: reanalyze # policy_gradient or reanalyze
  policy_action_num: 4
  random_action_num: 12
  random_type: std # std, normal, pink, OU
  reward_support:
    range: [ -2, 2 ]
    scale: 0.01
    env: DMC
    bins: 51
    type: support # support or symlog
  value_support:
    range: [ -299, 299 ]
    scale: 0.5
    env: DMC
    bins: 51
    type: support # support or symlog
# worker process allocation
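# Process split: `data_worker` processes presumably run self-play collection
# while `batch_worker` processes prepare (reanalyze) training batches.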
actors:
  data_worker: 1
  batch_worker: 10
# wandb
wandb:
  project: 'ez-v2-release'
  tag: 'DMC-state'