# agent
agent_name: atari_agent
# env setting
env:
  env: Atari
  game: Asterix
  base_seed: 0
  n_skip: 4
  n_stack: 4
  max_episode_steps: 3000 # 27000 for final test
  gray_scale: False
  image_based: True
  clip_reward: True
  obs_shape: [3, 96, 96]
  episodic: True
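  # With n_skip 4, each agent step spans 4 raw frames, so max_episode_steps
  # 3000 caps an episode at ~12000 frames; the 27000-step final-test setting
  # corresponds to the usual 30-minute Atari evaluation limit
  # (27000 steps * 4 frames = 108000 frames at 60 FPS).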
rl:
  discount: 0.997
  unroll_steps: 5 # prev 5
  td_steps: 5
  auto_td_steps: 30000 # prev 30000
  td_lambda: 0.95
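  # The n-step target presumably follows the usual MuZero form,
  # z_t = sum_{i=0}^{n-1} gamma^i * r_{t+i} + gamma^n * v_{t+n} with
  # n = td_steps; auto_td_steps likely sets the horizon over which n is
  # shrunk for stale replay data (EfficientZero's off-policy correction).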
# optimizer
optimizer:
  type: SGD
  lr: 0.2 # prev 0.2
  lr_decay_type: none
  lr_warm_up: 0.01
  lr_decay_rate: 0.1
  lr_decay_steps: 100000
  weight_decay: 1e-4
  momentum: 0.9
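  # With lr_decay_type 'none', lr_decay_rate/lr_decay_steps are presumably
  # inert; lr_warm_up 0.01 most likely means the learning rate ramps up
  # linearly over the first 1% of training_steps before holding at 0.2.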
# priority of data
priority:
  use_priority: True
  priority_prob_alpha: 1.0 # prev 0.6
  priority_prob_beta: 1.0 # prev 0.4
  min_prior: 0.000001
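  # Assuming standard prioritized experience replay: sampling probability
  # P(i) ~ p_i^alpha and importance weight w_i = (N * P(i))^(-beta). With
  # alpha = beta = 1.0, sampling is fully proportional to TD error and fully
  # bias-corrected; min_prior keeps every transition sampleable.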
# training
train:
  load_model_path: ''
  batch_size: 256
  training_steps: 100000 # 100 * 1000
  offline_training_steps: 20000 # 20 * 1000
  start_transitions: 2000 # 2 * 1000
  eval_n_episode: 10
  eval_interval: 10000
  self_play_update_interval: 100
  reanalyze_update_interval: 200
  save_ckpt_interval: 10000
  mini_batch_size: 256
  reanalyze_ratio: 1.0
  reward_loss_coeff: 1.0
  value_loss_coeff: 0.5 # prev 0.25, 1.5 for test model-free value fitting
  policy_loss_coeff: 1.0 # prev 1.0
  consistency_coeff: 5.0 # prev 2.0
  decorrelation_coeff: 0.01
  off_diag_coeff: 5e-3
  entropy_coeff: 5e-3
  max_grad_norm: 5
  change_temperature: True
  periodic_reset: False
  value_reanalyze: False
  path_consistency: False
  use_decorrelation: False
  value_policy_detach: False
  optimal_Q: False
  v_num: 1
  value_target: 'mixed' # sarsa or search or mixed or max
  use_IQL: False
  IQL_weight: 0.7
  start_use_mix_training_steps: 3e4
  mixed_value_threshold: 5e3
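  # The training loss is presumably the usual EfficientZero-style weighted
  # sum over the unrolled steps:
  #   L = reward_loss_coeff * L_reward + value_loss_coeff * L_value
  #     + policy_loss_coeff * L_policy + consistency_coeff * L_consistency
  #     + entropy_coeff * L_entropy (+ decorrelation terms when enabled),
  # with gradients clipped to max_grad_norm.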
# self-play data collection
data:
  num_envs: 4
  buffer_size: 1000000 # 1 * 1000 * 1000
  total_transitions: 100000 # 100 * 1000
  top_transitions: 2e5
  trajectory_size: 400
  save_video: False
  save_as_dataset: False
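  # total_transitions 100000 matches the Atari 100k benchmark budget, and
  # buffer_size comfortably exceeds it, so no transition should ever be
  # evicted from replay during online training.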
# MCTS
mcts:
  language: cython
  num_simulations: 16 # prev 8, ori_mcts 50
  num_top_actions: 4 # prev 4
  c_visit: 50
  c_scale: 0.1 # prev 0.1
  c_base: 19652
  c_init: 1.25
  dirichlet_alpha: 0.3
  explore_frac: 0.25
  value_minmax_delta: 0.01
  vis: ['print']
  mpc_horizon: 1
  use_gumbel: True
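  # With use_gumbel True, search presumably follows Gumbel MuZero: sample
  # num_top_actions (4) root actions without replacement and narrow them via
  # sequential halving across num_simulations (16), scoring completed
  # Q-values with sigma(q) = (c_visit + max_visit_count) * c_scale * q.
  # The c_base/c_init pUCT constants and Dirichlet root noise would apply
  # only to the non-Gumbel search path.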
# model architecture
model:
  noisy_net: False
  action_embedding: True
  action_embedding_dim: 16
  down_sample: True
  state_norm: False
  value_prefix: True
  value_target: bootstrapped # bootstrapped or GAE
  GAE_max_steps: 15 # 10 or 15 or 20
  init_zero: True # prev True
  num_blocks: 1 # prev 1
  num_channels: 64
  reduced_channels: 16
  projection_layers: [1024, 1024] # hidden dim, output dim
  prjection_head_layers: [256, 1024] # hidden dim, output dim
  fc_layers: [32] # prev [32]
  lstm_hidden_size: 512
  lstm_horizon_len: 5
  policy_loss_type: reanalyze # policy_gradient or reanalyze
  reward_support:
    range: [-300, 300]
    scale: 1
    env: Atari
    bins: 51
    type: support # support or symlog
  value_support:
    range: [-300, 300]
    scale: 1
    env: Atari
    bins: 51
    type: support # support or symlog
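  # Scalar reward/value targets are presumably squashed with the MuZero
  # invertible transform h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x and
  # projected as a two-hot categorical target onto 51 bins over [-300, 300];
  # the 'symlog' alternative would use sign(x) * ln(1 + |x|) instead.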
# worker process allocation
actors:
  data_worker: 1
  batch_worker: 8 # oriMCTS 16
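  # Presumably the single data_worker handles environment interaction and
  # buffer writes, while the 8 batch_worker processes run reanalyze MCTS to
  # refresh policy/value targets (reanalyze_ratio 1.0 reanalyzes every batch).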
# wandb
wandb:
  project: 'ez-v2-release'
  tag: 'Atari'