# agent
agent_name: atari_agent

# env setting
env:
  env: Atari
  game: Asterix
  base_seed: 0
  n_skip: 4
  n_stack: 4
  max_episode_steps: 3000  # 27000 for final test
  gray_scale: False
  image_based: True
  clip_reward: True
  obs_shape: [3, 96, 96]
  episodic: True

# RL targets
rl:
  discount: 0.997
  unroll_steps: 5  # prev 5
  td_steps: 5
  auto_td_steps: 30000  # prev 30000
  td_lambda: 0.95

# optimizer
optimizer:
  type: SGD
  lr: 0.2  # prev 0.2
  lr_decay_type: none
  lr_warm_up: 0.01
  lr_decay_rate: 0.1
  lr_decay_steps: 100000
  weight_decay: 1e-4
  momentum: 0.9

# priority of data
priority:
  use_priority: True
  priority_prob_alpha: 1.0  # prev 0.6
  priority_prob_beta: 1.0  # prev 0.4
  min_prior: 0.000001

# training
train:
  load_model_path: ''
  batch_size: 256
  training_steps: 100000  # 100 * 1000
  offline_training_steps: 20000  # 20 * 1000
  start_transitions: 2000  # 2 * 1000
  eval_n_episode: 10
  eval_interval: 10000
  self_play_update_interval: 100
  reanalyze_update_interval: 200
  save_ckpt_interval: 10000
  mini_batch_size: 256
  reanalyze_ratio: 1.0
  reward_loss_coeff: 1.0
  value_loss_coeff: 0.5  # prev 0.25, 1.5 for test model-free value fitting
  policy_loss_coeff: 1.0  # prev 1.0
  consistency_coeff: 5.0  # prev 2.0
  decorrelation_coeff: 0.01
  off_diag_coeff: 5e-3
  entropy_coeff: 5e-3
  max_grad_norm: 5
  change_temperature: True
  periodic_reset: False
  value_reanalyze: False
  path_consistency: False
  use_decorrelation: False
  value_policy_detach: False
  optimal_Q: False
  v_num: 1
  value_target: 'mixed'  # sarsa or search or mixed or max
  use_IQL: False
  IQL_weight: 0.7
  start_use_mix_training_steps: 3e4
  mixed_value_threshold: 5e3

# self-play data collection
data:
  num_envs: 4
  buffer_size: 1000000  # 1 * 1000 * 1000
  total_transitions: 100000  # 100 * 1000
  top_transitions: 2e5
  trajectory_size: 400
  save_video: False
  save_as_dataset: False

# MCTS
mcts:
  language: cython
  num_simulations: 16  # prev 8, ori_mcts 50
  num_top_actions: 4  # prev 4
  c_visit: 50
  c_scale: 0.1  # prev 0.1
  c_base: 19652
  c_init: 1.25
  dirichlet_alpha: 0.3
  explore_frac: 0.25
  value_minmax_delta: 0.01
  vis: ['print']
  mpc_horizon: 1
  use_gumbel: True

# model architecture
model:
  noisy_net: False
  action_embedding: True
  action_embedding_dim: 16
  down_sample: True
  state_norm: False
  value_prefix: True
  value_target: bootstrapped  # bootstrapped or GAE
  GAE_max_steps: 15  # 10 or 15 or 20
  init_zero: True  # prev True
  num_blocks: 1  # prev 1
  num_channels: 64
  reduced_channels: 16
  projection_layers: [1024, 1024]  # hidden dim, output dim
  prjection_head_layers: [256, 1024]  # hidden dim, output dim
  fc_layers: [32]  # prev [32]
  lstm_hidden_size: 512
  lstm_horizon_len: 5
  policy_loss_type: reanalyze  # policy_gradient or reanalyze
  reward_support:
    range: [-300, 300]
    scale: 1
    env: Atari
    bins: 51
    type: support  # support or symlog
  value_support:
    range: [-300, 300]
    scale: 1
    env: Atari
    bins: 51
    type: support  # support or symlog

# worker process allocation
actors:
  data_worker: 1
  batch_worker: 8  # oriMCTS 16

# wandb
wandb:
  project: 'ez-v2-release'
  tag: 'Atari'
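
# ---------------------------------------------------------------------------
# Loading sketch (assumption, kept as a comment so this file stays valid
# YAML): a config of this shape can be read with PyYAML; the file name
# "atari_agent.yaml" and the key accesses below are illustrative and are not
# part of any official training pipeline.
#
#   import yaml
#
#   with open("atari_agent.yaml") as f:      # hypothetical file name
#       cfg = yaml.safe_load(f)
#
#   print(cfg["env"]["game"])          # Asterix
#   print(cfg["rl"]["discount"])       # 0.997
#   print(cfg["train"]["batch_size"])  # 256
# ---------------------------------------------------------------------------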