# Changes

## Dependencies

- New extra "eval"

## API Extensions

- `Experiment` and `ExperimentConfig` now have a `name`, which can, however, be overridden when `Experiment.run()` is called.
- When building an `Experiment` from an `ExperimentConfig`, the user has the option to add info about seeds to the name.
- New method in `ExperimentConfig` called `build_default_seeded_experiments`.
- `SamplingConfig` has an explicit training seed; `test_seed` is inferred.
- New `evaluation` package for repeating the same experiment with multiple seeds and aggregating the results (important extension!). Currently in alpha state.
- Loggers can now restore the logged data into Python by using the new `restore_logged_data` (see the sketch below).

## Breaking Changes

- `AtariEnvFactory` (in examples) now receives explicit train and test seeds.
- `EnvFactoryRegistered` now requires an explicit `test_seed`.
- `BaseLogger.prepare_dict_for_logging` is now abstract.

---------

Co-authored-by: Maximilian Huettenrauch <m.huettenrauch@appliedai.de>
Co-authored-by: Michael Panchenko <m.panchenko@appliedai.de>
Co-authored-by: Michael Panchenko <35432522+MischaPanch@users.noreply.github.com>
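To make the logger change above concrete, here is a minimal sketch of reading logged data back into Python. It assumes that `restore_logged_data` takes the log directory of a single finished run and returns the logged values as a dictionary; the path, the exact signature, and the returned structure are illustrative assumptions, not the definitive API.

```python
from torch.utils.tensorboard import SummaryWriter

from tianshou.utils import TensorboardLogger

# Hypothetical log directory of one finished run (adjust to your own setup).
run_log_dir = "log/Ant-v4/ppo/example_run"

# Instantiate the same logger type that was used during training.
logger = TensorboardLogger(SummaryWriter(run_log_dir))

# New in this release: loggers can restore their logged data into Python.
# Assumed here to return a dict-like structure keyed by scalar tag.
data = logger.restore_logged_data(run_log_dir)
print(sorted(data.keys()))
```

The full multi-seed workflow (building seeded experiments, launching them, and aggregating the results with rliable) is shown in the example script below.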
#!/usr/bin/env python3
"""The high-level multi experiment script demonstrates how to use the high-level API of TianShou to train
a single configuration of an experiment (here a PPO agent on mujoco) with multiple non-intersecting seeds.
Thus, the experiment will be repeated `num_experiments` times.
For each repetition, a policy seed, train env seeds, and test env seeds are set that
are non-intersecting with the seeds of the other experiments.
Each experiment's results are stored in a separate subdirectory.

The final results are aggregated and turned into useful statistics with the rliable API.
The call to `eval_experiments` will load the results from the log directory and
create an interp-quantile mean plot for the returns as well as a performance profile plot.
These plots are saved in the log directory and displayed in the console.
"""

import os
import sys
from collections.abc import Sequence
from typing import Literal

import torch

from examples.mujoco.mujoco_env import MujocoEnvFactory
from tianshou.evaluation.launcher import RegisteredExpLauncher
from tianshou.evaluation.rliable_evaluation_hl import RLiableExperimentResult
from tianshou.highlevel.config import SamplingConfig
from tianshou.highlevel.env import VectorEnvType
from tianshou.highlevel.experiment import (
    ExperimentConfig,
    PPOExperimentBuilder,
)
from tianshou.highlevel.logger import LoggerFactoryDefault
from tianshou.highlevel.params.dist_fn import (
    DistributionFunctionFactoryIndependentGaussians,
)
from tianshou.highlevel.params.lr_scheduler import LRSchedulerFactoryLinear
from tianshou.highlevel.params.policy_params import PPOParams
from tianshou.utils import logging
from tianshou.utils.logging import datetime_tag

log = logging.getLogger(__name__)


def main(
    experiment_config: ExperimentConfig,
    task: str = "Ant-v4",
    num_experiments: int = 5,
    buffer_size: int = 4096,
    hidden_sizes: Sequence[int] = (64, 64),
    lr: float = 3e-4,
    gamma: float = 0.99,
    epoch: int = 3,
    step_per_epoch: int = 30000,
    step_per_collect: int = 2048,
    repeat_per_collect: int = 10,
    batch_size: int = 64,
    training_num: int = 10,
    test_num: int = 10,
    rew_norm: bool = True,
    vf_coef: float = 0.25,
    ent_coef: float = 0.0,
    gae_lambda: float = 0.95,
    bound_action_method: Literal["clip", "tanh"] | None = "clip",
    lr_decay: bool = True,
    max_grad_norm: float = 0.5,
    eps_clip: float = 0.2,
    dual_clip: float | None = None,
    value_clip: bool = False,
    norm_adv: bool = False,
    recompute_adv: bool = True,
    run_experiments_sequentially: bool = True,
) -> str:
    """Use the high-level API of TianShou to evaluate the PPO algorithm on a MuJoCo environment with multiple seeds for
    a given configuration. The results for each run are stored in separate sub-folders. After the agents are trained,
    the results are evaluated using the rliable API.

    :param experiment_config:
    :param task: a mujoco task name
    :param num_experiments: how many experiments to run with different seeds
    :param buffer_size:
    :param hidden_sizes:
    :param lr:
    :param gamma:
    :param epoch:
    :param step_per_epoch:
    :param step_per_collect:
    :param repeat_per_collect:
    :param batch_size:
    :param training_num:
    :param test_num:
    :param rew_norm:
    :param vf_coef:
    :param ent_coef:
    :param gae_lambda:
    :param bound_action_method:
    :param lr_decay:
    :param max_grad_norm:
    :param eps_clip:
    :param dual_clip:
    :param value_clip:
    :param norm_adv:
    :param recompute_adv:
    :param run_experiments_sequentially: if True, the experiments are run sequentially, otherwise in parallel.
        LIMITATIONS: currently, the parallel execution does not seem to work properly on linux.
        It might generally be undesired to run multiple experiments in parallel on the same machine,
        as a single experiment already uses all available CPU cores by default.
    :return: the directory where the results are stored
    """
    persistence_dir = os.path.abspath(os.path.join("log", task, "ppo", datetime_tag()))

    experiment_config.persistence_base_dir = persistence_dir
    log.info(f"Will save all experiment results to {persistence_dir}.")
    experiment_config.watch = False

    sampling_config = SamplingConfig(
        num_epochs=epoch,
        step_per_epoch=step_per_epoch,
        batch_size=batch_size,
        num_train_envs=training_num,
        num_test_envs=test_num,
        num_test_episodes=test_num,
        buffer_size=buffer_size,
        step_per_collect=step_per_collect,
        repeat_per_collect=repeat_per_collect,
    )

    env_factory = MujocoEnvFactory(
        task,
        train_seed=sampling_config.train_seed,
        test_seed=sampling_config.test_seed,
        obs_norm=True,
        venv_type=VectorEnvType.SUBPROC_SHARED_MEM_FORK_CONTEXT
        if sys.platform == "darwin"
        else VectorEnvType.SUBPROC_SHARED_MEM,
    )

    experiments = (
        PPOExperimentBuilder(env_factory, experiment_config, sampling_config)
        .with_ppo_params(
            PPOParams(
                discount_factor=gamma,
                gae_lambda=gae_lambda,
                action_bound_method=bound_action_method,
                reward_normalization=rew_norm,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                value_clip=value_clip,
                advantage_normalization=norm_adv,
                eps_clip=eps_clip,
                dual_clip=dual_clip,
                recompute_advantage=recompute_adv,
                lr=lr,
                lr_scheduler_factory=LRSchedulerFactoryLinear(sampling_config)
                if lr_decay
                else None,
                dist_fn=DistributionFunctionFactoryIndependentGaussians(),
            ),
        )
        .with_actor_factory_default(hidden_sizes, torch.nn.Tanh, continuous_unbounded=True)
        .with_critic_factory_default(hidden_sizes, torch.nn.Tanh)
        .with_logger_factory(LoggerFactoryDefault("tensorboard"))
        .build_default_seeded_experiments(num_experiments)
    )

    if run_experiments_sequentially:
        launcher = RegisteredExpLauncher.sequential.create_launcher()
    else:
        launcher = RegisteredExpLauncher.joblib.create_launcher()
    launcher.launch(experiments)

    return persistence_dir


def eval_experiments(log_dir: str) -> RLiableExperimentResult:
    """Evaluate the experiments in the given log directory using the rliable API."""
    rliable_result = RLiableExperimentResult.load_from_disk(log_dir)
    rliable_result.eval_results(show_plots=True, save_plots=True)
    return rliable_result


if __name__ == "__main__":
    log_dir = logging.run_cli(main, level=logging.INFO)
    assert isinstance(log_dir, str)  # for mypy
    evaluation_result = eval_experiments(log_dir)
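Since `eval_experiments` only needs a log directory, the rliable aggregation can also be run later, decoupled from training. A small usage sketch follows; the module path and the log directory are hypothetical and should be adjusted to where the script and results actually live.

```python
# Hypothetical module path for the example script shown above.
from examples.mujoco.mujoco_ppo_hl_multi import eval_experiments

# Point it at the persistence directory returned by main() in an earlier run.
result = eval_experiments("log/Ant-v4/ppo/2024-03-01_12-00-00")
```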