From 6c1bd85521a304a02bbd6bc212897fb59d845081 Mon Sep 17 00:00:00 2001
From: Maximilian Huettenrauch
Date: Tue, 12 Mar 2024 11:44:48 +0100
Subject: [PATCH] add mujoco example with multiple runs and performance plots

---
 examples/mujoco/mujoco_ppo_hl_multi.py | 114 +++++++++++++++++++++++++
 examples/mujoco/tools.py               |  80 +++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 examples/mujoco/mujoco_ppo_hl_multi.py

diff --git a/examples/mujoco/mujoco_ppo_hl_multi.py b/examples/mujoco/mujoco_ppo_hl_multi.py
new file mode 100644
index 0000000..12185a5
--- /dev/null
+++ b/examples/mujoco/mujoco_ppo_hl_multi.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+import os
+from collections.abc import Sequence
+from functools import partial
+from typing import Literal
+
+import torch
+
+from examples.mujoco.mujoco_env import MujocoEnvFactory
+from examples.mujoco.tools import eval_results, RLiableExperimentResult
+from tianshou.highlevel.config import SamplingConfig
+from tianshou.highlevel.experiment import (
+    ExperimentConfig,
+    PPOExperimentBuilder,
+)
+from tianshou.highlevel.params.dist_fn import (
+    DistributionFunctionFactoryIndependentGaussians,
+)
+from tianshou.highlevel.params.lr_scheduler import LRSchedulerFactoryLinear
+from tianshou.highlevel.params.policy_params import PPOParams
+from tianshou.utils import logging
+from tianshou.utils.logging import datetime_tag
+
+
+def main(
+    experiment_config: ExperimentConfig,
+    task: str = "Ant-v4",
+    num_experiments: int = 5,
+    buffer_size: int = 4096,
+    hidden_sizes: Sequence[int] = (64, 64),
+    lr: float = 3e-4,
+    gamma: float = 0.99,
+    epoch: int = 100,
+    step_per_epoch: int = 30000,
+    step_per_collect: int = 2048,
+    repeat_per_collect: int = 10,
+    batch_size: int = 64,
+    training_num: int = 10,
+    test_num: int = 10,
+    rew_norm: bool = True,
+    vf_coef: float = 0.25,
+    ent_coef: float = 0.0,
+    gae_lambda: float = 0.95,
+    bound_action_method: Literal["clip", "tanh"] | None = "clip",
+    lr_decay: bool = True,
+    max_grad_norm: float = 0.5,
+    eps_clip: float = 0.2,
+    dual_clip: float | None = None,
+    value_clip: bool = False,
+    norm_adv: bool = False,
+    recompute_adv: bool = True,
+) -> str:
+    log_name = os.path.join("log", task, "ppo", datetime_tag())
+    experiment_config.persistence_base_dir = log_name
+
+    sampling_config = SamplingConfig(
+        num_epochs=epoch,
+        step_per_epoch=step_per_epoch,
+        batch_size=batch_size,
+        num_train_envs=training_num,
+        num_test_envs=test_num,
+        buffer_size=buffer_size,
+        step_per_collect=step_per_collect,
+        repeat_per_collect=repeat_per_collect,
+    )
+
+    env_factory = MujocoEnvFactory(task, train_seed=sampling_config.train_seed, test_seed=sampling_config.test_seed, obs_norm=True)
+
+    experiments = (
+        PPOExperimentBuilder(env_factory, experiment_config, sampling_config)
+        .with_ppo_params(
+            PPOParams(
+                discount_factor=gamma,
+                gae_lambda=gae_lambda,
+                action_bound_method=bound_action_method,
+                reward_normalization=rew_norm,
+                ent_coef=ent_coef,
+                vf_coef=vf_coef,
+                max_grad_norm=max_grad_norm,
+                value_clip=value_clip,
+                advantage_normalization=norm_adv,
+                eps_clip=eps_clip,
+                dual_clip=dual_clip,
+                recompute_advantage=recompute_adv,
+                lr=lr,
+                lr_scheduler_factory=LRSchedulerFactoryLinear(sampling_config)
+                if lr_decay
+                else None,
+                dist_fn=DistributionFunctionFactoryIndependentGaussians(),
+            ),
+        )
+        .with_actor_factory_default(hidden_sizes, torch.nn.Tanh, continuous_unbounded=True)
+        .with_critic_factory_default(hidden_sizes, torch.nn.Tanh)
+        .build_default_seeded_experiments(num_experiments)
+    )
+
+    for experiment_name, experiment in experiments.items():
+        experiment.run(experiment_name)
+
+    return log_name
+
+
+def eval_experiments(log_dir: str) -> None:
+    results = RLiableExperimentResult.load_from_disk(log_dir, 'PPO', None)
+    eval_results(results)
+
+
+if __name__ == "__main__":
+    # logging.run_cli(main)
+    experiment_config = ExperimentConfig(watch=False)
+    log_dir = logging.run_main(partial(main, experiment_config, epoch=2))
+    # log_dir =
+    eval_experiments(log_dir)
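
Usage sketch (illustrative, not part of the patch): the __main__ block above runs a
short two-epoch smoke test and then evaluates it. A full run would look roughly like
the following, where the task name and argument values are placeholder choices and
every keyword argument comes from the main() signature above.

    from functools import partial

    from examples.mujoco.mujoco_ppo_hl_multi import eval_experiments, main
    from tianshou.highlevel.experiment import ExperimentConfig
    from tianshou.utils import logging

    # Run five seeded PPO experiments; main() returns the shared log directory
    # that contains one subdirectory per seeded run.
    config = ExperimentConfig(watch=False)
    log_dir = logging.run_main(
        partial(main, config, task="HalfCheetah-v4", num_experiments=5, epoch=100),
    )

    # Aggregate the seeded runs with rliable and write the plots into log_dir.
    eval_experiments(log_dir)
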
diff --git a/examples/mujoco/tools.py b/examples/mujoco/tools.py
index be289e3..60d1154 100755
--- a/examples/mujoco/tools.py
+++ b/examples/mujoco/tools.py
@@ -5,11 +5,91 @@ import csv
 import os
 import re
 from collections import defaultdict
+from dataclasses import dataclass, asdict
 
 import numpy as np
 import tqdm
 from tensorboard.backend.event_processing import event_accumulator
 
+from tianshou.highlevel.experiment import Experiment
+
+
+@dataclass
+class RLiableExperimentResult:
+    exp_dir: str
+    algorithms: list[str]
+    score_dict: dict[str, np.ndarray]  # (n_runs x n_epochs + 1)
+    env_steps: np.ndarray  # (n_epochs + 1)
+    score_thresholds: np.ndarray
+
+    @staticmethod
+    def load_from_disk(exp_dir: str, algo_name: str, score_thresholds: np.ndarray | None):
+        """Load the experiment result from disk.
+
+        :param exp_dir: The directory from which the experiment results are restored.
+        :param algo_name: The name of the algorithm shown in the figure legend.
+        :param score_thresholds: The thresholds used to create the performance profile.
+            If None, they are derived from the test episode returns.
+        """
+        test_episode_returns = []
+
+        for entry in os.scandir(exp_dir):
+            if entry.name.startswith('.'):
+                continue
+
+            exp = Experiment.from_directory(entry.path)
+            logger = exp.logger_factory.create_logger(entry.path, entry.name, None, asdict(exp.config))
+            data = logger.restore_logged_data(entry.path)
+
+            test_data = data['test']
+
+            test_episode_returns.append(test_data['returns_stat']['mean'])
+            env_step = test_data['env_step']
+
+        if score_thresholds is None:
+            score_thresholds = np.linspace(0.0, np.max(test_episode_returns), 101)
+
+        return RLiableExperimentResult(algorithms=[algo_name],
+                                       score_dict={algo_name: np.array(test_episode_returns)},
+                                       env_steps=np.array(env_step),
+                                       score_thresholds=score_thresholds,
+                                       exp_dir=exp_dir)
+
+
+def eval_results(results: RLiableExperimentResult) -> None:
+    import matplotlib.pyplot as plt
+    import scipy.stats as sst
+    import seaborn as sns
+    from rliable import library as rly
+    from rliable import plot_utils
+
+    iqm = lambda scores: sst.trim_mean(scores, proportiontocut=0.25, axis=0)
+    iqm_scores, iqm_cis = rly.get_interval_estimates(
+        results.score_dict, iqm, reps=50000)
+
+    # Plot IQM sample efficiency curve
+    fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
+    plot_utils.plot_sample_efficiency_curve(
+        results.env_steps, iqm_scores, iqm_cis, algorithms=results.algorithms,
+        xlabel=r'Number of env steps',
+        ylabel='IQM episode return',
+        ax=ax)
+    plt.savefig(os.path.join(results.exp_dir, 'iqm_sample_efficiency_curve.png'))
+
+    final_score_dict = {algo: returns[:, [-1]] for algo, returns in results.score_dict.items()}
+    score_distributions, score_distributions_cis = rly.create_performance_profile(
+        final_score_dict, results.score_thresholds)
+
+    # Plot score distributions
+    fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
+    plot_utils.plot_performance_profiles(
+        score_distributions, results.score_thresholds,
+        performance_profile_cis=score_distributions_cis,
+        colors=dict(zip(results.algorithms,
+                        sns.color_palette('colorblind'))),
+        xlabel=r'Episode return $(\tau)$',
+        ax=ax)
+    plt.savefig(os.path.join(results.exp_dir, 'performance_profile.png'))
+
 
 def find_all_files(root_dir, pattern):
     """Find all files under root_dir according to relative pattern."""
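
Similarly, a minimal sketch of using the new tools.py helpers on their own, e.g. to
re-plot an existing log directory with explicit score thresholds (the path and the
threshold range below are illustrative assumptions):

    import numpy as np

    from examples.mujoco.tools import RLiableExperimentResult, eval_results

    # Load every seeded run found under a log directory written by
    # mujoco_ppo_hl_multi.py; "PPO" is the legend label for this algorithm.
    # Explicit thresholds fix the x-axis of the performance profile; with None
    # they are derived from the test episode returns.
    thresholds = np.linspace(0.0, 6000.0, 101)  # illustrative range
    result = RLiableExperimentResult.load_from_disk(
        "log/Ant-v4/ppo/<datetime_tag>",  # placeholder for the directory created by main()
        "PPO",
        thresholds,
    )

    # Writes iqm_sample_efficiency_curve.png and performance_profile.png into exp_dir.
    eval_results(result)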