Yi Su 3592f45446
Fix critic network for Discrete CRR (#485)
- Fixes an inconsistency in the implementation of Discrete CRR: it now uses the `Critic` class for its critic, following the convention in other actor-critic policies;
- Updates several offline policies to use the `ActorCritic` class for their optimizers, eliminating randomness caused by parameter sharing between actor and critic (see the sketch below);
- Adds `writer.flush()` in TensorboardLogger to ensure real-time results;
- Enables `test_collector=None` in the 3 trainers to turn off testing during training;
- Updates the Atari offline results in README.md;
- Moves Atari offline RL examples to `examples/offline` and the corresponding tests to `test/offline`, per review comments.
2021-11-28 23:10:28 +08:00
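
For reference, a minimal sketch of the optimizer convention mentioned above, using `ActorCritic` from `tianshou.utils.net.common`; the network shapes and learning rate are illustrative assumptions, not values from this commit:

import torch
from torch import nn

from tianshou.utils.net.common import ActorCritic

# Illustrative actor and critic that share a feature extractor (hypothetical shapes).
feature_net = nn.Linear(4, 64)
actor = nn.Sequential(feature_net, nn.ReLU(), nn.Linear(64, 2))
critic = nn.Sequential(feature_net, nn.ReLU(), nn.Linear(64, 2))

# Wrapping both networks in ActorCritic and building a single optimizer over
# its parameters() yields each shared parameter exactly once, so the shared
# feature extractor is not registered (and updated) twice.
actor_critic = ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=1e-3)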


import time
from typing import Any, Callable, Dict, Optional, Union

import numpy as np

from tianshou.data import Collector
from tianshou.policy import BasePolicy
from tianshou.utils import BaseLogger


def test_episode(
    policy: BasePolicy,
    collector: Collector,
    test_fn: Optional[Callable[[int, Optional[int]], None]],
    epoch: int,
    n_episode: int,
    logger: Optional[BaseLogger] = None,
    global_step: Optional[int] = None,
    reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None,
) -> Dict[str, Any]:
    """A simple wrapper of testing policy in collector."""
    collector.reset_env()
    collector.reset_buffer()
    policy.eval()
    if test_fn:
        test_fn(epoch, global_step)
    result = collector.collect(n_episode=n_episode)
    if reward_metric:
        rew = reward_metric(result["rews"])
        result.update(rews=rew, rew=rew.mean(), rew_std=rew.std())
    if logger and global_step is not None:
        logger.log_test_data(result, global_step)
    return result


def gather_info(
    start_time: float,
    train_c: Optional[Collector],
    test_c: Optional[Collector],
    best_reward: float,
    best_reward_std: float,
) -> Dict[str, Union[float, str]]:
    """A simple wrapper of gathering information from collectors.

    :return: A dictionary with the following keys:

        * ``train_step`` the total collected step of training collector;
        * ``train_episode`` the total collected episode of training collector;
        * ``train_time/collector`` the time for collecting transitions in the \
            training collector;
        * ``train_time/model`` the time for training models;
        * ``train_speed`` the speed of training (env_step per second);
        * ``test_step`` the total collected step of test collector;
        * ``test_episode`` the total collected episode of test collector;
        * ``test_time`` the time for testing;
        * ``test_speed`` the speed of testing (env_step per second);
        * ``best_reward`` the best reward over the test results;
        * ``duration`` the total elapsed time.
    """
    duration = time.time() - start_time
    model_time = duration
    result: Dict[str, Union[float, str]] = {
        "duration": f"{duration:.2f}s",
        "train_time/model": f"{model_time:.2f}s",
    }
    if test_c is not None:
        model_time = duration - test_c.collect_time
        test_speed = test_c.collect_step / test_c.collect_time
        result.update(
            {
                "test_step": test_c.collect_step,
                "test_episode": test_c.collect_episode,
                "test_time": f"{test_c.collect_time:.2f}s",
                "test_speed": f"{test_speed:.2f} step/s",
                "best_reward": best_reward,
                "best_result": f"{best_reward:.2f} ± {best_reward_std:.2f}",
                "duration": f"{duration:.2f}s",
                "train_time/model": f"{model_time:.2f}s",
            }
        )
    if train_c is not None:
        model_time -= train_c.collect_time
        if test_c is not None:
            train_speed = train_c.collect_step / (duration - test_c.collect_time)
        else:
            train_speed = train_c.collect_step / duration
        result.update(
            {
                "train_step": train_c.collect_step,
                "train_episode": train_c.collect_episode,
                "train_time/collector": f"{train_c.collect_time:.2f}s",
                "train_time/model": f"{model_time:.2f}s",
                "train_speed": f"{train_speed:.2f} step/s",
            }
        )
    return result
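
A brief usage sketch, not part of the file above: with `test_collector=None` now supported, both collectors may be absent and `gather_info` still reports the elapsed time. The import from `tianshou.trainer` (which re-exports this helper) and the timing values are assumptions for illustration.

import time

from tianshou.trainer import gather_info

# Pretend training took about 12.3 seconds with no collectors attached
# (e.g. offline training with testing turned off).
start_time = time.time() - 12.3
info = gather_info(start_time, train_c=None, test_c=None, best_reward=200.0, best_reward_std=5.0)
print(info)  # -> {'duration': '12.30s', 'train_time/model': '12.30s'} (approximately)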