ChenDRAG 150d0ec51b
Step collector implementation (#280)
This is the third PR of the 6 commits outlined in #274; it refactors Collector to fix #245. See #274 for more details.

Things changed in this PR:

1. refactor Collector to be cleaner, and split out AsyncCollector to support async venvs;
2. change the buffer.add API to add(batch, buffer_ids), and add several buffer types (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.), as sketched below;
3. add policy.exploration_noise(act, batch) -> act, so the policy itself injects exploration noise;
4. make small changes to BasePolicy.compute_*_returns;
5. move reward_metric from the collector to the trainer;
6. fix an np.asanyarray issue (different numpy versions produce different outputs);
7. set the flake8 max line length to 88;
8. polish the docs and fix tests.
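For concreteness, here is a minimal sketch of the new buffer API from item 2; the buffer sizes, observation shapes, and zero-filled transitions below are made up for illustration. Item 3 means the collector now asks the policy itself to perturb the raw action via policy.exploration_noise(act, batch) before stepping the environments.

import numpy as np
from tianshou.data import Batch, VectorReplayBuffer

# One logical buffer backed by 4 sub-buffers, one per parallel env.
buf = VectorReplayBuffer(total_size=1000, buffer_num=4)

# Store one transition per env in a single call; buffer_ids maps each
# row of the batch to the sub-buffer of the env it came from.
batch = Batch(
    obs=np.zeros((4, 3)),
    act=np.zeros(4),
    rew=np.zeros(4),
    done=np.zeros(4, dtype=bool),
    obs_next=np.zeros((4, 3)),
)
buf.add(batch, buffer_ids=[0, 1, 2, 3])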

Co-authored-by: n+e <trinkle23897@gmail.com>
2021-02-19 10:33:49 +08:00


import time
from typing import Any, Callable, Dict, Optional, Union

import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector
from tianshou.policy import BasePolicy


def test_episode(
    policy: BasePolicy,
    collector: Collector,
    test_fn: Optional[Callable[[int, Optional[int]], None]],
    epoch: int,
    n_episode: int,
    writer: Optional[SummaryWriter] = None,
    global_step: Optional[int] = None,
    reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None,
) -> Dict[str, Any]:
    """A simple wrapper of testing policy in collector."""
    # Start every evaluation from a clean slate: fresh envs, an empty
    # buffer, and the policy in evaluation mode.
    collector.reset_env()
    collector.reset_buffer()
    policy.eval()
    if test_fn:
        test_fn(epoch, global_step)
    result = collector.collect(n_episode=n_episode)
    if reward_metric:
        # reward_metric maps raw per-episode rewards to a custom metric;
        # it now lives in the trainer instead of the collector (item 5).
        result["rews"] = reward_metric(result["rews"])
    if writer is not None and global_step is not None:
        rews, lens = result["rews"], result["lens"]
        writer.add_scalar("test/rew", rews.mean(), global_step=global_step)
        writer.add_scalar("test/rew_std", rews.std(), global_step=global_step)
        writer.add_scalar("test/len", lens.mean(), global_step=global_step)
        writer.add_scalar("test/len_std", lens.std(), global_step=global_step)
    return result


def gather_info(
    start_time: float,
    train_c: Optional[Collector],
    test_c: Collector,
    best_reward: float,
    best_reward_std: float,
) -> Dict[str, Union[float, str]]:
    """A simple wrapper of gathering information from collectors.

    :return: A dictionary with the following keys:

        * ``train_step`` the total collected step of training collector;
        * ``train_episode`` the total collected episode of training collector;
        * ``train_time/collector`` the time for collecting frames in the \
            training collector;
        * ``train_time/model`` the time for training models;
        * ``train_speed`` the speed of training (frames per second);
        * ``test_step`` the total collected step of test collector;
        * ``test_episode`` the total collected episode of test collector;
        * ``test_time`` the time for testing;
        * ``test_speed`` the speed of testing (frames per second);
        * ``best_reward`` the best reward over the test results;
        * ``duration`` the total elapsed time.
    """
    duration = time.time() - start_time
    model_time = duration - test_c.collect_time
    test_speed = test_c.collect_step / test_c.collect_time
    result: Dict[str, Union[float, str]] = {
        "test_step": test_c.collect_step,
        "test_episode": test_c.collect_episode,
        "test_time": f"{test_c.collect_time:.2f}s",
        "test_speed": f"{test_speed:.2f} step/s",
        "best_reward": best_reward,
        "best_result": f"{best_reward:.2f} ± {best_reward_std:.2f}",
        "duration": f"{duration:.2f}s",
        "train_time/model": f"{model_time:.2f}s",
    }
    if train_c is not None:
        # Model time is whatever remains after subtracting both collectors'
        # collecting time from the total duration.
        model_time -= train_c.collect_time
        # Training speed counts collecting and model-update time together,
        # i.e. everything except the time spent in the test collector.
        train_speed = train_c.collect_step / (duration - test_c.collect_time)
        result.update({
            "train_step": train_c.collect_step,
            "train_episode": train_c.collect_episode,
            "train_time/collector": f"{train_c.collect_time:.2f}s",
            "train_time/model": f"{model_time:.2f}s",
            "train_speed": f"{train_speed:.2f} step/s",
        })
    return result
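To see how these wrappers fit together, here is a hypothetical, self-contained sketch: the toy RandomPolicy, the CartPole envs, the log directory, and n_episode=8 are all assumptions for illustration, not part of this file.

import time

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Batch, Collector
from tianshou.env import DummyVectorEnv
from tianshou.policy import BasePolicy


class RandomPolicy(BasePolicy):
    """Toy policy for the sketch: uniform random CartPole actions."""

    def forward(self, batch, state=None, **kwargs):
        # One random discrete action per env in the incoming batch.
        return Batch(act=np.random.randint(2, size=len(batch.obs)))

    def learn(self, batch, **kwargs):
        return {}


start_time = time.time()  # reused by the gather_info sketch below
policy = RandomPolicy()
test_envs = DummyVectorEnv([lambda: gym.make("CartPole-v0") for _ in range(4)])
test_collector = Collector(policy, test_envs)
result = test_episode(
    policy, test_collector, test_fn=None, epoch=1, n_episode=8,
    writer=SummaryWriter("log/sketch"), global_step=0,
)
print(result["rews"].mean(), result["lens"].mean())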
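Continuing the sketch, gather_info then summarizes the run. With no training collector (train_c=None, e.g. a test-only run), only the test-side keys plus train_time/model appear in the result.

info = gather_info(
    start_time=start_time,
    train_c=None,  # no training collector in this toy run
    test_c=test_collector,
    best_reward=float(result["rews"].mean()),
    best_reward_std=float(result["rews"].std()),
)
print(info["test_speed"], info["best_result"], info["duration"])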