# Tianshou/tianshou/trainer/onpolicy.py

import time
import tqdm

from tianshou.utils import tqdm_config, MovAvg
from tianshou.trainer import test_episode, gather_info


def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,
                     step_per_epoch, collect_per_step, repeat_per_collect,
                     episode_per_test, batch_size,
                     train_fn=None, test_fn=None, stop_fn=None,
                     writer=None, log_interval=1, verbose=True, task='',
                     **kwargs):
    """A wrapper for on-policy trainer procedure.

    Each epoch alternates "collect episodes -> learn on the collected data ->
    clear the buffer" until the progress counter reaches ``step_per_epoch``,
    then runs one evaluation with the test collector.

    :param policy: an instance of the :class:`~tianshou.policy.BasePolicy`
        class.
    :param train_collector: the collector used for training.
    :type train_collector: :class:`~tianshou.data.Collector`
    :param test_collector: the collector used for testing.
    :type test_collector: :class:`~tianshou.data.Collector`
    :param int max_epoch: the maximum of epochs for training. The training
        process might be finished before reaching the ``max_epoch``.
    :param int step_per_epoch: the number of step for updating policy network
        in one epoch.
    :param int collect_per_step: the number of episodes the collector would
        collect before the network update (it is passed to the collector as
        ``n_episode``). In other words, collect some episodes and do one
        policy network update.
    :param int repeat_per_collect: the number of repeat time for policy
        learning, for example, set it to 2 means the policy needs to learn each
        given batch data twice.
    :param episode_per_test: the number of episodes for one policy evaluation.
    :type episode_per_test: int or list of ints
    :param int batch_size: the batch size of sample data, which is going to
        feed in the policy network.
    :param function train_fn: a function receives the current number of epoch
        index and performs some operations at the beginning of training in this
        epoch.
    :param function test_fn: a function receives the current number of epoch
        index and performs some operations at the beginning of testing in this
        epoch.
    :param function stop_fn: a function receives the average undiscounted
        returns of the testing result, return a boolean which indicates whether
        reaching the goal. It is also called with the training reward of each
        collect; when that returns true an extra evaluation is run and, if
        the test reward satisfies ``stop_fn`` as well, training returns
        immediately.
    :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard
        SummaryWriter.
    :param int log_interval: the log interval of the writer; scalars are
        written only when the global step is a multiple of this value.
    :param bool verbose: whether to print the information.
    :param str task: if non-empty, ``'_' + task`` is appended to every scalar
        tag written to ``writer``.
    :param kwargs: accepted for call compatibility; not used by this function.

    :return: See :func:`~tianshou.trainer.gather_info`.
    """
    global_step = 0
    best_epoch, best_reward = -1, -1
    stat = {}  # loss name -> MovAvg of that loss, shared across epochs
    start_time = time.time()
    for epoch in range(1, 1 + max_epoch):
        # train
        policy.train()
        if train_fn:
            train_fn(epoch)
        with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}',
                       **tqdm_config) as t:
            while t.n < t.total:
                result = train_collector.collect(n_episode=collect_per_step)
                # progress-bar postfix: collector statistics first, the
                # smoothed losses are added after the update below
                data = {k: f'{v:.2f}' for k, v in result.items()}
                if stop_fn and stop_fn(result['rew']):
                    # the training reward already reaches the goal: confirm
                    # it on the test collector before stopping early
                    test_result = test_episode(
                        policy, test_collector, test_fn,
                        epoch, episode_per_test)
                    if stop_fn(test_result['rew']):
                        t.set_postfix(**data)
                        return gather_info(
                            start_time, train_collector, test_collector,
                            test_result['rew'])
                    # not confirmed: switch back to training and re-run the
                    # per-epoch training hook
                    policy.train()
                    if train_fn:
                        train_fn(epoch)
                losses = policy.learn(
                    train_collector.sample(0), batch_size, repeat_per_collect)
                # on-policy: the collected data is dropped after one update
                train_collector.reset_buffer()
                # a loss reported as a list counts one step per element;
                # every collect/learn round advances by at least one step
                step = max(
                    [1] + [len(v) for v in losses.values()
                           if isinstance(v, list)])
                global_step += step
                should_log = bool(writer) and global_step % log_interval == 0
                if should_log:
                    for k, v in result.items():
                        writer.add_scalar(
                            (k + '_' + task) if task else k,
                            v, global_step=global_step)
                for k, v in losses.items():
                    if k not in stat:
                        stat[k] = MovAvg()
                    stat[k].add(v)
                    data[k] = f'{stat[k].get():.6f}'
                    if should_log:
                        writer.add_scalar(
                            (k + '_' + task) if task else k,
                            stat[k].get(), global_step=global_step)
                t.update(step)
                t.set_postfix(**data)
            # NOTE(review): the loop above only exits once t.n >= t.total, so
            # this fires only when t.n == t.total and moves the bar one past
            # its total -- kept as-is, confirm whether that is intended.
            if t.n <= t.total:
                t.update()
        # test
        result = test_episode(
            policy, test_collector, test_fn, epoch, episode_per_test)
        if best_epoch == -1 or best_reward < result['rew']:
            best_reward = result['rew']
            best_epoch = epoch
        if verbose:
            print(f'Epoch #{epoch}: test_reward: {result["rew"]:.6f}, '
                  f'best_reward: {best_reward:.6f} in #{best_epoch}')
        if stop_fn and stop_fn(best_reward):
            break
    return gather_info(
        start_time, train_collector, test_collector, best_reward)
add trainer 2020-03-19 17:23:46 +08:00			`import time`
			`import tqdm`

			`from tianshou.utils import tqdm_config, MovAvg`
ppo and early stop 2020-03-20 19:52:29 +08:00			`from tianshou.trainer import test_episode, gather_info`
add trainer 2020-03-19 17:23:46 +08:00

ppo and early stop 2020-03-20 19:52:29 +08:00			`def onpolicy_trainer(policy, train_collector, test_collector, max_epoch,`
			`step_per_epoch, collect_per_step, repeat_per_collect,`
			`episode_per_test, batch_size,`
			`train_fn=None, test_fn=None, stop_fn=None,`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00			`writer=None, log_interval=1, verbose=True, task='',`
			`**kwargs):`
			`"""A wrapper for on-policy trainer procedure.`

add policy docs (#21) 2020-04-06 19:36:59 +08:00			:param policy: an instance of the :class:`~tianshou.policy.BasePolicy`
			`class.`
			`:param train_collector: the collector used for training.`
			:type train_collector: :class:`~tianshou.data.Collector`
			`:param test_collector: the collector used for testing.`
			:type test_collector: :class:`~tianshou.data.Collector`
			`:param int max_epoch: the maximum of epochs for training. The training`
			process might be finished before reaching the ``max_epoch``.
			`:param int step_per_epoch: the number of step for updating policy network`
			`in one epoch.`
			`:param int collect_per_step: the number of frames the collector would`
			`collect before the network update. In other words, collect some frames`
			`and do one policy network update.`
			`:param int repeat_per_collect: the number of repeat time for policy`
			`learning, for example, set it to 2 means the policy needs to learn each`
			`given batch data twice.`
			`:param episode_per_test: the number of episodes for one policy evaluation.`
			`:type episode_per_test: int or list of ints`
			`:param int batch_size: the batch size of sample data, which is going to`
			`feed in the policy network.`
			`:param function train_fn: a function receives the current number of epoch`
			`index and performs some operations at the beginning of training in this`
			`epoch.`
			`:param function test_fn: a function receives the current number of epoch`
			`index and performs some operations at the beginning of testing in this`
			`epoch.`
			`:param function stop_fn: a function receives the average undiscounted`
			`returns of the testing result, return a boolean which indicates whether`
			`reaching the goal.`
			`:param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard`
			`SummaryWriter.`
			`:param int log_interval: the log interval of the writer.`
			`:param bool verbose: whether to print the information.`
add docs of collector and trainer (#20) 2020-04-05 18:34:45 +08:00
			:return: See :func:`~tianshou.trainer.gather_info`.
			`"""`
add trainer 2020-03-19 17:23:46 +08:00			`global_step = 0`
			`best_epoch, best_reward = -1, -1`
			`stat = {}`
			`start_time = time.time()`
			`for epoch in range(1, 1 + max_epoch):`
			`# train`
			`policy.train()`
			`if train_fn:`
			`train_fn(epoch)`
maybe finished rnn? 2020-04-08 21:13:15 +08:00			`with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}',`
			`**tqdm_config) as t:`
add trainer 2020-03-19 17:23:46 +08:00			`while t.n < t.total:`
			`result = train_collector.collect(n_episode=collect_per_step)`
ppo and early stop 2020-03-20 19:52:29 +08:00			`data = {}`
			`if stop_fn and stop_fn(result['rew']):`
			`test_result = test_episode(`
			`policy, test_collector, test_fn,`
			`epoch, episode_per_test)`
			`if stop_fn and stop_fn(test_result['rew']):`
			`for k in result.keys():`
			`data[k] = f'{result[k]:.2f}'`
			`t.set_postfix(**data)`
			`return gather_info(`
			`start_time, train_collector, test_collector,`
			`test_result['rew'])`
			`else:`
			`policy.train()`
			`if train_fn:`
			`train_fn(epoch)`
			`losses = policy.learn(`
			`train_collector.sample(0), batch_size, repeat_per_collect)`
add trainer 2020-03-19 17:23:46 +08:00			`train_collector.reset_buffer()`
			`step = 1`
			`for k in losses.keys():`
			`if isinstance(losses[k], list):`
			`step = max(step, len(losses[k]))`
			`global_step += step`
			`for k in result.keys():`
			`data[k] = f'{result[k]:.2f}'`
add an example of bullet env (experiment from jiqizhixin) (#15) * add_pybullet_ens_test test on pybullet envs modify some log config * delete DS_Store file * add pybullet_envs test add HalfCheetahBulletEnv-v0 test modify log config * fix pep 8 errors * add pybullet to dev * delete a line * by pass F401 * add log_interval to onpolicy_trainer * add comments * Update halfcheetahBullet_v0_sac.py 2020-04-04 11:46:18 +08:00			`if writer and global_step % log_interval == 0:`
add trainer 2020-03-19 17:23:46 +08:00			`writer.add_scalar(`
add rllib result and fix pep8 2020-03-28 09:43:35 +08:00			`k + '_' + task if task else k,`
			`result[k], global_step=global_step)`
add trainer 2020-03-19 17:23:46 +08:00			`for k in losses.keys():`
			`if stat.get(k) is None:`
			`stat[k] = MovAvg()`
			`stat[k].add(losses[k])`
			`data[k] = f'{stat[k].get():.6f}'`
add an example of bullet env (experiment from jiqizhixin) (#15) * add_pybullet_ens_test test on pybullet envs modify some log config * delete DS_Store file * add pybullet_envs test add HalfCheetahBulletEnv-v0 test modify log config * fix pep 8 errors * add pybullet to dev * delete a line * by pass F401 * add log_interval to onpolicy_trainer * add comments * Update halfcheetahBullet_v0_sac.py 2020-04-04 11:46:18 +08:00			`if writer and global_step % log_interval == 0:`
add trainer 2020-03-19 17:23:46 +08:00			`writer.add_scalar(`
add rllib result and fix pep8 2020-03-28 09:43:35 +08:00			`k + '_' + task if task else k,`
			`stat[k].get(), global_step=global_step)`
add trainer 2020-03-19 17:23:46 +08:00			`t.update(step)`
			`t.set_postfix(**data)`
			`if t.n <= t.total:`
			`t.update()`
ppo and early stop 2020-03-20 19:52:29 +08:00			`# test`
			`result = test_episode(`
			`policy, test_collector, test_fn, epoch, episode_per_test)`
add trainer 2020-03-19 17:23:46 +08:00			`if best_epoch == -1 or best_reward < result['rew']:`
			`best_reward = result['rew']`
			`best_epoch = epoch`
			`if verbose:`
			`print(f'Epoch #{epoch}: test_reward: {result["rew"]:.6f}, '`
			`f'best_reward: {best_reward:.6f} in #{best_epoch}')`
ppo and early stop 2020-03-20 19:52:29 +08:00			`if stop_fn and stop_fn(best_reward):`
add trainer 2020-03-19 17:23:46 +08:00			`break`
ppo and early stop 2020-03-20 19:52:29 +08:00			`return gather_info(`
			`start_time, train_collector, test_collector, best_reward)`