Tianshou/examples/box2d/acrobot_dualdqn.py

import argparse
import os
import pprint

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import DQNPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Acrobot-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--eps-test', type=float, default=0.05)
    parser.add_argument('--eps-train', type=float, default=0.5)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--n-step', type=int, default=3)
    parser.add_argument('--target-update-freq', type=int, default=320)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--step-per-epoch', type=int, default=100000)
    parser.add_argument('--step-per-collect', type=int, default=100)
    parser.add_argument('--update-per-step', type=float, default=0.01)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128])
    parser.add_argument(
        '--dueling-q-hidden-sizes', type=int, nargs='*', default=[128, 128]
    )
    parser.add_argument(
        '--dueling-v-hidden-sizes', type=int, nargs='*', default=[128, 128]
    )
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu'
    )
    return parser.parse_args()


def test_dqn(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)]
    )
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    Q_param = {"hidden_sizes": args.dueling_q_hidden_sizes}
    V_param = {"hidden_sizes": args.dueling_v_hidden_sizes}
    net = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
        dueling_param=(Q_param, V_param)
    ).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(
        net,
        optim,
        args.gamma,
        args.n_step,
        target_update_freq=args.target_update_freq
    )
    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    def train_fn(epoch, env_step):
        if env_step <= 100000:
            policy.set_eps(args.eps_train)
        elif env_step <= 500000:
            eps = args.eps_train - (env_step - 100000) / \
                400000 * (0.5 * args.eps_train)
            policy.set_eps(eps)
        else:
            policy.set_eps(0.5 * args.eps_train)

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        update_per_step=args.update_per_step,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_fn=save_fn,
        logger=logger
    )

    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        test_collector.reset()
        result = test_collector.collect(n_episode=args.test_num, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")


if __name__ == '__main__':
    test_dqn(get_args())
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import argparse`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`import os`
			`import pprint`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00
			`import gym`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`import numpy as np`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`import torch`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`from torch.utils.tensorboard import SummaryWriter`

bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`from tianshou.data import Collector, VectorReplayBuffer`
			`from tianshou.env import DummyVectorEnv`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`from tianshou.policy import DQNPolicy`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`from tianshou.trainer import offpolicy_trainer`
Add Weights and Biases Logger (#427) - rename BasicLogger to TensorboardLogger - refactor logger code - add WandbLogger Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2021-08-30 10:35:02 -04:00			`from tianshou.utils import TensorboardLogger`
change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`from tianshou.utils.net.common import Net`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00

			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--task', type=str, default='Acrobot-v1')`
			`parser.add_argument('--seed', type=int, default=0)`
			`parser.add_argument('--eps-test', type=float, default=0.05)`
			`parser.add_argument('--eps-train', type=float, default=0.5)`
			`parser.add_argument('--buffer-size', type=int, default=20000)`
			`parser.add_argument('--lr', type=float, default=1e-3)`
			`parser.add_argument('--gamma', type=float, default=0.95)`
			`parser.add_argument('--n-step', type=int, default=3)`
			`parser.add_argument('--target-update-freq', type=int, default=320)`
			`parser.add_argument('--epoch', type=int, default=10)`
Trainer refactor : some definition change (#293) This PR focus on some definition change of trainer to make it more friendly to use and be consistent with typical usage in research papers, typically change `collect-per-step` to `step-per-collect`, add `update-per-step` / `episode-per-collect` accordingly, and modify the documentation. 2021-02-21 13:06:02 +08:00			`parser.add_argument('--step-per-epoch', type=int, default=100000)`
			`parser.add_argument('--step-per-collect', type=int, default=100)`
			`parser.add_argument('--update-per-step', type=float, default=0.01)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`parser.add_argument('--batch-size', type=int, default=64)`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128])`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`parser.add_argument(`
			`'--dueling-q-hidden-sizes', type=int, nargs='*', default=[128, 128]`
			`)`
			`parser.add_argument(`
			`'--dueling-v-hidden-sizes', type=int, nargs='*', default=[128, 128]`
			`)`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`parser.add_argument('--training-num', type=int, default=10)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`parser.add_argument('--test-num', type=int, default=100)`
			`parser.add_argument('--logdir', type=str, default='log')`
			`parser.add_argument('--render', type=float, default=0.)`
			`parser.add_argument(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`'--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu'`
			`)`
code refactor for venv (#179) - Refacor code to remove duplicate code - Enable async simulation for all vector envs - Remove `collector.close` and rename `VectorEnv` to `DummyVectorEnv` The abstraction of vector env changed. Prior to this pr, each vector env is almost independent. After this pr, each env is wrapped into a worker, and vector envs differ with their worker type. In fact, users can just use `BaseVectorEnv` with different workers, I keep `SubprocVectorEnv`, `ShmemVectorEnv` for backward compatibility. Co-authored-by: n+e <463003665@qq.com> Co-authored-by: magicly <magicly007@gmail.com> 2020-08-19 15:00:24 +08:00			`return parser.parse_args()`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00

			`def test_dqn(args=get_args()):`
			`env = gym.make(args.task)`
			`args.state_shape = env.observation_space.shape or env.observation_space.n`
			`args.action_shape = env.action_space.shape or env.action_space.n`
			`# train_envs = gym.make(args.task)`
			`# you can also use tianshou.env.SubprocVectorEnv`
code refactor for venv (#179) - Refacor code to remove duplicate code - Enable async simulation for all vector envs - Remove `collector.close` and rename `VectorEnv` to `DummyVectorEnv` The abstraction of vector env changed. Prior to this pr, each vector env is almost independent. After this pr, each env is wrapped into a worker, and vector envs differ with their worker type. In fact, users can just use `BaseVectorEnv` with different workers, I keep `SubprocVectorEnv`, `ShmemVectorEnv` for backward compatibility. Co-authored-by: n+e <463003665@qq.com> Co-authored-by: magicly <magicly007@gmail.com> 2020-08-19 15:00:24 +08:00			`train_envs = DummyVectorEnv(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`[lambda: gym.make(args.task) for _ in range(args.training_num)]`
			`)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`# test_envs = gym.make(args.task)`
code refactor for venv (#179) - Refacor code to remove duplicate code - Enable async simulation for all vector envs - Remove `collector.close` and rename `VectorEnv` to `DummyVectorEnv` The abstraction of vector env changed. Prior to this pr, each vector env is almost independent. After this pr, each env is wrapped into a worker, and vector envs differ with their worker type. In fact, users can just use `BaseVectorEnv` with different workers, I keep `SubprocVectorEnv`, `ShmemVectorEnv` for backward compatibility. Co-authored-by: n+e <463003665@qq.com> Co-authored-by: magicly <magicly007@gmail.com> 2020-08-19 15:00:24 +08:00			`test_envs = DummyVectorEnv(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`[lambda: gym.make(args.task) for _ in range(args.test_num)]`
			`)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`# seed`
			`np.random.seed(args.seed)`
			`torch.manual_seed(args.seed)`
			`train_envs.seed(args.seed)`
			`test_envs.seed(args.seed)`
			`# model`
update utils.network (#275) This is the first commit of 6 commits mentioned in #274, which features 1. Refactor of `Class Net` to support any form of MLP. 2. Enable type check in utils.network. 3. Relative change in docs/test/examples. 4. Move atari-related network to examples/atari/atari_network.py Co-authored-by: Trinkle23897 <trinkle23897@gmail.com> 2021-01-20 16:54:13 +08:00			`Q_param = {"hidden_sizes": args.dueling_q_hidden_sizes}`
			`V_param = {"hidden_sizes": args.dueling_v_hidden_sizes}`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`net = Net(`
			`args.state_shape,`
			`args.action_shape,`
			`hidden_sizes=args.hidden_sizes,`
			`device=args.device,`
			`dueling_param=(Q_param, V_param)`
			`).to(args.device)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`optim = torch.optim.Adam(net.parameters(), lr=args.lr)`
			`policy = DQNPolicy(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`net,`
			`optim,`
			`args.gamma,`
			`args.n_step,`
			`target_update_freq=args.target_update_freq`
			`)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`# collector`
			`train_collector = Collector(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`train_envs,`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`VectorReplayBuffer(args.buffer_size, len(train_envs)),`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`exploration_noise=True`
			`)`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`test_collector = Collector(policy, test_envs, exploration_noise=True)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`# policy.set_eps(1)`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`train_collector.collect(n_step=args.batch_size * args.training_num)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`# log`
			`log_path = os.path.join(args.logdir, args.task, 'dqn')`
			`writer = SummaryWriter(log_path)`
Add Weights and Biases Logger (#427) - rename BasicLogger to TensorboardLogger - refactor logger code - add WandbLogger Co-authored-by: Jiayi Weng <trinkle23897@gmail.com> 2021-08-30 10:35:02 -04:00			`logger = TensorboardLogger(writer)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00
			`def save_fn(policy):`
			`torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))`

change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`def stop_fn(mean_rewards):`
			`return mean_rewards >= env.spec.reward_threshold`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00
change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`def train_fn(epoch, env_step):`
			`if env_step <= 100000:`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`policy.set_eps(args.eps_train)`
change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`elif env_step <= 500000:`
			`eps = args.eps_train - (env_step - 100000) / \`
			`400000 * (0.5 * args.eps_train)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`policy.set_eps(eps)`
			`else:`
			`policy.set_eps(0.5 * args.eps_train)`

change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`def test_fn(epoch, env_step):`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00			`policy.set_eps(args.eps_test)`

			`# trainer`
			`result = offpolicy_trainer(`
bump to v0.4.3 (#432) * add makefile * bump version * add isort and yapf * update contributing.md * update PR template * spelling check 2021-09-03 05:05:04 +08:00			`policy,`
			`train_collector,`
			`test_collector,`
			`args.epoch,`
			`args.step_per_epoch,`
			`args.step_per_collect,`
			`args.test_num,`
			`args.batch_size,`
			`update_per_step=args.update_per_step,`
			`train_fn=train_fn,`
			`test_fn=test_fn,`
			`stop_fn=stop_fn,`
			`save_fn=save_fn,`
			`logger=logger`
			`)`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00
			`assert stop_fn(result['best_reward'])`
			`if __name__ == '__main__':`
			`pprint.pprint(result)`
			`# Let's watch its performance!`
optimize training procedure and improve code coverage (#189) 1. add policy.eval() in all test scripts' "watch performance" 2. remove dict return support for collector preprocess_fn 3. add `__contains__` and `pop` in batch: `key in batch`, `batch.pop(key, deft)` 4. exact n_episode for a list of n_episode limitation and save fake data in cache_buffer when self.buffer is None (#184) 5. fix tensorboard logging: h-axis stands for env step instead of gradient step; add test results into tensorboard 6. add test_returns (both GAE and nstep) 7. change the type-checking order in batch.py and converter.py in order to meet the most often case first 8. fix shape inconsistency for torch.Tensor in replay buffer 9. remove `**kwargs` in ReplayBuffer 10. remove default value in batch.split() and add merge_last argument (#185) 11. improve nstep efficiency 12. add max_batchsize in onpolicy algorithms 13. potential bugfix for subproc.wait 14. fix RecurrentActorProb 15. improve the code-coverage (from 90% to 95%) and remove the dead code 16. fix some incorrect type annotation The above improvement also increases the training FPS: on my computer, the previous version is only ~1800 FPS and after that, it can reach ~2050 (faster than v0.2.4.post1). 2020-08-27 12:15:18 +08:00			`policy.eval()`
			`policy.set_eps(args.eps_test)`
			`test_envs.seed(args.seed)`
			`test_collector.reset()`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`result = test_collector.collect(n_episode=args.test_num, render=args.render)`
			`rews, lens = result["rews"], result["lens"]`
			`print(f"Final reward: {rews.mean()}, length: {lens.mean()}")`
Dueling DQN (#170) Co-authored-by: n+e <463003665@qq.com> 2020-07-29 19:44:42 +08:00

			`if __name__ == '__main__':`
			`test_dqn(get_args())`