Tianshou/test/continuous/test_td3.py

import os
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.policy import TD3Policy
from tianshou.utils import BasicLogger
from tianshou.env import DummyVectorEnv
from tianshou.utils.net.common import Net
from tianshou.trainer import offpolicy_trainer
from tianshou.exploration import GaussianNoise
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.utils.net.continuous import Actor, Critic


def get_args():
    """Declare the TD3 test's command-line options and parse them.

    Parsing uses ``parse_known_args``, so command-line arguments that are not
    declared here are ignored rather than rejected.

    :return: the namespace of parsed options.
    """
    fallback_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # (flag, add_argument keyword arguments), in declaration order
    option_table = (
        ('--task', dict(type=str, default='Pendulum-v0')),
        ('--seed', dict(type=int, default=1)),
        ('--buffer-size', dict(type=int, default=20000)),
        ('--actor-lr', dict(type=float, default=1e-4)),
        ('--critic-lr', dict(type=float, default=1e-3)),
        ('--gamma', dict(type=float, default=0.99)),
        ('--tau', dict(type=float, default=0.005)),
        ('--exploration-noise', dict(type=float, default=0.1)),
        ('--policy-noise', dict(type=float, default=0.2)),
        ('--noise-clip', dict(type=float, default=0.5)),
        ('--update-actor-freq', dict(type=int, default=2)),
        ('--epoch', dict(type=int, default=5)),
        ('--step-per-epoch', dict(type=int, default=20000)),
        ('--step-per-collect', dict(type=int, default=8)),
        ('--update-per-step', dict(type=float, default=0.125)),
        ('--batch-size', dict(type=int, default=128)),
        ('--hidden-sizes', dict(type=int, nargs='*', default=[128, 128])),
        ('--training-num', dict(type=int, default=8)),
        ('--test-num', dict(type=int, default=100)),
        ('--logdir', dict(type=str, default='log')),
        ('--render', dict(type=float, default=0.)),
        ('--rew-norm', dict(action="store_true", default=False)),
        ('--n-step', dict(type=int, default=3)),
        ('--device', dict(type=str, default=fallback_device)),
    )
    cli = argparse.ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    # parse_known_args returns (namespace, leftovers); only the namespace is used
    known, _leftover = cli.parse_known_args()
    return known


def test_td3(args=None):
    """Train a TD3 policy on ``args.task`` and assert it reaches the threshold.

    The function builds the environments, actor/critic networks, policy and
    collectors, runs ``offpolicy_trainer`` and finally asserts that the best
    reward reported by the trainer satisfies ``stop_fn``.

    :param args: option namespace as produced by ``get_args()``. When ``None``
        (the default) the options are parsed at call time. The namespace is
        extended in place with ``state_shape``, ``action_shape`` and
        ``max_action``.
    :raises AssertionError: if the best reward is below the reward threshold.
    """
    # Resolve the default lazily: a ``get_args()`` default would be evaluated
    # once at import time and the same namespace object would then be mutated
    # by every call.
    if args is None:
        args = get_args()
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # override the threshold used by stop_fn for this task
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: one actor and two critics, each with its own Adam optimizer
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = Actor(net, args.action_shape, max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c1 = Net(args.state_shape, args.action_shape,
                 hidden_sizes=args.hidden_sizes,
                 concat=True, device=args.device)
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    net_c2 = Net(args.state_shape, args.action_shape,
                 hidden_sizes=args.hidden_sizes,
                 concat=True, device=args.device)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        tau=args.tau, gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        policy_noise=args.policy_noise,
        update_actor_freq=args.update_actor_freq,
        noise_clip=args.noise_clip,
        reward_normalization=args.rew_norm,
        estimation_step=args.n_step,
        action_space=env.action_space)
    # collector
    train_collector = Collector(
        policy, train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3')
    writer = SummaryWriter(log_path)
    logger = BasicLogger(writer)

    def save_fn(policy):
        """Write the policy's state dict to ``policy.pth`` under ``log_path``."""
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        """Return whether ``mean_rewards`` reaches the env's reward threshold."""
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size,
        update_per_step=args.update_per_step, stop_fn=stop_fn,
        save_fn=save_fn, logger=logger)
    assert stop_fn(result['best_reward'])

    # Only when executed as a script: print the training summary and roll out
    # a single episode with the trained policy.
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")


# Allow running this test file directly as a script.
if __name__ == '__main__':
    test_td3()
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`import os`
td3 2020-03-23 11:34:52 +08:00			`import gym`
			`import torch`
			`import pprint`
			`import argparse`
			`import numpy as np`
			`from torch.utils.tensorboard import SummaryWriter`

			`from tianshou.policy import TD3Policy`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`from tianshou.utils import BasicLogger`
Numba acceleration (#193) Training FPS improvement (base commit is 94bfb32): test_pdqn: 1660 (without numba) -> 1930 discrete/test_ppo: 5100 -> 5170 since nstep has little impact on overall performance, the unit test result is: GAE: 4.1s -> 0.057s nstep: 0.3s -> 0.15s (little improvement) Others: - fix a bug in ttt set_eps - keep only sumtree in segment tree implementation - dirty fix for asyncVenv check_id test 2020-09-02 13:03:32 +08:00			`from tianshou.env import DummyVectorEnv`
			`from tianshou.utils.net.common import Net`
td3 2020-03-23 11:34:52 +08:00			`from tianshou.trainer import offpolicy_trainer`
Add auto alpha tuning and exploration noise for sac. (#80) Add class BaseNoise and GaussianNoise for the concept of exploration noise. Add new test for sac tested in MountainCarContinuous-v0, which should benefits from the two above new feature. 2020-06-16 22:17:28 +08:00			`from tianshou.exploration import GaussianNoise`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`from tianshou.data import Collector, VectorReplayBuffer`
Remove dummy net code (#123) * remove dummy net; delete two files * split code to have backbone and head * rename class * change torch.float to torch.float32 * use flatten(1) instead of view(batch, -1) * remove dummy net in docs * bugfix for rnn * fix cuda error * minor fix of docs * do not change the example code in dqn tutorial, since it is for demonstration Co-authored-by: Trinkle23897 <463003665@qq.com> 2020-07-09 22:57:01 +08:00			`from tianshou.utils.net.continuous import Actor, Critic`
td3 2020-03-23 11:34:52 +08:00

			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--task', type=str, default='Pendulum-v0')`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--seed', type=int, default=1)`
td3 2020-03-23 11:34:52 +08:00			`parser.add_argument('--buffer-size', type=int, default=20000)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--actor-lr', type=float, default=1e-4)`
td3 2020-03-23 11:34:52 +08:00			`parser.add_argument('--critic-lr', type=float, default=1e-3)`
			`parser.add_argument('--gamma', type=float, default=0.99)`
			`parser.add_argument('--tau', type=float, default=0.005)`
			`parser.add_argument('--exploration-noise', type=float, default=0.1)`
			`parser.add_argument('--policy-noise', type=float, default=0.2)`
			`parser.add_argument('--noise-clip', type=float, default=0.5)`
			`parser.add_argument('--update-actor-freq', type=int, default=2)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--epoch', type=int, default=5)`
Trainer refactor : some definition change (#293) This PR focus on some definition change of trainer to make it more friendly to use and be consistent with typical usage in research papers, typically change `collect-per-step` to `step-per-collect`, add `update-per-step` / `episode-per-collect` accordingly, and modify the documentation. 2021-02-21 13:06:02 +08:00			`parser.add_argument('--step-per-epoch', type=int, default=20000)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--step-per-collect', type=int, default=8)`
			`parser.add_argument('--update-per-step', type=float, default=0.125)`
td3 2020-03-23 11:34:52 +08:00			`parser.add_argument('--batch-size', type=int, default=128)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128])`
			`parser.add_argument('--training-num', type=int, default=8)`
td3 2020-03-23 11:34:52 +08:00			`parser.add_argument('--test-num', type=int, default=100)`
			`parser.add_argument('--logdir', type=str, default='log')`
add examples, fix some bugs (#5) * update atari.py * fix setup.py pass the pytest * fix setup.py pass the pytest * add args "render" * change the tensorboard writter * change the tensorboard writter * change device, render, tensorboard log location * change device, render, tensorboard log location * remove some wrong local files * fix some tab mistakes and the envs name in continuous/test_xx.py * add examples and point robot maze environment * fix some bugs during testing examples * add dqn network and fix some args * change back the tensorboard writter's frequency to ensure ppo and a2c can write things normally * add a warning to collector * rm some unrelated files * reformat * fix a bug in test_dqn due to the model wrong selection 2020-03-28 07:27:18 +08:00			`parser.add_argument('--render', type=float, default=0.)`
Remove reward_normaliztion option in offpolicy algorithm (#298) * remove rew_norm in nstep implementation * improve test * remove runnable/ * various doc fix Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-27 11:20:43 +08:00			`parser.add_argument('--rew-norm', action="store_true", default=False)`
			`parser.add_argument('--n-step', type=int, default=3)`
td3 2020-03-23 11:34:52 +08:00			`parser.add_argument(`
			`'--device', type=str,`
			`default='cuda' if torch.cuda.is_available() else 'cpu')`
			`args = parser.parse_known_args()[0]`
			`return args`


			`def test_td3(args=get_args()):`
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`torch.set_num_threads(1) # we just need only one thread for NN`
td3 2020-03-23 11:34:52 +08:00			`env = gym.make(args.task)`
			`if args.task == 'Pendulum-v0':`
			`env.spec.reward_threshold = -250`
			`args.state_shape = env.observation_space.shape or env.observation_space.n`
			`args.action_shape = env.action_space.shape or env.action_space.n`
			`args.max_action = env.action_space.high[0]`
add some docs 2020-04-03 21:28:12 +08:00			`# you can also use tianshou.env.SubprocVectorEnv`
td3 2020-03-23 11:34:52 +08:00			`# train_envs = gym.make(args.task)`
code refactor for venv (#179) - Refacor code to remove duplicate code - Enable async simulation for all vector envs - Remove `collector.close` and rename `VectorEnv` to `DummyVectorEnv` The abstraction of vector env changed. Prior to this pr, each vector env is almost independent. After this pr, each env is wrapped into a worker, and vector envs differ with their worker type. In fact, users can just use `BaseVectorEnv` with different workers, I keep `SubprocVectorEnv`, `ShmemVectorEnv` for backward compatibility. Co-authored-by: n+e <463003665@qq.com> Co-authored-by: magicly <magicly007@gmail.com> 2020-08-19 15:00:24 +08:00			`train_envs = DummyVectorEnv(`
fix collector 2020-03-25 14:08:28 +08:00			`[lambda: gym.make(args.task) for _ in range(args.training_num)])`
td3 2020-03-23 11:34:52 +08:00			`# test_envs = gym.make(args.task)`
code refactor for venv (#179) - Refacor code to remove duplicate code - Enable async simulation for all vector envs - Remove `collector.close` and rename `VectorEnv` to `DummyVectorEnv` The abstraction of vector env changed. Prior to this pr, each vector env is almost independent. After this pr, each env is wrapped into a worker, and vector envs differ with their worker type. In fact, users can just use `BaseVectorEnv` with different workers, I keep `SubprocVectorEnv`, `ShmemVectorEnv` for backward compatibility. Co-authored-by: n+e <463003665@qq.com> Co-authored-by: magicly <magicly007@gmail.com> 2020-08-19 15:00:24 +08:00			`test_envs = DummyVectorEnv(`
fix collector 2020-03-25 14:08:28 +08:00			`[lambda: gym.make(args.task) for _ in range(args.test_num)])`
td3 2020-03-23 11:34:52 +08:00			`# seed`
			`np.random.seed(args.seed)`
			`torch.manual_seed(args.seed)`
			`train_envs.seed(args.seed)`
			`test_envs.seed(args.seed)`
			`# model`
update utils.network (#275) This is the first commit of 6 commits mentioned in #274, which features 1. Refactor of `Class Net` to support any form of MLP. 2. Enable type check in utils.network. 3. Relative change in docs/test/examples. 4. Move atari-related network to examples/atari/atari_network.py Co-authored-by: Trinkle23897 <trinkle23897@gmail.com> 2021-01-20 16:54:13 +08:00			`net = Net(args.state_shape, hidden_sizes=args.hidden_sizes,`
			`device=args.device)`
			`actor = Actor(net, args.action_shape, max_action=args.max_action,`
			`device=args.device).to(args.device)`
td3 2020-03-23 11:34:52 +08:00			`actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)`
update utils.network (#275) This is the first commit of 6 commits mentioned in #274, which features 1. Refactor of `Class Net` to support any form of MLP. 2. Enable type check in utils.network. 3. Relative change in docs/test/examples. 4. Move atari-related network to examples/atari/atari_network.py Co-authored-by: Trinkle23897 <trinkle23897@gmail.com> 2021-01-20 16:54:13 +08:00			`net_c1 = Net(args.state_shape, args.action_shape,`
			`hidden_sizes=args.hidden_sizes,`
			`concat=True, device=args.device)`
			`critic1 = Critic(net_c1, device=args.device).to(args.device)`
td3 2020-03-23 11:34:52 +08:00			`critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)`
update utils.network (#275) This is the first commit of 6 commits mentioned in #274, which features 1. Refactor of `Class Net` to support any form of MLP. 2. Enable type check in utils.network. 3. Relative change in docs/test/examples. 4. Move atari-related network to examples/atari/atari_network.py Co-authored-by: Trinkle23897 <trinkle23897@gmail.com> 2021-01-20 16:54:13 +08:00			`net_c2 = Net(args.state_shape, args.action_shape,`
			`hidden_sizes=args.hidden_sizes,`
			`concat=True, device=args.device)`
			`critic2 = Critic(net_c2, device=args.device).to(args.device)`
td3 2020-03-23 11:34:52 +08:00			`critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)`
			`policy = TD3Policy(`
			`actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,`
code format and update function signatures (#213) Cherry-pick from #200 - update the function signature - format code-style - move _compile into separate functions - fix a bug in to_torch and to_numpy (Batch) - remove None in action_range In short, the code-format only contains function-signature style and `'` -> `"`. (pick up from [black](https://github.com/psf/black)) 2020-09-12 15:39:01 +08:00			`tau=args.tau, gamma=args.gamma,`
			`exploration_noise=GaussianNoise(sigma=args.exploration_noise),`
			`policy_noise=args.policy_noise,`
			`update_actor_freq=args.update_actor_freq,`
			`noise_clip=args.noise_clip,`
nstep all (fix #51) 2020-06-03 13:59:47 +08:00			`reward_normalization=args.rew_norm,`
Remap action to fit gym's action space (#313) Co-authored-by: Trinkle23897 <trinkle23897@gmail.com> 2021-03-21 16:45:50 +08:00			`estimation_step=args.n_step,`
			`action_space=env.action_space)`
td3 2020-03-23 11:34:52 +08:00			`# collector`
			`train_collector = Collector(`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`policy, train_envs,`
			`VectorReplayBuffer(args.buffer_size, len(train_envs)),`
			`exploration_noise=True)`
td3 2020-03-23 11:34:52 +08:00			`test_collector = Collector(policy, test_envs)`
fix collector 2020-03-25 14:08:28 +08:00			`# train_collector.collect(n_step=args.buffer_size)`
td3 2020-03-23 11:34:52 +08:00			`# log`
save_fn 2020-04-11 16:54:27 +08:00			`log_path = os.path.join(args.logdir, args.task, 'td3')`
Performance improve (#18) * improve performance set one thread for NN replace detach() op with torch.no_grad() * fix pep 8 errors 2020-04-05 09:10:21 +08:00			`writer = SummaryWriter(log_path)`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`logger = BasicLogger(writer)`
td3 2020-03-23 11:34:52 +08:00
save_fn 2020-04-11 16:54:27 +08:00			`def save_fn(policy):`
			`torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))`

change API of train_fn and test_fn (#229) train_fn(epoch) -> train_fn(epoch, num_env_step) test_fn(epoch) -> test_fn(epoch, num_env_step) 2020-09-26 16:35:37 +08:00			`def stop_fn(mean_rewards):`
			`return mean_rewards >= env.spec.reward_threshold`
td3 2020-03-23 11:34:52 +08:00
			`# trainer`
			`result = offpolicy_trainer(`
			`policy, train_collector, test_collector, args.epoch,`
Trainer refactor : some definition change (#293) This PR focus on some definition change of trainer to make it more friendly to use and be consistent with typical usage in research papers, typically change `collect-per-step` to `step-per-collect`, add `update-per-step` / `episode-per-collect` accordingly, and modify the documentation. 2021-02-21 13:06:02 +08:00			`args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size,`
			`update_per_step=args.update_per_step, stop_fn=stop_fn,`
add logger (#295) This PR focus on refactor of logging method to solve bug of nan reward and log interval. After these two pr, hopefully fundamental change of tianshou/data is finished. We then can concentrate on building benchmarks of tianshou finally. Things changed: 1. trainer now accepts logger (BasicLogger or LazyLogger) instead of writer; 2. remove utils.SummaryWriter; 2021-02-24 14:48:42 +08:00			`save_fn=save_fn, logger=logger)`
fix collector 2020-03-25 14:08:28 +08:00			`assert stop_fn(result['best_reward'])`
Make trainer resumable (#350) - specify tensorboard >= 2.5.0 - add `save_checkpoint_fn` and `resume_from_log` in trainer Co-authored-by: Trinkle23897 <trinkle23897@gmail.com> 2021-05-06 08:53:53 +08:00
td3 2020-03-23 11:34:52 +08:00			`if __name__ == '__main__':`
			`pprint.pprint(result)`
			`# Let's watch its performance!`
			`env = gym.make(args.task)`
optimize training procedure and improve code coverage (#189) 1. add policy.eval() in all test scripts' "watch performance" 2. remove dict return support for collector preprocess_fn 3. add `__contains__` and `pop` in batch: `key in batch`, `batch.pop(key, deft)` 4. exact n_episode for a list of n_episode limitation and save fake data in cache_buffer when self.buffer is None (#184) 5. fix tensorboard logging: h-axis stands for env step instead of gradient step; add test results into tensorboard 6. add test_returns (both GAE and nstep) 7. change the type-checking order in batch.py and converter.py in order to meet the most often case first 8. fix shape inconsistency for torch.Tensor in replay buffer 9. remove `**kwargs` in ReplayBuffer 10. remove default value in batch.split() and add merge_last argument (#185) 11. improve nstep efficiency 12. add max_batchsize in onpolicy algorithms 13. potential bugfix for subproc.wait 14. fix RecurrentActorProb 15. improve the code-coverage (from 90% to 95%) and remove the dead code 16. fix some incorrect type annotation The above improvement also increases the training FPS: on my computer, the previous version is only ~1800 FPS and after that, it can reach ~2050 (faster than v0.2.4.post1). 2020-08-27 12:15:18 +08:00			`policy.eval()`
td3 2020-03-23 11:34:52 +08:00			`collector = Collector(policy, env)`
add examples, fix some bugs (#5) * update atari.py * fix setup.py pass the pytest * fix setup.py pass the pytest * add args "render" * change the tensorboard writter * change the tensorboard writter * change device, render, tensorboard log location * change device, render, tensorboard log location * remove some wrong local files * fix some tab mistakes and the envs name in continuous/test_xx.py * add examples and point robot maze environment * fix some bugs during testing examples * add dqn network and fix some args * change back the tensorboard writter's frequency to ensure ppo and a2c can write things normally * add a warning to collector * rm some unrelated files * reformat * fix a bug in test_dqn due to the model wrong selection 2020-03-28 07:27:18 +08:00			`result = collector.collect(n_episode=1, render=args.render)`
Step collector implementation (#280) This is the third PR of 6 commits mentioned in #274, which features refactor of Collector to fix #245. You can check #274 for more detail. Things changed in this PR: 1. refactor collector to be more cleaner, split AsyncCollector to support asyncvenv; 2. change buffer.add api to add(batch, bffer_ids); add several types of buffer (VectorReplayBuffer, PrioritizedVectorReplayBuffer, etc.) 3. add policy.exploration_noise(act, batch) -> act 4. small change in BasePolicy.compute_*_returns 5. move reward_metric from collector to trainer 6. fix np.asanyarray issue (different version's numpy will result in different output) 7. flake8 maxlength=88 8. polish docs and fix test Co-authored-by: n+e <trinkle23897@gmail.com> 2021-02-19 10:33:49 +08:00			`rews, lens = result["rews"], result["lens"]`
			`print(f"Final reward: {rews.mean()}, length: {lens.mean()}")`
td3 2020-03-23 11:34:52 +08:00

			`if __name__ == '__main__':`
			`test_td3()`