diff --git a/examples/ant_v2_ddpg.py b/examples/ant_v2_ddpg.py
index 2e265d6..16b0299 100644
--- a/examples/ant_v2_ddpg.py
+++ b/examples/ant_v2_ddpg.py
@@ -9,6 +9,7 @@ from tianshou.policy import DDPGPolicy
 from tianshou.trainer import offpolicy_trainer
 from tianshou.data import Collector, ReplayBuffer
 from tianshou.env import VectorEnv, SubprocVectorEnv
+from tianshou.exploration import GaussianNoise
 
 from continuous_net import Actor, Critic
 
@@ -67,7 +68,7 @@ def test_ddpg(args=get_args()):
     critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
     policy = DDPGPolicy(
         actor, actor_optim, critic, critic_optim,
-        args.tau, args.gamma, args.exploration_noise,
+        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
         [env.action_space.low[0], env.action_space.high[0]],
         reward_normalization=True, ignore_done=True)
     # collector
diff --git a/examples/ant_v2_td3.py b/examples/ant_v2_td3.py
index 905a473..45770b2 100644
--- a/examples/ant_v2_td3.py
+++ b/examples/ant_v2_td3.py
@@ -9,6 +9,7 @@ from tianshou.policy import TD3Policy
 from tianshou.trainer import offpolicy_trainer
 from tianshou.data import Collector, ReplayBuffer
 from tianshou.env import VectorEnv, SubprocVectorEnv
+from tianshou.exploration import GaussianNoise
 
 from continuous_net import Actor, Critic
 
@@ -74,7 +75,8 @@ def test_td3(args=get_args()):
     critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
     policy = TD3Policy(
         actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
-        args.tau, args.gamma, args.exploration_noise, args.policy_noise,
+        args.tau, args.gamma,
+        GaussianNoise(sigma=args.exploration_noise), args.policy_noise,
         args.update_actor_freq, args.noise_clip,
         [env.action_space.low[0], env.action_space.high[0]],
         reward_normalization=True, ignore_done=True)
diff --git a/examples/point_maze_td3.py b/examples/point_maze_td3.py
index 5475921..b3f2c95 100644
--- a/examples/point_maze_td3.py
+++ b/examples/point_maze_td3.py
@@ -9,6 +9,7 @@ from tianshou.policy import TD3Policy
 from tianshou.trainer import offpolicy_trainer
 from tianshou.data import Collector, ReplayBuffer
 from tianshou.env import VectorEnv, SubprocVectorEnv
+from tianshou.exploration import GaussianNoise
 
 from continuous_net import Actor, Critic
 from mujoco.register import reg
@@ -77,7 +78,8 @@ def test_td3(args=get_args()):
     critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
     policy = TD3Policy(
         actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
-        args.tau, args.gamma, args.exploration_noise, args.policy_noise,
+        args.tau, args.gamma,
+        GaussianNoise(sigma=args.exploration_noise), args.policy_noise,
         args.update_actor_freq, args.noise_clip,
         [env.action_space.low[0], env.action_space.high[0]],
         reward_normalization=True, ignore_done=True)
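The three examples above now wrap the float `--exploration-noise` value in a `GaussianNoise` object before handing it to `DDPGPolicy`/`TD3Policy`. Below is a minimal sketch of the call-with-shape interface this implies; the class name `GaussianNoiseSketch` is hypothetical and this is an assumption for illustration, not Tianshou's actual `GaussianNoise` implementation.

```python
import numpy as np


# Sketch of the noise interface implied by the calls above (assumption, not
# Tianshou's exact code): the policy stores the object and calls it with an
# action shape to get additive exploration noise.
class GaussianNoiseSketch:
    def __init__(self, mu: float = 0.0, sigma: float = 1.0) -> None:
        self._mu = mu
        self._sigma = sigma

    def __call__(self, shape) -> np.ndarray:
        # Draw N(mu, sigma^2) samples with the requested shape.
        return np.random.normal(self._mu, self._sigma, shape)

    def reset(self) -> None:
        # Gaussian noise is stateless; a stateful process (e.g.
        # Ornstein-Uhlenbeck) would clear its internal state here.
        pass


noise = GaussianNoiseSketch(sigma=0.1)
print(noise((4,)))  # four samples, one per action dimension
```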
diff --git a/examples/sac_mcc.py b/examples/sac_mcc.py
index d8b5413..ed8ca46 100644
--- a/examples/sac_mcc.py
+++ b/examples/sac_mcc.py
@@ -23,20 +23,20 @@ def get_args():
     parser.add_argument('--actor-lr', type=float, default=3e-4)
     parser.add_argument('--critic-lr', type=float, default=3e-4)
     parser.add_argument('--alpha-lr', type=float, default=3e-4)
-    parser.add_argument('--noise_std', type=float, default=0.5)
+    parser.add_argument('--noise_std', type=float, default=1.2)
     parser.add_argument('--gamma', type=float, default=0.99)
     parser.add_argument('--tau', type=float, default=0.005)
     parser.add_argument('--auto_alpha', type=bool, default=True)
     parser.add_argument('--alpha', type=float, default=0.2)
     parser.add_argument('--epoch', type=int, default=20)
     parser.add_argument('--step-per-epoch', type=int, default=2400)
-    parser.add_argument('--collect-per-step', type=int, default=1)
+    parser.add_argument('--collect-per-step', type=int, default=5)
     parser.add_argument('--batch-size', type=int, default=128)
     parser.add_argument('--layer-num', type=int, default=1)
-    parser.add_argument('--training-num', type=int, default=80)
+    parser.add_argument('--training-num', type=int, default=16)
     parser.add_argument('--test-num', type=int, default=100)
     parser.add_argument('--logdir', type=str, default='log')
-    parser.add_argument('--render', type=float, default=1.0/35.0)
+    parser.add_argument('--render', type=float, default=0.)
     parser.add_argument('--rew-norm', type=bool, default=False)
     parser.add_argument(
         '--device', type=str,
diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py
index 0839944..9428f11 100644
--- a/test/continuous/test_ddpg.py
+++ b/test/continuous/test_ddpg.py
@@ -28,6 +28,7 @@ def get_args():
     parser.add_argument('--gamma', type=float, default=0.99)
     parser.add_argument('--tau', type=float, default=0.005)
     parser.add_argument('--exploration-noise', type=float, default=0.1)
+    parser.add_argument('--test-noise', type=float, default=0.1)
     parser.add_argument('--epoch', type=int, default=20)
     parser.add_argument('--step-per-epoch', type=int, default=2400)
     parser.add_argument('--collect-per-step', type=int, default=4)
@@ -87,7 +88,8 @@ def test_ddpg(args=get_args()):
     # collector
     train_collector = Collector(
         policy, train_envs, ReplayBuffer(args.buffer_size))
-    test_collector = Collector(policy, test_envs)
+    test_collector = Collector(
+        policy, test_envs, action_noise=GaussianNoise(sigma=args.test_noise))
     # log
     log_path = os.path.join(args.logdir, args.task, 'ddpg')
     writer = SummaryWriter(log_path)
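With the new `--test-noise` flag, the DDPG test collector evaluates under Gaussian action noise. A usage sketch, assuming only the `Collector`/`GaussianNoise` API shown in this diff; the helper name `noisy_eval` is hypothetical, and `policy`/`test_envs` are supplied by the surrounding example script.

```python
from tianshou.data import Collector
from tianshou.exploration import GaussianNoise


def noisy_eval(policy, test_envs, sigma: float = 0.1, n_episode: int = 10):
    """Evaluate `policy` on `test_envs` with and without collector-level
    Gaussian action noise, using the `action_noise` argument added above."""
    clean = Collector(policy, test_envs).collect(n_episode=n_episode)
    noisy = Collector(
        policy, test_envs,
        action_noise=GaussianNoise(sigma=sigma)).collect(n_episode=n_episode)
    # Both results are statistics dicts; 'rew' holds the mean episode reward.
    return clean['rew'], noisy['rew']
```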
diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index 4ff0997..227e9e1 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -9,6 +9,7 @@ from tianshou.utils import MovAvg
 from tianshou.env import BaseVectorEnv
 from tianshou.policy import BasePolicy
 from tianshou.data import Batch, ReplayBuffer, ListReplayBuffer, to_numpy
+from tianshou.exploration import BaseNoise
 
 
 class Collector(object):
@@ -27,6 +28,9 @@ class Collector(object):
         added to the buffer, see issue #42, defaults to ``None``.
     :param int stat_size: for the moving average of recording speed, defaults
         to 100.
+    :param BaseNoise action_noise: add noise to the continuous action. A
+        policy usually already has its own exploration noise for training, so
+        this is mainly intended for the test collector, e.g. noisy evaluation.
 
     The ``preprocess_fn`` is a function called before the data has been added
     to the buffer with batch format, which receives up to 7 keys as listed in
@@ -87,6 +91,7 @@ class Collector(object):
                  = None,
                  preprocess_fn: Callable[[Any], Union[dict, Batch]] = None,
                  stat_size: Optional[int] = 100,
+                 action_noise: Optional[BaseNoise] = None,
                  **kwargs) -> None:
         super().__init__()
         self.env = env
@@ -119,6 +124,7 @@ class Collector(object):
         else:
             raise TypeError('The buffer in data collector is invalid!')
         self.stat_size = stat_size
+        self._action_noise = action_noise
         self.reset()
 
     def reset(self) -> None:
@@ -132,6 +138,8 @@ class Collector(object):
         self.collect_step = 0
         self.collect_episode = 0
         self.collect_time = 0
+        if self._action_noise is not None:
+            self._action_noise.reset()
 
     def reset_buffer(self) -> None:
         """Reset the main data buffer."""
@@ -268,6 +276,8 @@ class Collector(object):
             self._policy = to_numpy(result.policy) \
                 if hasattr(result, 'policy') else [{}] * self.env_num
             self._act = to_numpy(result.act)
+            if self._action_noise is not None:
+                self._act += self._action_noise(self._act.shape)
             obs_next, self._rew, self._done, self._info = self.env.step(
                 self._act if self._multi_env else self._act[0])
             if not self._multi_env:
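The last hunk is where the noise actually enters: after the policy produces `self._act`, the collector calls the noise object with the batched action's shape and adds the result element-wise before stepping the environments (the noise is also reset in `Collector.reset()`). A toy illustration of that addition in plain NumPy, not Tianshou code:

```python
import numpy as np

# Toy illustration of `self._act += self._action_noise(self._act.shape)`:
# noise with the batched action's shape is added element-wise before the
# actions are sent to the vectorized environments.
rng = np.random.default_rng(0)
sigma = 0.1

act = np.zeros((8, 2))                         # deterministic actions: 8 envs x 2 dims
act = act + rng.normal(0.0, sigma, act.shape)  # same shape, now perturbed
print(act.shape, round(float(act.std()), 3))   # (8, 2) and roughly sigma
```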