diff --git a/tianshou/highlevel/params/policy_params.py b/tianshou/highlevel/params/policy_params.py
index 8d64537..68ac34f 100644
--- a/tianshou/highlevel/params/policy_params.py
+++ b/tianshou/highlevel/params/policy_params.py
@@ -256,9 +256,13 @@ class ParamsMixinLearningRateWithScheduler(GetParamTransformersProtocol):
 @dataclass
 class ParamsMixinActorAndCritic(GetParamTransformersProtocol):
     actor_lr: float = 1e-3
+    """the learning rate to use for the actor network"""
     critic_lr: float = 1e-3
+    """the learning rate to use for the critic network"""
     actor_lr_scheduler_factory: LRSchedulerFactory | None = None
+    """factory for the creation of a learning rate scheduler to use for the actor network (if any)"""
     critic_lr_scheduler_factory: LRSchedulerFactory | None = None
+    """factory for the creation of a learning rate scheduler to use for the critic network (if any)"""
 
     def _get_param_transformers(self) -> list[ParamTransformer]:
         return [
@@ -272,17 +276,60 @@ class ParamsMixinActorAndCritic(GetParamTransformersProtocol):
 
 
 @dataclass
-class PGParams(Params, ParamsMixinLearningRateWithScheduler):
-    discount_factor: float = 0.99
-    reward_normalization: bool = False
-    deterministic_eval: bool = False
+class ParamsMixinActionScaling(GetParamTransformersProtocol):
     action_scaling: bool | Literal["default"] = "default"
     """whether to apply action scaling; when set to "default", it will be enabled for continuous action spaces"""
     action_bound_method: Literal["clip", "tanh"] | None = "clip"
+    """
+    method to bound action to range [-1, 1]. Only used if the action_space is continuous.
+    """
+
+    def _get_param_transformers(self) -> list[ParamTransformer]:
+        return []
+
+
+@dataclass
+class ParamsMixinExplorationNoise(GetParamTransformersProtocol):
+    exploration_noise: BaseNoise | Literal["default"] | NoiseFactory | None = None
+    """
+    If not None, add noise to actions for exploration.
+    This is useful when solving "hard exploration" problems.
+    It can either be a distribution, a factory for the creation of a distribution or "default".
+    When set to "default", use Gaussian noise with standard deviation 0.1.
+    """
+
+    def _get_param_transformers(self) -> list[ParamTransformer]:
+        return [ParamTransformerNoiseFactory("exploration_noise")]
+
+
+@dataclass
+class PGParams(Params, ParamsMixinActionScaling, ParamsMixinLearningRateWithScheduler):
+    discount_factor: float = 0.99
+    """
+    discount factor (gamma) for future rewards; must be in [0, 1]
+    """
+    reward_normalization: bool = False
+    """
+    if True, will normalize the returns by subtracting the running mean and dividing by the running
+    standard deviation.
+    """
+    deterministic_eval: bool = False
+    """
+    whether to use deterministic action (the dist's mode) instead of stochastic one during evaluation.
+    Does not affect training.
+    """
     dist_fn: TDistributionFunction | DistributionFunctionFactory | Literal["default"] = "default"
+    """
+    This can either be a function which maps the model output to a torch distribution or a
+    factory for the creation of such a function.
+    When set to "default", a factory will be used which creates Gaussian distributions from mean
+    and standard deviation for the continuous case and categorical distributions for the
+    discrete case (see :class:`DistributionFunctionFactoryDefault`).
+    """
 
     def _get_param_transformers(self) -> list[ParamTransformer]:
         transformers = super()._get_param_transformers()
+        transformers.extend(ParamsMixinActionScaling._get_param_transformers(self))
         transformers.extend(ParamsMixinLearningRateWithScheduler._get_param_transformers(self))
         transformers.append(ParamTransformerActionScaling("action_scaling"))
         transformers.append(ParamTransformerDistributionFunction("dist_fn"))
@@ -290,47 +337,125 @@ class PGParams(Params, ParamsMixinLearningRateWithScheduler):
 
 
 @dataclass
-class A2CParams(PGParams):
-    vf_coef: float = 0.5
-    ent_coef: float = 0.01
-    max_grad_norm: float | None = None
+class ParamsMixinGeneralAdvantageEstimation(GetParamTransformersProtocol):
     gae_lambda: float = 0.95
+    """
+    determines the blend between Monte Carlo and one-step temporal difference (TD) estimates of the advantage
+    function in general advantage estimation (GAE).
+    A value of 0 gives a fully TD-based estimate; lambda=1 gives a fully Monte Carlo estimate.
+    """
     max_batchsize: int = 256
+    """the maximum size of the batch when computing general advantage estimation (GAE)"""
+
+    def _get_param_transformers(self) -> list[ParamTransformer]:
+        return []
+
+
+@dataclass
+class A2CParams(PGParams, ParamsMixinGeneralAdvantageEstimation):
+    vf_coef: float = 0.5
+    """weight (coefficient) of the value loss in the loss function"""
+    ent_coef: float = 0.01
+    """weight (coefficient) of the entropy loss in the loss function"""
+    max_grad_norm: float | None = None
+    """maximum norm for clipping gradients in backpropagation"""
+
+    def _get_param_transformers(self) -> list[ParamTransformer]:
+        transformers = super()._get_param_transformers()
+        transformers.extend(ParamsMixinGeneralAdvantageEstimation._get_param_transformers(self))
+        return transformers
 
 
 @dataclass
 class PPOParams(A2CParams):
     eps_clip: float = 0.2
+    """
+    determines the range of allowed change in the policy during a policy update:
+    The ratio between the probabilities indicated by the new and old policy is
+    constrained to stay in the interval [1 - eps_clip, 1 + eps_clip].
+    Small values thus force the new policy to stay close to the old policy.
+    Typical values range between 0.1 and 0.3.
+    The optimal epsilon depends on the environment; more stochastic environments may need larger epsilons.
+    """
     dual_clip: float | None = None
+    """
+    determines the lower bound clipping for the probability ratio
+    (corresponds to parameter c in arXiv:1912.09729, Equation 5).
+    If set to None, dual clipping is not used and only the bounds described for eps_clip apply.
+    If set to a float value c, the objective for negative advantages is additionally bounded
+    from below by c times the advantage, which limits how strongly a single transition can be
+    penalized when the probability ratio becomes large; this requires c > 1.
+    Values closer to 1 constrain updates more tightly, while very large values effectively
+    disable the dual clipping mechanism.
+    """
     value_clip: bool = False
+    """
+    whether to apply clipping of the predicted value function during policy learning.
+    Value clipping discourages large changes in value predictions between updates.
+    Inaccurate value predictions can lead to bad policy updates, which can cause training instability.
+    Clipping values prevents sporadic large errors from skewing policy updates too much.
+ """ advantage_normalization: bool = True + """whether to apply per mini-batch advantage normalization.""" recompute_advantage: bool = False + """ + whether to recompute advantage every update repeat as described in + https://arxiv.org/pdf/2006.05990.pdf, Sec. 3.5. + The original PPO implementation splits the data in each policy iteration + step into individual transitions and then randomly assigns them to minibatches. + This makes it impossible to compute advantages as the temporal structure is broken. + Therefore, the advantages are computed once at the beginning of each policy iteration step and + then used in minibatch policy and value function optimization. + This results in higher diversity of data in each minibatch at the cost of + using slightly stale advantage estimations. + Enabling this option will, as a remedy to this problem, recompute the advantages at the beginning + of each pass over the data instead of just once per iteration. + """ @dataclass -class NPGParams(PGParams): +class NPGParams(PGParams, ParamsMixinGeneralAdvantageEstimation): optim_critic_iters: int = 5 + """number of times to optimize critic network per update.""" actor_step_size: float = 0.5 + """step size for actor update in natural gradient direction""" advantage_normalization: bool = True - gae_lambda: float = 0.95 - max_batchsize: int = 256 + """whether to do per mini-batch advantage normalization.""" + + def _get_param_transformers(self) -> list[ParamTransformer]: + transformers = super()._get_param_transformers() + transformers.extend(ParamsMixinGeneralAdvantageEstimation._get_param_transformers(self)) + return transformers @dataclass class TRPOParams(NPGParams): max_kl: float = 0.01 + """ + maximum KL divergence, used to constrain each actor network update. + """ backtrack_coeff: float = 0.8 + """ + coefficient with which to reduce the step size when constraints are not met. + """ max_backtracks: int = 10 + """maximum number of times to backtrack in line search when the constraints are not met.""" @dataclass class ParamsMixinActorAndDualCritics(GetParamTransformersProtocol): actor_lr: float = 1e-3 + """the learning rate to use for the actor network""" critic1_lr: float = 1e-3 + """the learning rate to use for the first critic network""" critic2_lr: float = 1e-3 + """the learning rate to use for the second critic network""" actor_lr_scheduler_factory: LRSchedulerFactory | None = None + """factory for the creation of a learning rate scheduler to use for the actor network (if any)""" critic1_lr_scheduler_factory: LRSchedulerFactory | None = None + """factory for the creation of a learning rate scheduler to use for the first critic network (if any)""" critic2_lr_scheduler_factory: LRSchedulerFactory | None = None + """factory for the creation of a learning rate scheduler to use for the second critic network (if any)""" def _get_param_transformers(self) -> list[ParamTransformer]: return [ @@ -345,46 +470,69 @@ class ParamsMixinActorAndDualCritics(GetParamTransformersProtocol): @dataclass -class SACParams(Params, ParamsMixinActorAndDualCritics): +class _SACParams(Params, ParamsMixinActorAndDualCritics): tau: float = 0.005 + """controls the contribution of the entropy term in the overall optimization objective, + i.e. the desired amount of randomness in the optimal policy. + Higher values mean greater target entropy and therefore more randomness in the policy. + Lower values mean lower target entropy and therefore a more deterministic policy. 
+ """ gamma: float = 0.99 - alpha: float | tuple[float, torch.Tensor, torch.optim.Optimizer] | AutoAlphaFactory = 0.2 + """discount factor (gamma) for future rewards; must be in [0, 1]""" + alpha: float | AutoAlphaFactory = 0.2 + """ + controls the relative importance (coefficient) of the entropy term in the loss function. + This can be a constant or a factory for the creation of a representation that allows the + parameter to be automatically tuned; + use :class:`tianshou.highlevel.params.alpha.AutoAlphaFactoryDefault` for the standard + auto-adjusted alpha. + """ estimation_step: int = 1 - exploration_noise: BaseNoise | Literal["default"] | NoiseFactory | None = None - deterministic_eval: bool = True - action_scaling: bool = True - action_bound_method: Literal["clip"] | None = "clip" + """the number of steps to look ahead""" def _get_param_transformers(self) -> list[ParamTransformer]: transformers = super()._get_param_transformers() transformers.extend(ParamsMixinActorAndDualCritics._get_param_transformers(self)) transformers.append(ParamTransformerAutoAlpha("alpha")) - transformers.append(ParamTransformerNoiseFactory("exploration_noise")) return transformers @dataclass -class DiscreteSACParams(Params, ParamsMixinActorAndDualCritics): - tau: float = 0.005 - gamma: float = 0.99 - alpha: float | tuple[float, torch.Tensor, torch.optim.Optimizer] | AutoAlphaFactory = 0.2 - estimation_step: int = 1 +class SACParams(_SACParams, ParamsMixinExplorationNoise, ParamsMixinActionScaling): + deterministic_eval: bool = True + """ + whether to use deterministic action (mean of Gaussian policy) in evaluation mode instead of stochastic + action sampled by the policy. Does not affect training.""" def _get_param_transformers(self) -> list[ParamTransformer]: transformers = super()._get_param_transformers() - transformers.extend(ParamsMixinActorAndDualCritics._get_param_transformers(self)) - transformers.append(ParamTransformerAutoAlpha("alpha")) + transformers.extend(ParamsMixinExplorationNoise._get_param_transformers(self)) + transformers.extend(ParamsMixinActionScaling._get_param_transformers(self)) return transformers +@dataclass +class DiscreteSACParams(_SACParams): + pass + + @dataclass class DQNParams(Params, ParamsMixinLearningRateWithScheduler): discount_factor: float = 0.99 + """ + discount factor (gamma) for future rewards; must be in [0, 1] + """ estimation_step: int = 1 + """the number of steps to look ahead""" target_update_freq: int = 0 + """the target network update frequency (0 if no target network is to be used)""" reward_normalization: bool = False + """whether to normalize the returns to Normal(0, 1)""" is_double: bool = True + """whether to use double Q learning""" clip_loss_grad: bool = False + """whether to clip the gradient of the loss in accordance with nature14236; this amounts to using the Huber + loss instead of the MSE loss.""" def _get_param_transformers(self) -> list[ParamTransformer]: transformers = super()._get_param_transformers() @@ -395,9 +543,13 @@ class DQNParams(Params, ParamsMixinLearningRateWithScheduler): @dataclass class IQNParams(DQNParams): sample_size: int = 32 + """the number of samples for policy evaluation""" online_sample_size: int = 8 + """the number of samples for online model in training""" target_sample_size: int = 8 + """the number of samples for target model in training.""" num_quantiles: int = 200 + """the number of quantile midpoints in the inverse cumulative distribution function of the value""" hidden_sizes: Sequence[int] = () """hidden dimensions 
     num_cosines: int = 64
@@ -410,29 +562,54 @@ class IQNParams(DQNParams):
 
 
 @dataclass
-class DDPGParams(Params, ParamsMixinActorAndCritic):
+class DDPGParams(
+    Params,
+    ParamsMixinActorAndCritic,
+    ParamsMixinExplorationNoise,
+    ParamsMixinActionScaling,
+):
     tau: float = 0.005
+    """
+    controls the soft update of the target network.
+    It determines how slowly the target networks track the main networks.
+    Smaller tau means slower tracking and more stable learning.
+    """
     gamma: float = 0.99
-    exploration_noise: BaseNoise | Literal["default"] | NoiseFactory | None = "default"
+    """discount factor (gamma) for future rewards; must be in [0, 1]"""
     estimation_step: int = 1
-    action_scaling: bool = True
-    action_bound_method: Literal["clip"] | None = "clip"
+    """the number of steps to look ahead."""
 
     def _get_param_transformers(self) -> list[ParamTransformer]:
         transformers = super()._get_param_transformers()
         transformers.extend(ParamsMixinActorAndCritic._get_param_transformers(self))
-        transformers.append(ParamTransformerNoiseFactory("exploration_noise"))
+        transformers.extend(ParamsMixinExplorationNoise._get_param_transformers(self))
+        transformers.extend(ParamsMixinActionScaling._get_param_transformers(self))
         return transformers
 
 
 @dataclass
 class REDQParams(DDPGParams):
     ensemble_size: int = 10
+    """the number of sub-networks in the critic ensemble"""
     subset_size: int = 2
-    alpha: float | tuple[float, torch.Tensor, torch.optim.Optimizer] | AutoAlphaFactory = 0.2
+    """the number of networks in the subset"""
+    alpha: float | AutoAlphaFactory = 0.2
+    """
+    controls the relative importance (coefficient) of the entropy term in the loss function.
+    This can be a constant or a factory for the creation of a representation that allows the
+    parameter to be automatically tuned;
+    use :class:`tianshou.highlevel.params.alpha.AutoAlphaFactoryDefault` for the standard
+    auto-adjusted alpha.
+    """
     estimation_step: int = 1
+    """the number of steps to look ahead"""
     actor_delay: int = 20
+    """the number of critic updates before an actor update"""
     deterministic_eval: bool = True
+    """
+    whether to use deterministic action (the dist's mode) instead of stochastic one during evaluation.
+    Does not affect training.
+    """
     target_mode: Literal["mean", "min"] = "min"
 
     def _get_param_transformers(self) -> list[ParamTransformer]:
@@ -442,21 +619,34 @@ class REDQParams(DDPGParams):
 
 
 @dataclass
-class TD3Params(Params, ParamsMixinActorAndDualCritics):
+class TD3Params(
+    Params,
+    ParamsMixinActorAndDualCritics,
+    ParamsMixinExplorationNoise,
+    ParamsMixinActionScaling,
+):
     tau: float = 0.005
+    """
+    controls the soft update of the target network.
+    It determines how slowly the target networks track the main networks.
+    Smaller tau means slower tracking and more stable learning.
+ """ gamma: float = 0.99 - exploration_noise: BaseNoise | Literal["default"] | NoiseFactory | None = "default" + """discount factor (gamma) for future rewards; must be in [0, 1]""" policy_noise: float | FloatEnvValueFactory = 0.2 + """the scale of the the noise used in updating policy network""" noise_clip: float | FloatEnvValueFactory = 0.5 + """determines the clipping range of the noise used in updating the policy network as [-noise_clip, noise_clip]""" update_actor_freq: int = 2 + """the update frequency of actor network""" estimation_step: int = 1 - action_scaling: bool = True - action_bound_method: Literal["clip"] | None = "clip" + """the number of steps to look ahead.""" def _get_param_transformers(self) -> list[ParamTransformer]: transformers = super()._get_param_transformers() transformers.extend(ParamsMixinActorAndDualCritics._get_param_transformers(self)) - transformers.append(ParamTransformerNoiseFactory("exploration_noise")) + transformers.extend(ParamsMixinExplorationNoise._get_param_transformers(self)) + transformers.extend(ParamsMixinActionScaling._get_param_transformers(self)) transformers.append(ParamTransformerFloatEnvParamFactory("policy_noise")) transformers.append(ParamTransformerFloatEnvParamFactory("noise_clip")) return transformers