From 3beae186da11c68865f956f1feda95eff657f8f1 Mon Sep 17 00:00:00 2001
From: lucidrains
Date: Thu, 30 Oct 2025 08:46:03 -0700
Subject: [PATCH] some more control over whether to normalize advantages

---
 dreamer4/dreamer4.py | 13 +++++++++----
 pyproject.toml       |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/dreamer4/dreamer4.py b/dreamer4/dreamer4.py
index dbfa7c4..6a3a8ee 100644
--- a/dreamer4/dreamer4.py
+++ b/dreamer4/dreamer4.py
@@ -1902,6 +1902,7 @@ class DynamicsWorldModel(Module):
         pmpo_pos_to_neg_weight = 0.5, # pos and neg equal weight
         pmpo_reverse_kl = True,
         pmpo_kl_div_loss_weight = .3,
+        normalize_advantages = None,
         value_clip = 0.4,
         policy_entropy_weight = .01,
         gae_use_accelerated = False
@@ -2425,6 +2426,7 @@ class DynamicsWorldModel(Module):
         value_optim: Optimizer | None = None,
         only_learn_policy_value_heads = True, # in the paper, they do not finetune the entire dynamics model, they just learn the heads
         use_pmpo = True,
+        normalize_advantages = None,
         eps = 1e-6
     ):

@@ -2507,16 +2509,19 @@ class DynamicsWorldModel(Module):
         else:
             advantage = returns - old_values

-        # apparently they just use the sign of the advantage
+        # if using pmpo, do not normalize advantages, but can be overridden
+
+        normalize_advantages = default(normalize_advantages, not use_pmpo)
+
+        if normalize_advantages:
+            advantage = F.layer_norm(advantage, advantage.shape, eps = eps)
+
         # https://arxiv.org/abs/2410.04166v1

         if use_pmpo:
             pos_advantage_mask = advantage >= 0.
             neg_advantage_mask = ~pos_advantage_mask

-        else:
-            advantage = F.layer_norm(advantage, advantage.shape, eps = eps)
-
         # replay for the action logits and values
         # but only do so if fine tuning the entire world model for RL

diff --git a/pyproject.toml b/pyproject.toml
index fe1d80f..3e7354b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dreamer4"
-version = "0.0.101"
+version = "0.0.102"
 description = "Dreamer 4"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
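
As a quick illustration of the behavior this patch introduces, below is a minimal standalone sketch. The helper maybe_normalize_advantages and the toy tensors are hypothetical, not part of the repository; only the default/override logic mirrors the added lines above (PMPO consumes just the sign of the advantage, so normalization is skipped by default when use_pmpo is True, but the caller may override it either way).

    import torch
    import torch.nn.functional as F

    def default(val, d):
        # fallback helper mirroring the repo's `default` utility
        return val if val is not None else d

    def maybe_normalize_advantages(advantage, use_pmpo = True, normalize_advantages = None, eps = 1e-6):
        # hypothetical standalone version of the patched logic: skip normalization
        # by default under PMPO, unless normalize_advantages is passed explicitly
        normalize_advantages = default(normalize_advantages, not use_pmpo)

        if normalize_advantages:
            # layer-norm over the full tensor shape, as in the patch
            advantage = F.layer_norm(advantage, advantage.shape, eps = eps)

        return advantage

    adv = torch.randn(4, 16)

    # with PMPO (the default) advantages pass through unchanged
    assert torch.equal(maybe_normalize_advantages(adv), adv)

    # overriding the default forces normalization even under PMPO
    normed = maybe_normalize_advantages(adv, normalize_advantages = True)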