correct a misunderstanding: past actions get their own separate action token, while the agent token is used for predicting the next action, rewards, and values

lucidrains 2025-10-12 16:16:18 -07:00
parent 9c78962736
commit 6dbdc3d7d8
2 changed files with 29 additions and 21 deletions
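
In short: before this commit, the past-action embedding was summed into the agent tokens; now past actions are packed in as their own token, freeing the agent token to carry the next-action / reward / value predictions. A minimal before-and-after sketch, with hypothetical shapes rather than the repo's actual API:

import torch
from einops import pack

b, t, d = 2, 4, 8                          # batch, time, dim
agent_tokens = torch.randn(b, t, 1, d)     # one agent token per time step
action_embed = torch.randn(b, t, d)        # embedded past action per time step

# before: the action embed was summed directly onto the agent tokens
agent_with_action = agent_tokens + action_embed[:, :, None, :]

# after: past actions become their own token, packed alongside the agent token
action_tokens = action_embed[:, :, None, :]
tokens, ps = pack([action_tokens, agent_tokens], 'b t * d')
assert tokens.shape == (b, t, 2, d)        # action and agent now attend as separate tokens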


@@ -87,6 +87,9 @@ def is_power_two(num):

 # tensor helpers

+def is_empty(t):
+    return t.numel() == 0
+
 def log(t, eps = 1e-20):
     return t.clamp(min = eps).log()
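
The new is_empty helper simply checks for zero elements; a quick illustration with made-up tensors:

import torch

def is_empty(t):
    return t.numel() == 0

assert is_empty(torch.empty(2, 4, 0, 8))   # any zero-sized axis means zero elements
assert not is_empty(torch.zeros(1))        # a zero value is still an element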
@@ -1944,20 +1947,6 @@ class DynamicsWorldModel(Module):
         agent_tokens = repeat(agent_tokens, 'b ... d -> b t ... d', t = time)

-        # maybe add the action embed to the agent tokens per time step
-
-        if exists(discrete_actions) or exists(continuous_actions):
-            assert self.action_embedder.has_actions
-
-            action_embed = self.action_embedder(
-                discrete_actions = discrete_actions,
-                discrete_action_types = discrete_action_types,
-                continuous_actions = continuous_actions,
-                continuous_action_types = continuous_action_types
-            )
-
-            agent_tokens = einx.add('b t ... d, b t d', agent_tokens, action_embed)
-
         # maybe add a reward embedding to agent tokens

         if exists(rewards):
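
The reward path keeps the same einx.add pattern, which broadcasts a per-timestep embedding over the elided token axes; a minimal sketch, assuming hypothetical shapes:

import torch
import einx

agent_tokens = torch.randn(2, 4, 3, 8)   # (batch, time, agents, dim)
reward_embeds = torch.randn(2, 4, 8)     # (batch, time, dim)

# 'b t d' is aligned against 'b t ... d', broadcasting across the elided agent axis
out = einx.add('b t ... d, b t d', agent_tokens, reward_embeds)
assert out.shape == agent_tokens.shape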
@@ -1975,9 +1964,23 @@ class DynamicsWorldModel(Module):
             agent_tokens = einx.add('b t ... d, b t d', agent_tokens, reward_embeds)

+        # maybe create the action tokens
+
+        if exists(discrete_actions) or exists(continuous_actions):
+            assert self.action_embedder.has_actions
+
+            action_tokens = self.action_embedder(
+                discrete_actions = discrete_actions,
+                discrete_action_types = discrete_action_types,
+                continuous_actions = continuous_actions,
+                continuous_action_types = continuous_action_types
+            )
+        else:
+            action_tokens = agent_tokens[:, :, 0:0] # else empty, sliced off the agent tokens
+
         # main function, needs to be defined as such for shortcut training - additional calls for consistency loss

-        def get_prediction(noised_latents, signal_levels, step_sizes_log2, agent_tokens, return_agent_tokens = False):
+        def get_prediction(noised_latents, signal_levels, step_sizes_log2, action_tokens, agent_tokens, return_agent_tokens = False):

             # latents to spatial tokens

             space_tokens = self.latents_to_spatial_tokens(noised_latents)
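
When no actions are passed in, slicing 0:0 off the agent tokens yields an empty placeholder with matching dtype and device, so downstream code can treat both cases uniformly; for example:

import torch

agent_tokens = torch.randn(2, 4, 1, 8)
action_tokens = agent_tokens[:, :, 0:0]   # zero-width slice -> shape (2, 4, 0, 8)

assert action_tokens.shape == (2, 4, 0, 8)
assert action_tokens.numel() == 0         # so is_empty(action_tokens) is True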
@@ -1986,6 +1989,10 @@ class DynamicsWorldModel(Module):
             num_spatial_tokens = space_tokens.shape[-2]

+            # action tokens
+
+            num_action_tokens = 1 if not is_empty(action_tokens) else 0
+
             # pack to tokens
             # [signal + step size embed] [latent space tokens] [register] [actions / agent]
@@ -2003,7 +2010,7 @@ class DynamicsWorldModel(Module):
             # pack to tokens for attending

-            tokens, packed_tokens_shape = pack([flow_token, space_tokens, registers, agent_tokens], 'b t * d')
+            tokens, packed_tokens_shape = pack([flow_token, space_tokens, registers, action_tokens, agent_tokens], 'b t * d')
# attend functions for space and time
@ -2015,6 +2022,7 @@ class DynamicsWorldModel(Module):
space_seq_len = (
+ 1 # signal + step
+ num_action_tokens # past action tokens - todo: account for multi-agent
+ self.num_agents # action / agent tokens
+ self.num_register_tokens
+ num_spatial_tokens
@@ -2056,7 +2064,7 @@ class DynamicsWorldModel(Module):
             # unpack

-            flow_token, space_tokens, register_tokens, agent_tokens = unpack(tokens, packed_tokens_shape, 'b t * d')
+            flow_token, space_tokens, register_tokens, action_tokens, agent_tokens = unpack(tokens, packed_tokens_shape, 'b t * d')

             # pooling
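
pack/unpack with the 'b t * d' wildcard keeps the bookkeeping for groups of different widths, so the new action tokens slot in without manual index math; a toy round trip with hypothetical widths:

import torch
from einops import pack, unpack

b, t, d = 2, 4, 8
flow_token    = torch.randn(b, t, 1, d)
space_tokens  = torch.randn(b, t, 16, d)
registers     = torch.randn(b, t, 4, d)
action_tokens = torch.randn(b, t, 1, d)
agent_tokens  = torch.randn(b, t, 1, d)

tokens, ps = pack([flow_token, space_tokens, registers, action_tokens, agent_tokens], 'b t * d')
assert tokens.shape == (b, t, 23, d)

# ps records each group's width, so unpack restores the exact split after attention
flow_token, space_tokens, registers, action_tokens, agent_tokens = unpack(tokens, ps, 'b t * d')
assert action_tokens.shape == (b, t, 1, d)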
@@ -2071,7 +2079,7 @@ class DynamicsWorldModel(Module):
         # forward the network

-        pred, encoded_agent_tokens = get_prediction(noised_latents, signal_levels, step_sizes_log2, agent_tokens, return_agent_tokens = True)
+        pred, encoded_agent_tokens = get_prediction(noised_latents, signal_levels, step_sizes_log2, action_tokens, agent_tokens, return_agent_tokens = True)

         if return_pred_only:
             if not return_agent_tokens:
@@ -2108,7 +2116,7 @@ class DynamicsWorldModel(Module):
             step_sizes_log2_minus_one = step_sizes_log2 - 1 # which equals d / 2
             half_step_size = 2 ** step_sizes_log2_minus_one

-            first_step_pred = get_prediction_no_grad(noised_latents, signal_levels, step_sizes_log2_minus_one, agent_tokens)
+            first_step_pred = get_prediction_no_grad(noised_latents, signal_levels, step_sizes_log2_minus_one, action_tokens, agent_tokens)

             # first derive b'
@@ -2127,7 +2135,7 @@ class DynamicsWorldModel(Module):
             # get second prediction for b''

             signal_levels_plus_half_step = signal_levels + half_step_size[:, None]

-            second_step_pred = get_prediction_no_grad(denoised_latent, signal_levels_plus_half_step, step_sizes_log2_minus_one, agent_tokens)
+            second_step_pred = get_prediction_no_grad(denoised_latent, signal_levels_plus_half_step, step_sizes_log2_minus_one, action_tokens, agent_tokens)

             if is_v_space_pred:
                 second_step_pred_flow = second_step_pred
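
For the shortcut consistency loss, the model is queried twice at half the step size; since step sizes are stored as log2, halving is just subtracting one:

import torch

step_sizes_log2 = torch.tensor([0., -1., -2.])      # step sizes 1, 1/2, 1/4
step_sizes_log2_minus_one = step_sizes_log2 - 1     # log2(d) - 1 == log2(d / 2)
half_step_size = 2 ** step_sizes_log2_minus_one

assert torch.equal(half_step_size, torch.tensor([0.5, 0.25, 0.125]))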

pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dreamer4"
-version = "0.0.17"
+version = "0.0.18"
 description = "Dreamer 4"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }