they employ two stability measures, qk rmsnorm and softclamping of attention logits

lucidrains 2025-10-01 09:40:24 -07:00
parent e8678364ba
commit 2e92c0121a
2 changed files with 62 additions and 2 deletions
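For context, a minimal standalone sketch of the softclamping half of this change, reusing the same tanh-based softclamp and its 50. default that appear in the diff below; the tensor shapes and random logits are illustrative only. The clamp smoothly bounds the attention logits to roughly (-value, value) before the softmax, so occasional outlier logits cannot blow up during training.

import torch

def softclamp(t, value = 50.):
    # smooth, differentiable clamp: near identity for |t| << value, saturating at +/- value
    return (t / value).tanh() * value

sim = torch.randn(1, 8, 16, 16) * 100.   # (batch, heads, query len, key len) logits with outliers
sim = softclamp(sim)                      # now bounded within (-50, 50)
attn = sim.softmax(dim = -1)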

View File

@@ -5,11 +5,12 @@ from functools import partial
 import torch
 import torch.nn.functional as F
-from torch.nn import Module, ModuleList, Sequential, Linear, RMSNorm, Identity
+from torch.nn import Module, ModuleList, Parameter, Sequential, Linear, RMSNorm, Identity
 from torch import cat, stack, tensor, Tensor, is_tensor
 # ein related
+import einx
 from einops import einsum, rearrange, repeat, reduce
 from einops.layers.torch import Rearrange
@@ -25,6 +26,32 @@ def exists(v):
 def default(v, d):
     return v if exists(v) else d
+def l2norm(t):
+    return F.normalize(t, dim = -1, p = 2)
+def softclamp(t, value = 50.):
+    return (t / value).tanh() * value
+# multi-head rmsnorm
+class MultiHeadRMSNorm(Module):
+    def __init__(
+        self,
+        dim_head,
+        heads = 8
+    ):
+        super().__init__()
+        self.scale = dim_head ** 0.5
+        self.gamma = Parameter(torch.zeros(heads, dim_head)) # weight decay friendly
+    def forward(
+        self,
+        x
+    ):
+        normed = l2norm(x)
+        scale = (self.gamma + 1.) * self.scale
+        return einx.multiply('... h n d, h d', normed, scale)
 # attention
 class Attention(Module):
@@ -33,6 +60,7 @@ class Attention(Module):
         dim,
         dim_head = 64,
         heads = 8,
+        softclamp_value = 50.,
         pre_rmsnorm = True
     ):
         super().__init__()
@@ -47,6 +75,13 @@
         self.to_kv = LinearNoBias(dim, dim_inner * 2)
         self.to_out = LinearNoBias(dim_inner, dim)
+        # stability related
+        self.q_heads_rmsnorm = MultiHeadRMSNorm(dim_head, heads = heads)
+        self.k_heads_rmsnorm = MultiHeadRMSNorm(dim_head, heads = heads)
+        self.softclamp_value = softclamp_value
     def forward(
         self,
         tokens,
@@ -57,23 +92,47 @@
         q, k, v = (self.to_q(tokens), *self.to_kv(tokens).chunk(2, dim = -1))
         # split heads
         q, k, v = map(self.split_heads, (q, k, v))
+        # qk rmsnorm
+        q = self.q_heads_rmsnorm(q)
+        k = self.k_heads_rmsnorm(k)
         # caching
         if exists(kv_cache):
             ck, cv = kv_cache
             k = cat((ck, k), dim = -2)
             v = cat((cv, v), dim = -2)
-        q = q * self.scale
         # similarity
         sim = einsum(q, k, 'b h i d, b h j d -> b h i j')
+        # softclamping a la gemma 3
+        if exists(self.softclamp_value):
+            sim = softclamp(sim, self.softclamp_value)
+        # scale and attention
+        sim = sim * self.scale
         attn = sim.softmax(dim = -1)
         # aggregate
         out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
         # merge heads
         out = self.merge_heads(out)
         # combine heads
         out = self.to_out(out)
         if not return_kv_cache:
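For reference, a self-contained sketch of the qk rmsnorm half of the change, condensed from the MultiHeadRMSNorm module in the diff above. The zero-initialized gamma plus the (gamma + 1.) shift at use time means weight decay pulls the effective per-head gain toward 1 rather than 0, and the sqrt(dim_head) factor restores the magnitude that l2 normalization removes. The einx.multiply call is swapped here for a plain broadcasted multiply so the snippet only needs torch; shapes are illustrative.

import torch
import torch.nn.functional as F
from torch.nn import Module, Parameter

class MultiHeadRMSNorm(Module):
    def __init__(self, dim_head, heads = 8):
        super().__init__()
        self.scale = dim_head ** 0.5
        self.gamma = Parameter(torch.zeros(heads, dim_head))  # weight decay friendly

    def forward(self, x):
        normed = F.normalize(x, dim = -1, p = 2)       # l2 norm over each head's feature dim
        scale = (self.gamma + 1.) * self.scale         # learned gain, centered at sqrt(dim_head)
        return normed * scale.unsqueeze(1)             # broadcast (h, d) over (b, h, n, d)

# applied to queries and keys right after splitting heads, before the similarity logits

q = torch.randn(2, 8, 16, 64)    # (batch, heads, seq, dim_head)
k = torch.randn(2, 8, 16, 64)

q_norm = MultiHeadRMSNorm(64)
k_norm = MultiHeadRMSNorm(64)

q, k = q_norm(q), k_norm(k)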

View File

@@ -26,6 +26,7 @@ classifiers=[
 ]
 dependencies = [
+    "einx>=0.3.0",
     "einops>=0.8.1",
     "torch>=2.4"
 ]