allow for evolutionary policy optimization
parent 46f86cd247
commit c0450359f3
dreamer4/dreamer4.py

@@ -1942,6 +1942,7 @@ class DynamicsWorldModel(Module):

        # learned set of latent genes

        self.agent_has_genes = num_latent_genes > 0
        self.num_latent_genes = num_latent_genes

        self.latent_genes = Parameter(randn(num_latent_genes, dim) * 1e-2)

        # policy head
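The hunk above only adds the learned gene table; how a gene vector actually reaches the policy is outside this diff. As a rough illustration only (not the repo's code), one common pattern is to assign each rollout a gene index and add the looked-up vector to whatever embedding the policy head consumes:

import torch
from torch import nn

num_latent_genes, dim = 16, 512

# learned gene table, initialized small, mirroring the Parameter added above
latent_genes = nn.Parameter(torch.randn(num_latent_genes, dim) * 1e-2)

# hypothetical usage: one gene per rollout, added to the policy conditioning
gene_ids = torch.randint(0, num_latent_genes, (4,))    # 4 rollouts
policy_embed = torch.randn(4, dim)                     # stand-in for the real policy input
conditioned = policy_embed + latent_genes[gene_ids]    # assumption: additive conditioning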
@@ -2095,6 +2096,53 @@ class DynamicsWorldModel(Module):

        return align_dims_left(times, align_dims_left_to)

    # evolutionary policy optimization - https://web3.arxiv.org/abs/2503.19037

    @torch.no_grad()
    def evolve_(
        self,
        fitness,
        select_frac = 0.5,
        tournament_frac = 0.5
    ):
        assert fitness.numel() == self.num_latent_genes

        pop = self.latent_genes

        pop_size = self.num_latent_genes
        num_selected = ceil(pop_size * select_frac)
        num_children = pop_size - num_selected

        dim_gene = pop.shape[-1]

        # natural selection is just a sort and slice

        selected_fitness, selected_indices = fitness.topk(num_selected, dim = -1)
        selected = pop[selected_indices]

        # tournament selection - one tournament per child, drawn from the survivors

        tournament_size = max(2, ceil(num_selected * tournament_frac))

        tournaments = torch.randn((num_children, num_selected), device = self.device).argsort(dim = -1)[:, :tournament_size]

        # top 2 winners of each tournament become the parents
        # topk returns positions within the tournament, so map them back to indices into `selected`

        winner_positions = selected_fitness[tournaments].topk(2, dim = -1).indices
        parent_ids = tournaments.gather(-1, winner_positions)

        parents = selected[parent_ids]

        # crossover by random interpolation from parent1 to parent2

        random_uniform_mix = torch.randn((num_children, dim_gene), device = self.device).sigmoid()

        parent1, parent2 = parents.unbind(dim = 1)
        children = parent1.lerp(parent2, random_uniform_mix)

        # store next population

        next_pop = cat((selected, children))

        self.latent_genes.copy_(next_pop)

    # interacting with env for experience

    @torch.no_grad()
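In words, evolve_ performs truncation selection (keep the top select_frac genes by fitness), then refills the population with children: each child gets its own tournament of randomly chosen survivors, the two fittest participants become its parents, and crossover is an element-wise random interpolation between them. The standalone sketch below (not the repo's code) replays the same steps on a toy population so the shapes are easy to follow; it uses torch.rand for the mixing coefficients instead of sigmoid of a Gaussian.

import torch
from math import ceil

def evolve(pop, fitness, select_frac = 0.5, tournament_frac = 0.5):
    pop_size, dim_gene = pop.shape
    num_selected = ceil(pop_size * select_frac)
    num_children = pop_size - num_selected

    # truncation selection: keep the fittest genes
    selected_fitness, selected_indices = fitness.topk(num_selected, dim = -1)
    selected = pop[selected_indices]

    # one tournament per child, drawn from the survivors
    tournament_size = max(2, ceil(num_selected * tournament_frac))
    tournaments = torch.randn(num_children, num_selected).argsort(dim = -1)[:, :tournament_size]

    # the two fittest participants of each tournament become the parents
    winner_positions = selected_fitness[tournaments].topk(2, dim = -1).indices
    parent_ids = tournaments.gather(-1, winner_positions)
    parent1, parent2 = selected[parent_ids].unbind(dim = 1)

    # crossover: element-wise random interpolation between the parents
    mix = torch.rand(num_children, dim_gene)
    children = parent1.lerp(parent2, mix)

    return torch.cat((selected, children))

pop = torch.randn(16, 8)
fitness = torch.randn(16)
next_pop = evolve(pop, fitness)
assert next_pop.shape == pop.shape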
pyproject.toml

@@ -1,6 +1,6 @@
[project]
name = "dreamer4"
- version = "0.0.88"
+ version = "0.0.89"
description = "Dreamer 4"
authors = [
    { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -753,3 +753,36 @@ def test_proprioception(

    assert exists(generations.proprio)
    assert generations.video.shape == video_shape

def test_epo():
    from dreamer4.dreamer4 import VideoTokenizer, DynamicsWorldModel

    tokenizer = VideoTokenizer(
        512,
        dim_latent = 32,
        patch_size = 32,
        encoder_depth = 2,
        decoder_depth = 2,
        time_block_every = 2,
        attn_heads = 8,
        image_height = 256,
        image_width = 256,
        attn_kwargs = dict(
            query_heads = 16
        )
    )

    dynamics = DynamicsWorldModel(
        512,
        num_agents = 1,
        video_tokenizer = tokenizer,
        dim_latent = 32,
        dim_proprio = 21,
        num_tasks = 4,
        num_latent_genes = 16,
        num_discrete_actions = 4,
        num_residual_streams = 1
    )

    fitness = torch.randn(16)
    dynamics.evolve_(fitness)
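Note that the test passes random noise as the fitness vector, so it only exercises the mechanics and the population shape. In an actual EPO run the per-gene fitness would presumably be an aggregate return collected from rollouts conditioned on that gene, with evolve_ called between training phases.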