fix latent / modality attention pattern in video tokenizer, thanks to another researcher

lucidrains 2025-10-06 09:43:16 -07:00
parent 25b8de91cc
commit 77724049e2


@@ -896,7 +896,7 @@ class VideoTokenizer(Module):
 # modality can only attend to itself while latents can attend to everything
 # similar to agent token in dynamics model
-encoder_attend_fn = get_attend_fn(use_flex, seq_len, seq_len, special_attend_only_itself = True)
+encoder_attend_fn = get_attend_fn(use_flex, seq_len, seq_len, special_attend_only_itself = False)
 # encoder
@@ -937,7 +937,7 @@ class VideoTokenizer(Module):
 # decoder attend
-decoder_attend_fn = get_attend_fn(use_flex, seq_len, seq_len)
+decoder_attend_fn = get_attend_fn(use_flex, seq_len, seq_len, special_attend_only_itself = True)
 # decoder attention
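
For context, the special_attend_only_itself flag toggles an attention pattern in which the special tokens (the modality tokens here) may only attend within their own block, while the remaining latent tokens attend to everything. Below is a minimal sketch of such a mask, assuming the special tokens occupy the first num_special positions of the sequence; make_special_only_mask and its arguments are illustrative and not the repo's actual get_attend_fn.

import torch
import torch.nn.functional as F

def make_special_only_mask(seq_len, num_special, special_attend_only_itself = True):
    # boolean mask: True means the query position may attend to the key position
    mask = torch.ones(seq_len, seq_len, dtype = torch.bool)

    if special_attend_only_itself:
        # special (modality) queries are restricted to the special block,
        # all other (latent) queries keep full attention
        mask[:num_special, num_special:] = False

    return mask

# usage: pass as attn_mask to scaled dot product attention
# q, k, v: (batch, heads, seq_len, dim_head)
mask = make_special_only_mask(seq_len = 6, num_special = 2)
# out = F.scaled_dot_product_attention(q, k, v, attn_mask = mask)

With use_flex enabled, the same pattern would presumably be expressed as a flex attention mask_mod over (q_idx, kv_idx) pairs rather than a dense boolean mask, but the allowed/blocked positions are the same.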