@@ -76,6 +76,22 @@ def __init__(self, decoder_config, running_config=None, with_cross_attn=False):
             running_config=running_config,
         )
 
+        self.image_generation = getattr(running_config, "image_generation", False)
+
+        if self.image_generation:
+            # initialize MOE GEN params
+            self.input_layernorm_moe_gen = LayerNorm[decoder_config.layer_norm](
+                decoder_config.hidden_size, eps=decoder_config.norm_eps
+            )
+            if decoder_config.post_attention_layernorm:
+                self.post_attention_layernorm_moe_gen = LayerNorm[decoder_config.layer_norm](
+                    decoder_config.hidden_size, eps=decoder_config.norm_eps
+                )
+            self.mlp_moe_gen = MLP(
+                decoder_config,
+                running_config=running_config,
+            )
+
     def _mlp(self, hidden_states):
         if self.ffn_layernorm:
             hidden_states = self.pre_feedforward_layernorm(hidden_states)
@@ -110,17 +126,34 @@ def forward(self, layer_in, **kwargs):
         return_attn = kwargs.pop("return_attn", False)
         position_embeddings = kwargs.pop("position_embeddings", None)
 
+        text_indices = kwargs.pop("text_indices", None)
+        image_indices = kwargs.pop("image_indices", None)
 
-        norm_layer_in = self.input_layernorm(layer_in)
+        if self.image_generation:
+            assert text_indices is not None, "Text indices must be provided for image generation"
+            assert image_indices is not None, "Image indices must be provided for image generation"
+            norm_layer_in = torch.zeros_like(layer_in, dtype=layer_in.dtype, device=layer_in.device)
+            norm_layer_in[:, text_indices, :] = self.input_layernorm(layer_in[:, text_indices, :])
+            norm_layer_in[:, image_indices, :] = self.input_layernorm_moe_gen(layer_in[:, image_indices, :])
+        else:
+            norm_layer_in = self.input_layernorm(layer_in)
+
+        print("NORM_LAYER_IN:", norm_layer_in.shape, norm_layer_in.sum(), norm_layer_in)
+        print("NORM_LAYER_IN img:", norm_layer_in[:, -4098:, :].shape, norm_layer_in[:, -4098:, :].sum(), norm_layer_in[:, -4098:, :])
 
         self_attn, attns = self.self_attn(
             norm_layer_in,
             attn_mask=attn_mask,
             step=step,
             return_attn=return_attn,
             position_embeddings=position_embeddings,
+            text_indices=text_indices,
+            image_indices=image_indices,
         )
 
+        # print("SELF_ATTN:", self_attn.shape, self_attn.sum(), self_attn)
+        # print("SELF_ATTN img:", self_attn[:, -4098:, :].shape, self_attn[:, -4098:, :].sum(), self_attn[:, -4098:, :])
+
         if self.dropout_p > 0:
             self_attn = self.dropout(self_attn)
 
@@ -130,6 +163,8 @@ def forward(self, layer_in, **kwargs):
             layer_out = ff_in + self._mlp(ff_in)
             return layer_out, attns
 
+        text_sequence, image_sequence = None, None  # dirty patch
+
         if self.parallel_residual:
             if self.context_attn:
                 ctx_attn, attns = self.context_attn(
@@ -160,9 +195,27 @@ def forward(self, layer_in, **kwargs):
                     ctx_attn = self.dropout(ctx_attn)
             else:
                 ctx_attn = 0
-            ff_in = self.post_attention_layernorm(ctx_attn + self_attn + layer_in)
+            sequence = ctx_attn + self_attn + layer_in
+            if self.image_generation:
+                text_sequence = sequence[:, text_indices, :]
+                image_sequence = sequence[:, image_indices, :]
+                text_sequence = self.post_attention_layernorm(text_sequence)
+                image_sequence = self.post_attention_layernorm_moe_gen(image_sequence)
+                # print("POST_ATTENTION_LAYER_NORM text:", text_sequence.shape, text_sequence.sum(), text_sequence)
+                # print("POST_ATTENTION_LAYER_NORM img:", image_sequence.shape, image_sequence.sum(), image_sequence)
+            else:
+                ff_in = self.post_attention_layernorm(sequence)
             # we apply residual with un-normed
-            MLP = self.mlp(ff_in)
+            if self.image_generation:
+                MLP = torch.zeros_like(sequence, dtype=sequence.dtype, device=sequence.device)
+                MLP[:, text_indices, :] = self.mlp(text_sequence)
+                MLP[:, image_indices, :] = self.mlp_moe_gen(image_sequence)
+                # print("MLP text:", MLP[:, text_indices, :].shape, MLP[:, text_indices, :].sum(), MLP[:, text_indices, :])
+                # print("MLP img:", MLP[:, image_indices, :].shape, MLP[:, image_indices, :].sum(), MLP[:, image_indices, :])
+            else:
+                MLP = self.mlp(ff_in)
+            # print("MLP:", MLP.shape, MLP.sum(), MLP)
+            # print("MLP img:", MLP[:, -4098:, :].shape, MLP[:, -4098:, :].sum(), MLP[:, -4098:, :])
             layer_out = MLP + layer_in + self_attn + ctx_attn
 
         return layer_out, attns
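
A minimal, self-contained sketch of the index-based routing the hunks above implement (the names below are stand-ins for input_layernorm / input_layernorm_moe_gen and mlp / mlp_moe_gen, not the PR's real attributes): text positions go through the regular norm/MLP, image positions through the duplicated "_moe_gen" modules, and both results are scattered back into one tensor.

    import torch
    import torch.nn as nn

    hidden = 16
    norm_text, norm_image = nn.LayerNorm(hidden), nn.LayerNorm(hidden)          # stand-ins for the two layer norms
    mlp_text, mlp_image = nn.Linear(hidden, hidden), nn.Linear(hidden, hidden)  # stand-ins for the two MLPs

    x = torch.randn(1, 10, hidden)        # (batch, seq, hidden)
    text_indices = torch.arange(0, 6)     # first 6 positions are text tokens
    image_indices = torch.arange(6, 10)   # last 4 positions are image tokens

    out = torch.zeros_like(x)
    out[:, text_indices, :] = mlp_text(norm_text(x[:, text_indices, :]))
    out[:, image_indices, :] = mlp_image(norm_image(x[:, image_indices, :]))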
@@ -227,7 +280,13 @@ def __init__(
                 for i in range(decoder_config.layers)
             ]
         )
+        self.image_generation = getattr(running_config, "image_generation", False)
         self.layer_norm = LayerNorm[decoder_config.layer_norm](decoder_config.hidden_size, eps=decoder_config.norm_eps)
+        if self.image_generation:
+            # initialize MOE GEN params
+            self.layer_norm_moe_gen = LayerNorm[decoder_config.layer_norm](
+                decoder_config.hidden_size, eps=decoder_config.norm_eps
+            )
         self._disable_cache()
 
     @classmethod
@@ -268,6 +327,8 @@ def _causal_attn_mask(self, tgt_pad_mask):
         )
         if self.sliding_window > 0:
             future_mask = future_mask.triu_(-self.sliding_window)
+        # print("future_mask", future_mask.shape, future_mask.dtype, future_mask.device)
+        # print("tgt_pad_mask", tgt_pad_mask.shape, tgt_pad_mask.dtype, tgt_pad_mask.device)
         attn_mask = ~tgt_pad_mask & future_mask.unsqueeze(0)
         return attn_mask.unsqueeze(1)  # (batch x 1 x 1 x tgt_len)
 
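
For reference, a toy illustration of the mask composition in _causal_attn_mask (assuming, as is typical, that tgt_pad_mask is True at padded positions and future_mask is True where attention is allowed):

    import torch

    tgt_len = 4
    tgt_pad_mask = torch.tensor([[[False, False, False, True]]])          # (batch, 1, tgt_len), last position is padding
    future_mask = torch.ones(tgt_len, tgt_len, dtype=torch.bool).tril_()  # causal: attend to self and past only
    attn_mask = ~tgt_pad_mask & future_mask.unsqueeze(0)                   # broadcasts to (batch, tgt_len, tgt_len)
    attn_mask = attn_mask.unsqueeze(1)                                     # add a head dim for broadcasting over heads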
@@ -314,7 +375,11 @@ def forward(self, emb, **kwargs):
         with_align = kwargs.pop("with_align", False)
         return_attn = with_align or kwargs.pop("return_attn", False)
         positions = kwargs.pop("positions", None)
+        # print("positions", positions)
         position_embeddings = self.rope.update(emb.size(1), step=step, positions=positions)
+        cos, sin = position_embeddings
+        # print("COS:", cos.shape, cos.sum(), cos)
+        # print("SIN:", sin.shape, sin.sum(), sin)
         if self.rope_local is not None:
             position_embeddings_local = self.rope_local.update(emb.size(1), step=step)
         else:
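
The (cos, sin) pair unpacked above is the rotary-embedding cache; downstream it is typically applied to queries and keys with the standard rotate-half formulation. Shown here only as a reminder of what the tuple holds; this library's attention code may use a different layout:

    import torch

    def rotate_half(x):
        # split the head dimension in two and rotate: (x1, x2) -> (-x2, x1)
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def apply_rope(q, k, cos, sin):
        return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

    q = k = torch.randn(1, 2, 5, 8)              # (batch, heads, seq, head_dim)
    cos, sin = torch.randn(5, 8), torch.randn(5, 8)
    q_rot, k_rot = apply_rope(q, k, cos, sin)    # broadcasts over batch and heads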
@@ -341,12 +406,19 @@ def forward(self, emb, **kwargs):
 
         # we need to adapt the mask for gemma3, TODO: find another condition?
         # SEEMS OK TO MASK IMAGES FOR LLAVA TOO ?
+        # print("ATTN_MASK before update", attn_mask.shape, attn_mask)
         if decoder_in is not None and attn_mask is not None:
+            # print("DECODER_IN:", decoder_in)
             attn_mask = self._update_causal_mask(attn_mask, (decoder_in == image_token_id) | (decoder_in == 151652) | (decoder_in == 151653))
+            # print("ATTN_MASK after update", attn_mask.shape, attn_mask)
+
+
+
         if self.sliding_window > 0 and step >= self.sliding_window and attn_mask is not None:
             attn_mask = attn_mask[:, :, :, -self.sliding_window:]
 
         for i, layer in enumerate(self.transformer_layers):
+            print(f"\n=================\nLAYER {i}\n=================\n")
             emb, attn = layer(
                 emb,
                 enc_out=enc_out if enc_out is not None else emb,
@@ -357,7 +429,9 @@ def forward(self, emb, **kwargs):
                 position_embeddings=(
                     position_embeddings_local if (i + 1) % self.interleave_local else position_embeddings
                 ),
+                **kwargs,
             )
+            print("EMB:", emb.shape, emb.sum(), emb)
             if with_align:
                 attn_align = layer.get_attn_align(
                     emb,
@@ -374,7 +448,19 @@ def forward(self, emb, **kwargs):
                 if attn_align is not None:
                     attn_aligns.append(attn_align)
 
-        emb = self.layer_norm(emb)
+
+        # TODO apply MOE logic here
+        if self.image_generation:
+            emb_ = torch.zeros_like(emb, dtype=emb.dtype, device=emb.device)
+            text_indices = kwargs.get("text_indices", None)
+            image_indices = kwargs.get("image_indices", None)
+            assert text_indices is not None, "Text indices must be provided for image generation"
+            assert image_indices is not None, "Image indices must be provided for image generation"
+            emb_[:, text_indices, :] = self.layer_norm(emb[:, text_indices, :])
+            emb_[:, image_indices, :] = self.layer_norm_moe_gen(emb[:, image_indices, :])
+            emb = emb_
+        else:
+            emb = self.layer_norm(emb)
 
         # we take the first head
         top_attn = None if attn is None else attn[:, 0, :, :].contiguous()
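
text_indices and image_indices are consumed from **kwargs throughout these hunks, but the diff does not show where they are produced. A hypothetical way to build them from the decoder input ids and hand them to the forward pass (split_modalities and the IMAGE_TOKEN_ID value below are illustrative assumptions, not part of this PR):

    import torch

    IMAGE_TOKEN_ID = 151655  # placeholder; the real id comes from the model/tokenizer config

    def split_modalities(decoder_in: torch.Tensor, image_token_id: int):
        # decoder_in: (batch, seq) token ids; assumes image tokens occupy the same
        # positions in every sequence of the batch, since one index set is shared batch-wide
        is_image = decoder_in[0].eq(image_token_id)
        return (~is_image).nonzero(as_tuple=True)[0], is_image.nonzero(as_tuple=True)[0]

    decoder_in = torch.tensor([[11, 12, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 13]])
    text_indices, image_indices = split_modalities(decoder_in, IMAGE_TOKEN_ID)
    # these would then travel through kwargs into every decoder layer, e.g.:
    # emb, attns = decoder(emb, text_indices=text_indices, image_indices=image_indices)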