
Commit 6983164

enable text cfg, some config definitions
1 parent 0d99a39 commit 6983164

File tree

7 files changed: +162 -44 lines changed


eole/config/inference.py

Lines changed: 52 additions & 18 deletions
@@ -81,8 +81,59 @@ class DecodingConfig(Config):
     align_debug: bool = Field(default=False, description="Print best align for each word.")
 
 
+class ImageGenerationConfig(Config):
+    """
+    Let's centralize image generation related stuff here.
+    This is not a complete config, but rather a subset of options
+    that are relevant for image generation tasks.
+    Used as a mixin for InferenceConfig for now, but might be properly nested at some point.
+    """
+
+    # image generation specific stuff, might move elsewhere
+    image_generation: bool | None = Field(
+        default=False,
+        description="Generate image from text input. "
+        "This will only work if the model is trained for image generation.",
+    )
+    image_width: int | None = Field(
+        default=1024,
+        description="Width of the generated image. "
+        "This will only work if the model is trained for image generation.",
+    )
+    image_height: int | None = Field(
+        default=1024,
+        description="Height of the generated image. "
+        "This will only work if the model is trained for image generation.",
+    )
+    cfg_text_scale: float | None = Field(
+        default=1.0,
+        description="Classifier-free guidance scale for text input.",
+    )
+    cfg_image_scale: float | None = Field(
+        default=1.0,
+        description="Classifier-free guidance scale for image input.",
+    )
+    cfg_interval_min: float | None = Field(
+        default=0.0,
+        description="Minimum classifier-free guidance interval.",
+    )
+    cfg_interval_max: float | None = Field(
+        default=1.0,
+        description="Maximum classifier-free guidance interval.",
+    )
+    timestep_shift: float | None = Field(
+        default=1.0,
+        description="Shift the timestep for image generation.",
+    )
+    num_timesteps: int | None = Field(
+        default=50,
+        description="Number of timesteps for image generation.",
+    )
+
+
 # in legacy opts, decoding config is separated (probably to be used elsewhere)
-class InferenceConfig(RunningConfig, DecodingConfig, LoRaConfig, QuantizeConfig):
+class InferenceConfig(RunningConfig, DecodingConfig, LoRaConfig, QuantizeConfig, ImageGenerationConfig):
 
     model_config = get_config_dict()
     model_config["arbitrary_types_allowed"] = True  # to allow torch.dtype
@@ -111,23 +162,6 @@ class InferenceConfig(RunningConfig, DecodingConfig, LoRaConfig, QuantizeConfig)
         description="Optional EOS tokens that would stop generation, e.g. <|eot_id|> for Llama3",
     )
 
-    # image generation specific stuff, might move elsewhere
-    image_generation: bool | None = Field(
-        default=False,
-        description="Generate image from text input. "
-        "This will only work if the model is trained for image generation.",
-    )
-    image_width: int | None = Field(
-        default=1024,
-        description="Width of the generated image. "
-        "This will only work if the model is trained for image generation.",
-    )
-    image_height: int | None = Field(
-        default=1024,
-        description="Height of the generated image. "
-        "This will only work if the model is trained for image generation.",
-    )
-
     def get_model_path(self):
         return self.model_path[0]
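As a usage note (not part of the commit): with ImageGenerationConfig mixed into InferenceConfig, the new classifier-free guidance knobs sit next to the existing image options. A minimal sketch, assuming the usual InferenceConfig entry point; the model_path value is a placeholder and any other required fields are omitted:

```python
# Hypothetical usage of the new ImageGenerationConfig fields (field names come
# from the diff above; model_path is a placeholder, not from this commit).
from eole.config.inference import InferenceConfig

config = InferenceConfig(
    model_path=["/path/to/bagel/model"],
    image_generation=True,
    image_width=1024,
    image_height=1024,
    cfg_text_scale=4.0,    # > 1.0 turns on the text CFG branch added in model.py
    cfg_image_scale=1.0,   # left at 1.0, so the image CFG branch stays inactive
    cfg_interval_min=0.0,  # apply CFG over the full timestep interval
    cfg_interval_max=1.0,
    timestep_shift=1.0,
    num_timesteps=50,      # more steps: slower sampling, usually cleaner images
)
```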

eole/decoders/transformer.py

Lines changed: 4 additions & 4 deletions
@@ -138,8 +138,8 @@ def forward(self, layer_in, **kwargs):
         else:
             norm_layer_in = self.input_layernorm(layer_in)
 
-        print("NORM_LAYER_IN:", norm_layer_in.shape, norm_layer_in.sum(), norm_layer_in)
-        print("NORM_LAYER_IN img:", norm_layer_in[:, -4098:, :].shape, norm_layer_in[:, -4098:, :].sum(), norm_layer_in[:, -4098:, :])
+        # print("NORM_LAYER_IN:", norm_layer_in.shape, norm_layer_in.sum(), norm_layer_in)
+        # print("NORM_LAYER_IN img:", norm_layer_in[:, -4098:, :].shape, norm_layer_in[:, -4098:, :].sum(), norm_layer_in[:, -4098:, :])
 
         self_attn, attns = self.self_attn(
             norm_layer_in,
@@ -418,7 +418,7 @@ def forward(self, emb, **kwargs):
             attn_mask = attn_mask[:, :, :, -self.sliding_window :]
 
         for i, layer in enumerate(self.transformer_layers):
-            print(f"\n=================\nLAYER {i}\n=================\n")
+            # print(f"\n=================\nLAYER {i}\n=================\n")
             emb, attn = layer(
                 emb,
                 enc_out=enc_out if enc_out is not None else emb,
@@ -431,7 +431,7 @@ def forward(self, emb, **kwargs):
                 ),
                 **kwargs,
             )
-            print("EMB:", emb.shape, emb.sum(), emb)
+            # print("EMB:", emb.shape, emb.sum(), emb)
            if with_align:
                attn_align = layer.get_attn_align(
                    emb,

eole/models/model.py

Lines changed: 98 additions & 18 deletions
@@ -30,7 +30,7 @@
 
 import math
 from PIL import Image
-
+from tqdm import tqdm
 
 def build_encoder(model_config, running_config=None):
     """
@@ -1257,7 +1257,7 @@ def generate_image(self, text_src, init_noise, position_ids, num_timesteps=20, t
             0, num_image_tokens, device=device
         )
 
-        for i, t in enumerate(timesteps):
+        for i, t in tqdm(enumerate(timesteps)):
             timestep = torch.tensor([t] * num_image_tokens, device=device)
             if t > min_cfg and t <= max_cfg:
                 cfg_text_scale_ = cfg_text_scale
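One aside on the progress bar: `tqdm(enumerate(timesteps))` works, but `enumerate` is a plain generator, so tqdm cannot infer a total and only shows a raw iteration count. A small illustration (the timestep list below is a stand-in, not the real schedule):

```python
from tqdm import tqdm

timesteps = [1.0 - i / 50 for i in range(50)]  # stand-in schedule for illustration

# Passing total (or wrapping the list itself) restores the percentage/ETA display:
for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
    pass  # one denoising step per timestep, as in generate_image above
# equivalently: for i, t in enumerate(tqdm(timesteps)): ...
```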
@@ -1285,51 +1285,69 @@ def generate_image(self, text_src, init_noise, position_ids, num_timesteps=20, t
         # no real multi image support for now, so we just return the first one
         return output[0]
 
-    def forward_image_gen(self, text_src, x_t, timestep, text_ids, text_indices, image_indices, seqlens, image_position_ids, position_ids):
+    def forward_image_gen(
+        self,
+        text_src,
+        x_t,
+        timestep,
+        text_ids,
+        text_indices,
+        image_indices,
+        seqlens,
+        image_position_ids,
+        position_ids,
+        # cfg_text_scale=1.0,
+        cfg_text_scale=4.0,
+        cfg_img_scale=1.0,
+        cfg_renorm_type="global",
+        cfg_renorm_min=0.0,
+    ):
         """
         (Somewhat corresponds to bagel._forward_flow at high level.)
         """
-        print("TEXT_SRC:", text_src.shape, text_src)
-        print("TEXT_IDS:", text_ids)
+        # print("TEXT_SRC:", text_src.shape, text_src)
+        # print("TEXT_IDS:", text_ids)
         text_embeddings = self.tgt_emb(text_ids)
         text_prompt_emb = self.tgt_emb(text_src)
 
-        print("TEXT_EMBEDDINGS:", text_embeddings.shape, text_embeddings.sum(), text_embeddings.dtype)
+        # print("TEXT_EMBEDDINGS:", text_embeddings.shape, text_embeddings.sum(), text_embeddings.dtype)
         sequence = text_embeddings.new_zeros((sum(seqlens), self.hidden_size))
-        print("SEQUENCE:", sequence.shape, sequence.sum(), sequence.dtype)
+        # print("SEQUENCE:", sequence.shape, sequence.sum(), sequence.dtype)
         sequence[text_indices] = text_embeddings
 
-        print("IMAGE_POSITION_IDS:", image_position_ids)
+        # print("IMAGE_POSITION_IDS:", image_position_ids)
         position_embeddings = self.latent_pos_embed(image_position_ids)
-        print("POSITION_EMBEDDINGS:", position_embeddings.shape, position_embeddings.sum(), position_embeddings.dtype)
+        # print("POSITION_EMBEDDINGS:", position_embeddings.shape, position_embeddings.sum(), position_embeddings.dtype)
         timestep_embeddings = self.time_embedder(timestep)
-        print("TIMESTEP_EMBEDDINGS:", timestep_embeddings.shape, timestep_embeddings.sum(), timestep_embeddings.dtype)
-        print("X_T:", x_t.shape, x_t.sum(), x_t.dtype)
+        # print("TIMESTEP_EMBEDDINGS:", timestep_embeddings.shape, timestep_embeddings.sum(), timestep_embeddings.dtype)
+        # print("X_T:", x_t.shape, x_t.sum(), x_t.dtype)
         x_t = self.vae2llm(x_t) + timestep_embeddings + position_embeddings
         sequence[image_indices] = x_t
 
         sequence = sequence.unsqueeze(0)
 
-        print("TEXT_PROMPT_EMBED:", text_prompt_emb.shape, text_prompt_emb.sum(), text_prompt_emb.dtype)
-        print("SEQUENCE before text prompt:", sequence.shape, sequence.sum(), sequence.dtype)
+        # used for CFG
+        sequence_without_text = sequence.clone()
+
+        # print("TEXT_PROMPT_EMBED:", text_prompt_emb.shape, text_prompt_emb.sum(), text_prompt_emb.dtype)
+        # print("SEQUENCE before text prompt:", sequence.shape, sequence.sum(), sequence.dtype)
         sequence = torch.cat((text_prompt_emb, sequence), dim=1)
-        print("SEQUENCE after text prompt:", sequence.shape, sequence.sum(), sequence.dtype)
+        # print("SEQUENCE after text prompt:", sequence.shape, sequence.sum(), sequence.dtype)
 
-        print("DECODER IN:", sequence.shape, sequence.sum(), sequence)
+        # print("DECODER IN:", sequence.shape, sequence.sum(), sequence)
 
         offset_image_indices = [i + text_src.size(1) for i in image_indices]
         offset_text_indices = list(range(text_src.size(1))) + [i + text_src.size(1) for i in text_indices]
-        print("OFFSET IMAGE INDICES:", len(offset_image_indices), offset_image_indices)
-        print("OFFSET TEXT INDICES:", len(offset_text_indices), offset_text_indices)
+        # print("OFFSET IMAGE INDICES:", len(offset_image_indices), offset_image_indices)
+        # print("OFFSET TEXT INDICES:", len(offset_text_indices), offset_text_indices)
         output, _ = self.decoder(
             sequence,
             step=0,  # not sure
             enc_out=None,
             src_len=seqlens,
-            with_align=False,
             # tgt_pad_mask=None,  # TODO: handle padding mask properly
             tgt_pad_mask=torch.zeros((sequence.size(0), sequence.size(1))).to(dtype=torch.bool, device=sequence.device),  # no padding
             text_indices=offset_text_indices,
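To make the index bookkeeping above easier to follow: the decoder consumes one flat sequence in which generation-side text embeddings and VAE latent tokens are scattered by position; the text prompt is then prepended, so every index shifts right by the prompt length. A toy example with invented sizes (not values from the commit):

```python
# Toy illustration of the offset computation in forward_image_gen.
prompt_len = 3                 # text_src.size(1)
text_indices = [0, 1]          # generation-side text token positions
image_indices = [2, 3, 4, 5]   # latent image token positions

offset_image_indices = [i + prompt_len for i in image_indices]
offset_text_indices = list(range(prompt_len)) + [i + prompt_len for i in text_indices]

assert offset_image_indices == [5, 6, 7, 8]    # image tokens land after the prompt
assert offset_text_indices == [0, 1, 2, 3, 4]  # prompt slots plus shifted text slots
```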
@@ -1347,6 +1365,68 @@ def forward_image_gen(self, text_src, x_t, timestep, text_ids, text_indices, ima
         print("V_T before cfg:", v_t.shape, v_t.sum(), v_t.dtype)
 
         # TODO: additional conditions for cfg_text_scale / cfg_img_scale ?
+        if cfg_text_scale > 1.0:
+            cfg_text_output, _ = self.decoder(
+                sequence_without_text,
+                step=0,
+                enc_out=None,
+                src_len=sequence_without_text.size(1),
+                tgt_pad_mask=torch.zeros((sequence_without_text.size(0), sequence_without_text.size(1))).to(
+                    dtype=torch.bool, device=sequence_without_text.device
+                ),  # no padding
+                text_indices=text_indices,
+                image_indices=image_indices,
+                decoder_in=torch.tensor([[self.image_token_id] * (len(image_indices) + 2)], device=text_src.device),
+                image_token_id=self.image_token_id,
+                positions=torch.zeros((sequence_without_text.size(1)), device=text_src.device),
+                # TODO: find a way to disable cache update for such calls (might be an issue for more complex queries downstream)
+            )
+            print("CFG TEXT OUTPUT:", cfg_text_output.shape, cfg_text_output.sum(), cfg_text_output.dtype)
+            cfg_text_v_t = self.llm2vae(cfg_text_output)
+            cfg_text_v_t = cfg_text_v_t.squeeze(0)
+            cfg_text_v_t = cfg_text_v_t[image_indices]  # select only image tokens
+            print("CFG TEXT V_T:", cfg_text_v_t.shape, cfg_text_v_t.sum(), cfg_text_v_t.dtype)
+
+        if cfg_img_scale > 1.0:
+            cfg_img_v_t = v_t.clone()
+            # this is actually useful only for the image editing case (input=text+image, output=image),
+            # which is still to be investigated
+            pass
+
+        # cfg renorm stuff
+        if cfg_text_scale > 1.0:
+            print("CFG_TEXT_SCALE > 1.0")
+            print("CFG_RENORM_TYPE:", cfg_renorm_type)
+            if cfg_renorm_type == "text_channel":
+                v_t_text_ = cfg_text_v_t + cfg_text_scale * (v_t - cfg_text_v_t)
+                norm_v_t = torch.norm(v_t, dim=-1, keepdim=True)
+                norm_v_t_text_ = torch.norm(v_t_text_, dim=-1, keepdim=True)
+                scale = (norm_v_t / (norm_v_t_text_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
+                v_t_text = v_t_text_ * scale
+                if cfg_img_scale > 1.0:
+                    v_t = cfg_img_v_t + cfg_img_scale * (v_t_text - cfg_img_v_t)
+                else:
+                    v_t = v_t_text
+            else:
+                v_t_text_ = cfg_text_v_t + cfg_text_scale * (v_t - cfg_text_v_t)
+
+                if cfg_img_scale > 1.0:
+                    v_t_ = cfg_img_v_t + cfg_img_scale * (v_t_text_ - cfg_img_v_t)
+                else:
+                    v_t_ = v_t_text_
+
+                # NOTE norm is computed over all dimensions, thus currently only supports batch_size = 1 with navit
+                if cfg_renorm_type == "global":
+                    norm_v_t = torch.norm(v_t)
+                    norm_v_t_ = torch.norm(v_t_)
+                elif cfg_renorm_type == "channel":
+                    norm_v_t = torch.norm(v_t, dim=-1, keepdim=True)
+                    norm_v_t_ = torch.norm(v_t_, dim=-1, keepdim=True)
+                else:
+                    raise NotImplementedError(f"{cfg_renorm_type} is not supported")
+                scale = (norm_v_t / (norm_v_t_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
+                v_t = v_t_ * scale
+
 
         return v_t
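The guidance math added here is the standard classifier-free guidance combination, v_guided = v_uncond + s * (v_cond - v_uncond), where v_cond is the velocity predicted with the text prompt in context and v_uncond comes from the extra decoder pass over sequence_without_text; the renorm step then rescales v_guided so its norm does not exceed that of the text-conditioned prediction. A standalone sketch of the "global" renorm branch (the function name and isolated form are illustrative; the arithmetic mirrors the diff):

```python
import torch


def cfg_text_global_renorm(v_cond, v_uncond, cfg_text_scale=4.0, cfg_renorm_min=0.0, eps=1e-8):
    """Sketch of text-only CFG with global renorm, mirroring the branch above."""
    # classifier-free guidance combination
    v_guided = v_uncond + cfg_text_scale * (v_cond - v_uncond)
    # global renorm: keep the guided velocity's norm at or below the conditional one
    scale = (torch.norm(v_cond) / (torch.norm(v_guided) + eps)).clamp(min=cfg_renorm_min, max=1.0)
    return v_guided * scale
```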

eole/modules/rope.py

Lines changed: 0 additions & 1 deletion
@@ -197,7 +197,6 @@ def forward_1d(self, maxseqlen, step=0, prefetch=1024, offset=32, positions=None
         tmax += self.model_config.rope_config.tmax_index
 
         rope = torch.outer(tmax, self.inv_freq.to(device))
-        print("ROPE freqs:", rope.shape, rope.sum(), rope)
         cos = torch.cos(rope)
         sin = torch.sin(rope)
         cos = torch.cat((cos, cos), dim=-1).to(dtype)  # Double the size by repeating `cos`

eole/predict/inference.py

Lines changed: 4 additions & 1 deletion
@@ -92,6 +92,7 @@ def __init__(
         image_generation=False,
         image_width=1024,
         image_height=1024,
+        num_timesteps=20,
     ):
         self.model = model
         self.vocabs = vocabs
@@ -174,6 +175,7 @@ def __init__(
         self.image_generation = image_generation
         self.image_width = image_width
         self.image_height = image_height
+        self.num_timesteps = num_timesteps
 
     @classmethod
     def from_config(
@@ -253,6 +255,7 @@ def from_config(
             image_generation=config.image_generation,
             image_width=config.image_width,
             image_height=config.image_height,
+            num_timesteps=config.num_timesteps,
         )
 
     def _log(self, msg):
@@ -663,7 +666,7 @@ def _decode_and_generate(
                 decoder_in,
                 init_noise,
                 position_ids,
-                num_timesteps=50,
+                num_timesteps=self.num_timesteps,
             )
             image = self.model.decode_image(latent, self.image_height, self.image_width)
             image.save("generated_image.png")

recipes/bagel/generated_image.png

Binary file changed (-244 KB)

recipes/bagel/test_bagel.py

Lines changed: 4 additions & 2 deletions
@@ -37,6 +37,9 @@
     image_generation=True,
     image_width=1024,
     image_height=1024,
+    # num_timesteps=10,
+    # num_timesteps=30,
+    num_timesteps=50,
     # self_attn_backend="flash",  # not properly supported (mixed masking)
 )

@@ -49,8 +52,7 @@
 print(engine.predictor.model)
 engine.predictor.model.count_parameters()
 
-# prompt = "A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."
-prompt = "A breathtaking photorealistic landscape of a windswept coastal cliff at golden hour. The scene features jagged rocks covered in moss, waves crashing below with mist rising, and seabirds flying overhead. The lighting is warm and natural, casting long shadows and reflecting on wet surfaces. The level of detail is ultra high, with textures of stone, water, and clouds rendered realistically, evoking a feeling of awe and solitude."
+prompt = "A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."
 
 # test_input = [{
 #     "text": f"<|im_start|>{prompt}<|im_end|><|im_start|>"
