Commit 5f82baf

Update EmbeddingGemma to support optional final L2 normalization
This commit introduces an optional `final_l2_norm` parameter to the EmbeddingGemma model, allowing users to enable or disable the final L2 normalization step. This enhancement is particularly useful for advanced use cases such as Matryoshka embeddings.

Changes:

- Model Update: Modified `EmbeddingGemma` in `embedding_gemma.py` to accept a `final_l2_norm` argument in `__init__` and `build_model`. The `forward` method now conditionally applies L2 normalization.
- Conversion Script: Updated `convert_to_tflite.py` to include a `--final_l2_norm` flag (defaulting to `True`), passing it to the model builder.
- Verification: Updated `verify.py` and `verify_util.py` to support the `final_l2_norm` flag, ensuring the verification process matches the model configuration.
- Documentation: Updated `README.md` to document the new flag, mention Matryoshka embeddings, and include `litert-torch` in the requirements.

The default behavior remains unchanged (`final_l2_norm=True`), ensuring backward compatibility.
1 parent 4c84cc3 commit 5f82baf

5 files changed

Lines changed: 88 additions & 17 deletions

litert_torch/generative/examples/embedding_gemma/README.md

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
# Embedding Gemma-300M

[Embedding Gemma](https://huggingface.co/google/embeddinggemma-300m) is a text embedding model based on the Gemma architecture. This example demonstrates how to reauthor the model using the LiteRT Torch Generative API and convert it to TFLite.

## Model Details

EmbeddingGemma-300M is an encoder-only model with 24 layers. It uses a combination of local sliding window attention and global attention. This implementation supports:

- Sliding window attention mask.
- Mean pooling of hidden states.
- Final dense projections and L2 normalization (optional, useful for Matryoshka embeddings).

## Requirements

To run this example and verify the results, you need the following packages:

```bash
pip install litert-torch transformers sentence-transformers safetensors
```
## Convert to TFLite

To convert the model to TFLite, use the `convert_to_tflite.py` script. You'll need the HuggingFace checkpoint for `google/embeddinggemma-300m`.

```bash
python convert_to_tflite.py \
  --checkpoint_path=<path_to_checkpoint> \
  --output_path=/tmp/ \
  --quantize=dynamic_int8 \
  --prefill_seq_lens=512 \
  --final_l2_norm=True
```
### Conversion Flags

- `--checkpoint_path`: Path to the directory containing the model's `model.safetensors` and Dense projection layers.
- `--output_path`: Directory where the converted `.tflite` model will be saved.
- `--quantize`: Quantization scheme (e.g., `dynamic_int8`, `none`).
- `--prefill_seq_lens`: Defines the input sequence length for the converted TFLite model.
- `--final_l2_norm`: Whether to apply final L2 normalization to the embeddings. Defaults to `True`. Set to `False` if using Matryoshka embeddings.
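When `--final_l2_norm=False`, the exported model returns unnormalized embeddings, which lets a downstream consumer truncate them to a smaller Matryoshka dimension and normalize only afterwards. A minimal plain-Python sketch of that post-processing step (the helper names here are illustrative, not part of this example):

```python
import math

def l2_normalize(vec):
    """Scale a vector to unit L2 norm (no-op for the zero vector)."""
    norm = math.sqrt(sum(x * x for x in vec))
    return [x / norm for x in vec] if norm > 0 else list(vec)

def matryoshka_truncate(embedding, dim):
    """Keep the first `dim` components, then re-normalize.

    Normalizing only after truncation is the reason the model should
    skip its own final L2 norm when Matryoshka embeddings are wanted.
    """
    return l2_normalize(embedding[:dim])

full = [0.3, -1.2, 0.5, 2.0, -0.7, 0.1]  # unnormalized model output
small = matryoshka_truncate(full, 4)     # 4-dim Matryoshka embedding
```

If the model had already applied its own L2 norm over all dimensions, truncating afterwards would leave a vector whose norm is less than 1, which is why the flag disables normalization inside the model rather than leaving it to the caller to undo.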
## Verify the Model

You can verify the reauthored model's output against the original HuggingFace model using `verify.py`.

```bash
python verify.py \
  --checkpoint=<path_to_checkpoint> \
  --prompts="What is the meaning of life?" \
  --prompts="This is an example sentence."
```
The verification script compares the final embeddings produced by the original `sentence-transformers` implementation and the reauthored `litert_torch` implementation to ensure parity.
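Parity between two embedding vectors is commonly summarized by their cosine similarity, which should be very close to 1.0 when the reauthoring is faithful. A toy plain-Python illustration of that idea (the function name is ours, not the script's):

```python
import math

def cosine_similarity(a, b):
    """Cosine of the angle between two equal-length vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    return dot / (na * nb)

# Identical embeddings give a similarity of exactly 1.0;
# orthogonal embeddings give 0.0.
sim = cosine_similarity([0.6, 0.8], [0.6, 0.8])
```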

litert_torch/generative/examples/embedding_gemma/convert_to_tflite.py

Lines changed: 8 additions & 1 deletion
@@ -25,11 +25,18 @@
 flags = generative_converter.define_conversion_flags(
     model_name="embedding_gemma"
 )
+flags.DEFINE_bool(
+    "final_l2_norm",
+    True,
+    "Whether to apply final L2 normalization to the embeddings.",
+)
 FLAGS = flags.FLAGS


 def main(_):
-  model = embedding_gemma.build_model(FLAGS.checkpoint_path)
+  model = embedding_gemma.build_model(
+      FLAGS.checkpoint_path, final_l2_norm=FLAGS.final_l2_norm
+  )
   model.eval()
   seq_len = max(FLAGS.prefill_seq_lens)

litert_torch/generative/examples/embedding_gemma/embedding_gemma.py

Lines changed: 13 additions & 13 deletions
@@ -91,9 +91,10 @@ def forward(
 class EmbeddingGemma(nn.Module):
   """EmbeddingGemma-300M model."""

-  def __init__(self, config: cfg.ModelConfig):
+  def __init__(self, config: cfg.ModelConfig, final_l2_norm: bool = True):
     super().__init__()
     self.config = config
+    self.final_l2_norm = final_l2_norm

     # Token embeddings
     self.embedder = nn.Embedding(
@@ -175,12 +176,10 @@ def create_sliding_mask(
   def mean_pool(self, hidden_states, attention_mask):
     """Mean pooling with attention mask."""
     if attention_mask is not None:
-      input_mask_expanded = attention_mask.unsqueeze(-1).expand(
-          hidden_states.size()
-      ).float()
-      sum_embeddings = torch.sum(
-          hidden_states * input_mask_expanded, dim=1
+      input_mask_expanded = (
+          attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
       )
+      sum_embeddings = torch.sum(hidden_states * input_mask_expanded, dim=1)
       sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
       return sum_embeddings / sum_mask
     else:
@@ -196,9 +195,7 @@ def forward(
     batch_size, seq_len = tokens.shape

     if attention_mask is None:
-      attention_mask = torch.ones(
-          batch_size, seq_len, device=tokens.device
-      )
+      attention_mask = torch.ones(batch_size, seq_len, device=tokens.device)

     x = self.embedder(tokens) * self.config.embedding_scale

@@ -263,8 +260,11 @@ def forward(
     pooled_x = self.dense2(pooled_x)

     # L2 normalization
-    embedding = torch.nn.functional.normalize(pooled_x, p=2, dim=1)
-    return embedding
+    if self.final_l2_norm:
+      embedding = torch.nn.functional.normalize(pooled_x, p=2, dim=1)
+      return embedding
+
+    return pooled_x


 def get_model_config() -> cfg.ModelConfig:
@@ -349,11 +349,11 @@ def get_model_config() -> cfg.ModelConfig:
   return config


-def build_model(checkpoint_path) -> EmbeddingGemma:
+def build_model(checkpoint_path, final_l2_norm: bool = True) -> EmbeddingGemma:
   """Build model and load weights from HuggingFace checkpoint."""

   config = get_model_config()
-  model = EmbeddingGemma(config)
+  model = EmbeddingGemma(config, final_l2_norm)

   print(f"Loading from checkpoint: {checkpoint_path}")
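The pooling-and-normalize tail of `forward` above can be sketched in plain Python (lists instead of tensors, hypothetical names) to show what the new `final_l2_norm` switch changes: masked mean pooling always runs, while L2 normalization is applied only when the flag is set.

```python
import math

def masked_mean_pool(hidden_states, attention_mask):
    """Average per-token vectors, counting only positions where the mask is 1.

    hidden_states: list of per-token vectors; attention_mask: list of 0/1.
    """
    dim = len(hidden_states[0])
    sums = [0.0] * dim
    count = 0
    for vec, m in zip(hidden_states, attention_mask):
        if m:
            count += 1
            for i, x in enumerate(vec):
                sums[i] += x
    count = max(count, 1)  # mirrors the clamp(min=1e-9) divide-by-zero guard
    return [s / count for s in sums]

def embed(hidden_states, attention_mask, final_l2_norm=True):
    pooled = masked_mean_pool(hidden_states, attention_mask)
    if not final_l2_norm:
        return pooled  # raw embedding, e.g. for Matryoshka use
    norm = math.sqrt(sum(x * x for x in pooled)) or 1.0
    return [x / norm for x in pooled]

tokens = [[1.0, 0.0], [3.0, 4.0], [9.0, 9.0]]
mask = [1, 1, 0]  # last position is padding and is excluded from the mean
raw = embed(tokens, mask, final_l2_norm=False)
unit = embed(tokens, mask)
```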

litert_torch/generative/examples/embedding_gemma/verify.py

Lines changed: 6 additions & 0 deletions
@@ -35,13 +35,19 @@
     None,
     "Whether to enable the long input test.",
 )
+_FINAL_L2_NORM = flags.DEFINE_bool(
+    "final_l2_norm",
+    True,
+    "Whether to apply final L2 normalization to the embeddings.",
+)


 def main(_):
   if not verify_util.verify_embedding_gemma_300m(
       checkpoint_dir=_CHECKPOINT.value,
       prompts=_PROMPTS.value,
       long_input_prompt_path=_LONG_INPUT_PROMPT_PATH.value,
+      final_l2_norm=_FINAL_L2_NORM.value,
   ):
     exit(1)

litert_torch/generative/examples/embedding_gemma/verify_util.py

Lines changed: 9 additions & 3 deletions
@@ -24,17 +24,19 @@

 DEFAULT_PROMPTS = [
     "What is the meaning of life?",
-    "This is an example sentence."
+    "This is an example sentence.",
 ]

 _MODEL_PATH = "google/embeddinggemma-300m"
 _LONG_INPUT_PROMPT_PATH = "long_input_prompt_test.txt"

+
 def verify_embedding_gemma_300m(
     checkpoint_dir: str = None,
     prompts: list[str] | None = None,
     long_input_prompt_path: str = None,
     atol: float = 0.0,
+    final_l2_norm: bool = True,
 ) -> bool:
   """Verifies EmbeddingGemma-300M."""
@@ -50,7 +52,9 @@ def verify_embedding_gemma_300m(

   print(f"Loading reauthored model from: {checkpoint_dir}")
   try:
-    reauthored_model = embedding_gemma.build_model(checkpoint_dir)
+    reauthored_model = embedding_gemma.build_model(
+        checkpoint_dir, final_l2_norm=final_l2_norm
+    )
     reauthored_model.eval()
   except Exception as e:  # pylint: disable=broad-except
     print(f"Failed to build or load reauthored model: {e}")
@@ -88,7 +92,9 @@ def verify_embedding_gemma_300m(
   print("\n--- Comparing Final Embeddings ---")
   with torch.no_grad():
     # Get embeddings from the original SentenceTransformer model.
-    final_original_output = original_model(inputs)  # pytype: disable=wrong-arg-types
+    final_original_output = original_model(
+        inputs
+    )  # pytype: disable=wrong-arg-types
     original_embedding = final_original_output["sentence_embedding"]

     # Get embeddings from the reauthored model.
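The `atol` parameter above bounds the allowed element-wise deviation between the two models' embeddings, with the default of `0.0` demanding exact agreement. A plain-Python sketch of that style of tolerance check (a hypothetical helper, not the comparison function the script actually calls):

```python
def within_atol(a, b, atol=0.0):
    """True if every element of `a` is within `atol` of the matching element of `b`."""
    return len(a) == len(b) and all(abs(x - y) <= atol for x, y in zip(a, b))

exact = within_atol([0.25, -0.5], [0.25, -0.5])            # atol=0.0: exact match
loose = within_atol([0.25, -0.5], [0.251, -0.5], atol=1e-2)  # small drift allowed
```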

0 commit comments
