Skip to content

Commit 7ec9a19

Browse files
siraki and copybara-github
authored and committed
No public description
PiperOrigin-RevId: 865424485
1 parent fd5c47f commit 7ec9a19

File tree

4 files changed

+39
-3
lines changed

4 files changed

+39
-3
lines changed

litert_torch/generative/export_hf/core/export_lib.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def export_embedder_model(
326326
sample_kwargs=sample_inputs,
327327
)
328328
lrt_model = converter.convert(strict_export=False)
329-
model_path = os.path.join(work_dir, 'model.tflite')
329+
model_path = os.path.join(work_dir, 'embedder.tflite')
330330
lrt_model.export(model_path)
331331
quantization_recipe_list = (
332332
quantization_recipe.split(',') if quantization_recipe else [None]
@@ -359,7 +359,10 @@ def export_auxiliary_model(
359359
sample_kwargs=sample_input,
360360
)
361361
# Attention Mask
362-
attention_mask_module = split_cache_module.SplitAttentionMaskBuilder(model)
362+
attention_mask_module = split_cache_module.SplitAttentionMaskBuilder(
363+
export_config.cache_length,
364+
# TODO(weiyiw): Add sliding window sizes.
365+
)
363366
sample_inputs = attention_mask_module.get_sample_inputs(
364367
text_model_config, export_config
365368
)
@@ -370,7 +373,7 @@ def export_auxiliary_model(
370373
sample_kwargs=sample_input,
371374
)
372375
# Cache Update
373-
cache_update_module = split_cache_module.CacheUpdate(model)
376+
cache_update_module = split_cache_module.CacheUpdate()
374377
sample_inputs = cache_update_module.get_sample_inputs(
375378
text_model_config, export_config
376379
)

litert_torch/generative/export_hf/core/exportable_module_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class ExportableModuleConfig:
3131

3232
# Export configs
3333
externalize_embedder: bool = False
34+
single_token_embedder: bool = False
3435
externalize_rope: bool = False
3536

3637
split_cache: bool = False

litert_torch/generative/export_hf/core/external_emb/exportable_module.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,33 @@ def forward(
9494
token_ids = torch.maximum(token_ids, torch.tensor(0, dtype=torch.int32))
9595
output = self.model(token_ids)
9696
return {"embeddings": output}
97+
98+
@classmethod
99+
def get_sample_inputs(
100+
cls,
101+
model_config,
102+
export_config: base_exportable_module.ExportableModuleConfig,
103+
):
104+
"""Gets sample inputs."""
105+
batch_size = export_config.batch_size
106+
prefill_length = export_config.prefill_lengths[0]
107+
prefill_length_dim = export_config.prefill_length_dim
108+
del model_config # Unused.
109+
tokens = {"token_ids": torch.ones((batch_size, 1), dtype=torch.int32)}
110+
tokens_dynamic_shape = {"token_ids": {1: 1}} if prefill_length_dim else {}
111+
if export_config.single_token_embedder:
112+
return {"embedder": (tokens, tokens_dynamic_shape)}
113+
else:
114+
ret = {}
115+
ret["decode_embedder"] = (tokens, tokens_dynamic_shape)
116+
117+
tokens = {
118+
"token_ids": torch.ones(
119+
(batch_size, prefill_length), dtype=torch.int32
120+
)
121+
}
122+
tokens_dynamic_shape = (
123+
{"token_ids": {1: prefill_length_dim}} if prefill_length_dim else {}
124+
)
125+
ret[f"prefill_embedder_{prefill_length}"] = (tokens, tokens_dynamic_shape)
126+
return ret

litert_torch/generative/export_hf/export.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def export(
3131
quantization_recipe: str = 'dynamic_wi8_afp32',
3232
enable_dynamic_shape: bool = False,
3333
externalize_embedder: bool = False,
34+
single_token_embedder: bool = False,
3435
key_ts_idx: int = 2,
3536
value_ts_idx: int = 3,
3637
split_cache: bool = False,
@@ -62,6 +63,7 @@ def export(
6263
if enable_dynamic_shape
6364
else None,
6465
externalize_embedder=externalize_embedder,
66+
single_token_embedder=single_token_embedder,
6567
k_ts_idx=key_ts_idx,
6668
v_ts_idx=value_ts_idx,
6769
split_cache=split_cache,

0 commit comments

Comments (0)