Fix conversion issue of amd-llama-135M.

haozha111 · copybara-github · commit 8c52eb03b72f · 2025-02-18T15:07:30.000-08:00
PiperOrigin-RevId: 728375648
diff --git a/ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py b/ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py
@@ -29,39 +29,48 @@
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/amd-llama-135m'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-_TFLITE_PATH = flags.DEFINE_string(
-    'tflite_path',
-    '/tmp/',
-    'The tflite file path to export.',
-)
-_PREFILL_SEQ_LEN = flags.DEFINE_integer(
-    'prefill_seq_len',
-    1024,
-    'The maximum size of prefill input tensor.',
-)
 _KV_CACHE_MAX_LEN = flags.DEFINE_integer(
     'kv_cache_max_len',
     1280,
     'The maximum size of KV cache buffer, including both prefill and decode.',
 )
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
+    '/tmp/',
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'amd-llama-135m',
+    'The prefix of the output tflite model name.',
+)
+_PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
+    'prefill_seq_lens',
+    (8, 64, 128, 256, 512, 1024),
+    'List of the maximum sizes of prefill input tensors.',
+)
 _QUANTIZE = flags.DEFINE_bool(
     'quantize',
     True,
     'Whether the model should be quantized.',
 )
-
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)
 
 def main(_):
   pytorch_model = amd_llama_135m.build_model(
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  output_filename = f'amd-llama-135m_{quant_suffix}_seq{_PREFILL_SEQ_LEN.value}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
   converter.convert_to_tflite(
       pytorch_model,
-      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
-      prefill_seq_len=_PREFILL_SEQ_LEN.value,
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
+      prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
+      lora_ranks=_LORA_RANKS.value,
       export_config=ExportConfig(),
   )
 
diff --git a/ai_edge_torch/generative/examples/amd_llama_135m/verify.py b/ai_edge_torch/generative/examples/amd_llama_135m/verify.py
@@ -51,7 +51,7 @@ def main(_):
   )
   reauthored_checkpoint = pathlib.Path(cached_config_file).parent
   logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
-  reauthored_model = amd_llama_135m.build_model(reauthored_checkpoint)
+  reauthored_model = amd_llama_135m.build_model(str(reauthored_checkpoint))
 
   logging.info("Loading the tokenizer from: %s", checkpoint)
   tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,7 @@ def main(_):`
`51`	`51`	`)`
`52`	`52`	`reauthored_checkpoint = pathlib.Path(cached_config_file).parent`
`53`	`53`	`logging.info("Building the reauthored model from: %s", reauthored_checkpoint)`
`54`		`- reauthored_model = amd_llama_135m.build_model(reauthored_checkpoint)`
	`54`	`+ reauthored_model = amd_llama_135m.build_model(str(reauthored_checkpoint))`
`55`	`55`
`56`	`56`	`logging.info("Loading the tokenizer from: %s", checkpoint)`
`57`	`57`	`tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)`