
Commit 4bf9d76

ai-edge-bot authored and copybara-github committed
Smollm2 implementation for ai_torch_edge.
PiperOrigin-RevId: 713507777
1 parent 85446ef commit 4bf9d76

5 files changed: +140 −3 lines


ai_edge_torch/generative/examples/README.md

Lines changed: 3 additions & 1 deletion
@@ -40,11 +40,13 @@ with 270M, 450M, 1.1B, and 3B parameters. The example we provide is OpenELM 3B,
 and the checkpoint for the model can be found
 [here](https://huggingface.co/apple/OpenELM-3B/tree/main).

-## HuggingFace SmolLM
+## HuggingFace SmolLM and SmolLM2
 [HuggingFace SmolLM](https://huggingface.co/blog/smollm) is also a decoder-only
 LLM with 135M, 360M, 1.7B parameters. The example we provide is SmolLM 135M, and
 the checkpoint for the model can be found
 [here](https://huggingface.co/HuggingFaceTB/SmolLM-135M).
+Similarly, [SmolLM2](https://huggingface.co/HuggingFaceTB/SmolLM2-135M) has the
+same architecture as SmolLM but was trained on improved data.

 ## Qwen
 Alibaba's [Qwen 2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
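
Since SmolLM2 shares SmolLM's architecture, both Hugging Face checkpoints should resolve to the same model class. A minimal sketch (checkpoint names are from the README above; that both configs report `LlamaForCausalLM` is an assumption about the upstream checkpoints, not something this commit states):

```python
# Sketch: check that SmolLM and SmolLM2 declare the same architecture.
# Checkpoint names come from the README; the expected class name is an
# assumption about the Hugging Face configs.
import transformers

for name in ("HuggingFaceTB/SmolLM-135M", "HuggingFaceTB/SmolLM2-135M"):
    config = transformers.AutoConfig.from_pretrained(name)
    print(name, config.architectures)  # expected: ['LlamaForCausalLM'] for both
```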
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Example of converting SmolLM2 model to multi-signature tflite model."""
+
+import os
+import pathlib
+
+from absl import app
+from absl import flags
+from ai_edge_torch.generative.examples.smollm import smollm
+from ai_edge_torch.generative.utilities import converter
+from ai_edge_torch.generative.utilities.model_builder import ExportConfig
+
+_CHECKPOINT_PATH = flags.DEFINE_string(
+    'checkpoint_path',
+    os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smollm2'),
+    'The path to the model checkpoint, or directory holding the checkpoint.',
+)
+_TFLITE_PATH = flags.DEFINE_string(
+    'tflite_path',
+    '/tmp/',
+    'The tflite file path to export.',
+)
+_PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
+    'prefill_seq_lens',
+    (8, 64, 128, 256, 512, 1024),
+    'List of the maximum sizes of prefill input tensors.',
+)
+_KV_CACHE_MAX_LEN = flags.DEFINE_integer(
+    'kv_cache_max_len',
+    1280,
+    'The maximum size of KV cache buffer, including both prefill and decode.',
+)
+_QUANTIZE = flags.DEFINE_bool(
+    'quantize',
+    True,
+    'Whether the model should be quantized.',
+)
+
+
+def main(_):
+  pytorch_model = smollm.build_model_v2(
+      _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
+  )
+
+  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
+  output_filename = f'smollm2_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
+  converter.convert_to_tflite(
+      pytorch_model,
+      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      prefill_seq_len=_PREFILL_SEQ_LENS.value,
+      quantize=_QUANTIZE.value,
+      export_config=ExportConfig(),
+  )
+
+
+if __name__ == '__main__':
+  app.run(main)
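
Stripped of the absl flag plumbing, the new script reduces to two calls that both appear in this commit; a minimal sketch with placeholder paths:

```python
# Programmatic equivalent of the conversion script above; paths are placeholders.
import os

from ai_edge_torch.generative.examples.smollm import smollm
from ai_edge_torch.generative.utilities import converter
from ai_edge_torch.generative.utilities.model_builder import ExportConfig

model = smollm.build_model_v2('/path/to/smollm2', kv_cache_max_len=1280)
converter.convert_to_tflite(
    model,
    tflite_path=os.path.join('/tmp', 'smollm2_q8_ekv1280.tflite'),
    prefill_seq_len=[8, 64, 128, 256, 512, 1024],  # one entry per prefill size
    quantize=True,  # q8 output name, matching the script's default
    export_config=ExportConfig(),
)
```

Per the flag descriptions, each `prefill_seq_len` entry is the maximum size of one prefill input tensor, which is what makes the exported `.tflite` multi-signature.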

ai_edge_torch/generative/examples/smollm/smollm.py

Lines changed: 38 additions & 0 deletions
@@ -85,3 +85,41 @@ def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
       tensor_names=TENSOR_NAMES,
       model_class=SmolLM,
   )
+
+
+class SmolLM2(model_builder.DecoderOnlyModel):
+  """A SmolLM2 model built from the Edge Generative API layers."""
+  pass
+
+
+def get_model_config_v2(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a SmolLM2 135M model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for a SmolLM2 model.
+  """
+  config = get_model_config(kv_cache_max_len)
+  config.block_config(0).attn_config.rotary_base = 100000
+  return config
+
+
+def get_fake_model_config_v2(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config_v2(**kwargs)
+  config.vocab_size = 128
+  config.num_layers = 2
+  # SmolLM2 has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 64
+  return config
+
+
+def build_model_v2(checkpoint_path: str, **kwargs) -> nn.Module:
+  return model_builder.build_decoder_only_model(
+      checkpoint_path=checkpoint_path,
+      config=get_model_config_v2(**kwargs),
+      tensor_names=TENSOR_NAMES,
+      model_class=SmolLM2,
+  )
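
Per `get_model_config_v2` above, the v2 config is the v1 config with a single override, the RoPE base frequency. A quick sketch to see the delta, using only functions from this file:

```python
# Compare the v1 and v2 configs; get_model_config_v2 only overrides rotary_base.
from ai_edge_torch.generative.examples.smollm import smollm

v1 = smollm.get_model_config(kv_cache_max_len=1024)
v2 = smollm.get_model_config_v2(kv_cache_max_len=1024)
print(v1.block_config(0).attn_config.rotary_base)  # SmolLM default
print(v2.block_config(0).attn_config.rotary_base)  # 100000, per the diff
```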

ai_edge_torch/generative/examples/smollm/verify.py

Lines changed: 18 additions & 2 deletions
@@ -36,10 +36,26 @@
     30,
     "The maximum size of the generated tokens.",
 )
+_MODEL_VERSION = flags.DEFINE_enum(
+    "model_version",
+    "v1",
+    ["v1", "v2"],
+    "The version of SmolLM to verify.",
+)
+_CHECKPOINT = {
+    "v1": "HuggingFaceTB/SmolLM-135M",
+    "v2": "HuggingFaceTB/SmolLM2-135M",
+}
+
+_BUILDER = {
+    "v1": smollm.build_model,
+    "v2": smollm.build_model_v2,
+}


 def main(_):
-  checkpoint = "HuggingFaceTB/SmolLM-135M"
+  checkpoint = _CHECKPOINT[_MODEL_VERSION.value]
+  builder = _BUILDER[_MODEL_VERSION.value]
   logging.info("Loading the original model from: %s", checkpoint)
   original_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)

@@ -49,7 +65,7 @@ def main(_):
   )
   reauthored_checkpoint = pathlib.Path(cached_config_file).parent
   logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
-  reauthored_model = smollm.build_model(reauthored_checkpoint)
+  reauthored_model = builder(reauthored_checkpoint)

   logging.info("Loading the tokenizer from: %s", checkpoint)
   tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
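
The version switch is two dictionary lookups keyed by the new enum flag, so verification runs as, for example, `python verify.py --model_version=v2`. A condensed sketch of the dispatch (the local checkpoint path is a placeholder):

```python
# Condensed form of the new dispatch in verify.py.
from ai_edge_torch.generative.examples.smollm import smollm

_CHECKPOINT = {
    "v1": "HuggingFaceTB/SmolLM-135M",
    "v2": "HuggingFaceTB/SmolLM2-135M",
}
_BUILDER = {"v1": smollm.build_model, "v2": smollm.build_model_v2}

version = "v2"
checkpoint = _CHECKPOINT[version]
reauthored_model = _BUILDER[version]("/path/to/cached/checkpoint")  # placeholder
```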

ai_edge_torch/generative/test/test_model_conversion_large.py

Lines changed: 10 additions & 0 deletions
@@ -150,6 +150,16 @@ def test_smollm(self):
       ai_edge_torch.config.in_oss,
       reason="tests with custom ops are not supported in oss",
   )
+
+  def test_smollm2(self):
+    config = smollm.get_fake_model_config_v2()
+    pytorch_model = smollm.SmolLM2(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)
+  @googletest.skipIf(
+      ai_edge_torch.config.in_oss,
+      reason="tests with custom ops are not supported in oss",
+  )
+
   def test_openelm(self):
     config = openelm.get_fake_model_config()
     pytorch_model = openelm.OpenELM(config).eval()
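
The test exercises conversion on a shrunken model: `get_fake_model_config_v2` keeps the SmolLM2 wiring (including the `rotary_base` override) but cuts vocabulary size, depth, and FFN width so the conversion path runs quickly. A sketch of what the test builds:

```python
# Build the tiny SmolLM2 used by the conversion test (values from the diff).
from ai_edge_torch.generative.examples.smollm import smollm

config = smollm.get_fake_model_config_v2()
model = smollm.SmolLM2(config).eval()
print(config.vocab_size, config.num_layers)  # 128, 2
```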
