
Commit c1224c4: add mamba
1 parent dc00d07 commit c1224c4

9 files changed: +284 -61 lines changed

_doc/examples/plot_export_tiny_llm.py

Lines changed: 3 additions & 3 deletions
@@ -31,7 +31,7 @@
 import transformers
 from onnx_diagnostic import doc
 from onnx_diagnostic.helpers import string_type
-from onnx_diagnostic.helpers.torch_test_helper import steel_forward
+from onnx_diagnostic.helpers.torch_test_helper import steal_forward
 from onnx_diagnostic.torch_models.llms import get_tiny_llm
 
 
@@ -77,9 +77,9 @@ def _forward_(*args, _f=None, **kwargs):
 model.forward = keep_model_forward
 
 # %%
-# Another syntax with :func:`onnx_diagnostic.helpers.torch_test_helper.steel_forward`.
+# Another syntax with :func:`onnx_diagnostic.helpers.torch_test_helper.steal_forward`.
 
-with steel_forward(model):
+with steal_forward(model):
     model.generate(inputs, max_length=50, temperature=1, top_k=50, top_p=0.95, do_sample=True)
 
 # %%
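Note: the first syntax in this example patches model.forward by hand (a wrapper such as _forward_ calls the saved original, and model.forward = keep_model_forward restores it afterwards); steal_forward packages the same patch-and-restore into a context manager so the restore cannot be forgotten.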

_unittests/ut_helpers/test_torch_test_helper.py

Lines changed: 3 additions & 3 deletions
@@ -8,7 +8,7 @@
     dummy_llm,
     to_numpy,
     is_torchdynamo_exporting,
-    steel_forward,
+    steal_forward,
     replace_string_by_dynamic,
     to_any,
     torch_deepcopy,
@@ -43,14 +43,14 @@ def test_to_numpy(self):
         self.assertEqual(a.dtype, ml_dtypes.bfloat16)
 
     @hide_stdout()
-    def test_steel_forward(self):
+    def test_steal_forward(self):
         class Model(torch.nn.Module):
             def forward(self, x, y):
                 return x + y
 
         inputs = torch.rand(3, 4), torch.rand(3, 4)
         model = Model()
-        with steel_forward(model):
+        with steal_forward(model):
             model(*inputs)
 
     def test_replace_string_by_dynamic(self):

_unittests/ut_tasks/test_tasks.py

Lines changed: 9 additions & 0 deletions
@@ -113,6 +113,15 @@ def test_sentence_similary(self):
         model, inputs = data["model"], data["inputs"]
         model(**inputs)
 
+    @hide_stdout()
+    def test_falcon_mamba_dev(self):
+        mid = "tiiuae/falcon-mamba-tiny-dev"
+        data = get_untrained_model_with_inputs(mid, verbose=1)
+        model, inputs = data["model"], data["inputs"]
+        print(self.string_type(inputs, with_shape=True))
+        model(**inputs)
+        self.assertIn((data["size"], data["n_weights"]), [(62461440, 15615360)])
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
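Note: the two expected values are mutually consistent, 15,615,360 weights at 4 bytes each (float32) give 62,461,440, assuming data["size"] is reported in bytes.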

_unittests/ut_tasks/try_tasks.py

Lines changed: 78 additions & 4 deletions
@@ -1,7 +1,7 @@
 import unittest
 from onnx_diagnostic.ext_test_case import ExtTestCase, never_test
 from onnx_diagnostic.helpers import string_type
-from onnx_diagnostic.helpers.torch_test_helper import steel_forward
+from onnx_diagnostic.helpers.torch_test_helper import steal_forward
 
 
 class TestHuggingFaceHubModel(ExtTestCase):
@@ -92,7 +92,7 @@ def test_text2text_generation(self):
 
         # simply generate a single sequence
         print()
-        with steel_forward(model):
+        with steal_forward(model):
             generated_ids = model.generate(
                 decoder_input_ids=input_ids, attention_mask=mask, max_length=100
             )
@@ -121,7 +121,7 @@ def test_imagetext2text_generation(self):
             ["<image>", "<fake_token_around_image>"], add_special_tokens=False
         ).input_ids
         print()
-        with steel_forward(model):
+        with steal_forward(model):
             generated_ids = model.generate(
                 **inputs, max_new_tokens=10, bad_words_ids=bad_words_ids
             )
@@ -184,7 +184,7 @@ def test_automatic_speech_recognition(self):
 
         # generate token ids
         print()
-        with steel_forward(model):
+        with steal_forward(model):
             predicted_ids = model.generate(
                 input_features, forced_decoder_ids=forced_decoder_ids
             )
@@ -285,6 +285,80 @@ def mean_pooling(model_output, attention_mask):
         print("Sentence embeddings:")
         print(sentence_embeddings)
 
+    @never_test()
+    def test_falcon_mamba_dev(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k falcon_mamba_dev
+        # https://huggingface.co/tiiuae/falcon-mamba-tiny-dev
+
+        from transformers import AutoTokenizer
+        import transformers
+        import torch
+
+        model = "tiiuae/falcon-mamba-tiny-dev"
+
+        tokenizer = AutoTokenizer.from_pretrained(model)
+        pipeline = transformers.pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            device_map="auto",
+        )
+        print()
+        with steal_forward(pipeline.model):
+            sequences = pipeline(
+                "Girafatron is obsessed with giraffes, "
+                "the most glorious animal on the face of this Earth. "
+                "Giraftron believes all other animals are irrelevant "
+                "when compared to the glorious majesty of the giraffe."
+                "\nDaniel: Hello, Girafatron!\nGirafatron:",
+                max_length=200,
+                do_sample=True,
+                top_k=10,
+                num_return_sequences=1,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+        for seq in sequences:
+            print(f"Result: {seq['generated_text']}")
+
+    @never_test()
+    def test_falcon_mamba_7b(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k falcon_mamba_7b
+        # https://huggingface.co/tiiuae/falcon-mamba-7b
+
+        from transformers import AutoTokenizer
+        import transformers
+        import torch
+
+        model = "tiiuae/falcon-mamba-7b"
+
+        tokenizer = AutoTokenizer.from_pretrained(model)
+        pipeline = transformers.pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            device_map="auto",
+        )
+        print()
+        with steal_forward(pipeline.model):
+            sequences = pipeline(
+                "Girafatron is obsessed with giraffes, "
+                "the most glorious animal on the face of this Earth. "
+                "Giraftron believes all other animals are irrelevant "
+                "when compared to the glorious majesty of the giraffe."
+                "\nDaniel: Hello, Girafatron!\nGirafatron:",
+                max_length=200,
+                do_sample=True,
+                top_k=10,
+                num_return_sequences=1,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+        for seq in sequences:
+            print(f"Result: {seq['generated_text']}")
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
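Note: steal_forward is attached to pipeline.model, the underlying torch.nn.Module driven by the transformers pipeline, so every forward call issued during generation gets printed.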

onnx_diagnostic/helpers/cache_helper.py

Lines changed: 24 additions & 0 deletions
@@ -136,3 +136,27 @@ def make_encoder_decoder_cache(
     return transformers.cache_utils.EncoderDecoderCache(
         self_attention_cache=self_attention_cache, cross_attention_cache=cross_attention_cache
     )
+
+
+def make_mamba_cache(
+    key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+) -> transformers.cache_utils.MambaCache:
+    "Creates a :class:`transformers.cache_utils.MambaCache`."
+
+    class _config:
+        def __init__(self):
+            self.intermediate_size = key_value_pairs[0][0].shape[1]
+            self.conv_kernel = key_value_pairs[0][0].shape[-1]
+            self.state_size = key_value_pairs[0][1].shape[-1]
+            self.num_hidden_layers = len(key_value_pairs)
+            self.dtype = key_value_pairs[0][0].dtype
+
+    cache = transformers.cache_utils.MambaCache(
+        _config(),
+        max_batch_size=key_value_pairs[0][0].shape[0],
+        device=key_value_pairs[0][0].device,
+    )
+    for i in range(len(key_value_pairs)):
+        cache.conv_states[i][:, :, :] = key_value_pairs[i][0]
+        cache.ssm_states[i][:, :, :] = key_value_pairs[i][1]
+    return cache
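A minimal usage sketch for the new helper (the dimensions below are illustrative; as the _config class shows, the helper infers intermediate_size, conv_kernel, and state_size back from the tensor shapes, so exact runnability depends on the installed transformers version of MambaCache):

import torch
from onnx_diagnostic.helpers.cache_helper import make_mamba_cache

# Illustrative sizes; make_mamba_cache reads them back from the tensors:
# conv state: (batch, intermediate_size, conv_kernel)
# ssm state:  (batch, intermediate_size, state_size)
batch, intermediate_size, conv_kernel, state_size, n_layers = 2, 64, 4, 16, 2
pairs = [
    (
        torch.randn(batch, intermediate_size, conv_kernel),  # conv state
        torch.randn(batch, intermediate_size, state_size),   # ssm state
    )
    for _ in range(n_layers)
]
cache = make_mamba_cache(pairs)  # a MambaCache covering n_layers layers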

onnx_diagnostic/helpers/torch_test_helper.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ def _forward_(*args, _f=None, _context=None, **kwargs):
 
 
 @contextlib.contextmanager
-def steel_forward(model: torch.nn.Module, with_shape: bool = True, with_min_max: bool = False):
+def steal_forward(model: torch.nn.Module, with_shape: bool = True, with_min_max: bool = False):
     """
     The necessary modification to steal the forward method and print out inputs
     and outputs. See example :ref:`l-plot-tiny-llm-export`.
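For readers unfamiliar with the pattern, a simplified sketch of what such a context manager does (the real helper additionally formats shapes and min/max values according to with_shape and with_min_max; the name below is hypothetical):

import contextlib
import torch

@contextlib.contextmanager
def steal_forward_sketch(model: torch.nn.Module):
    # Swap in a logging wrapper; restore the original forward on exit.
    original_forward = model.forward

    def wrapped(*args, **kwargs):
        print("inputs:", args, kwargs)
        result = original_forward(*args, **kwargs)
        print("outputs:", result)
        return result

    model.forward = wrapped
    try:
        yield
    finally:
        model.forward = original_forward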
