@@ -21,9 +21,11 @@ def get_inputs(
     model: torch.nn.Module,
     config: Optional[Any],
     dummy_max_token_id: int,
-    num_key_value_heads: int,
+    num_key_value_heads_encoder: int,
+    num_key_value_heads_decoder: int,
     num_hidden_layers: int,
-    head_dim: int,
+    head_dim_encoder: int,
+    head_dim_decoder: int,
     encoder_dim: int,
     batch_size: int = 2,
     sequence_length: int = 30,
@@ -36,7 +38,10 @@ def get_inputs(

     :param model: model to get the missing information
     :param config: configuration used to generate the model
-    :param head_dim: last dimension of the cache
+    :param head_dim_encoder: last dimension of the cache for the encoder
+    :param head_dim_decoder: last dimension of the cache for the decoder
+    :param num_key_value_heads_encoder: number of heads for the encoder
+    :param num_key_value_heads_decoder: number of heads for the decoder
     :param dummy_max_token_id: dummy max token id
     :param batch_size: batch size
     :param encoder_dim: last dimension of encoder_last_hidden_state
@@ -83,6 +88,7 @@ def get_inputs(
         # "encoder_last_hidden_state": {0: batch, 1: torch.export.Dim.DYNAMIC},
         # "encoder_outputs": {0: batch, 1: torch.export.Dim.DYNAMIC},
     }
+
     inputs = dict(
         input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to(
             torch.int64
@@ -99,10 +105,16 @@ def get_inputs(
                 [
                     (
                         torch.randn(
-                            batch_size, num_key_value_heads, sequence_length, head_dim
+                            batch_size,
+                            num_key_value_heads_encoder,
+                            sequence_length,
+                            head_dim_encoder,
                         ),
                         torch.randn(
-                            batch_size, num_key_value_heads, sequence_length, head_dim
+                            batch_size,
+                            num_key_value_heads_encoder,
+                            sequence_length,
+                            head_dim_encoder,
                         ),
                     )
                     for i in range(num_hidden_layers)
@@ -112,10 +124,16 @@ def get_inputs(
                 [
                     (
                         torch.randn(
-                            batch_size, num_key_value_heads, sequence_length2, head_dim
+                            batch_size,
+                            num_key_value_heads_decoder,
+                            sequence_length2,
+                            head_dim_decoder,
                         ),
                         torch.randn(
-                            batch_size, num_key_value_heads, sequence_length2, head_dim
+                            batch_size,
+                            num_key_value_heads_decoder,
+                            sequence_length2,
+                            head_dim_decoder,
                         ),
                     )
                     for i in range(num_hidden_layers)
@@ -132,9 +150,11 @@ def get_inputs(
         model=model,
         config=config,
         dummy_max_token_id=dummy_max_token_id,
-        num_key_value_heads=num_key_value_heads,
+        num_key_value_heads_encoder=num_key_value_heads_encoder,
+        num_key_value_heads_decoder=num_key_value_heads_decoder,
         num_hidden_layers=num_hidden_layers,
-        head_dim=head_dim,
+        head_dim_encoder=head_dim_encoder,
+        head_dim_decoder=head_dim_decoder,
         encoder_dim=encoder_dim,
         batch_size=batch_size + 1,
         sequence_length=sequence_length + 1,
@@ -173,20 +193,30 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
         batch_size=2,
         sequence_length=30,
         sequence_length2=3,
-        head_dim=16 if config is None else (config.d_kv if hasattr(config, "d_kv") else 1),
+        head_dim_encoder=16 if config is None else _pick(config, "d_kv", "encoder_ffn_dim"),
+        head_dim_decoder=16 if config is None else _pick(config, "d_kv", "decoder_ffn_dim"),
         dummy_max_token_id=31999 if config is None else config.vocab_size - 1,
         num_hidden_layers=(
             8 if config is None else _pick(config, "num_hidden_layers", "num_layers")
         ),
-        num_key_value_heads=(
+        num_key_value_heads_encoder=(
+            16
+            if config is None
+            else _pick(
+                config,
+                "encoder_attention_heads",
+                "num_key_value_heads",
+                "num_heads",
+            )
+        ),
+        num_key_value_heads_decoder=(
             16
             if config is None
             else _pick(
                 config,
+                "decoder_attention_heads",
                 "num_key_value_heads",
                 "num_heads",
-                (sum, "encoder_attention_heads", "decoder_attention_heads"),
-                # exceptions=exceptions,
             )
         ),
         encoder_dim=512 if config is None else _pick(config, "n_positions", "d_model"),
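With this change, random_input_kwargs resolves the encoder and decoder head counts separately through _pick instead of summing encoder_attention_heads and decoder_attention_heads. The _pick helper itself is not shown in the diff; below is a plausible sketch of the plain attribute-name form it is called with here (the real helper apparently also accepts the (callable, name, ...) tuple form used by the removed line), so treat it as an assumption rather than the module's actual implementation:

    from types import SimpleNamespace

    def _pick(config, *names):
        # Hypothetical re-implementation covering only the plain-string form
        # used by the new code: return the first attribute found on config.
        for name in names:
            if hasattr(config, name):
                return getattr(config, name)
        raise AttributeError(f"none of {names} found on {type(config).__name__}")

    # A BART-like config exposes encoder_attention_heads, a T5-like config
    # exposes num_heads; both resolve without sharing an attribute name.
    print(_pick(SimpleNamespace(encoder_attention_heads=16),
                "encoder_attention_heads", "num_heads"))  # 16
    print(_pick(SimpleNamespace(num_heads=8),
                "encoder_attention_heads", "num_heads"))  # 8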