Commit 035ccf8

add support for text-classification (#55)

* add support for text-classification
* fix zero shot
* fix examples

1 parent 89a50b0 commit 035ccf8

12 files changed (+166, -8 lines)
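
For orientation, a minimal sketch of what this commit enables, mirroring the new unit test below (assuming `get_untrained_model_with_inputs` is importable from `onnx_diagnostic.torch_models.hghub`):

    from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs

    # build a reduced, untrained copy of the model plus matching dummy inputs
    data = get_untrained_model_with_inputs("Intel/bert-base-uncased-mrpc", verbose=1)
    model, inputs = data["model"], data["inputs"]
    model(**inputs)  # forward pass with the generated text-classification inputs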

CHANGELOGS.rst

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@ Change Logs
 0.4.0
 +++++
 
+* :pr:`55`: add support for text-classification
+* :pr:`54`: add support for fill-mask, refactoring
 * :pr:`52`: add support for zero-shot-image-classification
 * :pr:`50`: add support for onnxruntime fusion
 * :pr:`48`: add support for EncoderDecoderCache, test with openai/whisper-tiny

_doc/api/tasks/index.rst

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,8 @@ onnx_diagnostic.tasks
     automatic_speech_recognition
     fill_mask
     image_classification
-    image_text_to_text
+    image_text_to_text
+    text_classification
     text_generation
     text2text_generation
     zero_shot_image_classification
_doc/api/tasks/text_classification.rst

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+
+onnx_diagnostic.tasks.text_classification
+==========================================
+
+.. automodule:: onnx_diagnostic.tasks.text_classification
+    :members:
+    :no-undoc-members:

_unittests/ut_tasks/test_tasks.py

Lines changed: 9 additions & 0 deletions
@@ -100,6 +100,15 @@ def test_fill_mask(self):
         model, inputs = data["model"], data["inputs"]
         model(**inputs)
 
+    @hide_stdout()
+    def test_text_classification(self):
+        mid = "Intel/bert-base-uncased-mrpc"
+        # mid = "Salesforce/codet5-small"
+        data = get_untrained_model_with_inputs(mid, verbose=1)
+        self.assertIn((data["size"], data["n_weights"]), [(154420232, 38605058)])
+        model, inputs = data["model"], data["inputs"]
+        model(**inputs)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)

_unittests/ut_tasks/try_tasks.py

Lines changed: 25 additions & 0 deletions
@@ -211,6 +211,31 @@ def test_fill_mask(self):
         output = model(**encoded_input)
         print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
 
+    @never_test()
+    def test_text_classification(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k text_cl
+        # https://huggingface.co/Intel/bert-base-uncased-mrpc
+
+        from transformers import BertTokenizer, BertModel
+
+        tokenizer = BertTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc")
+        model = BertModel.from_pretrained("Intel/bert-base-uncased-mrpc")
+        text = "The inspector analyzed the soundness in the building."
+        encoded_input = tokenizer(text, return_tensors="pt")
+        print()
+        print("-- inputs", string_type(encoded_input, with_shape=True, with_min_max=True))
+        output = model(**encoded_input)
+        print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
+        # the output is a BaseModelOutputWithPoolingAndCrossAttentions with a pooler_output
+
+        # print the tokens and token ids of the input string
+        print("Tokenized Text: ", tokenizer.tokenize(text), "\n")
+        print("Token IDs: ", tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))
+
+        # map the encoded ids back to tokens
+        print(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0]))
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)

_unittests/ut_xrun_doc/test_documentation_recipes.py

Lines changed: 4 additions & 1 deletion
@@ -53,7 +53,10 @@ def run_test(self, fold: str, name: str, verbose=0) -> int:
             # dot not installed, this part
             # is tested in onnx framework
             raise unittest.SkipTest(f"failed: {name!r} due to missing dot.")
-        if "We couldn't connect to 'https://huggingface.co'" in st:
+        if (
+            "We couldn't connect to 'https://huggingface.co'" in st
+            or "Cannot access content at: https://huggingface.co/" in st
+        ):
             raise unittest.SkipTest(f"Connectivity issues due to\n{err}")
         raise AssertionError(  # noqa: B904
             "Example '{}' (cmd: {} - exec_prefix='{}') "

onnx_diagnostic/tasks/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    text_classification,
     text_generation,
     text2text_generation,
     zero_shot_image_classification,
@@ -14,6 +15,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    text_classification,
     text_generation,
     text2text_generation,
     zero_shot_image_classification,
onnx_diagnostic/tasks/text_classification.py

Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import update_config, check_hasattr
+
+__TASK__ = "text-classification"
+
+
+def reduce_model_config(config: Any, task: str) -> Dict[str, Any]:
+    """Reduces a model size."""
+    check_hasattr(config, "num_attention_heads", "num_hidden_layers")
+    kwargs = dict(
+        num_hidden_layers=min(config.num_hidden_layers, 2),
+        num_attention_heads=min(config.num_attention_heads, 4),
+    )
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    batch_size: int,
+    sequence_length: int,
+    dummy_max_token_id: int,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``text-classification``.
+    Example:
+
+    ::
+
+        input_ids:T7s1x13[101,72654:A16789.23076923077],
+        token_type_ids:T7s1x13[0,0:A0.0],
+        attention_mask:T7s1x13[1,1:A1.0])
+    """
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    seq_length = torch.export.Dim("sequence_length", min=1, max=1024)
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "token_type_ids": {0: batch, 1: seq_length},
+        "attention_mask": {0: batch, 1: seq_length},
+    }
+    inputs = dict(
+        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to(
+            torch.int64
+        ),
+        token_type_ids=torch.zeros((batch_size, sequence_length)).to(torch.int64),
+        attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64),
+    )
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+
+
+def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        check_hasattr(config, "vocab_size")
+    kwargs = dict(
+        batch_size=2,
+        sequence_length=30,
+        dummy_max_token_id=31999 if config is None else (config.vocab_size - 1),
+    )
+    return kwargs, get_inputs
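
A minimal usage sketch (not part of the commit) of how the two helpers above compose; passing config=None selects the typical dimensions:

    from onnx_diagnostic.tasks import text_classification

    kwargs, fct = text_classification.random_input_kwargs(None, "text-classification")
    data = fct(model=None, config=None, **kwargs)
    print(data["inputs"]["input_ids"].shape)  # torch.Size([2, 30])
    print(sorted(data["dynamic_shapes"]))     # ['attention_mask', 'input_ids', 'token_type_ids']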

onnx_diagnostic/tasks/zero_shot_image_classification.py

Lines changed: 2 additions & 2 deletions
@@ -62,9 +62,9 @@ def get_inputs(
     ), f"Unexpected type for input_height {type(input_height)}{config}"
 
     batch = torch.export.Dim("batch", min=1, max=1024)
-    seq_length = torch.export.Dim("seq_length", min=1, max=4096)
+    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
     shapes = {
-        "inputs_ids": {0: batch, 1: seq_length},
+        "input_ids": {0: batch, 1: seq_length},
         "attention_mask": {0: batch, 1: seq_length},
         "pixel_values": {
             0: torch.export.Dim("batch_img", min=1, max=1024),
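
This hunk fixes two things: the key typo inputs_ids -> input_ids, and seq_length is now a plain string, which recent torch.export releases accept as a named dynamic dimension in dynamic_shapes. A toy sketch (not from the repo) of the string form:

    import torch

    class Toy(torch.nn.Module):
        def forward(self, input_ids, attention_mask):
            return input_ids * attention_mask

    batch = torch.export.Dim("batch", min=1, max=1024)
    ep = torch.export.export(
        Toy(),
        (torch.ones(2, 5, dtype=torch.int64), torch.ones(2, 5, dtype=torch.int64)),
        dynamic_shapes={
            # a string names the dimension without explicit min/max bounds
            "input_ids": {0: batch, 1: "seq_length"},
            "attention_mask": {0: batch, 1: "seq_length"},
        },
    )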

onnx_diagnostic/torch_models/hghub/hub_data.py

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,7 @@
 AlbertModel,feature-extraction
 BeitForImageClassification,image-classification
 BertForMaskedLM,fill-mask
+BertForSequenceClassification,text-classification
 BigBirdModel,feature-extraction
 BlenderbotModel,feature-extraction
 BloomModel,feature-extraction
@@ -145,6 +146,7 @@
     "no-pipeline-tag",
     "object-detection",
     "reinforcement-learning",
+    "text-classification",
     "text-generation",
     "text-to-audio",
     "text2text-generation",
