add feature extraction

xadupre · xadupre · commit 2e92eda22b04 · 2025-04-24T13:44:50.000+02:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -27,6 +27,8 @@ jobs:
             transformers: '4.51.3'
           - python: '3.11'
             torch: '2.7'
+          - python: '3.12'
+            torch: '2.6'
     steps:
       - uses: actions/checkout@v3
 
diff --git a/_doc/api/tasks/feature_extraction.rst b/_doc/api/tasks/feature_extraction.rst
@@ -0,0 +1,7 @@
+
+onnx_diagnostic.tasks.feature_extraction
+========================================
+
+.. automodule:: onnx_diagnostic.tasks.feature_extraction
+    :members:
+    :no-undoc-members:
diff --git a/_doc/api/tasks/index.rst b/_doc/api/tasks/index.rst
@@ -34,6 +34,7 @@ Or:
 
     automatic_speech_recognition
     fill_mask
+    feature_extraction
     image_classification
     image_text_to_text
     mixture_of_expert
diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py
@@ -116,6 +116,18 @@ def test_fill_mask(self):
                 model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
             )
 
+    @hide_stdout()
+    def test_feature_extraction(self):
+        mid = "facebook/bart-base"
+        data = get_untrained_model_with_inputs(mid, verbose=1)
+        self.assertIn((data["size"], data["n_weights"]), [(557681664, 139420416)])
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        model(**inputs)  # eager call with the generated inputs before exporting
+        with bypass_export_some_errors(patch_transformers=True, verbose=10):
+            torch.export.export(
+                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
+            )
+
     @hide_stdout()
     def test_text_classification(self):
         mid = "Intel/bert-base-uncased-mrpc"
diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py
@@ -338,6 +338,22 @@ def test_fill_mask(self):
         output = model(**encoded_input)
         print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
 
+    @never_test()
+    def test_feature_extraction(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k feature_ex
+        # https://huggingface.co/facebook/bart-base
+
+        from transformers import BartTokenizer, BartModel
+
+        tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+        model = BartModel.from_pretrained("facebook/bart-base")
+        text = "Replace me by any text you'd like."
+        encoded_input = tokenizer(text, return_tensors="pt")
+        print()
+        print("-- inputs", string_type(encoded_input, with_shape=True, with_min_max=True))
+        output = model(**encoded_input)
+        print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
+
     @never_test()
     def test_text_classification(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k text_cl
diff --git a/onnx_diagnostic/tasks/__init__.py b/onnx_diagnostic/tasks/__init__.py
@@ -1,6 +1,7 @@
 from typing import Any, Callable, Dict, List, Tuple
 from . import (
     automatic_speech_recognition,
+    feature_extraction,
     fill_mask,
     image_classification,
     image_text_to_text,
@@ -14,6 +15,7 @@
 
 __TASKS__ = [
     automatic_speech_recognition,
+    feature_extraction,
     fill_mask,
     image_classification,
     image_text_to_text,
diff --git a/onnx_diagnostic/tasks/feature_extraction.py b/onnx_diagnostic/tasks/feature_extraction.py
@@ -0,0 +1,65 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import update_config, check_hasattr
+
+__TASK__ = "feature-extraction"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size: at most 2 hidden layers and 4 attention heads."""
+    check_hasattr(config, "num_attention_heads", "num_hidden_layers")
+    kwargs = dict(
+        num_hidden_layers=min(config.num_hidden_layers, 2),
+        num_attention_heads=min(config.num_attention_heads, 4),
+    )
+    update_config(config, kwargs)  # presumably writes the values into config - confirm
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    batch_size: int,
+    sequence_length: int,
+    dummy_max_token_id: int,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``feature-extraction``: random ``input_ids``
+    and an all-ones ``attention_mask`` (both int64) with their dynamic shapes.
+    Example of ``inputs`` for ``batch_size=2``, ``sequence_length=30``:
+
+    ::
+
+        dict(input_ids:T7s2x30,
+             attention_mask:T7s2x30)
+    """
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    seq_length = "sequence_length"  # axis 1 is declared dynamic by name
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "attention_mask": {0: batch, 1: seq_length},
+    }
+    inputs = dict(
+        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to(
+            torch.int64
+        ),
+        attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64),
+    )
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Returns the default arguments of :func:`get_inputs` and the function itself.
+
+    If the configuration is None, ``dummy_max_token_id`` falls back to 31999.
+    """
+    if config is not None:
+        check_hasattr(config, "vocab_size")
+    kwargs = dict(
+        batch_size=2,
+        sequence_length=30,
+        dummy_max_token_id=31999 if config is None else (config.vocab_size - 1),
+    )
+    return kwargs, get_inputs
diff --git a/onnx_diagnostic/torch_models/hghub/hub_data.py b/onnx_diagnostic/torch_models/hghub/hub_data.py
@@ -13,6 +13,7 @@
     ASTModel,feature-extraction
     AlbertModel,feature-extraction
     BeitForImageClassification,image-classification
+    BartModel,feature-extraction
     BertForMaskedLM,fill-mask
     BertForSequenceClassification,text-classification
     BertModel,sentence-similarity