2 changes: 2 additions & 0 deletions CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
0.4.0
+++++

* :pr:`55`: add support for text-classification
* :pr:`54`: add support for fill-mask, refactoring
* :pr:`52`: add support for zero-shot-image-classification
* :pr:`50`: add support for onnxruntime fusion
* :pr:`48`: add support for EncoderDecoderCache, test with openai/whisper-tiny
3 changes: 2 additions & 1 deletion _doc/api/tasks/index.rst
@@ -8,7 +8,8 @@ onnx_diagnostic.tasks
    automatic_speech_recognition
    fill_mask
    image_classification
-   image_text_to_text
+   image_text_to_text
+   text_classification
    text_generation
    text2text_generation
    zero_shot_image_classification
7 changes: 7 additions & 0 deletions _doc/api/tasks/text_classification.rst
@@ -0,0 +1,7 @@

onnx_diagnostic.tasks.text_classification
==========================================

.. automodule:: onnx_diagnostic.tasks.text_classification
    :members:
    :no-undoc-members:
9 changes: 9 additions & 0 deletions _unittests/ut_tasks/test_tasks.py
@@ -100,6 +100,15 @@ def test_fill_mask(self):
        model, inputs = data["model"], data["inputs"]
        model(**inputs)

    @hide_stdout()
    def test_text_classification(self):
        mid = "Intel/bert-base-uncased-mrpc"
        # mid = "Salesforce/codet5-small"
        data = get_untrained_model_with_inputs(mid, verbose=1)
        self.assertIn((data["size"], data["n_weights"]), [(154420232, 38605058)])
        model, inputs = data["model"], data["inputs"]
        model(**inputs)


if __name__ == "__main__":
    unittest.main(verbosity=2)
25 changes: 25 additions & 0 deletions _unittests/ut_tasks/try_tasks.py
@@ -211,6 +211,31 @@ def test_fill_mask(self):
        output = model(**encoded_input)
        print("-- outputs", string_type(output, with_shape=True, with_min_max=True))

    @never_test()
    def test_text_classification(self):
        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k text_cl
        # https://huggingface.co/Intel/bert-base-uncased-mrpc

        from transformers import BertTokenizer, BertModel

        tokenizer = BertTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc")
        model = BertModel.from_pretrained("Intel/bert-base-uncased-mrpc")
        text = "The inspector analyzed the soundness in the building."
        encoded_input = tokenizer(text, return_tensors="pt")
        print()
        print("-- inputs", string_type(encoded_input, with_shape=True, with_min_max=True))
        output = model(**encoded_input)
        print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
        # output is a BaseModelOutputWithPoolingAndCrossAttentions with a pooler_output

        # Print the tokens and token ids of the input string
        print("Tokenized Text: ", tokenizer.tokenize(text), "\n")
        print("Token IDs: ", tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))

        # Map the encoded ids back to tokens
        print(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0]))


if __name__ == "__main__":
    unittest.main(verbosity=2)
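Note: the demo above loads BertModel, the bare encoder, so it produces no classification logits. A minimal sketch of running the same checkpoint with its classification head (the example sentence pair is illustrative; the label names come from the cached config added later in this PR):

    import torch
    from transformers import BertForSequenceClassification, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc")
    model = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")
    # MRPC is a sentence-pair task: is the second sentence a paraphrase of the first?
    enc = tokenizer(
        "The company acquired its rival.",
        "Its rival was bought by the company.",
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = model(**enc).logits  # shape (1, 2)
    print(model.config.id2label[int(logits.argmax(-1))])  # "equivalent" or "not_equivalent"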
5 changes: 4 additions & 1 deletion _unittests/ut_xrun_doc/test_documentation_recipes.py
@@ -53,7 +53,10 @@ def run_test(self, fold: str, name: str, verbose=0) -> int:
                # dot not installed, this part
                # is tested in onnx framework
                raise unittest.SkipTest(f"failed: {name!r} due to missing dot.")
-            if "We couldn't connect to 'https://huggingface.co'" in st:
+            if (
+                "We couldn't connect to 'https://huggingface.co'" in st
+                or "Cannot access content at: https://huggingface.co/" in st
+            ):
                raise unittest.SkipTest(f"Connectivity issues due to\n{err}")
            raise AssertionError(  # noqa: B904
                "Example '{}' (cmd: {} - exec_prefix='{}') "
2 changes: 2 additions & 0 deletions onnx_diagnostic/tasks/__init__.py
@@ -4,6 +4,7 @@
    fill_mask,
    image_classification,
    image_text_to_text,
    text_classification,
    text_generation,
    text2text_generation,
    zero_shot_image_classification,
@@ -14,6 +15,7 @@
    fill_mask,
    image_classification,
    image_text_to_text,
    text_classification,
    text_generation,
    text2text_generation,
    zero_shot_image_classification,
67 changes: 67 additions & 0 deletions onnx_diagnostic/tasks/text_classification.py
@@ -0,0 +1,67 @@
from typing import Any, Callable, Dict, Optional, Tuple
import torch
from ..helpers.config_helper import update_config, check_hasattr

__TASK__ = "text-classification"


def reduce_model_config(config: Any, task: str) -> Dict[str, Any]:
    """Reduces the size of a model."""
    check_hasattr(config, "num_attention_heads", "num_hidden_layers")
    kwargs = dict(
        num_hidden_layers=min(config.num_hidden_layers, 2),
        num_attention_heads=min(config.num_attention_heads, 4),
    )
    update_config(config, kwargs)
    return kwargs


def get_inputs(
    model: torch.nn.Module,
    config: Optional[Any],
    batch_size: int,
    sequence_length: int,
    dummy_max_token_id: int,
    **kwargs,  # unused
):
    """
    Generates inputs for task ``text-classification``.
    Example:

    ::

        input_ids:T7s1x13[101,72654:A16789.23076923077],
        token_type_ids:T7s1x13[0,0:A0.0],
        attention_mask:T7s1x13[1,1:A1.0])
    """
    batch = torch.export.Dim("batch", min=1, max=1024)
    seq_length = torch.export.Dim("sequence_length", min=1, max=1024)
    shapes = {
        "input_ids": {0: batch, 1: seq_length},
        "token_type_ids": {0: batch, 1: seq_length},
        "attention_mask": {0: batch, 1: seq_length},
    }
    inputs = dict(
        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to(
            torch.int64
        ),
        token_type_ids=torch.zeros((batch_size, sequence_length)).to(torch.int64),
        attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64),
    )
    return dict(inputs=inputs, dynamic_shapes=shapes)


def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]:
    """
    Input kwargs.

    If the configuration is None, the function selects typical dimensions.
    """
    if config is not None:
        check_hasattr(config, "vocab_size")
    kwargs = dict(
        batch_size=2,
        sequence_length=30,
        dummy_max_token_id=31999 if config is None else (config.vocab_size - 1),
    )
    return kwargs, get_inputs
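For reference, a minimal sketch of how the two functions above are typically combined (the export call at the end is illustrative, not part of this PR):

    import torch
    from onnx_diagnostic.tasks import text_classification

    # With config=None the function falls back to typical dimensions.
    kwargs, fct = text_classification.random_input_kwargs(None, "text-classification")
    data = fct(None, None, **kwargs)  # model and config are unused by get_inputs
    inputs, shapes = data["inputs"], data["dynamic_shapes"]
    # inputs["input_ids"] has shape (2, 30) with ids below dummy_max_token_id
    # torch.export.export(model, (), kwargs=inputs, dynamic_shapes=shapes)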
4 changes: 2 additions & 2 deletions onnx_diagnostic/tasks/zero_shot_image_classification.py
@@ -62,9 +62,9 @@ def get_inputs(
    ), f"Unexpected type for input_height {type(input_height)}{config}"

    batch = torch.export.Dim("batch", min=1, max=1024)
-    seq_length = torch.export.Dim("seq_length", min=1, max=4096)
+    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
    shapes = {
-        "inputs_ids": {0: batch, 1: seq_length},
+        "input_ids": {0: batch, 1: seq_length},
        "attention_mask": {0: batch, 1: seq_length},
        "pixel_values": {
            0: torch.export.Dim("batch_img", min=1, max=1024),
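The first change in this hunk swaps a torch.export.Dim for a plain string: recent torch.export releases accept a string inside dynamic_shapes as a named dynamic dimension without explicit bounds. A small sketch of the two spellings (the stated motivation is an assumption):

    import torch

    batch = torch.export.Dim("batch", min=1, max=1024)
    # Dim object with explicit bounds:
    shapes_dim = {"input_ids": {0: batch, 1: torch.export.Dim("seq_length", min=1, max=4096)}}
    # String shorthand, no min/max constraints attached:
    shapes_str = {"input_ids": {0: batch, 1: "seq_length"}}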
2 changes: 2 additions & 0 deletions onnx_diagnostic/torch_models/hghub/hub_data.py
@@ -14,6 +14,7 @@
AlbertModel,feature-extraction
BeitForImageClassification,image-classification
BertForMaskedLM,fill-mask
BertForSequenceClassification,text-classification
BigBirdModel,feature-extraction
BlenderbotModel,feature-extraction
BloomModel,feature-extraction
@@ -145,6 +146,7 @@
"no-pipeline-tag",
"object-detection",
"reinforcement-learning",
"text-classification",
"text-generation",
"text-to-audio",
"text2text-generation",
34 changes: 34 additions & 0 deletions onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py
@@ -3468,3 +3468,37 @@ def _ccached_google_bert_bert_base_multilingual_cased():
            "vocab_size": 119547,
        }
    )


def _ccached_intel_bert_base_uncased_mrpc():
    "Intel/bert-base-uncased-mrpc"
    return transformers.BertConfig(
        **{
            "_name_or_path": "bert-base-uncased",
            "architectures": ["BertForSequenceClassification"],
            "attention_probs_dropout_prob": 0.1,
            "classifier_dropout": None,
            "finetuning_task": "mrpc",
            "gradient_checkpointing": False,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 768,
            "id2label": {"0": "not_equivalent", "1": "equivalent"},
            "initializer_range": 0.02,
            "intermediate_size": 3072,
            "label2id": {"equivalent": 1, "not_equivalent": 0},
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "bert",
            "num_attention_heads": 12,
            "num_hidden_layers": 12,
            "pad_token_id": 0,
            "position_embedding_type": "absolute",
            "problem_type": "single_label_classification",
            "torch_dtype": "float32",
            "transformers_version": "4.17.0",
            "type_vocab_size": 2,
            "use_cache": True,
            "vocab_size": 30522,
        }
    )
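These cached configurations let the test suite build untrained models without network access. A minimal sketch of instantiating one directly (the bare constructor call is illustrative):

    import transformers

    config = _ccached_intel_bert_base_uncased_mrpc()
    # Untrained model with random weights matching the architecture; no download.
    model = transformers.BertForSequenceClassification(config)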
14 changes: 10 additions & 4 deletions onnx_diagnostic/torch_models/hghub/model_inputs.py
@@ -105,6 +105,7 @@ def get_untrained_model_with_inputs(
    # outputs even with the same inputs in training mode.
    model.eval()
    res = fct(model, config, **kwargs)

    res["input_kwargs"] = kwargs
    res["model_kwargs"] = mkwargs

@@ -118,19 +119,24 @@
    update = {}
    for k, v in res.items():
        if k.startswith(("inputs", "dynamic_shapes")) and isinstance(v, dict):
-            update[k] = filter_out_unexpected_inputs(model, v)
+            update[k] = filter_out_unexpected_inputs(model, v, verbose=verbose)
    res.update(update)
    return res


-def filter_out_unexpected_inputs(model: torch.nn.Module, kwargs: Dict[str, Any]):
+def filter_out_unexpected_inputs(
+    model: torch.nn.Module, kwargs: Dict[str, Any], verbose: int = 0
+):
    """
    Removes input names from kwargs when no matching parameter name is found
    in ``model.forward``.
    """
    sig = inspect.signature(model.forward)
    allowed = set(sig.parameters)
-    kwargs = {k: v for k, v in kwargs.items() if k in allowed}
-    return kwargs
+    new_kwargs = {k: v for k, v in kwargs.items() if k in allowed}
+    diff = set(kwargs) - set(new_kwargs)
+    if diff and verbose:
+        print(f"[filter_out_unexpected_inputs] removed {diff}")
+    return new_kwargs


def compute_model_size(model: torch.nn.Module) -> Tuple[int, int]: