diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index abedf352..3dd067aa 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
 0.4.0
 +++++
 
+* :pr:`55`: add support for text-classification
+* :pr:`54`: add support for fill-mask, refactoring
 * :pr:`52`: add support for zero-shot-image-classification
 * :pr:`50`: add support for onnxruntime fusion
 * :pr:`48`: add support for EncoderDecoderCache, test with openai/whisper-tiny
diff --git a/_doc/api/tasks/index.rst b/_doc/api/tasks/index.rst
index ae7e14b3..4004b8f8 100644
--- a/_doc/api/tasks/index.rst
+++ b/_doc/api/tasks/index.rst
@@ -8,7 +8,8 @@ onnx_diagnostic.tasks
     automatic_speech_recognition
     fill_mask
     image_classification
-    image_text_to_text
+    image_text_to_text
+    text_classification
     text_generation
     text2text_generation
     zero_shot_image_classification
diff --git a/_doc/api/tasks/text_classification.rst b/_doc/api/tasks/text_classification.rst
new file mode 100644
index 00000000..22b53799
--- /dev/null
+++ b/_doc/api/tasks/text_classification.rst
@@ -0,0 +1,7 @@
+
+onnx_diagnostic.tasks.text_classification
+=========================================
+
+.. automodule:: onnx_diagnostic.tasks.text_classification
+    :members:
+    :no-undoc-members:
diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py
index e52a0697..30139048 100644
--- a/_unittests/ut_tasks/test_tasks.py
+++ b/_unittests/ut_tasks/test_tasks.py
@@ -100,6 +100,15 @@ def test_fill_mask(self):
         model, inputs = data["model"], data["inputs"]
         model(**inputs)
 
+    @hide_stdout()
+    def test_text_classification(self):
+        mid = "Intel/bert-base-uncased-mrpc"
+        # mid = "Salesforce/codet5-small"
+        data = get_untrained_model_with_inputs(mid, verbose=1)
+        self.assertIn((data["size"], data["n_weights"]), [(154420232, 38605058)])
+        model, inputs = data["model"], data["inputs"]
+        model(**inputs)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py
index e05a2161..358eb40d 100644
--- a/_unittests/ut_tasks/try_tasks.py
+++ b/_unittests/ut_tasks/try_tasks.py
@@ -211,6 +211,31 @@ def test_fill_mask(self):
         output = model(**encoded_input)
         print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
 
+    @never_test()
+    def test_text_classification(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k text_cl
+        # https://huggingface.co/Intel/bert-base-uncased-mrpc
+
+        from transformers import BertTokenizer, BertModel
+
+        tokenizer = BertTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc")
+        model = BertModel.from_pretrained("Intel/bert-base-uncased-mrpc")
+        text = "The inspector analyzed the soundness in the building."
+        encoded_input = tokenizer(text, return_tensors="pt")
+        print()
+        print("-- inputs", string_type(encoded_input, with_shape=True, with_min_max=True))
+        output = model(**encoded_input)
+        print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
+        # prints a BaseModelOutputWithPoolingAndCrossAttentions with pooler_output
+
+        # Print the tokens and token ids of the input string
+        print("Tokenized Text: ", tokenizer.tokenize(text), "\n")
+        print("Token IDs: ", tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))
+
+        # Map the encoded ids back to tokens
+        ids = encoded_input["input_ids"][0]
+        print(tokenizer.convert_ids_to_tokens(ids))
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_xrun_doc/test_documentation_recipes.py b/_unittests/ut_xrun_doc/test_documentation_recipes.py
index d057b744..cef7e9b4 100644
--- a/_unittests/ut_xrun_doc/test_documentation_recipes.py
+++ b/_unittests/ut_xrun_doc/test_documentation_recipes.py
@@ -53,7 +53,10 @@ def run_test(self, fold: str, name: str, verbose=0) -> int:
             # dot not installed, this part
             # is tested in onnx framework
             raise unittest.SkipTest(f"failed: {name!r} due to missing dot.")
-        if "We couldn't connect to 'https://huggingface.co'" in st:
+        if (
+            "We couldn't connect to 'https://huggingface.co'" in st
+            or "Cannot access content at: https://huggingface.co/" in st
+        ):
             raise unittest.SkipTest(f"Connectivity issues due to\n{err}")
         raise AssertionError(  # noqa: B904
             "Example '{}' (cmd: {} - exec_prefix='{}') "
diff --git a/onnx_diagnostic/tasks/__init__.py b/onnx_diagnostic/tasks/__init__.py
index b2d01d36..0a770514 100644
--- a/onnx_diagnostic/tasks/__init__.py
+++ b/onnx_diagnostic/tasks/__init__.py
@@ -4,6 +4,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    text_classification,
     text_generation,
     text2text_generation,
     zero_shot_image_classification,
@@ -14,6 +15,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    text_classification,
     text_generation,
     text2text_generation,
     zero_shot_image_classification,
diff --git a/onnx_diagnostic/tasks/text_classification.py b/onnx_diagnostic/tasks/text_classification.py
new file mode 100644
index 00000000..5ed6329b
--- /dev/null
+++ b/onnx_diagnostic/tasks/text_classification.py
@@ -0,0 +1,67 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import update_config, check_hasattr
+
+__TASK__ = "text-classification"
+
+
+def reduce_model_config(config: Any, task: str) -> Dict[str, Any]:
+    """Reduces a model size."""
+    check_hasattr(config, "num_attention_heads", "num_hidden_layers")
+    kwargs = dict(
+        num_hidden_layers=min(config.num_hidden_layers, 2),
+        num_attention_heads=min(config.num_attention_heads, 4),
+    )
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    batch_size: int,
+    sequence_length: int,
+    dummy_max_token_id: int,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``text-classification``.
+
+    Example:
+
+    ::
+
+        input_ids:T7s1x13[101,72654:A16789.23076923077],
+        token_type_ids:T7s1x13[0,0:A0.0],
+        attention_mask:T7s1x13[1,1:A1.0]
+    """
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    seq_length = torch.export.Dim("sequence_length", min=1, max=1024)
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "token_type_ids": {0: batch, 1: seq_length},
+        "attention_mask": {0: batch, 1: seq_length},
+    }
+    inputs = dict(
+        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to(
+            torch.int64
+        ),
+        token_type_ids=torch.zeros((batch_size, sequence_length)).to(torch.int64),
+        attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64),
+    )
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+
+
+def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Returns the input kwargs and the function generating the inputs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        check_hasattr(config, "vocab_size")
+    kwargs = dict(
+        batch_size=2,
+        sequence_length=30,
+        dummy_max_token_id=31999 if config is None else (config.vocab_size - 1),
+    )
+    return kwargs, get_inputs
diff --git a/onnx_diagnostic/tasks/zero_shot_image_classification.py b/onnx_diagnostic/tasks/zero_shot_image_classification.py
index 87f774e5..10ce76a1 100644
--- a/onnx_diagnostic/tasks/zero_shot_image_classification.py
+++ b/onnx_diagnostic/tasks/zero_shot_image_classification.py
@@ -62,9 +62,9 @@ def get_inputs(
     ), f"Unexpected type for input_height {type(input_height)}{config}"
 
     batch = torch.export.Dim("batch", min=1, max=1024)
-    seq_length = torch.export.Dim("seq_length", min=1, max=4096)
+    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
     shapes = {
-        "inputs_ids": {0: batch, 1: seq_length},
+        "input_ids": {0: batch, 1: seq_length},
         "attention_mask": {0: batch, 1: seq_length},
         "pixel_values": {
             0: torch.export.Dim("batch_img", min=1, max=1024),
diff --git a/onnx_diagnostic/torch_models/hghub/hub_data.py b/onnx_diagnostic/torch_models/hghub/hub_data.py
index 4da7f968..d5a70925 100644
--- a/onnx_diagnostic/torch_models/hghub/hub_data.py
+++ b/onnx_diagnostic/torch_models/hghub/hub_data.py
@@ -14,6 +14,7 @@
     AlbertModel,feature-extraction
     BeitForImageClassification,image-classification
    BertForMaskedLM,fill-mask
+    BertForSequenceClassification,text-classification
     BigBirdModel,feature-extraction
     BlenderbotModel,feature-extraction
     BloomModel,feature-extraction
@@ -145,6 +146,7 @@
     "no-pipeline-tag",
     "object-detection",
     "reinforcement-learning",
+    "text-classification",
     "text-generation",
     "text-to-audio",
     "text2text-generation",
diff --git a/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py b/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py
index 89786821..0daf939a 100644
--- a/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py
+++ b/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py
@@ -3468,3 +3468,37 @@ def _ccached_google_bert_bert_base_multilingual_cased():
             "vocab_size": 119547,
         }
     )
+
+
+def _ccached_intel_bert_base_uncased_mrpc():
+    "Intel/bert-base-uncased-mrpc"
+    return transformers.BertConfig(
+        **{
+            "_name_or_path": "bert-base-uncased",
+            "architectures": ["BertForSequenceClassification"],
+            "attention_probs_dropout_prob": 0.1,
+            "classifier_dropout": None,
+            "finetuning_task": "mrpc",
+            "gradient_checkpointing": False,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "hidden_size": 768,
+            "id2label": {"0": "not_equivalent", "1": "equivalent"},
"initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": {"equivalent": 1, "not_equivalent": 0}, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "torch_dtype": "float32", + "transformers_version": "4.17.0", + "type_vocab_size": 2, + "use_cache": true, + "vocab_size": 30522, + } + ) diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py index 0625b0f6..400a7385 100644 --- a/onnx_diagnostic/torch_models/hghub/model_inputs.py +++ b/onnx_diagnostic/torch_models/hghub/model_inputs.py @@ -105,6 +105,7 @@ def get_untrained_model_with_inputs( # outputs even with the same inputs in training mode. model.eval() res = fct(model, config, **kwargs) + res["input_kwargs"] = kwargs res["model_kwargs"] = mkwargs @@ -118,19 +119,24 @@ def get_untrained_model_with_inputs( update = {} for k, v in res.items(): if k.startswith(("inputs", "dynamic_shapes")) and isinstance(v, dict): - update[k] = filter_out_unexpected_inputs(model, v) + update[k] = filter_out_unexpected_inputs(model, v, verbose=verbose) res.update(update) return res -def filter_out_unexpected_inputs(model: torch.nn.Module, kwargs: Dict[str, Any]): +def filter_out_unexpected_inputs( + model: torch.nn.Module, kwargs: Dict[str, Any], verbose: int = 0 +): """ Removes input names in kwargs if no parameter names was found in ``model.forward``. """ sig = inspect.signature(model.forward) allowed = set(sig.parameters) - kwargs = {k: v for k, v in kwargs.items() if k in allowed} - return kwargs + new_kwargs = {k: v for k, v in kwargs.items() if k in allowed} + diff = set(kwargs) - set(new_kwargs) + if diff and verbose: + print(f"[filter_out_unexpected_inputs] removed {diff}") + return new_kwargs def compute_model_size(model: torch.nn.Module) -> Tuple[int, int]: