diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 49c2bfbb..faf0ea99 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -42,6 +42,6 @@ jobs: print_all: false timeout: 2 retry_count# : 2 - exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311 - exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/,https://codecov.io/,https://huggingface.co/ + exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311,https://www.linux.org/ + exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/,https://codecov.io/,https://huggingface.co/,https://www.linux.org/ # force_pass : true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20f4fdfd..b89177a9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: matrix: os: [ubuntu-latest] python: ['3.10', '3.11', '3.12', '3.13'] - transformers: ['4.48.3', '4.51.3', '4.52.4', '4.53.3', '4.55.4', '4.56.1', 'main'] + transformers: ['4.48.3', '4.51.3', '4.52.4', '4.55.4', '4.56.2', 'main'] torch: ['2.8', 'main'] exclude: - python: '3.10' @@ -26,22 +26,18 @@ jobs: transformers: 'main' - python: '3.10' transformers: '4.52.4' - - python: '3.10' - transformers: '4.53.3' - python: '3.10' transformers: '4.55.4' - python: '3.10' - transformers: '4.56.1' + transformers: '4.56.2' 
- python: '3.11' torch: 'main' - - python: '3.11' - transformers: '4.53.3' - python: '3.11' transformers: 'main' - python: '3.11' transformers: '4.55.4' - python: '3.11' - transformers: '4.56.1' + transformers: '4.56.2' - python: '3.13' torch: '2.8' - python: '3.13' @@ -49,7 +45,9 @@ jobs: - python: '3.13' transformers: '4.51.3' - python: '3.13' - transformers: '4.52.4' + transformers: '4.55.4' + - python: '3.13' + transformers: '4.56.2' steps: - uses: actions/checkout@v3 diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst index 44d67518..bf6e231c 100644 --- a/CHANGELOGS.rst +++ b/CHANGELOGS.rst @@ -4,6 +4,8 @@ Change Logs 0.7.13 ++++++ +* :pr:`247`: supports more gemma models with ModelBuilder +* :pr:`246`: add a set of inputs checking a model works with an empty cache for task text-generation * :pr:`237`: dummy inputs for google/gemma-3-4b-it * :pr:`244`: add a patch to bypass the exception raised when the dynamic dimension is in {0,1} diff --git a/_doc/patches.rst b/_doc/patches.rst index d174c3df..a397d11c 100644 --- a/_doc/patches.rst +++ b/_doc/patches.rst @@ -91,7 +91,10 @@ Here is the list of available patches: for name, cls in p.__dict__.items(): if name.startswith("patched_") and hasattr(cls, "_PATCHES_"): - print(f"{cls._PATCHED_CLASS_.__name__}: {', '.join(cls._PATCHES_)}") + print( + f"{cls._PATCHED_CLASS_.__name__}: " + f"{', '.join([_ for _ in cls._PATCHES_ if _ is not None])}" + ) Cache serialization =================== diff --git a/_doc/status/patches_coverage.rst b/_doc/status/patches_coverage.rst index 61d5b775..c2cdc37f 100644 --- a/_doc/status/patches_coverage.rst +++ b/_doc/status/patches_coverage.rst @@ -32,7 +32,10 @@ for transformers. for name, cls in p.__dict__.items(): if name.startswith("patched_") and hasattr(cls, "_PATCHES_"): - print(f"{cls._PATCHED_CLASS_.__name__}: {', '.join(cls._PATCHES_)}") + print( + f"{cls._PATCHED_CLASS_.__name__}: " + f"{', '.join([_ for _ in cls._PATCHES_ if _ is not None])}" + ) Half Automated Rewrites for Control Flows ========================================= diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py index 57c83b55..975372ec 100644 --- a/_unittests/ut_tasks/test_tasks.py +++ b/_unittests/ut_tasks/test_tasks.py @@ -48,6 +48,27 @@ def test_text_generation(self): model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False ) + def test_text_generation_empty_cache(self): + mid = "arnir0/Tiny-LLM" + data = get_untrained_model_with_inputs(mid, add_second_input=True) + model, inputs = data["model"], data["inputs"] + self.assertIn("inputs_empty_cache", data) + empty_inputs = torch_deepcopy(data["inputs_empty_cache"]) + model(**torch_deepcopy(empty_inputs)) + expected = model(**torch_deepcopy(inputs)) + self.assertEqual( + {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs) + ) + with torch_export_patches(patch_transformers=True, verbose=1): + ep = torch.export.export( + model, + (), + kwargs=torch_deepcopy(inputs), + dynamic_shapes=use_dyn_not_str(data["dynamic_shapes"]), + ) + got = ep.module()(**torch_deepcopy(inputs)) + self.assertEqualArrayAny(expected, got) + @hide_stdout() def test_automatic_speech_recognition_float32(self): mid = "openai/whisper-tiny" diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py index af60feaf..185cf57d 100644 --- a/_unittests/ut_tasks/try_tasks.py +++ b/_unittests/ut_tasks/try_tasks.py @@ -5,8 +5,8 @@ from onnx_diagnostic.helpers import string_type from onnx_diagnostic.helpers.cache_helper import
make_dynamic_cache, make_encoder_decoder_cache from onnx_diagnostic.helpers.torch_helper import steal_forward -from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs from onnx_diagnostic.torch_export_patches import torch_export_patches +from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs class TestHuggingFaceHubModel(ExtTestCase): @@ -132,6 +132,52 @@ def test_text2text_generation_static(self): ) print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) + @never_test() + def test_text_generation_tiny_llm(self): + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k tiny_llm + """ + dict(cache_position:T7s21, + past_key_values:DynamicCache(key_cache=#0[], value_cache=#0[]), + input_ids:T7s1x21, + position_ids:T7s1x21 + attention_mask:T1s1x21) + dict(cache_position:T7s1, + past_key_values:DynamicCache(key_cache=#32[T1s1x8x21x128,...], + value_cache=#32[T1s1x8x21x128,...]), + input_ids:T7s1x21, + position_ids:T7s1x1 + attention_mask:T1s1x1) + """ + from transformers import AutoTokenizer, AutoModelForCausalLM + + tokenizer = AutoTokenizer.from_pretrained("arnir0/Tiny-LLM") + model = AutoModelForCausalLM.from_pretrained("arnir0/Tiny-LLM") + + text = "def greet(user): print(f'hello !')" + input_ids = tokenizer(text, return_tensors="pt").input_ids.reshape((1, -1)) + mask = ( + torch.tensor([1 for i in range(input_ids.shape[1])]) + .to(torch.int64) + .reshape((1, -1)) + ) + position_ids = torch.arange(input_ids.shape[1], dtype=torch.int64).reshape((1, -1)) + + # simply generate a single sequence + print() + with ( + torch_export_patches( + patch_transformers=True, patch_torch=False, patch_sympy=False + ), + steal_forward(model), + ): + generated_ids = model.generate( + input_ids=input_ids, + max_length=100, + attention_mask=mask, + position_ids=position_ids, + ) + print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) + @never_test() def test_text_generation_phi4_mini(self): # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_mini diff --git a/_unittests/ut_torch_models/test_validate_whole_models.py b/_unittests/ut_torch_models/test_validate_whole_models.py index a29be5cf..830ee0b2 100644 --- a/_unittests/ut_torch_models/test_validate_whole_models.py +++ b/_unittests/ut_torch_models/test_validate_whole_models.py @@ -227,7 +227,7 @@ def test_m_validate_model_vit_model(self): self.assertIsInstance(summary, dict) self.assertIsInstance(data, dict) self.assertLess(summary["disc_onnx_ort_run_abs"], 1e-3) - self.assertLess(summary["disc_onnx_ort_run2_abs"], 1e-3) + self.assertLess(summary["disc_onnx_ort_run22_abs"], 1e-3) self.assertEqual("dict(pixel_values:A1s2x3x30x30)", summary["run_feeds_inputs"]) self.assertEqual("dict(pixel_values:A1s3x3x31x31)", summary["run_feeds_inputs2"]) self.assertEqual("#1[A1s2x2]", summary["run_output_inputs"]) diff --git a/onnx_diagnostic/helpers/torch_helper.py b/onnx_diagnostic/helpers/torch_helper.py index 645c719c..60708d89 100644 --- a/onnx_diagnostic/helpers/torch_helper.py +++ b/onnx_diagnostic/helpers/torch_helper.py @@ -779,7 +779,12 @@ def to_any(value: Any, to_value: Union[torch.dtype, torch.device, str]) -> Any: def torch_deepcopy(value: Any) -> Any: - """Makes a deepcopy.""" + """ + Makes a deep copy.
+ + :param value: any value + :return: a deep copy + """ if value is None: return None if isinstance(value, (int, float, str)): diff --git a/onnx_diagnostic/tasks/text_generation.py b/onnx_diagnostic/tasks/text_generation.py index 6e6e29ba..964e0462 100644 --- a/onnx_diagnostic/tasks/text_generation.py +++ b/onnx_diagnostic/tasks/text_generation.py @@ -269,6 +269,21 @@ def get_inputs( add_second_input=0, **kwargs, )["inputs"] + res["inputs_empty_cache"] = get_inputs( + model=model, + config=config, + dummy_max_token_id=dummy_max_token_id, + num_hidden_layers=num_hidden_layers, + batch_size=batch_size, + sequence_length=0, + sequence_length2=sequence_length2, + dynamic_rope=dynamic_rope, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + cls_cache=cls_cache, + add_second_input=0, + **kwargs, + )["inputs"] return res diff --git a/onnx_diagnostic/torch_export_patches/onnx_export_errors.py b/onnx_diagnostic/torch_export_patches/onnx_export_errors.py index dbe6dc9c..0ed5692c 100644 --- a/onnx_diagnostic/torch_export_patches/onnx_export_errors.py +++ b/onnx_diagnostic/torch_export_patches/onnx_export_errors.py @@ -83,7 +83,7 @@ def patch_module_or_classes(mod, verbose: int = 0) -> Dict[type, Dict[type, Call continue original = cls._PATCHED_CLASS_ - methods = cls._PATCHES_ + methods = [_ for _ in cls._PATCHES_ if _ is not None] if verbose: print(f"[patch_module_or_classes] {name}.{cls.__name__}: {', '.join(methods)}") diff --git a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py index d0af7e10..3fe0ba83 100644 --- a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +++ b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py @@ -9,6 +9,12 @@ import transformers from transformers.modeling_attn_mask_utils import AttentionMaskConverter from transformers.cache_utils import StaticCache, Cache +from transformers.generation.utils import ( + GenerateNonBeamOutput, + GenerationConfig, + StoppingCriteriaList, + LogitsProcessorList, +) try: from transformers.cache_utils import parse_processor_args # noqa: F401 @@ -459,18 +465,18 @@ class patched_GenerationMixin: """ _PATCHES_ = [ - name - for name in [ - "_cache_dependant_input_preparation", - "_cache_dependant_input_preparation_exporting", - ( - None - if pv.Version(transformers.__version__) >= pv.Version("4.56") - else "prepare_inputs_for_generation" - ), - "_sample", - ] - if name is not None + "_cache_dependant_input_preparation", + "_cache_dependant_input_preparation_exporting", + ( + None + if pv.Version(transformers.__version__) >= pv.Version("4.56") + else "prepare_inputs_for_generation" + ), + ( + "_sample" + if pv.Version(transformers.__version__) == pv.Version("4.57.0.dev0") + else None + ), ] _PATCHED_CLASS_ = transformers.generation.utils.GenerationMixin @@ -603,7 +609,7 @@ def prepare_inputs_for_generation( model_inputs = {} # - some models don't have `Cache` support # (which implies they don't expect `cache_position` in `forward`) - if self._supports_cache_class: + if getattr(self, "_supports_cache_class", False): model_inputs["cache_position"] = cache_position # - `cache_position` was not a mandatory input in # `prepare_inputs_for_generation` for those models, and this @@ -832,8 +838,6 @@ def _sample( else: outputs = model_forward(**model_inputs, return_dict=True) - # synced_gpus: don't waste resources running the code we don't need; - # kwargs must be updated before skipping model_kwargs = 
self._update_model_kwargs_for_generation( outputs, model_kwargs, @@ -842,9 +846,6 @@ def _sample( if synced_gpus and this_peer_finished: continue - # Copy is needed to avoid keeping a hanging ref to outputs.logits - # which may be very large for first iteration - # (the clone itself is always small) next_token_logits = outputs.logits[:, -1, :].to( copy=True, dtype=torch.float32, device=input_ids.device ) diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py index be5f6f95..2c65d1a4 100644 --- a/onnx_diagnostic/torch_models/hghub/model_inputs.py +++ b/onnx_diagnostic/torch_models/hghub/model_inputs.py @@ -57,7 +57,7 @@ def get_untrained_model_with_inputs( to get a smaller model :param use_pretrained: download the pretrained weights as well :param use_preinstalled: use preinstalled configurations - :param add_second_input: provides a second inputs to check a model + :param add_second_input: provides other inputs to check a model supports different shapes :param subfolder: subfolder to use for this model id :param use_only_preinstalled: use only preinstalled version diff --git a/onnx_diagnostic/torch_models/validate.py b/onnx_diagnostic/torch_models/validate.py index 4a691a74..719f3828 100644 --- a/onnx_diagnostic/torch_models/validate.py +++ b/onnx_diagnostic/torch_models/validate.py @@ -380,9 +380,10 @@ def validate_model( ``orteval10``, ``ref`` only if `do_run` is true :param repeat: number of time to measure the model :param warmup: warmup the model first - :param inputs2: checks that the second set of inputs is reunning as well, + :param inputs2: checks that other sets of inputs are running as well, this ensures that the model does support dynamism, the value is used - as an increment to the first set of values (added to dimensions) + as an increment to the first set of values (added to dimensions), + or an empty cache for example :param output_names: output names the onnx exporter should use :param ort_logs: increases onnxruntime verbosity when creating the session :return: two dictionaries, one with some metrics, @@ -412,6 +413,11 @@ def validate_model( :class:`onnx_diagnostic.reference.ExtendedReferenceEvaluator` if ``runtime == 'ref'``, ``orteval10`` increases the verbosity. + + .. versionchanged:: 0.7.13 + *inputs2* no longer means only a second set of inputs but several, + such as ``inputs_empty_cache``, + which refers to a set of inputs using an empty cache. """ validation_begin = time.perf_counter() model_id, subfolder, same_as_pretrained, use_pretrained = _preprocess_model_id( @@ -531,10 +537,9 @@ def validate_model( ) ), ) - assert not inputs2 or "inputs2" in data, ( - f"inputs2 is True but second set is missing in data for " - f"model id {model_id!r}: {sorted(data)}" - ) + + second_input_keys = [k for k in data if k.startswith("inputs") and k != "inputs"] + if dump_folder: with open(os.path.join(dump_folder, "model_config.txt"), "w") as f: f.write(f"model_id: {model_id}\n------\n") @@ -627,16 +632,14 @@ def validate_model( if verbose: print(f"[validate_model] new inputs: {string_type(data['inputs'])}") print(f"[validate_model] new dynamic_hapes: {string_type(data['dynamic_shapes'])}") - if inputs2: - assert ( - "inputs2" in data - ), "Cannot test a second set of inputs as it was not defined."
- data["inputs2"], _ = filter_inputs( - data["inputs2"], - drop_names=drop_inputs, - model=data["model"], - dynamic_shapes=data["dynamic_shapes"], - ) + if second_input_keys: + for k in second_input_keys: + data[k], _ = filter_inputs( + data[k], + drop_names=drop_inputs, + model=data["model"], + dynamic_shapes=data["dynamic_shapes"], + ) if not empty(dtype): if isinstance(dtype, str): @@ -646,8 +649,9 @@ def validate_model( data["model"] = to_any(data["model"], dtype) # type: ignore data["inputs"] = to_any(data["inputs"], dtype) # type: ignore summary["model_dtype"] = str(dtype) - if "inputs2" in data: - data["inputs2"] = to_any(data["inputs2"], dtype) # type: ignore + if second_input_keys: + for k in second_input_keys: + data[k] = to_any(data[k], dtype) # type: ignore if not empty(device): if verbose: @@ -655,11 +659,13 @@ def validate_model( data["model"] = to_any(data["model"], device) # type: ignore data["inputs"] = to_any(data["inputs"], device) # type: ignore summary["model_device"] = str(device) - if "inputs2" in data: - data["inputs2"] = to_any(data["inputs2"], device) # type: ignore + if second_input_keys: + for k in second_input_keys: + data[k] = to_any(data[k], device) # type: ignore for k in ["task", "size", "n_weights"]: summary[f"model_{k.replace('_','')}"] = data[k] + summary["second_input_keys"] = ",".join(second_input_keys) summary["model_inputs_options"] = str(input_options or "") summary["model_inputs"] = string_type(data["inputs"], with_shape=True) summary["model_shapes"] = string_type(data["dynamic_shapes"]) @@ -686,6 +692,7 @@ def validate_model( print(f"[validate_model] +INPUT {k}={string_type(v, with_shape=True)}") for k, v in data["dynamic_shapes"].items(): print(f"[validate_model] +SHAPE {k}={string_type(v)}") + print(f"[validate_model] second_input_keys={second_input_keys}") print("[validate_model] --") if do_run: @@ -694,10 +701,20 @@ def validate_model( _validate_do_run_model( data, summary, "inputs", "run", "run_expected", verbose, repeat, warmup, quiet ) - if inputs2: - _validate_do_run_model( - data, summary, "inputs2", "run2", "run_expected2", verbose, 1, 0, quiet - ) + if second_input_keys: + for k in second_input_keys: + _validate_do_run_model( + data, + summary, + k, + f"run2{k[6:]}", + f"run_expected2{k[6:]}", + verbose, + 1, + 0, + quiet, + ) + summary["time_total_validation_torch"] = time.perf_counter() - validation_begin if exporter: @@ -836,7 +853,7 @@ def validate_model( runtime=runtime, repeat=repeat, warmup=warmup, - inputs2=inputs2, + second_input_keys=second_input_keys, ort_logs=ort_logs, ) summary.update(summary_valid) @@ -899,7 +916,7 @@ def validate_model( runtime=runtime, repeat=repeat, warmup=warmup, - inputs2=inputs2, + second_input_keys=second_input_keys, ) summary.update(summary_valid) @@ -1283,7 +1300,7 @@ def validate_onnx_model( runtime: str = "onnxruntime", repeat: int = 1, warmup: int = 0, - inputs2: int = 1, + second_input_keys: Optional[List[str]] = None, ort_logs: bool = False, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ @@ -1300,7 +1317,7 @@ def validate_onnx_model( :param runtime: onnx runtime to use, onnxruntime, torch, orteval, ref :param repeat: run that number of times the model :param warmup: warmup the model - :param inputs2: to validate the model on the second input set + :param second_input_keys: to validate the model on other input sets to make sure the exported model supports dynamism, the value is used as an increment added to the first set of inputs (added to dimensions) :param ort_logs: triggers the logs for 
onnxruntime @@ -1425,10 +1442,12 @@ def _mk(key, flavour=flavour): print(f"[validate_onnx_model] done (ort_session) flavour={flavour!r}") keys = [("inputs", "run_expected", "")] - if inputs2: - keys.append(("inputs2", "run_expected2", "2")) + if second_input_keys: + keys.extend([(k, f"run_expected2{k[6:]}", f"2{k[6:]}") for k in second_input_keys]) for k_input, k_expected, suffix in keys: # make_feeds + assert k_input in data, f"Unable to find {k_input!r} in {sorted(data)}" + assert k_expected in data, f"Unable to find {k_expected!r} in {sorted(data)}" if verbose: print(f"[validate_onnx_model] -- make_feeds for {k_input!r}...") print(