diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index a0809cb4..2cfbdc43 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
 0.7.11
 ++++++
 
+* :pr:`223`: adds task image-to-video
+* :pr:`220`: adds option --ort-logs to display onnxruntime logs when creating the session
 * :pr:`220`: adds a patch for PR `#40791 <https://github.com/huggingface/transformers/pull/40791>`_ in transformers
 
 0.7.10
diff --git a/_unittests/ut_tasks/test_tasks_image_to_video.py b/_unittests/ut_tasks/test_tasks_image_to_video.py
new file mode 100644
index 00000000..dc40697f
--- /dev/null
+++ b/_unittests/ut_tasks/test_tasks_image_to_video.py
@@ -0,0 +1,68 @@
+import unittest
+import torch
+import transformers
+from onnx_diagnostic.ext_test_case import (
+    ExtTestCase,
+    hide_stdout,
+    requires_diffusers,
+    requires_torch,
+    requires_transformers,
+)
+from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
+from onnx_diagnostic.torch_export_patches import torch_export_patches
+from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
+
+
+class TestTasksImageToVideo(ExtTestCase):
+    @hide_stdout()
+    @requires_diffusers("0.35")
+    @requires_transformers("4.55")
+    @requires_torch("2.8.99")
+    def test_image_to_video(self):
+        kwargs = {
+            "_diffusers_version": "0.34.0.dev0",
+            "_class_name": "CosmosTransformer3DModel",
+            "max_size": [128, 240, 240],
+            "text_embed_dim": 128,
+            "use_cache": True,
+            "in_channels": 3,
+            "out_channels": 16,
+            "num_layers": 2,
+            "model_type": "dia",
+            "patch_size": [1, 2, 2],
+            "rope_scale": [1.0, 3.0, 3.0],
+            "attention_head_dim": 16,
+            "mlp_ratio": 0.4,
+            "initializer_range": 0.02,
+            "num_attention_heads": 16,
+            "is_encoder_decoder": True,
+            "adaln_lora_dim": 16,
+            "concat_padding_mask": True,
+            "extra_pos_embed_type": None,
+        }
+        config = transformers.DiaConfig(**kwargs)
+        mid = "nvidia/Cosmos-Predict2-2B-Video2World"
+        data = get_untrained_model_with_inputs(
+            mid,
+            verbose=1,
+            add_second_input=True,
+            subfolder="transformer",
+            config=config,
+            inputs_kwargs=dict(image_height=8 * 50, image_width=8 * 80),
+        )
+        self.assertEqual(data["task"], "image-to-video")
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        model(**inputs)
+        model(**data["inputs2"])
+        with torch.fx.experimental._config.patch(
+            backed_size_oblivious=True
+        ), torch_export_patches(
+            patch_transformers=True, patch_diffusers=True, verbose=10, stop_if_static=1
+        ):
+            torch.export.export(
+                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
+            )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/onnx_diagnostic/tasks/__init__.py b/onnx_diagnostic/tasks/__init__.py
index 9895f363..3f9da3f1 100644
--- a/onnx_diagnostic/tasks/__init__.py
+++ b/onnx_diagnostic/tasks/__init__.py
@@ -5,6 +5,8 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    image_to_video,
+    mask_generation,
     mixture_of_expert,
     object_detection,
     sentence_similarity,
@@ -14,7 +16,6 @@
     text_to_image,
     text2text_generation,
     zero_shot_image_classification,
-    mask_generation,
 )
 
 __TASKS__ = [
@@ -23,6 +24,8 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    image_to_video,
+    mask_generation,
     mixture_of_expert,
     object_detection,
     sentence_similarity,
@@ -32,7 +35,6 @@
     text_to_image,
     text2text_generation,
     zero_shot_image_classification,
-    mask_generation,
 ]
diff --git a/onnx_diagnostic/tasks/image_to_video.py b/onnx_diagnostic/tasks/image_to_video.py
new file mode 100644
index 00000000..e88e8840
--- /dev/null
+++ b/onnx_diagnostic/tasks/image_to_video.py
@@ -0,0 +1,134 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    default_num_hidden_layers as nhl,
+)
+
+__TASK__ = "image-to-video"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size."""
+    if not hasattr(config, "num_hidden_layers") and not hasattr(config, "num_layers"):
+        # We cannot reduce.
+        return {}
+    check_hasattr(config, ("num_hidden_layers", "num_layers"))
+    kwargs = {}
+    if hasattr(config, "num_layers"):
+        kwargs["num_layers"] = min(config.num_layers, nhl())
+    if hasattr(config, "num_hidden_layers"):
+        kwargs["num_hidden_layers"] = min(config.num_hidden_layers, nhl())
+
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    text_embed_dim: int,
+    latent_channels: int,
+    batch_size: int = 2,
+    image_height: int = 704,
+    image_width: int = 1280,
+    latent_frames: int = 1,
+    text_maxlen: int = 512,
+    add_second_input: int = 1,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``image-to-video``.
+    """
+    assert (
+        "cls_cache" not in kwargs
+    ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
+    latent_height = image_height // 8
+    latent_width = image_width // 8
+    dtype = torch.float32
+
+    inputs = dict(
+        hidden_states=torch.randn(
+            batch_size,
+            latent_channels,
+            latent_frames,
+            latent_height,
+            latent_width,
+            dtype=dtype,
+        ),
+        timestep=torch.tensor([1.0] * batch_size, dtype=dtype),
+        encoder_hidden_states=torch.randn(
+            batch_size, text_maxlen, text_embed_dim, dtype=dtype
+        ),
+        padding_mask=torch.ones(1, 1, image_height, image_width, dtype=dtype),
+        fps=torch.tensor([16] * batch_size, dtype=dtype),
+        condition_mask=torch.randn(
+            batch_size, 1, latent_frames, latent_height, latent_width, dtype=dtype
+        ),
+    )
+    shapes = dict(
+        hidden_states={
+            0: "batch_size",
+            2: "latent_frames",
+            3: "latent_height",
+            4: "latent_width",
+        },
+        timestep={0: "batch_size"},
+        encoder_hidden_states={0: "batch_size"},
+        padding_mask={0: "batch_size", 2: "height", 3: "width"},
+        fps={0: "batch_size"},
+        condition_mask={
+            0: "batch_size",
+            2: "latent_frames",
+            3: "latent_height",
+            4: "latent_width",
+        },
+    )
+    res = dict(inputs=inputs, dynamic_shapes=shapes)
+
+    if add_second_input:
+        assert (
+            add_second_input > 0
+        ), f"Not implemented for add_second_input={add_second_input}."
+        res["inputs2"] = get_inputs(
+            model=model,
+            config=config,
+            text_embed_dim=text_embed_dim,
+            latent_channels=latent_channels,
+            batch_size=batch_size,
+            image_height=image_height,
+            image_width=image_width,
+            latent_frames=latent_frames,
+            text_maxlen=text_maxlen,
+            add_second_input=0,
+            **kwargs,
+        )["inputs"]
+    return res
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
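+
+    A minimal usage sketch (the variable names below are illustrative and
+    assume a configuration exposing ``in_channels`` and ``text_embed_dim``)::
+
+        kwargs, fct = random_input_kwargs(config)
+        data = fct(model, config, **kwargs)
+        inputs, dynamic_shapes = data["inputs"], data["dynamic_shapes"]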
+ """ + if config is not None: + check_hasattr(config, "in_channels", "text_embed_dim"), + kwargs = dict( + text_embed_dim=1024 if config is None else config.text_embed_dim, + latent_channels=16 if config is None else config.in_channels - 1, + batch_size=1, + image_height=8 * 50, + image_width=8 * 80, + latent_frames=1, + text_maxlen=512, + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/torch_models/hghub/hub_api.py b/onnx_diagnostic/torch_models/hghub/hub_api.py index da05f7c0..94f0aa7e 100644 --- a/onnx_diagnostic/torch_models/hghub/hub_api.py +++ b/onnx_diagnostic/torch_models/hghub/hub_api.py @@ -177,6 +177,51 @@ def task_from_arch( return data[arch] +def _trygetattr(config, attname): + try: + return getattr(config, attname) + except AttributeError: + return None + + +def architecture_from_config(config) -> Optional[str]: + """Guesses the architecture (class) of the model described by this config.""" + if isinstance(config, dict): + if "_class_name" in config: + return config["_class_name"] + if "architecture" in config: + return config["architecture"] + if config.get("architectures", []): + return config["architectures"][0] + if hasattr(config, "_class_name"): + return config._class_name + if hasattr(config, "architecture"): + return config.architecture + if hasattr(config, "architectures") and config.architectures: + return config.architectures[0] + if hasattr(config, "__dict__"): + if "_class_name" in config.__dict__: + return config.__dict__["_class_name"] + if "architecture" in config.__dict__: + return config.__dict__["architecture"] + if config.__dict__.get("architectures", []): + return config.__dict__["architectures"][0] + return None + + +def find_package_source(config) -> Optional[str]: + """Guesses the package the class models from.""" + if isinstance(config, dict): + if "_diffusers_version" in config: + return "diffusers" + if hasattr(config, "_diffusers_version"): + return "diffusers" + if hasattr(config, "__dict__"): + if "_diffusers_version" in config.__dict__: + return "diffusers" + return "transformers" + + def task_from_id( model_id: str, default_value: Optional[str] = None, @@ -202,28 +247,30 @@ def task_from_id( if not fall_back_to_pretrained: raise config = get_pretrained_config(model_id, subfolder=subfolder) - try: - return config.pipeline_tag - except AttributeError: - guess = _guess_task_from_config(config) - if guess is not None: - return guess - data = load_architecture_task() - if model_id in data: - return data[model_id] - if type(config) is dict and "_class_name" in config: - return task_from_arch(config["_class_name"], default_value=default_value) - if not config.architectures or not config.architectures: - # Some hardcoded values until a better solution is found. - if model_id.startswith("google/bert_"): - return "fill-mask" - assert config.architectures is not None and len(config.architectures) == 1, ( - f"Cannot return the task of {model_id!r}, pipeline_tag is not setup, " - f"architectures={config.architectures} in config={config}. " - f"The task can be added in " - f"``onnx_diagnostic.torch_models.hghub.hub_data.__data_arch__``." 
-        )
-        return task_from_arch(config.architectures[0], default_value=default_value)
+    tag = _trygetattr(config, "pipeline_tag")
+    if tag is not None:
+        return tag
+
+    guess = _guess_task_from_config(config)
+    if guess is not None:
+        return guess
+    data = load_architecture_task()
+    if subfolder:
+        full_id = f"{model_id}//{subfolder}"
+        if full_id in data:
+            return data[full_id]
+    if model_id in data:
+        return data[model_id]
+    arch = architecture_from_config(config)
+    if arch is None:
+        if model_id.startswith("google/bert_"):
+            return "fill-mask"
+    assert arch is not None, (
+        f"Cannot return the task of {model_id!r}, pipeline_tag is not setup, "
+        f"config={config}. The task can be added in "
+        f"``onnx_diagnostic.torch_models.hghub.hub_data.__data_arch__``."
+    )
+    return task_from_arch(arch, default_value=default_value)
 
 
 def task_from_tags(tags: Union[str, List[str]]) -> str:
diff --git a/onnx_diagnostic/torch_models/hghub/hub_data.py b/onnx_diagnostic/torch_models/hghub/hub_data.py
index 1f29d832..299c37eb 100644
--- a/onnx_diagnostic/torch_models/hghub/hub_data.py
+++ b/onnx_diagnostic/torch_models/hghub/hub_data.py
@@ -30,6 +30,7 @@
     ConvBertModel,feature-extraction
     ConvNextForImageClassification,image-classification
     ConvNextV2Model,image-feature-extraction
+    CosmosTransformer3DModel,image-to-video
    CvtModel,feature-extraction
     DPTModel,image-feature-extraction
     Data2VecAudioModel,feature-extraction
@@ -156,7 +157,8 @@
     YolosForObjectDetection,object-detection
     YolosModel,image-feature-extraction
     Alibaba-NLP/gte-large-en-v1.5,sentence-similarity
-    emilyalsentzer/Bio_ClinicalBERT,fill-mask"""
+    emilyalsentzer/Bio_ClinicalBERT,fill-mask
+    nvidia/Cosmos-Predict2-2B-Video2World//transformer,image-to-video"""
 )
 
 __data_tasks__ = [
diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py
index 730b659e..3ab2ec5f 100644
--- a/onnx_diagnostic/torch_models/hghub/model_inputs.py
+++ b/onnx_diagnostic/torch_models/hghub/model_inputs.py
@@ -2,13 +2,21 @@
 import inspect
 import os
 import pprint
+import time
 from typing import Any, Dict, Optional, Tuple
 import torch
 import transformers
 from ...helpers.config_helper import update_config, build_diff_config
 from ...tasks import reduce_model_config, random_input_kwargs
-from .hub_api import task_from_arch, task_from_id, get_pretrained_config, download_code_modelid
-from .model_specific import HANDLED_MODELS, load_specific_model
+from .hub_api import (
+    task_from_arch,
+    task_from_id,
+    get_pretrained_config,
+    download_code_modelid,
+    architecture_from_config,
+    find_package_source,
+)
+from .model_specific import HANDLED_MODELS, load_specific_model, instantiate_specific_model
 
 
 def _code_needing_rewriting(model: Any) -> Any:
@@ -96,27 +104,18 @@
         model, task, config = load_specific_model(model_id, verbose=verbose)
 
     if model is None:
-        if hasattr(config, "architecture") and config.architecture:
-            archs = [config.architecture]
-        if type(config) is dict:
-            assert (
-                "_class_name" in config
-            ), f"Unable to get the architecture from config={config}"
-            archs = [config["_class_name"]]
-        else:
-            archs = config.architectures  # type: ignore
-        task = None
-        if archs is None:
-            task = task_from_id(model_id)
-        assert task is not None or (archs is not None and len(archs) == 1), (
+        arch = architecture_from_config(config)
+        if arch is None:
+            task = task_from_id(model_id, subfolder=subfolder)
+        assert task is not None or arch is not None, (
             f"Unable to determine the architecture for model {model_id!r}, "
model {model_id!r}, " - f"architectures={archs!r}, conf={config}" + f"archs={arch!r}, conf={config}" ) if verbose: - print(f"[get_untrained_model_with_inputs] architectures={archs!r}") + print(f"[get_untrained_model_with_inputs] architecture={arch!r}") print(f"[get_untrained_model_with_inputs] cls={config.__class__.__name__!r}") if task is None: - task = task_from_arch(archs[0], model_id=model_id, subfolder=subfolder) + task = task_from_arch(arch, model_id=model_id, subfolder=subfolder) if verbose: print(f"[get_untrained_model_with_inputs] task={task!r}") @@ -170,36 +169,58 @@ def get_untrained_model_with_inputs( f"{getattr(config, '_attn_implementation', '?')!r}" # type: ignore[union-attr] ) - if type(config) is dict and "_diffusers_version" in config: + if find_package_source(config) == "diffusers": import diffusers package_source = diffusers else: package_source = transformers + if verbose: + print( + f"[get_untrained_model_with_inputs] package_source={package_source.__name__} é" + f"from {package_source.__file__}" + ) if use_pretrained: + begin = time.perf_counter() + if verbose: + print( + f"[get_untrained_model_with_inputs] pretrained model_id {model_id!r}, " + f"subfolder={subfolder!r}" + ) model = transformers.AutoModel.from_pretrained( - model_id, trust_remote_code=True, **mkwargs + model_id, subfolder=subfolder, trust_remote_code=True, **mkwargs ) + if verbose: + print( + f"[get_untrained_model_with_inputs] -- done in " + f"{time.perf_counter() - begin}s" + ) else: - if archs is not None: + begin = time.perf_counter() + if verbose: + print( + f"[get_untrained_model_with_inputs] instantiate model_id {model_id!r}, " + f"subfolder={subfolder!r}" + ) + if arch is not None: try: - cls_model = getattr(package_source, archs[0]) + cls_model = getattr(package_source, arch) except AttributeError as e: # The code of the models is not in transformers but in the # repository of the model. We need to download it. pyfiles = download_code_modelid(model_id, verbose=verbose) if pyfiles: - if "." in archs[0]: - cls_name = archs[0] + if "." in arch: + cls_name = arch else: modeling = [_ for _ in pyfiles if "/modeling_" in _] assert len(modeling) == 1, ( f"Unable to guess the main file implemented class " - f"{archs[0]!r} from {pyfiles}, found={modeling}." + f"{arch!r} from {pyfiles}, found={modeling}." ) last_name = os.path.splitext(os.path.split(modeling[0])[-1])[0] - cls_name = f"{last_name}.{archs[0]}" + cls_name = f"{last_name}.{arch}" if verbose: print( f"[get_untrained_model_with_inputs] " @@ -217,7 +238,7 @@ def get_untrained_model_with_inputs( ) else: raise AttributeError( - f"Unable to find class 'tranformers.{archs[0]}'. " + f"Unable to find class 'tranformers.{arch}'. " f"The code needs to be downloaded, config=" f"\n{pprint.pformat(config)}." ) from e @@ -225,20 +246,27 @@ def get_untrained_model_with_inputs( assert same_as_pretrained and use_pretrained, ( f"Model {model_id!r} cannot be built, the model cannot be built. " f"It must be downloaded. Use same_as_pretrained=True " - f"and use_pretrained=True." 
+ f"and use_pretrained=True, arch={arch!r}, config={config}" + ) + if verbose: + print( + f"[get_untrained_model_with_inputs] -- done in " + f"{time.perf_counter() - begin}s" ) seed = int(os.environ.get("SEED", "17")) torch.manual_seed(seed) - try: - if type(config) is dict: - model = cls_model(**config) - else: - model = cls_model(config) - except RuntimeError as e: - raise RuntimeError( - f"Unable to instantiate class {cls_model.__name__} with\n{config}" - ) from e + model = instantiate_specific_model(cls_model, config) + if model is None: + try: + if type(config) is dict: + model = cls_model(**config) + else: + model = cls_model(config) + except RuntimeError as e: + raise RuntimeError( + f"Unable to instantiate class {cls_model.__name__} with\n{config}" + ) from e # input kwargs seed = int(os.environ.get("SEED", "17")) + 1 @@ -254,7 +282,7 @@ def get_untrained_model_with_inputs( # This line is important. Some models may produce different # outputs even with the same inputs in training mode. - model.eval() + model.eval() # type: ignore[union-attr] res = fct(model, config, add_second_input=add_second_input, **kwargs) res["input_kwargs"] = kwargs diff --git a/onnx_diagnostic/torch_models/hghub/model_specific.py b/onnx_diagnostic/torch_models/hghub/model_specific.py index 8cbb9665..9e055d0c 100644 --- a/onnx_diagnostic/torch_models/hghub/model_specific.py +++ b/onnx_diagnostic/torch_models/hghub/model_specific.py @@ -1,6 +1,33 @@ from typing import Any, Dict, Tuple +def instantiate_specific_model(cls_model: type, config: Any) -> object: + """ + Instantiates some model requiring some specific code. + """ + if cls_model.__name__ == "CosmosTransformer3DModel": + return instantiate_CosmosTransformer3DModel(cls_model, config) + return None + + +def instantiate_CosmosTransformer3DModel(cls_model: type, config: Any) -> object: + kwargs = dict( + in_channels=config.in_channels, + out_channels=config.out_channels, + attention_head_dim=config.attention_head_dim, + mlp_ratio=config.mlp_ratio, + num_layers=config.num_layers, + text_embed_dim=config.text_embed_dim, + adaln_lora_dim=config.adaln_lora_dim, + max_size=config.max_size, + patch_size=config.patch_size, + rope_scale=config.rope_scale, + concat_padding_mask=config.concat_padding_mask, + extra_pos_embed_type=config.extra_pos_embed_type, + ) + return cls_model(**kwargs) + + class SpecificConfig: """Creates a specific configuration for the loaded model."""