2 changes: 2 additions & 0 deletions CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
0.7.11
++++++

* :pr:`223`: adds task image-to-video
* :pr:`220`: adds option --ort-logs to display onnxruntime logs when creating the session
* :pr:`220`: adds a patch for PR `#40791 <https://github.com/huggingface/transformers/pull/40791>`_ in transformers

0.7.10
68 changes: 68 additions & 0 deletions _unittests/ut_tasks/test_tasks_image_to_video.py
@@ -0,0 +1,68 @@
import unittest
import torch
import transformers
from onnx_diagnostic.ext_test_case import (
    ExtTestCase,
    hide_stdout,
    requires_diffusers,
    requires_torch,
    requires_transformers,
)
from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


class TestTasksImageToVideo(ExtTestCase):
    @hide_stdout()
    @requires_diffusers("0.35")
    @requires_transformers("4.55")
    @requires_torch("2.8.99")
    def test_image_to_video(self):
        kwargs = {
            "_diffusers_version": "0.34.0.dev0",
            "_class_name": "CosmosTransformer3DModel",
            "max_size": [128, 240, 240],
            "text_embed_dim": 128,
            "use_cache": True,
            "in_channels": 3,
            "out_channels": 16,
            "num_layers": 2,
            "model_type": "dia",
            "patch_size": [1, 2, 2],
            "rope_scale": [1.0, 3.0, 3.0],
            "attention_head_dim": 16,
            "mlp_ratio": 0.4,
            "initializer_range": 0.02,
            "num_attention_heads": 16,
            "is_encoder_decoder": True,
            "adaln_lora_dim": 16,
            "concat_padding_mask": True,
            "extra_pos_embed_type": None,
        }
        config = transformers.DiaConfig(**kwargs)
        mid = "nvidia/Cosmos-Predict2-2B-Video2World"
        data = get_untrained_model_with_inputs(
            mid,
            verbose=1,
            add_second_input=True,
            subfolder="transformer",
            config=config,
            inputs_kwargs=dict(image_height=8 * 50, image_width=8 * 80),
        )
        self.assertEqual(data["task"], "image-to-video")
        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
        model(**inputs)
        model(**data["inputs2"])
        with torch.fx.experimental._config.patch(
            backed_size_oblivious=True
        ), torch_export_patches(
            patch_transformers=True, patch_diffusers=True, verbose=10, stop_if_static=1
        ):
            torch.export.export(
                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
            )


if __name__ == "__main__":
    unittest.main(verbosity=2)
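Given the main guard above, the test can also be run directly, assuming the repository root as the working directory and the pinned torch/transformers/diffusers versions installed:

    python _unittests/ut_tasks/test_tasks_image_to_video.py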
6 changes: 4 additions & 2 deletions onnx_diagnostic/tasks/__init__.py
@@ -5,6 +5,8 @@
    fill_mask,
    image_classification,
    image_text_to_text,
    image_to_video,
    mask_generation,
    mixture_of_expert,
    object_detection,
    sentence_similarity,
@@ -14,7 +16,6 @@
    text_to_image,
    text2text_generation,
    zero_shot_image_classification,
    mask_generation,
)

__TASKS__ = [
@@ -23,6 +24,8 @@
    fill_mask,
    image_classification,
    image_text_to_text,
    image_to_video,
    mask_generation,
    mixture_of_expert,
    object_detection,
    sentence_similarity,
@@ -32,7 +35,6 @@
    text_to_image,
    text2text_generation,
    zero_shot_image_classification,
    mask_generation,
]


127 changes: 127 additions & 0 deletions onnx_diagnostic/tasks/image_to_video.py
@@ -0,0 +1,127 @@
from typing import Any, Callable, Dict, Optional, Tuple
import torch
from ..helpers.config_helper import (
    update_config,
    check_hasattr,
    default_num_hidden_layers as nhl,
)

__TASK__ = "image-to-video"


def reduce_model_config(config: Any) -> Dict[str, Any]:
    """Reduces the model size."""
    if not hasattr(config, "num_hidden_layers") and not hasattr(config, "num_layers"):
        # We cannot reduce.
        return {}
    check_hasattr(config, ("num_hidden_layers", "num_layers"))
    kwargs = {}
    if hasattr(config, "num_layers"):
        kwargs["num_layers"] = min(config.num_layers, nhl())
    if hasattr(config, "num_hidden_layers"):
        kwargs["num_hidden_layers"] = min(config.num_hidden_layers, nhl())

    update_config(config, kwargs)
    return kwargs


def get_inputs(
    model: torch.nn.Module,
    config: Optional[Any],
    text_embed_dim: int,
    latent_channels: int,
    batch_size: int = 2,
    image_height: int = 704,
    image_width: int = 1280,
    latent_frames: int = 1,
    text_maxlen: int = 512,
    add_second_input: int = 1,
    **kwargs,  # unused
):
    """
    Generates inputs for task ``image-to-video``.
    """
    assert (
        "cls_cache" not in kwargs
    ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
    latent_height = image_height // 8
    latent_width = image_width // 8
    dtype = torch.float32

    inputs = dict(
        hidden_states=torch.randn(
            batch_size,
            latent_channels,
            latent_frames,
            latent_height,
            latent_width,
            dtype=dtype,
        ),
        timestep=torch.tensor([1.0] * batch_size, dtype=dtype),
        encoder_hidden_states=torch.randn(
            batch_size, text_maxlen, text_embed_dim, dtype=dtype
        ),
        padding_mask=torch.ones(1, 1, image_height, image_width, dtype=dtype),
        fps=torch.tensor([16] * batch_size, dtype=dtype),
        condition_mask=torch.randn(
            batch_size, 1, latent_frames, latent_height, latent_width, dtype=dtype
        ),
    )
    shapes = dict(
        hidden_states={
            0: "batch_size",
            2: "latent_frames",
            3: "latent_height",
            4: "latent_width",
        },
        timestep={0: "batch_size"},
        encoder_hidden_states={0: "batch_size"},
        padding_mask={0: "batch_size", 2: "height", 3: "width"},
        fps={0: "batch_size"},
        condition_mask={
            0: "batch_size",
            2: "latent_frames",
            3: "latent_height",
            4: "latent_width",
        },
    )
    res = dict(inputs=inputs, dynamic_shapes=shapes)

    if add_second_input:
        assert (
            add_second_input > 0
        ), f"Not implemented for add_second_input={add_second_input}."
        res["inputs2"] = get_inputs(
            model=model,
            config=config,
            text_embed_dim=text_embed_dim,
            latent_channels=latent_channels,
            batch_size=batch_size,
            image_height=image_height,
            image_width=image_width,
            latent_frames=latent_frames,
            text_maxlen=text_maxlen,
            add_second_input=0,
            **kwargs,
        )["inputs"]
    return res


def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
    """
    Inputs kwargs.

    If the configuration is None, the function selects typical dimensions.
    """
    if config is not None:
        check_hasattr(config, "in_channels", "text_embed_dim")
    kwargs = dict(
        text_embed_dim=1024 if config is None else config.text_embed_dim,
        latent_channels=16 if config is None else config.in_channels - 1,
        batch_size=1,
        image_height=8 * 50,
        image_width=8 * 80,
        latent_frames=1,
        text_maxlen=512,
    )
    return kwargs, get_inputs
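As a usage sketch (not part of the PR): random_input_kwargs pairs default dimensions with get_inputs, so the two chain as below. SimpleNamespace stands in for a real diffusers config; the field values mirror the defaults above.

    from types import SimpleNamespace

    from onnx_diagnostic.tasks.image_to_video import random_input_kwargs

    # Hypothetical config: in_channels/text_embed_dim are the two fields
    # random_input_kwargs checks for; 17 - 1 gives 16 latent channels.
    cfg = SimpleNamespace(in_channels=17, text_embed_dim=1024)
    kwargs, fn = random_input_kwargs(cfg)  # fn is get_inputs
    data = fn(model=None, config=cfg, **kwargs)
    print(data["inputs"]["hidden_states"].shape)  # torch.Size([1, 16, 1, 50, 80])
    print(sorted(data["dynamic_shapes"]))  # names match the inputs above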
91 changes: 69 additions & 22 deletions onnx_diagnostic/torch_models/hghub/hub_api.py
@@ -177,6 +177,51 @@ def task_from_arch(
    return data[arch]


def _trygetattr(config, attname):
    try:
        return getattr(config, attname)
    except AttributeError:
        return None


def architecture_from_config(config) -> Optional[str]:
    """Guesses the architecture (class) of the model described by this config."""
    if isinstance(config, dict):
        if "_class_name" in config:
            return config["_class_name"]
        if "architecture" in config:
            return config["architecture"]
        if config.get("architectures", []):
            return config["architectures"][0]
    if hasattr(config, "_class_name"):
        return config._class_name
    if hasattr(config, "architecture"):
        return config.architecture
    if hasattr(config, "architectures") and config.architectures:
        return config.architectures[0]
    if hasattr(config, "__dict__"):
        if "_class_name" in config.__dict__:
            return config.__dict__["_class_name"]
        if "architecture" in config.__dict__:
            return config.__dict__["architecture"]
        if config.__dict__.get("architectures", []):
            return config.__dict__["architectures"][0]
    return None


def find_package_source(config) -> Optional[str]:
    """Guesses the package the model class comes from."""
    if isinstance(config, dict):
        if "_diffusers_version" in config:
            return "diffusers"
    if hasattr(config, "_diffusers_version"):
        return "diffusers"
    if hasattr(config, "__dict__"):
        if "_diffusers_version" in config.__dict__:
            return "diffusers"
    return "transformers"


def task_from_id(
    model_id: str,
    default_value: Optional[str] = None,
@@ -202,28 +247,30 @@ def task_from_id
    if not fall_back_to_pretrained:
        raise
    config = get_pretrained_config(model_id, subfolder=subfolder)
    try:
        return config.pipeline_tag
    except AttributeError:
        guess = _guess_task_from_config(config)
        if guess is not None:
            return guess
        data = load_architecture_task()
        if model_id in data:
            return data[model_id]
        if type(config) is dict and "_class_name" in config:
            return task_from_arch(config["_class_name"], default_value=default_value)
        if not config.architectures or not config.architectures:
            # Some hardcoded values until a better solution is found.
            if model_id.startswith("google/bert_"):
                return "fill-mask"
        assert config.architectures is not None and len(config.architectures) == 1, (
            f"Cannot return the task of {model_id!r}, pipeline_tag is not setup, "
            f"architectures={config.architectures} in config={config}. "
            f"The task can be added in "
            f"``onnx_diagnostic.torch_models.hghub.hub_data.__data_arch__``."
        )
        return task_from_arch(config.architectures[0], default_value=default_value)
    tag = _trygetattr(config, "pipeline_tag")
    if tag is not None:
        return tag

    guess = _guess_task_from_config(config)
    if guess is not None:
        return guess
    data = load_architecture_task()
    if subfolder:
        full_id = f"{model_id}//{subfolder}"
        if full_id in data:
            return data[full_id]
    if model_id in data:
        return data[model_id]
    arch = architecture_from_config(config)
    if arch is None:
        if model_id.startswith("google/bert_"):
            return "fill-mask"
    assert arch is not None, (
        f"Cannot return the task of {model_id!r}, pipeline_tag is not setup, "
        f"config={config}. The task can be added in "
        f"``onnx_diagnostic.torch_models.hghub.hub_data.__data_arch__``."
    )
    return task_from_arch(arch, default_value=default_value)


def task_from_tags(tags: Union[str, List[str]]) -> str:
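A quick sketch of what the two new helpers return for dict configs; the dict mirrors the Cosmos config in the test above, and this is illustrative, not a test from the PR:

    from onnx_diagnostic.torch_models.hghub.hub_api import (
        architecture_from_config,
        find_package_source,
    )

    cfg = {"_class_name": "CosmosTransformer3DModel", "_diffusers_version": "0.34.0.dev0"}
    assert architecture_from_config(cfg) == "CosmosTransformer3DModel"
    assert find_package_source(cfg) == "diffusers"
    # Without a _diffusers_version marker, transformers is the fallback.
    assert find_package_source({"architectures": ["BertModel"]}) == "transformers"
    assert architecture_from_config({"architectures": ["BertModel"]}) == "BertModel"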
4 changes: 3 additions & 1 deletion onnx_diagnostic/torch_models/hghub/hub_data.py
@@ -30,6 +30,7 @@
ConvBertModel,feature-extraction
ConvNextForImageClassification,image-classification
ConvNextV2Model,image-feature-extraction
CosmosTransformer3DModel,image-to-video
CvtModel,feature-extraction
DPTModel,image-feature-extraction
Data2VecAudioModel,feature-extraction
@@ -156,7 +157,8 @@
YolosForObjectDetection,object-detection
YolosModel,image-feature-extraction
Alibaba-NLP/gte-large-en-v1.5,sentence-similarity
emilyalsentzer/Bio_ClinicalBERT,fill-mask"""
emilyalsentzer/Bio_ClinicalBERT,fill-mask
nvidia/Cosmos-Predict2-2B-Video2World//transformer,image-to-video"""
)

__data_tasks__ = [
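For reference, the new ``model_id//subfolder`` key is what the updated task_from_id looks up when the pipeline tag and the config-based guess both fail. A sketch, assuming task_from_id exposes the subfolder argument it uses internally (the full signature is collapsed above):

    from onnx_diagnostic.torch_models.hghub.hub_api import task_from_id

    # full_id = "nvidia/Cosmos-Predict2-2B-Video2World//transformer" matches
    # the new hub_data line above and maps to "image-to-video".
    task = task_from_id("nvidia/Cosmos-Predict2-2B-Video2World", subfolder="transformer")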