
Commit 6d85d19

Add task image-to-video
1 parent cc64994 commit 6d85d19


6 files changed, +294 -62 lines changed


onnx_diagnostic/tasks/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -5,6 +5,8 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    image_to_video,
+    mask_generation,
     mixture_of_expert,
     object_detection,
     sentence_similarity,
@@ -14,7 +16,6 @@
     text_to_image,
     text2text_generation,
     zero_shot_image_classification,
-    mask_generation,
 )

 __TASKS__ = [
@@ -23,6 +24,8 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    image_to_video,
+    mask_generation,
     mixture_of_expert,
     object_detection,
     sentence_similarity,
@@ -32,7 +35,6 @@
     text_to_image,
     text2text_generation,
     zero_shot_image_classification,
-    mask_generation,
 ]
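The registration is just the import plus the new entry in ``__TASKS__``. As a hedged sketch (my assumption about how the list is typically consumed, not code from this commit, and assuming every task module defines a ``__TASK__`` string the way ``image_to_video`` does below), the list can be turned into a name-to-module lookup:

# Hypothetical sketch: map task names to their modules.
from onnx_diagnostic import tasks

registry = {mod.__TASK__: mod for mod in tasks.__TASKS__}
assert "image-to-video" in registry
print(sorted(registry))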

onnx_diagnostic/tasks/image_to_video.py

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    default_num_hidden_layers as nhl,
+)
+
+__TASK__ = "image-to-video"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size."""
+    if not hasattr(config, "num_hidden_layers") and not hasattr(config, "num_layers"):
+        # We cannot reduce.
+        return {}
+    check_hasattr(config, ("num_hidden_layers", "num_layers"))
+    kwargs = {}
+    if hasattr(config, "num_layers"):
+        kwargs["num_layers"] = min(config.num_layers, nhl())
+    if hasattr(config, "num_hidden_layers"):
+        kwargs["num_hidden_layers"] = min(config.num_hidden_layers, nhl())
+
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    text_embed_dim: int,
+    latent_channels: int,
+    batch_size: int = 2,
+    image_height: int = 704,
+    image_width: int = 1280,
+    latent_frames: int = 1,
+    text_maxlen: int = 512,
+    add_second_input: int = 1,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``image-to-video``.
+    """
+    assert (
+        "cls_cache" not in kwargs
+    ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
+    latent_height = image_height // 8
+    latent_width = image_width // 8
+    dtype = torch.float32
+
+    inputs = dict(
+        hidden_states=torch.randn(
+            batch_size,
+            latent_channels,
+            latent_frames,
+            latent_height,
+            latent_width,
+            dtype=dtype,
+        ),
+        timestep=torch.tensor([1.0] * batch_size, dtype=dtype),
+        encoder_hidden_states=torch.randn(
+            batch_size, text_maxlen, text_embed_dim, dtype=dtype
+        ),
+        padding_mask=torch.ones(1, 1, image_height, image_width, dtype=dtype),
+        fps=torch.tensor([16] * batch_size, dtype=dtype),
+        condition_mask=torch.randn(
+            batch_size, 1, latent_frames, latent_height, latent_width, dtype=dtype
+        ),
+    )
+    shapes = dict(
+        hidden_states={
+            0: "batch_size",
+            2: "latent_frames",
+            3: "latent_height",
+            4: "latent_width",
+        },
+        timestep={0: "batch_size"},
+        encoder_hidden_states={0: "batch_size"},
+        padding_mask={0: "batch_size", 2: "height", 3: "width"},
+        fps={0: "batch_size"},
+        condition_mask={
+            0: "batch_size",
+            2: "latent_frames",
+            3: "latent_height",
+            4: "latent_width",
+        },
+    )
+    res = dict(inputs=inputs, dynamic_shapes=shapes)
+
+    if add_second_input:
+        assert (
+            add_second_input > 0
+        ), f"Not implemented for add_second_input={add_second_input}."
+        res["inputs2"] = get_inputs(
+            model=model,
+            config=config,
+            text_embed_dim=text_embed_dim,
+            latent_channels=latent_channels,
+            batch_size=batch_size,
+            image_height=image_height,
+            image_width=image_width,
+            latent_frames=latent_frames,
+            text_maxlen=text_maxlen,
+            add_second_input=0,
+            **kwargs,
+        )["inputs"]
+    return res
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        check_hasattr(config, "in_channels", "text_embed_dim")
+    kwargs = dict(
+        text_embed_dim=1024 if config is None else config.text_embed_dim,
+        latent_channels=16 if config is None else config.in_channels - 1,
+        batch_size=1,
+        image_height=8 * 50,
+        image_width=8 * 80,
+        latent_frames=1,
+        text_maxlen=512,
+    )
+    return kwargs, get_inputs
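A minimal usage sketch (not part of the commit): with ``config=None``, ``random_input_kwargs`` picks the typical dimensions above and the returned callable builds the dummy tensors together with their dynamic shapes. The import path assumes the new file is ``onnx_diagnostic/tasks/image_to_video.py``.

# Hedged sketch, not from the commit: exercise the new task module without a config.
from onnx_diagnostic.tasks import image_to_video

kwargs, fct = image_to_video.random_input_kwargs(None)  # None -> typical dimensions
data = fct(model=None, config=None, **kwargs)  # model/config are only forwarded here

print({k: tuple(v.shape) for k, v in data["inputs"].items()})
# hidden_states: (1, 16, 1, 50, 80), encoder_hidden_states: (1, 512, 1024), ...
print(sorted(data))  # ['dynamic_shapes', 'inputs', 'inputs2']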

onnx_diagnostic/torch_models/hghub/hub_api.py

Lines changed: 69 additions & 22 deletions
@@ -177,6 +177,51 @@ def task_from_arch(
     return data[arch]


+def _trygetattr(config, attname):
+    try:
+        return getattr(config, attname)
+    except AttributeError:
+        return None
+
+
+def architecture_from_config(config) -> Optional[str]:
+    """Guesses the architecture (class) of the model described by this config."""
+    if isinstance(config, dict):
+        if "_class_name" in config:
+            return config["_class_name"]
+        if "architecture" in config:
+            return config["architecture"]
+        if config.get("architectures", []):
+            return config["architectures"][0]
+    if hasattr(config, "_class_name"):
+        return config._class_name
+    if hasattr(config, "architecture"):
+        return config.architecture
+    if hasattr(config, "architectures") and config.architectures:
+        return config.architectures[0]
+    if hasattr(config, "__dict__"):
+        if "_class_name" in config.__dict__:
+            return config.__dict__["_class_name"]
+        if "architecture" in config.__dict__:
+            return config.__dict__["architecture"]
+        if config.__dict__.get("architectures", []):
+            return config.__dict__["architectures"][0]
+    return None
+
+
+def find_package_source(config) -> Optional[str]:
+    """Guesses the package the model class comes from."""
+    if isinstance(config, dict):
+        if "_diffusers_version" in config:
+            return "diffusers"
+    if hasattr(config, "_diffusers_version"):
+        return "diffusers"
+    if hasattr(config, "__dict__"):
+        if "_diffusers_version" in config.__dict__:
+            return "diffusers"
+    return "transformers"
+
+
 def task_from_id(
     model_id: str,
     default_value: Optional[str] = None,
@@ -202,28 +247,30 @@ def task_from_id(
         if not fall_back_to_pretrained:
             raise
     config = get_pretrained_config(model_id, subfolder=subfolder)
-    try:
-        return config.pipeline_tag
-    except AttributeError:
-        guess = _guess_task_from_config(config)
-        if guess is not None:
-            return guess
-        data = load_architecture_task()
-        if model_id in data:
-            return data[model_id]
-        if type(config) is dict and "_class_name" in config:
-            return task_from_arch(config["_class_name"], default_value=default_value)
-        if not config.architectures or not config.architectures:
-            # Some hardcoded values until a better solution is found.
-            if model_id.startswith("google/bert_"):
-                return "fill-mask"
-        assert config.architectures is not None and len(config.architectures) == 1, (
-            f"Cannot return the task of {model_id!r}, pipeline_tag is not setup, "
-            f"architectures={config.architectures} in config={config}. "
-            f"The task can be added in "
-            f"``onnx_diagnostic.torch_models.hghub.hub_data.__data_arch__``."
-        )
-        return task_from_arch(config.architectures[0], default_value=default_value)
+    tag = _trygetattr(config, "pipeline_tag")
+    if tag is not None:
+        return tag
+
+    guess = _guess_task_from_config(config)
+    if guess is not None:
+        return guess
+    data = load_architecture_task()
+    if subfolder:
+        full_id = f"{model_id}//{subfolder}"
+        if full_id in data:
+            return data[full_id]
+    if model_id in data:
+        return data[model_id]
+    arch = architecture_from_config(config)
+    if arch is None:
+        if model_id.startswith("google/bert_"):
+            return "fill-mask"
+    assert arch is not None, (
+        f"Cannot return the task of {model_id!r}, pipeline_tag is not setup, "
+        f"config={config}. The task can be added in "
+        f"``onnx_diagnostic.torch_models.hghub.hub_data.__data_arch__``."
+    )
+    return task_from_arch(arch, default_value=default_value)


 def task_from_tags(tags: Union[str, List[str]]) -> str:
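Both new helpers only inspect the configuration object, so they can be exercised offline. A hedged illustration (the class name and version string below are invented for the example):

# Illustration only: the dict mimics a diffusers config, the class a transformers one.
from onnx_diagnostic.torch_models.hghub.hub_api import (
    architecture_from_config,
    find_package_source,
)

diffusers_like = {"_class_name": "SomeTransformer3DModel", "_diffusers_version": "0.33.0"}
assert architecture_from_config(diffusers_like) == "SomeTransformer3DModel"
assert find_package_source(diffusers_like) == "diffusers"

class FakeConfig:  # transformers-style: an architectures list, no _diffusers_version
    architectures = ["BertForMaskedLM"]

assert architecture_from_config(FakeConfig()) == "BertForMaskedLM"
assert find_package_source(FakeConfig()) == "transformers"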

onnx_diagnostic/torch_models/hghub/hub_data.py

Lines changed: 2 additions & 1 deletion
@@ -156,7 +156,8 @@
 YolosForObjectDetection,object-detection
 YolosModel,image-feature-extraction
 Alibaba-NLP/gte-large-en-v1.5,sentence-similarity
-emilyalsentzer/Bio_ClinicalBERT,fill-mask"""
+emilyalsentzer/Bio_ClinicalBERT,fill-mask
+nvidia/Cosmos-Predict2-2B-Video2World//transformer,image-to-video"""
 )

 __data_tasks__ = [
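The new row registers the ``transformer`` subfolder of the Cosmos checkpoint, which is exactly what the ``model_id//subfolder`` lookup added to ``task_from_id`` above resolves. A minimal sketch of that resolution (my assumption about how ``load_architecture_task`` exposes these rows; not code from the commit):

# Hedged sketch: the CSV row should surface as a "model_id//subfolder" key.
from onnx_diagnostic.torch_models.hghub.hub_api import load_architecture_task

data = load_architecture_task()
print(data.get("nvidia/Cosmos-Predict2-2B-Video2World//transformer"))  # expected: image-to-video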

0 commit comments
