Commit c30f912

Support for text-to-image
1 parent 5c3f2a8 commit c30f912

12 files changed: +210 −12 lines

_unittests/ut_tasks/test_tasks_image_classification.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


-class TestTasks(ExtTestCase):
+class TestTasksImageClassification(ExtTestCase):
     @hide_stdout()
     def test_image_classification(self):
         mid = "hf-internal-testing/tiny-random-BeitForImageClassification"

_unittests/ut_tasks/test_tasks_image_text_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


-class TestTasks(ExtTestCase):
+class TestTasksImageTextToText(ExtTestCase):
     @hide_stdout()
     @requires_transformers("4.52")
     @requires_torch("2.7.99")

_unittests/ut_tasks/test_tasks_object_detection.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


-class TestTasks(ExtTestCase):
+class TestTasksObjectDetection(ExtTestCase):
     @hide_stdout()
     def test_object_detection(self):
         mid = "hustvl/yolos-tiny"

_unittests/ut_tasks/test_tasks_text_to_image.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+import unittest
+import torch
+from onnx_diagnostic.ext_test_case import (
+    ExtTestCase,
+    hide_stdout,
+    requires_transformers,
+    requires_torch,
+)
+from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
+from onnx_diagnostic.torch_export_patches import torch_export_patches
+from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
+
+
+class TestTasksTextToImage(ExtTestCase):
+    @hide_stdout()
+    @requires_transformers("4.52")
+    @requires_torch("2.7.99")
+    def test_text_to_image(self):
+        mid = "diffusers/tiny-torch-full-checker"
+        data = get_untrained_model_with_inputs(
+            mid, verbose=1, add_second_input=True, subfolder="unet"
+        )
+        self.assertEqual(data["task"], "text-to-image")
+        self.assertIn((data["size"], data["n_weights"]), [(5708048, 1427012)])
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        model(**inputs)
+        model(**data["inputs2"])
+        with torch_export_patches(patch_transformers=True, verbose=10, stop_if_static=1):
+            torch.export.export(
+                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
+            )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)

_unittests/ut_tasks/test_tasks_zero_shot_image_classification.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


-class TestTasks(ExtTestCase):
+class TestTasksZeroShotImageClassification(ExtTestCase):
     @requires_torch("2.7.99")
     @hide_stdout()
     def test_zero_shot_image_classification(self):

_unittests/ut_tasks/try_tasks.py

Lines changed: 21 additions & 0 deletions
@@ -569,6 +569,27 @@ def test_object_detection(self):
                 f"{round(score.item(), 3)} at location {box}"
             )

+    @never_test()
+    def test_text_to_image(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k test_text_to_image
+        import torch
+        from diffusers import StableDiffusionPipeline
+
+        model_id = "diffusers/tiny-torch-full-checker"  # "stabilityai/stable-diffusion-2"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(
+            "cuda"
+        )
+
+        prompt = "a photo of an astronaut riding a horse on mars and on jupyter"
+        print()
+        with steal_forward(pipe.unet, with_min_max=True):
+            image = pipe(prompt).images[0]
+        print("-- output", self.string_type(image, with_shape=True, with_min_max=True))
+        # stolen forward for class UNet2DConditionModel -- iteration 44
+        # sample=T10s2x4x96x96[-3.7734375,4.359375:A-0.043463995395642184]
+        # time_step=T7s=101
+        # encoder_hidden_states:T10s2x77x1024[-6.58203125,13.0234375:A-0.16780663634440257]
+

 if __name__ == "__main__":
     unittest.main(verbosity=2)

onnx_diagnostic/helpers/config_helper.py

Lines changed: 4 additions & 1 deletion
@@ -43,7 +43,10 @@ def update_config(config: Any, mkwargs: Dict[str, Any]):
             else:
                 update_config(getattr(config, k), v)
             continue
-        setattr(config, k, v)
+        if type(config) is dict:
+            config[k] = v
+        else:
+            setattr(config, k, v)


 def _pick(config, *atts, exceptions: Optional[Dict[str, Callable]] = None):
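
The dict branch matters here because diffusers configurations loaded from a subfolder (such as the unet config used by the new text-to-image task) behave as plain dictionaries rather than attribute-style config objects. A minimal sketch of the new behavior; the literal values are hypothetical, not from the repository:

    from onnx_diagnostic.helpers.config_helper import update_config

    # A dict-style config now takes the item-assignment path;
    # setattr on a plain dict would raise AttributeError.
    config = {"sample_size": 96, "cross_attention_dim": 1024}
    update_config(config, {"sample_size": 32, "cross_attention_dim": 64})
    assert config == {"sample_size": 32, "cross_attention_dim": 64}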

onnx_diagnostic/tasks/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
     summarization,
     text_classification,
     text_generation,
+    text_to_image,
     text2text_generation,
     zero_shot_image_classification,
 )
@@ -27,6 +28,7 @@
     summarization,
     text_classification,
     text_generation,
+    text_to_image,
     text2text_generation,
     zero_shot_image_classification,
 ]
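
Each task module exposes its name through a __TASK__ constant ("text-to-image" here), so adding text_to_image to both the import block and the module list above is what makes the new task discoverable. A minimal check, assuming only the layout shown in this commit:

    from onnx_diagnostic.tasks import text_to_image

    assert text_to_image.__TASK__ == "text-to-image"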

onnx_diagnostic/tasks/text_to_image.py

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import update_config, check_hasattr
+
+__TASK__ = "text-to-image"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size."""
+    check_hasattr(config, "sample_size", "cross_attention_dim")
+    kwargs = dict(
+        sample_size=min(config["sample_size"], 32),
+        cross_attention_dim=min(config["cross_attention_dim"], 64),
+    )
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    batch_size: int,
+    sequence_length: int,
+    cache_length: int,
+    in_channels: int,
+    sample_size: int,
+    cross_attention_dim: int,
+    add_second_input: bool = False,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``text-to-image``.
+    Example:
+
+    ::
+
+        sample:T10s2x4x96x96[-3.7734375,4.359375:A-0.043463995395642184]
+        timestep:T7s=101
+        encoder_hidden_states:T10s2x77x1024[-6.58203125,13.0234375:A-0.16780663634440257]
+    """
+    assert (
+        "cls_cache" not in kwargs
+    ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    shapes = {
+        "sample": {0: batch},
+        "timestep": {},
+        "encoder_hidden_states": {0: batch, 1: "encoder_length"},
+    }
+    inputs = dict(
+        sample=torch.randn((batch_size, sequence_length, sample_size, sample_size)).to(
+            torch.float32
+        ),
+        timestep=torch.tensor([101], dtype=torch.int64),
+        encoder_hidden_states=torch.randn(
+            (batch_size, sequence_length, cross_attention_dim)
+        ).to(torch.float32),
+    )
+    res = dict(inputs=inputs, dynamic_shapes=shapes)
+    if add_second_input:
+        res["inputs2"] = get_inputs(
+            model=model,
+            config=config,
+            batch_size=batch_size + 1,
+            sequence_length=sequence_length,
+            cache_length=cache_length + 1,
+            in_channels=in_channels,
+            sample_size=sample_size,
+            cross_attention_dim=cross_attention_dim,
+            **kwargs,
+        )["inputs"]
+    return res
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        check_hasattr(config, "sample_size", "cross_attention_dim", "in_channels")
+    kwargs = dict(
+        batch_size=2,
+        sequence_length=config["in_channels"],
+        cache_length=77,
+        in_channels=config["in_channels"],
+        sample_size=config["sample_size"],
+        cross_attention_dim=config["cross_attention_dim"],
+    )
+    return kwargs, get_inputs
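
For illustration, random_input_kwargs can be driven directly by the tiny UNet configuration cached below in hub_data_cached_configs.py. This is a sketch, not code from the commit: it assumes check_hasattr accepts dict-style configs, as the subscript access above implies, and the inline dict only mirrors the relevant cached values:

    from onnx_diagnostic.tasks.text_to_image import random_input_kwargs

    # Values copied from the cached diffusers/tiny-torch-full-checker unet config.
    config = {"sample_size": 32, "cross_attention_dim": 32, "in_channels": 4}
    kwargs, fct = random_input_kwargs(config)
    # kwargs: batch_size=2, sequence_length=4, cache_length=77, ...
    res = fct(model=None, config=config, add_second_input=True, **kwargs)
    print(res["inputs"]["sample"].shape)                 # torch.Size([2, 4, 32, 32])
    print(res["inputs"]["encoder_hidden_states"].shape)  # torch.Size([2, 4, 32])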

onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py

Lines changed: 28 additions & 0 deletions
@@ -4302,3 +4302,31 @@ def _ccached_microsoft_phi_35_mini_instruct():
         "vocab_size": 32064,
     }
 )
+
+
+def _ccached_diffusers_tiny_torch_full_checker_unet():
+    "diffusers/tiny-torch-full-checker/unet"
+    return {
+        "_class_name": "UNet2DConditionModel",
+        "_diffusers_version": "0.8.0",
+        "_name_or_path": "https://huggingface.co/diffusers/tiny-torch-full-checker/blob/main/unet/config.json",
+        "act_fn": "silu",
+        "attention_head_dim": 8,
+        "block_out_channels": [32, 64],
+        "center_input_sample": False,
+        "cross_attention_dim": 32,
+        "down_block_types": ["DownBlock2D", "CrossAttnDownBlock2D"],
+        "downsample_padding": 1,
+        "dual_cross_attention": False,
+        "flip_sin_to_cos": True,
+        "freq_shift": 0,
+        "in_channels": 4,
+        "layers_per_block": 2,
+        "mid_block_scale_factor": 1,
+        "norm_eps": 1e-05,
+        "norm_num_groups": 32,
+        "out_channels": 4,
+        "sample_size": 32,
+        "up_block_types": ["CrossAttnUpBlock2D", "UpBlock2D"],
+        "use_linear_projection": False,
+    }
