
Commit c28b9e5: one fix
1 parent 4f52b4f

2 files changed: +177 -93 lines changed


onnx_diagnostic/tasks/image_text_to_text.py

Lines changed: 141 additions & 93 deletions
@@ -94,11 +94,11 @@ def _get_inputs_gemma3(
     width: int,
     height: int,
     num_channels: int,
-    batch_size: int = 1,
-    sequence_length: int = 281,
-    n_images: int = 1,
-    max_sequence_length: int = 580,
-    total_sequence_length: int = 860,
+    batch_size: Optional[int] = 1,
+    sequence_length: Optional[int] = 281,
+    n_images: Optional[int] = 1,
+    max_sequence_length: Optional[int] = 580,
+    total_sequence_length: Optional[int] = 860,
     **kwargs,  # unused
 ):
     """
@@ -129,6 +129,12 @@ def _get_inputs_gemma3(
     attention_mask:dict(sliding_attention:T9s1x1x1x580,full_attention:T9s1x1x1x580),
     position_ids:None,
     """
+    batch_size = 1 if batch_size is None else batch_size
+    sequence_length = 281 if sequence_length is None else sequence_length
+    n_images = 1 if n_images is None else n_images
+    max_sequence_length = 580 if max_sequence_length is None else max_sequence_length
+    total_sequence_length = 860 if total_sequence_length is None else total_sequence_length
+
     assert (
         "cls_cache" not in kwargs
     ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
@@ -224,6 +230,111 @@ def _check_():
     return dict(inputs=inputs, dynamic_shapes=shapes)
 
 
+def get_inputs_default(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    dummy_max_token_id: int,
+    num_key_value_heads: int,
+    num_hidden_layers: int,
+    pad_token_id: int,
+    image_token_index: int,
+    head_dim: int,
+    width: int,
+    height: int,
+    num_channels: int,
+    batch_size: Optional[int] = 2,
+    sequence_length: Optional[int] = 43,
+    n_images: Optional[int] = 2,
+    max_sequence_length: Optional[int] = 43,
+    total_sequence_length: Optional[int] = 43,
+    add_second_input: int = 0,
+    **kwargs,  # unused
+):
+    batch_size = 2 if batch_size is None else batch_size
+    sequence_length = 43 if sequence_length is None else sequence_length
+    n_images = 2 if n_images is None else n_images
+    max_sequence_length = 43 if max_sequence_length is None else max_sequence_length
+    total_sequence_length = 43 if total_sequence_length is None else total_sequence_length
+
+    assert (
+        "cls_cache" not in kwargs
+    ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
+    batch = "batch"
+    batch_img = torch.export.Dim("batch_img", min=1, max=1024)
+    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
+    cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
+    images = "images"  # torch.export.Dim("images", min=1, max=4096)
+
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "token_type_ids": {0: batch, 1: seq_length},
+        "attention_mask": {0: batch, 1: "cache+seq"},
+        "position_ids": {0: batch, 1: "cache+seq"},
+        "past_key_values": [
+            [{0: batch} for _ in range(num_hidden_layers)],
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+        ],
+        "pixel_values": (
+            {0: batch, 1: images}
+            if model.__class__.__name__ == "IdeficsForVisionText2Text"
+            else {0: batch_img}
+        ),
+        "image_attention_mask": {0: batch, 1: seq_length, 2: images},
+        "image_grid_thw": {0: batch},
+        "use_cache": None,
+    }
+
+    input_ids = torch.randint(0, dummy_max_token_id, (batch_size, total_sequence_length)).to(
+        torch.int64
+    )
+    input_ids[0, 0] = image_token_index
+    input_ids[1, 1] = image_token_index
+    # input_ids[input_ids == image_token_index] = pad_token_id
+    token_type_ids = torch.zeros_like(input_ids)
+    token_type_ids[input_ids == image_token_index] = 1
+    image_grid_thw = torch.zeros((n_images, 3), dtype=torch.int64)
+    image_grid_thw[:, 1] = height
+    image_grid_thw[:, 2] = width
+    image_grid_thw[0, :] //= 2
+    image_grid_thw[:, 0] = torch.arange(n_images, dtype=image_grid_thw.dtype)
+
+    inputs = dict(
+        input_ids=input_ids,
+        attention_mask=torch.cat(
+            [
+                torch.ones((batch_size, sequence_length), dtype=torch.int64),
+                input_ids.ne(pad_token_id).to(torch.int64),
+            ],
+            axis=-1,
+        ),
+        position_ids=torch.arange(0, total_sequence_length)
+        .to(torch.int64)
+        .expand((batch_size, -1)),
+        past_key_values=make_dynamic_cache(
+            [
+                (
+                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                )
+                for i in range(num_hidden_layers)
+            ]
+        ),
+        pixel_values=(
+            torch.randn((batch_size, n_images, num_channels, width, height)).clamp(-1, 1)
+            if model.__class__.__name__ == "IdeficsForVisionText2Text"
+            else torch.randn(n_images, num_channels, width, height).clamp(-1, 1)
+        ),
+        # image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
+        #     torch.int64
+        # ),
+        token_type_ids=token_type_ids,
+        image_grid_thw=image_grid_thw,
+        use_cache=True,  # Gemma3 does not set this value to true when a cache is provided
+    )
+    res = dict(inputs=inputs, dynamic_shapes=shapes)
+    return res
+
+
 def get_inputs(
     model: torch.nn.Module,
     config: Optional[Any],
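
The new `get_inputs_default` returns the same `dict(inputs=..., dynamic_shapes=...)` pair as the other input builders in this module. A sketch of how such a pair might be consumed with `torch.export` (an assumption for illustration; the commit itself only builds the dictionary, and the shapes are filtered here because they may list inputs, such as `image_attention_mask`, that a given model never receives):

```python
import torch

# `model` and the size arguments are assumed to come from the caller/config.
res = get_inputs_default(
    model,
    config=None,
    dummy_max_token_id=1024,
    num_key_value_heads=4,
    num_hidden_layers=2,
    pad_token_id=0,
    image_token_index=99,
    head_dim=16,
    width=224,
    height=224,
    num_channels=3,
)

# Keep only the dynamic shapes for inputs that are actually present.
shapes = {k: v for k, v in res["dynamic_shapes"].items() if k in res["inputs"]}

# Export with named inputs and per-input dynamic dimensions.
ep = torch.export.export(model, (), kwargs=res["inputs"], dynamic_shapes=shapes)
```
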
@@ -236,11 +347,11 @@ def get_inputs(
     width: int,
     height: int,
     num_channels: int,
-    batch_size: int = 1,
-    sequence_length: int = 281,
-    n_images: int = 1,
-    max_sequence_length: int = 580,
-    total_sequence_length: int = 860,
+    batch_size: Optional[int] = None,
+    sequence_length: Optional[int] = None,
+    n_images: Optional[int] = None,
+    max_sequence_length: Optional[int] = None,
+    total_sequence_length: Optional[int] = None,
     add_second_input: int = 0,
     **kwargs,  # unused
 ):
@@ -290,86 +401,26 @@ def get_inputs(
             **kwargs,
         )
     else:
-        assert (
-            "cls_cache" not in kwargs
-        ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
-        batch = "batch"
-        batch_img = torch.export.Dim("batch_img", min=1, max=1024)
-        seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
-        cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
-        images = "images"  # torch.export.Dim("images", min=1, max=4096)
-
-        shapes = {
-            "input_ids": {0: batch, 1: seq_length},
-            "token_type_ids": {0: batch, 1: seq_length},
-            "attention_mask": {0: batch, 1: "cache+seq"},
-            "position_ids": {0: batch, 1: "cache+seq"},
-            "past_key_values": [
-                [{0: batch} for _ in range(num_hidden_layers)],
-                [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
-            ],
-            "pixel_values": (
-                {0: batch, 1: images}
-                if model.__class__.__name__ == "IdeficsForVisionText2Text"
-                else {0: batch_img}
-            ),
-            "image_attention_mask": {0: batch, 1: seq_length, 2: images},
-            "image_grid_thw": {0: batch},
-            "use_cache": None,
-        }
-
-        input_ids = torch.randint(
-            0, dummy_max_token_id, (batch_size, total_sequence_length)
-        ).to(torch.int64)
-        input_ids[0, 0] = image_token_index
-        input_ids[1, 1] = image_token_index
-        # input_ids[input_ids == image_token_index] = pad_token_id
-        token_type_ids = torch.zeros_like(input_ids)
-        token_type_ids[input_ids == image_token_index] = 1
-        image_grid_thw = torch.zeros((n_images, 3), dtype=torch.int64)
-        image_grid_thw[:, 1] = height
-        image_grid_thw[:, 2] = width
-        image_grid_thw[0, :] //= 2
-        image_grid_thw[:, 0] = torch.arange(n_images, dtype=image_grid_thw.dtype)
-
-        inputs = dict(
-            input_ids=input_ids,
-            attention_mask=torch.cat(
-                [
-                    torch.ones((batch_size, sequence_length), dtype=torch.int64),
-                    input_ids.ne(pad_token_id).to(torch.int64),
-                ],
-                axis=-1,
-            ),
-            position_ids=torch.arange(0, total_sequence_length)
-            .to(torch.int64)
-            .expand((batch_size, -1)),
-            past_key_values=make_dynamic_cache(
-                [
-                    (
-                        torch.randn(
-                            batch_size, num_key_value_heads, sequence_length, head_dim
-                        ),
-                        torch.randn(
-                            batch_size, num_key_value_heads, sequence_length, head_dim
-                        ),
-                    )
-                    for i in range(num_hidden_layers)
-                ]
-            ),
-            pixel_values=(
-                torch.randn((batch_size, n_images, num_channels, width, height)).clamp(-1, 1)
-                if model.__class__.__name__ == "IdeficsForVisionText2Text"
-                else torch.randn(n_images, num_channels, width, height).clamp(-1, 1)
-            ),
-            # image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
-            #     torch.int64
-            # ),
-            token_type_ids=token_type_ids,
-            image_grid_thw=image_grid_thw,
-            use_cache=True,  # Gemma3 does not set this value to true when a cache is provided
+        res = get_inputs_default(
+            model,
+            config,
+            dummy_max_token_id=dummy_max_token_id,
+            num_key_value_heads=num_key_value_heads,
+            num_hidden_layers=num_hidden_layers,
+            pad_token_id=pad_token_id,
+            image_token_index=image_token_index,
+            head_dim=head_dim,
+            width=width,
+            height=height,
+            num_channels=num_channels,
+            batch_size=batch_size,
+            sequence_length=sequence_length,
+            max_sequence_length=max_sequence_length,
+            total_sequence_length=total_sequence_length,
+            n_images=n_images,
+            **kwargs,
         )
-        res = dict(inputs=inputs, dynamic_shapes=shapes)
+
     if add_second_input:
         assert (
             add_second_input > 0
@@ -384,7 +435,7 @@ def get_inputs(
             width=width,
             height=height,
             num_channels=num_channels,
-            batch_size=batch_size + 1,
+            batch_size=3,
             sequence_length=0,
             max_sequence_length=0,
             total_sequence_length=0,
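
The switch from `batch_size + 1` to a fixed `3` follows from the signature change above: `batch_size` in `get_inputs` now defaults to `None`, and incrementing `None` raises a `TypeError`. A tiny illustration (not from the commit):

```python
batch_size = None  # the new default in get_inputs

try:
    second_batch = batch_size + 1   # the old expression
except TypeError:
    second_batch = 3                # the new fixed value

# A constant still differs from the default batch sizes, which is what the
# second input set needs in order to exercise the dynamic batch dimension.
assert second_batch == 3
```
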
@@ -431,9 +482,6 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
         text_config = False
         check_hasattr(config.vision_config, ("num_channels", "in_chans", "in_channels"))
     kwargs = dict(
-        sequence_length=281,
-        max_sequence_length=580,
-        total_sequence_length=860,
         head_dim=(
             16
             if config is None

onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py

Lines changed: 36 additions & 0 deletions
@@ -4829,3 +4829,39 @@ def _ccached_microsoft_phi3_mini_128k_instruct():
             "vocab_size": 32064,
         }
     )
+
+
+def _ccached_google_gemma_3_4b_it_like():
+    "google/gemma-3-4b-it"
+    return transformers.Gemma3Config(
+        **{
+            "architectures": ["Gemma3ForConditionalGeneration"],
+            "boi_token_index": 255999,
+            "eoi_token_index": 256000,
+            "eos_token_id": [1, 106],
+            "image_token_index": 262144,
+            "initializer_range": 0.02,
+            "mm_tokens_per_image": 256,
+            "model_type": "gemma3",
+            "text_config": {
+                "hidden_size": 2560,
+                "intermediate_size": 10240,
+                "model_type": "gemma3_text",
+                "num_hidden_layers": 34,
+                "rope_scaling": {"factor": 8.0, "rope_type": "linear"},
+                "sliding_window": 1024,
+            },
+            "torch_dtype": "bfloat16",
+            "transformers_version": "4.50.0.dev0",
+            "vision_config": {
+                "hidden_size": 1152,
+                "image_size": 896,
+                "intermediate_size": 4304,
+                "model_type": "siglip_vision_model",
+                "num_attention_heads": 16,
+                "num_hidden_layers": 27,
+                "patch_size": 14,
+                "vision_use_head": False,
+            },
+        }
+    )
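
Like the other `_ccached_*` helpers in this file, the new entry rebuilds a realistic configuration locally, so code that only needs the config (dummy inputs, shape inference, and so on) does not have to download `google/gemma-3-4b-it` from the Hub. A short usage sketch (an assumption about how the helper is consumed; the attribute reads match the fields defined above):

```python
from onnx_diagnostic.torch_models.hghub.hub_data_cached_configs import (
    _ccached_google_gemma_3_4b_it_like,
)

cfg = _ccached_google_gemma_3_4b_it_like()
print(cfg.model_type)                     # "gemma3"
print(cfg.image_token_index)              # 262144
print(cfg.text_config.num_hidden_layers)  # 34
print(cfg.vision_config.image_size)       # 896
```
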
