huggingface · metascroy · Jun 24, 2025 · Jun 26, 2025
diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import logging
-from typing import Dict
+from typing import Dict, Optional
 
 import torch
 from packaging.version import parse
@@ -173,25 +173,35 @@ class VisionEncoderExportableModule(torch.nn.Module):
     This module ensures that the exported model is compatible with ExecuTorch.
     """
 
-    def __init__(self, model):
+    def __init__(self, model, model_id: Optional[str] = None):
         super().__init__()
         self.model = model
         self.config = model.config
         # Metadata to be recorded in the pte model file
         self.metadata = save_config_to_constant_methods(model.config, model.generation_config)
 
+        self.model_id = model_id  # export() uses this to look up a hard-coded example-input shape, if any
+
     def forward(self, pixel_values):
         print(f"DEBUG: pixel_values: {pixel_values.shape}")
         print(f"DEBUG: forward: {self.model.method_meta('forward')}")
         return self.model(pixel_values=pixel_values)
 
     def export(self, pixel_values=None) -> Dict[str, ExportedProgram]:
         if pixel_values is None:
-            batch_size = 1
-            num_channels = self.config.num_channels
-            height = self.config.image_size
-            width = self.config.image_size
-            pixel_values = torch.rand(batch_size, num_channels, height, width)
+            model_to_pixel_values_size = {
+                "microsoft/resnet-50": [1, 3, 224, 224],
+            }
+            if self.model_id in model_to_pixel_values_size:
+                # If an explicit shape is provided for this model, use it
+                pixel_values = torch.rand(*model_to_pixel_values_size[self.model_id])
+            else:
+                # If no explicit shape is provided for this model, infer a shape from config
+                batch_size = 1
+                num_channels = self.config.num_channels
+                height = self.config.image_size
+                width = self.config.image_size
+                pixel_values = torch.rand(batch_size, num_channels, height, width)
 
         with torch.no_grad():
             return {

diff --git a/optimum/exporters/executorch/recipes/coreml.py b/optimum/exporters/executorch/recipes/coreml.py
@@ -96,7 +96,7 @@ def _lower_to_executorch(
                 ],
                 compile_config=EdgeCompileConfig(
                     _check_ir_validity=False,
-                    _skip_dim_order=False,
+                    _skip_dim_order=True,
                 ),
                 constant_methods=metadata,
             ).to_executorch(

diff --git a/optimum/exporters/executorch/tasks/image_classification.py b/optimum/exporters/executorch/tasks/image_classification.py
@@ -39,4 +39,4 @@ def load_image_classification_model(model_name_or_path: str, **kwargs) -> Vision
     """
 
     eager_model = AutoModelForImageClassification.from_pretrained(model_name_or_path, **kwargs).to("cpu").eval()
-    return VisionEncoderExportableModule(eager_model)
+    return VisionEncoderExportableModule(eager_model, model_name_or_path)
diff --git a/tests/models/test_modeling_resnet50.py b/tests/models/test_modeling_resnet50.py
@@ -0,0 +1,82 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import subprocess
+import sys
+import tempfile
+import unittest
+
+import pytest
+import torch
+from transformers.testing_utils import slow
+
+from optimum.executorch import ExecuTorchModelForImageClassification
+
+from ..utils import check_close_recursively
+
+
+is_not_macos = sys.platform != "darwin"
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    """Slow integration tests for exporting `microsoft/resnet-50` to ExecuTorch."""
+
+    @slow
+    @pytest.mark.run_slow
+    def test_resnet_export_to_executorch(self):
+        """Export through `optimum-cli` with the XNNPACK recipe and check that `model.pte` is written."""
+        model_id = "microsoft/resnet-50"
+        task = "image-classification"
+        recipe = "xnnpack"
+        with tempfile.TemporaryDirectory() as tempdir:
+            output_dir = os.path.join(tempdir, "executorch")
+            # Pass an argument list (no shell) so paths with spaces or shell metacharacters stay intact.
+            cli_args = ["--model", model_id, "--task", task, "--recipe", recipe, "--output_dir", output_dir]
+            subprocess.run(["optimum-cli", "export", "executorch", *cli_args], check=True)
+            self.assertTrue(os.path.exists(os.path.join(output_dir, "model.pte")))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(is_not_macos, reason="Only runs on MacOS")
+    def test_resnet_image_classification_coreml_fp32_cpu(self):
+        """Check that the CoreML (fp32, CPU-only) outputs match the XNNPACK outputs on a random input."""
+        model_id = "microsoft/resnet-50"
+
+        batch_size = 1
+        num_channels = 3
+        height = 224
+        width = 224
+        pixel_values = torch.rand(batch_size, num_channels, height, width)
+
+        # Test fetching and lowering the model to ExecuTorch
+        import coremltools as ct
+
+        et_model = ExecuTorchModelForImageClassification.from_pretrained(
+            model_id=model_id,
+            recipe="coreml",
+            recipe_kwargs={"compute_precision": ct.precision.FLOAT32, "compute_units": ct.ComputeUnit.CPU_ONLY},
+        )
+        et_output = et_model.forward(pixel_values)
+
+        # Reference (using XNNPACK as reference because eager model currently segfaults in a PyTorch kernel)
+        et_xnnpack = ExecuTorchModelForImageClassification.from_pretrained(
+            model_id=model_id,
+            recipe="xnnpack",
+        )
+        et_xnnpack_output = et_xnnpack.forward(pixel_values)
+
+        # Compare with reference
+        self.assertTrue(check_close_recursively(et_output, et_xnnpack_output))