5 changes: 3 additions & 2 deletions .ci/scripts/test_model.sh
@@ -317,8 +317,9 @@ elif [[ "${BACKEND}" == *"xnnpack"* ]]; then
echo "Testing ${MODEL_NAME} with xnnpack..."
WITH_QUANTIZATION=true
WITH_DELEGATION=true
if [[ "$MODEL_NAME" == "mobilebert" ]]; then
# TODO(T197452682)
if [[ "$MODEL_NAME" == "mobilebert" || "$MODEL_NAME" == "albert" ]]; then
# TODO(https://github.com/pytorch/executorch/issues/12341)
# mobilebert, albert incompatible with XNNPACK quantization
WITH_QUANTIZATION=false
fi
test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}"
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -63,7 +63,7 @@ jobs:
contents: read
strategy:
matrix:
-model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe]
+model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe, efficientnet_b4, detr_resnet50, segformer_ade, albert, wav2vec2, clip, sentence_transformers, distilbert_qa, real_esrgan, audio_spectrogram_transformer, roberta_sentiment, depth_anything_v2]
backend: [portable, xnnpack-quantization-delegation]
runner: [linux.arm64.2xlarge]
include:
34 changes: 34 additions & 0 deletions examples/models/__init__.py
@@ -37,6 +37,20 @@ class Model(str, Enum):
EfficientSam = "efficient_sam"
Qwen25 = "qwen2_5"
Phi4Mini = "phi_4_mini"
EfficientNetB4 = "efficientnet_b4"
DetrResNet50 = "detr_resnet50"
SegformerADE = "segformer_ade"
Albert = "albert"
Swin2SR2x = "swin2sr_2x"
TrOCRHandwritten = "trocr_handwritten"
Wav2Vec2 = "wav2vec2"
CLIP = "clip"
SentenceTransformers = "sentence_transformers"
DistilBertQA = "distilbert_qa"
RealESRGAN = "real_esrgan"
AudioSpectrogramTransformer = "audio_spectrogram_transformer"
RobertaSentiment = "roberta_sentiment"
DepthAnythingV2 = "depth_anything_v2"

def __str__(self) -> str:
return self.value
@@ -82,6 +96,26 @@ def __str__(self) -> str:
str(Model.EfficientSam): ("efficient_sam", "EfficientSAM"),
str(Model.Qwen25): ("qwen2_5", "Qwen2_5Model"),
str(Model.Phi4Mini): ("phi_4_mini", "Phi4MiniModel"),
str(Model.EfficientNetB4): ("efficientnet_b4", "EfficientNetB4Model"),
str(Model.DetrResNet50): ("detr_resnet50", "DetrResNet50Model"),
str(Model.SegformerADE): ("segformer_ade", "SegformerADEModel"),
str(Model.Albert): ("albert", "AlbertModelExample"),
str(Model.Swin2SR2x): ("swin2sr_2x", "Swin2SR2xModel"),
str(Model.TrOCRHandwritten): ("trocr_handwritten", "TrOCRHandwrittenModel"),
str(Model.Wav2Vec2): ("wav2vec2", "Wav2Vec2Model"),
str(Model.CLIP): ("clip", "CLIPModel"),
str(Model.SentenceTransformers): (
"sentence_transformers",
"SentenceTransformersModel",
),
str(Model.DistilBertQA): ("distilbert_qa", "DistilBertQAModel"),
str(Model.RealESRGAN): ("real_esrgan", "RealESRGANModel"),
str(Model.AudioSpectrogramTransformer): (
"audio_spectrogram_transformer",
"AudioSpectrogramTransformerModel",
),
str(Model.RobertaSentiment): ("roberta_sentiment", "RobertaSentimentModel"),
str(Model.DepthAnythingV2): ("depth_anything_v2", "DepthAnythingV2Model"),
}

__all__ = [
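A side note, not part of this diff: each entry added to the mapping above is a (module_name, class_name) pair. A minimal sketch, using importlib directly rather than the repo's own factory helper (whose API is not shown here), of how such an entry can be resolved into an eager model and its example inputs:

# Sketch only; the module/class names come from the registry entries above.
import importlib

def load_example_model(module_name: str, class_name: str):
    module = importlib.import_module(f"executorch.examples.models.{module_name}")
    model_cls = getattr(module, class_name)       # an EagerModelBase subclass
    example = model_cls()
    return example.get_eager_model(), example.get_example_inputs()

# e.g. model, inputs = load_example_model("albert", "AlbertModelExample")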
11 changes: 11 additions & 0 deletions examples/models/albert/__init__.py
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import AlbertModelExample

__all__ = [
"AlbertModelExample",
]
30 changes: 30 additions & 0 deletions examples/models/albert/model.py
@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch

from transformers import AlbertModel, AutoTokenizer # @manual

from ..model_base import EagerModelBase


class AlbertModelExample(EagerModelBase):
def __init__(self):
pass

def get_eager_model(self) -> torch.nn.Module:
logging.info("Loading ALBERT model")
# pyre-ignore
model = AlbertModel.from_pretrained("albert-base-v2", return_dict=False)
model.eval()
logging.info("Loaded ALBERT model")
return model

def get_example_inputs(self):
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
return (tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"],)
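A side note, not part of this diff: a minimal sketch of exercising the ALBERT example eagerly, assuming transformers is installed; with return_dict=False the model returns a plain tuple rather than a ModelOutput.

# Sketch only; mirrors how the example wrapper above would be driven.
import torch
from executorch.examples.models.albert import AlbertModelExample

example = AlbertModelExample()
model = example.get_eager_model()
(input_ids,) = example.get_example_inputs()            # tokenized "Hello, my dog is cute"
with torch.no_grad():
    last_hidden_state, pooled_output = model(input_ids)  # return_dict=False -> tuple
print(last_hidden_state.shape)                         # (1, seq_len, 768) for albert-base-v2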
9 changes: 9 additions & 0 deletions examples/models/audio_spectrogram_transformer/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import AudioSpectrogramTransformerModel

__all__ = ["AudioSpectrogramTransformerModel"]
51 changes: 51 additions & 0 deletions examples/models/audio_spectrogram_transformer/model.py
@@ -0,0 +1,51 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import ASTFeatureExtractor, ASTForAudioClassification

from ..model_base import EagerModelBase


class AudioSpectrogramTransformerWrapper(torch.nn.Module):
"""Wrapper for HuggingFace Audio Spectrogram Transformer model to make it torch.export compatible"""

def __init__(self, model_name="MIT/ast-finetuned-audioset-10-10-0.4593"):
super().__init__()
self.model = ASTForAudioClassification.from_pretrained(model_name)
self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
self.model.eval()

def forward(self, input_values):
# Audio classification with AST
with torch.no_grad():
outputs = self.model(input_values)

# Return classification logits
return outputs.logits


class AudioSpectrogramTransformerModel(EagerModelBase):
def __init__(self):
pass

def get_eager_model(self) -> torch.nn.Module:
logging.info("Loading Audio Spectrogram Transformer model from HuggingFace")
model = AudioSpectrogramTransformerWrapper(
"MIT/ast-finetuned-audioset-10-10-0.4593"
)
model.eval()
logging.info("Loaded Audio Spectrogram Transformer model")
return model

def get_example_inputs(self):
# Example inputs for AST
# Audio spectrogram: batch_size=1, time_steps=1024, freq_bins=128
input_values = torch.randn(1, 1024, 128)

return (input_values,)
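A side note, not part of this diff: the fixed (1, 1024, 128) example tensor stands in for what ASTFeatureExtractor produces from raw 16 kHz audio; a hedged sketch of that preprocessing path:

# Sketch only; shapes assume the default AST config (1024 frames x 128 mel bins).
import torch
from transformers import ASTFeatureExtractor

extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
waveform = torch.randn(16000).numpy()                  # 1 second of dummy mono audio at 16 kHz
features = extractor(waveform, sampling_rate=16000, return_tensors="pt")
input_values = features["input_values"]                # shape (1, 1024, 128), matches the example input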
9 changes: 9 additions & 0 deletions examples/models/clip/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import CLIPModel

__all__ = ["CLIPModel"]
58 changes: 58 additions & 0 deletions examples/models/clip/model.py
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import CLIPModel as HFCLIPModel, CLIPProcessor

from ..model_base import EagerModelBase


class OpenCLIPWrapper(torch.nn.Module):
"""Wrapper for OpenCLIP model to make it torch.export compatible"""

def __init__(self, model_name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K"):
super().__init__()
self.model = HFCLIPModel.from_pretrained(model_name)
self.processor = CLIPProcessor.from_pretrained(model_name)
self.model.eval()

def forward(self, pixel_values, input_ids, attention_mask):
# Extract image and text features
with torch.no_grad():
outputs = self.model(
pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
return_loss=False,
)

# Return image and text embeddings
return outputs.image_embeds, outputs.text_embeds


class CLIPModel(EagerModelBase):
def __init__(self):
pass

def get_eager_model(self) -> torch.nn.Module:
logging.info("Loading OpenCLIP model from HuggingFace")
model = OpenCLIPWrapper("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
model.eval()
logging.info("Loaded OpenCLIP model")
return model

def get_example_inputs(self):
# Example inputs for CLIP
# Image: batch_size=1, channels=3, height=224, width=224
pixel_values = torch.randn(1, 3, 224, 224)

# Text: batch_size=1, max_length=77 (CLIP's typical context length)
input_ids = torch.randint(0, 49408, (1, 77)) # CLIP vocab size is ~49408
attention_mask = torch.ones(1, 77)

return (pixel_values, input_ids, attention_mask)
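A side note, not part of this diff: the random example tensors above line up with what CLIPProcessor would produce for a real image/text pair; a hedged sketch of that preprocessing:

# Sketch only; the outputs map onto forward(pixel_values, input_ids, attention_mask).
from PIL import Image
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
image = Image.new("RGB", (224, 224))                   # stand-in for a real photo
batch = processor(
    text=["a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding="max_length",
    max_length=77,                                     # CLIP's context length
)
# batch["pixel_values"]: (1, 3, 224, 224); batch["input_ids"], batch["attention_mask"]: (1, 77)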
9 changes: 9 additions & 0 deletions examples/models/depth_anything_v2/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import DepthAnythingV2Model

__all__ = ["DepthAnythingV2Model"]
108 changes: 108 additions & 0 deletions examples/models/depth_anything_v2/model.py
@@ -0,0 +1,108 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.examples.models.model_base import EagerModelBase


class DepthAnythingV2Model(EagerModelBase):
def __init__(self, model_name="depth-anything/Depth-Anything-V2-Small-hf"):
self.model_name = model_name

def _load_model(self):
"""Load the Depth Anything V2 model from HuggingFace"""
try:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
except ImportError:
raise ImportError(
"transformers is required for DepthAnythingV2Model. "
"Install with: pip install transformers"
)

# Load model and processor
self.processor = AutoImageProcessor.from_pretrained(self.model_name)
model = AutoModelForDepthEstimation.from_pretrained(self.model_name)

return model

def get_eager_model(self) -> torch.nn.Module:
return DepthAnythingV2Wrapper(self.model_name)

def get_example_inputs(self):
"""Get example inputs for the model"""
# Standard input size for Depth Anything V2 models
# The model expects images of size (3, 518, 518) based on the processor configuration
return (torch.randn(1, 3, 518, 518),)

def get_dynamic_shapes(self):
"""Dynamic shapes for variable input sizes"""
from torch.export import Dim

batch_size = Dim("batch_size", min=1, max=8)
height = Dim("height", min=224, max=1024)
width = Dim("width", min=224, max=1024)

return ({0: batch_size, 2: height, 3: width},)


class DepthAnythingV2Wrapper(torch.nn.Module):
"""
Wrapper for Depth Anything V2 model that handles preprocessing and provides a clean interface.
"""

def __init__(self, model_name="depth-anything/Depth-Anything-V2-Small-hf"):
super().__init__()
try:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
except ImportError:
raise ImportError(
"transformers is required for DepthAnythingV2Model. "
"Install with: pip install transformers"
)

self.processor = AutoImageProcessor.from_pretrained(model_name)
self.model = AutoModelForDepthEstimation.from_pretrained(model_name)

# Set to evaluation mode
self.model.eval()

def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
"""
Forward pass for depth estimation.

Args:
pixel_values: Input image tensor of shape (batch_size, 3, height, width)
Values should be normalized to [0, 1] range

Returns:
predicted_depth: Depth map tensor of shape (batch_size, height, width)
"""
# The model expects inputs to be preprocessed
# pixel_values should already be properly normalized and sized

# Remove torch.no_grad() for export compatibility
outputs = self.model(pixel_values=pixel_values)
predicted_depth = outputs.predicted_depth

# The model outputs depth in a specific format - we may need to interpolate
# to match the input image size
if predicted_depth.shape[-2:] != pixel_values.shape[-2:]:
predicted_depth = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=pixel_values.shape[-2:],
mode="bilinear",
align_corners=False,
).squeeze(1)

return predicted_depth

def preprocess_image(self, image):
"""
Preprocess a PIL image for the model.
This method is not used in the forward pass but can be helpful for testing.
"""
inputs = self.processor(images=image, return_tensors="pt")
return inputs["pixel_values"]
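A side note, not part of this diff: a minimal sketch of how get_dynamic_shapes() above plugs into torch.export via its standard dynamic_shapes argument; whether the HF model traces cleanly over the full ranges is untested here.

# Sketch only; assumed export workflow, not taken from this PR.
import torch
from executorch.examples.models.depth_anything_v2 import DepthAnythingV2Model

m = DepthAnythingV2Model()
model = m.get_eager_model()                            # DepthAnythingV2Wrapper
example_inputs = m.get_example_inputs()                # ((1, 3, 518, 518) tensor,)
exported = torch.export.export(
    model,
    example_inputs,
    dynamic_shapes=m.get_dynamic_shapes(),             # batch 1-8, height/width 224-1024
)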
11 changes: 11 additions & 0 deletions examples/models/detr_resnet50/__init__.py
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import DetrResNet50Model

__all__ = [
"DetrResNet50Model",
]