
Commit cc598aa

Add next set of GA models
Summary: Add a few more tasks:
1. Image-Text Understanding (OpenCLIP)
2. Semantic Text Search (Sentence Transformers)
3. Document Q&A (DistilBERT QA)
4. Practical Image Enhancement (Real-ESRGAN)
5. Audio Classification (AST)
6. Text Sentiment Analysis (RoBERTa)
7. Depth Estimation (Depth Anything V2)
1 parent cf78305 commit cc598aa
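
For context, a minimal sketch of how one of the new entries could be lowered end-to-end, assuming the standard torch.export / executorch.exir flow applies and that the model traces cleanly; neither is shown in this commit, so treat this as illustrative only:

# Hedged sketch: export the new "clip" entry to an ExecuTorch program.
# The registry class and its example inputs come from this commit; the
# torch.export / executorch.exir calls are standard APIs assumed to apply here.
import torch
from executorch.exir import to_edge
from executorch.examples.models.clip import CLIPModel

entry = CLIPModel()
eager_model = entry.get_eager_model()
example_inputs = entry.get_example_inputs()

exported_program = torch.export.export(eager_model, example_inputs)
executorch_program = to_edge(exported_program).to_executorch()
with open("clip.pte", "wb") as f:
    f.write(executorch_program.buffer)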

File tree

17 files changed: +538, -2 lines


.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ jobs:
       contents: read
     strategy:
       matrix:
-        model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe, efficientnet_b4, detr_resnet50, segformer_ade, albert, wav2vec2]
+        model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe, efficientnet_b4, detr_resnet50, segformer_ade, albert, wav2vec2, clip, sentence_transformers, distilbert_qa, real_esrgan, audio_spectrogram_transformer, roberta_sentiment, depth_anything_v2]
         backend: [portable, xnnpack-quantization-delegation]
         runner: [linux.arm64.2xlarge]
       include:

examples/models/__init__.py

Lines changed: 20 additions & 1 deletion
@@ -41,10 +41,16 @@ class Model(str, Enum):
     DetrResNet50 = "detr_resnet50"
     SegformerADE = "segformer_ade"
     Albert = "albert"
-    BiLSTM = "bilstm"
     Swin2SR2x = "swin2sr_2x"
     TrOCRHandwritten = "trocr_handwritten"
     Wav2Vec2 = "wav2vec2"
+    CLIP = "clip"
+    SentenceTransformers = "sentence_transformers"
+    DistilBertQA = "distilbert_qa"
+    RealESRGAN = "real_esrgan"
+    AudioSpectrogramTransformer = "audio_spectrogram_transformer"
+    RobertaSentiment = "roberta_sentiment"
+    DepthAnythingV2 = "depth_anything_v2"

     def __str__(self) -> str:
         return self.value

@@ -97,6 +103,19 @@ def __str__(self) -> str:
     str(Model.Swin2SR2x): ("swin2sr_2x", "Swin2SR2xModel"),
     str(Model.TrOCRHandwritten): ("trocr_handwritten", "TrOCRHandwrittenModel"),
     str(Model.Wav2Vec2): ("wav2vec2", "Wav2Vec2Model"),
+    str(Model.CLIP): ("clip", "CLIPModel"),
+    str(Model.SentenceTransformers): (
+        "sentence_transformers",
+        "SentenceTransformersModel",
+    ),
+    str(Model.DistilBertQA): ("distilbert_qa", "DistilBertQAModel"),
+    str(Model.RealESRGAN): ("real_esrgan", "RealESRGANModel"),
+    str(Model.AudioSpectrogramTransformer): (
+        "audio_spectrogram_transformer",
+        "AudioSpectrogramTransformerModel",
+    ),
+    str(Model.RobertaSentiment): ("roberta_sentiment", "RobertaSentimentModel"),
+    str(Model.DepthAnythingV2): ("depth_anything_v2", "DepthAnythingV2Model"),
 }

 __all__ = [
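
For reference, a hypothetical sketch of how the (module, class) pairs registered above could be resolved at runtime; the loader function and registry argument are illustrative assumptions, only the tuple shape comes from this diff:

# Hypothetical loader for a registry entry such as ("clip", "CLIPModel").
import importlib

def load_example_model(registry: dict, model_name: str):
    # e.g. registry[str(Model.CLIP)] == ("clip", "CLIPModel")
    module_name, class_name = registry[model_name]
    module = importlib.import_module(f"executorch.examples.models.{module_name}")
    return getattr(module, class_name)()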

examples/models/audio_spectrogram_transformer/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import AudioSpectrogramTransformerModel

__all__ = ["AudioSpectrogramTransformerModel"]

examples/models/audio_spectrogram_transformer/model.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import ASTFeatureExtractor, ASTForAudioClassification

from ..model_base import EagerModelBase


class AudioSpectrogramTransformerWrapper(torch.nn.Module):
    """Wrapper for HuggingFace Audio Spectrogram Transformer model to make it torch.export compatible"""

    def __init__(self, model_name="MIT/ast-finetuned-audioset-10-10-0.4593"):
        super().__init__()
        self.model = ASTForAudioClassification.from_pretrained(model_name)
        self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
        self.model.eval()

    def forward(self, input_values):
        # Audio classification with AST
        with torch.no_grad():
            outputs = self.model(input_values)

        # Return classification logits
        return outputs.logits


class AudioSpectrogramTransformerModel(EagerModelBase):
    def __init__(self):
        pass

    def get_eager_model(self) -> torch.nn.Module:
        logging.info("Loading Audio Spectrogram Transformer model from HuggingFace")
        model = AudioSpectrogramTransformerWrapper(
            "MIT/ast-finetuned-audioset-10-10-0.4593"
        )
        model.eval()
        logging.info("Loaded Audio Spectrogram Transformer model")
        return model

    def get_example_inputs(self):
        # Example inputs for AST
        # Audio spectrogram: batch_size=1, time_steps=1024, freq_bins=128
        input_values = torch.randn(1, 1024, 128)

        return (input_values,)
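
A hedged usage sketch for the wrapper above; the silent 16 kHz waveform and the id2label lookup are illustrative assumptions about typical AST usage, not part of this diff:

# Classify one second of silence with the AST wrapper (illustrative only).
import torch

wrapper = AudioSpectrogramTransformerWrapper()
waveform = torch.zeros(16000).numpy()  # 1 s of audio at 16 kHz
features = wrapper.feature_extractor(
    waveform, sampling_rate=16000, return_tensors="pt"
)
logits = wrapper(features["input_values"])  # shape (1, num_audioset_classes)
predicted_id = int(logits.argmax(dim=-1))
print(wrapper.model.config.id2label[predicted_id])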

examples/models/clip/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import CLIPModel

__all__ = ["CLIPModel"]

examples/models/clip/model.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import CLIPModel as HFCLIPModel, CLIPProcessor

from ..model_base import EagerModelBase


class OpenCLIPWrapper(torch.nn.Module):
    """Wrapper for OpenCLIP model to make it torch.export compatible"""

    def __init__(self, model_name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K"):
        super().__init__()
        self.model = HFCLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

    def forward(self, pixel_values, input_ids, attention_mask):
        # Extract image and text features
        with torch.no_grad():
            outputs = self.model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_loss=False,
            )

        # Return image and text embeddings
        return outputs.image_embeds, outputs.text_embeds


class CLIPModel(EagerModelBase):
    def __init__(self):
        pass

    def get_eager_model(self) -> torch.nn.Module:
        logging.info("Loading OpenCLIP model from HuggingFace")
        model = OpenCLIPWrapper("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
        model.eval()
        logging.info("Loaded OpenCLIP model")
        return model

    def get_example_inputs(self):
        # Example inputs for CLIP
        # Image: batch_size=1, channels=3, height=224, width=224
        pixel_values = torch.randn(1, 3, 224, 224)

        # Text: batch_size=1, max_length=77 (CLIP's typical context length)
        input_ids = torch.randint(0, 49408, (1, 77))  # CLIP vocab size is ~49408
        attention_mask = torch.ones(1, 77)

        return (pixel_values, input_ids, attention_mask)
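
A hedged usage sketch for the wrapper above, scoring captions against an image from the returned embeddings; the image path and candidate captions are placeholders, not part of this diff:

# Score two candidate captions against a local image (illustrative only).
from PIL import Image

wrapper = OpenCLIPWrapper()
image = Image.open("example.jpg")
captions = ["a photo of a cat", "a photo of a dog"]
inputs = wrapper.processor(
    text=captions, images=image, return_tensors="pt", padding=True
)

image_embeds, text_embeds = wrapper(
    inputs["pixel_values"], inputs["input_ids"], inputs["attention_mask"]
)
image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
print(image_embeds @ text_embeds.T)  # cosine similarity per caption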

examples/models/depth_anything_v2/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import DepthAnythingV2Model

__all__ = ["DepthAnythingV2Model"]

examples/models/depth_anything_v2/model.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.examples.models.model_base import EagerModelBase


class DepthAnythingV2Model(EagerModelBase):
    def __init__(self, model_name="depth-anything/Depth-Anything-V2-Small-hf"):
        self.model_name = model_name

    def _load_model(self):
        """Load the Depth Anything V2 model from HuggingFace"""
        try:
            from transformers import AutoImageProcessor, AutoModelForDepthEstimation
        except ImportError:
            raise ImportError(
                "transformers is required for DepthAnythingV2Model. "
                "Install with: pip install transformers"
            )

        # Load model and processor
        self.processor = AutoImageProcessor.from_pretrained(self.model_name)
        model = AutoModelForDepthEstimation.from_pretrained(self.model_name)

        return model

    def get_eager_model(self) -> torch.nn.Module:
        return DepthAnythingV2Wrapper(self.model_name)

    def get_example_inputs(self):
        """Get example inputs for the model"""
        # Standard input size for Depth Anything V2 models
        # The model expects images of size (3, 518, 518) based on the processor configuration
        return (torch.randn(1, 3, 518, 518),)

    def get_dynamic_shapes(self):
        """Dynamic shapes for variable input sizes"""
        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}


class DepthAnythingV2Wrapper(torch.nn.Module):
    """
    Wrapper for Depth Anything V2 model that handles preprocessing and provides a clean interface.
    """

    def __init__(self, model_name="depth-anything/Depth-Anything-V2-Small-hf"):
        super().__init__()
        try:
            from transformers import AutoImageProcessor, AutoModelForDepthEstimation
        except ImportError:
            raise ImportError(
                "transformers is required for DepthAnythingV2Model. "
                "Install with: pip install transformers"
            )

        self.processor = AutoImageProcessor.from_pretrained(model_name)
        self.model = AutoModelForDepthEstimation.from_pretrained(model_name)

        # Set to evaluation mode
        self.model.eval()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for depth estimation.

        Args:
            pixel_values: Input image tensor of shape (batch_size, 3, height, width)
                Values should be normalized to [0, 1] range

        Returns:
            predicted_depth: Depth map tensor of shape (batch_size, height, width)
        """
        # The model expects inputs to be preprocessed
        # pixel_values should already be properly normalized and sized

        with torch.no_grad():
            outputs = self.model(pixel_values=pixel_values)
            predicted_depth = outputs.predicted_depth

        # The model outputs depth in a specific format - we may need to interpolate
        # to match the input image size
        if predicted_depth.shape[-2:] != pixel_values.shape[-2:]:
            predicted_depth = torch.nn.functional.interpolate(
                predicted_depth.unsqueeze(1),
                size=pixel_values.shape[-2:],
                mode="bilinear",
                align_corners=False,
            ).squeeze(1)

        return predicted_depth

    def preprocess_image(self, image):
        """
        Preprocess a PIL image for the model.
        This method is not used in the forward pass but can be helpful for testing.
        """
        inputs = self.processor(images=image, return_tensors="pt")
        return inputs["pixel_values"]
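
A hedged usage sketch for the wrapper above, relying on its own preprocess_image helper; the image path is a placeholder, not part of this diff:

# Estimate a depth map for a local image (illustrative only).
from PIL import Image

wrapper = DepthAnythingV2Wrapper()
image = Image.open("room.jpg")
pixel_values = wrapper.preprocess_image(image)  # (1, 3, H, W), already normalized
depth = wrapper(pixel_values)  # (1, H, W), resized to match the input resolution
print(depth.shape, float(depth.min()), float(depth.max()))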

examples/models/distilbert_qa/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import DistilBertQAModel

__all__ = ["DistilBertQAModel"]

examples/models/distilbert_qa/model.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

from ..model_base import EagerModelBase


class DistilBertQAWrapper(torch.nn.Module):
    """Wrapper for HuggingFace DistilBERT QA model to make it torch.export compatible"""

    def __init__(self, model_name="distilbert-base-cased-distilled-squad"):
        super().__init__()
        self.model = DistilBertForQuestionAnswering.from_pretrained(model_name)
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model.eval()

    def forward(self, input_ids, attention_mask):
        # Get question answering outputs
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Return start and end logits for answer span
        return outputs.start_logits, outputs.end_logits


class DistilBertQAModel(EagerModelBase):
    def __init__(self):
        pass

    def get_eager_model(self) -> torch.nn.Module:
        logging.info("Loading DistilBERT QA model from HuggingFace")
        model = DistilBertQAWrapper("distilbert-base-cased-distilled-squad")
        model.eval()
        logging.info("Loaded DistilBERT QA model")
        return model

    def get_example_inputs(self):
        # Example inputs for DistilBERT QA
        # Combined question and context: batch_size=1, max_length=512
        input_ids = torch.randint(0, 28996, (1, 512))  # DistilBERT vocab size
        attention_mask = torch.ones(1, 512)

        return (input_ids, attention_mask)
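
A hedged usage sketch for the wrapper above, decoding the start/end logits into an answer span; the question and context strings are placeholders, not part of this diff:

# Extract an answer span with the DistilBERT QA wrapper (illustrative only).
import torch

wrapper = DistilBertQAWrapper()
question = "Where is the LICENSE file located?"
context = "The LICENSE file lives in the root directory of this source tree."
inputs = wrapper.tokenizer(question, context, return_tensors="pt")

start_logits, end_logits = wrapper(inputs["input_ids"], inputs["attention_mask"])
start = int(start_logits.argmax(dim=-1))
end = int(end_logits.argmax(dim=-1))
answer_ids = inputs["input_ids"][0, start : end + 1]
print(wrapper.tokenizer.decode(answer_ids))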
