5 changes: 3 additions & 2 deletions .ci/scripts/test_model.sh
@@ -317,8 +317,9 @@ elif [[ "${BACKEND}" == *"xnnpack"* ]]; then
echo "Testing ${MODEL_NAME} with xnnpack..."
WITH_QUANTIZATION=true
WITH_DELEGATION=true
if [[ "$MODEL_NAME" == "mobilebert" ]]; then
# TODO(T197452682)
if [[ "$MODEL_NAME" == "mobilebert" || "$MODEL_NAME" == "albert" ]]; then
# TODO(https://github.com/pytorch/executorch/issues/12341)
# mobilebert, albert incompatible with XNNPACK quantization
WITH_QUANTIZATION=false
fi
test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}"
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -63,7 +63,7 @@ jobs:
contents: read
strategy:
matrix:
-model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe]
+model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe, efficientnet_b4, detr_resnet50, segformer_ade, albert, wav2vec2, clip, sentence_transformers, distilbert_qa, real_esrgan, audio_spectrogram_transformer, roberta_sentiment, depth_anything_v2]
backend: [portable, xnnpack-quantization-delegation]
runner: [linux.arm64.2xlarge]
include:
34 changes: 34 additions & 0 deletions examples/models/__init__.py
@@ -37,6 +37,20 @@ class Model(str, Enum):
EfficientSam = "efficient_sam"
Qwen25 = "qwen2_5"
Phi4Mini = "phi_4_mini"
EfficientNetB4 = "efficientnet_b4"
DetrResNet50 = "detr_resnet50"
SegformerADE = "segformer_ade"
Albert = "albert"
Swin2SR2x = "swin2sr_2x"
TrOCRHandwritten = "trocr_handwritten"
Wav2Vec2 = "wav2vec2"
CLIP = "clip"
SentenceTransformers = "sentence_transformers"
DistilBertQA = "distilbert_qa"
RealESRGAN = "real_esrgan"
AudioSpectrogramTransformer = "audio_spectrogram_transformer"
RobertaSentiment = "roberta_sentiment"
DepthAnythingV2 = "depth_anything_v2"

def __str__(self) -> str:
return self.value
@@ -82,6 +96,26 @@ def __str__(self) -> str:
str(Model.EfficientSam): ("efficient_sam", "EfficientSAM"),
str(Model.Qwen25): ("qwen2_5", "Qwen2_5Model"),
str(Model.Phi4Mini): ("phi_4_mini", "Phi4MiniModel"),
str(Model.EfficientNetB4): ("efficientnet_b4", "EfficientNetB4Model"),
str(Model.DetrResNet50): ("detr_resnet50", "DetrResNet50Model"),
str(Model.SegformerADE): ("segformer_ade", "SegformerADEModel"),
str(Model.Albert): ("albert", "AlbertModelExample"),
str(Model.Swin2SR2x): ("swin2sr_2x", "Swin2SR2xModel"),
str(Model.TrOCRHandwritten): ("trocr_handwritten", "TrOCRHandwrittenModel"),
str(Model.Wav2Vec2): ("wav2vec2", "Wav2Vec2Model"),
str(Model.CLIP): ("clip", "CLIPModel"),
str(Model.SentenceTransformers): (
"sentence_transformers",
"SentenceTransformersModel",
),
str(Model.DistilBertQA): ("distilbert_qa", "DistilBertQAModel"),
str(Model.RealESRGAN): ("real_esrgan", "RealESRGANModel"),
str(Model.AudioSpectrogramTransformer): (
"audio_spectrogram_transformer",
"AudioSpectrogramTransformerModel",
),
str(Model.RobertaSentiment): ("roberta_sentiment", "RobertaSentimentModel"),
str(Model.DepthAnythingV2): ("depth_anything_v2", "DepthAnythingV2Model"),
}

__all__ = [
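A side note, not part of this diff: each entry added to the mapping above is a (module_name, class_name) pair. A minimal sketch, using importlib directly rather than the repo's own factory helper (whose API is not shown here), of how such an entry can be resolved into an eager model and its example inputs:

# Sketch only; the module/class names come from the registry entries above.
import importlib

def load_example_model(module_name: str, class_name: str):
    module = importlib.import_module(f"executorch.examples.models.{module_name}")
    model_cls = getattr(module, class_name)       # an EagerModelBase subclass
    example = model_cls()
    return example.get_eager_model(), example.get_example_inputs()

# e.g. model, inputs = load_example_model("albert", "AlbertModelExample")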
11 changes: 11 additions & 0 deletions examples/models/albert/__init__.py
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import AlbertModelExample

__all__ = [
"AlbertModelExample",
]
30 changes: 30 additions & 0 deletions examples/models/albert/model.py
@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch

from transformers import AlbertModel, AutoTokenizer # @manual

from ..model_base import EagerModelBase


class AlbertModelExample(EagerModelBase):
def __init__(self):
pass

def get_eager_model(self) -> torch.nn.Module:
logging.info("Loading ALBERT model")
# pyre-ignore
model = AlbertModel.from_pretrained("albert-base-v2", return_dict=False)
model.eval()
logging.info("Loaded ALBERT model")
return model

def get_example_inputs(self):
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
return (tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"],)
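A side note, not part of this diff: a minimal sketch of exercising the ALBERT example eagerly, assuming transformers is installed; with return_dict=False the model returns a plain tuple rather than a ModelOutput.

# Sketch only; mirrors how the example wrapper above would be driven.
import torch
from executorch.examples.models.albert import AlbertModelExample

example = AlbertModelExample()
model = example.get_eager_model()
(input_ids,) = example.get_example_inputs()            # tokenized "Hello, my dog is cute"
with torch.no_grad():
    last_hidden_state, pooled_output = model(input_ids)  # return_dict=False -> tuple
print(last_hidden_state.shape)                         # (1, seq_len, 768) for albert-base-v2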
9 changes: 9 additions & 0 deletions examples/models/audio_spectrogram_transformer/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import AudioSpectrogramTransformerModel

__all__ = ["AudioSpectrogramTransformerModel"]
51 changes: 51 additions & 0 deletions examples/models/audio_spectrogram_transformer/model.py
@@ -0,0 +1,51 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import ASTFeatureExtractor, ASTForAudioClassification

from ..model_base import EagerModelBase


class AudioSpectrogramTransformerWrapper(torch.nn.Module):
"""Wrapper for HuggingFace Audio Spectrogram Transformer model to make it torch.export compatible"""

def __init__(self, model_name="MIT/ast-finetuned-audioset-10-10-0.4593"):
super().__init__()
self.model = ASTForAudioClassification.from_pretrained(model_name)
self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
self.model.eval()

def forward(self, input_values):
# Audio classification with AST
with torch.no_grad():
outputs = self.model(input_values)

# Return classification logits
return outputs.logits


class AudioSpectrogramTransformerModel(EagerModelBase):
def __init__(self):
pass

def get_eager_model(self) -> torch.nn.Module:
logging.info("Loading Audio Spectrogram Transformer model from HuggingFace")
model = AudioSpectrogramTransformerWrapper(
"MIT/ast-finetuned-audioset-10-10-0.4593"
)
model.eval()
logging.info("Loaded Audio Spectrogram Transformer model")
return model

def get_example_inputs(self):
# Example inputs for AST
# Audio spectrogram: batch_size=1, time_steps=1024, freq_bins=128
input_values = torch.randn(1, 1024, 128)

return (input_values,)
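A side note, not part of this diff: the fixed (1, 1024, 128) example tensor stands in for what ASTFeatureExtractor produces from raw 16 kHz audio; a hedged sketch of that preprocessing path:

# Sketch only; shapes assume the default AST config (1024 frames x 128 mel bins).
import torch
from transformers import ASTFeatureExtractor

extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
waveform = torch.randn(16000).numpy()                  # 1 second of dummy mono audio at 16 kHz
features = extractor(waveform, sampling_rate=16000, return_tensors="pt")
input_values = features["input_values"]                # shape (1, 1024, 128), matches the example input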
9 changes: 9 additions & 0 deletions examples/models/clip/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import CLIPModel

__all__ = ["CLIPModel"]
58 changes: 58 additions & 0 deletions examples/models/clip/model.py
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
from transformers import CLIPModel as HFCLIPModel, CLIPProcessor

from ..model_base import EagerModelBase


class OpenCLIPWrapper(torch.nn.Module):
"""Wrapper for OpenCLIP model to make it torch.export compatible"""

def __init__(self, model_name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K"):
super().__init__()
self.model = HFCLIPModel.from_pretrained(model_name)
self.processor = CLIPProcessor.from_pretrained(model_name)
self.model.eval()

def forward(self, pixel_values, input_ids, attention_mask):
# Extract image and text features
with torch.no_grad():
outputs = self.model(
pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
return_loss=False,
)

# Return image and text embeddings
return outputs.image_embeds, outputs.text_embeds


class CLIPModel(EagerModelBase):
def __init__(self):
pass

def get_eager_model(self) -> torch.nn.Module:
logging.info("Loading OpenCLIP model from HuggingFace")
model = OpenCLIPWrapper("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
model.eval()
logging.info("Loaded OpenCLIP model")
return model

def get_example_inputs(self):
# Example inputs for CLIP
# Image: batch_size=1, channels=3, height=224, width=224
pixel_values = torch.randn(1, 3, 224, 224)

# Text: batch_size=1, max_length=77 (CLIP's typical context length)
input_ids = torch.randint(0, 49408, (1, 77)) # CLIP vocab size is ~49408
attention_mask = torch.ones(1, 77)

return (pixel_values, input_ids, attention_mask)
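A side note, not part of this diff: the random example tensors above line up with what CLIPProcessor would produce for a real image/text pair; a hedged sketch of that preprocessing:

# Sketch only; the outputs map onto forward(pixel_values, input_ids, attention_mask).
from PIL import Image
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
image = Image.new("RGB", (224, 224))                   # stand-in for a real photo
batch = processor(
    text=["a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding="max_length",
    max_length=77,                                     # CLIP's context length
)
# batch["pixel_values"]: (1, 3, 224, 224); batch["input_ids"], batch["attention_mask"]: (1, 77)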
9 changes: 9 additions & 0 deletions examples/models/depth_anything_v2/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import DepthAnythingV2Model

__all__ = ["DepthAnythingV2Model"]
108 changes: 108 additions & 0 deletions examples/models/depth_anything_v2/model.py
@@ -0,0 +1,108 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.examples.models.model_base import EagerModelBase


class DepthAnythingV2Model(EagerModelBase):
def __init__(self, model_name="depth-anything/Depth-Anything-V2-Small-hf"):
self.model_name = model_name

def _load_model(self):
"""Load the Depth Anything V2 model from HuggingFace"""
try:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
except ImportError:
raise ImportError(
"transformers is required for DepthAnythingV2Model. "
"Install with: pip install transformers"
)

# Load model and processor
self.processor = AutoImageProcessor.from_pretrained(self.model_name)
model = AutoModelForDepthEstimation.from_pretrained(self.model_name)

return model

def get_eager_model(self) -> torch.nn.Module:
return DepthAnythingV2Wrapper(self.model_name)

def get_example_inputs(self):
"""Get example inputs for the model"""
# Standard input size for Depth Anything V2 models
# The model expects images of size (3, 518, 518) based on the processor configuration
return (torch.randn(1, 3, 518, 518),)

def get_dynamic_shapes(self):
"""Dynamic shapes for variable input sizes"""
from torch.export import Dim

batch_size = Dim("batch_size", min=1, max=8)
height = Dim("height", min=224, max=1024)
width = Dim("width", min=224, max=1024)

return ({0: batch_size, 2: height, 3: width},)


class DepthAnythingV2Wrapper(torch.nn.Module):
"""
Wrapper for Depth Anything V2 model that handles preprocessing and provides a clean interface.
"""

def __init__(self, model_name="depth-anything/Depth-Anything-V2-Small-hf"):
super().__init__()
try:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
except ImportError:
raise ImportError(
"transformers is required for DepthAnythingV2Model. "
"Install with: pip install transformers"
)

self.processor = AutoImageProcessor.from_pretrained(model_name)
self.model = AutoModelForDepthEstimation.from_pretrained(model_name)

# Set to evaluation mode
self.model.eval()

def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
"""
Forward pass for depth estimation.

Args:
pixel_values: Input image tensor of shape (batch_size, 3, height, width)
Values should be normalized to [0, 1] range

Returns:
predicted_depth: Depth map tensor of shape (batch_size, height, width)
"""
# The model expects inputs to be preprocessed
# pixel_values should already be properly normalized and sized

# Remove torch.no_grad() for export compatibility
outputs = self.model(pixel_values=pixel_values)
predicted_depth = outputs.predicted_depth

# The model outputs depth in a specific format - we may need to interpolate
# to match the input image size
if predicted_depth.shape[-2:] != pixel_values.shape[-2:]:
predicted_depth = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=pixel_values.shape[-2:],
mode="bilinear",
align_corners=False,
).squeeze(1)

return predicted_depth

def preprocess_image(self, image):
"""
Preprocess a PIL image for the model.
This method is not used in the forward pass but can be helpful for testing.
"""
inputs = self.processor(images=image, return_tensors="pt")
return inputs["pixel_values"]
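A side note, not part of this diff: a minimal sketch of how get_dynamic_shapes() above plugs into torch.export via its standard dynamic_shapes argument; whether the HF model traces cleanly over the full ranges is untested here.

# Sketch only; assumed export workflow, not taken from this PR.
import torch
from executorch.examples.models.depth_anything_v2 import DepthAnythingV2Model

m = DepthAnythingV2Model()
model = m.get_eager_model()                            # DepthAnythingV2Wrapper
example_inputs = m.get_example_inputs()                # ((1, 3, 518, 518) tensor,)
exported = torch.export.export(
    model,
    example_inputs,
    dynamic_shapes=m.get_dynamic_shapes(),             # batch 1-8, height/width 224-1024
)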
11 changes: 11 additions & 0 deletions examples/models/detr_resnet50/__init__.py
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import DetrResNet50Model

__all__ = [
"DetrResNet50Model",
]