
Commit cec4576

Merge branch 'pytorch:main' into coreml_backend_recipes
Parents: a757493 + 112a09f

File tree: 143 files changed (+3067, -1194 lines). Only a subset of the changed files is shown below.

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 9 additions & 3 deletions
@@ -262,14 +262,20 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):

     assert torch.allclose(
         eager_output.logits, et_output, atol=1e-02, rtol=1e-02
-    ), "CoreML output does not match eager"
+    ), "Model output does not match eager"


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, required=True)
     parser.add_argument("--recipe", type=str, required=True)
     parser.add_argument("--quantize", action="store_true", help="Enable quantization")
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        required=False,
+        help="When provided, write the pte file to this directory. Otherwise, a temporary directory is created for the test.",
+    )
     args = parser.parse_args()

     model_to_model_id_and_test_function = {
@@ -294,11 +300,11 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):
             f"Unknown model name: {args.model}. Available models: {model_to_model_id_and_test_function.keys()}"
         )

+    model_id, test_fn = model_to_model_id_and_test_function[args.model]
     with tempfile.TemporaryDirectory() as tmp_dir:
-        model_id, test_fn = model_to_model_id_and_test_function[args.model]
         test_fn(
             model_id=model_id,
-            model_dir=tmp_dir,
+            model_dir=tmp_dir if args.model_dir is None else args.model_dir,
             recipe=args.recipe,
             quantize=args.quantize,
         )
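The new --model_dir flag only changes where the exported .pte file lands. A small sketch of the resulting behaviour (the directory name is made up):

import tempfile

args_model_dir = "./exported_ptes"  # stand-in for args.model_dir; None reproduces the old behaviour

with tempfile.TemporaryDirectory() as tmp_dir:
    model_dir = tmp_dir if args_model_dir is None else args_model_dir
    # test_fn writes its .pte under model_dir; with --model_dir set the file
    # survives after the TemporaryDirectory is cleaned up, so the exported
    # program can be inspected or re-run outside the test.
    print(model_dir)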

.ci/scripts/test_model.sh

Lines changed: 4 additions & 1 deletion
@@ -199,6 +199,9 @@ test_model_with_qnn() {
    EXPORT_SCRIPT=albert
  elif [[ "${MODEL_NAME}" == "bert" ]]; then
    EXPORT_SCRIPT=bert
+  elif [[ "${MODEL_NAME}" == "conv_former" ]]; then
+    EXPORT_SCRIPT=conv_former
+    EXTRA_FLAGS="--dataset imagenet-mini/val"
  elif [[ "${MODEL_NAME}" == "cvt" ]]; then
    EXPORT_SCRIPT=cvt
  elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
@@ -238,7 +241,7 @@ test_model_with_qnn() {
    "cvt"|"dit"|"focalnet"|"mobilevit_v2"|"pvt"|"swin")
      SCRIPT_FOLDER=oss_scripts
      ;;
-    "albert"|"bert"|"distilbert"|"roberta"|"efficientnet"|"mobilevit_v1")
+    "albert"|"bert"|"conv_former"|"distilbert"|"roberta"|"efficientnet"|"mobilevit_v1")
      pip install evaluate
      SCRIPT_FOLDER=oss_scripts
      # 16bit models will encounter op validation fail on some operations,

.github/workflows/trunk.yml

Lines changed: 4 additions & 1 deletion
@@ -568,7 +568,7 @@ jobs:
    strategy:
      matrix:
        dtype: [fp32]
-        model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l]
+        model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l, conv_former]
      fail-fast: false
    with:
      runner: linux.2xlarge
@@ -815,6 +815,9 @@ jobs:
          smollm|coreml_fp32_gpu|--quantize,
          llama3|coreml_fp32_gpu|--quantize,
          olmo|coreml_fp32_gpu|--quantize,
+          # roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access
+          bert|coreml_fp32_gpu|--quantize,
+          distilbert|coreml_fp32_gpu|--quantize,
        ]
      fail-fast: false
    with:

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 41 additions & 1 deletion
@@ -8,6 +8,7 @@
 # coremltools than is used by ExecuTorch. Each op registered here should have a link to a PR in coremltools that adds
 # the op to the coremltools library.

+import numpy as np
 import torch as _torch
 from coremltools import _logger
 from coremltools.converters.mil.frontend import _utils
@@ -21,7 +22,6 @@
     transpose,
     unbind,
 )
-
 from coremltools.converters.mil.frontend.torch.torch_op_registry import (
     register_torch_op,
 )
@@ -132,3 +132,43 @@ def dequantize_affine(context, node):
         name=node.name,
     )
     context.add(output, node.name)
+
+
+@register_torch_op(
+    torch_alias=["quant::dequantize_codebook", "quant.dequantize_codebook"],
+    override=False,
+)
+def dequantize_codebook(context, node):
+    inputs = _get_inputs(context, node, expected=[4, 5])
+    codes = inputs[0].val
+    codebook = inputs[1].val
+    nbits = inputs[2].val
+
+    # information in block_size is redundant with codebook.shape
+    block_size = inputs[3].val  # noqa: F841
+
+    assert len(codes.shape) == 2, "Only rank 2 inputs are supported"
+
+    # Assert codebook is as expected. codebook.dim() = codes.dim() + 2
+    assert len(codebook.shape) == 4, "Only rank 4 inputs are supported for codebook"
+    assert codebook.shape[0] == 1, "Only grouped_channel granularity is supported"
+    n_luts = codebook.shape[1]
+    assert (
+        codes.shape[1] % n_luts == 0
+    ), "codes.shape[1] must be divisible by codebook.shape[1]"
+    assert codebook.shape[2] == 2**nbits
+    assert codebook.shape[3] == 1, "Only scalar look up values are supported"
+
+    if len(inputs) > 4:
+        output_dtype = inputs[4].val
+        out_np_dtype = NUM_TO_NUMPY_DTYPE[output_dtype]
+        _logger.warning(
+            f"Core ML ignores output_dtype {out_np_dtype} on torchao.dequantize_affine and instead uses the native precision."
+        )
+
+    output = _utils._construct_constexpr_lut_op(
+        codes.astype(np.int8),
+        codebook,
+        name=node.name,
+    )
+    context.add(output, node.name)
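For orientation, here is a minimal sketch of inputs that satisfy the shape checks in dequantize_codebook above. The sizes are made up for illustration; only the layout (rank-2 codes, rank-4 codebook of shape [1, n_luts, 2**nbits, 1]) comes from the asserts in the diff:

import numpy as np

nbits, n_luts = 2, 4
codes = np.random.randint(0, 2**nbits, size=(8, 16), dtype=np.int8)    # rank-2 table of LUT indices
codebook = np.random.randn(1, n_luts, 2**nbits, 1).astype(np.float16)  # one scalar value per code
assert len(codes.shape) == 2 and len(codebook.shape) == 4
assert codes.shape[1] % n_luts == 0
assert codebook.shape[2] == 2**nbits and codebook.shape[3] == 1

Each contiguous group of codes.shape[1] // n_luts columns presumably shares one lookup table, which is what the "grouped_channel granularity" assert refers to.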

backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm

Lines changed: 8 additions & 6 deletions
@@ -88,17 +88,17 @@
         ET_LOG(Error, "%s: DataType=%d is not supported", ETCoreMLStrings.delegateIdentifier.UTF8String, (int)tensor.scalar_type());
         return std::nullopt;
     }
-
+
     std::vector<ssize_t> strides(tensor.strides().begin(), tensor.strides().end());
     std::vector<size_t> shape(tensor.sizes().begin(), tensor.sizes().end());
-
+
     // If tensor is rank 0, wrap in rank 1
     // See https://github.com/apple/coremltools/blob/8.2/coremltools/converters/mil/frontend/torch/exir_utils.py#L73
     if (shape.size() == 0) {
         shape.push_back(1);
         strides.push_back(1);
     }
-
+
     MultiArray::MemoryLayout layout(dataType.value(), std::move(shape), std::move(strides));
     switch (argType) {
         case ArgType::Input: {
@@ -281,9 +281,11 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) {
 }

 namespace {
-auto cls = CoreMLBackendDelegate();
-Backend backend{ETCoreMLStrings.delegateIdentifier.UTF8String, &cls};
-static auto success_with_compiler = register_backend(backend);
+#ifndef LAZY_LOAD_IOS_PYTORCH_INITIALIZER
+auto cls = CoreMLBackendDelegate();
+Backend backend{ETCoreMLStrings.delegateIdentifier.UTF8String, &cls};
+static auto success_with_compiler = register_backend(backend);
+#endif
 }

 } // namespace coreml
New file (path not shown in this view)

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace executorch::core_ml_backend_delegate {
+void register_backend_coreml();
+} // namespace executorch::core_ml_backend_delegate
New file (path not shown in this view)

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "executorch_operations.h"
+#import <coreml_backend/delegate.h>
+#import "ETCoreMLStrings.h"
+#import "backend_delegate.h"
+
+#import <executorch/runtime/core/evalue.h>
+#import <executorch/runtime/platform/log.h>
+#import <executorch/runtime/backend/interface.h>
+
+#include <array>
+#import <memory>
+
+namespace executorch::core_ml_backend_delegate {
+using executorch::runtime::get_backend_class;
+
+static std::unique_ptr<executorch::backends::coreml::CoreMLBackendDelegate> backendInterfaceLazy_;
+
+void register_backend_coreml() {
+    auto backendInterface = executorch::runtime::get_backend_class(ETCoreMLStrings.delegateIdentifier.UTF8String);
+    if (backendInterface == nullptr) {
+        backendInterfaceLazy_ = std::make_unique<executorch::backends::coreml::CoreMLBackendDelegate>();
+        executorch::runtime::Backend backend{ETCoreMLStrings.delegateIdentifier.UTF8String, backendInterfaceLazy_.get()};
+        std::ignore = register_backend(backend);
+    }
+}
+
+} // namespace executorch::core_ml_backend_delegate

backends/apple/coreml/test/test_torch_ops.py

Lines changed: 60 additions & 0 deletions
@@ -14,6 +14,9 @@

 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph
+
+from torchao.prototype.quantization.codebook_coreml import CodebookWeightOnlyConfig
 from torchao.quantization import IntxWeightOnlyConfig, PerAxis, PerGroup, quantize_


@@ -164,6 +167,61 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
         et_prog = delegated_program.to_executorch()
         self._compare_outputs(et_prog, model, example_inputs)

+    def test_dequantize_codebook_linear(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            CodebookWeightOnlyConfig(dtype=torch.uint2, block_size=[-1, 16]),
+        )
+        ep = torch.export.export(model, example_inputs)
+        assert "torch.ops.quant.dequantize_codebook.default" in ep.graph_module.code
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[self._coreml_partitioner()],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+
+        assert (
+            "executorch.exir.dialects.edge._ops.quant.dequantize_codebook.default"
+            in format_delegated_graph(delegated_program.exported_program().graph_module)
+        )
+
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+
+    def test_dequantize_codebook_embedding(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            CodebookWeightOnlyConfig(dtype=torch.uint3, block_size=[-1, 16]),
+            lambda m, fqn: isinstance(m, torch.nn.Embedding),
+        )
+        ep = torch.export.export(model, example_inputs)
+        assert "torch.ops.quant.dequantize_codebook.default" in ep.graph_module.code
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[self._coreml_partitioner()],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+
+        assert (
+            "executorch.exir.dialects.edge._ops.quant.dequantize_codebook.default"
+            in format_delegated_graph(delegated_program.exported_program().graph_module)
+        )
+
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+

 if __name__ == "__main__":
     test_runner = TestTorchOps()
@@ -172,3 +230,5 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
     test_runner.test_dequantize_affine_c4w_embedding()
     test_runner.test_dequantize_affine_c4w_linear()
     test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
+    test_runner.test_dequantize_codebook_linear()
+    test_runner.test_dequantize_codebook_embedding()
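Outside the test harness, the codebook path exercised above reduces to the torchao and export calls below. A hedged sketch: the toy Linear model and its sizes are made up, and the test class helpers (_get_test_model, _coreml_partitioner, _compare_outputs) are not reproduced here.

import torch
from torchao.prototype.quantization.codebook_coreml import CodebookWeightOnlyConfig
from torchao.quantization import quantize_

# Toy stand-in for the test's _get_test_model(); the feature size of 64 is
# chosen only so that the 16-wide blocks divide it evenly.
model = torch.nn.Sequential(torch.nn.Linear(64, 32))
example_inputs = (torch.randn(1, 64),)

# Replace the Linear weight with uint2 codebook-quantized values.
quantize_(model, CodebookWeightOnlyConfig(dtype=torch.uint2, block_size=[-1, 16]))

# After export, the graph should carry the codebook dequant op that the
# Core ML registration in torch_ops.py lowers to a constexpr LUT.
ep = torch.export.export(model, example_inputs)
assert "dequantize_codebook" in ep.graph_module.code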

backends/arm/arm_backend.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def vgf_compile_spec(
             f"Invalid TOSA version: {tosa_version}"
         )

-    if not ("FP" or "INT" in tosa_profiles):
+    if "FP" not in tosa_profiles and "INT" not in tosa_profiles:
         raise ValueError(
             "Arm backend only supports converter-backend for FP or INT. "
             f"Invalid TOSA profile: {tosa_profiles}"

backends/arm/quantizer/arm_quantizer.py

Lines changed: 29 additions & 3 deletions
@@ -14,7 +14,7 @@
 from __future__ import annotations

 import functools
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 from executorch.backends.arm._passes import ArmPassManager
@@ -218,9 +218,35 @@ def not_module_type_or_name_filter(n: Node) -> bool:

 class TOSAQuantizer(Quantizer):

-    def __init__(self, tosa_spec: TosaSpecification) -> None:
+    def __init__(
+        self, compile_spec_or_tosa_spec: Union[TosaSpecification, List[CompileSpec]]
+    ) -> None:
+
         super().__init__()
-        self.tosa_spec = tosa_spec
+        if isinstance(compile_spec_or_tosa_spec, TosaSpecification):
+            self.tosa_spec = compile_spec_or_tosa_spec
+            self.compile_spec = None
+        elif isinstance(compile_spec_or_tosa_spec, list):
+            self.compile_spec = compile_spec_or_tosa_spec
+            # find entry that is 'tosa_spec'
+            for cs in compile_spec_or_tosa_spec:
+                if cs.key == "tosa_spec":
+                    spec_val = (
+                        cs.value.decode() if isinstance(cs.value, bytes) else cs.value
+                    )
+                    self.tosa_spec = TosaSpecification.create_from_string(spec_val)
+                    break
+            else:
+                raise ValueError(
+                    "compile_spec list did not contain a 'tosa_spec' entry"
+                )
+        else:
+            raise TypeError(
+                f"TOSAQuantizer constructor expects "
+                f"a TosaSpecification or compile_spec list, "
+                f"got {type(compile_spec_or_tosa_spec)}"
+            )
+
         self.global_config: Optional[QuantizationConfig] = None
         self.io_config: Optional[QuantizationConfig] = None
         self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {}
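With this change TOSAQuantizer accepts either a TosaSpecification or a compile-spec list carrying a 'tosa_spec' entry. A minimal sketch of the two constructor forms; the import paths and the TOSA spec string are assumptions for illustration, not taken from this diff:

from executorch.backends.arm.quantizer.arm_quantizer import TOSAQuantizer
from executorch.backends.arm.tosa_specification import TosaSpecification
from executorch.exir.backend.compile_spec_schema import CompileSpec

# 1) Existing form: pass a TosaSpecification directly.
quantizer_a = TOSAQuantizer(TosaSpecification.create_from_string("TOSA-1.0+INT"))

# 2) New form: pass a compile-spec list; the quantizer locates the 'tosa_spec'
#    entry, decodes its bytes value, and parses the spec string itself.
compile_spec = [CompileSpec("tosa_spec", b"TOSA-1.0+INT")]
quantizer_b = TOSAQuantizer(compile_spec)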
