pytorch
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 53 additions & 7 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 53 additions & 7 deletions
diff --git a/‎backends/apple/coreml/compiler/torch_ops.py‎
Lines changed: 41 additions & 1 deletion b/‎backends/apple/coreml/compiler/torch_ops.py‎
Lines changed: 41 additions & 1 deletion
diff --git a/‎backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm‎
Lines changed: 8 additions & 6 deletions b/‎backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎backends/apple/coreml/test/test_torch_ops.py‎
Lines changed: 60 additions & 0 deletions b/‎backends/apple/coreml/test/test_torch_ops.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_avg_pool2d.py‎
Lines changed: 12 additions & 2 deletions b/‎backends/arm/_passes/decompose_avg_pool2d.py‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎backends/arm/_passes/decompose_grouped_conv.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/decompose_grouped_conv.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/_passes/fuse_quantized_activation_pass.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/fuse_quantized_activation_pass.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/insert_rescales_pass.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/insert_rescales_pass.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/insert_table_ops.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/insert_table_ops.py‎
Lines changed: 1 addition & 1 deletion
@@ -78,25 +78,71 @@ jobs:
         mkdir -p zephyr_scratch/
         cd zephyr_scratch
         export ZEPHYR_PROJ_ROOT=$(realpath $(pwd))
+        export ARM_FVP_TUTORIALS_ROOT=$ZEPHYR_PROJ_ROOT/zephyr/samples/modules/executorch/arm-fvp-tutorials
 
+        # TODO @Bujji: Should see if this can be moved into the docker image itself
         download_arm_zephyr_sdk
         ./zephyr-sdk-0.16.0/setup.sh -c -t arm-zephyr-eabi
-
         cd $ZEPHYR_PROJ_ROOT
         setup_zephyr_et_module
 
+        # Run setup scripts for Arm FVP and Arm AOT Compilation
         cd $ZEPHYR_PROJ_ROOT/modules/lib/executorch
         install_executorch "--use-pt-pinned-commit"
         .ci/scripts/setup-arm-baremetal-tools.sh --target-toolchain zephyr
         source examples/arm/ethos-u-scratch/setup_path.sh
         source $ZEPHYR_PROJ_ROOT/zephyr/zephyr-env.sh
-        cd $ZEPHYR_PROJ_ROOT/zephyr/samples/modules/executorch/arm/hello_world
-        west build -p always -b mps3/corstone300/fvp
-        FVP_Corstone_SSE-300_Ethos-U55 -a build/zephyr/zephyr.elf -C mps3_board.visualisation.disable-visualisation=1 -C mps3_board.telnetterminal0.start_telnet=0 -C mps3_board.uart0.out_file='sim.out'  -C cpu0.CFGITCMSZ=15 -C cpu0.CFGDTCMSZ=15 --simlimit 120
 
-        grep -qF "Output[0][0]: (float) 2.000000" sim.out
-        exit_status=$? #store 0 if found (success), 1 if not (failure)
-        exit $exit_status
+        # Get the model as PTE
+        python -m examples.arm.aot_arm_compiler \
+            --model_name="${MODEL_NAME}" \
+            --output="${MODEL_NAME}.pte"
+
+        # Generate the C-style header
+        cd $ARM_FVP_TUTORIALS_ROOT
+        python build_model.py \
+            --executorch-root $ZEPHYR_PROJ_ROOT/modules/lib/executorch \
+            --pte-file $ZEPHYR_PROJ_ROOT/modules/lib/executorch/${MODEL_NAME}.pte \
+            --output-path $ARM_FVP_TUTORIALS_ROOT/models/${MODEL_NAME}/src/
+
+        cd $ARM_FVP_TUTORIALS_ROOT/models/${MODEL_NAME}/
+
+        # Build the zephyr elf
+        west build -p always -b mps3/corstone300/fvp -- \
+            -DET_PTE_FILE_PATH_FOR_SELECTIVE_BUILD=$ZEPHYR_PROJ_ROOT/modules/lib/executorch/${MODEL_NAME}.pte
+
+        # Run the simulation
+        FVP_Corstone_SSE-300_Ethos-U55 -a build/zephyr/zephyr.elf \
+            -C mps3_board.visualisation.disable-visualisation=1 \
+            -C mps3_board.telnetterminal0.start_telnet=0 \
+            -C mps3_board.uart0.out_file='sim.out'  \
+            -C cpu0.CFGITCMSZ=15 \
+            -C cpu0.CFGDTCMSZ=15 \
+            --simlimit 120
+
+        # Disable exit on error
+        set +e
+        # Report failure if any of the ouptut verification checks fail
+        # store 0 if found (failure), 1 if not (success)
+        grep -qF "ERROR" sim.out
+        exit_status=$?
+        if [[ "$exit_status" -eq "0" ]]; then
+            cat sim.out
+            set -e
+            exit 1
+        fi
+
+        # Report fail if simulation does not complete successfully
+        # store 0 if found (success), 1 if not (failure)
+        grep -qF "SUCCESS: Program complete, exiting." sim.out
+        exit_status=$?
+        if [[ "$exit_status" -eq "1" ]]; then
+            cat sim.out
+            set -e
+            exit 1
+        fi
+        # Re-enable exit on error
+        set -e
 
   test-models-linux-aarch64:
     name: test-models-linux-aarch64
 
@@ -8,6 +8,7 @@
 # coremltools than is used by ExecuTorch.  Each op registered here should have a link to a PR in coremltools that adds
 # the op to the coremltools library.
 
+import numpy as np
 import torch as _torch
 from coremltools import _logger
 from coremltools.converters.mil.frontend import _utils
@@ -21,7 +22,6 @@
     transpose,
     unbind,
 )
-
 from coremltools.converters.mil.frontend.torch.torch_op_registry import (
     register_torch_op,
 )
@@ -132,3 +132,43 @@ def dequantize_affine(context, node):
         name=node.name,
     )
     context.add(output, node.name)
+
+
+@register_torch_op(
+    torch_alias=["quant::dequantize_codebook", "quant.dequantize_codebook"],
+    override=False,
+)
+def dequantize_codebook(context, node):
+    inputs = _get_inputs(context, node, expected=[4, 5])
+    codes = inputs[0].val
+    codebook = inputs[1].val
+    nbits = inputs[2].val
+
+    # information in block_size is redundant with codebook.shape
+    block_size = inputs[3].val  # noqa: F841
+
+    assert len(codes.shape) == 2, "Only rank 2 inputs are supported"
+
+    # Assert codebook is as expected.  codebook.dim() = codes.dim() + 2
+    assert len(codebook.shape) == 4, "Only rank 4 inputs are supported for codebook"
+    assert codebook.shape[0] == 1, "Only grouped_channel granularity is supported"
+    n_luts = codebook.shape[1]
+    assert (
+        codes.shape[1] % n_luts == 0
+    ), "codes.shape[1] must be divisible by codebook.shape[1]"
+    assert codebook.shape[2] == 2**nbits
+    assert codebook.shape[3] == 1, "Only scalar look up values are supported"
+
+    if len(inputs) > 4:
+        output_dtype = inputs[4].val
+        out_np_dtype = NUM_TO_NUMPY_DTYPE[output_dtype]
+        _logger.warning(
+            f"Core ML ignores output_dtype {out_np_dtype} on torchao.dequantize_affine and instead uses the native precision."
+        )
+
+    output = _utils._construct_constexpr_lut_op(
+        codes.astype(np.int8),
+        codebook,
+        name=node.name,
+    )
+    context.add(output, node.name)
@@ -88,17 +88,17 @@
         ET_LOG(Error, "%s: DataType=%d is not supported", ETCoreMLStrings.delegateIdentifier.UTF8String, (int)tensor.scalar_type());
         return std::nullopt;
     }
-    
+
     std::vector<ssize_t> strides(tensor.strides().begin(), tensor.strides().end());
     std::vector<size_t> shape(tensor.sizes().begin(), tensor.sizes().end());
-    
+
     // If tensor is rank 0, wrap in rank 1
     // See https://github.com/apple/coremltools/blob/8.2/coremltools/converters/mil/frontend/torch/exir_utils.py#L73
     if (shape.size() == 0) {
         shape.push_back(1);
         strides.push_back(1);
     }
-    
+
     MultiArray::MemoryLayout layout(dataType.value(), std::move(shape), std::move(strides));
     switch (argType) {
         case ArgType::Input: {
@@ -281,9 +281,11 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) {
 }
 
 namespace {
-auto cls = CoreMLBackendDelegate();
-Backend backend{ETCoreMLStrings.delegateIdentifier.UTF8String, &cls};
-static auto success_with_compiler = register_backend(backend);
+    #ifndef LAZY_LOAD_IOS_PYTORCH_INITIALIZER
+        auto cls = CoreMLBackendDelegate();
+        Backend backend{ETCoreMLStrings.delegateIdentifier.UTF8String, &cls};
+        static auto success_with_compiler = register_backend(backend);
+    #endif
 }
 
 } // namespace coreml
 
@@ -14,6 +14,9 @@
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph
+
+from torchao.prototype.quantization.codebook_coreml import CodebookWeightOnlyConfig
 from torchao.quantization import IntxWeightOnlyConfig, PerAxis, PerGroup, quantize_
 
 
@@ -164,6 +167,61 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
         et_prog = delegated_program.to_executorch()
         self._compare_outputs(et_prog, model, example_inputs)
 
+    def test_dequantize_codebook_linear(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            CodebookWeightOnlyConfig(dtype=torch.uint2, block_size=[-1, 16]),
+        )
+        ep = torch.export.export(model, example_inputs)
+        assert "torch.ops.quant.dequantize_codebook.default" in ep.graph_module.code
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[self._coreml_partitioner()],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+
+        assert (
+            "executorch.exir.dialects.edge._ops.quant.dequantize_codebook.default"
+            in format_delegated_graph(delegated_program.exported_program().graph_module)
+        )
+
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+
+    def test_dequantize_codebook_embedding(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            CodebookWeightOnlyConfig(dtype=torch.uint3, block_size=[-1, 16]),
+            lambda m, fqn: isinstance(m, torch.nn.Embedding),
+        )
+        ep = torch.export.export(model, example_inputs)
+        assert "torch.ops.quant.dequantize_codebook.default" in ep.graph_module.code
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[self._coreml_partitioner()],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+
+        assert (
+            "executorch.exir.dialects.edge._ops.quant.dequantize_codebook.default"
+            in format_delegated_graph(delegated_program.exported_program().graph_module)
+        )
+
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+
 
 if __name__ == "__main__":
     test_runner = TestTorchOps()
@@ -172,3 +230,5 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
     test_runner.test_dequantize_affine_c4w_embedding()
     test_runner.test_dequantize_affine_c4w_linear()
     test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
+    test_runner.test_dequantize_codebook_linear()
+    test_runner.test_dequantize_codebook_embedding()
@@ -45,7 +45,10 @@ def call_operator(self, op, args, kwargs, meta):
         x = args[0]
         kernel_h, kernel_w = args[1]
         kernel_size = kernel_h * kernel_w
-        stride_h, stride_w = args[2]
+        if len(args) > 2 and args[2] is not None:
+            stride_h, stride_w = args[2]
+        else:
+            stride_h, stride_w = kernel_h, kernel_w
         pad_h, pad_w = new_pad_h, new_pad_w = args[3] if len(args) > 3 else (0, 0)
         ceil_mode = args[4] if len(args) > 4 else False
         count_include_pad = args[5] if len(args) > 5 else True
@@ -108,7 +111,14 @@ def call_operator(self, op, args, kwargs, meta):
             x = super().call_operator(cat_op, (cat_nodes, 2), kwargs, meta)
             new_pad_h = 0
 
-        avgpool_args = (x, args[1], args[2], [new_pad_h, new_pad_w], ceil_mode, False)
+        avgpool_args = (
+            x,
+            args[1],
+            [stride_h, stride_w],
+            [new_pad_h, new_pad_w],
+            ceil_mode,
+            False,
+        )
         x = super().call_operator(avgpool_op, avgpool_args, kwargs, meta)
 
         # Multiply by factor (kernel_size / divisor_override) if divisor_override
 
@@ -6,7 +6,7 @@
 from copy import copy
 
 import torch
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
 
@@ -15,9 +15,9 @@
     get_param_tensor,
     is_param_node,
 )
-from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
+from executorch.backends.arm._passes.quant_args import QuantArgs
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 
@@ -6,8 +6,8 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import Q_OPS
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import Node
 
@@ -9,8 +9,8 @@
 
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch import Tensor
 from torch.fx import GraphModule, Node
 
@@ -10,7 +10,7 @@
 
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.exir import ExportedProgram
 
 from executorch.exir.dialects._ops import ops as exir_ops