
Commit 6c7e363

Author: ssjia (committed)
Update base for Update on "[ET-VK] Miscellaneous fixes"
Collecting fixes for various models/ops in this diff/PR. They have all been squashed into this single change to make it easier to cherry-pick.

# Fixes

## Wav2Letter

Type: Output correctness failure

This is caused by a bug in SwiftShader and is not reproducible on any other platform. Specifically, the issue is in the softmax shader; the exact cause is unknown, but it is related to using shared memory within shaders. The workaround is to use separate shared memory arrays for the shared max and the shared sum.

## ConvNeXT

Type: Exception during runtime

This is caused by an incompatible memory layout being used for mean2d. More technically, the packed dimension of the tensor cannot be one of the dims being reduced. The current operator registry system did not have a way to select valid tensor representations based on the actual arguments of an op. To fix this, we introduce a mechanism for ops to specify valid representations once a node's arguments are known. Once the model is exported with a supported memory layout, the model test passes.

## Inception_V3/ViT

Type: Exception during runtime

The root cause was an interaction between the fuse batch norm pass and how `vulkan_preprocess.py` was applying passes. Essentially, the fuse batch norm pass creates a new param node for the fused weight, but after the pass is applied `_copy_module` is used to copy the transformed graph back into the ExportedProgram. However, it seems that `_copy_module` lowercases the node names without updating the exported program's graph signature. As a result, subsequent passes couldn't recognize the weight tensor of convolution nodes as a constant/parameter node. The solution was to migrate `vulkan_preprocess.py` to use the `_transform()` API instead of `_copy_module`.

## DenseNet 161 (w/ dynamic shapes)

Type: Output mismatch

Cause: the native_batch_norm op doesn't support dynamic shapes, but the backend test runner doesn't set the compile option that filters out ops without dynamic shape support.

Differential Revision: [D83703496](https://our.internmc.facebook.com/intern/diff/D83703496/)

[ghstack-poisoned]
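To make the ConvNeXT fix more concrete, the sketch below illustrates the kind of argument-aware check the new mechanism enables: given the dims a reduction op such as mean2d will reduce over, any representation whose packed dimension is among the reduced dims is rejected. This is a minimal, hypothetical Python sketch; the layout names and the `valid_reduction_layouts` helper are illustrative and are not the actual ExecuTorch Vulkan registry API.

```python
# Hypothetical sketch: choosing memory layouts that are valid for a reduction
# op such as mean2d. The constraint from the fix is that the packed dimension
# of the tensor representation must not be one of the dims being reduced.

from typing import List

# Illustrative layout constants: which dim index is "packed" in each layout.
PACKED_DIM_BY_LAYOUT = {
    "WIDTH_PACKED": -1,     # innermost (width) dim is packed
    "HEIGHT_PACKED": -2,    # height dim is packed
    "CHANNELS_PACKED": -3,  # channels dim is packed
}


def valid_reduction_layouts(ndim: int, reduce_dims: List[int]) -> List[str]:
    """Return the layouts whose packed dim is not being reduced."""
    # Normalize negative dims (e.g. -1) to positive indices.
    normalized = {d % ndim for d in reduce_dims}
    return [
        layout
        for layout, packed_dim in PACKED_DIM_BY_LAYOUT.items()
        if packed_dim % ndim not in normalized
    ]


if __name__ == "__main__":
    # mean2d over the spatial dims (H, W) of an NCHW tensor: only a
    # channels-packed representation keeps the packed dim out of the reduction.
    print(valid_reduction_layouts(ndim=4, reduce_dims=[-1, -2]))
    # -> ['CHANNELS_PACKED']
```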
2 parents (f0480bb, 70ea661) · commit 6c7e363

File tree

16 files changed · +122 −32 lines changed

backends/arm/operators/op_bmm.py

Lines changed: 23 additions & 0 deletions
@@ -79,6 +79,12 @@ def define_node(
             input1_zp = input_qparams[1].get_zp_per_tensor()
             bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
             bmm_output_name = bmm_result.name
+        elif inputs[0].dtype == ts.DType.INT16:
+            input_qparams = get_input_qparams(node)
+            input0_zp = input_qparams[0].get_zp_per_tensor()
+            input1_zp = input_qparams[1].get_zp_per_tensor()
+            bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT48)
+            bmm_output_name = bmm_result.name
         else:
             bmm_output_name = output.name
             input0_zp, input1_zp = 0, 0
@@ -118,3 +124,20 @@ def define_node(
                 output_zp=[output_qparams.get_zp_per_tensor()],
                 rounding_mode=RoundingMode.SINGLE_ROUND,
             )
+        elif output.dtype == ts.DType.INT16:
+            output_qparams = get_output_qparams(node)[0]
+            final_output_scale = (
+                input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor()  # type: ignore[possibly-undefined]  # pyre-ignore[61]
+            ) / output_qparams.get_scale_per_tensor()
+
+            build_rescale(
+                tosa_fb=tosa_graph,
+                scale=[final_output_scale],
+                # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
+                input_node=bmm_result,  # type: ignore[possibly-undefined]
+                output_name=output.name,
+                output_type=ts.DType.INT16,
+                input_zp=[0],
+                output_zp=[output_qparams.get_zp_per_tensor()],
+                rounding_mode=RoundingMode.SINGLE_ROUND,
+            )
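For readers skimming the diff: the `final_output_scale` computed in the INT16 path is the usual requantization factor for quantized matmul, since the wide INT48 accumulator implicitly carries the product of the two input scales. A tiny numeric illustration (the scale values below are made up, not taken from the commit):

```python
# Illustrative only: how the combined rescale factor for a quantized
# matmul/bmm falls out. With real_x ≈ s_x * q_x, the accumulator holds sums
# of (q_a * q_b) terms whose real value is s_a * s_b * acc, so mapping it to
# the output quantization requires multiplying by (s_a * s_b) / s_out.

s_a, s_b, s_out = 0.02, 0.01, 0.05  # made-up per-tensor scales

final_output_scale = (s_a * s_b) / s_out
print(final_output_scale)  # ≈ 0.004: each accumulator unit is about 0.004 output units
```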

backends/arm/test/ops/test_addmm.py

Lines changed: 0 additions & 6 deletions
@@ -213,9 +213,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
-    reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979"
-)
 def test_addmm_16a8w_tosa_INT(test_data: input_t1):
     """Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -268,9 +265,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations"
-)
 def test_addmm_16a8w_u85_INT16(test_data: input_t1):
     """Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False

backends/cadence/aot/replace_ops.py

Lines changed: 2 additions & 2 deletions
@@ -89,10 +89,10 @@ def replace_logical_nop_where_with_where(
 
             # Get the third arg node and its input
             logical_not_node = node.args[0]
-            logical_not_input_tensor = logical_not_node.args[0].to_tensor()
+            logical_not_input_node = logical_not_node.args[0]
 
             # If the logical_not input is not a boolean tensor, bail.
-            if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
+            if logical_not_input_node.meta["val"].dtype != torch.bool:
                 continue
 
             # Replace the where op with another one, flipping the inputs and using the boolean
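As a side note on the change above: in export-style FX graphs each node's `meta["val"]` typically carries a FakeTensor with the node's dtype and shape, which is why the pass can check the dtype without materializing a tensor. A minimal, hypothetical sketch of the same pattern (not the Cadence pass itself; assumes a recent PyTorch with `torch.export` available):

```python
# Minimal sketch (not the actual Cadence pass): reading dtype information
# from node.meta["val"] in an exported FX graph.
import torch
from torch.export import export


class Gate(torch.nn.Module):
    def forward(self, cond: torch.Tensor, x: torch.Tensor, y: torch.Tensor):
        return torch.where(torch.logical_not(cond), x, y)


ep = export(Gate(), (torch.zeros(4, dtype=torch.bool), torch.ones(4), torch.zeros(4)))
for node in ep.graph.nodes:
    if node.op == "call_function" and "val" in node.meta:
        # meta["val"] is a FakeTensor carrying dtype/shape metadata.
        print(node.target, node.meta["val"].dtype)
```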

backends/cuda/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ runtime.python_library(
     name = "cuda_backend",
     srcs = [
         "cuda_backend.py",
+        "replace_slice_copy_with_slice.py",
     ],
     visibility = [
         "//executorch/...",

backends/cuda/cuda_backend.py

Lines changed: 3 additions & 1 deletion
@@ -144,7 +144,9 @@ def preprocess(
         }
 
         with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
-            [SDPBackend.MATH]
+            [
+                SDPBackend.MATH  # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
+            ]
         ), torch.no_grad():
             # torch._logging.set_logs(post_grad_graphs=True)
             so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
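For readers unfamiliar with the context manager used above: `torch.nn.attention.sdpa_kernel` restricts which scaled-dot-product-attention backends are eligible inside the `with` block, and the preprocess step pins it to the math backend while compiling. A small standalone sketch (assumes a PyTorch version where this API is available, roughly 2.3+):

```python
# Standalone sketch: forcing the math SDPA backend, similar to what the AOTI
# preprocess step above does during compilation.
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

q = k = v = torch.randn(1, 2, 8, 16)

with sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 2, 8, 16])
```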

backends/cuda/replace_slice_copy_with_slice.py

Lines changed: 8 additions & 5 deletions
@@ -6,20 +6,23 @@
 
 # pyre-strict
 
-from typing import Iterable
+from typing import Dict, Iterable, Tuple
 
 import torch
 from executorch.exir.dialects._ops import ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch import fx
 
 
-_SLICE_COPY_TARGETS = (
+_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = (
     torch.ops.aten.slice_copy.Tensor,
     ops.edge.aten.slice_copy.Tensor,
 )
 
-_SLICE_TARGETS = {
+_SLICE_TARGETS: Dict[
+    torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload
+] = {
     torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
     ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
 }
@@ -99,8 +102,8 @@ def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
         return False
 
     def _argument_mutates(
-        self, schema: torch._C.FunctionSchema, key
-    ) -> bool:  # pyre-ignore[11]
+        self, schema: torch._C.FunctionSchema, key: int | str
+    ) -> bool:
         arguments = schema.arguments
         if isinstance(key, int):
             if key >= len(arguments):

backends/cuda/tests/test_cuda_export.py

Lines changed: 4 additions & 1 deletion
@@ -8,6 +8,7 @@
 from typing import Tuple
 
 import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
 from torch.export import export
@@ -30,7 +31,9 @@ def _export_to_cuda_with_lower(
         exported_program = export(module, inputs, strict=True)
 
         # Create partitioner and compile specs
-        partitioner = CudaPartitioner([])
+        partitioner = CudaPartitioner(
+            [CudaBackend.generate_method_name_compile_spec("forward")]
+        )
 
         # Use to_edge_transform_and_lower for complete pipeline
         edge_program_manager = to_edge_transform_and_lower(

backends/qualcomm/quantizer/qconfig.py

Lines changed: 5 additions & 5 deletions
@@ -200,7 +200,7 @@ def get_16a8w_qnn_qat_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-20}
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
@@ -398,7 +398,7 @@ def get_ptq_per_block_quant_config(
 def get_8a8w_qnn_qat_config(
     act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.uint8,
         qscheme=(
             torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
@@ -458,7 +458,7 @@ def get_8a8w_qnn_qat_config(
 def get_16a4w_qnn_qat_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
@@ -541,7 +541,7 @@ def get_qat_per_channel_quant_config(
         # If zero_point is 128, htp can do optimizations.
         # If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
        # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
-        act_fake_quant_ctr = FakeQuantize.with_args(
+        act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
            qscheme=torch.per_tensor_symmetric,
            observer=act_observer,
@@ -553,7 +553,7 @@ def get_qat_per_channel_quant_config(
             observer_or_fake_quant_ctr=act_fake_quant_ctr,
         )
     else:
-        act_fake_quant_ctr = FakeQuantize.with_args(
+        act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
            quant_min=torch.iinfo(act_dtype).min,
            quant_max=torch.iinfo(act_dtype).max,
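For context on the swap above: `FusedMovingAvgObsFakeQuantize` (from `torch.ao.quantization.fake_quantize`) fuses the moving-average observer with the fake-quant step, which is the usual choice for QAT. A minimal standalone sketch of building such a constructor partial with `with_args` and applying it to a tensor; the parameter values are illustrative and are not the Qualcomm/QNN config:

```python
# Illustrative sketch (not the QNN config itself): building a fused
# observer/fake-quant constructor with with_args and using it on a tensor.
import torch
from torch.ao.quantization.fake_quantize import FusedMovingAvgObsFakeQuantize
from torch.ao.quantization.observer import MovingAverageMinMaxObserver

act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
    observer=MovingAverageMinMaxObserver,
    dtype=torch.quint8,
    quant_min=0,
    quant_max=255,
)

fq = act_fake_quant_ctr()  # instantiate the fused observer/fake-quant module
x = torch.randn(2, 3)
y = fq(x)                  # observes statistics and fake-quantizes in one step
print(fq.scale, fq.zero_point, y.shape)
```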

examples/models/llama/model_args.py

Lines changed: 3 additions & 0 deletions
@@ -63,6 +63,9 @@ class ModelArgs:
     use_sdpa_with_kv_cache_op: bool = (
         False  # Use custom sdpa op that updates kv cache in-place
     )
+    # Device to use for the model: "cpu" or "cuda" (needed for QAT)
+    # Only used for creating Rope parameters
+    device: str = "cpu"
     # Generate logits for all inputs. When it's True, it would take big memory usage
     # at runtime. Enable it only necessary (e.g., use perplexity tools that requires
     # logits for all input tokens.)

examples/models/llama/rope.py

Lines changed: 8 additions & 2 deletions
@@ -138,15 +138,19 @@ def forward(
 # and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L242.
 # Current only support non-long rope.
 def hf_precompute_freqs_cis(
-    dim: int, end: int, theta: float, partial_rotary_factor: float = 1.0
+    dim: int,
+    end: int,
+    theta: float,
+    partial_rotary_factor: float = 1.0,
+    device: Union[str, torch.device] = "cpu",
 ):
     # Partial rotary embeddings.
     dim = int(dim * partial_rotary_factor)
 
     # Short factor scaling.
     freqs = 1.0 / (
         theta
-        ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.int64).float() / dim)
+        ** (torch.arange(0, dim, 2, device=device, dtype=torch.int64).float() / dim)
     )
     # TODO: support long factor scaling.
 
@@ -236,6 +240,7 @@ def __init__(self, params: ModelArgs):
         self.precompute_freqs_cis = partial(
             hf_precompute_freqs_cis,
             partial_rotary_factor=self.params.partial_rotary_factor,
+            device=self.params.device,
         )
         self.apply_rotary_emb = hf_apply_rotary_emb
     else:
@@ -244,6 +249,7 @@ def __init__(self, params: ModelArgs):
             use_scaled=self.params.use_scaled_rope,
             scale_factor=self.params.rope_scale_factor,
             high_freq_factor=self.params.high_freq_factor,
+            device=self.params.device,
         )
         self.apply_rotary_emb = RotaryEmbedding()
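To illustrate why threading `device` through matters for QAT: the RoPE frequency table is built once at model construction, so if it is hard-coded to CPU while the rest of the model lives on CUDA, the precompute either fails or forces device copies. Below is a simplified, hedged sketch of a device-aware precompute in the spirit of the diff; it omits `partial_rotary_factor` handling and is not the `hf_precompute_freqs_cis` implementation itself:

```python
# Simplified sketch of a device-aware RoPE frequency precompute.
from typing import Union

import torch


def precompute_freqs(
    dim: int, end: int, theta: float = 10000.0,
    device: Union[str, torch.device] = "cpu",
) -> torch.Tensor:
    # Inverse frequencies for each pair of rotary dimensions, created directly
    # on the requested device instead of being hard-coded to "cpu".
    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.int64).float() / dim)
    )
    t = torch.arange(end, device=device).float()
    return torch.outer(t, freqs)  # (end, dim // 2) table of rotation angles


angles = precompute_freqs(dim=64, end=128)  # CPU by default
# angles = precompute_freqs(dim=64, end=128, device="cuda")  # e.g. when running QAT on GPU
print(angles.shape)  # torch.Size([128, 32])
```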

0 commit comments