
Commit bddb1cc

Author: ssjia
Update on "[ET-VK] Miscellaneous fixes"
Collecting fixes for various models/ops in this diff/PR. They have all been squashed into this single change to make it easier to cherry-pick.

# Fixes

## Wav2Letter

Type: Output correctness failure

This is caused by a bug in SwiftShader and is not reproducible on any other platform. Specifically, the issue is in the softmax shader; the exact cause is unknown, but it is related to the use of shared memory within shaders. The workaround is to use separate shared memory arrays for the shared max and the shared sum.

## ConvNeXT

Type: Exception during runtime

This is caused by an incompatible memory layout being used for mean2d. More technically, the packed dimension of the tensor cannot be one of the dims being reduced. The current operator registry system did not have a way to select valid tensor representations based on the actual arguments of an op. To fix this, we introduce a mechanism for ops to specify valid representations once a node's arguments are known (see the sketch after this message). Once the model is exported with a supported memory layout, the model test passes.

## Inception_V3/ViT

Type: Exception during runtime

The root cause was an interaction between the fuse batch norm pass and how `vulkan_preprocess.py` was applying passes. Essentially, the fuse batch norm pass creates a new param node for the fused weight, but after the pass is applied `_copy_module` is used to copy the transformed graph back into the ExportedProgram. However, it seems that `_copy_module` lowercases the node names without updating the exported program's graph signature, so subsequent passes could not recognize the weight tensor of convolution nodes as a constant/parameter node. The solution was to migrate `vulkan_preprocess.py` to the `_transform()` API instead of `_copy_module`.

## DenseNet 161 (w/ dynamic shapes)

Type: Output mismatch

Cause: the native_batch_norm op doesn't support dynamic shapes, but the backend test runner doesn't set the correct compile option to filter out ops without dynamic shape support.

Differential Revision: [D83703496](https://our.internmc.facebook.com/intern/diff/D83703496/)

[ghstack-poisoned]
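The ConvNeXT fix above hinges on matching an op's memory layout to its reduction dims. Below is a minimal, hypothetical Python sketch of that idea; the layout names, the "packed dim counted from the innermost axis" convention, and the function name are illustrative assumptions, not the actual ET-VK registry API.

```python
# Hypothetical sketch: keep only texture layouts whose packed dim is NOT reduced over.
# Layout names and packed-dim convention are assumptions for illustration only.
LAYOUT_PACKED_DIM_FROM_INNER = {
    "WIDTH_PACKED": 0,
    "HEIGHT_PACKED": 1,
    "CHANNELS_PACKED": 2,
}


def valid_layouts_for_reduction(ndim: int, reduce_dims: list[int]) -> list[str]:
    """Return layouts whose packed dimension is not one of the reduced dims."""
    normalized = {d % ndim for d in reduce_dims}  # handle negative dims like -1, -2
    valid = []
    for layout, packed_from_inner in LAYOUT_PACKED_DIM_FROM_INNER.items():
        packed_dim = ndim - 1 - packed_from_inner
        if packed_dim not in normalized:
            valid.append(layout)
    return valid


if __name__ == "__main__":
    # mean2d over H and W of an NCHW tensor: only a channels-packed layout keeps
    # the packed dim outside the reduction, mirroring the ConvNeXT fix above.
    print(valid_layouts_for_reduction(4, [-1, -2]))  # ['CHANNELS_PACKED']
```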
2 parents: 449035d + 6c7e363 · commit bddb1cc

File tree: 21 files changed (+154, -35 lines)


.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -970,7 +970,7 @@ jobs:
   PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build

   # Test models serially
-  models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4 convnext_small vit"
+  models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4 vit"
   for model in $models; do
     python -m examples.vulkan.export --model_name=$model --test
   done

backends/arm/operators/op_bmm.py

Lines changed: 23 additions & 0 deletions
@@ -79,6 +79,12 @@ def define_node(
             input1_zp = input_qparams[1].get_zp_per_tensor()
             bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
             bmm_output_name = bmm_result.name
+        elif inputs[0].dtype == ts.DType.INT16:
+            input_qparams = get_input_qparams(node)
+            input0_zp = input_qparams[0].get_zp_per_tensor()
+            input1_zp = input_qparams[1].get_zp_per_tensor()
+            bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT48)
+            bmm_output_name = bmm_result.name
         else:
             bmm_output_name = output.name
             input0_zp, input1_zp = 0, 0
@@ -118,3 +124,20 @@ def define_node(
                 output_zp=[output_qparams.get_zp_per_tensor()],
                 rounding_mode=RoundingMode.SINGLE_ROUND,
             )
+        elif output.dtype == ts.DType.INT16:
+            output_qparams = get_output_qparams(node)[0]
+            final_output_scale = (
+                input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor()  # type: ignore[possibly-undefined]  # pyre-ignore[61]
+            ) / output_qparams.get_scale_per_tensor()
+
+            build_rescale(
+                tosa_fb=tosa_graph,
+                scale=[final_output_scale],
+                # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
+                input_node=bmm_result,  # type: ignore[possibly-undefined]
+                output_name=output.name,
+                output_type=ts.DType.INT16,
+                input_zp=[0],
+                output_zp=[output_qparams.get_zp_per_tensor()],
+                rounding_mode=RoundingMode.SINGLE_ROUND,
+            )

backends/arm/test/ops/test_addmm.py

Lines changed: 0 additions & 6 deletions
@@ -213,9 +213,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False):


 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
-    reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979"
-)
 def test_addmm_16a8w_tosa_INT(test_data: input_t1):
     """Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -268,9 +265,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1):

 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations"
-)
 def test_addmm_16a8w_u85_INT16(test_data: input_t1):
     """Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False

backends/cadence/aot/replace_ops.py

Lines changed: 2 additions & 2 deletions
@@ -89,10 +89,10 @@ def replace_logical_nop_where_with_where(

         # Get the third arg node and its input
         logical_not_node = node.args[0]
-        logical_not_input_tensor = logical_not_node.args[0].to_tensor()
+        logical_not_input_node = logical_not_node.args[0]

         # If the logical_not input is not a boolean tensor, bail.
-        if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
+        if logical_not_input_node.meta["val"].dtype != torch.bool:
             continue

         # Replace the where op with another one, flipping the inputs and using the boolean

backends/cuda/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ runtime.python_library(
     name = "cuda_backend",
     srcs = [
         "cuda_backend.py",
+        "replace_slice_copy_with_slice.py",
     ],
     visibility = [
         "//executorch/...",

backends/cuda/cuda_backend.py

Lines changed: 3 additions & 1 deletion
@@ -144,7 +144,9 @@ def preprocess(
        }

        with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
-            [SDPBackend.MATH]
+            [
+                SDPBackend.MATH  # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
+            ]
        ), torch.no_grad():
            # torch._logging.set_logs(post_grad_graphs=True)
            so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]

backends/cuda/replace_slice_copy_with_slice.py

Lines changed: 8 additions & 5 deletions
@@ -6,20 +6,23 @@

 # pyre-strict

-from typing import Iterable
+from typing import Dict, Iterable, Tuple

 import torch
 from executorch.exir.dialects._ops import ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch import fx


-_SLICE_COPY_TARGETS = (
+_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = (
     torch.ops.aten.slice_copy.Tensor,
     ops.edge.aten.slice_copy.Tensor,
 )

-_SLICE_TARGETS = {
+_SLICE_TARGETS: Dict[
+    torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload
+] = {
     torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
     ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
 }
@@ -99,8 +102,8 @@ def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
         return False

     def _argument_mutates(
-        self, schema: torch._C.FunctionSchema, key
-    ) -> bool:  # pyre-ignore[11]
+        self, schema: torch._C.FunctionSchema, key: int | str
+    ) -> bool:
         arguments = schema.arguments
         if isinstance(key, int):
             if key >= len(arguments):

backends/cuda/tests/test_cuda_export.py

Lines changed: 4 additions & 1 deletion
@@ -8,6 +8,7 @@
 from typing import Tuple

 import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
 from torch.export import export
@@ -30,7 +31,9 @@ def _export_to_cuda_with_lower(
         exported_program = export(module, inputs, strict=True)

         # Create partitioner and compile specs
-        partitioner = CudaPartitioner([])
+        partitioner = CudaPartitioner(
+            [CudaBackend.generate_method_name_compile_spec("forward")]
+        )

         # Use to_edge_transform_and_lower for complete pipeline
         edge_program_manager = to_edge_transform_and_lower(

backends/qualcomm/quantizer/qconfig.py

Lines changed: 5 additions & 5 deletions
@@ -200,7 +200,7 @@ def get_16a8w_qnn_qat_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-20}
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
@@ -398,7 +398,7 @@ def get_ptq_per_block_quant_config(
 def get_8a8w_qnn_qat_config(
     act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.uint8,
         qscheme=(
             torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
@@ -458,7 +458,7 @@ def get_8a8w_qnn_qat_config(
 def get_16a4w_qnn_qat_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
@@ -541,7 +541,7 @@ def get_qat_per_channel_quant_config(
         # If zero_point is 128, htp can do optimizations.
         # If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
         # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
-        act_fake_quant_ctr = FakeQuantize.with_args(
+        act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
            qscheme=torch.per_tensor_symmetric,
            observer=act_observer,
@@ -553,7 +553,7 @@ def get_qat_per_channel_quant_config(
            observer_or_fake_quant_ctr=act_fake_quant_ctr,
         )
     else:
-        act_fake_quant_ctr = FakeQuantize.with_args(
+        act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
            dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
            quant_min=torch.iinfo(act_dtype).min,
            quant_max=torch.iinfo(act_dtype).max,

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 4 additions & 0 deletions
@@ -230,6 +230,10 @@ def get_arg_tensor_source_repset(
     """
     arg_node = op_node.args[arg_i]

+    # For non-tensor arguments, return ANY_STORAGE
+    if not utils.is_tensor_arg_node(arg_node):
+        return utils.ANY_STORAGE
+
     # Special case for cat - use the first tensor in the list as representative
     if isinstance(arg_node, list):
         arg_node = arg_node[0]
