Commit 89e2c5d

SS-JIA and ssjia authored
[ET-VK] Add some utility compile options + improve export script (pytorch#15795)
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):

* pytorch#15829
* pytorch#15796
* __->__ pytorch#15795
* pytorch#15794
* pytorch#15793

Title says it all! Add two additional export options:

1. `skip_memory_planning` - skips the memory planning pass, which can be useful for debugging.
2. `small_texture_limits` - sets the default texture limit to (2048, 2048, 2048), which is compatible with more devices (e.g. desktop/laptop GPUs) than the default (16384, 16384, 2048), which targets mobile GPUs.

Also adds some improvements to the export script that were made while debugging the `YOLO_NAS` model (pytorch#15700).

Differential Revision: [D86910640](https://our.internmc.facebook.com/intern/diff/D86910640/)

---------

Co-authored-by: ssjia <[email protected]>
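For readers who want to try the new options, the sketch below shows one hypothetical way to wire them into a lowering flow. It assumes `VulkanPartitioner` accepts a plain dict of compile options that `parse_compile_spec` later reads in `vulkan_preprocess.py`; the actual export script in this stack may plumb the options differently, and `TinyModel` is invented for illustration.

```python
# Hypothetical usage sketch; the real export script may differ.
import torch
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower
from torch.export import export


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0


compile_options = {
    # Skip the memory planning pass entirely (useful when debugging lowering issues).
    "skip_memory_planning": True,
    # Clamp the default texture limits to (2048, 2048, 2048) for desktop/laptop GPUs.
    "small_texture_limits": True,
}

exported = export(TinyModel(), (torch.randn(1, 16),))
edge = to_edge_transform_and_lower(
    exported, partitioner=[VulkanPartitioner(compile_options)]
)
executorch_program = edge.to_executorch()
```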
1 parent 4415bc6 commit 89e2c5d

3 files changed: +192 −70


backends/vulkan/test/utils.py

Lines changed: 42 additions & 15 deletions
@@ -8,18 +8,14 @@
 import logging
 from collections import OrderedDict
 from copy import deepcopy
-
 from enum import auto, Enum
 from typing import Any, List, Optional, Tuple

 import executorch.backends.vulkan.utils as utils
-
 import torch
-
 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
 from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
-
 from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
     get_symmetric_quantization_config,
     XNNPACKQuantizer,
@@ -36,7 +32,6 @@
 )
 from executorch.extension.pytree import tree_flatten
 from torch.export import export
-
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

@@ -275,16 +270,25 @@ def check_outputs_equal(
             )
             return result
         else:
+            result = True
             for i in range(len(ref_output)):
-                if not torch.allclose(
-                    model_output[i], ref_output[i], atol=atol, rtol=rtol
-                ):
-                    print(f"\n=== Output {i} comparison failed ===")
-                    print_tensor_comparison_errors(
-                        model_output[i], ref_output[i], atol, rtol
-                    )
-                    return False
-            return True
+                if isinstance(ref_output[i], torch.Tensor):
+                    if not torch.allclose(
+                        model_output[i], ref_output[i], atol=atol, rtol=rtol
+                    ):
+                        print(f"\n=== Output {i} comparison failed ===")
+                        print_tensor_comparison_errors(
+                            model_output[i], ref_output[i], atol, rtol
+                        )
+                        result = False
+                elif isinstance(ref_output[i], int):
+                    if not model_output[i] == ref_output[i]:
+                        print(f"\n=== Output {i} comparison failed ===")
+                        print(f"{model_output[i]} vs {ref_output[i]}")
+                        result = False
+                else:
+                    print(f"WARNING: Output {i} has type {type(ref_output[i])}")
+            return result
     else:
         # If one output, eager returns tensor while executor tuple of size 1
         result = torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol)
@@ -326,7 +330,7 @@ def run_and_check_output(
     model_output = executorch_module.run_method("forward", tuple(inputs_flattened))

     # Generate reference outputs using the reference model
-    ref_output = reference_model(*sample_inputs)
+    ref_output, _ = tree_flatten(reference_model(*sample_inputs))

     # Check if outputs are equal
     return check_outputs_equal(
@@ -805,3 +809,26 @@ def find_bad_operators(
         "all_operators": all_operators,
         "test_count": test_count,
     }
+
+
+def make_indent(indent_level):
+    indent_str = ""
+    for _ in range(indent_level):
+        indent_str += " "
+    return indent_str
+
+
+def print_output(outputs, n: int = 0, indent_level: int = 0):
+    if isinstance(outputs, (list, tuple)):
+        print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}")
+        new_indent_level = indent_level + 2
+        for n, test_out in enumerate(outputs):
+            print_output(test_out, n, new_indent_level)
+    elif isinstance(outputs, torch.Tensor):
+        print(
+            f"{make_indent(indent_level)}output_{n} = test_utils.random_uniform_tensor({outputs.shape}, low={outputs.min().item()}, high={outputs.max().item()}, dtype={outputs.dtype})"
+        )
+    elif isinstance(outputs, int):
+        print(f"{make_indent(indent_level)}output_{n} = {outputs}")
+    else:
+        print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}")

backends/vulkan/vulkan_preprocess.py

Lines changed: 36 additions & 25 deletions
@@ -6,6 +6,7 @@

 # pyre-strict

+import copy
 from functools import partial
 from typing import Any, Callable, Dict, final, List

@@ -127,15 +128,21 @@ def preprocess( # noqa: C901
         module_compile_spec: List[CompileSpec],
     ) -> PreprocessResult:
         compile_options = parse_compile_spec(module_compile_spec)
-        limits_x = compile_options.get(
-            "texture_limits_x", utils.DEFAULT_TEXTURE_LIMITS[0]
-        )
-        limits_y = compile_options.get(
-            "texture_limits_y", utils.DEFAULT_TEXTURE_LIMITS[1]
-        )
-        limits_z = compile_options.get(
-            "texture_limits_z", utils.DEFAULT_TEXTURE_LIMITS[2]
-        )
+
+        default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS)
+        # 2048 is the typical limit value for 3D textures, but mobile GPUs often support
+        # 16384. Since the Vulkan delegate primarily targets mobile GPUs at the moment,
+        # 16384 is the default texture limit used. This option is provided as a
+        # convenient way to switch to using a limit of 2048 for image textures, which
+        # will be compatible with most GPUs.
+        if compile_options.get("small_texture_limits", False):
+            default_texture_limits[0] = 2048
+            default_texture_limits[1] = 2048
+            default_texture_limits[2] = 2048
+
+        limits_x = compile_options.get("texture_limits_x", default_texture_limits[0])
+        limits_y = compile_options.get("texture_limits_y", default_texture_limits[1])
+        limits_z = compile_options.get("texture_limits_z", default_texture_limits[2])
         texture_limits = (limits_x, limits_y, limits_z)

         default_storage_type = compile_options.get(
@@ -204,22 +211,26 @@ def preprocess( # noqa: C901

         # Finally, apply dynamic shape passes and memory planning pass. These passes
         # must be applied only when the graph structure is finalized.
-        greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False)
-        mem_planning_suite = MemoryPlanningAlgorithmSuite(
-            algo_list=[greedy_memory_planning]
-        )
-        # This is a workaround to allow the memory planning pass to work without having
-        # to first apply ToOutVarPass(). See the `greedy()` function in
-        # `exir.memory_planning`; if this attribute isn't set, assertions in
-        # `collect_spec_from_nodes()` will fail.
-        program.graph_module.encounter_to_out_var_failure = True
-        program = apply_passes(
-            program,
-            [
-                ConstraintBasedSymShapeEvalPass(),
-                MemoryPlanningPass(memory_planning_algo=mem_planning_suite),
-            ],
-        )
+        final_passes = [
+            ConstraintBasedSymShapeEvalPass(),
+        ]
+        if not compile_options.get("skip_memory_planning", False):
+            greedy_memory_planning = partial(
+                greedy, allow_overlapping_allocations=False
+            )
+            mem_planning_suite = MemoryPlanningAlgorithmSuite(
+                algo_list=[greedy_memory_planning]
+            )
+            # This is a workaround to allow the memory planning pass to work without having
+            # to first apply ToOutVarPass(). See the `greedy()` function in
+            # `exir.memory_planning`; if this attribute isn't set, assertions in
+            # `collect_spec_from_nodes()` will fail.
+            program.graph_module.encounter_to_out_var_failure = True
+            final_passes.append(
+                MemoryPlanningPass(memory_planning_algo=mem_planning_suite)
+            )
+
+        program = apply_passes(program, final_passes)

         graph_builder = VkGraphBuilder(
             program,