diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt deleted file mode 100644 index 29ff90e7293..00000000000 --- a/backends/vulkan/CMakeLists.txt +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -cmake_minimum_required(VERSION 3.19) - -if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) -endif() - -if(NOT RUNTIME_PATH) - set(RUNTIME_PATH ${CMAKE_CURRENT_SOURCE_DIR}/runtime) -endif() - -# Include this file to access executorch_target_link_options_shared_lib This is -# required to provide access to executorch_target_link_options_shared_lib which -# allows libraries to be linked with the --whole-archive flag. This is required -# for libraries that perform dynamic registration via static initialization. -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - -include(cmake/ShaderLibrary.cmake) - -# Third party include paths - -set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third-party) - -set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers) -set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) -set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - -set(COMMON_INCLUDES - $ - $ - $ $ -) - -# Compile settings - -set(VULKAN_CXX_FLAGS "-fexceptions") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_WRAPPER") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_VOLK") - -# vulkan API files - -file(GLOB_RECURSE vulkan_api_cpp ${RUNTIME_PATH}/api/*) -file(GLOB_RECURSE vulkan_vkapi_cpp ${RUNTIME_PATH}/vk_api/*) -list(APPEND vulkan_api_cpp ${vulkan_vkapi_cpp}) -list(APPEND vulkan_api_cpp ${VOLK_PATH}/volk.c) - -# vulkan ComputeGraph files - -file(GLOB_RECURSE vulkan_graph_cpp ${RUNTIME_PATH}/graph/*) -list(APPEND vulkan_graph_cpp ${vulkan_api_cpp}) - -# Standard GLSL shader library - -set(VULKAN_GRAPH_SHADERS_PATH ${RUNTIME_PATH}/graph/ops/glsl/) -# Generates a spv.cpp file containing compiled GLSL shaders -gen_vulkan_shader_lib_cpp(${VULKAN_GRAPH_SHADERS_PATH}) -# Save the path of the generated cpp file -set(vulkan_standard_shaders_cpp ${generated_spv_cpp}) - -# Generate Vulkan Delegate Schema Files from flatc - -set(SCHEMA_INCLUDE_DIR ${CMAKE_BINARY_DIR}/schema/include) - -set(GENERATED_HEADER - ${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/schema_generated.h -) - -add_custom_command( - OUTPUT ${GENERATED_HEADER} - COMMAND - flatc --cpp --cpp-std c++11 --scoped-enums -o - "${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/" - ${_vulkan_schema__srcs} - WORKING_DIRECTORY ${EXECUTORCH_ROOT} - DEPENDS flatc - COMMENT "Generating vulkan_schema headers" - VERBATIM -) - -# vulkan_schema library - -add_library(vulkan_schema INTERFACE ${GENERATED_HEADER}) -set_target_properties(vulkan_schema PROPERTIES LINKER_LANGUAGE CXX) - -target_include_directories( - vulkan_schema - INTERFACE - $ - $ -) - -# vulkan_backend - -file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) -list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp}) -list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp}) - -add_library(vulkan_backend ${vulkan_backend_cpp}) 
-target_include_directories( - vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES} -) -target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core) -target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS}) -# Link this library with --whole-archive due to dynamic backend registration -executorch_target_link_options_shared_lib(vulkan_backend) - -set_property(TARGET vulkan_backend PROPERTY CXX_STANDARD 17) - -# Test targets - -install( - TARGETS vulkan_backend vulkan_schema - EXPORT ExecuTorchTargets - DESTINATION lib - INCLUDES - DESTINATION ${COMMON_INCLUDES} -) diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt new file mode 120000 index 00000000000..c59c41b3538 --- /dev/null +++ b/backends/vulkan/CMakeLists.txt @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/CMakeLists.txt \ No newline at end of file diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md deleted file mode 100644 index e0a953d05fe..00000000000 --- a/backends/vulkan/README.md +++ /dev/null @@ -1,205 +0,0 @@ -# Vulkan Backend - -The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is -built on top of the cross-platform Vulkan GPU API standard. It is primarily -designed to leverage the GPU to accelerate model inference on Android devices, -but can be used on any platform that supports an implementation of Vulkan: -laptops, servers, and edge devices. - -::::{note} -The Vulkan delegate is currently under active development, and its components -are subject to change. -:::: - -## What is Vulkan? - -Vulkan is a low-level GPU API specification developed as a successor to OpenGL. -It is designed to offer developers more explicit control over GPUs compared to -previous specifications in order to reduce overhead and maximize the -capabilities of the modern graphics hardware. - -Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both -desktop and mobile) in the market support Vulkan. Vulkan is also included in -Android from Android 7.0 onwards. - -**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it -provides a way to execute compute and graphics operations on a GPU, but does not -come with a built-in library of performant compute kernels. - -## The Vulkan Compute Library - -The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as -the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to -provide GPU implementations for PyTorch operators via GLSL compute shaders. - -The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html). -The core components of the PyTorch Vulkan backend were forked into ExecuTorch -and adapted for an AOT graph-mode style of model inference (as opposed to -PyTorch which adopted an eager execution style of model inference). - -The components of the Vulkan Compute Library are contained in the -`executorch/backends/vulkan/runtime/` directory. The core components are listed -and described below: - -``` -runtime/ -├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects -└── graph/ .................. ComputeGraph class which implements graph mode inference - └── ops/ ................ Base directory for operator implementations - ├── glsl/ ........... GLSL compute shaders - │ ├── *.glsl - │ └── conv2d.glsl - └── impl/ ........... 
C++ code to dispatch GPU compute shaders - ├── *.cpp - └── Conv2d.cpp -``` - -## Features - -The Vulkan delegate currently supports the following features: - -* **Memory Planning** - * Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference. -* **Capability Based Partitioning**: - * A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs -* **Support for upper-bound dynamic shapes**: - * Tensors can change shape between inferences as long as their current shapes are smaller than the bounds specified during lowering - -In addition to increasing operator coverage, the following features are -currently in development: - -* **Quantization Support** - * We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future. -* **Memory Layout Management** - * Memory layout is an important factor in optimizing performance. We plan to introduce graph passes that insert memory layout transitions throughout a graph to optimize memory-layout sensitive operators such as Convolution and Matrix Multiplication. -* **Selective Build** - * We plan to make it possible to control build size by selecting which operators/shaders you want to build with - -## End to End Example - -To further understand the features of the Vulkan Delegate and how to use it, -consider the following end to end example with a simple single operator model. - -### Compile and lower a model to the Vulkan Delegate - -Once ExecuTorch has been set up and installed, the following script can be used -to generate a simple model and lower it to the Vulkan delegate. - -``` -# Note: this script is the same as the script from the "Setting up ExecuTorch" -# page, with one minor addition to lower to the Vulkan backend. -import torch -from torch.export import export -from executorch.exir import to_edge - -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner - -# Start with a PyTorch model that adds two input tensors (matrices) -class Add(torch.nn.Module): - def __init__(self): - super(Add, self).__init__() - - def forward(self, x: torch.Tensor, y: torch.Tensor): - return x + y - -# 1. torch.export: Defines the program with the ATen operator set. -aten_dialect = export(Add(), (torch.ones(1), torch.ones(1))) - -# 2. to_edge: Make optimizations for Edge devices -edge_program = to_edge(aten_dialect) -# 2.1 Lower to the Vulkan backend -edge_program = edge_program.to_backend(VulkanPartitioner()) - -# 3. to_executorch: Convert the graph to an ExecuTorch program -executorch_program = edge_program.to_executorch() - -# 4. Save the compiled .pte program -with open("vk_add.pte", "wb") as file: - file.write(executorch_program.buffer) -``` - -Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate -using the `to_backend()` API. The Vulkan Delegate implements the -`VulkanPartitioner` class which identifies nodes (i.e. operators) in the graph -that are supported by the Vulkan delegate, and separates compatible sections of -the model to be executed on the GPU. - -This means that a model can be lowered to the Vulkan delegate even if it contains -some unsupported operators. -This will just mean that only parts of the graph -will be executed on the GPU.
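
To check which parts of the graph were actually delegated, one option (a minimal sketch that reuses the `edge_program` variable from the script above) is to print the lowered graph and look for `executorch_call_delegate` nodes:

```
# Sketch: inspect the graph after to_backend(VulkanPartitioner()) has run.
# Subgraphs taken by the Vulkan delegate appear as lowered modules invoked via
# executorch_call_delegate nodes; remaining nodes run on portable CPU kernels.
print(edge_program.exported_program().graph_module)
```

In this example the single add operator should end up inside the delegated subgraph, since binary arithmetic ops are supported by the delegate.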
::::{note} -The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/op_registry.py#L194) -in the Vulkan partitioner code can be inspected to examine which ops are currently -implemented in the Vulkan delegate. -:::: - -### Build Vulkan Delegate libraries - -The easiest way to build and test the Vulkan Delegate is to build for Android -and test on a local Android device. Android devices have built-in support for -Vulkan, and the Android NDK ships with a GLSL compiler, which is needed to -compile the Vulkan Compute Library's GLSL compute shaders. - -The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON` -when building with CMake. - -First, make sure that you have the Android NDK installed; any NDK version past -NDK r19c should work. Note that the examples in this doc have been validated with -NDK r27b. The Android SDK should also be installed so that you have access to `adb`. - -The instructions on this page assume that the following environment variables -are set. - -```shell -export ANDROID_NDK= -# Select the appropriate Android ABI for your device -export ANDROID_ABI=arm64-v8a -# All subsequent commands should be performed from ExecuTorch repo root -cd -# Make sure adb works -adb --version -``` - -To build and install ExecuTorch libraries (for Android) with the Vulkan -Delegate: - -```shell -# From executorch root directory -(rm -rf cmake-android-out && \ - cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out && \ - cmake --build cmake-android-out -j16 --target install) -``` - -### Run the Vulkan model on device - -::::{note} -Since operator support is currently limited, only binary arithmetic operators -will run on the GPU. Expect inference to be slow as the majority of operators -are being executed via Portable operators. -:::: - -Now, the partially delegated model can be executed on your device's -GPU!
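
Before running, it can be useful to confirm that the device actually reports Vulkan support. A quick, optional check (assuming `adb` is configured as above; the exact feature names reported may vary by device):

```shell
# Devices with a Vulkan driver typically report android.hardware.vulkan.* features
adb shell pm list features | grep -i vulkan
```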
- -```shell -# Build a model runner binary linked with the Vulkan delegate libs -cmake --build cmake-android-out --target executor_runner -j32 - -# Push model to device -adb push vk_add.pte /data/local/tmp/vk_add.pte -# Push binary to device -adb push cmake-android-out/executor_runner /data/local/tmp/runner_bin - -# Run the model -adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vk_add.pte -``` diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md new file mode 120000 index 00000000000..4017cdc2caa --- /dev/null +++ b/backends/vulkan/README.md @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/README.md \ No newline at end of file diff --git a/backends/vulkan/TARGETS b/backends/vulkan/TARGETS deleted file mode 100644 index 41893d29274..00000000000 --- a/backends/vulkan/TARGETS +++ /dev/null @@ -1,4 +0,0 @@ -load(":targets.bzl", "define_common_targets") -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/TARGETS b/backends/vulkan/TARGETS new file mode 120000 index 00000000000..2be80e569d1 --- /dev/null +++ b/backends/vulkan/TARGETS @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/TARGETS \ No newline at end of file diff --git a/backends/vulkan/__init__.py b/backends/vulkan/__init__.py deleted file mode 100644 index 6c25e56115b..00000000000 --- a/backends/vulkan/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from .partitioner.vulkan_partitioner import VulkanPartitioner - -from .vulkan_preprocess import VulkanBackend - -__all__ = [ - "VulkanPartitioner", - "VulkanBackend", -] diff --git a/backends/vulkan/__init__.py b/backends/vulkan/__init__.py new file mode 120000 index 00000000000..bf978851c6e --- /dev/null +++ b/backends/vulkan/__init__.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/__init__.py \ No newline at end of file diff --git a/backends/vulkan/_passes b/backends/vulkan/_passes new file mode 120000 index 00000000000..e3c7d6c74fe --- /dev/null +++ b/backends/vulkan/_passes @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/_passes \ No newline at end of file diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS deleted file mode 100644 index 8558a2eea93..00000000000 --- a/backends/vulkan/_passes/TARGETS +++ /dev/null @@ -1,171 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.python_library( - name = "fuse_quantized_ops", - srcs = ["fuse_quantized_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/transforms:utils", - "//executorch/backends/vulkan:custom_ops_lib", - "//executorch/backends/vulkan:utils_lib", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "insert_prepack_nodes", - srcs = ["insert_prepack_nodes.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/backends/vulkan:utils_lib", - "//executorch/backends/vulkan:op_registry", - ], -) - -runtime.python_library( - name = "int4_weight_only_quantizer", - srcs = 
[ - "int4_weight_only_quantizer.py", - ], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//executorch/backends/vulkan:custom_ops_lib", - "//pytorch/ao:torchao", - ] -) - -runtime.python_library( - name = "squeeze_unsqueeze_inputs", - srcs = [ - "squeeze_unsqueeze_inputs.py", - ], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:custom_ops_lib", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ] -) - -runtime.python_library( - name = "remove_asserts", - srcs = ["remove_asserts.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "remove_local_scalar_dense", - srcs = ["remove_local_scalar_dense_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "remove_redundant_ops", - srcs = ["remove_redundant_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "tag_memory_meta_pass", - srcs = ["tag_memory_meta_pass.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - "//executorch/backends/vulkan:utils_lib", - "//executorch/backends/vulkan/serialization:lib", - ], -) - -runtime.python_library( - name = "fold_qdq", - srcs = ["fold_qdq.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:utils_lib", - "//executorch/exir:pass_base", - ], -) - -runtime.python_library( - name = "fuse_patterns", - srcs = ["fuse_patterns.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan/patterns:vulkan_patterns", - "//executorch/exir:lib", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], - typing = True, -) - -runtime.python_library( - name = "vulkan_passes", - srcs = [ - "__init__.py", - ], - visibility = [ - "//executorch/backends/...", - "//executorch/examples/...", - ], - deps = [ - ":fold_qdq", - ":fuse_patterns", - ":fuse_quantized_ops", - ":insert_prepack_nodes", - ":int4_weight_only_quantizer", - ":remove_asserts", - ":remove_local_scalar_dense", - ":remove_redundant_ops", - ":squeeze_unsqueeze_inputs", - ":tag_memory_meta_pass", - ] -) diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py deleted file mode 100644 index 2c4588ac43d..00000000000 --- a/backends/vulkan/_passes/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -from executorch.backends.vulkan._passes.fold_qdq import FoldQDQPass -from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass -from executorch.backends.vulkan._passes.fuse_quantized_ops import ( - FuseQuantizedOpsTransform, -) -from executorch.backends.vulkan._passes.insert_prepack_nodes import insert_prepack_nodes -from executorch.backends.vulkan._passes.int4_weight_only_quantizer import ( - VkInt4WeightOnlyQuantizer, -) -from executorch.backends.vulkan._passes.remove_asserts import ( - remove_asserts, - RemoveAssertsTransform, -) -from executorch.backends.vulkan._passes.remove_local_scalar_dense_ops import ( - RemoveLocalScalarDenseOpsTransform, -) -from executorch.backends.vulkan._passes.remove_redundant_ops import ( - RemoveRedundantOpsTransform, -) -from executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import ( - SqueezeUnsqueezeInputs, -) -from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass - -__all__ = [ - "FoldQDQPass", - "FusePatternsPass", - "FuseQuantizedOpsTransform", - "insert_prepack_nodes", - "VkInt4WeightOnlyQuantizer", - "remove_asserts", - "RemoveAssertsTransform", - "RemoveLocalScalarDenseOpsTransform", - "RemoveRedundantOpsTransform", - "SqueezeUnsqueezeInputs", - "TagMemoryMetaPass", -] diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py deleted file mode 100644 index 3beccc2205c..00000000000 --- a/backends/vulkan/_passes/fold_qdq.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import executorch.backends.vulkan.utils as utils -import torch - -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass - - -class FoldQDQPass(ExportPass): - """ - Erase Q/DQ chain introduced by PT2E quantization workflow. It is assumed that all - valid quant op patterns have already been fused before this pass. - """ - - def __init__(self, edge_program: torch.export.ExportedProgram): - super(FoldQDQPass, self).__init__() - self.edge_program = edge_program - - def call(self, graph_module: torch.fx.GraphModule): - for node in graph_module.graph.nodes: - if utils.is_quant_node(node): - original_node = node.args[0] - assert isinstance(original_node, torch.fx.Node) - # For each direct user that is a dequant node, connect the original - # node to the users of the dequant node. - for user in node.users: - if utils.is_dequant_node(user): - dq_node = user - dq_node.replace_all_uses_with(original_node) - - graph_module.recompile() - dead_code_elimination_pass(graph_module) - # Re-trace to validate everything is ok - graph_module = super().call(graph_module).graph_module - - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py deleted file mode 100644 index 6ced1f32a7c..00000000000 --- a/backends/vulkan/_passes/fuse_patterns.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import executorch.backends.vulkan.patterns as vk_patterns - -import torch - -from executorch.exir import ExportedProgram -from executorch.exir.pass_base import ExportPass, PassResult - - -class FusePatternsPass(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: - super().__init__() - self.program = exported_program - - def call(self, graph_module: torch.fx.GraphModule): - total_replaced = vk_patterns.replace_all_fusable_subgraphs( - self.program, graph_module - ) - - if total_replaced > 0: - graph_module.recompile() - # Re-trace the graph - graph_module = super().call(graph_module).graph_module - - return PassResult(graph_module, total_replaced > 0) diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py deleted file mode 100644 index 3d3214bb4ee..00000000000 --- a/backends/vulkan/_passes/fuse_quantized_ops.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Optional, Tuple - -import executorch.backends.vulkan.utils as utils -import torch - -import torch.nn.functional as F - -from executorch.backends.transforms.utils import get_param_tensor, is_param_node -from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass - -################# -## linear_qcnw ## -################# - - -def matches_linear_qcnw_pattern( # noqa: C901 - program: ExportedProgram, node: torch.fx.Node -) -> Optional[Tuple[torch.qscheme, int]]: - """ - Checks if the nodes surrounding a linear node matches the pattern for weight only - quantized linear, where the weight is quantized channelswise to n bits. - - If the graph pattern matches, then return a tuple of (quantization_method, nbits) - describing the type of quantization used for the weights. Otherwise, return None. - """ - if not utils.is_linear_node(node): - return None - - input_node = node.args[0] - weight_node = node.args[1] - - # Type checking - if not isinstance(weight_node, torch.fx.Node): - return None - if not isinstance(input_node, torch.fx.Node): - return None - - # The input arg should not be a dequant node; if it is, then it is indicative that - # dynamically quantized linear should be used instead - if utils.is_dequant_node(input_node): - return None - - # The weight arg should be a dequant node dequantizing the quantized weight - # Furthermore, the op expects per channel quantization of the weight - if not utils.is_dequant_per_channel_node(weight_node): - return None - - orig_weight = weight_node.args[0] - zeros = weight_node.args[2] - - # Type checking - if not isinstance(orig_weight, torch.fx.Node): - return None - if not is_param_node(program, orig_weight): - return None - if not isinstance(zeros, torch.fx.Node): - return None - if not is_param_node(program, zeros): - return None - - zeros_tensor = get_param_tensor(program, zeros) - if not isinstance(zeros_tensor, torch.Tensor): - return None - - quant_method = torch.per_channel_affine - # Check for symmetric quantization, where the zeros used for dequantization will - # actually be all zeros. 
- if torch.all(zeros_tensor == 0): - quant_method = torch.per_channel_symmetric - - orig_weight_tensor = get_param_tensor(program, orig_weight) - if not isinstance(orig_weight_tensor, torch.Tensor): - return None - # Sanity check the dtype of the quantized weight - if orig_weight_tensor.dtype != torch.int8: - return None - - quant_min = orig_weight_tensor.min().item() - quant_max = orig_weight_tensor.max().item() - # Determine the number of bits the weight has been quantized to - if quant_min >= -8 and quant_max <= 7: - return quant_method, 4 - elif quant_min >= -128 and quant_max <= 127: - return quant_method, 8 - - return None - - -def pack_4bit_weight_tensor(inp: torch.Tensor) -> torch.Tensor: - """ - Given a 8-bit weight tensor containing values quantized to 4 bits, create a packed - weight tensor by packing 2 4-bit values in one unsigned 8-bit value. - - An input weight tensor of shape (M, K) will produce a packed weight tensor of shape - (M, K / 2). - """ - - # Assert we got a properly quantized tensor. - min, max = inp.min().item(), inp.max().item() - assert ( - max <= 7 and min >= -8 - ), f"convert_to_qc4w: [min,max] out of [-8, 7] range, got [{min}, {max}]" - - # Assuming we have a 2d tensor - if inp.ndim != 2: - inp = inp.squeeze() - assert ( - inp.ndim == 2 - ), f"convert_to_qc4w: expecting input tensor to be 2d, got {inp.ndim}" - - # pad ic - if inp.shape[-1] % 2 != 0: - inp = F.pad(input=inp, pad=(0, 1, 0, 0), mode="constant", value=0) - - # Shape after padding - oc, ic = inp.shape - assert ic % 2 == 0, "convert_to_qc4w: expecting ic to be even" - - # Adjust inp tensor for zp - inp = inp.to(dtype=torch.uint8) + 8 - - # Prepare the Result tensor - inp = inp.contiguous().view(-1) - return (inp[::2] << 4 | inp[1::2]).view(oc, int(ic / 2)) - - -def fuse_into_linear_qcnw_node( - program: ExportedProgram, - graph_module: torch.fx.GraphModule, - linear_node: torch.fx.Node, - quant_method: torch.qscheme, - nbits: int, -) -> None: - """ - The weight_int8pack_mm operator represents a weight only quantized linear operator, - where the weight tensor has been quantized channelswise to nbits bits. - - After the PT2E quantization flow, the expected graph pattern is - - dq_weight = dequantize(weight, scales) - out = linear(activation, dq_weight, bias?) 
- - The goal of this function is to condense that sequence into - - out = quantized_linear(activation, dq_weight, scales) - out = out + bias - """ - activation = linear_node.args[0] - dq_weight_node = linear_node.args[1] - assert isinstance(activation, torch.fx.Node) - assert isinstance(dq_weight_node, torch.fx.Node) - - bias = None - if len(linear_node.args) > 2: - bias = linear_node.args[2] - assert isinstance(bias, torch.fx.Node) - - orig_weight = dq_weight_node.args[0] - scale = dq_weight_node.args[1] - - # For 4 bit quantization, pack the weight tensor - if nbits == 4: - assert isinstance(orig_weight, torch.fx.Node) - orig_weight_tensor = get_param_tensor(program, orig_weight) - assert isinstance(orig_weight_tensor, torch.Tensor) - packed_weight_tensor = pack_4bit_weight_tensor(orig_weight_tensor) - utils.update_program_state_dict( - program, - orig_weight.name, - packed_weight_tensor, - ) - orig_weight.meta["val"] = orig_weight.meta["val"][:, ::2].to(torch.uint8) - - if nbits == 8 and quant_method == torch.per_channel_symmetric: - op_target = exir_ops.edge.aten._weight_int8pack_mm.default - elif nbits == 4 and quant_method == torch.per_channel_symmetric: - op_target = exir_ops.edge.et_vk.linear_qcs4w.default - else: - raise NotImplementedError( - "only 4 and 8 bits per channel symmetric quant supported for linear_qcnw" - ) - - with graph_module.graph.inserting_before(linear_node): - weight_int8pack_mm_node = graph_module.graph.create_node( - "call_function", - op_target, - (activation, orig_weight, scale), - ) - if bias: - add_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.aten.add.Tensor, - (weight_int8pack_mm_node, bias), - ) - linear_node.replace_all_uses_with(add_node) - else: - linear_node.replace_all_uses_with(weight_int8pack_mm_node) - graph_module.graph.erase_node(linear_node) - graph_module.graph.erase_node(dq_weight_node) - - -######################### -## linear_qta8a_qga4w ## -######################### - - -def _is_dequantize_affine_node(node: torch.fx.Node) -> bool: - """Check if a node is a dequantize_affine operation.""" - return ( - node.op == "call_function" - and node.target is not None - and hasattr(node.target, "__name__") - and "dequantize_affine" in getattr(node.target, "__name__", "") - ) - - -def _is_view_copy_node(node: torch.fx.Node) -> bool: - """Check if a node is a view_copy operation.""" - return ( - node.op == "call_function" - and node.target is not None - and hasattr(node.target, "__name__") - and "view_copy" in getattr(node.target, "__name__", "") - ) - - -def _validate_qta8a_qga4w_nodes( - input_node: torch.fx.node.Argument, weight_node: torch.fx.node.Argument -) -> Optional[torch.fx.Node]: - """ - Validate input and weight nodes for QTA8A_QGA4W pattern. - Returns the actual input node (after handling view operations) or None if invalid. 
- """ - # Type checking - ensure we have torch.fx.Node objects - if not isinstance(weight_node, torch.fx.Node) or not isinstance( - input_node, torch.fx.Node - ): - return None - - # Input may be preprocessed with a view node - actual_input_node = input_node - if _is_view_copy_node(input_node): - actual_input_node = input_node.args[0] - if not isinstance(actual_input_node, torch.fx.Node): - return None - - # Check if input is dequantized with dequantize_affine (from dynamic quantization) - if not _is_dequantize_affine_node(actual_input_node): - return None - - # Check if weight is dequantized with dequantize_affine - if not _is_dequantize_affine_node(weight_node): - return None - - return actual_input_node - - -def _extract_weight_params( - program: ExportedProgram, weight_node: torch.fx.Node -) -> Optional[Tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node]]: - """Extract and validate weight parameters from dequantize_affine node.""" - # Get the original quantized weight and quantization parameters - if len(weight_node.args) < 4: - return None - - orig_weight = weight_node.args[0] - weight_scales = weight_node.args[2] - weight_zeros = weight_node.args[3] - - # Type checking - if not isinstance(orig_weight, torch.fx.Node) or not is_param_node( - program, orig_weight - ): - return None - if not isinstance(weight_scales, torch.fx.Node) or not is_param_node( - program, weight_scales - ): - return None - if not isinstance(weight_zeros, torch.fx.Node) or not is_param_node( - program, weight_zeros - ): - return None - - return orig_weight, weight_scales, weight_zeros - - -def _validate_4bit_quantization(weight_tensor: torch.Tensor) -> bool: - """Check if weight tensor is quantized to 4 bits (values in [-8, 7] range).""" - quant_min = weight_tensor.min().item() - quant_max = weight_tensor.max().item() - return quant_min >= -8 and quant_max <= 7 - - -def _calculate_group_size( - orig_weight_tensor: torch.Tensor, weight_scales_tensor: torch.Tensor -) -> Optional[int]: - """Calculate and validate group size from weight and scales tensors.""" - out_features, in_features = orig_weight_tensor.shape - - if len(weight_scales_tensor.shape) != 2: - return None - - scales_out_features, num_groups = weight_scales_tensor.shape - - if scales_out_features != out_features: - return None - - group_size = in_features // num_groups - if in_features % group_size != 0: - return None - - return group_size - - -def matches_linear_qta8a_qga4w_pattern( - program: ExportedProgram, node: torch.fx.Node -) -> Optional[Tuple[int, int]]: - """ - Checks if the nodes surrounding a linear node matches the pattern for dynamic - activation + grouped weight quantized linear (QTA8A_QGA4W). - - This pattern involves: - 1. Dynamic quantization of input activations (8-bit) - 2. Grouped quantization of weights (4-bit with group size) - - The expected pattern from Int8DynActInt4WeightQuantizer is: - scale, zero_point = choose_qparams_affine(input) - quantized_input = quantize_affine(input, scale, zero_point) - dequantized_input = dequantize_affine(quantized_input, ...) - dequantized_weight = dequantize_affine(weight, weight_scales, weight_zeros) - output = linear(dequantized_input, dequantized_weight) - - If the pattern matches, return (group_size, weight_bits), otherwise None. 
- """ - if not utils.is_linear_node(node): - return None - - input_node = node.args[0] - weight_node = node.args[1] - - # Validate nodes and get actual input node - actual_input_node = _validate_qta8a_qga4w_nodes(input_node, weight_node) - if actual_input_node is None: - return None - - # Extract weight parameters - if not isinstance(weight_node, torch.fx.Node): - return None - weight_params = _extract_weight_params(program, weight_node) - if weight_params is None: - return None - - orig_weight, weight_scales, weight_zeros = weight_params - - # Get tensors to analyze the quantization scheme - orig_weight_tensor = get_param_tensor(program, orig_weight) - weight_scales_tensor = get_param_tensor(program, weight_scales) - weight_zeros_tensor = get_param_tensor(program, weight_zeros) - - if not isinstance(orig_weight_tensor, torch.Tensor): - return None - if not isinstance(weight_scales_tensor, torch.Tensor): - return None - if not isinstance(weight_zeros_tensor, torch.Tensor): - return None - - # Check if weight is quantized to 4 bits - if not _validate_4bit_quantization(orig_weight_tensor): - return None - - # Calculate group size - group_size = _calculate_group_size(orig_weight_tensor, weight_scales_tensor) - if group_size is None: - return None - - # Verify this is 4-bit grouped quantization - weight_bits = 4 - - return group_size, weight_bits - - -def fuse_into_linear_qta8a_qga4w_node( - program: ExportedProgram, - graph_module: torch.fx.GraphModule, - linear_node: torch.fx.Node, - group_size: int, - weight_bits: int, -) -> None: - """ - Fuse the dynamic activation + grouped weight quantized linear pattern into - a single linear_qta8a_qga4w operator. - - The pattern: - dequantized_input = dequantize_affine(quantized_input, block_size, scale, zero_point, ...) - dequantized_weight = dequantize_affine(weight, block_size, weight_scales, weight_zeros, ...) 
- output = linear(dequantized_input, dequantized_weight) - - Becomes: - output = linear_qta8a_qga4w(quantized_input, input_scale, input_zero_point, - weight, group_size, weight_scales, weight_zeros) - """ - dq_input_node = linear_node.args[0] - dq_weight_node = linear_node.args[1] - - assert isinstance(dq_input_node, torch.fx.Node) - - input_view_node = None - # Input may be preprocessed with a view node - if ( - dq_input_node.op == "call_function" - and dq_input_node.target is not None - and hasattr(dq_input_node.target, "__name__") - and "view_copy" in getattr(dq_input_node.target, "__name__", "") - ): - input_view_node = dq_input_node - dq_input_node = dq_input_node.args[0] - assert isinstance(dq_input_node, torch.fx.Node) - - assert isinstance(dq_input_node, torch.fx.Node) - assert isinstance(dq_weight_node, torch.fx.Node) - - # Get the quantized input and quantization parameters from the input dequantize_affine node - # Args: (input, block_size, scale, zero_point, input_dtype, quant_min, quant_max, output_dtype) - quantized_input = dq_input_node.args[0] - input_scale = dq_input_node.args[2] # scale is the 3rd argument - input_zero_point = dq_input_node.args[3] if len(dq_input_node.args) > 3 else None - - # Get the weight and its quantization parameters from dequantize_affine - # Args: (weight, block_size, weight_scales, weight_zeros, input_dtype, quant_min, quant_max, output_dtype) - orig_weight = dq_weight_node.args[0] - weight_scales = dq_weight_node.args[2] - weight_zeros = dq_weight_node.args[3] - - # Pack the 4-bit weight tensor for efficient storage - assert isinstance(orig_weight, torch.fx.Node) - orig_weight_tensor = get_param_tensor(program, orig_weight) - assert isinstance(orig_weight_tensor, torch.Tensor) - packed_weight_tensor = pack_4bit_weight_tensor(orig_weight_tensor) - utils.update_program_state_dict( - program, - orig_weight.name, - packed_weight_tensor, - ) - # Update the metadata to reflect the new packed shape - orig_weight.meta["val"] = orig_weight.meta["val"][:, ::2].to(torch.uint8) - - # Create the linear_qta8a_qga4w node - with graph_module.graph.inserting_before(linear_node): - linear_qta8a_qga4w_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.linear_qta8a_qga4w.default, - ( - quantized_input, # quantized input (int8) - input_scale, # mat1_scale - input_zero_point, # mat1_zero_point - orig_weight, # mat2_data (packed 4-bit weights) - group_size, # group_size (int) - weight_scales, # weight_scales - weight_zeros, # weight_zeros - ), - ) - - # Replace the linear node with the new fused node - linear_node.replace_all_uses_with(linear_qta8a_qga4w_node) - - # Erase nodes in the correct order (users first, then dependencies) - graph_module.graph.erase_node(linear_node) - if input_view_node is not None: - graph_module.graph.erase_node(input_view_node) - graph_module.graph.erase_node(dq_weight_node) - graph_module.graph.erase_node(dq_input_node) - - -class FuseQuantizedOpsTransform(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: - super().__init__() - self.program = exported_program - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - # Check for linear_qcnw pattern (weight-only quantization) - qcnw_details = matches_linear_qcnw_pattern(self.program, node) - if qcnw_details is not None: - qcnw_method, qcnw_nbits = qcnw_details - fuse_into_linear_qcnw_node( - self.program, graph_module, node, qcnw_method, qcnw_nbits - ) - continue - - # Check for 
linear_qta8a_qga4w pattern (dynamic activation + grouped weight quantization) - qta8a_qga4w_details = matches_linear_qta8a_qga4w_pattern(self.program, node) - if qta8a_qga4w_details is not None: - group_size, weight_bits = qta8a_qga4w_details - fuse_into_linear_qta8a_qga4w_node( - self.program, graph_module, node, group_size, weight_bits - ) - continue - - graph_module.recompile() - dead_code_elimination_pass(graph_module) - - # Re-trace the graph since new nodes were (potentially) inserted - graph_module = super().call(graph_module).graph_module - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/insert_prepack_nodes.py b/backends/vulkan/_passes/insert_prepack_nodes.py deleted file mode 100644 index c45ed4ea25d..00000000000 --- a/backends/vulkan/_passes/insert_prepack_nodes.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from copy import deepcopy - -from executorch.backends.vulkan.op_registry import handles_own_prepacking -from executorch.backends.vulkan.utils import is_param_node - -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export import ExportedProgram - - -def insert_prepack_nodes(program: ExportedProgram) -> ExportedProgram: - """ - Insert `et_vk.prepack` nodes for constant tensors in the graph. The prepack operator - is responsible for transferring the tensor data, which is serialized with the model, - to a GPU tensor object during the prepacking stage of model execution. - - Some operators are performance sensitive and will prefer to handle prepacking within - the operator. For these ops, the constant tensor data will be passed directly as an - argument into the operator implementation. - """ - - for node in program.graph_module.graph.nodes: - # Prepacking is only needed for constant tensors. Only nodes corresponding to - # constant tensors will proceed beyond this point. - if not is_param_node(program, node): - continue - - # Mark that this node is going to be represented as a TensorRef type in the - # Vulkan compute graph. This annotation is used in later graph passes. - node.meta["etvk_tensorref"] = True - - # Get the list of node users that do not handle their own prepacking - nodes_to_replace_input = [] - for user in node.users: - if user.op == "call_function" and not handles_own_prepacking(user.target): - nodes_to_replace_input.append(user) - - if len(nodes_to_replace_input) == 0: - continue - - replace_all_uses = len(nodes_to_replace_input) == len(node.users) - - with program.graph_module.graph.inserting_after(node): - prepack_node = program.graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.prepack.default, - (node,), - ) - # This pass assumes that the SpecPropPass() has already been applied - assert "spec" in node.meta - # Mutable buffers will not be marked as constant, but it might as well be - # for the purposes of memory planning. Mark it as a constant tensor so that - # it is handled correctly by the memory planning pass. - if not node.meta["spec"].const: - assert is_param_node(program, node) - node.meta["spec"].const = True - # Validate that the original node is marked as a constant. Constant tensors - # do not participate in memory planning.
- assert node.meta["spec"].const - prepack_node.meta["val"] = node.meta["val"] - prepack_node.meta["spec"] = deepcopy(node.meta["spec"]) - # Set the mem_obj_id to -1 to indicate that this node requires a dedicated - # memory object. - prepack_node.meta["spec"].mem_obj_id = -1 - if replace_all_uses: - node.replace_all_uses_with( - prepack_node, - lambda x, y=prepack_node: (x != y and x.op != "output"), - ) - else: - for user_node in nodes_to_replace_input: - user_node.replace_input_with(node, prepack_node) - - program.graph.eliminate_dead_code() - return program diff --git a/backends/vulkan/_passes/int4_weight_only_quantizer.py b/backends/vulkan/_passes/int4_weight_only_quantizer.py deleted file mode 100644 index 34ff5937822..00000000000 --- a/backends/vulkan/_passes/int4_weight_only_quantizer.py +++ /dev/null @@ -1,283 +0,0 @@ -# pyre-unsafe -import logging -from typing import Any, Callable, Dict, Optional, Type - -import executorch.backends.vulkan.custom_ops_lib # noqa - -import torch -import torch.nn.functional as F - -from torchao.quantization.unified import Quantizer -from torchao.quantization.utils import groupwise_affine_quantize_tensor - - -# TODO: import from from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k -# Once diff train catches up -def _check_linear_int4_k(k, group_size=1, inner_k_tiles=None): - """ - Check if the dimensions are compatible with int4 quantization. - - Args: - k: The dimension size to check - group_size: The group size for quantization - inner_k_tiles: The inner k tiles size - - Returns: - bool: Whether the dimensions are compatible - """ - k_divisible_by_group_size = k % group_size == 0 - if inner_k_tiles is not None: - k_divisible_by_16_times_inner_k_tiles = k % (inner_k_tiles * 16) == 0 - return k_divisible_by_group_size and k_divisible_by_16_times_inner_k_tiles - return k_divisible_by_group_size - - -# This module is copied from torchao.quantization.GPTQ.WeightOnlyInt4Linear with -# changes at the annotated lines. -class VkWeightOnlyInt4Linear(torch.nn.Module): - __constants__ = ["in_features", "out_features"] - in_features: int - out_features: int - weight: torch.Tensor - - def __init__( - self, - in_features: int, - out_features: int, - # TODO: remove dtype field, not used - bias=False, - device=None, - dtype=None, - groupsize: int = 128, - inner_k_tiles: int = 8, - precision: torch.dtype = torch.bfloat16, - scales_precision: torch.dtype = torch.bfloat16, - ) -> None: - super().__init__() - self.padding = not _check_linear_int4_k(in_features, groupsize, inner_k_tiles) - if self.padding: - from torchao.utils import find_multiple - - self.origin_in_features = in_features - # pyre-ignore[6]: Incompatible parameter type - in_features = find_multiple(in_features, 1024) - - self.use_bias = bias - self.in_features = in_features - self.out_features = out_features - self.device = device - self.groupsize = groupsize - self.inner_k_tiles = inner_k_tiles - self.precision = precision - self.scales_precision = scales_precision - - if dtype is not None: - raise ValueError("Please specify 'precision' instead of 'dtype'") - - assert out_features % 8 == 0, "require out_features % 8 == 0" - assert ( - in_features % (inner_k_tiles * 16) == 0 - ), "require in_features % (innerKTiles * 16) == 0" - # In the original implementation, the weight buffer is registered with the packed - # sizes, i.e. the result of calling the _convert_weight_to_int4pack operator. 
- # However, the Vulkan implementation does not expect the weights to be packed - # therefore the weight tensor is registered with the unpacked sizes instead. - # Note that in_features is divided by 2 because each `uint8` tensor element - # contains 2 4-bit packed values. - self.register_buffer( - "weight", - torch.empty( - (out_features, in_features // 2), - dtype=torch.uint8, - device=device, - ), - ) - self.dtype = dtype - self.register_buffer( - "scales_and_zeros", - torch.empty( - (in_features // groupsize, out_features, 2), - dtype=self.scales_precision, - device=device, - ), - ) - if bias: - self.register_buffer( - "bias", - torch.empty((out_features,), dtype=torch.float32, device=device), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if self.padding: - input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) - # The forward method is replaced. In the original implementation, the forward - # method is torchao.quantization.GPTQ.linear_forward_int4; here a Vulkan custom - # operator is called instead. - r = torch.ops.et_vk.linear_weight_int4( - input, - self.weight, - self.groupsize, - self.scales_and_zeros, - self.inner_k_tiles, - ) - if self.use_bias: - return r + self.bias - return r - - -# This function is coped from torchao.quantization.GPTQ._replace_linear_int4 -# with small changes at the annotated locations. -def _vk_replace_linear_int4( - module: torch.nn.Module, - groupsize: int, - inner_k_tiles: Optional[int], - padding_allowed: bool, - skip_layer_func: Optional[Callable] = None, - precision: torch.dtype = torch.bfloat16, - scales_precision: torch.dtype = torch.bfloat16, - # Use custom vulkan linear layer as default - linear_class: Type[torch.nn.Module] = VkWeightOnlyInt4Linear, - copy_weights: bool = False, -): - for name, child in module.named_children(): - if isinstance(child, torch.nn.Linear) and ( - skip_layer_func is None or not skip_layer_func(child.weight) - ): - # Add an additional condition that the out/in features must not exceed the - # `feature_limit` argument. - if ( - _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) - or padding_allowed - ): - new_linear = linear_class( - child.in_features, - child.out_features, - bias=child.bias is not None, - device=child.weight.device, - groupsize=groupsize, - inner_k_tiles=inner_k_tiles, - precision=precision, - scales_precision=scales_precision, - ) - if copy_weights and child.weight.device != torch.device("meta"): - # pyre-fixme[16]: `Module` has no attribute `weight`. - new_linear.weight = child.weight - if child.bias is not None: - # pyre-fixme[16]: `Module` has no attribute `bias`. - new_linear.bias = child.bias - setattr(module, name, new_linear) - else: - _vk_replace_linear_int4( - child, - groupsize, - inner_k_tiles, - padding_allowed, - skip_layer_func, - precision, - scales_precision, - linear_class, - copy_weights, - ) - - -# This module is copied from torchao.quantization.GPTQ.Int4WeightOnlyQuantizer -# with some changes at the annotated lines. 
-class VkInt4WeightOnlyQuantizer(Quantizer): - def __init__( - self, - groupsize: int = 256, - padding_allowed: bool = True, - inner_k_tiles: Optional[int] = 8, - device: torch.device = torch.device("cpu"), # noqa - precision: torch.dtype = torch.float32, - ) -> None: - super().__init__() - assert inner_k_tiles in [2, 4, 8] - assert groupsize in [32, 64, 128, 256] - - self.inner_k_tiles = inner_k_tiles - self.groupsize: int = groupsize - self.padding_allowed: bool = padding_allowed - self.device: torch.device = device - self.precision: torch.dtype = precision - - @torch.no_grad() - def _create_quantized_state_dict( - self, model: torch.nn.Module - ) -> Dict[str, torch.Tensor]: - cur_state_dict = model.state_dict() - for fqn, mod in model.named_modules(): - # Add additional check to make sure features do not exceed feature limit - if isinstance(mod, torch.nn.Linear): - out_features = mod.out_features - in_features = mod.in_features - logging.info(f"linear: {fqn}, in={in_features}, out={out_features}") - - assert ( - in_features % self.groupsize == 0 - ), f"require in_features:{in_features} % self.groupsize:{self.groupsize} == 0" - - weight = mod.weight.data - if not _check_linear_int4_k( - in_features, self.groupsize, self.inner_k_tiles - ): - if self.padding_allowed: - import torch.nn.functional as F - - from torchao.utils import find_multiple - - logging.warn( - f"warning: {fqn} is padded to satisfy in_features % 1024 == 0" - ) - # pyre-ignore[6]: Incompatible parameter type - padded_in_features = find_multiple(in_features, 1024) - weight = F.pad( - weight, pad=(0, padded_in_features - in_features) - ) - else: - logging.warn( - f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, " - + "and that groupsize and inner_k_tiles*16 evenly divide into it" - ) - continue - (w_int4x8, scales_and_zeros) = groupwise_affine_quantize_tensor( - weight, - 4, # n_bit - self.groupsize, - self.precision, # dtype for scales_and_zeros - ) - # If the packing of 2 4-bit values into a single 8-bit value was not - # performed in the previous function call, then do it manually now. - if w_int4x8.shape == weight.shape: - w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to( - torch.uint8 - ) - # In the original implementation, w_int4x8 is packed via calling the - # _convert_weight_to_int4pack operator before storing the weight. However - # the Vulkan implementation does not expect the weights to be packed, so - # the w_int4x8 tensor is stored as the weight instead. - cur_state_dict[f"{fqn}.weight"] = w_int4x8.to(self.device) - cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to( - self.device - ) - return cur_state_dict - - def _convert_for_runtime(self, model: torch.nn.Module) -> torch.nn.Module: - _vk_replace_linear_int4( - model, - self.groupsize, - self.inner_k_tiles, - self.padding_allowed, - skip_layer_func=None, - precision=self.precision, - scales_precision=self.precision, - ) - return model - - def quantize( - self, model: torch.nn.Module, *args: Any, **kwargs: Any - ) -> torch.nn.Module: - state_dict = self._create_quantized_state_dict(model) - model = self._convert_for_runtime(model) - model.load_state_dict(state_dict, strict=False) - return model diff --git a/backends/vulkan/_passes/remove_asserts.py b/backends/vulkan/_passes/remove_asserts.py deleted file mode 100644 index 835f2ec1415..00000000000 --- a/backends/vulkan/_passes/remove_asserts.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
-# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Set, Union - -import torch - -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.program._program import _get_updated_graph_signature - -from torch.export.exported_program import ExportedProgram - -OpType = Union[str, torch._ops.OpOverload, EdgeOpOverload] - - -class RemoveAssertsTransform(ExportPass): - """ - Remove operators which perform assertions. These are not possible to execute in - Vulkan since GLSL shaders cannot abort execution at runtime. Therefore, remove these - operators. - """ - - assert_ops: Set[OpType] = { - torch.ops.aten._assert_scalar.default, - torch.ops.aten.sym_constrain_range_for_size.default, - } - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - if node.target in self.assert_ops: - graph_module.graph.erase_node(node) - - graph_module.graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) - - -def remove_asserts(edge_program: ExportedProgram) -> ExportedProgram: - graph_module = edge_program.graph_module - RemoveAssertsTransform()(graph_module) - - edge_program._graph_signature = _get_updated_graph_signature( - edge_program.graph_signature, graph_module - ) - edge_program._validate() - return edge_program diff --git a/backends/vulkan/_passes/remove_local_scalar_dense_ops.py b/backends/vulkan/_passes/remove_local_scalar_dense_ops.py deleted file mode 100644 index 6ce3572ec0c..00000000000 --- a/backends/vulkan/_passes/remove_local_scalar_dense_ops.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult - -from torch._subclasses.fake_tensor import FakeTensor - - -def node_is_local_scalar_dense_chain(node: torch.fx.Node) -> bool: - """ - Converting a tensor to a scalar via tensor[0].item() creates a index_select + - local_scalar_dense pattern in the graph. Check if a node is the start of this pattern. - """ - if ( - node.op == "call_function" - and node.target == exir_ops.edge.aten.select_copy.int - and len(node.users) == 1 - ): - user = list(node.users.keys())[0] - return user.target == torch.ops.aten._local_scalar_dense.default - - return False - - -def tag_node_if_scalar_tensor(node: torch.fx.Node) -> None: - """ - A scalar tensor in the Vulkan backend is a tensor that can be represented as a scalar - value instead of a Tensor object. The criteria for identifying a tensor as a scalar - tensor are as follows: - - 1. The tensor has only 1 element - 2. One of the node's uses is converting it to a scalar via `tensor[0].item()`, which - creates a index_select + local_scalar_dense pattern in the graph - - If any of these criteria are fulfilled, then tag the node for the tensor to mark it - so that it is added as a scalar value during serialization. 
- """ - tensor_val = node.meta["val"] - if not isinstance(tensor_val, FakeTensor): - return - - # Scalar tensors must have only one element - if tensor_val.numel() != 1: - return - - for user in node.users: - if node_is_local_scalar_dense_chain(user): - node.meta["etvk_is_scalar_tensor"] = True - - -def remove_local_scalar_dense_chain(graph: torch.fx.Graph, node: torch.fx.Node) -> None: - """ - Remove the index_select + local_scalar_dense pattern in the graph in favor of passing - the original scalar tensor directly. - """ - replace_node = node.args[0] - assert isinstance(replace_node, torch.fx.Node) - # If the argument to the local_scalar_dense op is a select op with only - # one user, and the argument to the select op is a tensor with only one - # element (i.e. a scalar tensor), then replace the entire pattern with the - # scalar tensor. - if ( - replace_node.op == "call_function" - and replace_node.target == exir_ops.edge.aten.select_copy.int - ): - # pyre-ignore - if replace_node.args[0].meta["val"].numel() == 1: - replace_node = replace_node.args[0] - assert isinstance(replace_node, torch.fx.Node) - assert replace_node.meta.get("etvk_is_scalar_tensor", True) - - with graph.inserting_after(node): - node.replace_all_uses_with(replace_node) - - -def remove_local_scalar_dense_ops(graph: torch.fx.Graph) -> torch.fx.Graph: - """ - The purpose of this pass is twofold: - 1. Tag scalar tensors (see `tag_node_if_scalar_tensor()` for the criteria) - 2. Remove the index_select + local_scalar_dense pattern in the graph in favor of - passing the original scalar tensor directly (see `remove_local_scalar_dense_chain()`) - - This makes it easier to deal with scalar tensors in the Vulkan backend. In particular, - it allows serializing scalar tensors as SymInt objects instead of Tensor objects. - Because scalar tensors are often used to inform tensor shapes, their values need to - be easily accessed by the CPU during resizing logic, while also being able to reflect - updates to their value in any GPU shaders that reference them. - """ - target_op = torch.ops.aten._local_scalar_dense.default - for node in graph.nodes: - tag_node_if_scalar_tensor(node) - - if node.op == "call_function" and node.target == target_op: - remove_local_scalar_dense_chain(graph, node) - - graph.eliminate_dead_code() - return graph - - -class RemoveLocalScalarDenseOpsTransform(ExportPass): - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - graph_module.graph = remove_local_scalar_dense_ops(graph_module.graph) - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/remove_redundant_ops.py b/backends/vulkan/_passes/remove_redundant_ops.py deleted file mode 100644 index 530505f7003..00000000000 --- a/backends/vulkan/_passes/remove_redundant_ops.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Set, Union - -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass - -OpType = Union[str, torch._ops.OpOverload, EdgeOpOverload] - - -class RemoveRedundantOpsTransform(ExportPass): - """ - Trim certain operators to reduce unnecessary overhead. 
- """ - - redundant_ops: Set[OpType] = { - torch.clone, - torch.ops.aten.clone.default, - exir_ops.edge.aten.clone.default, - torch.ops.aten.alias.default, - exir_ops.edge.aten.alias.default, - exir_ops.edge.aten.lift_fresh_copy.default, - exir_ops.edge.dim_order_ops._to_dim_order_copy.default, - } - - def __init__(self) -> None: - super(RemoveRedundantOpsTransform, self).__init__() - - def _should_remove(self, node: torch.fx.Node) -> bool: - if node.target in self.redundant_ops: - return True - - # Only remove to_copy if dtype does not change. Otherwise, memory format changes - # will be handled internally by the backend. - if ( - node.target == exir_ops.edge.aten._to_copy.default - or node.target == torch.ops.aten._to_copy.default - ): - src_dtype = node.meta["val"].dtype - # pyre-ignore - dst_dtype = node.args[0].meta["val"].dtype - return src_dtype == dst_dtype - - return False - - def _remove(self, graph_module: torch.fx.GraphModule) -> None: - for node in graph_module.graph.nodes: - if not self._should_remove(node): - continue - - with graph_module.graph.inserting_after(node): - node.replace_all_uses_with(node.args[0]) - - graph_module.graph.eliminate_dead_code() - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - self._remove(graph_module) - graph_module.recompile() - dead_code_elimination_pass(graph_module) - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py deleted file mode 100644 index c415249383e..00000000000 --- a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -from typing import Dict, List, Set, Tuple, Union - -import executorch.backends.vulkan.custom_ops_lib # noqa: needed to access vk op -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue - -from torch._ops import OpOverload - -from torch.fx.node import Argument - -OpType = Union[str, OpOverload, EdgeOpOverload] - - -class SqueezeUnsqueezeInputs(ExportPass): - _squeezable_ops: Set[OpType] = { - exir_ops.edge.et_vk.linear_weight_int4.default, - exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.gelu.default, - } - - def should_squeeze(self, op, shape: List[int]) -> bool: # pyre-ignore - if len(shape) == 3: - return shape[1] == 1 and shape[0] > 1 - if len(shape) == 4: - # No need to squeeze if all dims are 1 except the width dim - if shape[0] == shape[1] == shape[2] == 1: - return False - # No need to squeeze if batch and channel dims are 1 and height and width are > 1 - if shape[0] == shape[1] == 1 and shape[2] > 1 and shape[3] > 1: - return False - # No need to squeeze if batch dim is 1 and channel, height and width are > 1 - if shape[0] == 1 and shape[1] > 1 and shape[2] > 1 and shape[3] > 1: - return False - # Otherwise, check for squeezable dim - return 1 in shape[:-1] - - # Prefer not to introduce additional orchestration ops by default - return False - - def call_operator( - self, - op, # pyre-ignore - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - meta: NodeMetadata, - ) -> ProxyValue: - if op not in self._squeezable_ops: - return super().call_operator(op, args, kwargs, meta) - # pyre-ignore[16]: `None` has no attribute `node` - input_shape = args[0].node.meta["val"].shape - output_shape = meta["val"].shape - - if not self.should_squeeze(op, input_shape): - return super().call_operator(op, args, kwargs, meta) - - def _squeezable(shape: List[int]) -> bool: - return len(shape) > 2 and 1 in shape - - # squeeze input tensor - squeeze_shape = list(input_shape) - while _squeezable(squeeze_shape): - squeeze_shape.remove(1) - - squeeze_out = super().call_operator( - exir_ops.edge.aten.view_copy.default, - (args[0], squeeze_shape), - kwargs, - meta, - ) - # call linear on squeezed output - new_args = (squeeze_out, *args[1:]) - linear_out = super().call_operator( - op, - new_args, - kwargs, - meta, - ) - # unsqueeze output - unsqueeze_shape = list(output_shape) - return super().call_operator( - exir_ops.edge.aten.view_copy.default, - (linear_out, unsqueeze_shape), - kwargs, - meta, - ) diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py deleted file mode 100644 index db53cc666a8..00000000000 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
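The SqueezeUnsqueezeInputs pass above only inserts `view_copy` nodes around the selected op; numerically it is equivalent to flattening the squeezable unit dimension, running the op, and restoring the original output shape. A small sketch of that equivalence in plain PyTorch follows, with the shapes (a [4, 1, 8] input and a 16-feature linear layer) chosen purely for illustration.

```python
import torch

x = torch.randn(4, 1, 8)        # batch of 4 with a singleton middle dim
weight = torch.randn(16, 8)

# What the rewritten graph computes: squeeze, run the op, unsqueeze back.
squeezed = x.view(4, 8)                       # view_copy to the squeezed shape
out_squeezed = torch.nn.functional.linear(squeezed, weight)
out = out_squeezed.view(4, 1, 16)             # view_copy back to the output shape

# Identical to running the op directly on the original rank-3 input.
assert torch.allclose(out, torch.nn.functional.linear(x, weight))
```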
- -import logging -import operator - -from typing import Any - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.op_registry import get_op_features, has_impl, OpFeatures - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) - -from executorch.exir.dialects._ops import ops as exir_ops - -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.tensor import TensorSpec - -logger: logging.Logger = logging.getLogger("") -logger.setLevel(logging.INFO) - - -def insert_transition_node( - graph_module: torch.fx.GraphModule, - node: torch.fx.Node, - arg: torch.fx.Node, - arg_node_repr: utils.TensorRepr, -) -> None: - """ - Insert a clone node to transition the tensor associated with `arg` to a tensor with - the requested representation `arg_node_repr`, and use the cloned node as an argument - to `node` instead of `arg`. - """ - with graph_module.graph.inserting_before(node): - clone_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.aten.clone.default, - (arg,), - ) - clone_node.meta["val"] = arg.meta["val"] - clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"]) - clone_node.meta["spec"].const = False - utils.set_node_repr(clone_node, arg_node_repr) - arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y) - - -def set_arg_node_repr_or_transition( - graph_module: torch.fx.GraphModule, - op_node: torch.fx.Node, - arg_i: int, - arg_node_repr: utils.TensorRepr, - dirty: bool, -) -> bool: - """ - Does one of following: - 1. Sets the `node_repr` of the argument at `arg_i` of `op_node` if the argument node - does not currently have a `node_repr` - 2. No-op if the current `node_repr` is already the same as the requested represetnation. - 3. Insert a transition node to create a copy of the argument with the desired `node_repr` - if the current `node_repr` is different than what is needed. - """ - arg_node = op_node.args[arg_i] - - def single_node_impl(node: torch.fx.Node) -> bool: - # Case where the arg node has not been touched yet; in this case, simply set it and - # return. - if not utils.has_node_repr(node): - utils.set_node_repr(node, arg_node_repr) - return False - - # Case where the current node representation is the same as the new one. - cur_node_repr = utils.get_node_repr(node) - assert isinstance(cur_node_repr, utils.TensorRepr) - - if cur_node_repr == arg_node_repr: - return False - - if not dirty: - logger.info( - f"[Vulkan Delegate] Inserting transition(s) for {op_node.format_node()}:" - ) - - # Existing node representation is different; insert a transition node - # Currently, the transition node insertion logic can only handle single tensor nodes - assert utils.is_single_tensor_node(node) - insert_transition_node(graph_module, op_node, node, arg_node_repr) - - logger.info(f" arg {arg_i} ({node}): ({cur_node_repr}) -> ({arg_node_repr})") - - return True - - if isinstance(arg_node, torch.fx.Node): - return single_node_impl(arg_node) - elif isinstance(arg_node, (list, tuple)): - ret: bool = False - for n in arg_node: - assert isinstance(n, torch.fx.Node) - assert utils.is_single_tensor_node(n) - ret = single_node_impl(n) or ret - - return ret - - raise NotImplementedError(f"Unhandled node type {arg_node}") - - -class TagMemoryMetaPass(ExportPass): - """ - Operator implementations in the Vulkan delegate may require that input and output - tensors use a specific representation. 
Representation in this case refers to a - combination of storage type (buffer or texture) and memory layout (width, height, or - channels packed). - - The tag memory metadata pass is responsible for marking each tensor in the graph - with the appropriate representation to use. It is also responsible for inserting - operators to transition argument tensors to a required/compatible representation if - a mismatch has been detected. - """ - - def __init__( - self, - texture_limits: utils.ImageExtents, - default_storage_type: VkStorageType = VkStorageType.TEXTURE_3D, - default_memory_layout: VkMemoryLayout = VkMemoryLayout.TENSOR_WIDTH_PACKED, - ): - super().__init__() - self.default_storage: VkStorageType = default_storage_type - self.default_layout: VkMemoryLayout = default_memory_layout - self.texture_limits = texture_limits - - # Magic number to limit "lookahead" when tracing through users of an operator - # to constrain the representation of its arguments/outputs. - self.max_trace_search_depth = 20 - - def is_valid_op_node(self, node: Any) -> bool: - """ - Fails the check for: - * nodes that are not associated with a tensor - * nodes that are associated with a constant tensor - * nodes that are not associated with a supported operator - """ - if not isinstance(node, torch.fx.Node) or not utils.is_tensor_node(node): - return False - if node.meta.get("etvk_tensorref", False): - return False - if not has_impl(node.target): - return False - - return True - - def is_non_constant_tensor_node(self, node: Any) -> bool: - """ - Fails the check for: - * Nodes that are not associated with tensor values - * Nodes associated with constant tensors - * - """ - if isinstance(node, torch.fx.Node): - if not utils.is_tensor_node(node): - return False - if node.meta.get("etvk_tensorref", False): - return False - return True - - if isinstance(node, (tuple, list)): - for n in node: - if not isinstance(n, torch.fx.Node): - return False - if not self.is_non_constant_tensor_node(n): - return False - - return True - - # Return false by default - return False - - def get_node_cached_repsets(self, op_node: torch.fx.Node) -> utils.OpRepSets: - """ - Implements a cache layer for getting the OpRepSets for a given operator node. - """ - assert self.is_valid_op_node(op_node) - - if "etvk_node_repsets" in op_node.meta: - op_repsets = op_node.meta["etvk_node_repsets"] - assert isinstance(op_repsets, utils.OpRepSets) - return op_repsets - else: - # Special case for getitem - set the input and output to the repset of the - # tensor value being extracted - if op_node.target == operator.getitem: - src_node = op_node.args[0] - assert isinstance(src_node, torch.fx.Node) - idx = op_node.args[1] - assert isinstance(idx, int) - - arg_node_repsets = self.get_node_cached_repsets(src_node) - out_tensor_repset = arg_node_repsets.get_out_repset(idx) - - op_repsets = utils.OpRepSets( - utils.TensorRepSetList(out_tensor_repset), - utils.TensorRepSetList(out_tensor_repset), - op_node, - self.texture_limits, - ) - else: - features: OpFeatures = get_op_features(op_node.target) # noqa - op_repsets = features.make_op_repsets(op_node, self.texture_limits) - - op_node.meta["etvk_node_repsets"] = op_repsets - return op_repsets - - def get_arg_tensor_source_repset( - self, op_node: torch.fx.Node, arg_i: int - ) -> utils.TensorRepSet: - """ - Get the "source RepSet" for the tensor argument at index `arg_i` of `op_node`. - The source repset is obtained in one of two ways: - - 1. 
If the tensor argument already has a representation determined for it, return - a repset that contains that representation. - 2. Otherwise, return the output repset of the operator that produces the tensor - """ - arg_node = op_node.args[arg_i] - - # Special case for cat - use the first tensor in the list as representative - if isinstance(arg_node, list): - arg_node = arg_node[0] - - if utils.has_node_repr(arg_node): - arg_node_repr = utils.get_node_repr(arg_node) - assert isinstance(arg_node_repr, utils.TensorRepr) - return utils.make_tensor_repset(arg_node_repr) - elif self.is_valid_op_node(arg_node): - # Special case for getitem - propagate the node representation of the original node - if op_node.target == operator.getitem: - src_node = op_node.args[0] - assert isinstance(src_node, torch.fx.Node) - idx = op_node.args[1] - assert isinstance(idx, int) - - src_node_repsets = self.get_node_cached_repsets(src_node) - return src_node_repsets.get_out_repset(idx) - - src_node_repsets = self.get_node_cached_repsets(arg_node) - return src_node_repsets.get_out_repset(0) - - # default return - return utils.ANY_STORAGE - - def constrain_repset_with_user( - self, - current_node: torch.fx.Node, - arg_i: int, - arg_repset: utils.TensorRepSet, - search_depth: int = 0, - ) -> utils.TensorRepSet: - """ - Attempts to constrain `arg_repset` based on the required repset of the argument - at index `arg_i` of `current_node`. This tries to find a representation for the - argument that can be used for as long as possible without needing a transition. - """ - # The repset is already constrained; return it - if arg_repset.is_constrained(): - return arg_repset - - # The current node is not a valid op node, so no OpRepSets object can be created - # for it. - if not self.is_valid_op_node(current_node): - return arg_repset - - cur_node_repsets = self.get_node_cached_repsets(current_node) - - # Intersect with the repset required by the current operator; otherwise, return - # since a transition will be required anyways - req_arg_repset = cur_node_repsets.get_arg_repset(arg_i) - if req_arg_repset.any_in_common(arg_repset): - arg_repset = arg_repset.make_intersect(req_arg_repset) - else: - return arg_repset - - # Check if the argument at `arg_i` will influence the output representation of - # the current operator. - repset_propagates_to_output = cur_node_repsets.sync_primary_io_repr and ( - cur_node_repsets.sync_args_repr or arg_i == cur_node_repsets.primary_arg_idx - ) - - # If not, then no point in continuing to trace the users of the current node - if not repset_propagates_to_output: - return arg_repset - - return self.trace_node_users_to_constrain_repset( - current_node, arg_repset, search_depth - ) - - def trace_node_users_to_constrain_repset( - self, - origin_node: torch.fx.Node, - repset: utils.TensorRepSet, - search_depth: int = 0, - ) -> utils.TensorRepSet: - """ - For an ambiguous repset, try to constrain the repset by tracing the required - repsets of the users of `origin_node`. The idea is to try to find a representation - that can be used the longest without needing user nodes to insert a transition - for its arguments. 
- """ - # Optionally limit the search depth to improve export time - if self.max_trace_search_depth is not None: - if search_depth > self.max_trace_search_depth: - return repset - - users_to_trace = origin_node.users - - sync_outs_repr = True - if self.is_valid_op_node(origin_node): - sync_outs_repr = self.get_node_cached_repsets(origin_node).sync_outs_repr - - if utils.num_tensors_in_node(origin_node) > 1 and not sync_outs_repr: - users_to_trace = [] - for usage_node in origin_node.users: - if usage_node.target == operator.getitem and usage_node.args[1] == 1: - users_to_trace.append(usage_node) - - for usage_node in users_to_trace: - arg_i_in_user = None - for i in range(len(usage_node.args)): - if origin_node == usage_node.args[i]: - arg_i_in_user = i - break - - if arg_i_in_user is not None: - repset = self.constrain_repset_with_user( - usage_node, arg_i_in_user, repset, search_depth + 1 - ) - - if repset.is_constrained(): - return repset - - return repset - - def constrain_op_arg_repset(self, arg_i: int, op_repsets: utils.OpRepSets) -> None: - """ - Attempts to constrain the repset of the argument at index `arg_i` of the op - associated with `op_repsets`. Does this with two stages: - - 1. First, account for any existing representation that has already been determined - for the argument. If no existing representation has been determined, then use - the output repset of the operator that produces the argument. - 2. Then, try to trace through the users of the argument to find a representation - that can be used for as long as possible without needing a transition. - """ - arg_source_repset = self.get_arg_tensor_source_repset(op_repsets.op_node, arg_i) - op_repsets.try_constrain_with_arg_repset(arg_i, arg_source_repset) - - arg_repset = op_repsets.get_arg_repset(arg_i) - if arg_repset.is_constrained(): - return arg_repset - - arg_node = op_repsets.op_node.args[arg_i] - - if isinstance(arg_node, list): - arg_node = arg_node[0] - - arg_repset = self.trace_node_users_to_constrain_repset(arg_node, arg_repset) - op_repsets.try_constrain_with_arg_repset(arg_i, arg_repset) - - def constrain_op_repsets(self, op_repsets: utils.OpRepSets) -> None: - # For most ops, constraining the argument repsets will also contrain the output - # repset due to OpRepSets maintaining synchronization rules. - for i in range(len(op_repsets.op_node.args)): - if utils.is_tensor_arg_node(op_repsets.op_node.args[i]): - self.constrain_op_arg_repset(i, op_repsets) - - # TODO(ssjia): For most ops, inputs and outputs must be synchronized, so there - # is no need to constrain output repsets explicitly. Currently, the exceptions - # (i.e. choose qparams) already define constrined repsets for the output, so - # there is again no need to explicitly constrain the outputs. If an operator - # appears later on that does not sync input and output representations, and - # defines ambiguous repsets for the output tensor(s), then we will need to add - # additional logic to this function to constrain the output repsets separately - # from the input repsets. - - def set_op_node_tensor_reprs( - self, graph_module: torch.fx.GraphModule, op_node: torch.fx.Node - ) -> None: - """ - For an operator representated by `op_node`, get the OpRepSets associated with - the operation and try to constrain the repsets by accounting for existing - representations and tracing through the users of the operator. - - Then, determine a tensor representation for all tensors participating in the - operation and mark it in the node metadata. 
If the requested representation is - different than an already determined representation, then insert a transition - node to create a copy of the tensor with the desired representation. - """ - if not self.is_valid_op_node(op_node): - return - - # Special case for getitem - propagate the node representation of the original node - if op_node.target == operator.getitem: - src_node = op_node.args[0] - assert isinstance(src_node, torch.fx.Node) - idx = op_node.args[1] - assert isinstance(idx, int) - - arg_node_repr = utils.get_node_repr(src_node) - assert isinstance(arg_node_repr, list) - utils.set_node_repr(op_node, arg_node_repr[idx]) - return - - # Get a "fresh" OpRepSets object instead of using the cache. Do this because this - # class instance will go through the constraining process which may modify it. - features: OpFeatures = get_op_features(op_node.target) - op_repsets = features.make_op_repsets(op_node, self.texture_limits) - - self.constrain_op_repsets(op_repsets) - - args_repr_list, outs_repr_list = op_repsets.pick_representations() - - if len(outs_repr_list) == 1: - utils.set_node_repr(op_node, outs_repr_list[0]) - else: - utils.set_node_repr(op_node, outs_repr_list) - - transitions_inserted = False - for i, arg_node in enumerate(op_node.args): - if not self.is_non_constant_tensor_node(arg_node): - continue - - arg_node_repr = args_repr_list[i] - - if isinstance(arg_node, torch.fx.Node): - transitions_inserted = ( - set_arg_node_repr_or_transition( - graph_module, op_node, i, arg_node_repr, transitions_inserted - ) - or transitions_inserted - ) - elif isinstance(arg_node, (list, tuple)): - for n in arg_node: - assert isinstance(n, torch.fx.Node) - assert utils.is_single_tensor_node(n) - transitions_inserted = ( - set_arg_node_repr_or_transition( - graph_module, - op_node, - i, - arg_node_repr, - transitions_inserted, - ) - or transitions_inserted - ) - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - self.set_op_node_tensor_reprs(graph_module, node) - - return PassResult(graph_module, True) diff --git a/backends/vulkan/cmake b/backends/vulkan/cmake new file mode 120000 index 00000000000..21498ceec01 --- /dev/null +++ b/backends/vulkan/cmake @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/cmake \ No newline at end of file diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake deleted file mode 100644 index 1b6838c4dfd..00000000000 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i ATenVulkan.cmake -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -if(NOT PYTHON_EXECUTABLE) - message( - "WARNING: PYTHON_EXECUTABLE is not set! A failure is likely imminent." - ) -endif() - -if(NOT EXECUTORCH_ROOT) - message("WARNING: EXECUTORCH_ROOT is not set! 
A failure is likely imminent.") -endif() - -if(ANDROID) - if(NOT ANDROID_NDK) - message(FATAL_ERROR "ANDROID_NDK not set") - endif() - - if(NOT GLSLC_PATH) - set(GLSLC_PATH - "${ANDROID_NDK}/shader-tools/${ANDROID_NDK_HOST_SYSTEM_NAME}/glslc" - ) - endif() -else() - find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) - - if(NOT GLSLC_PATH) - message(FATAL_ERROR "USE_VULKAN glslc not found") - endif() -endif() - -# Required to enable linking with --whole-archive -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - -function(gen_vulkan_shader_lib_cpp shaders_path) - set(VULKAN_SHADERGEN_ENV "") - set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/vulkan_compute_shaders) - - set(GEN_SPV_ARGS "--optimize") - if(DEFINED ENV{ETVK_USING_SWIFTSHADER}) - if("$ENV{ETVK_USING_SWIFTSHADER}" STREQUAL "1" - OR "$ENV{ETVK_USING_SWIFTSHADER}" STREQUAL "True" - ) - list(APPEND GEN_SPV_ARGS "--replace-u16vecn") - endif() - endif() - - add_custom_command( - COMMENT "Generating Vulkan Compute Shaders" - OUTPUT ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp - COMMAND - "${PYTHON_EXECUTABLE}" - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path - ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} - --glslc-path=${GLSLC_PATH} - --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH}/shader_cache/ --env - ${VULKAN_GEN_ARG_ENV} ${GEN_SPV_ARGS} - DEPENDS ${shaders_path}/* - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py - ) - - set(generated_spv_cpp - ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp - PARENT_SCOPE - ) -endfunction() - -function(vulkan_shader_lib library_name generated_spv_cpp) - add_library(${library_name} STATIC ${generated_spv_cpp}) - target_include_directories( - ${library_name} - PRIVATE - ${EXECUTORCH_ROOT}/.. - ${EXECUTORCH_ROOT}/backends/vulkan/third-party/Vulkan-Headers/include - ${EXECUTORCH_ROOT}/backends/vulkan/third-party/volk - ) - target_link_libraries(${library_name} vulkan_backend) - target_compile_options(${library_name} PRIVATE ${VULKAN_CXX_FLAGS}) - # Link this library with --whole-archive due to dynamic shader registrations - executorch_target_link_options_shared_lib(${library_name}) -endfunction() - -# Convenience macro to generate a SPIR-V shader library target. Given the path -# to the shaders to compile and the name of the library, it will create a static -# library containing the generated SPIR-V shaders. The generated_spv_cpp -# variable can be used to reference the generated CPP file outside the macro. -macro(vulkan_shader_library shaders_path library_name) - set(VULKAN_SHADERGEN_ENV "") - set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${library_name}) - - set(generated_spv_cpp ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp) - - add_library(${library_name} STATIC ${generated_spv_cpp}) - target_include_directories( - ${library_name} - PRIVATE - ${EXECUTORCH_ROOT}/.. 
- ${EXECUTORCH_ROOT}/backends/vulkan/third-party/Vulkan-Headers/include - ${EXECUTORCH_ROOT}/backends/vulkan/third-party/volk - ) - target_link_libraries(${library_name} vulkan_backend) - target_compile_options(${library_name} PRIVATE ${VULKAN_CXX_FLAGS}) - # Link this library with --whole-archive due to dynamic shader registrations - executorch_target_link_options_shared_lib(${library_name}) - - unset(VULKAN_SHADERGEN_ENV) - unset(VULKAN_SHADERGEN_OUT_PATH) -endmacro() diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py deleted file mode 100644 index 4312971f5f1..00000000000 --- a/backends/vulkan/custom_ops_lib.py +++ /dev/null @@ -1,545 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import executorch.backends.vulkan.patterns as vk_patterns -import torch.library - -namespace = "et_vk" -lib = torch.library.Library(namespace, "DEF") - -############# -## prepack ## -############# - - -def prepack_impl(x: torch.Tensor): - return x - - -name = "prepack" -lib.define(f"{name}(Tensor x) -> Tensor") -lib.impl(name, prepack_impl, "CompositeExplicitAutograd") -prepack_op = getattr(getattr(torch.ops, namespace), name) - -##################### -## conv_with_clamp ## -##################### - - -def conv_with_clamp_impl( - input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - transposed=False, - output_padding=0, - groups=1, - output_min=-float("inf"), - output_max=float("inf"), -): - return torch.clamp( - torch.convolution( - input, - weight, - bias, - stride, - padding, - dilation, - transposed, - output_padding, - groups, - ), - output_min, - output_max, - ) - - -name = "conv_with_clamp" -lib.define( - f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max) -> Tensor" -) -lib.impl(name, conv_with_clamp_impl, "CompositeExplicitAutograd") -conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) - -######################### -## conv_with_clamp.out ## -######################### - - -def conv_with_clamp_out_impl( - input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - transposed=False, - output_padding=0, - groups=1, - output_min=-float("inf"), - output_max=float("inf"), - out=None, -): - out = conv_with_clamp_impl( - input, - weight, - bias, - stride, - padding, - dilation, - transposed, - output_padding, - groups, - output_min, - output_max, - ) - return out - - -name = "conv_with_clamp.out" -lib.define( - f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) 
out) -> Tensor(a!)" -) -lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") - -################# -## grid_priors ## -################# - - -# The dimension of x should be larger than 1 -def grid_priors_impl( - x, - stride, - offset, -): - height, width = x.shape[-2:] - # Need to specify device of torch.arange to avoid executorch exporting error - shift_x = (torch.arange(0, width, device=x.device) + offset) * stride - shift_y = (torch.arange(0, height, device=x.device) + offset) * stride - # Need to specify indexing parameter ('ij' is the default value) to avoid executorch exporting error - shift_xx, shift_yy = torch.meshgrid([shift_y, shift_x], indexing="ij") - shift_xx = shift_xx.reshape(-1) - shift_yy = shift_yy.reshape(-1) - shifts = torch.stack((shift_yy, shift_xx), dim=-1) - return shifts - - -name = "grid_priors" -lib.define(f"{name}(Tensor self, int stride, float offset) -> Tensor") -lib.impl(name, grid_priors_impl, "CompositeExplicitAutograd") -grid_priors_op = getattr(getattr(torch.ops, namespace), name) - - -# When lowering to executorch, ops are converted from default to out variant. Hence, custom ops define both variants. -def grid_priors_out_impl( - x, - stride, - offset, - out, -): - out = grid_priors_impl(x, stride, offset) - return out - - -name = "grid_priors_out" -lib.define( - f"{name}(Tensor self, int stride, float offset, *, Tensor(a!) out) -> Tensor(a!)" -) -lib.impl(name, grid_priors_out_impl, "CompositeExplicitAutograd") - -######################## -## linear_weight_int4 ## -######################## - - -def linear_weight_int4_impl( - x: torch.Tensor, - weights_4x8: torch.Tensor, - groupsize: int, - scales_and_zeros: torch.Tensor, - inner_k_tiles: int, -): - original_x_size = x.size() - out_features = weights_4x8.size(0) - x = x.reshape(-1, original_x_size[-1]) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - weights_4x8, inner_k_tiles - ) - out = torch.ops.aten._weight_int4pack_mm( - x, weight_int4pack, groupsize, scales_and_zeros - ) - out_shape = original_x_size[:-1] + (out_features,) - return out.reshape(out_shape) - - -name = "linear_weight_int4" -lib.define( - f"{name}(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros, int inner_k_tiles) -> Tensor" -) -lib.impl(name, linear_weight_int4_impl, "CompositeExplicitAutograd") -linear_weight_int4_op = getattr(getattr(torch.ops, namespace), name) - -################## -## linear_qcs4w ## -################## - - -def linear_qcs4w( - x: torch.Tensor, - weights_4x2: torch.Tensor, - scales: torch.Tensor, -): - original_x_shape = x.shape - x = x.reshape(-1, original_x_shape[-1]) - - unpacked_weights_shape = weights_4x2.shape - out_features = unpacked_weights_shape[0] - in_features = unpacked_weights_shape[1] - - weights_unpacked = torch.empty( - (out_features, in_features * 2), dtype=torch.int8, device=weights_4x2.device - ) - - weights_unpacked[:, ::2] = weights_4x2 >> 4 - weights_unpacked[:, 1::2] = weights_4x2 & 0x0F - - n_bit = 8 - quant_min = -(2 ** (n_bit - 1)) - quant_max = 2 ** (n_bit - 1) - 1 - dq_weights = torch.ops.quantized_decomposed.dequantize_per_channel( - weights_unpacked, - scales, - None, - 0, - quant_min, - quant_max, - torch.int8, - ) - - out = torch.nn.functional.linear(x, dq_weights) - out_shape = original_x_shape[:-1] + (out_features,) - return out.reshape(out_shape) - - -name = "linear_qcs4w" -lib.define(f"{name}(Tensor self, Tensor weight, Tensor scales) -> Tensor") -lib.impl(name, linear_qcs4w, "CompositeExplicitAutograd") -linear_qc4w_op = 
getattr(getattr(torch.ops, namespace), name) - -################## -## linear_q4gsw ## -################## - - -def unpack_4bit_weight_tensor( - packed_weight_tensor: torch.Tensor, x: torch.Tensor -) -> torch.Tensor: - """ - Reverses the packing performed in quantized_linear.pack_4bit_weight_tensor - """ - # Each packed byte contains two 4-bit values: high nibble and low nibble - K, N_half = packed_weight_tensor.shape - N = N_half * 2 - - # Unpack high and low nibbles - high_nibble = (packed_weight_tensor >> 4) & 0x0F - low_nibble = packed_weight_tensor & 0x0F - - # Stack to shape (K, N) - unpacked = torch.empty( - (K, N), dtype=torch.uint8, device=packed_weight_tensor.device - ) - unpacked[:, ::2] = low_nibble - unpacked[:, 1::2] = high_nibble - - # Undo the +8 offset and convert to signed 4-bit range [-8, 7] - unpacked = unpacked.to(torch.int8) - 8 - - in_channels = x.shape[-1] - # Undo any padding that may have been added to input channels - if in_channels != unpacked.shape[-1]: - return unpacked[:, :in_channels] - - return unpacked - - -def linear_q4gsw( - x: torch.Tensor, - weights: torch.Tensor, - weight_scales: torch.Tensor, - group_size: int, - bias: Optional[torch.Tensor] = None, -): - # Unpack the packed weights - weights = unpack_4bit_weight_tensor(weights, x) - - # Un-transpose the weight scales - weight_scales = weight_scales.transpose(0, 1) - weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) - - weights = torch.ops.torchao.dequantize_affine( - weights, [1, group_size], weight_scales, weight_zeros, torch.int8, -8, 7 - ) - - out = torch.nn.functional.linear(x, weights) - return out - - -name = "linear_q4gsw" -lib.define( - f""" - {name}( - Tensor self, - Tensor weights, - Tensor weight_scales, - int group_size, - Tensor? bias = None) -> Tensor - """ -) -lib.impl(name, linear_q4gsw, "CompositeExplicitAutograd") -linear_qc4w_op = getattr(getattr(torch.ops, namespace), name) - -######################## -## linear_qta8a_qga4w ## -######################## - - -def linear_qta8a_qga4w( - x_quantized: torch.Tensor, - input_scale: torch.Tensor, - input_zero_point: torch.Tensor, - weights_4bit: torch.Tensor, - group_size: int, - weight_scales: torch.Tensor, - weight_zeros: torch.Tensor, -): - """ - Dynamic activation + grouped weight quantized linear (QTA8A_QGA4W). 
- - Args: - x_quantized: Already quantized input tensor (int8, per-token quantized) - input_scale: Scale for per-token quantization of input (shape: [batch_size]) - input_zero_point: Zero point for per-token quantization of input (shape: [batch_size]) - weights_4bit: Packed 4-bit quantized weights - group_size: Group size for weight quantization (int) - weight_scales: Per-group scales for weights - weight_zeros: Per-group zero points for weights - """ - original_x_shape = x_quantized.shape - feature_dim = original_x_shape[-1] - - # Reshape for processing - x_quantized_2d = x_quantized.reshape(-1, feature_dim) - - # Unpack 4-bit weights - unpacked_weights_shape = weights_4bit.shape - out_features = unpacked_weights_shape[0] - in_features = unpacked_weights_shape[1] - - weights_unpacked = torch.empty( - (out_features, in_features * 2), dtype=torch.int8, device=weights_4bit.device - ) - - weights_unpacked[:, ::2] = weights_4bit >> 4 - weights_unpacked[:, 1::2] = weights_4bit & 0x0F - - # Convert to signed 4-bit range [-8, 7] - weights_unpacked = torch.where( - weights_unpacked > 7, weights_unpacked - 16, weights_unpacked - ) - - # Dequantize weights using grouped quantization - actual_in_features = in_features * 2 - num_groups = actual_in_features // group_size - - # Reshape weights for grouped dequantization - weights_grouped = weights_unpacked.view(out_features, num_groups, group_size) - - # Expand scales and zeros to match grouped weights - scales_expanded = weight_scales.unsqueeze(-1).expand(-1, -1, group_size) - zeros_expanded = weight_zeros.unsqueeze(-1).expand(-1, -1, group_size) - - # Dequantize: (quantized - zero_point) * scale - dq_weights_grouped = (weights_grouped.float() - zeros_expanded) * scales_expanded - dq_weights = dq_weights_grouped.view(out_features, actual_in_features) - - # Dequantize input (per-token) - # For per-token quantization, each token (row) has its own scale and zero_point - x_dequantized = torch.ops.quantized_decomposed.dequantize_per_token( - x_quantized_2d, - input_scale, - input_zero_point, - -128, - 127, - torch.int8, - torch.float32, - ) - - # Perform linear operation - out = torch.nn.functional.linear(x_dequantized, dq_weights) - out_shape = original_x_shape[:-1] + (out_features,) - return out.reshape(out_shape) - - -name = "linear_qta8a_qga4w" -lib.define( - f"{name}(Tensor self, Tensor input_scale, Tensor input_zero_point, Tensor weight, int group_size, Tensor weight_scales, Tensor weight_zeros) -> Tensor" -) -lib.impl(name, linear_qta8a_qga4w, "CompositeExplicitAutograd") -linear_qta8a_qga4w_op = getattr(getattr(torch.ops, namespace), name) - -################# -## qaqw_linear ## -################# - - -def linear_q8ta_q8csw( - x: torch.Tensor, - input_scale: float, - input_zero_point: int, - weights: torch.Tensor, - weight_sums: torch.Tensor, - weight_scales: torch.Tensor, - bias: Optional[torch.Tensor] = None, -): - weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) - weights = torch.ops.quantized_decomposed.dequantize_per_channel( - weights, - weight_scales, - weight_zeros, - 0, - -127, - 127, - torch.int8, - ) - - # Perform linear operation - out = torch.nn.functional.linear(x, weights) - if bias is not None: - out = out + bias - - return out - - -name = "linear_q8ta_q8csw" -lib.define( - f""" - {name}( - Tensor x, - float input_scale, - int input_zero_point, - Tensor weights, - Tensor weight_sums, - Tensor weight_scales, - Tensor? 
bias = None) -> Tensor - """ -) -lib.impl(name, linear_q8ta_q8csw, "CompositeExplicitAutograd") -qa_q8csw_linear = getattr(getattr(torch.ops, namespace), name) - -################## -## conv2d_q8ta_q8csw ## -################## - - -def conv2d_q8ta_q8csw( - x: torch.Tensor, - input_scale: float, - input_zero_point: int, - weights: torch.Tensor, - weight_sums: torch.Tensor, - weight_scales: torch.Tensor, - bias: Optional[torch.Tensor], - kernel_size: list, - stride: list, - padding: list, - dilation: list, - groups: int, -): - IC = x.shape[1] - K_h, K_w = kernel_size[0], kernel_size[1] - - canonical_weight_K_dim = K_h * K_w * IC - # Remove any padding added to output channels dim to align to a multiple of 4 - if weights.shape[-1] != canonical_weight_K_dim: - weights = weights[:, :canonical_weight_K_dim] - weight_scales = weight_scales[:canonical_weight_K_dim] - if bias is not None: - bias = bias[:canonical_weight_K_dim] - - weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) - - # Calculate dimensions - OC = weights.shape[0] - in_features = weights.shape[1] - IC = in_features // (K_h * K_w) - - # Reshape to original 4D format (OC, IC, H, W) - weights = weights.view(OC, IC, K_h, K_w) - - # Dequantize weights - weights = torch.ops.quantized_decomposed.dequantize_per_channel( - weights, - weight_scales, - weight_zeros, - 0, # axis=0 for output channel quantization - -127, - 127, - torch.int8, - ) - - # Perform convolution - out = torch.nn.functional.conv2d( - x, weights, bias, stride, padding, dilation, groups - ) - - return out - - -name = "conv2d_q8ta_q8csw" -lib.define( - f""" - {name}( - Tensor x, - float input_scale, - int input_zero_point, - Tensor weights, - Tensor weight_sums, - Tensor weight_scales, - Tensor? bias, - SymInt[] kernel_size, - SymInt[] stride, - SymInt[] padding, - SymInt[] dilation, - SymInt groups) -> Tensor - """ -) -lib.impl(name, conv2d_q8ta_q8csw, "CompositeExplicitAutograd") -conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name) - -###################### -## apply_rotary_emb ## -###################### - - -def apply_rotary_emb_impl( - xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor -): - pattern = vk_patterns.RotaryEmbeddingPattern() - return pattern.forward(xq, xk, freqs_cos, freqs_sin) - - -name = "apply_rotary_emb" -lib.define( - f"{name}(Tensor xq, Tensor xk, Tensor freqs_cos, Tensor freqs_sin) -> (Tensor, Tensor)" -) -lib.impl(name, apply_rotary_emb_impl, "CompositeExplicitAutograd") -apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name) diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py new file mode 120000 index 00000000000..d0c384c9d97 --- /dev/null +++ b/backends/vulkan/custom_ops_lib.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/custom_ops_lib.py \ No newline at end of file diff --git a/backends/vulkan/docs b/backends/vulkan/docs new file mode 120000 index 00000000000..e833ee9a5ac --- /dev/null +++ b/backends/vulkan/docs @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/docs \ No newline at end of file diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md deleted file mode 100644 index ff84938b06f..00000000000 --- a/backends/vulkan/docs/android_demo.md +++ /dev/null @@ -1,128 +0,0 @@ -# Building and Running ExecuTorch with the Vulkan Backend - -The [ExecuTorch Vulkan Delegate](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) -is a native GPU 
delegate for ExecuTorch. - - -::::{grid} 2 -:::{grid-item-card} What you will learn in this tutorial: -:class-card: card-content -* How to export the Llama3.2-1B parameter model with partial GPU delegation -* How to execute the partially delegated model on Android -::: -:::{grid-item-card} Prerequisites: -:class-card: card-prerequisites -* Follow [**Setting up ExecuTorch**](../../../docs/source/getting-started-setup.rst) -* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) and follow the example in that page -::: -:::: - -## Prerequisites - -Note that all the steps below should be performed from the ExecuTorch repository -root directory, and assumes that you have gone through the steps of setting up -ExecuTorch. - -It is also assumed that the Android NDK and Android SDK is installed, and the -following environment examples are set. - -```shell -export ANDROID_NDK= -# Select an appropriate Android ABI for your device -export ANDROID_ABI=arm64-v8a -# All subsequent commands should be performed from ExecuTorch repo root -cd -# Make sure adb works -adb --version -``` - -## Lowering the Llama3.2-1B model to Vulkan - -::::{note} -The resultant model will only be partially delegated to the Vulkan backend. In -particular, only binary arithmetic operators (`aten.add`, `aten.sub`, -`aten.mul`, `aten.div`), matrix multiplication operators (`aten.mm`, `aten.bmm`), -and linear layers (`aten.linear`) will be executed on the GPU via the Vulkan -delegate. The rest of the model will be executed using Portable operators. - -Operator support for LLaMA models is currently in active development; please -check out the `main` branch of the ExecuTorch repo for the latest capabilities. -:::: - -First, obtain the `consolidated.00.pth`, `params.json` and `tokenizer.model` -files for the `Llama3.2-1B` model from the [Llama website](https://www.llama.com/llama-downloads/). - -Once the files have been downloaded, the `export_llama` script can be used to -partially lower the Llama model to Vulkan. - -```shell -# The files will usually be downloaded to ~/.llama -python -m examples.models.llama.export_llama \ - --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ - --model "llama3_2" \ - -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ - -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' -``` - -A `vulkan_llama2.pte` file should have been created as a result of running the -script. - -Push the tokenizer binary and `vulkan_llama2.pte` onto your Android device: - -```shell -adb push ~/.llama/tokenizer.model /data/local/tmp/ -adb push vulkan_llama2.pte /data/local/tmp/ -``` - -## Build and Run the LLaMA runner binary on Android - -First, build and install ExecuTorch libraries, then build the LLaMA runner -binary using the Android NDK toolchain. - -```shell -./install_executorch.sh --clean -(mkdir cmake-android-out && \ - cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out && \ - cmake --build cmake-android-out -j16 --target install) - -# Build LLaMA Runner library -(rm -rf cmake-android-out/examples/models/llama && \ - cmake examples/models/llama \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out/examples/models/llama && \ - cmake --build cmake-android-out/examples/models/llama -j16) -``` - -Finally, push and run the llama runner binary on your Android device. Note that -your device must have sufficient GPU memory to execute the model. - -```shell -adb push cmake-android-out/examples/models/llama/llama_main /data/local/tmp/llama_main - -adb shell /data/local/tmp/llama_main \ - --model_path=/data/local/tmp/vulkan_llama2.pte \ - --tokenizer_path=/data/local/tmp/tokenizer.model \ - --prompt "Hello" -``` - -Note that currently model inference will be very slow due to the high amount of -delegate blobs in the lowered graph, which requires a transfer to and from the -GPU for each sub graph. Performance is expected to improve drastically as more -of the model can be lowered to the Vulkan delegate, and techniques such as -quantization are supported. diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py deleted file mode 100644 index 1b74ef1ac65..00000000000 --- a/backends/vulkan/op_registry.py +++ /dev/null @@ -1,720 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import operator - -from typing import Any, Callable, Dict, List, Optional, Union - -import executorch.backends.vulkan.custom_ops_lib # noqa - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout - -from executorch.exir.dialects._ops import ops as exir_ops - -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from torch._subclasses.fake_tensor import FakeTensor - -###################### -## OpFeatures class ## -###################### - - -def allow_node(node: torch.fx.Node) -> bool: - return True - - -class OpFeatures: - __slots__ = [ - # Sets of possible (storage types, memory layouts) to use for the input tensor(s) - "inputs_storage", - # Sets of possible (storage types, memory layouts) to use for the output tensor(s) - "outputs_storage", - # bool indicating if the operator has a resize function, which allows it to - # support models with dynamic shape - "supports_resize", - # bool indicating if the operator handles its own prepacking. If this is True, - # then the insert_prepack_nodes pass will not insert prepack nodes for the args - # of the op. 
- "supports_prepacking", - # Optional check function used during partitioning to determine if a node's - # inputs are supported by the operator implementation. - "are_node_inputs_supported_fn", - ] - - def __init__( - self, - inputs_storage: Optional[ - Union[utils.TensorRepSet, List[utils.TensorRepSet]] - ] = None, - outputs_storage: Optional[ - Union[utils.TensorRepSet, List[utils.TensorRepSet]] - ] = None, - supports_resize: bool = False, - supports_prepacking: bool = False, - are_node_inputs_supported_fn: Optional[Callable] = allow_node, - ): - self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList( - inputs_storage if inputs_storage is not None else [] - ) - self.outputs_storage: utils.TensorRepSetList = utils.TensorRepSetList( - outputs_storage if outputs_storage is not None else [] - ) - - # If output storage is not set, assume that it is derived from the first input - if self.outputs_storage.any_is_empty(): - self.outputs_storage = utils.TensorRepSetList(self.inputs_storage[0]) - - self.supports_resize = supports_resize - self.supports_prepacking = supports_prepacking - - self.are_node_inputs_supported_fn = are_node_inputs_supported_fn - - def make_op_repsets( - self, - op_node: torch.fx.Node, - texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS, - ) -> utils.OpRepSets: - return utils.OpRepSets( - self.inputs_storage, self.outputs_storage, op_node, texture_limits - ) - - -####################### -## Operator Registry ## -####################### - -OpKey = Union[str, torch._ops.OpOverload, EdgeOpOverload] - -vulkan_supported_ops: Dict[OpKey, OpFeatures] = {} - - -def update_features(aten_op): - def features_decorator(fn: Callable): - def update_features_impl(op: OpKey): - if op in vulkan_supported_ops: - raise RuntimeError(f"[Vulkan delegate] duplicate registration of {op}!") - vulkan_supported_ops[op] = fn() - - if isinstance(aten_op, list): - for op in aten_op: - update_features_impl(op) - else: - update_features_impl(aten_op) - - return fn - - return features_decorator - - -@update_features( - [ - operator.getitem, - # Symbolic integer ops - torch.ops.aten.sym_size.int, - operator.add, - operator.lt, - operator.gt, - operator.ge, - operator.le, - operator.eq, - # Guard and assert ops - torch.ops.aten._assert_scalar.default, - torch.ops.aten.sym_constrain_range_for_size.default, - ] -) -def register_ephemeral_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ] -) -def register_quantization_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_BUFFER, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.torchao.quantize_affine.default, - exir_ops.edge.torchao.dequantize_affine.default, - ] -) -def register_affine_quantization_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_BUFFER, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.quantized_decomposed.choose_qparams.tensor, - 
exir_ops.edge.quantized_decomposed.choose_qparams_per_token_asymmetric.default, - ] -) -def register_torchao_quantization_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_BUFFER, - supports_resize=True, - ) - - -@update_features( - exir_ops.edge.torchao.choose_qparams_affine.default, -) -def register_torchao_choose_qparams_affine(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - outputs_storage=[ - utils.CONTIGUOUS_BUFFER, # scales - utils.CONTIGUOUS_BUFFER, # zero_points - ], - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.add.Tensor, - exir_ops.edge.aten.sub.Tensor, - exir_ops.edge.aten.minimum.default, - exir_ops.edge.aten.mul.Tensor, - exir_ops.edge.aten.div.Tensor, - exir_ops.edge.aten.div.Tensor_mode, - exir_ops.edge.aten.pow.Tensor_Tensor, - exir_ops.edge.aten.eq.Tensor, - exir_ops.edge.aten.lt.Tensor, - exir_ops.edge.aten.le.Tensor, - exir_ops.edge.aten.gt.Tensor, - exir_ops.edge.aten.ge.Tensor, - ] -) -def register_binary_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.abs.default, - exir_ops.edge.aten.clamp.default, - exir_ops.edge.aten.cos.default, - exir_ops.edge.aten.exp.default, - exir_ops.edge.aten.gelu.default, - exir_ops.edge.aten.hardshrink.default, - exir_ops.edge.aten.hardtanh.default, - exir_ops.edge.aten.neg.default, - exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.sigmoid.default, - exir_ops.edge.aten.sin.default, - exir_ops.edge.aten.sqrt.default, - exir_ops.edge.aten.rsqrt.default, - exir_ops.edge.aten.tanh.default, - exir_ops.edge.aten.round.default, - exir_ops.edge.aten.leaky_relu.default, - ] -) -def register_unary_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -@update_features(exir_ops.edge.aten._to_copy.default) -def register_to_copy_op(): - def check_to_copy_node(node: torch.fx.Node) -> bool: - float_dtypes = [torch.float16, torch.float32] - - if len(node.args) != 1: - return False - - in_arg = node.args[0] - if not isinstance(in_arg, torch.fx.Node): - return False - - in_tensor = in_arg.meta.get("val", None) - out_tensor = node.meta.get("val", None) - - if isinstance(in_tensor, FakeTensor) and isinstance(out_tensor, FakeTensor): - if out_tensor.dtype in float_dtypes and in_tensor.dtype in float_dtypes: - return True - - return False - - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - are_node_inputs_supported_fn=check_to_copy_node, - ) - - -@update_features(exir_ops.edge.dim_order_ops._to_dim_order_copy.default) -def register_to_copy_dim_order_op(): - # Currently there is no "real" implementation for to_dim_order_copy, but it can be - # removed as long as the operator is not changing the dtype, i.e. the operator call - # is modifying the dim order only. Therefore, check that the input and output dtypes - # are the same, if so the operator is safe to remove. 
- def check_dim_order_copy_node(node: torch.fx.Node) -> bool: - in_arg = node.args[0] - if not isinstance(in_arg, torch.fx.Node): - return False - - in_tensor = in_arg.meta.get("val", None) - out_tensor = node.meta.get("val", None) - - if in_tensor.dtype != out_tensor.dtype: - return False - - return True - - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - are_node_inputs_supported_fn=check_dim_order_copy_node, - ) - - -@update_features( - [ - exir_ops.edge.aten.bmm.default, - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, - exir_ops.edge.aten.linear.default, - ] -) -def register_mm_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.aten._weight_int8pack_mm.default, - exir_ops.edge.et_vk.linear_qcs4w.default, - ] -) -def register_int8_mm_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.linear_q8ta_q8csw.default, - exir_ops.edge.et_vk.linear_q4gsw.default, - ] -) -def register_quantized_linear_ops(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_prepacking=True, - supports_resize=False, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.linear_weight_int4.default, - ] -) -def register_int4_mm_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.linear_qta8a_qga4w.default, - ] -) -def register_dqlinear_op(): - return OpFeatures( - inputs_storage=[ - utils.CONTIGUOUS_ANY, # input - utils.CONTIGUOUS_BUFFER, # mat1 scales - utils.CONTIGUOUS_BUFFER, # mat1 zeros - utils.NO_STORAGE, # weight (prepacked) - utils.NO_STORAGE, # group size (non tensor) - utils.CONTIGUOUS_BUFFER, # mat2 scales - utils.CONTIGUOUS_BUFFER, # mat2 zeros - ], - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.aten._log_softmax.default, - exir_ops.edge.aten._softmax.default, - ] -) -def register_softmax_op(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.mean.dim, - exir_ops.edge.aten.sum.dim_IntList, - exir_ops.edge.aten.amax.default, - exir_ops.edge.aten.amin.default, - ] -) -def register_reduce_op(): - def check_reduce_node(node: torch.fx.Node) -> bool: - dim_list = node.args[1] - if isinstance(dim_list, list) and len(dim_list) > 2: - return False - - if isinstance(dim_list, list) and len(dim_list) == 2: - # Try to get the memory layout for this node - try: - memory_layout = utils.get_node_memory_layout(node) - - # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension - if ( - memory_layout is not None - and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT - ): - # For now only default layout is supported for 2D reduction. - # Because we can't determine if the input is NCHW or NHWC here, - # assume the reduction dimension is packed so we cannot support it. 
- return False - except (AssertionError, KeyError, AttributeError): - # If we can't get memory layout information, we'll assume the dims aren't packed - pass - - def try_find_keepdim_arg(node: torch.fx.Node) -> bool: - for arg in node.args: - if isinstance(arg, bool): - return arg - - # Assume false by default - return False - - keepdim = try_find_keepdim_arg(node) - if isinstance(keepdim, bool) and not keepdim: - return False - - return True - - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_resize=True, - are_node_inputs_supported_fn=check_reduce_node, - ) - - -@update_features( - [ - exir_ops.edge.aten.avg_pool2d.default, - exir_ops.edge.aten.max_pool2d.default, - exir_ops.edge.aten.max_pool2d_with_indices.default, - ] -) -def register_2d_pool_op(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.convolution.default, - exir_ops.edge.et_vk.conv_with_clamp.default, - ] -) -def register_convolution_op(): - return OpFeatures( - inputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, # input - utils.NO_STORAGE, # weight (prepacked) - utils.NO_STORAGE, # bias (prepacked) - utils.NO_STORAGE, # stride (non tensor) - utils.NO_STORAGE, # padding (non tensor) - utils.NO_STORAGE, # dilation (non tensor) - utils.NO_STORAGE, # transposed (non tensor) - utils.NO_STORAGE, # output_padding (non tensor) - utils.NO_STORAGE, # groups (non tensor) - utils.NO_STORAGE, # output_min (non tensor) - utils.NO_STORAGE, # output_max (non tensor) - ], - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, - ] -) -def register_quantized_conv_op(): - return OpFeatures( - inputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, # input - utils.NO_STORAGE, # input_scale (non tensor) - utils.NO_STORAGE, # input_zero_point (non tensor) - utils.NO_STORAGE, # weight (prepacked) - utils.NO_STORAGE, # weight_sums (prepacked) - utils.NO_STORAGE, # weight_scales (prepacked) - utils.NO_STORAGE, # bias (prepacked) - utils.NO_STORAGE, # kernel_size (non tensor) - utils.NO_STORAGE, # stride (non tensor) - utils.NO_STORAGE, # padding (non tensor) - utils.NO_STORAGE, # dilation (non tensor) - utils.NO_STORAGE, # groups (non tensor) - utils.NO_STORAGE, # original OC count (non tensor) - ], - supports_resize=False, - supports_prepacking=True, - ) - - -@update_features("llama::sdpa_with_kv_cache") -def register_sdpa_with_kv_cache_op(): - return OpFeatures( - inputs_storage=utils.WIDTH_PACKED_TEXTURE, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - "llama::update_cache", - "llama::custom_sdpa", - ] -) -def register_sdpa_ops(): - return OpFeatures( - inputs_storage=utils.WIDTH_PACKED_TEXTURE, - supports_resize=True, - ) - - -@update_features(exir_ops.edge.et_vk.apply_rotary_emb.default) -def register_rotary_emb_op(): - return OpFeatures( - inputs_storage=utils.WIDTH_PACKED_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.permute.default, - ] -) -def register_view_ops(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.view_copy.default, - exir_ops.edge.aten.squeeze_copy.dims, - exir_ops.edge.aten.unsqueeze_copy.default, - exir_ops.edge.aten.clone.default, - exir_ops.edge.aten.permute_copy.default, - ] -) -def register_view_ops_with_buffer_meta(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - 
supports_resize=True, - ) - - -@update_features(exir_ops.edge.aten.expand_copy.default) -def register_expand(): - return OpFeatures(inputs_storage=utils.ANY_BUFFER, supports_resize=False) - - -# Fully featured transfer operators (i.e. operators that copy data from the input -# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations -# for both texture and buffer storage types. -@update_features(exir_ops.edge.aten.cat.default) -def register_cat_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -# Fully featured transfer operators (i.e. operators that copy data from the input -# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations -# for both texture and buffer storage types. -@update_features( - [ - exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten.slice_copy.Tensor, - ] -) -def register_transfer_ops(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -# Ops ported from PyTorch Vulkan backend. These ops commonly support channels -# packed tensors only and do not have a resize function. -@update_features( - [ - # Shape Manipulation - exir_ops.edge.aten.t_copy.default, - # Indexing and lookup - exir_ops.edge.aten.flip.default, - exir_ops.edge.aten.index_select.default, - # Tensor creation - exir_ops.edge.aten.arange.start_step, - exir_ops.edge.aten.constant_pad_nd.default, - exir_ops.edge.aten.full.default, - exir_ops.edge.aten.full_like.default, - exir_ops.edge.aten.ones.default, - exir_ops.edge.aten.ones_like.default, - exir_ops.edge.aten.scalar_tensor.default, - exir_ops.edge.aten.upsample_nearest2d.vec, - exir_ops.edge.aten.upsample_bilinear2d.vec, - exir_ops.edge.aten.zeros.default, - exir_ops.edge.aten.zeros_like.default, - exir_ops.edge.et_vk.grid_priors.default, - ] -) -def register_ported_op(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - ) - - -# Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions -@update_features( - [ - # Tensor combination - exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, - ] -) -def register_ported_op_all_packed_dims(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - ) - - -# Ported ops that support their own prepacking. -@update_features( - [ - exir_ops.edge.aten.embedding.default, - exir_ops.edge.aten._native_batch_norm_legit_no_training.default, - ] -) -def register_ported_ops_with_prepacking(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.native_group_norm.default, - ] -) -def register_native_group_norm(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - outputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, - utils.CONTIGUOUS_BUFFER, - utils.CONTIGUOUS_BUFFER, - ], - supports_prepacking=True, - ) - - -# Ported ops that support their own prepacking. 
-@update_features( - [ - exir_ops.edge.aten.native_layer_norm.default, - ] -) -def register_ported_ops_with_prepacking_all_dims(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_prepacking=True, - ) - - -####################### -## Utility functions ## -####################### - - -def has_impl(target: Any) -> bool: - if not isinstance(target, str): - if target not in vulkan_supported_ops: - return target.name() in vulkan_supported_ops - return target in vulkan_supported_ops - else: - return target in vulkan_supported_ops - - -def get_op_features(target: Any) -> OpFeatures: - if not isinstance(target, str): - if target not in vulkan_supported_ops: - # Try the op's name - return vulkan_supported_ops[target.name()] - - return vulkan_supported_ops[target] - else: - return vulkan_supported_ops[target] - - -def handles_own_prepacking(target: OpKey) -> bool: - return get_op_features(target).supports_prepacking diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py new file mode 120000 index 00000000000..f34d32d3a0b --- /dev/null +++ b/backends/vulkan/op_registry.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/op_registry.py \ No newline at end of file diff --git a/backends/vulkan/partitioner b/backends/vulkan/partitioner new file mode 120000 index 00000000000..a4f40e523fa --- /dev/null +++ b/backends/vulkan/partitioner @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/partitioner \ No newline at end of file diff --git a/backends/vulkan/partitioner/TARGETS b/backends/vulkan/partitioner/TARGETS deleted file mode 100644 index 986d872f730..00000000000 --- a/backends/vulkan/partitioner/TARGETS +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.python_library( - name = "vulkan_partitioner", - srcs = [ - "vulkan_partitioner.py", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/backends/vulkan:op_registry", - "//executorch/backends/vulkan:utils_lib", - "//executorch/backends/vulkan:vulkan_preprocess", - "//executorch/backends/vulkan/patterns:vulkan_patterns", - "//executorch/exir:delegate", - "//executorch/exir:lib", - "//executorch/exir/backend:partitioner", - "//executorch/exir/backend:utils", - "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", - ], - typing = True, -) diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py deleted file mode 100644 index e5b2d0f7864..00000000000 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -import logging -from typing import Any, Callable, Dict, final, List, Mapping, Optional, Set, Tuple - -import executorch.backends.vulkan.patterns as vk_patterns -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.op_registry import ( - get_op_features, - has_impl, - OpFeatures, - OpKey, - vulkan_supported_ops, -) - -from executorch.backends.vulkan.patterns import PatternMatch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) -from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend - -from executorch.exir.backend.compile_spec_schema import CompileSpec -from executorch.exir.backend.partitioner import ( - DelegationSpec, - Partitioner, - PartitionResult, -) -from executorch.exir.backend.utils import tag_constant_data -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export.exported_program import ExportedProgram - -from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner -from torch.fx.passes.operator_support import OperatorSupportBase - -# pyre-ignore -ops_not_to_decompose = [ - torch.ops.aten.upsample_nearest2d.vec, -] - -logger: logging.Logger = logging.getLogger("") -logger.setLevel(logging.INFO) - - -class VulkanSupportedOperators(OperatorSupportBase): - def __init__( - self, - texture_limits: utils.ImageExtents, - buffer_limit: int, - require_dynamic_shape: bool = False, - operator_blocklist: Optional[Set[OpKey]] = None, - operator_allowlist: Optional[Set[OpKey]] = None, - fusable_subgraphs: Optional[List[PatternMatch]] = None, - nn_module_blocklist: Optional[Set[str]] = None, - nn_module_allowlist: Optional[Set[str]] = None, - ) -> None: - super().__init__() - self.texture_limits: utils.ImageExtents = texture_limits - self.buffer_limit = buffer_limit - self.require_dynamic_shapes = require_dynamic_shape - self.operator_blocklist: Set[OpKey] = ( - operator_blocklist if operator_blocklist is not None else set() - ) - self.operator_allowlist = operator_allowlist - self.fusable_subgraphs: List[PatternMatch] = ( - fusable_subgraphs if fusable_subgraphs is not None else [] - ) - # Create a set of all nodes that are part of fusable subgraphs for quick lookup - self.fusable_nodes: Set[torch.fx.Node] = set() - for match in self.fusable_subgraphs: - self.fusable_nodes.update(match.all_nodes) - - self.nn_module_blocklist = nn_module_blocklist - self.nn_module_allowlist = nn_module_allowlist - - def op_node_is_compatible( # noqa: C901: Function is too complex - self, node: torch.fx.Node, features: Optional[OpFeatures] = None - ) -> Tuple[bool, str]: - """ - Check if a given node is compatible with the Vulkan delegate's implementation - of the operator called by the node. Each tensor argument participating in the - operator call must be able to be represented with a (storage type, memory layout) - combination that is supported by the operator implementation. 
- """ - target = node.target - # Account for custom operators - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - assert isinstance(first_arg, torch._ops.OpOverload) - target = first_arg.name() - - # Operator allow list is only used for torch ops - if ( - utils.is_torch_op_node(node) - and (self.operator_allowlist is not None) - and (target not in self.operator_allowlist) - ): - return False, "op is not in allowlist" - - if target in self.operator_blocklist: - return False, "op is in blocklist" - - # Extract the features for the node's operator, if no override was provided - if features is None: - if not has_impl(target): - return False, "no operator implementation" - features = get_op_features(target) - - # Get the possible tensor representations for each tensor participating in the - # this operator. Then check that all tensors are representable as either a - # buffer or texture. - op_repsets: utils.OpRepSets = features.make_op_repsets( - node, self.texture_limits - ) - - if op_repsets.any_is_empty(): - return ( - False, - f"no valid representations for op {utils.node_io_str(node)}", - ) - - return True, "Op is compatible" - - def node_is_compatible( - self, node: torch.fx.Node, features: Optional[OpFeatures] = None - ) -> Tuple[bool, str]: - if utils.is_tensor_node(node): - return self.op_node_is_compatible(node, features=features) - # For non-tensor nodes, just check if the op is registered - elif hasattr(node, "target"): - return node.target in vulkan_supported_ops, "Op is compatible" - - return False, f"Unsupported node type: {node.format_node()}" - - def is_linear_permute(self, node: torch.fx.Node) -> Tuple[bool, bool]: - """ - Detect if a node is a permute/transpose that precedes a call to a `mm` or - `addmm` operator. This node can be fused with the `mm` or `addmm` to produce a - `linear` operator. - - This function returns two bool values: - 1. The first indicates if this node can be fused into a linear node - 2. The second indicates if the overall linear op can be executed with Vulkan - - The node will be partitioned only if both are true. - """ - if node.target not in [ - exir_ops.edge.aten.t_copy.default, - exir_ops.edge.aten.permute_copy.default, - ]: - return False, False - - if len(node.users) != 1: - return False, False - - first_user = list(node.users.keys())[0] - if first_user.target in [ - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, - ]: - # Only mark this node if the target linear op is valid - if self.node_is_compatible(first_user)[0]: - return True, True - else: - return True, False - - return False, False - - def is_in_local_scalar_dense_chain(self, node: torch.fx.Node) -> Tuple[bool, bool]: - """ - Scalar tensors are usually converted to scalar values in the graph via` - scalar_tensor[0].item()` in Python, which translates to a chain of - `local_scalar_dense(torch.select.int(scalar_tensor, 0, 0))` in the graph. - This function marks the entire chain as supported by the Vulkan delegate. - - Later, within vulkan_preprocess there will be a graph transform which replaces - the chain with passing in the scalar tensor directly. - - Similar to the `is_linear_permute` function, this function has 2 return values. 
- """ - if node.target == exir_ops.edge.aten.select_copy.int: - if len(node.users) != 1: - return False, False - # pyre-ignore - if node.args[0].meta["val"].numel() != 1: - return False, False - - local_scalar_dense = list(node.users.keys())[0] - if local_scalar_dense.target != torch.ops.aten._local_scalar_dense.default: - return False, False - - return self.is_in_local_scalar_dense_chain(local_scalar_dense) - - if node.target == torch.ops.aten._local_scalar_dense.default: - return True, all(self.node_is_compatible(user)[0] for user in node.users) - - return False, False - - def log_skip(self, node: torch.fx.Node, reason: str) -> None: - if node.op == "call_function": - logger.info( - f"[Vulkan Partitioner] Due to [{reason}], skipping {utils.node_io_str(node)}" - ) - - def is_node_supported( - self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node - ) -> bool: - r = self._is_node_supported(node) - return r - - def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901 - if node.op == "call_function": - # Apply nn module allowlist and blocklist - if self.nn_module_allowlist is not None: - if not utils.node_comes_from_any_nn_module_in_set( - node, self.nn_module_allowlist - ): - self.log_skip(node, "source nn.Module is not in allowlist") - return False - - if self.nn_module_blocklist is not None: - if utils.node_comes_from_any_nn_module_in_set( - node, self.nn_module_blocklist - ): - self.log_skip(node, "source nn.Module is in blocklist") - return False - - # Check if this node is part of a fusable subgraph - if node in self.fusable_nodes: - return True - - target = node.target - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - assert isinstance(first_arg, torch._ops.OpOverload) - target = first_arg.name() - - is_linear_permute, target_linear_is_compatible = self.is_linear_permute(node) - if is_linear_permute and target_linear_is_compatible: - return True - elif is_linear_permute: - # Skip so that the permute can be fused into a linear by another backend - self.log_skip(node, "permute node of non compatible linear node") - return False - - is_in_local_scalar_dense_chain, dst_node_is_compatible = ( - self.is_in_local_scalar_dense_chain(node) - ) - if is_in_local_scalar_dense_chain and dst_node_is_compatible: - return True - elif is_in_local_scalar_dense_chain: - self.log_skip(node, "local scalar dense of incompatible op node") - return False - - features = None - if target not in vulkan_supported_ops: - # For some ops, i.e. custom ops the name is registered instead of the - # OpOverload object. 
- if hasattr(target, "name") and target.name() in vulkan_supported_ops: - features = vulkan_supported_ops[target.name()] - else: - self.log_skip(node, "no operator implementation") - return False - else: - features = vulkan_supported_ops[target] - - assert features is not None - - if not features.are_node_inputs_supported_fn(node): - self.log_skip(node, "op args not supported") - return False - - if self.require_dynamic_shapes and not features.supports_resize: - self.log_skip(node, "no dynamic shape support") - return False - - is_compatible, reason = self.node_is_compatible(node, features=features) - if not is_compatible: - self.log_skip(node, reason) - - return is_compatible - - -def parse_compile_options(compile_options: Dict[str, Any]) -> List[CompileSpec]: - compile_specs = [] - - for key, value in compile_options.items(): - if isinstance(value, (VkStorageType, VkMemoryLayout)): - value_bytes = int(value).to_bytes(4, byteorder="little") - compile_specs.append(CompileSpec(key, value_bytes)) - - if isinstance(value, bool): - value_bytes = value.to_bytes(1, byteorder="little") - compile_specs.append(CompileSpec(key, value_bytes)) - - if key == "texture_limits": - compile_specs.append( - CompileSpec( - "texture_limits_x", int(value[0]).to_bytes(4, byteorder="little") - ) - ) - compile_specs.append( - CompileSpec( - "texture_limits_y", int(value[1]).to_bytes(4, byteorder="little") - ) - ) - compile_specs.append( - CompileSpec( - "texture_limits_z", int(value[2]).to_bytes(4, byteorder="little") - ) - ) - - # Unhandled options are ignored - - return compile_specs - - -@final -class VulkanPartitioner(Partitioner): - def __init__( - self, - compile_options: Optional[Dict[str, Any]] = None, - operator_blocklist: Optional[List[OpKey]] = None, - operator_allowlist: Optional[List[OpKey]] = None, - nn_module_blocklist: Optional[List[str]] = None, - nn_module_allowlist: Optional[List[str]] = None, - ) -> None: - self.options: Dict[str, Any] = {} - if compile_options is not None: - self.options = compile_options - - compile_spec = parse_compile_options(self.options) - self.delegation_spec = DelegationSpec(VulkanBackend.__name__, compile_spec) - - self.operator_blocklist: Set[OpKey] = set() - if operator_blocklist is not None: - for entry in operator_blocklist or []: - self.operator_blocklist.add(entry) - - self.operator_allowlist: Optional[Set[OpKey]] = None - if operator_allowlist is not None: - self.operator_allowlist = set() - for entry in operator_allowlist: - assert self.operator_allowlist is not None - self.operator_allowlist.add(entry) - - self.nn_module_blocklist: Optional[Set[str]] = None - if nn_module_blocklist is not None: - self.nn_module_blocklist = set() - for entry in nn_module_blocklist or []: - assert self.nn_module_blocklist is not None - self.nn_module_blocklist.add(entry) - - self.nn_module_allowlist: Optional[Set[str]] = None - if nn_module_allowlist is not None: - self.nn_module_allowlist = set() - for entry in nn_module_allowlist: - assert self.nn_module_allowlist is not None - self.nn_module_allowlist.add(entry) - - def ops_to_not_decompose( - self, ep: ExportedProgram - ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: - def filter_fn(node: torch.fx.Node) -> bool: - return True - - return (ops_not_to_decompose, filter_fn) - - def partition(self, exported_program: ExportedProgram) -> PartitionResult: - # Run the CapabilityBasedPartitioner to return the largest possible - # subgraphs containing the nodes with the tags - partition_tags = {} - - 
# Get all fusable subgraphs from fuse_patterns - fusable_subgraphs = vk_patterns.get_all_fusable_subgraphs( - exported_program.graph_module - ) - - texture_limits: utils.ImageExtents = self.options.get( - "texture_limits", utils.DEFAULT_TEXTURE_LIMITS - ) - buffer_limit: int = self.options.get("buffer_limit", utils.DEFAULT_BUFFER_LIMIT) - capability_partitioner = CapabilityBasedPartitioner( - exported_program.graph_module, - VulkanSupportedOperators( - texture_limits, - buffer_limit, - require_dynamic_shape=self.options.get("require_dynamic_shapes", False), - operator_blocklist=self.operator_blocklist, - operator_allowlist=self.operator_allowlist, - fusable_subgraphs=fusable_subgraphs, - nn_module_blocklist=self.nn_module_blocklist, - nn_module_allowlist=self.nn_module_allowlist, - ), - allows_single_node_partition=True, - ) - partition_list = capability_partitioner.propose_partitions() - for partition in partition_list: - for node in partition.nodes: - tag = f"tag{partition.id}" - node.meta["delegation_tag"] = tag - partition_tags[tag] = self.delegation_spec - - pl = len(partition_list) - if pl == 0: - logger.warning("No Vulkan subgraphs can be partitioned!") - else: - logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") - - tag_constant_data(exported_program) - - return PartitionResult( - tagged_exported_program=exported_program, partition_tags=partition_tags - ) diff --git a/backends/vulkan/patterns b/backends/vulkan/patterns new file mode 120000 index 00000000000..8abcaf07403 --- /dev/null +++ b/backends/vulkan/patterns @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/patterns \ No newline at end of file diff --git a/backends/vulkan/patterns/TARGETS b/backends/vulkan/patterns/TARGETS deleted file mode 100644 index 791edf58984..00000000000 --- a/backends/vulkan/patterns/TARGETS +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.python_library( - name = "vulkan_patterns", - srcs = [ - "__init__.py", - "pattern_registry.py", - "rope.py", - "quantized_linear.py", - "quantized_convolution.py", - ], - visibility = [ - "//executorch/backends/...", - "//executorch/examples/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:lib", - "//executorch/backends/transforms:utils", - "//executorch/backends/vulkan:utils_lib", - ], - typing = True, -) diff --git a/backends/vulkan/patterns/__init__.py b/backends/vulkan/patterns/__init__.py deleted file mode 100644 index 8ffad98b3c3..00000000000 --- a/backends/vulkan/patterns/__init__.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from typing import List - -import executorch.backends.vulkan.patterns.quantized_convolution # noqa - -import executorch.backends.vulkan.patterns.quantized_linear # noqa - -import executorch.backends.vulkan.patterns.rope # noqa - -import torch - -from executorch.backends.vulkan.patterns.pattern_registry import ( - create_pattern_match_from_internal_match, - CreateReplacementFn, - DetectorFn, - fusable_patterns, - GetGraphFn, - PatternMatch, - register_pattern_detector, - register_pattern_graph, - register_pattern_replacement, -) - -from executorch.backends.vulkan.patterns.rope import RotaryEmbeddingPattern - -from executorch.exir import ExportedProgram - -from torch.fx.passes.utils.matcher_utils import SubgraphMatcher - - -__all__ = [ - "PatternMatch", - "GetGraphFn", - "DetectorFn", - "CreateReplacementFn", - "RotaryEmbeddingPattern", - "fusable_patterns", - "register_pattern_graph", - "register_pattern_detector", - "register_pattern_replacement", -] - - -def all_fusable_graph_patterns() -> List[torch.fx.GraphModule]: - all_patterns = [] - for entry in fusable_patterns.values(): - if entry.get_graphs_fn is not None: - all_patterns.extend(entry.get_graphs_fn()) - - return all_patterns - - -def get_all_fusable_subgraphs( - graph_module: torch.fx.GraphModule, -) -> List[PatternMatch]: - fusable_subgraphs = [] - - fuse_patterns = all_fusable_graph_patterns() - for pattern in fuse_patterns: - sm = SubgraphMatcher(pattern.graph, ignore_literals=True) - matches = list(sm.match(graph_module.graph)) - for match in matches: - fusable_subgraphs.append(create_pattern_match_from_internal_match(match)) - - for node in graph_module.graph.nodes: - for entry in fusable_patterns.values(): - if entry.detector_fn is not None: - maybe_match = entry.detector_fn(node) - if maybe_match is not None: - fusable_subgraphs.append(maybe_match) - - return fusable_subgraphs - - -def create_replacement_for_pattern( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - patterns: List[torch.fx.GraphModule], - create_replacement_func: CreateReplacementFn, -) -> int: - total_replaced = 0 - - for pattern in patterns: - sm = SubgraphMatcher(pattern.graph, ignore_literals=True) - matches = list(sm.match(graph_module.graph)) - - for partition_to_replace in matches: - pattern = create_pattern_match_from_internal_match(partition_to_replace) - create_replacement_func(ep, graph_module, pattern) - total_replaced += 1 - # Remove dead code so they won't be matched again - graph_module.graph.eliminate_dead_code() - - return total_replaced - - -def replace_all_fusable_subgraphs( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, -) -> int: - total_replaced = 0 - - # Handle patterns identified with SubgraphMatcher - for entry in fusable_patterns.values(): - if entry.get_graphs_fn is not None and entry.create_replacement_fn is not None: - total_replaced += create_replacement_for_pattern( - ep, - graph_module, - entry.get_graphs_fn(), - # pyre-ignore[6] - entry.create_replacement_fn, - ) - - # Handle patterns identified with custom detector function - for node in graph_module.graph.nodes: - for entry in fusable_patterns.values(): - if ( - entry.detector_fn is not None - and entry.create_replacement_fn is not None - ): - maybe_match = entry.detector_fn(node) - if maybe_match is not None: - assert entry.create_replacement_fn is not None - entry.create_replacement_fn(ep, graph_module, maybe_match) - total_replaced += 1 - - graph_module.graph.eliminate_dead_code() - return total_replaced diff --git 
a/backends/vulkan/patterns/pattern_registry.py b/backends/vulkan/patterns/pattern_registry.py deleted file mode 100644 index 9a906cd8770..00000000000 --- a/backends/vulkan/patterns/pattern_registry.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Callable, Dict, List, Optional - -import torch - -from executorch.exir import ExportedProgram - -from torch.fx.passes.utils.matcher_utils import InternalMatch - -GetGraphFn = Callable[[], List[torch.fx.GraphModule]] - - -class PatternMatch: - __slots__ = ("input_nodes", "output_nodes", "all_nodes", "anchor_node") - """ - The design of this class is based on InternalMatch from - torch.fx.passes.utils.matcher_utils. It represents nodes in a graph that - match a particular pattern. - - The reason to not use InternalMatch directly is to enable more (i.e. custom) - methods to detect and represent matches other than through SubgraphMatcher. - """ - - def __init__( - self, - input_nodes: List[torch.fx.Node], - output_nodes: List[torch.fx.Node], - all_nodes: List[torch.fx.Node], - anchor_node: Optional[torch.fx.Node] = None, - ): - self.input_nodes = input_nodes - self.output_nodes = output_nodes - self.all_nodes = all_nodes - self.anchor_node = anchor_node - - -def create_pattern_match_from_internal_match( - internal_match: InternalMatch, -) -> PatternMatch: - return PatternMatch( - internal_match.placeholder_nodes, - internal_match.returning_nodes, - list(internal_match.nodes_map.values()), - ) - - -CreateReplacementFn = Callable[ - [ExportedProgram, torch.fx.GraphModule, PatternMatch], None -] - - -DetectorFn = Callable[[torch.fx.Node], Optional[PatternMatch]] - - -class PatternEntry: - def __init__( - self, - get_graphs_fn: Optional[GetGraphFn] = None, - detector_fn: Optional[DetectorFn] = None, - create_replacement_fn: Optional[CreateReplacementFn] = None, - ): - self.get_graphs_fn = get_graphs_fn - self.detector_fn = detector_fn - self.create_replacement_fn = create_replacement_fn - - def is_valid(self): - return ( - self.get_graphs_fn is not None or self.detector_fn is not None - ) and self.create_replacement_fn is not None - - -fusable_patterns: Dict[str, PatternEntry] = {} - - -def register_pattern_graph(pattern_name: str): - def decorator(fn: GetGraphFn): - if pattern_name not in fusable_patterns: - fusable_patterns[pattern_name] = PatternEntry() - - # Cannot define both get_graphs_fn and detector_fn - assert fusable_patterns[pattern_name].detector_fn is None - fusable_patterns[pattern_name].get_graphs_fn = fn - - return fn - - return decorator - - -def register_pattern_detector(pattern_name: str): - def decorator(fn: DetectorFn): - if pattern_name not in fusable_patterns: - fusable_patterns[pattern_name] = PatternEntry() - - # Cannot define both get_graphs_fn and detector_fn - assert fusable_patterns[pattern_name].get_graphs_fn is None - fusable_patterns[pattern_name].detector_fn = fn - - return fn - - return decorator - - -def register_pattern_replacement(pattern_name: str): - def decorator(fn: CreateReplacementFn): - if pattern_name not in fusable_patterns: - fusable_patterns[pattern_name] = PatternEntry() - - fusable_patterns[pattern_name].create_replacement_fn = fn - return fn - - return decorator diff --git a/backends/vulkan/patterns/quantized_convolution.py b/backends/vulkan/patterns/quantized_convolution.py deleted file 
mode 100644 index 65b51b5e103..00000000000 --- a/backends/vulkan/patterns/quantized_convolution.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.transforms.utils import ( - create_constant_placeholder, - get_param_tensor, -) - -from executorch.backends.vulkan.patterns.pattern_registry import ( - PatternMatch, - register_pattern_detector, - register_pattern_replacement, -) - -from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export.graph_signature import InputKind - - -class QuantizedConvolutionMatch(PatternMatch): - def __init__(self, conv_node: torch.fx.Node) -> None: - self.anchor_node = conv_node - self.match_found = False - self.all_nodes = [self.anchor_node] - - # Extract convolution parameters - self.stride = conv_node.args[3] if len(conv_node.args) > 3 else [1, 1] - self.padding = conv_node.args[4] if len(conv_node.args) > 4 else [0, 0] - self.dilation = conv_node.args[5] if len(conv_node.args) > 5 else [1, 1] - self.groups = conv_node.args[8] if len(conv_node.args) > 8 else 1 - - const_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[1] - ) - - # weight is not a constant tensor - no match - if const_node is None: - return - - dequantize_weight_node = None - # Search for a dequantize node in the arg chain of weight - for node in arg_chain: - if isinstance(node, torch.fx.Node) and utils.is_dequant_node(node): - dequantize_weight_node = node - # weight is not quantized - no match - if dequantize_weight_node is None: - return - - self.weight_node = const_node - self.dequantize_weight_node = dequantize_weight_node - self.all_nodes.extend(arg_chain) - - # Identify weight quantization parameter nodes - self.weight_scales_node, arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[1] - ) - assert self.weight_scales_node is not None - self.all_nodes.extend(arg_chain) - - self.weight_zeros_node, arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[2] - ) - assert self.weight_zeros_node is not None - self.all_nodes.extend(arg_chain) - - # Identify output node - self.output_node = self.anchor_node - - out_channels = self.output_node.meta["val"].shape[-1] - # The implementation requires that for grouped convolutions, a group does not - # cross any texel boundary. The output channels per group must be a multiple of - # 4. If this is not true, then don't match the pattern. 
- if self.groups > 1 and (out_channels / self.groups) % 4 != 0: - return - - # Identify bias node, if applicable - self.bias_node = None - if len(self.anchor_node.args) > 2 and self.anchor_node.args[2] is not None: - self.bias_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[2] - ) - if self.bias_node is not None: - self.all_nodes.extend(arg_chain) - - # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) - assert self.fp_input_node is not None - self.all_nodes.append(self.fp_input_node) - assert self.quantize_input_node is not None - assert dq_node is not None - - self.input_scales_node = self.quantize_input_node.args[1] - self.input_zeros_node = self.quantize_input_node.args[2] - - self.all_nodes.extend( - [ - self.quantize_input_node, - dq_node, - ] - ) - - self.match_found = True - - -convolution_anchor_nodes = { - exir_ops.edge.aten.conv2d.default, - exir_ops.edge.aten.convolution.default, -} - - -@register_pattern_detector("quantized_convolution") -def find_quantized_convolution_patterns( - node: torch.fx.Node, -) -> Optional[QuantizedConvolutionMatch]: - if node.target not in convolution_anchor_nodes: - return None - - matched_pattern = QuantizedConvolutionMatch(node) - if matched_pattern.match_found: - return matched_pattern - - return None - - -## -## Pattern Replacement -## - - -@register_pattern_replacement("quantized_convolution") -def make_conv2d_q8ta_q8csw_custom_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedConvolutionMatch, -): - weight_tensor = get_param_tensor(ep, match.weight_node) - assert weight_tensor is not None - - assert match.weight_scales_node is not None - weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node) - assert weight_scales_tensor is not None - - assert match.weight_zeros_node is not None - weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node) - assert weight_zeros_tensor is not None - - bias_tensor = None - if match.bias_node is not None: - bias_tensor = get_param_tensor(ep, match.bias_node) - assert bias_tensor is not None - - OC, IC, H, W = weight_tensor.shape - - # Reshape weight tensor from (OC, IC, H, W) to (OC, H * W * IC) (i.e. matrix format) - # This prepares the weights for Im2Col-based convolution - weight_tensor = ( - weight_tensor.permute(0, 2, 3, 1).contiguous().view(OC, H * W * IC).contiguous() - ) - - # Need to make sure that OC dim is a multiple of 4 so that data load/stores are well - # aligned with texel boundaries. Add padding to align to the next multiple of 4 if - # needed. - utils.align_width_and_update_state_dict( - ep, match.weight_node, weight_tensor, force_update=True - ) - utils.align_width_and_update_state_dict( - ep, match.weight_scales_node, weight_scales_tensor - ) - if bias_tensor is not None: - utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor) - - first_graph_node = list(graph_module.graph.nodes)[0] - with graph_module.graph.inserting_before(first_graph_node): - qweight_tensor_name = utils.get_tensor_name(ep, match.weight_node) - # Pre-compute the weight sums which are needed to apply activation zero point - # when using integer accumulation.
For the reshaped 2D weight matrix (OC, H * W * IC), - sum over dimension 1 to get sums per output channel - sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous() - sums_name = qweight_tensor_name + "_sums" - # Sanitize the name - sums_name = sums_name.replace(".", "_") - - weight_sums_node = create_constant_placeholder( - exp_program=ep, - graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, - name=sums_name, - data=sum_per_output_channel, - ) - - with graph_module.graph.inserting_before(match.output_node): - qconv_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, - args=( - match.fp_input_node, - match.input_scales_node, - match.input_zeros_node, - match.weight_node, - weight_sums_node, - match.weight_scales_node, - match.bias_node, # Add bias after weight_scales - [H, W], # Pass kernel size information before stride - match.stride, - match.padding, - match.dilation, - match.groups, - ), - ) - - qconv_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(qconv_node) diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py deleted file mode 100644 index ee1c7ee2d2a..00000000000 --- a/backends/vulkan/patterns/quantized_linear.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import executorch.backends.vulkan.utils as utils - -import torch -import torch.nn.functional as F - -from executorch.backends.transforms.utils import ( - create_constant_placeholder, - get_param_tensor, -) - -from executorch.backends.vulkan.patterns.pattern_registry import ( - PatternMatch, - register_pattern_detector, - register_pattern_replacement, -) - -from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export.graph_signature import InputKind - - -class QuantizedLinearMatch(PatternMatch): - def __init__(self, mm_node: torch.fx.Node) -> None: - self.anchor_node = mm_node - self.match_found = False - self.all_nodes = [self.anchor_node] - - const_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[1] - ) - - # mat2 is not a constant tensor - no match - if const_node is None: - return - - dequantize_weight_node = None - # Search for a dequantize node in the arg chain of weight - for node in arg_chain: - if isinstance(node, torch.fx.Node) and utils.is_dequant_node(node): - dequantize_weight_node = node - # weight is not quantized - no match - if dequantize_weight_node is None: - return - - self.weight_node = const_node - self.dequantize_weight_node = dequantize_weight_node - self.all_nodes.extend(arg_chain) - - # By default, assume dequant node is from quantized_decomposed namespace - scales_arg_idx = 1 - zeros_arg_idx = 2 - # torchao dequantize has a different function schema than quantized_decomposed - if ( - self.dequantize_weight_node.target - == exir_ops.edge.torchao.dequantize_affine.default - ): - scales_arg_idx = 2 - zeros_arg_idx = 3 - - # Identify weight quantization parameter nodes - self.weight_scales_node, arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[scales_arg_idx] - ) - assert self.weight_scales_node is not None - self.all_nodes.extend(arg_chain) - - self.weight_zeros_node,
arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[zeros_arg_idx] - ) - assert self.weight_zeros_node is not None - self.all_nodes.extend(arg_chain) - - # Identify output node - self.output_node = self.anchor_node - - # The implementation has a limitation that output channels must be a - # multiple of 4. This is to ensure that data loads are aligned well with - # texel boundaries. If this is not true, then don't match the pattern. - out_channels = self.output_node.meta["val"].shape[-1] - if out_channels % 4 != 0: - return - - # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) - assert self.fp_input_node is not None - self.all_nodes.append(self.fp_input_node) - - # The implementation has a limitation that input channels must be a - # multiple of 4. This is to ensure that data loads are aligned well with - # texel boundaries. If this is not true, then don't match the pattern. - in_channels = self.fp_input_node.meta["val"].shape[-1] - if in_channels % 4 != 0: - return - - # Identify bias node, if applicable - self.bias_node = None - if self.anchor_node.target == exir_ops.edge.aten.addmm.default: - self.bias_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[2] - ) - assert self.bias_node is not None - self.all_nodes.extend(arg_chain) - - # If input is not quantized, then we are done - if self.quantize_input_node is None: - self.match_found = True - return - - self.input_scales_node = self.quantize_input_node.args[1] - self.input_zeros_node = self.quantize_input_node.args[2] - - assert dq_node is not None - self.all_nodes.extend( - [ - self.quantize_input_node, - dq_node, - ] - ) - - self.match_found = True - - def is_weight_only_quantized(self) -> bool: - return self.quantize_input_node is None - - def is_weight_pergroup_quantized(self) -> bool: - weight_shape = self.weight_node.meta["val"].shape - scales_shape = self.weight_scales_node.meta["val"].shape - if len(scales_shape) != 2: - return False - - # Check that: - # height dim of scales is same as height dim of weight (N / output channels dim) - # width dim of weight (K / in channels dim) is divisible by width dim of scales - # (number of quantization groups) - return scales_shape[-2] == weight_shape[-2] and ( - weight_shape[-1] % scales_shape[-1] == 0 - ) - - def is_weight_perchannel_quantized(self) -> bool: - weight_shape = self.weight_node.meta["val"].shape - scales_shape = self.weight_scales_node.meta["val"].shape - if len(scales_shape) != 1: - return False - - # scales should have same size as weight's output channels dim - return scales_shape[0] == weight_shape[-2] - - def is_input_static_per_tensor_quantized(self) -> bool: - if self.quantize_input_node is None: - return False - - # For static quantization per tensor quantization, the scales and zeros - # are scalars. 
- return isinstance(self.input_scales_node, float) - - -linear_anchor_nodes = { - exir_ops.edge.aten.linear.default, - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, -} - - -@register_pattern_detector("quantized_linear") -def find_quantized_linear_patterns( - node: torch.fx.Node, -) -> Optional[QuantizedLinearMatch]: - if node.target not in linear_anchor_nodes: - return None - - matched_pattern = QuantizedLinearMatch(node) - if matched_pattern.match_found: - return matched_pattern - - return None - - -## -## Constant tensor manipulation -## - - -def pack_4bit_weight_tensor(weight_tensor: torch.Tensor) -> torch.Tensor: - """ - Given a 8-bit weight tensor containing values quantized to 4 bits, create a packed - weight tensor by transposing the weight tensor, then packing 2 4-bit values in one - 8-bit value. - - An input weight tensor of shape (N, K) will produce a packed weight tensor of shape - (K, N / 2). - """ - - # Assert we got a properly quantized tensor. - min_val, max_val = weight_tensor.min().item(), weight_tensor.max().item() - assert ( - max_val <= 7 and min_val >= -8 - ), f"pack_4bit_weight_tensor: [min_val,max_val] out of [-8, 7] range, got [{min_val}, {max_val}]" - - # Assuming we have a 2d tensor - if weight_tensor.ndim != 2: - weight_tensor = weight_tensor.squeeze() - assert ( - weight_tensor.ndim == 2 - ), f"pack_4bit_weight_tensor: expecting input tensor to be 2d, got {weight_tensor.ndim}" - - # Need to pad innermost dim to be a multiple of 8, since the minimum load granularity - # is int32 (4 bytes), which contains 8 4-bit values. - if weight_tensor.shape[-1] % 8 != 0: - num_pad = 8 - (weight_tensor.shape[-1] % 8) - weight_tensor = F.pad(input=weight_tensor, pad=(0, num_pad)) - - # Shape after padding - _, in_channels = weight_tensor.shape - assert in_channels % 8 == 0, "convert_to_qc4w: expecting ic to be divisible by 8" - - # Adjust weight_tensor tensor for zp - weight_tensor = weight_tensor.to(dtype=torch.uint8) + 8 - # Pack each 4-bit value into a single 8-bit value - return weight_tensor[::, 1::2] << 4 | weight_tensor[::, ::2] - - -## -## Pattern Replacement -## - - -def make_linear_q4gsw_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedLinearMatch, - weight_tensor: torch.Tensor, - weight_scales_tensor: torch.Tensor, -): - num_groups = weight_scales_tensor.shape[-1] - in_channels = weight_tensor.shape[-1] - group_size = in_channels // num_groups - - weight_tensor = pack_4bit_weight_tensor(weight_tensor) - # Use this function for convenience to update the state dict with the packed - # weight tensor. Alignment will already have been done in the above function. - weight_tensor = utils.align_width_and_update_state_dict( - ep, match.weight_node, weight_tensor, align_to=1, force_update=True - ) - - # Also transpose the weight scales tensor to shape [num_groups, N] - weight_scales_tensor = weight_scales_tensor.transpose(0, 1).contiguous() - # Align to multiple of 8 to ensure that data loads from the weight scales - # tensor do not go out of bounds. Each thread computes 8 output channels. 
- utils.align_width_and_update_state_dict( - ep, - match.weight_scales_node, - weight_scales_tensor, - align_to=8, - force_update=True, - ) - - with graph_module.graph.inserting_before(match.output_node): - linear_q4gsw_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.linear_q4gsw.default, - args=( - match.fp_input_node, - match.weight_node, - match.weight_scales_node, - group_size, - ), - ) - - linear_q4gsw_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(linear_q4gsw_node) - - -def make_linear_q8ta_q8csw_custom_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedLinearMatch, - weight_tensor: torch.Tensor, -): - first_graph_node = list(graph_module.graph.nodes)[0] - with graph_module.graph.inserting_before(first_graph_node): - weight_tensor_name = utils.get_tensor_name(ep, match.weight_node) - # Pre-compute the weight sums which are needed to apply activation zero point - # when using integer accumulation. - sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous() - sums_name = weight_tensor_name + "_sums" - # Sanitize the name - sums_name = sums_name.replace(".", "_") - - weight_sums_node = create_constant_placeholder( - exp_program=ep, - graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, - name=sums_name, - data=sum_per_output_channel, - ) - - with graph_module.graph.inserting_before(match.output_node): - qlinear_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.linear_q8ta_q8csw.default, - args=( - match.fp_input_node, - match.input_scales_node, - match.input_zeros_node, - match.weight_node, - weight_sums_node, - match.weight_scales_node, - ), - ) - - qlinear_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(qlinear_node) - - -@register_pattern_replacement("quantized_linear") -def replace_quantized_linear_patterns( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedLinearMatch, -): - # Extract relevant tensors - weight_tensor = get_param_tensor(ep, match.weight_node) - assert weight_tensor is not None - - assert match.weight_scales_node is not None - weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node) - assert weight_scales_tensor is not None - - assert match.weight_zeros_node is not None - weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node) - assert weight_zeros_tensor is not None - - # Biases not supported at the moment - if match.bias_node is not None: - return - - # Route to appropriate custom op - if ( - match.is_weight_only_quantized() - and match.is_weight_pergroup_quantized() - and utils.is_in_4bit_range(weight_tensor) - ): - make_linear_q4gsw_op( - ep, graph_module, match, weight_tensor, weight_scales_tensor - ) - elif ( - match.is_input_static_per_tensor_quantized() - and match.is_weight_perchannel_quantized() - ): - make_linear_q8ta_q8csw_custom_op(ep, graph_module, match, weight_tensor) - - # No-op for unsupported quant patterns diff --git a/backends/vulkan/patterns/rope.py b/backends/vulkan/patterns/rope.py deleted file mode 100644 index b174224ab78..00000000000 --- a/backends/vulkan/patterns/rope.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import operator - -from functools import lru_cache -from typing import List, Optional - -import torch - -from executorch.backends.vulkan.patterns.pattern_registry import ( - PatternMatch, - register_pattern_graph, - register_pattern_replacement, -) - -from executorch.exir import EdgeCompileConfig, ExportedProgram, to_edge -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export import export - - -class RotaryEmbeddingPattern(torch.nn.Module): - """ - Implementation of rotary embedding pattern that matches the one - in examples/model/llama/rope.py - """ - - def __init__(self): - super().__init__() - - def forward( - self, - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor, - ): - # This implementation matches the apply_rotary_emb function in rope.py - # Split into real and imaginary parts - xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) - xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) - - # Reshape frequencies for broadcasting - freqs_cos = self._reshape_for_broadcast(freqs_cos, xq_r) - freqs_sin = self._reshape_for_broadcast(freqs_sin, xq_r) - - # Apply rotary embedding - xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin - xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos - xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin - xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos - - # Recombine real and imaginary parts - xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) - xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) - - return xq_out.type_as(xq), xk_out.type_as(xk) - - def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor): - ndim = x.ndim - freqs_cis_ndim = freqs_cis.ndim - if freqs_cis_ndim == 3: - # freqs_cis: (seq_len, n_heads, head_dim // 2) - assert freqs_cis.shape == (x.shape[-3], x.shape[-2], x.shape[-1]) - shape = [ - d if (i == ndim - 3 or i == ndim - 2 or i == ndim - 1) else 1 - for i, d in enumerate(x.shape) - ] - else: - # freqs_cis: (seq_len, head_dim // 2) - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(shape) - - -@lru_cache(maxsize=2) -@register_pattern_graph("export_llama_rope") -def get_rope_graphs() -> List[torch.fx.GraphModule]: - batch_size = 1 - seq_len = 1 - n_heads = 4 - n_kv_heads = 2 - head_dim = 32 - - graphs = [] - dtype = torch.float32 - - xq = torch.randn(batch_size, seq_len, n_heads, head_dim, dtype=dtype) - xk = torch.randn(batch_size, seq_len, n_kv_heads, head_dim, dtype=dtype) - freqs_cos = torch.randn(seq_len, head_dim // 2, dtype=dtype) - freqs_sin = torch.randn(seq_len, head_dim // 2, dtype=dtype) - - edge = to_edge( - export( - RotaryEmbeddingPattern(), - (xq, xk, freqs_cos, freqs_sin), - strict=True, - ), - compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) - gm = edge.exported_program().graph_module - graphs.append(gm) - - return graphs - - -def identify_rotary_emb_io_nodes( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: PatternMatch, -) -> Optional[List[torch.fx.Node]]: - # Get the input inputs (xq, xk, freqs_cos, freqs_sin) - input_nodes = match.input_nodes - if len(input_nodes) != 4: - return None - - xq, xk, freqs_cos, freqs_sin = input_nodes - - output_nodes = match.output_nodes - if len(output_nodes) != 2: - return None - - xq_out, xk_out = output_nodes - - return [xq, xk, freqs_cos, freqs_sin, xq_out, xk_out] - - 
-@register_pattern_replacement("export_llama_rope") -def create_rotary_emb_custom_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: PatternMatch, -): - io_nodes = identify_rotary_emb_io_nodes(ep, graph_module, match) - if io_nodes is None: - return - - assert len(io_nodes) == 6 - xq, xk, freqs_cos, freqs_sin, xq_out, xk_out = io_nodes - - # Create the custom op node - with graph_module.graph.inserting_before(xq_out): - rotary_emb_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.apply_rotary_emb.default, - args=(xq, xk, freqs_cos, freqs_sin), - ) - - # The custom op returns a tuple (xq_out, xk_out) - # We need to extract the individual outputs - with graph_module.graph.inserting_after(rotary_emb_node): - getitem_0 = graph_module.graph.create_node( - "call_function", - operator.getitem, - args=(rotary_emb_node, 0), - ) - getitem_1 = graph_module.graph.create_node( - "call_function", - operator.getitem, - args=(rotary_emb_node, 1), - ) - - if hasattr(xq_out, "meta") and "val" in xq_out.meta: - getitem_0.meta["val"] = xq_out.meta["val"] - if hasattr(xk_out, "meta") and "val" in xk_out.meta: - getitem_1.meta["val"] = xk_out.meta["val"] - - xq_out.replace_all_uses_with(getitem_0) - xk_out.replace_all_uses_with(getitem_1) diff --git a/backends/vulkan/quantizer b/backends/vulkan/quantizer new file mode 120000 index 00000000000..46087273095 --- /dev/null +++ b/backends/vulkan/quantizer @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/quantizer \ No newline at end of file diff --git a/backends/vulkan/quantizer/TARGETS b/backends/vulkan/quantizer/TARGETS deleted file mode 100644 index 2c3ae37923a..00000000000 --- a/backends/vulkan/quantizer/TARGETS +++ /dev/null @@ -1,20 +0,0 @@ -load("@fbcode_macros//build_defs:python_library.bzl", "python_library") - -oncall("executorch") - -python_library( - name = "vulkan_quantizer", - srcs = ["vulkan_quantizer.py"], - deps = [ - ":vulkan_quantizer_utils", - "//caffe2:torch", - ], -) - -python_library( - name = "vulkan_quantizer_utils", - srcs = ["vulkan_quantizer_utils.py"], - deps = [ - "//caffe2:torch", - ], -) diff --git a/backends/vulkan/quantizer/vulkan_quantizer.py b/backends/vulkan/quantizer/vulkan_quantizer.py deleted file mode 100644 index 40212c35c27..00000000000 --- a/backends/vulkan/quantizer/vulkan_quantizer.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -from __future__ import annotations - -import functools -from typing import Callable, Optional - -import torch -from executorch.backends.vulkan.quantizer.vulkan_quantizer_utils import ( - _convert_scalars_to_attrs, - bits_to_range, - OP_TO_ANNOTATOR, - propagate_annotation, -) -from torch.fx import Node -from torchao.quantization.pt2e import PerChannelMinMaxObserver, PlaceholderObserver -from torchao.quantization.pt2e.quantizer import ( - QuantizationConfig, - QuantizationSpec, - Quantizer, -) - - -__all__ = [ - "VulkanQuantizer", - "get_symmetric_quantization_config", -] - - -@functools.lru_cache -def get_symmetric_quantization_config( - is_dynamic: bool = False, - weight_bits: int = 8, - act_bits: int = 8, - act_qmin: Optional[int] = None, - act_qmax: Optional[int] = None, - weight_qmin: Optional[int] = None, - weight_qmax: Optional[int] = None, -) -> QuantizationConfig: - """ - Return a QuantizationConfig for Vulkan quantizer. - - Args: - is_dynamic: If False, weight-only quantization. If True, dynamic quantization (activation + weight) - weight_bits: Number of bits for weight quantization (4 or 8) - act_bits: Number of bits for activation quantization (8) - act_qmin: Minimum quantization value for activations (auto-calculated if None) - act_qmax: Maximum quantization value for activations (auto-calculated if None) - weight_qmin: Minimum quantization value for weights (auto-calculated if None) - weight_qmax: Maximum quantization value for weights (auto-calculated if None) - """ - assert weight_bits in { - 8, - 4, - }, f"Unsupported weight quantization bits: {weight_bits}" - - assert act_bits in { - 8, - }, f"Unsupported activation quantization bits: {act_bits}" - - # Auto-calculate weight ranges if not provided - if weight_qmin is None or weight_qmax is None: - weight_range = bits_to_range(weight_bits) - weight_qmin = weight_qmin if weight_qmin is not None else weight_range[0] - weight_qmax = weight_qmax if weight_qmax is not None else weight_range[1] - - # Weight quantization: per-channel symmetric for Vulkan - weight_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=weight_qmin, - quant_max=weight_qmax, - qscheme=torch.per_channel_symmetric, - ch_axis=0, - is_dynamic=False, - observer_or_fake_quant_ctr=PerChannelMinMaxObserver, - ) - - # Configure activation quantization based on is_dynamic - if not is_dynamic: - # Weight-only quantization: no activation quantization - act_quantization_spec = None - output_activation_spec = None - else: - # Dynamic quantization: per-token input quantization, no output quantization - # Auto-calculate activation ranges if not provided - if act_qmin is None or act_qmax is None: - act_range = bits_to_range(act_bits) - act_qmin = act_qmin if act_qmin is not None else act_range[0] - act_qmax = act_qmax if act_qmax is not None else act_range[1] - - act_observer_or_fake_quant_ctr = PlaceholderObserver - act_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=act_qmin, - quant_max=act_qmax, - qscheme=torch.per_tensor_affine, - is_dynamic=True, - observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr, - ) - output_activation_spec = None - - return QuantizationConfig( - input_activation=act_quantization_spec, - output_activation=output_activation_spec, - weight=weight_quantization_spec, - bias=None, - is_qat=False, - ) - - -_SUPPORTED_OPS = [ - "linear", -] - - -class VulkanQuantizer(Quantizer): - - def __init__(self) -> None: - super().__init__() - self.global_config: Optional[QuantizationConfig] = 
None - - def set_global(self, quantization_config: QuantizationConfig) -> VulkanQuantizer: - self.global_config = quantization_config - return self - - def transform_for_annotation( - self, model: torch.fx.GraphModule - ) -> torch.fx.GraphModule: - """Transforms scalar values to tensor attributes""" - return _convert_scalars_to_attrs(model) - - def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: - model = self._annotate_for_quantization_config(model) - propagate_annotation(model) - return model - - def _annotate_all_patterns( - self, - model: torch.fx.GraphModule, - quantization_config: Optional[QuantizationConfig], - filter_fn: Optional[Callable[[Node], bool]] = None, - ) -> torch.fx.GraphModule: - if quantization_config is None: - return model - - for op in _SUPPORTED_OPS: - OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) - return model - - def _annotate_for_quantization_config( - self, model: torch.fx.GraphModule - ) -> torch.fx.GraphModule: - self._annotate_all_patterns( - model, - self.global_config, - ) - return model - - def validate(self, model: torch.fx.GraphModule) -> None: - pass diff --git a/backends/vulkan/quantizer/vulkan_quantizer_utils.py b/backends/vulkan/quantizer/vulkan_quantizer_utils.py deleted file mode 100644 index c0b6ab39e84..00000000000 --- a/backends/vulkan/quantizer/vulkan_quantizer_utils.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Callable, Optional, Tuple - -import torch -from torch.fx import Node -from torchao.quantization.pt2e.quantizer import ( - annotate_input_qspec_map, - annotate_output_qspec, - get_bias_qspec, - get_input_act_qspec, - get_output_act_qspec, - get_weight_qspec, - QuantizationAnnotation, - QuantizationConfig, - SharedQuantizationSpec, -) -from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix - -__all__ = [ - "OP_TO_ANNOTATOR", - "propagate_annotation", - "_convert_scalars_to_attrs", - "bits_to_range", -] - - -def bits_to_range(bits: int) -> Tuple[int, int]: - """ - Calculate quantization range for given number of bits. 
- - Args: - bits: Number of quantization bits - - Returns: - Tuple of (qmin, qmax) for the given bit width - """ - return ( - -(2 ** (bits - 1)), - (2 ** (bits - 1) - 1), - ) - - -AnnotatorType = Callable[ - [ - torch.fx.GraphModule, - Optional[QuantizationConfig], - Optional[Callable[[Node], bool]], - ], - Optional[list[list[Node]]], -] -OP_TO_ANNOTATOR: dict[str, AnnotatorType] = {} - - -def register_annotator(op: str) -> Callable[[AnnotatorType], None]: - def decorator(annotator: AnnotatorType) -> None: - OP_TO_ANNOTATOR[op] = annotator - - return decorator - - -def _is_annotated(nodes: list[Node]) -> bool: - """ - Given a list of nodes (that represents an operator pattern), - check if any of the node is annotated, return True if any of the node - is annotated, otherwise return False - """ - annotated = False - for node in nodes: - annotated = annotated or ( - "quantization_annotation" in node.meta - and node.meta["quantization_annotation"]._annotated - ) - return annotated - - -def _mark_nodes_as_annotated(nodes: list[Node]) -> None: - for node in nodes: - if node is not None: - if "quantization_annotation" not in node.meta: - node.meta["quantization_annotation"] = QuantizationAnnotation() - node.meta["quantization_annotation"]._annotated = True - - -@register_annotator("linear") -def _annotate_linear( - gm: torch.fx.GraphModule, - quantization_config: Optional[QuantizationConfig], - filter_fn: Optional[Callable[[Node], bool]] = None, -) -> Optional[list[list[Node]]]: - annotated_partitions = [] - input_act_qspec = get_input_act_qspec(quantization_config) - output_act_qspec = get_output_act_qspec(quantization_config) - weight_qspec = get_weight_qspec(quantization_config) - bias_qspec = get_bias_qspec(quantization_config) - for node in gm.graph.nodes: - if node.op != "call_function" or node.target != torch.ops.aten.linear.default: - continue - if filter_fn and not filter_fn(node): - continue - act_node = node.args[0] - weight_node = node.args[1] - bias_node = None - if len(node.args) > 2: - bias_node = node.args[2] - - if _is_annotated([node]) is False: # type: ignore[list-item] - annotate_input_qspec_map( - node, - act_node, - input_act_qspec, - ) - annotate_input_qspec_map( - node, - weight_node, - weight_qspec, - ) - nodes_to_mark_annotated = [node, weight_node] - if bias_node: - annotate_input_qspec_map( - node, - bias_node, - bias_qspec, - ) - nodes_to_mark_annotated.append(bias_node) - annotate_output_qspec(node, output_act_qspec) - _mark_nodes_as_annotated(nodes_to_mark_annotated) - annotated_partitions.append(nodes_to_mark_annotated) - - return annotated_partitions - - -def _is_share_obs_or_fq_op(op: Callable[..., torch.Tensor]) -> bool: - return op in [ - torch.ops.aten.relu.default, - torch.ops.aten.hardtanh.default, - torch.ops.aten.hardtanh_.default, - torch.ops.aten.max_pool2d.default, - torch.ops.aten.mean.default, - torch.ops.aten.mean.dim, - torch.ops.aten.permute.default, - torch.ops.aten.permute_copy.default, - torch.ops.aten.squeeze.dim, - torch.ops.aten.squeeze_copy.dim, - torch.ops.aten.adaptive_avg_pool2d.default, - torch.ops.aten.view_copy.default, - torch.ops.aten.view.default, - torch.ops.aten.slice_copy.Tensor, - torch.ops.aten.flatten.using_ints, - ] - - -def propagate_annotation(model: torch.fx.GraphModule) -> None: - for n in model.graph.nodes: - if n.op != "call_function" or not _is_share_obs_or_fq_op(n.target): - continue - - prev_node = n.args[0] - if not isinstance(prev_node, Node): - continue - - quantization_annotation = 
prev_node.meta.get("quantization_annotation", None) - if not quantization_annotation: - continue - - output_qspec = quantization_annotation.output_qspec - if not output_qspec: - continue - - # make sure current node is not annotated - if ( - "quantization_annotation" in n.meta - and n.meta["quantization_annotation"]._annotated - ): - continue - - shared_qspec = SharedQuantizationSpec(prev_node) - # propagate the previous output_qspec to the current node - n.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map={ - prev_node: shared_qspec, - }, - output_qspec=shared_qspec, - _annotated=True, - ) - - -def _convert_scalars_to_attrs(model: torch.fx.GraphModule) -> torch.fx.GraphModule: - for n in model.graph.nodes: - if n.op != "call_function" or n.target not in [ - torch.ops.aten.add.Tensor, - torch.ops.aten.mul.Tensor, - ]: - continue - args = list(n.args) - new_args = [] - for i in range(len(args)): - if isinstance(args[i], torch.fx.Node): - new_args.append(args[i]) - continue - prefix = "_tensor_constant_" - get_new_attr_name = get_new_attr_name_with_prefix(prefix) - tensor_constant_name = get_new_attr_name(model) - float_tensor = torch.tensor(float(args[i])) - model.register_buffer(tensor_constant_name, float_tensor) - fake_mode = n.meta["val"].fake_mode - with model.graph.inserting_before(n): - get_attr_node = model.graph.create_node( - "get_attr", tensor_constant_name, (), {} - ) - get_attr_node.meta["val"] = fake_mode.from_tensor( - float_tensor, static_shapes=True - ) - new_args.append(get_attr_node) - n.args = tuple(new_args) - model.recompile() - return model diff --git a/backends/vulkan/runtime b/backends/vulkan/runtime new file mode 120000 index 00000000000..a33641e0cc0 --- /dev/null +++ b/backends/vulkan/runtime @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/runtime \ No newline at end of file diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp deleted file mode 100644 index 7b138072d50..00000000000 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ /dev/null @@ -1,680 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
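As a usage note for the quantizer package above: the intended flow is to build a config with `get_symmetric_quantization_config`, install it via `set_global`, and then run the standard PT2E prepare/convert steps. The sketch below is an illustration only, not code from this repository; the `prepare_pt2e`/`convert_pt2e` import path and the `torch.export.export(...).module()` step are assumptions that vary across PyTorch/torchao versions.

```python
import torch
from executorch.backends.vulkan.quantizer.vulkan_quantizer import (
    VulkanQuantizer,
    get_symmetric_quantization_config,
)
# Assumed import path; prepare_pt2e/convert_pt2e have moved between
# torch.ao.quantization.quantize_pt2e and torchao across releases.
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e


class TinyLinear(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = torch.nn.Linear(16, 16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)


model = TinyLinear().eval()
example_inputs = (torch.randn(1, 16),)

# Weight-only 4-bit symmetric quantization, per get_symmetric_quantization_config.
quantizer = VulkanQuantizer().set_global(
    get_symmetric_quantization_config(is_dynamic=False, weight_bits=4)
)

exported = torch.export.export(model, example_inputs).module()
prepared = prepare_pt2e(exported, quantizer)
prepared(*example_inputs)  # calibration pass
quantized = convert_pt2e(prepared)
```

Only `linear` appears in `_SUPPORTED_OPS`, so other ops in the graph are left in floating point by this quantizer.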
- */ - -#include -#include - -#include - -#include - -#include - -#include -#include -#include -#ifdef ET_EVENT_TRACER_ENABLED -#include -#endif // ET_EVENT_TRACER_ENABLED -#include -#include -#include -#include - -#include -#include /* strtol */ -#include -#include -#include -#include - -namespace executorch { -namespace backends { -namespace vulkan { -namespace { - -using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::FreeableBuffer; -using executorch::runtime::kTensorDimensionLimit; -using executorch::runtime::NamedDataMap; -using executorch::runtime::Result; -using executorch::runtime::Span; - -using namespace vkcompute; - -// Flatbuffer types -using VkGraphPtr = const vkgraph::VkGraph*; -using OpCallPtr = const vkgraph::OperatorCall*; -using VkValuePtr = const vkgraph::VkValue*; -using VkTensorPtr = const vkgraph::VkTensor*; -using VkBytesPtr = const vkgraph::VkBytes*; - -// Flatbuffer vector types -using VkValuesVector = - const flatbuffers::Vector>*; -using BytesVector = - const flatbuffers::Vector>*; -using UIntVector = const flatbuffers::Vector*; - -vkapi::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) { - switch (vk_datatype) { - case vkgraph::VkDataType::BOOL: - return vkapi::kBool; - case vkgraph::VkDataType::UINT8: - return vkapi::kByte; - case vkgraph::VkDataType::INT8: - return vkapi::kChar; - case vkgraph::VkDataType::INT32: - return vkapi::kInt; - case vkgraph::VkDataType::INT64: - return vkapi::kLong; - case vkgraph::VkDataType::FLOAT16: - return vkapi::kHalf; - case vkgraph::VkDataType::FLOAT32: - return vkapi::kFloat; - case vkgraph::VkDataType::FLOAT64: - return vkapi::kDouble; - } -} - -utils::StorageType get_storage_type( - const vkgraph::VkStorageType& vk_storage_type) { - switch (vk_storage_type) { - case vkgraph::VkStorageType::BUFFER: - return utils::kBuffer; - case vkgraph::VkStorageType::TEXTURE_3D: - return utils::kTexture3D; - case vkgraph::VkStorageType::TEXTURE_2D: - return utils::kTexture2D; - default: - break; - } - VK_THROW("Invalid storage type encountered!"); -} - -utils::GPUMemoryLayout get_memory_layout( - const vkgraph::VkMemoryLayout& vk_memory_layout) { - switch (vk_memory_layout) { - case vkgraph::VkMemoryLayout::TENSOR_WIDTH_PACKED: - return utils::kWidthPacked; - case vkgraph::VkMemoryLayout::TENSOR_HEIGHT_PACKED: - return utils::kHeightPacked; - case vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED: - return utils::kChannelsPacked; - default: - break; - } - VK_THROW("Invalid memory layout encountered!"); -} - -GraphConfig get_graph_config(ArrayRef& compile_specs) { - GraphConfig config = GraphConfig(); - - for (const CompileSpec& spec : compile_specs) { - const uint8_t* value_data = (const uint8_t*)spec.value.buffer; - const size_t value_size = spec.value.nbytes; - if (strcmp(spec.key, "storage_type_override") == 0) { - ET_CHECK_MSG(value_size == sizeof(int32_t), "Unexpected value size!"); - int value_as_int = static_cast(getUInt32LE(value_data)); - utils::StorageType storage_type = - static_cast(value_as_int); - - config.set_storage_type_override(storage_type); - } - if (strcmp(spec.key, "memory_layout_override") == 0) { - ET_CHECK_MSG(value_size == sizeof(uint32_t), "Unexpected value size!"); - uint32_t value_as_int = 
getUInt32LE(value_data); - utils::GPUMemoryLayout memory_layout = - static_cast(value_as_int); - - config.set_memory_layout_override(memory_layout); - } - if (strcmp(spec.key, "require_dynamic_shapes") == 0) { - ET_CHECK_MSG(value_size == sizeof(uint8_t), "Unexpected value size!"); - bool value = getBool(value_data); - - if (value) { - config.expect_dynamic_shapes = true; - } - } - } -#ifdef ET_EVENT_TRACER_ENABLED - config.enable_querypool = true; -#endif // ET_EVENT_TRACER_ENABLED - return config; -} - -class GraphBuilder { - ComputeGraph* compute_graph_; - VkGraphPtr flatbuffer_; - const uint8_t* constant_data_; - const NamedDataMap* named_data_map_; - std::vector loaded_buffers_from_map_; - - std::vector ref_mapping_; - - public: - explicit GraphBuilder( - ComputeGraph* compute_graph, - VkGraphPtr flatbuffer, - const uint8_t* constant_data, - const NamedDataMap* named_data_map) - : compute_graph_(compute_graph), - flatbuffer_(flatbuffer), - constant_data_(constant_data), - named_data_map_(named_data_map), - loaded_buffers_from_map_(), - ref_mapping_() {} - - void resize(uint32_t size) { - ref_mapping_.resize(size, INT32_MAX); - } - - bool fb_id_exists(const uint32_t fb_id) { - return fb_id < ref_mapping_.size() && ref_mapping_[fb_id] != INT32_MAX; - } - - ValueRef get_fb_id_valueref(const uint32_t fb_id) { - ET_CHECK_MSG( - fb_id_exists(fb_id), - "Trying to extract a value that hasn't yet been added to the graph."); - - return ref_mapping_[fb_id]; - } - - void add_tensor_to_graph(const uint32_t fb_id, VkTensorPtr tensor_fb) { - const vkapi::ScalarType& dtype = get_scalar_type(tensor_fb->datatype()); - utils::StorageType storage_type = - tensor_fb->storage_type() == vkgraph::VkStorageType::DEFAULT_STORAGE - ? compute_graph_->suggested_storage_type() - : get_storage_type(tensor_fb->storage_type()); - - UIntVector dims_fb = tensor_fb->dims(); - const std::vector dims_vector(dims_fb->cbegin(), dims_fb->cend()); - - utils::GPUMemoryLayout memory_layout = - tensor_fb->memory_layout() == vkgraph::VkMemoryLayout::DEFAULT_LAYOUT - ? compute_graph_->suggested_memory_layout(dims_vector) - : get_memory_layout(tensor_fb->memory_layout()); - - ValueRef ref; - if (tensor_fb->constant_id() >= 0) { - VkBytesPtr constant_bytes = - flatbuffer_->constants()->Get(tensor_fb->constant_id()); - - if (constant_bytes->named_key() != nullptr && - constant_bytes->offset() == UINT64_MAX && - named_data_map_ != nullptr) { - const std::string& data_name = constant_bytes->named_key()->str(); - Result buffer = - named_data_map_->get_data(data_name.c_str()); - - VK_CHECK_COND( - buffer.ok(), - "Failed to get constant data for key %s from named_data_map. 
Error code: %u", - data_name.c_str(), - static_cast(buffer.error())); - ref = compute_graph_->add_tensorref( - dims_vector, dtype, std::move(buffer.get())); - } else { - const uint8_t* tensor_data = constant_data_ + constant_bytes->offset(); - ref = compute_graph_->add_tensorref(dims_vector, dtype, tensor_data); - } - } else { - ref = compute_graph_->add_tensor( - dims_vector, - dtype, - storage_type, - memory_layout, - tensor_fb->mem_obj_id()); - } - - ref_mapping_[fb_id] = ref; - } - - void add_none_to_graph(const uint32_t fb_id) { - ValueRef ref = compute_graph_->add_none(); - ref_mapping_[fb_id] = ref; - } - - template - typename std::enable_if::value, void>::type - add_scalar_to_graph(const uint32_t fb_id, T value) { - ValueRef ref = compute_graph_->add_scalar(value); - ref_mapping_[fb_id] = ref; - } - - template - typename std::enable_if::value, void>::type - add_scalar_list_to_graph(const uint32_t fb_id, std::vector&& value) { - ValueRef ref = compute_graph_->add_scalar_list(std::move(value)); - ref_mapping_[fb_id] = ref; - } - - void add_value_list_to_graph( - const uint32_t fb_id, - std::vector&& value) { - ValueRef ref = compute_graph_->add_value_list(std::move(value)); - ref_mapping_[fb_id] = ref; - } - - void add_string_to_graph(const uint32_t fb_id, VkValuePtr value) { - const auto fb_str = value->value_as_String()->string_val(); - std::string string(fb_str->cbegin(), fb_str->cend()); - ValueRef ref = compute_graph_->add_string(std::move(string)); - ref_mapping_[fb_id] = ref; - } - - void add_symint_to_graph(const uint32_t fb_id, VkValuePtr value) { - const int32_t fb_symint = value->value_as_SymInt()->value(); - ValueRef ref = compute_graph_->add_symint(fb_symint); - ref_mapping_[fb_id] = ref; - } - - void add_value_to_graph(const uint32_t fb_id, VkValuePtr value) { - ET_CHECK_MSG( - !fb_id_exists(fb_id), - "Trying to add a value that has already been added to the graph."); - - switch (value->value_type()) { - case vkgraph::GraphTypes::Null: - add_none_to_graph(fb_id); - break; - case vkgraph::GraphTypes::Int: - add_scalar_to_graph(fb_id, value->value_as_Int()->int_val()); - break; - case vkgraph::GraphTypes::Double: - add_scalar_to_graph(fb_id, value->value_as_Double()->double_val()); - break; - case vkgraph::GraphTypes::Bool: - add_scalar_to_graph(fb_id, value->value_as_Bool()->bool_val()); - break; - case vkgraph::GraphTypes::VkTensor: - add_tensor_to_graph(fb_id, value->value_as_VkTensor()); - break; - case vkgraph::GraphTypes::IntList: - add_scalar_list_to_graph( - fb_id, - std::vector( - value->value_as_IntList()->items()->cbegin(), - value->value_as_IntList()->items()->cend())); - break; - case vkgraph::GraphTypes::DoubleList: - add_scalar_list_to_graph( - fb_id, - std::vector( - value->value_as_DoubleList()->items()->cbegin(), - value->value_as_DoubleList()->items()->cend())); - break; - case vkgraph::GraphTypes::BoolList: - add_scalar_list_to_graph( - fb_id, - std::vector( - value->value_as_BoolList()->items()->cbegin(), - value->value_as_BoolList()->items()->cend())); - break; - case vkgraph::GraphTypes::ValueList: - add_value_list_to_graph( - fb_id, - std::vector( - value->value_as_ValueList()->items()->cbegin(), - value->value_as_ValueList()->items()->cend())); - break; - case vkgraph::GraphTypes::String: - add_string_to_graph(fb_id, value); - break; - case vkgraph::GraphTypes::SymInt: - add_symint_to_graph(fb_id, value); - break; - default: - ET_CHECK_MSG(false, "Unsupported value type."); - } - } - - void build_graph() { - // Resize the mapping to the number of 
values in the flatbuffer - resize(flatbuffer_->values()->size()); - - // First, add all values to the graph - for (uint32_t fb_id = 0; fb_id < flatbuffer_->values()->size(); ++fb_id) { - VkValuePtr value = flatbuffer_->values()->Get(fb_id); - add_value_to_graph(fb_id, value); - } - - // Parse the inputs, which will be tensors most of the time but can also be - // symints and tensorrefs (which will be the case if the original graph had) - // mutable buffers. - for (const uint32_t fb_id : *flatbuffer_->input_ids()) { - const ValueRef ref = get_fb_id_valueref(fb_id); - if (compute_graph_->val_is_tensor(ref)) { - compute_graph_->set_input_tensor(ref); - } else { - compute_graph_->set_val_as_input(ref); - } - } - - // Parse the operators - for (OpCallPtr op_call : *(flatbuffer_->chain())) { - std::string op_name = op_call->name()->str(); - ET_CHECK_MSG(VK_HAS_OP(op_name), "Missing operator: %s", op_name.c_str()); - - std::vector args; - args.reserve(op_call->args()->size()); - for (const auto arg_fb_id : *op_call->args()) { - args.push_back(get_fb_id_valueref(static_cast(arg_fb_id))); - } - - auto vkFn = VK_GET_OP_FN(op_name); - vkFn(*compute_graph_, args); - } - - // Parse the outputs, which will be mostly tensors but may contain tensorref - // values as well if the source graph returns parameter nodes. - for (const uint32_t fb_id : *flatbuffer_->output_ids()) { - const ValueRef ref = get_fb_id_valueref(fb_id); - compute_graph_->set_output_value(ref); - } - - if (compute_graph_->graphconfig().enable_querypool) { - for (uint32_t i = 0; i < compute_graph_->prepack_nodes().size(); ++i) { - compute_graph_->prepack_nodes()[i]->set_node_id(i); - } - for (uint32_t i = 0; i < compute_graph_->execute_nodes().size(); ++i) { - compute_graph_->execute_nodes()[i]->set_node_id(i); - } - } - } -}; - -// -// Execution tools -// - -bool maybe_resize_input( - ComputeGraph* graph, - const size_t input_i, - executorch::aten::Tensor& et_tensor) { - ValueRef in_tensor_ref = graph->inputs()[input_i].value; - - const std::vector in_tensor_vk_sizes = - graph->sizes_of(in_tensor_ref); - - ET_CHECK_MSG( - et_tensor.dim() == in_tensor_vk_sizes.size(), - "Cannot resize input tensor: old ndim %zu does not match new ndim %zu", - static_cast(in_tensor_vk_sizes.size()), - static_cast(et_tensor.dim())); - - bool should_resize = false; - std::vector new_sizes(et_tensor.dim()); - for (size_t i = 0; i < et_tensor.dim(); i++) { - if (in_tensor_vk_sizes[i] != et_tensor.sizes()[i]) { - should_resize = true; - } - new_sizes.at(i) = et_tensor.sizes()[i]; - } - - if (should_resize) { - graph->resize_input(input_i, new_sizes); - } - - const size_t in_tensor_vk_numel = graph->numel_of(in_tensor_ref); - ET_CHECK_MSG( - in_tensor_vk_numel == et_tensor.numel(), - "Vulkan tensor numel %zu does not match ET tensor numel %zu", - static_cast(in_tensor_vk_numel), - static_cast(et_tensor.numel())); - - return should_resize; -} - -bool maybe_update_scalar_tensor( - ComputeGraph* graph, - const ValueRef ref, - executorch::aten::Tensor& scalar_tensor_src) { - const int32_t cur_val = graph->read_symint(ref); - int32_t scalar_tensor_val = 0; - executorch::aten::ScalarType dtype = scalar_tensor_src.scalar_type(); - if (dtype == executorch::aten::ScalarType::Int) { - scalar_tensor_val = *scalar_tensor_src.const_data_ptr(); - } else if (dtype == executorch::aten::ScalarType::Long) { - scalar_tensor_val = int32_t(*scalar_tensor_src.const_data_ptr()); - } - bool was_updated = false; - if (scalar_tensor_val != cur_val) { - graph->set_symint(ref, 
scalar_tensor_val); - was_updated = true; - } - return was_updated; -} - -void maybe_resize_output( - ComputeGraph* graph, - const size_t output_i, - executorch::aten::Tensor& et_tensor) { - ValueRef out_tensor_ref = graph->outputs()[output_i].value; - - const std::vector out_tensor_vk_sizes = - graph->sizes_of(out_tensor_ref); - - executorch::aten::SizesType new_output_size[kTensorDimensionLimit]; - size_t ndim = out_tensor_vk_sizes.size(); - for (int i = 0; i < ndim; ++i) { - new_output_size[i] = out_tensor_vk_sizes[i]; - } - - executorch::aten::ArrayRef output_size{ - new_output_size, ndim}; - Error err = resize_tensor(et_tensor, output_size); - - ET_CHECK_MSG(err == Error::Ok, "Failed to resize output tensor."); -} - -// -// VulkanBackend class -// - -class VulkanBackend final : public ::executorch::runtime::BackendInterface { - public: - ~VulkanBackend() override = default; - - bool is_available() const override { - // TODO(ssjia): replace with an actual Vulkan runtime availability check - return true; - } - - ET_NODISCARD Error compileModel( - const void* buffer_pointer, - ComputeGraph* compute_graph, - const NamedDataMap* named_data_map) const { - Result header = - VulkanDelegateHeader::parse(buffer_pointer); - - const uint8_t* flatbuffer_data = nullptr; - const uint8_t* constant_data = nullptr; - - if (header.ok()) { - const uint8_t* buffer_start = - reinterpret_cast(buffer_pointer); - flatbuffer_data = buffer_start + header->flatbuffer_offset; - constant_data = buffer_start + header->bytes_offset; - } else { - ET_LOG(Error, "VulkanDelegateHeader may be corrupt"); - return header.error(); - } - - ET_CHECK_OR_RETURN_ERROR( - vkgraph::VkGraphBufferHasIdentifier(flatbuffer_data), - DelegateInvalidCompatibility, - "Vulkan Delegate Serialization Format version identifier '%.4s' != expected '%.4s'", - flatbuffers::GetBufferIdentifier(flatbuffer_data), - vkgraph::VkGraphIdentifier()); - - VkGraphPtr flatbuffer_graph = vkgraph::GetVkGraph(flatbuffer_data); - - GraphBuilder builder( - compute_graph, flatbuffer_graph, constant_data, named_data_map); - - builder.build_graph(); - - compute_graph->prepare(); - compute_graph->prepare_pipelines(); - - compute_graph->prepack(); - - return Error::Ok; - } - - Result init( - BackendInitContext& context, - FreeableBuffer* processed, - ArrayRef compile_specs) const override { - ComputeGraph* compute_graph = - context.get_runtime_allocator()->allocateInstance(); - if (compute_graph == nullptr) { - return Error::MemoryAllocationFailed; - } - - GraphConfig graph_config = get_graph_config(compile_specs); - graph_config.external_adapter = vkapi::set_and_get_external_adapter(); - new (compute_graph) ComputeGraph(graph_config); - - const NamedDataMap* named_data_map = context.get_named_data_map(); - Error err = compileModel(processed->data(), compute_graph, named_data_map); - - // This backend does not need its processed data after compiling the - // model. 
- processed->Free(); - - if (err != Error::Ok) { - return err; - } - - return compute_graph; - } - - Error execute( - ET_UNUSED BackendExecutionContext& context, - DelegateHandle* handle, - Span args) const override { - EXECUTORCH_SCOPE_PROF("VulkanBackend::execute"); - - ComputeGraph* compute_graph = static_cast(handle); - - const size_t num_inputs = compute_graph->inputs().size(); - bool should_propagate_resize = false; - for (size_t i = 0; i < num_inputs; i++) { - const ValueRef iref = compute_graph->inputs()[i].value; - if (compute_graph->val_is_tensor(iref)) { - VK_CHECK_COND(args[i]->isTensor()); - bool was_resized = - maybe_resize_input(compute_graph, i, args[i]->toTensor()); - should_propagate_resize = should_propagate_resize || was_resized; - compute_graph->copy_into_staging( - compute_graph->inputs()[i].staging, - args[i]->toTensor().const_data_ptr(), - args[i]->toTensor().numel()); - } else if (compute_graph->val_is_symint(iref)) { - VK_CHECK_COND( - args[i]->isTensor(), - "Cannot handle symint arg to graph that is not derived from a " - "scalar tensor at the moment."); - bool was_updated = maybe_update_scalar_tensor( - compute_graph, iref, args[i]->toTensor()); - // Since symint inputs may impact tensor's sizes, trigger a resize if - // any symbolic integer shapes are updated. - should_propagate_resize = should_propagate_resize || was_updated; - } else { - VK_THROW( - "Could not handle input with type ", - compute_graph->get_val_type(iref)); - } - } - - if (should_propagate_resize) { - compute_graph->propagate_resize(); - } - - compute_graph->execute(); - - for (size_t i = 0; i < compute_graph->outputs().size(); i++) { - const size_t o = i + num_inputs; - const ValueRef oref = compute_graph->outputs()[i].value; - if (compute_graph->val_is_tensor(oref)) { - VK_CHECK_COND(args[o]->isTensor()); - maybe_resize_output(compute_graph, i, args[o]->toTensor()); - // args holds inputs directly followed by outputs, so the i'th output - // for compute_graph corresponds to the o'th arg - compute_graph->copy_from_staging( - compute_graph->outputs()[i].staging, - args[o]->toTensor().mutable_data_ptr(), - args[o]->toTensor().numel()); - } - // TensorRef values represent constant tensors which will not have been - // modified by the graph execution. Therefore, if a constant tensor is - // returned as an output, no action is required. - else if (compute_graph->val_is_tref(oref)) { - continue; - } else { - VK_THROW( - "Could not handle output with type ", - compute_graph->get_val_type(oref)); - } - } - -#ifdef ET_EVENT_TRACER_ENABLED - runtime::EventTracer* event_tracer = context.event_tracer(); - compute_graph->context()->querypool().extract_results(); - for (const auto& r : - compute_graph->context()->querypool().get_shader_timestamp_data()) { - std::string event_name = - r.kernel_name + "_" + std::to_string(r.dispatch_id); - event_tracer_log_profiling_delegate( - event_tracer, - event_name.c_str(), - /* delegate_debug_id = */ -1, - r.start_time_ns, - r.end_time_ns, - (void*)(&r.metadata), - sizeof(r.metadata)); - } -#endif // ET_EVENT_TRACER_ENABLED - - return Error::Ok; - } - - void destroy(DelegateHandle* handle) const override { - if (handle != nullptr) { - ComputeGraph* compute_graph = static_cast(handle); - compute_graph->context() - ->adapter_ptr() - ->compute_pipeline_cache() - .save_cache(); - // ComputeGraph is not trivially destructible. Since - // this was constructed manually in init(), we must destroy it manually - // here. 
- compute_graph->~ComputeGraph(); - } - } -}; - -auto cls = VulkanBackend(); -Backend backend{"VulkanBackend", &cls}; -static auto success_with_compiler = register_backend(backend); - -} // namespace -} // namespace vulkan -} // namespace backends -} // namespace executorch diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp deleted file mode 100644 index 2a235144342..00000000000 --- a/backends/vulkan/runtime/VulkanDelegateHeader.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -#pragma clang diagnostic ignored "-Wdeprecated" - -namespace executorch { -namespace backends { -namespace vulkan { - -using executorch::runtime::Error; -using executorch::runtime::Result; - -namespace { - -struct ByteSlice { - size_t offset; - size_t size; -}; - -constexpr size_t kExpectedSize = 30; -constexpr char kExpectedMagic[4] = {'V', 'H', '0', '0'}; - -constexpr ByteSlice kMagic = {4, 4}; -constexpr ByteSlice kHeaderSize = {8, 2}; -constexpr ByteSlice kFlatbufferOffset = {10, 4}; -constexpr ByteSlice kFlatbufferSize = {14, 4}; -constexpr ByteSlice kBytesOffset = {18, 4}; -constexpr ByteSlice kBytesSize = {22, 8}; - -} // namespace - -/// Interprets the 8 bytes at `data` as a little-endian uint64_t. -uint64_t getUInt64LE(const uint8_t* data) { - return (uint64_t)data[0] | ((uint64_t)data[1] << 8) | - ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) | - ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) | - ((uint64_t)data[6] << 48) | ((uint64_t)data[7] << 56); -} - -/// Interprets the 4 bytes at `data` as a little-endian uint32_t. -uint32_t getUInt32LE(const uint8_t* data) { - return (uint32_t)data[0] | ((uint32_t)data[1] << 8) | - ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 24); -} - -/// Interprets the 2 bytes at `data` as a little-endian uint32_t. 
-uint32_t getUInt16LE(const uint8_t* data) { - return (uint32_t)data[0] | ((uint32_t)data[1] << 8); -} - -bool getBool(const uint8_t* data) { - return data[0] != 0; -} - -bool VulkanDelegateHeader::is_valid() const { - if (header_size < kExpectedSize) { - return false; - } - if (flatbuffer_offset < header_size) { - return false; - } - if (flatbuffer_size == 0) { - return false; - } - if (bytes_offset < flatbuffer_offset + flatbuffer_size) { - return false; - } - if (bytes_size < 0) { - return false; - } - - return true; -} - -Result VulkanDelegateHeader::parse(const void* data) { - const uint8_t* header_data = (const uint8_t*)data; - - const uint8_t* magic_start = header_data + kMagic.offset; - if (std::memcmp(magic_start, kExpectedMagic, kMagic.size) != 0) { - return Error::NotFound; - } - - VulkanDelegateHeader header = VulkanDelegateHeader{ - getUInt16LE(header_data + kHeaderSize.offset), - getUInt32LE(header_data + kFlatbufferOffset.offset), - getUInt32LE(header_data + kFlatbufferSize.offset), - getUInt32LE(header_data + kBytesOffset.offset), - getUInt64LE(header_data + kBytesSize.offset), - }; - - if (!header.is_valid()) { - return Error::InvalidArgument; - } - - return header; -} - -} // namespace vulkan -} // namespace backends -} // namespace executorch diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h deleted file mode 100644 index 722f01cbb75..00000000000 --- a/backends/vulkan/runtime/VulkanDelegateHeader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace executorch { -namespace backends { -namespace vulkan { - -// Byte decoding utilities -uint64_t getUInt64LE(const uint8_t* data); -uint32_t getUInt32LE(const uint8_t* data); -uint32_t getUInt16LE(const uint8_t* data); - -// Bool is serialized as a single byte -bool getBool(const uint8_t* data); - -struct VulkanDelegateHeader { - bool is_valid() const; - - static executorch::runtime::Result parse( - const void* data); - - uint32_t header_size; - uint32_t flatbuffer_offset; - uint32_t flatbuffer_size; - uint32_t bytes_offset; - uint64_t bytes_size; -}; - -} // namespace vulkan -} // namespace backends -} // namespace executorch diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp deleted file mode 100644 index 8599cbfffb6..00000000000 --- a/backends/vulkan/runtime/api/Context.cpp +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
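To make the delegate header layout above easier to inspect: every field is little-endian and sits at a fixed offset (the `'VH00'` magic at byte 4, a 2-byte header size at 8, 4-byte flatbuffer offset and size at 10 and 14, a 4-byte constant-data offset at 18, and an 8-byte constant-data size at 22, giving the expected 30-byte header). Below is a small Python sketch that decodes the same layout from a delegate blob; it is offered only as an illustration of the offsets, not as an API that exists in this repository.

```python
import struct


def parse_vulkan_delegate_header(blob: bytes) -> dict:
    # Offsets mirror the ByteSlice constants in VulkanDelegateHeader.cpp.
    if blob[4:8] != b"VH00":
        raise ValueError("missing 'VH00' magic at offset 4")
    (header_size,) = struct.unpack_from("<H", blob, 8)
    flatbuffer_offset, flatbuffer_size = struct.unpack_from("<II", blob, 10)
    (bytes_offset,) = struct.unpack_from("<I", blob, 18)
    (bytes_size,) = struct.unpack_from("<Q", blob, 22)
    return {
        "header_size": header_size,  # is_valid() requires >= 30
        "flatbuffer_offset": flatbuffer_offset,
        "flatbuffer_size": flatbuffer_size,
        "bytes_offset": bytes_offset,
        "bytes_size": bytes_size,
    }
```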
- */ - -#include - -#ifdef VULKAN_DEBUG -#include -#include -#endif // VULKAN_DEBUG - -#ifndef VULKAN_DESCRIPTOR_POOL_SIZE -#define VULKAN_DESCRIPTOR_POOL_SIZE 1024u -#endif - -#ifndef VULKAN_QUERY_POOL_SIZE -#define VULKAN_QUERY_POOL_SIZE 4096u -#endif - -namespace vkcompute { -namespace api { - -Context::Context(vkapi::Adapter* adapter, const ContextConfig& config) - : config_(config), - // Important handles - adapter_p_(adapter), - device_(adapter_p_->device_handle()), - queue_(adapter_p_->request_queue()), - // Resource pools - command_pool_(device_, queue_.family_index, config_.cmd_pool_config), - descriptor_pool_(device_, config_.descriptor_pool_config), - fences_(device_), - // Profiling - querypool_(config_.query_pool_config, nullptr), - // Command buffer submission - cmd_mutex_{}, - cmd_(VK_NULL_HANDLE, 0u), - submit_count_{0u}, - // Memory Management - buffer_clearlist_mutex_{}, - buffers_to_clear_{}, - image_clearlist_mutex_{}, - images_to_clear_{}, - preferred_image_tiling_{VK_IMAGE_TILING_OPTIMAL} { - if (adapter_p_->linear_tiling_3d_enabled()) { - preferred_image_tiling_ = VK_IMAGE_TILING_LINEAR; - } -} - -Context::~Context() { - try { - flush(); - // Let the device know the context is done with the queue - adapter_p_->return_queue(queue_); - } catch (...) { - } -} - -void Context::initialize_querypool() { - querypool_.initialize(adapter_p_); -} - -void Context::cmd_reset_querypool() { - if (querypool_) { - set_cmd(); - querypool_.reset_querypool(cmd_); - } -} - -void Context::report_shader_dispatch_start( - const std::string& shader_name, - const utils::uvec3& global_wg_size, - const utils::WorkgroupSize& local_wg_size, - const uint32_t dispatch_id) { - if (querypool_) { - querypool_.shader_profile_begin( - cmd_, - dispatch_id, - shader_name, - vkapi::create_extent3d(global_wg_size), - vkapi::create_extent3d((utils::uvec3)local_wg_size)); - } -} - -void Context::report_shader_dispatch_end() { - if (querypool_) { - querypool_.shader_profile_end(cmd_); - } -} - -void Context::check_device_capabilities(const vkapi::ShaderInfo& shader) { - if (shader.requires_shader_int16) { - if (!adapter_p_->supports_int16_shader_types()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::SHADER_INT16); - } - } - if (shader.requires_16bit_storage) { - if (!adapter_p_->supports_16bit_storage_buffers()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::INT16_STORAGE); - } - } - if (shader.requires_8bit_storage) { - if (!adapter_p_->supports_8bit_storage_buffers()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::INT8_STORAGE); - } - } - if (shader.requires_integer_dot_product) { - if (!adapter_p_->supports_int8_dot_product()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::INTEGER_DOT_PRODUCT); - } - } -} - -vkapi::DescriptorSet Context::get_descriptor_set( - const vkapi::ShaderInfo& shader_descriptor, - const utils::WorkgroupSize& local_workgroup_size, - const vkapi::SpecVarList& additional_constants, - const uint32_t push_constants_size) { - VkDescriptorSetLayout shader_layout = - shader_layout_cache().retrieve(shader_descriptor.kernel_layout); - - VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(additional_constants); - - 
VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout_cache().retrieve(shader_layout, push_constants_size), - shader_cache().retrieve(shader_descriptor), - spec_constants}); - - cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size); - - return descriptor_pool().get_descriptor_set( - shader_layout, shader_descriptor.kernel_layout); -} - -void Context::register_shader_dispatch( - const vkapi::DescriptorSet& descriptors, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::ShaderInfo& shader_descriptor, - const utils::uvec3& global_workgroup_size, - const void* push_constants_data, - const uint32_t push_constants_size) { - // Adjust the global workgroup size based on the output tile size - uint32_t global_wg_w = utils::div_up( - global_workgroup_size[0u], shader_descriptor.out_tile_size[0u]); - uint32_t global_wg_h = utils::div_up( - global_workgroup_size[1u], shader_descriptor.out_tile_size[1u]); - uint32_t global_wg_d = utils::div_up( - global_workgroup_size[2u], shader_descriptor.out_tile_size[2u]); - - // Submitting a global work group size of 0 is undefined behaviour. If this is - // detected then submit a single workgroup instead. - if (global_wg_w == 0u || global_wg_h == 0u || global_wg_d == 0u) { - global_wg_w = 1u; - global_wg_h = 1u; - global_wg_d = 1u; - } - - const utils::uvec3 effective_global_wg = { - global_wg_w, - global_wg_h, - global_wg_d, - }; - - cmd_.bind_descriptors(descriptors.get_bind_handle()); - cmd_.insert_barrier(pipeline_barrier); - - if (push_constants_size > 0 && push_constants_data != nullptr) { - const VkDescriptorSetLayout shader_layout = - shader_layout_cache().retrieve(shader_descriptor.kernel_layout); - const VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - cmd_.set_push_constants( - pipeline_layout, push_constants_data, push_constants_size); - } - - cmd_.dispatch(effective_global_wg); -} - -void Context::register_blit( - vkapi::PipelineBarrier& pipeline_barrier, - vkapi::VulkanImage& src, - vkapi::VulkanImage& dst) { - cmd_.insert_barrier(pipeline_barrier); - cmd_.blit(src, dst); -} - -void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) { - if (cmd_) { - cmd_.end(); - adapter_p_->submit_cmd( - queue_, - cmd_.get_submit_handle(final_use), - fence_handle, - VK_NULL_HANDLE, - VK_NULL_HANDLE); - - submit_count_ = 0u; - } -} - -void Context::flush() { - VK_CHECK(vkQueueWaitIdle(queue().handle)); - - command_pool_.flush(); - descriptor_pool_.flush(); - - // If there is an existing command buffer, invalidate it - if (cmd_) { - cmd_.invalidate(); - } - - std::lock_guard bufferlist_lock(buffer_clearlist_mutex_); - std::lock_guard imagelist_lock(image_clearlist_mutex_); - buffers_to_clear_.clear(); - images_to_clear_.clear(); -} - -bool available() { - return context(); -} - -Context* context() { - static const std::unique_ptr context([]() -> Context* { - try { - const uint32_t cmd_submit_frequency = 16u; - - const vkapi::CommandPoolConfig cmd_config{ - 32u, // cmdPoolInitialSize - 8u, // cmdPoolBatchSize - }; - - const vkapi::DescriptorPoolConfig descriptor_pool_config{ - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const vkapi::QueryPoolConfig 
query_pool_config{ - VULKAN_QUERY_POOL_SIZE, // maxQueryCount - 256u, // initialReserveSize - }; - - const ContextConfig config{ - cmd_submit_frequency, - cmd_config, - descriptor_pool_config, - query_pool_config, - }; - - return new Context(vkapi::runtime()->get_adapter_p(), config); - } catch (...) { - } - - return nullptr; - }()); - - return context.get(); -} - -#ifdef VULKAN_DEBUG - -#ifdef VK_KHR_pipeline_executable_properties - -VkPipeline Context::get_shader_pipeline( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& additional_constants) { - const uint32_t push_constants_size = 128u; - - VkDescriptorSetLayout shader_layout = - shader_layout_cache().retrieve(shader.kernel_layout); - VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - - const utils::WorkgroupSize local_workgroup_size(4u, 4u, 1u); - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(additional_constants); - - VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout, shader_cache().retrieve(shader), spec_constants}); - - return pipeline; -} - -std::vector -Context::get_pipeline_executable_props(const VkPipeline pipeline) { - VkPipelineInfoKHR pipeline_info{ - VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR, - nullptr, - pipeline, - }; - - uint32_t shader_props_count = 0u; - vkGetPipelineExecutablePropertiesKHR( - device(), &pipeline_info, &shader_props_count, nullptr); - - std::vector pipeline_props( - shader_props_count); - for (int i = 0; i < shader_props_count; i++) { - pipeline_props.at(i).sType = - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR; - pipeline_props.at(i).pNext = nullptr; - } - vkGetPipelineExecutablePropertiesKHR( - device(), &pipeline_info, &shader_props_count, pipeline_props.data()); - - return pipeline_props; -} - -std::tuple< - std::vector, - std::vector>> -Context::get_shader_executable_irs( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx) { - VkPipelineExecutableInfoKHR exec_info{ - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR, - nullptr, - pipeline, - pipeline_exec_idx, - }; - - uint32_t ir_count; - VK_CHECK(vkGetPipelineExecutableInternalRepresentationsKHR( - device(), &exec_info, &ir_count, nullptr)); - - std::vector irs(ir_count); - for (int i = 0; i < ir_count; i++) { - irs.at(i).sType = - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR; - irs.at(i).pNext = nullptr; - irs.at(i).pData = nullptr; - } - VK_CHECK(vkGetPipelineExecutableInternalRepresentationsKHR( - device(), &exec_info, &ir_count, irs.data())); - - std::vector> irs_data(ir_count); - for (int i = 0; i < ir_count; i++) { - irs_data.at(i).resize(irs.at(i).dataSize); - irs.at(i).pData = irs_data.at(i).data(); - } - VK_CHECK(vkGetPipelineExecutableInternalRepresentationsKHR( - device(), &exec_info, &ir_count, irs.data())); - - return std::make_tuple(irs, irs_data); -} - -std::vector -Context::get_shader_executable_stats( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx) { - VkPipelineExecutableInfoKHR exec_info{ - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR, - nullptr, - pipeline, - pipeline_exec_idx, - }; - - uint32_t stats_count; - VK_CHECK(vkGetPipelineExecutableStatisticsKHR( - device(), &exec_info, &stats_count, NULL)); - - std::vector shader_stats(stats_count); - for (int i = 0; i < stats_count; i++) { - shader_stats.at(i).sType = - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR; - 
shader_stats.at(i).pNext = nullptr; - } - vkGetPipelineExecutableStatisticsKHR( - device(), &exec_info, &stats_count, shader_stats.data()); - - return shader_stats; -} - -std::ostream& operator<<( - std::ostream& os, - const VkPipelineExecutablePropertiesKHR& props) { - os << std::left << std::setw(10) << "name: " << props.name << std::endl; - os << std::left << std::setw(10) << "descr: " << props.description - << std::endl; - os << std::left << std::setw(10) << "subgroup: " << props.subgroupSize - << std::endl; - - return os; -} - -std::ostream& operator<<( - std::ostream& os, - const VkPipelineExecutableInternalRepresentationKHR& ir) { - os << std::left << std::setw(10) << "descr: " << ir.description << std::endl; - os << std::left << std::setw(10) << "isText: " << ir.isText << std::endl; - os << std::left << std::setw(10) << "size: " << ir.dataSize << std::endl; - if (ir.isText) { - os << "text:" << std::endl; - char* str = (char*)ir.pData; - os << str << std::endl; - } - return os; -} - -std::ostream& operator<<( - std::ostream& os, - VkPipelineExecutableStatisticKHR& stat) { - os << stat.name << ": "; - switch (stat.format) { - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR: - os << (stat.value.b32 ? "true" : "false") << std::endl; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR: - os << stat.value.i64 << std::endl; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR: - os << stat.value.u64 << std::endl; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR: - os << stat.value.f64 << std::endl; - break; - default: - break; - } - os << " " << stat.description << std::endl; - return os; -} - -std::ostream& operator<<( - std::ostream& os, - std::vector& shader_stats) { - for (int i = 0; i < shader_stats.size(); i++) { - VkPipelineExecutableStatisticKHR& stat = shader_stats.at(i); - os << stat; - } - return os; -} - -void Context::print_shader_executable_properties( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& spec_constants) { - VkPipeline pipeline = get_shader_pipeline(shader, spec_constants); - - std::vector pipeline_props_list = - get_pipeline_executable_props(pipeline); - - VK_CHECK_COND(pipeline_props_list.size() == 1u); - - std::cout << pipeline_props_list.at(0) << std::endl; - - std::tuple< - std::vector, - std::vector>> - irs_and_irs_data = get_shader_executable_irs(pipeline, 0u); - - std::vector& irs = - std::get<0>(irs_and_irs_data); - - std::cout << "Found " << irs.size() << " IRs" << std::endl << std::endl; - for (int i = 0; i < irs.size(); i++) { - std::cout << "====== IR " << i << ": " << irs.at(i).name - << " ======" << std::endl; - std::cout << irs.at(i) << std::endl; - } - - std::vector shader_stats = - get_shader_executable_stats(pipeline, 0u); - std::cout << "Found " << shader_stats.size() << " Statistics" << std::endl; - if (shader_stats.size() > 0) { - std::cout << "====== Statistics: ======" << std::endl; - std::cout << shader_stats << std::endl; - } -} - -#endif // VK_KHR_pipeline_executable_properties - -#endif // VULKAN_DEBUG - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h deleted file mode 100644 index 9c7301b9971..00000000000 --- a/backends/vulkan/runtime/api/Context.h +++ /dev/null @@ -1,404 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace vkcompute { -namespace api { - -struct ContextConfig final { - uint32_t cmd_submit_frequency; - vkapi::CommandPoolConfig cmd_pool_config; - vkapi::DescriptorPoolConfig descriptor_pool_config; - vkapi::QueryPoolConfig query_pool_config; -}; - -// -// Vulkan Context holds onto all relevant Vulkan state as it pertains to our -// use of Vulkan in PyTorch. A Context is associated with one, and only one, -// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch -// are associated with a Context to make tensor <-> device affinity explicit. -// The context is currently a global object, but technically it does not need -// to be if we were to make it explicit to the user. -// - -class Context final { - public: - explicit Context(vkapi::Adapter*, const ContextConfig&); - - Context(const Context&) = delete; - Context& operator=(const Context&) = delete; - - Context(Context&&) = delete; - Context& operator=(Context&&) = delete; - - ~Context(); - - private: - // Config - ContextConfig config_; - // Important handles - vkapi::Adapter* adapter_p_; - VkDevice device_; - vkapi::Adapter::Queue queue_; - // Resource Pools - vkapi::CommandPool command_pool_; - vkapi::DescriptorPool descriptor_pool_; - vkapi::FencePool fences_; - // Diagnostics - vkapi::QueryPool querypool_; - // Command buffers submission - std::mutex cmd_mutex_; - vkapi::CommandBuffer cmd_; - uint32_t submit_count_; - // Memory Management - std::mutex buffer_clearlist_mutex_; - std::vector buffers_to_clear_; - std::mutex image_clearlist_mutex_; - std::vector images_to_clear_; - // Misc - VkImageTiling preferred_image_tiling_; - - public: - // Adapter access - - inline vkapi::Adapter* adapter_ptr() { - return adapter_p_; - } - - inline VkDevice device() { - return device_; - } - - inline vkapi::Adapter::Queue& queue() { - return queue_; - } - - // Device Caches - - inline vkapi::ShaderLayoutCache& shader_layout_cache() { - return adapter_ptr()->shader_layout_cache(); - } - - inline vkapi::ShaderCache& shader_cache() { - return adapter_ptr()->shader_cache(); - } - - inline vkapi::PipelineLayoutCache& pipeline_layout_cache() { - return adapter_ptr()->pipeline_layout_cache(); - } - - inline vkapi::ComputePipelineCache& pipeline_cache() { - return adapter_ptr()->compute_pipeline_cache(); - } - - // Resource Pools - - inline vkapi::DescriptorPool& descriptor_pool() { - return descriptor_pool_; - } - - inline vkapi::FencePool& fences() { - return fences_; - } - - // Diagnostics - - inline vkapi::QueryPool& querypool() { - return querypool_; - } - - inline VkImageTiling preferred_image_tiling() { - return preferred_image_tiling_; - } - - /* - * By default, the querypool attached to a Context instance is uninitialized. - * This function triggers the querypool to be created via vkCreateQueryPool. - */ - void initialize_querypool(); - - /* - * Encodes a vkResetQueryPool command to the current command buffer, and reset - * the internal state of the querypool. If the querypool is not initialized - * this function is a no-op. - */ - void cmd_reset_querypool(); - - /* - * Encodes a vkCmdWriteTimestamp command to the current command buffer and - * record some metadata about the shader that will be dispatched. 
If the - * querypool is not initialized this function is a no-op. - */ - void report_shader_dispatch_start( - const std::string& shader_name, - const utils::uvec3& global_wg_size, - const utils::WorkgroupSize& local_wg_size, - const uint32_t dispatch_id = UINT32_MAX); - - /* - * Encodes a vkCmdWriteTimstamp command to the current command buffer to - * record when the last shader that was dispatched has completed execution. - * If the querypool is not initialized this function is a no-op. - */ - void report_shader_dispatch_end(); - - // Memory Management - - void register_buffer_cleanup(vkapi::VulkanBuffer& buffer) { - std::lock_guard bufferlist_lock(buffer_clearlist_mutex_); - buffers_to_clear_.emplace_back(std::move(buffer)); - } - - void register_image_cleanup(vkapi::VulkanImage& image) { - std::lock_guard imagelist_lock(image_clearlist_mutex_); - images_to_clear_.emplace_back(std::move(image)); - } - - // GPU RPC - - inline std::unique_lock dispatch_lock() { - return std::unique_lock(cmd_mutex_); - } - - inline void set_cmd(bool reusable = false) { - if (!cmd_) { - cmd_ = command_pool_.get_new_cmd(reusable); - cmd_.begin(); - } - } - - void check_device_capabilities(const vkapi::ShaderInfo& shader); - - vkapi::DescriptorSet get_descriptor_set( - const vkapi::ShaderInfo&, - const utils::WorkgroupSize&, - const vkapi::SpecVarList&, - const uint32_t push_constants_size); - - inline vkapi::DescriptorSet get_descriptor_set( - const vkapi::ShaderInfo& shader_descriptor, - const utils::WorkgroupSize& local_work_group_size) { - return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u); - } - - void register_shader_dispatch( - const vkapi::DescriptorSet&, - vkapi::PipelineBarrier&, - const vkapi::ShaderInfo&, - const utils::uvec3&, - const void* = nullptr, - const uint32_t = 0); - - void register_blit( - vkapi::PipelineBarrier&, - vkapi::VulkanImage& src, - vkapi::VulkanImage& dst); - - template - bool submit_compute_job( - const vkapi::ShaderInfo&, - vkapi::PipelineBarrier&, - const utils::uvec3&, - const utils::uvec3&, - const vkapi::SpecVarList&, - VkFence fence_handle, - const uint32_t dispatch_id, - Arguments&&...); - - void submit_cmd_to_gpu( - VkFence fence_handle = VK_NULL_HANDLE, - const bool final_use = false); - - vkapi::CommandBuffer& extract_cmd() { - return cmd_; - } - - void flush(); - -#ifdef VULKAN_DEBUG - -#ifdef VK_KHR_pipeline_executable_properties - - VkPipeline get_shader_pipeline( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& spec_constants); - - std::vector get_pipeline_executable_props( - const VkPipeline pipeline); - - std::tuple< - std::vector, - std::vector>> - get_shader_executable_irs( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx = 0u); - - std::vector get_shader_executable_stats( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx = 0u); - - void print_shader_executable_properties( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& spec_constants); - -#endif // VK_KHR_pipeline_executable_properties - -#endif // VULKAN_DEBUG -}; - -bool available(); - -// The global runtime is retrieved using this function, where it is declared as -// a static local variable. 
-Context* context(); - -namespace detail { - -inline void arg_is_empty( - bool& any_is_empty, - const vkapi::VulkanBuffer& buffer) { - // bool(buffer) will evaluate to false if no memory has been allocated - any_is_empty = any_is_empty || !buffer; -} - -inline void arg_is_empty(bool& any_is_empty, const vkapi::VulkanImage& image) { - // bool(image) will evaluate to false if no memory has been allocated - any_is_empty = any_is_empty || !image; -} - -inline void arg_is_empty( - bool& any_is_empty, - const vkapi::BufferBindInfo& bind_info) { - any_is_empty = any_is_empty || (bind_info.handle == VK_NULL_HANDLE); -} - -/* - Reports if any VulkanBuffer or VulkanImage argument in a variadic argument - list does not have any memory associated with it. - */ -template -inline bool any_arg_is_empty(Arguments&&... arguments) { - bool any_is_empty = false; - VK_UNUSED const int _[]{ - 0, - (arg_is_empty(any_is_empty, std::forward(arguments)), 0)..., - }; - - return any_is_empty; -} - -template -inline void bind( - vkapi::DescriptorSet& descriptor_set, - const std::index_sequence&, - Arguments&&... arguments) { - VK_UNUSED const int _[]{ - 0, - (descriptor_set.bind(Indices, std::forward(arguments)), 0)..., - }; -} - -} // namespace detail - -/* - Records a compute shader dispatch into the current command buffer. If the - number of submit_*_job calls exceeds the configured frequency, or if a fence - is provided, then the command buffer is submitted to the GPU for execution. - Returns a bool indicating whether or not the function call resulted in a GPU - queue submission. - */ -template -inline bool Context::submit_compute_job( - const vkapi::ShaderInfo& shader, - vkapi::PipelineBarrier& pipeline_barrier, - const utils::uvec3& global_work_group, - const utils::uvec3& local_work_group_size, - const vkapi::SpecVarList& specialization_constants, - VkFence fence_handle, - const uint32_t dispatch_id, - Arguments&&... arguments) { - // If any of the provided arguments does not have memory associated with it, - // then exit early as there is no work to be done. However, if a fence has - // been passed the command buffer is not empty, then the current command - // buffer must still be submitted so that the fence can be signaled. - if (detail::any_arg_is_empty(arguments...)) { - if (fence_handle != VK_NULL_HANDLE && submit_count_ > 0) { - submit_cmd_to_gpu(fence_handle); - return true; - } - return false; - } - - // Serialize recording to the shared command buffer. Do not initialize with a - // mutex just yet, since in some cases it will be externally managed. - std::unique_lock cmd_lock; - // If a fence was passed, then assume that the host intends to sync with - // the GPU, implying there will be imminent calls to fence.wait() and flush(). - // We therefore assume the mutex is externally managed in this case, and the - // calling thread has already locked the mutex prior to calling the function, - // and will release the mutex manually after calling flush(). This will - // prevent more dispatches from being recorded until we have flushed the - // Context. - if (fence_handle == VK_NULL_HANDLE) { - cmd_lock = std::unique_lock(cmd_mutex_); - } - - set_cmd(); - - report_shader_dispatch_start( - shader.kernel_name, - global_work_group, - utils::WorkgroupSize(local_work_group_size), - dispatch_id); - - // Factor out template parameter independent code to minimize code bloat. - // Note that push constants are not exposed yet via this API, therefore the - // push constants size is assumed to be 0. 
- vkapi::DescriptorSet descriptor_set = get_descriptor_set( - shader, - utils::WorkgroupSize(local_work_group_size), - specialization_constants, - 0u); - - detail::bind( - descriptor_set, - std::index_sequence_for{}, - std::forward(arguments)...); - - // Factor out template parameter independent code to minimize code bloat. - register_shader_dispatch( - descriptor_set, pipeline_barrier, shader, global_work_group); - - report_shader_dispatch_end(); - - submit_count_++; - if (fence_handle != VK_NULL_HANDLE || - submit_count_ >= config_.cmd_submit_frequency) { - submit_cmd_to_gpu(fence_handle); - return true; - } - - return false; -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/ShaderRegistry.cpp b/backends/vulkan/runtime/api/ShaderRegistry.cpp deleted file mode 100644 index f828e561a25..00000000000 --- a/backends/vulkan/runtime/api/ShaderRegistry.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace api { - -bool ShaderRegistry::has_shader(const std::string& shader_name) { - const ShaderListing::const_iterator it = listings_.find(shader_name); - return it != listings_.end(); -} - -bool ShaderRegistry::has_dispatch(const std::string& op_name) { - const Registry::const_iterator it = registry_.find(op_name); - return it != registry_.end(); -} - -void ShaderRegistry::register_shader(vkapi::ShaderInfo&& shader_info) { - if (has_shader(shader_info.kernel_name)) { - VK_THROW( - "Shader with name ", shader_info.kernel_name, "already registered"); - } - listings_.emplace(shader_info.kernel_name, shader_info); -} - -void ShaderRegistry::register_op_dispatch( - const std::string& op_name, - const DispatchKey key, - const std::string& shader_name) { - if (!has_dispatch(op_name)) { - registry_.emplace(op_name, Dispatcher()); - } - const Dispatcher::const_iterator it = registry_[op_name].find(key); - if (it != registry_[op_name].end()) { - registry_[op_name][key] = shader_name; - } else { - registry_[op_name].emplace(key, shader_name); - } -} - -const vkapi::ShaderInfo& ShaderRegistry::get_shader_info( - const std::string& shader_name) { - const ShaderListing::const_iterator it = listings_.find(shader_name); - - VK_CHECK_COND( - it != listings_.end(), - "Could not find ShaderInfo with name ", - shader_name); - - return it->second; -} - -ShaderRegistry& shader_registry() { - static ShaderRegistry registry; - return registry; -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/ShaderRegistry.h b/backends/vulkan/runtime/api/ShaderRegistry.h deleted file mode 100644 index f40e247c1b8..00000000000 --- a/backends/vulkan/runtime/api/ShaderRegistry.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
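The op dispatch registry above amounts to a two-level map: op name to a per-device-family table keyed by `DispatchKey`, with the shader name as the value and `CATCHALL` as the fallback. A self-contained sketch of that lookup, with made-up op and shader names:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

enum class DispatchKey : int8_t { CATCHALL, ADRENO, MALI, OVERRIDE };

class ToyDispatchRegistry {
  using Dispatcher = std::unordered_map<DispatchKey, std::string>;
  std::unordered_map<std::string, Dispatcher> registry_;

 public:
  void register_op_dispatch(const std::string& op,
                            DispatchKey key,
                            const std::string& shader) {
    registry_[op][key] = shader; // insert or overwrite
  }

  // Prefer a device-specific entry; fall back to CATCHALL if there is none.
  const std::string& lookup(const std::string& op, DispatchKey key) const {
    const auto op_it = registry_.find(op);
    if (op_it == registry_.end()) {
      throw std::runtime_error("no dispatch registered for op: " + op);
    }
    auto it = op_it->second.find(key);
    if (it == op_it->second.end()) {
      it = op_it->second.find(DispatchKey::CATCHALL);
    }
    if (it == op_it->second.end()) {
      throw std::runtime_error("no usable shader for op: " + op);
    }
    return it->second;
  }
};

int main() {
  ToyDispatchRegistry reg;
  reg.register_op_dispatch("aten.add", DispatchKey::CATCHALL, "add_texture3d");
  reg.register_op_dispatch("aten.add", DispatchKey::ADRENO, "add_texture3d_adreno");

  std::cout << reg.lookup("aten.add", DispatchKey::ADRENO) << "\n"; // device-specific
  std::cout << reg.lookup("aten.add", DispatchKey::MALI) << "\n";   // catch-all fallback
}
```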
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include - -#define VK_KERNEL(shader_name) \ - ::vkcompute::api::shader_registry().get_shader_info(#shader_name) - -#define VK_KERNEL_FROM_STR(shader_name_str) \ - ::vkcompute::api::shader_registry().get_shader_info(shader_name_str) - -namespace vkcompute { -namespace api { - -enum class DispatchKey : int8_t { - CATCHALL, - ADRENO, - MALI, - OVERRIDE, -}; - -class ShaderRegistry final { - using ShaderListing = std::unordered_map; - using Dispatcher = std::unordered_map; - using Registry = std::unordered_map; - - ShaderListing listings_; - Dispatcher dispatcher_; - Registry registry_; - - public: - /* - * Check if the registry has a shader registered under the given name - */ - bool has_shader(const std::string& shader_name); - - /* - * Check if the registry has a dispatch registered under the given name - */ - bool has_dispatch(const std::string& op_name); - - /* - * Register a ShaderInfo to a given shader name - */ - void register_shader(vkapi::ShaderInfo&& shader_info); - - /* - * Register a dispatch entry to the given op name - */ - void register_op_dispatch( - const std::string& op_name, - const DispatchKey key, - const std::string& shader_name); - - /* - * Given a shader name, return the ShaderInfo which contains the SPIRV binary - */ - const vkapi::ShaderInfo& get_shader_info(const std::string& shader_name); -}; - -class ShaderRegisterInit final { - using InitFn = void(); - - public: - ShaderRegisterInit(InitFn* init_fn) { - init_fn(); - }; -}; - -// The global shader registry is retrieved using this function, where it is -// declared as a static local variable. -ShaderRegistry& shader_registry(); - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h deleted file mode 100644 index b5d46b8bba4..00000000000 --- a/backends/vulkan/runtime/api/api.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.cpp b/backends/vulkan/runtime/api/containers/ParamsBuffer.cpp deleted file mode 100644 index 482a5c50be6..00000000000 --- a/backends/vulkan/runtime/api/containers/ParamsBuffer.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
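`ShaderRegisterInit` above is the standard static-registration hook: a namespace-scope object whose constructor runs an init function during static initialization, so a generated shader library populates the global registry simply by being linked in, and `VK_KERNEL` stringizes an identifier into the lookup key. A toy version of both ideas; the names below are illustrative, not the real API.

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Toy global registry of "shaders" keyed by name, held in a function-local
// static so it is safely initialized before first use.
std::unordered_map<std::string, std::string>& toy_registry() {
  static std::unordered_map<std::string, std::string> reg;
  return reg;
}

// Runs an init function at static initialization time; a generated .cpp file
// would define one of these at namespace scope to register its shaders.
class ToyRegisterInit {
 public:
  using InitFn = void();
  explicit ToyRegisterInit(InitFn* fn) { fn(); }
};

static void register_example_shaders() {
  toy_registry().emplace("add_texture3d", "<spirv blob>");
  toy_registry().emplace("mul_texture3d", "<spirv blob>");
}

// The constructor runs before main(), so the registry is populated up front.
static const ToyRegisterInit register_example_shaders_init(register_example_shaders);

// Turn an identifier into its lookup string with the # stringizing operator,
// mirroring the VK_KERNEL pattern.
#define TOY_KERNEL(name) toy_registry().at(#name)

int main() {
  std::cout << TOY_KERNEL(add_texture3d) << "\n";
  std::cout << toy_registry().size() << " shaders registered\n";
}
```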
- */ - -#include - -#include - -namespace vkcompute { -namespace api { - -namespace { - -void memcpy_to_buffer( - const vkapi::VulkanBuffer& src, - vkapi::VulkanBuffer& dst) { - vkapi::MemoryMap dst_mapping(dst, vkapi::MemoryAccessType::WRITE); - - vkapi::MemoryMap src_mapping(src, vkapi::MemoryAccessType::READ); - src_mapping.invalidate(); - - void* dst_ptr = dst_mapping.template data(); - void* src_ptr = src_mapping.template data(); - - // @lint-ignore CLANGTIDY facebook-security-vulnerable-memcpy - memcpy(dst_ptr, src_ptr, src.mem_size()); -} - -} // namespace - -ParamsBuffer::ParamsBuffer(const ParamsBuffer& other) - : context_p_(other.context_p_), vulkan_buffer_{} { - if (other.vulkan_buffer_) { - vulkan_buffer_ = context_p_->adapter_ptr()->vma().create_uniform_buffer( - other.vulkan_buffer_.mem_size()); - - memcpy_to_buffer(other.vulkan_buffer_, vulkan_buffer_); - } -} - -ParamsBuffer& ParamsBuffer::operator=(const ParamsBuffer& other) { - if (&other != this) { - context_p_ = other.context_p_; - - // Move vulkan_buffer_ to another VulkanBuffer for cleanup - if (vulkan_buffer_) { - vkapi::VulkanBuffer temp_buffer(std::move(vulkan_buffer_)); - context_p_->register_buffer_cleanup(temp_buffer); - } - // vulkan_buffer_ should now be empty - - if (other.vulkan_buffer_) { - vulkan_buffer_ = context_p_->adapter_ptr()->vma().create_uniform_buffer( - other.vulkan_buffer_.mem_size()); - - memcpy_to_buffer(other.vulkan_buffer_, vulkan_buffer_); - } - } - - return *this; -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h deleted file mode 100644 index ecc07892cf7..00000000000 --- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -namespace vkcompute { -namespace api { - -class ParamsBuffer final { - private: - Context* context_p_; - vkapi::VulkanBuffer vulkan_buffer_; - - public: - ParamsBuffer() : context_p_{nullptr}, vulkan_buffer_{} {} - - template - ParamsBuffer(Context* context_p, const Block& block) - : context_p_(context_p), - vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_params_buffer(block)) {} - - // The last bool argument, though unused, is required to disambiguate this - // constructor from the one above. 
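The unused trailing parameter mentioned in the comment above is needed because the templated "block" constructor would otherwise accept a plain integer and size the buffer to the size of the integer type rather than to the requested number of bytes. A standalone illustration with toy types; a dedicated tag struct is a common alternative to a bare bool.

```cpp
#include <cstdint>
#include <iostream>

struct ToyParamsBuffer {
  // Templated constructor: accepts any parameter "block" and sizes the buffer
  // to hold it.
  template <typename Block>
  explicit ToyParamsBuffer(const Block& block) : nbytes_(sizeof(block)) {
    std::cout << "from block: " << nbytes_ << " bytes\n";
  }

  // Without the extra unused flag, a call passing a raw byte count would bind
  // to the templated constructor above and allocate sizeof(integer) bytes.
  ToyParamsBuffer(uint64_t nbytes, bool /*unused*/) : nbytes_(nbytes) {
    std::cout << "from byte count: " << nbytes_ << " bytes\n";
  }

  uint64_t nbytes_;
};

struct Params {
  float scale;
  int32_t offset;
};

int main() {
  Params p{1.5f, 3};
  ToyParamsBuffer from_block(p);                              // templated overload
  ToyParamsBuffer from_size(uint64_t{256}, /*unused=*/true);  // explicit raw size
}
```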
- ParamsBuffer(Context* context_p, const VkDeviceSize nbytes, const bool unused) - : context_p_(context_p), - vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {} - - ParamsBuffer(const ParamsBuffer&); - ParamsBuffer& operator=(const ParamsBuffer&); - - ParamsBuffer(ParamsBuffer&&) = default; - ParamsBuffer& operator=(ParamsBuffer&&) = default; - - ~ParamsBuffer() { - if (vulkan_buffer_) { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - } - - const vkapi::VulkanBuffer& buffer() const { - return vulkan_buffer_; - } - - template - void update(const Block& block, const uint32_t offset = 0) { - // Fill the uniform buffer with data in block - { - vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite); - Block* data_ptr = mapping.template data(offset); - - *data_ptr = block; - } - } - - template - T read() const { - T val; - if (sizeof(val) != vulkan_buffer_.mem_size()) { - VK_THROW( - "Attempted to store value from ParamsBuffer to type of different size"); - } - // Read value from uniform buffer and store in val - { - vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kRead); - T* data_ptr = mapping.template data(); - - val = *data_ptr; - } - return val; - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h deleted file mode 100644 index 1e9f569fc4a..00000000000 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -namespace vkcompute { -namespace api { - -class StagingBuffer final { - private: - Context* context_p_; - vkapi::ScalarType dtype_; - vkapi::VulkanBuffer vulkan_buffer_; - - void* mapped_data_; - - public: - StagingBuffer( - Context* context_p, - const vkapi::ScalarType dtype, - const size_t numel) - : context_p_(context_p), - dtype_(dtype), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer( - element_size(dtype_) * numel)), - mapped_data_(nullptr) {} - - StagingBuffer(const StagingBuffer&) = delete; - StagingBuffer& operator=(const StagingBuffer&) = delete; - - StagingBuffer(StagingBuffer&&) = default; - StagingBuffer& operator=(StagingBuffer&&) = default; - - ~StagingBuffer() { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - - inline vkapi::ScalarType dtype() { - return dtype_; - } - - inline vkapi::VulkanBuffer& buffer() { - return vulkan_buffer_; - } - - inline void* data() { - if (!mapped_data_) { - mapped_data_ = vulkan_buffer_.allocation_info().pMappedData; - } - return mapped_data_; - } - - inline size_t numel() { - return nbytes() / element_size(dtype_); - } - - inline size_t nbytes() { - return vulkan_buffer_.mem_size(); - } - - inline void copy_from(const void* src, const size_t nbytes) { - VK_CHECK_COND(nbytes <= this->nbytes()); - memcpy(data(), src, nbytes); - vmaFlushAllocation( - vulkan_buffer_.vma_allocator(), - vulkan_buffer_.allocation(), - 0u, - VK_WHOLE_SIZE); - } - - inline void copy_to(void* dst, const size_t nbytes) { - VK_CHECK_COND(nbytes <= this->nbytes()); - vmaInvalidateAllocation( - vulkan_buffer_.vma_allocator(), - vulkan_buffer_.allocation(), - 0u, - VK_WHOLE_SIZE); - 
memcpy(dst, data(), nbytes); - } - - inline void set_staging_zeros() { - memset(data(), 0, nbytes()); - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp deleted file mode 100644 index 433ae15db4e..00000000000 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ /dev/null @@ -1,1066 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -namespace vkcompute { -namespace api { - -/* - * Used to infer the sizes of a tensor that would correspond to a given - * VulkanImage. - */ -std::vector calculate_sizes( - const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout) { - auto sizes = std::vector{ - image.extents().width, image.extents().height, image.extents().depth}; - const auto packed_dim = utils::to_packed_dim(memory_layout); - sizes.at(packed_dim) *= 4; - return sizes; -} - -std::vector calculate_dim_order( - const size_t ndim, - const int32_t packed_dim) { - // Special case for zero dim tensors - if (ndim == 0) { - return {0}; - } - std::vector dim_order(ndim); - // Explicitly convert ndim to signed to prevent underflow - int64_t last_dim = int64_t(ndim) - 1 - packed_dim; - - int64_t cur_dim = 0; - for (int d = 0; d < ndim; ++d) { - if (d == last_dim) { - cur_dim++; - } - dim_order[d] = cur_dim; - cur_dim++; - } - if (last_dim >= 0) { - dim_order[ndim - 1] = last_dim; - } - - return dim_order; -} - -std::vector calculate_strides( - const std::vector& sizes, - const std::vector& dim_order) { - // For zero dim tensors - if (sizes.size() == 0) { - return {1}; - } - - size_t ndim = sizes.size(); - std::vector strides(ndim); - - strides[dim_order[ndim - 1]] = 1; - for (int32_t i = ndim - 2; i >= 0; --i) { - if (sizes[dim_order[i + 1]] == 0) { - strides[dim_order[i]] = strides[dim_order[i + 1]]; - } else { - strides[dim_order[i]] = - strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; - } - } - - return strides; -} - -/* - * Axis mapping is somewhat analogous to strides for texture backed tensors. - * - * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. - * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture - * axis that corresponds to the width, height, and channels dimension of the - * tensor. Thus the axis mapping can be considered to be in WHCN dimension - * order. - * - * The last value `axis_map.at(3)` indicates the WHCN index of the tensor - * dimension along which batches will be concatenated. This dimension can be - * referred to as the "inner dimension" To determine which image texture axis is - * used for the concatenation, a double lookup will need to be performed - * (axis_map.at(axis_map.at(3))). - * - * The reason for strucuring axis mapping this way is because for the batch dim, - * two things need to be easily derived: - * - * 1. The dim idx of the inner dimension, so that the size of the inner - * dimension can be easily determined. - * 2. The texture axis used to concatenate batches - * - * By storing the dim index of the inner dimension instead of the texture axis - * it maps to, both pieces of information are readily available. - * - * The axis mapping allows for permuted views of texture-backed tensors. 
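`calculate_strides` above walks the dim order from the innermost dimension outward, treating size-0 dims as size 1 so the outer strides stay well defined. A standalone restatement with a small worked example, comparing the identity dim order against a channels-last style order:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// dim_order lists dimension indices from the largest stride to the smallest;
// the last entry is the fastest-moving (stride 1) dimension.
std::vector<int64_t> calculate_strides(const std::vector<int64_t>& sizes,
                                       const std::vector<int64_t>& dim_order) {
  if (sizes.empty()) {
    return {1}; // zero-dim tensors get a single unit stride
  }
  const size_t ndim = sizes.size();
  std::vector<int64_t> strides(ndim);
  strides[dim_order[ndim - 1]] = 1;
  for (int32_t i = static_cast<int32_t>(ndim) - 2; i >= 0; --i) {
    const int64_t inner_size = sizes[dim_order[i + 1]];
    const int64_t inner_stride = strides[dim_order[i + 1]];
    // Size-0 dims contribute a factor of 1 so outer strides remain sensible.
    strides[dim_order[i]] = inner_stride * (inner_size == 0 ? 1 : inner_size);
  }
  return strides;
}

void print(const std::vector<int64_t>& v) {
  for (int64_t x : v) std::cout << x << " ";
  std::cout << "\n";
}

int main() {
  std::vector<int64_t> sizes = {2, 3, 4, 5}; // N, C, H, W

  // Contiguous (identity) dim order -> strides {60, 20, 5, 1}.
  print(calculate_strides(sizes, {0, 1, 2, 3}));

  // Channels-last style dim order (N, H, W, C) -> strides {60, 1, 15, 3}.
  print(calculate_strides(sizes, {0, 2, 3, 1}));
}
```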
- */ -std::vector calculate_axis_map( - const std::vector& sizes, - utils::AxisMapLayout axis_map_layout) { - if (axis_map_layout == utils::AxisMapLayout::OPTIMIZED) { - std::vector axis_map(sizes.size() + 1); - std::iota(axis_map.begin(), axis_map.end() - 1, 0); - - std::stable_sort( - axis_map.begin(), axis_map.end() - 1, [&sizes](size_t i1, size_t i2) { - return sizes[i1] < sizes[i2]; - }); - - assert(axis_map.size() > 0); - // Find the index of the channel dimension - for (size_t i = 0; i < axis_map.size() - 1; ++i) { - assert(sizes.size() > axis_map[i]); - if (sizes[axis_map[i]] == 2) { - axis_map.back() = i; - break; - } - } - - return axis_map; - } - // default - return {0, 1, 2, 2}; -} - -bool dim_order_is_valid(const std::vector& dim_order) { - int64_t sum = 0; - for (size_t i = 0; i < dim_order.size(); ++i) { - if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) { - return false; - } - sum += dim_order[i]; - } - int64_t n = static_cast(dim_order.size() - 1); - // Sanity check that the sum of the indices in the vector is equal to the sum - // of 0 + 1 + 2 + ... + (ndim - 1) - return sum == n * (n + 1) / 2; -} - -utils::ivec4 flip_and_unsqueeze_ivec4( - const std::vector& tensor_metadata, - const vTensor::Attribute metadata_type, - const size_t numel) { - VK_CHECK_COND(tensor_metadata.size() <= 4); - std::vector flipped_metadata = - flip_and_unsqueeze(tensor_metadata, metadata_type, numel); - return { - flipped_metadata.at(0), - flipped_metadata.at(1), - flipped_metadata.at(2), - flipped_metadata.at(3), - }; -} - -std::vector calculate_padded_sizes( - const std::vector& sizes, - const int32_t packed_dim) { - int64_t ndim = sizes.size(); - if (ndim == 0) { - ndim = 1; - } - - // Tensor sizes will be unsqueezed up to the next multiple of 4 - const int64_t ndim_up4 = utils::align_up_4(ndim); - std::vector padded_sizes(ndim_up4); - for (int64_t i = 0; i < ndim_up4; ++i) { - padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); - } - - // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = packed_dim + 1; - const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); - padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); - - return padded_sizes; -} - -utils::uvec3 calculate_image_extents( - const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim) { - utils::uvec3 extents({1, 1, 1}); - - // For high dimensional tensors, buffer storage must be used. No need to - // compute image extents in this case. - if (padded_sizes.size() > 4) { - return extents; - } - - // First three elements of axis_map indicate which (X,Y,Z) image axis the - // width, height, and channels dim of the tensor maps to. - for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { - const int64_t axis = axis_map.at(whcn_dim); - const int64_t dim = padded_sizes.size() - 1 - whcn_dim; - extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); - } - - // axis_map[3] indicates the WHCN index of the dimension used for batch - // concatenation. Thus a double lookup is required to determine the image axis - // used for batch concatenation. - const int64_t concatted_whcn_dim = axis_map.at(3); - const int64_t batch_axis = axis_map.at(concatted_whcn_dim); - // Multiply the extents of the batch axis by the batch size. 
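`calculate_padded_sizes` above unsqueezes the dimensionality up to a multiple of 4 (front-filling with 1s) and rounds the packed dimension up to a multiple of 4 so it fills whole texels. The sketch below restates it standalone and, assuming the default axis map {0, 1, 2, 2}, derives the corresponding texel extents; `val_at` here is a local helper written for the example, not the library's.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int64_t align_up_4(int64_t v) { return (v + 3) & ~int64_t{3}; }

// Index from the end of `sizes` (idx is negative); out-of-range dims are
// treated as having size 1, which is what "unsqueezing" means here.
int64_t val_at(int64_t idx, const std::vector<int64_t>& sizes) {
  const int64_t ndim = static_cast<int64_t>(sizes.size());
  const int64_t i = ndim + idx;
  return (i >= 0 && i < ndim) ? sizes[i] : 1;
}

// Pad the dimensionality up to a multiple of 4 (front-filled with 1s) and pad
// the packed dimension's size up to a multiple of 4 so it fills whole texels.
// packed_dim is a WHCN index: 0 = width, 1 = height, 2 = channels.
std::vector<int64_t> calculate_padded_sizes(const std::vector<int64_t>& sizes,
                                            int32_t packed_dim) {
  const int64_t ndim = sizes.empty() ? 1 : static_cast<int64_t>(sizes.size());
  const int64_t ndim_up4 = align_up_4(ndim);
  std::vector<int64_t> padded(ndim_up4);
  for (int64_t i = 0; i < ndim_up4; ++i) {
    padded[i] = val_at(i - ndim_up4, sizes);
  }
  const int64_t dim_offset = packed_dim + 1;
  padded[ndim_up4 - dim_offset] = align_up_4(val_at(-dim_offset, sizes));
  return padded;
}

int main() {
  // A {N=2, C=3, H=5, W=7} tensor packed along the channels dim (WHCN index 2).
  const std::vector<int64_t> sizes = {2, 3, 5, 7};
  const auto padded = calculate_padded_sizes(sizes, /*packed_dim=*/2);
  for (int64_t s : padded) std::cout << s << " "; // 2 4 5 7
  std::cout << "\n";

  // With the default axis map {0,1,2,2}: X = W, Y = H, Z = C/4, then Z *= N.
  const int64_t extent_x = padded[3];
  const int64_t extent_y = padded[2];
  const int64_t extent_z = (padded[1] / 4) * padded[0];
  std::cout << extent_x << " x " << extent_y << " x " << extent_z << "\n"; // 7 x 5 x 2
}
```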
- extents[batch_axis] *= padded_sizes.at(0); - - VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); - extents[axis_map.at(packed_dim)] /= 4; - return extents; -} - -/* - * The physical image extents describe the size of an allocated texture resource - * i.e. how many texels in the width, height and depth axis of the image. - * However, the axis map allows a tensor logical dimension to map to a different - * physical texture axis; in essence, it describes a permutation between the - * logical width, height, channels, etc. dimensions of a tensor and the width, - * height, depth axis of a texture. - * - * The "logical extents" is simply the physical image extents permuted by the - * axis mapping. The logical extents is useful for constructing global work - * group sizes, so that it is easier to convert the global thread ID to a - * tensor index. - */ -utils::uvec3 calculate_logical_limits( - const utils::uvec3& image_extents, - const std::vector& axis_map) { - return { - image_extents[axis_map.at(0)], - image_extents[axis_map.at(1)], - image_extents[axis_map.at(2)], - }; -} - -/* - * Convenience overload of the above function to calculate logical limits - * directly from tensor sizes. - */ -utils::uvec3 calculate_logical_limits( - const std::vector& sizes, - const std::vector& axis_map, - const int32_t packed_dim) { - return calculate_logical_limits( - calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), - axis_map); -} - -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype) { - // For texture backed tensors, simply multiply the total number of texels by 4 - if (storage_type != utils::kBuffer) { - return image_extents[0] * image_extents[1] * image_extents[2] * 4; - } - const bool is_int8 = dtype == vkapi::kChar; - const bool int8_supported = - context->adapter_ptr()->has_full_int8_buffers_support(); - const size_t numel = utils::multiply_integers(sizes); - // For int8 tensors, if the device does not support int8 buffers, then int32 - // is used instead to represent the buffer data. Therefore the number of - // elements in the buffer is aligned to the next multiple of 4. 
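As the comment above explains, the logical limits are just the physical texel extents viewed through the axis map, so index 0 always corresponds to the tensor's width dimension no matter which texture axis width was mapped to. A tiny standalone illustration:

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Permute physical image extents (X, Y, Z) into WHCN-ordered logical limits
// using the first three entries of the axis map.
std::array<uint32_t, 3> logical_limits(const std::array<uint32_t, 3>& extents,
                                       const std::vector<int64_t>& axis_map) {
  return {extents[axis_map.at(0)], extents[axis_map.at(1)], extents[axis_map.at(2)]};
}

int main() {
  // Physical texture allocated as 16 x 8 x 4 texels (X, Y, Z).
  const std::array<uint32_t, 3> extents = {16, 8, 4};

  // Standard axis map: width -> X, height -> Y, channels -> Z.
  auto a = logical_limits(extents, {0, 1, 2, 2});
  std::cout << a[0] << " " << a[1] << " " << a[2] << "\n"; // 16 8 4

  // A "transposed" view where width maps to Y and height maps to X: the
  // logical limits swap even though the texture allocation is unchanged.
  auto b = logical_limits(extents, {1, 0, 2, 2});
  std::cout << b[0] << " " << b[1] << " " << b[2] << "\n"; // 8 16 4
}
```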
- if (is_int8 && int8_supported) { - return utils::align_up_4(numel); - } - return numel; -} - -template ::value>> -int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { - int32_t packed = static_cast( - vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) + - (extra << 16)); - return packed; -} - -int32_t create_hashed_layout( - const std::vector& dim_order, - const std::vector& axis_map, - const int32_t packed_dim, - const utils::StorageType storage_type) { - if (storage_type == utils::kBuffer) { - return pack_into_int32( - flip_and_unsqueeze(dim_order, kTensorDimOrder, 0), 0); - } - return pack_into_int32(axis_map, packed_dim); -} - -size_t calculate_max_ubo_nbytes( - const size_t min_nbytes_per_ubo, - const utils::StorageType storage_type) { - size_t ivec4_ubo_nbytes = utils::align_up(size_t(16), min_nbytes_per_ubo); - size_t uvec3_ubo_nbytes = utils::align_up(size_t(12), min_nbytes_per_ubo); - size_t int32_ubo_nbytes = utils::align_up(size_t(4), min_nbytes_per_ubo); - if (storage_type == utils::kBuffer) { - // sizes, strides, dim order, numel - return 3 * ivec4_ubo_nbytes + int32_ubo_nbytes; - } - // sizes, logical limits - return ivec4_ubo_nbytes + uvec3_ubo_nbytes; -} - -// -// vTensorStorage -// - -utils::StorageType storage_type(const vkapi::VulkanImage& image) { - const auto type = image.type(); - switch (type) { - case VK_IMAGE_TYPE_3D: - return utils::kTexture3D; - case VK_IMAGE_TYPE_2D: - return utils::kTexture2D; - default: - VK_THROW("Unsupported image type", type); - } -} - -vkapi::VulkanImage allocate_image( - Context* const context_ptr, - utils::uvec3& image_extents, - const utils::StorageType storage_type, - const VkFormat image_format, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type; - - switch (storage_type) { - case utils::kTexture3D: - image_type = VK_IMAGE_TYPE_3D; - image_view_type = VK_IMAGE_VIEW_TYPE_3D; - break; - case utils::kTexture2D: - image_type = VK_IMAGE_TYPE_2D; - image_view_type = VK_IMAGE_VIEW_TYPE_2D; - break; - default: - // Return an empty VulkanImage by default - return vkapi::VulkanImage(); - } - - // TODO(ssjia): change to always check that the image extents do not exceed - // physical limits. Adding the check now based on `maxImageDimension3D` will - // cause some existing models to break. Anecdotally, on Adreno and - // SwiftShader devices, using 3D textures that exceed `maxImageDimension3D` - // appears to be ok. So we need to figure out if is it undefined behaviour - // or if there's a better way to figure out what the limit is. For now, only - // check during debug build so that we can detect when exceeding physical - // limits could be a potential cause for model outputs to be wrong. In the - // meantime, the threshold for using texture storage can be configured at - // export time. -#ifdef VULKAN_DEBUG - uint32_t max_extent = storage_type == utils::kTexture3D - ? 
adapter_ptr->max_texture3d_dim() - : adapter_ptr->max_texture2d_dim(); - - VK_CHECK_COND( - image_extents[0] <= max_extent && image_extents[1] <= max_extent && - image_extents[2] <= max_extent); -#endif - - VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); - - return adapter_ptr->vma().create_image( - context_ptr->device(), - vkapi::create_extent3d(image_extents), - image_format, - image_type, - context_ptr->preferred_image_tiling(), - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ allocate_memory); -} - -vkapi::VulkanBuffer allocate_buffer( - Context* const context_ptr, - const int64_t numel, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - switch (storage_type) { - case utils::kBuffer: - break; - default: - // Return an empty VulkanBuffer if Buffer storage is not used - return vkapi::VulkanBuffer(); - } - - VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel()); - - return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, allocate_memory); -} - -vTensorStorage::vTensorStorage( - Context* const context, - const utils::StorageType storage_type, - const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory) - : context_(context), - storage_type_{storage_type}, - image_extents_(calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - axis_map, - packed_dim)), - buffer_length_{calculate_gpu_buffer_numel( - context_, - sizes, - image_extents_, - storage_type, - dtype)}, - buffer_offset_{0}, - image_(allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - allocate_memory)), - buffer_(allocate_buffer( - context_, - buffer_length_, - storage_type_, - dtype, - allocate_memory)), - last_access_{} {} - -vTensorStorage::vTensorStorage( - Context* const context, - const vkapi::VulkanImage& image) - : context_(context), - storage_type_{storage_type(image)}, - image_extents_( - {image.extents().width, - image.extents().height, - image.extents().depth}), - buffer_length_{0}, - buffer_offset_{0}, - image_(image), - buffer_(vkapi::VulkanBuffer()), - last_access_{} {} - -vTensorStorage::~vTensorStorage() { - flush(); -} - -void vTensorStorage::flush() { - if (image_) { - context_->register_image_cleanup(image_); - } else if (buffer_) { - context_->register_buffer_cleanup(buffer_); - } - last_access_ = {}; -} - -void vTensorStorage::transition( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags cur_stage, - const vkapi::MemoryAccessFlags cur_access) { - // Get last stage access - vkapi::PipelineStageFlags prev_stage = last_access_.stage; - vkapi::MemoryAccessFlags prev_access = last_access_.access; - - const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; - const bool cur_written = (cur_access & vkapi::MemoryAccessType::WRITE) != 0; - - VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; - VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; - bool layout_changed = false; - if (image_) { - cur_layout = image_.layout(); - new_layout = vkapi::vk_layout(cur_stage, cur_access); - - layout_changed = cur_layout != new_layout; - } - - // RAW: need to make sure current read sees previous writes - // WAW: need to make sure the current write occurs after previous write so - // the final value is 
correct. - // WAR: need to make sure previous read does not read the value from the - // current write. - // RAR: no need for synchronization - if (prev_written || cur_written || layout_changed) { - VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); - if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } - VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); - if (0u == dst_stage) { - dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - } - - pipeline_barrier.stage.src |= src_stage; - pipeline_barrier.stage.dst |= dst_stage; - - if (image_) { - pipeline_barrier.images.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - cur_layout, - new_layout, - image_); - - image_.set_layout(new_layout); - } else if (buffer_) { - pipeline_barrier.buffers.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - buffer_); - } - } - - last_access_.stage = cur_stage; - last_access_.access = cur_access; -} - -// -// vTensor -// - -vTensor::vTensor( - Context* const context, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const bool allocate_memory, - const utils::AxisMapLayout axis_map_layout) - : dtype_(dtype), - // Calculate tensor metadata - sizes_(sizes.begin(), sizes.end()), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), - axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(calculate_strides(sizes, dim_order_)), - numel_(utils::multiply_integers(sizes_)), - hashed_layout_(create_hashed_layout( - dim_order_, - axis_map_, - packed_dim_, - storage_type)), - // Related to tensor metadata UBOs - min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, - max_ubo_nbytes_{ - calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)}, - uniforms_(), - buffer_meta_(), - // Construct Tensor storage - storage_(std::make_shared( - context, - storage_type, - axis_map_, - packed_dim_, - sizes, - dtype_, - allocate_memory)) { - // uniform_data_ only valid for low dim tensors - if (sizes.size() <= 4) { - uniform_data_ = std::make_shared(UniformData{ - numel_, - sizes_, - dim_order_, - strides_, - calculate_logical_limits(storage_->image_extents_, axis_map_)}); - } - - VK_CHECK_COND( - dim_order_is_valid(dim_order_), "computed dim order is invalid"); -} - -// NOLINTNEXTLINE -vTensor::vTensor( - Context* context, - const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout) - : dtype_(vkapi::element_scalartype(image.format())), - // Calculate tensor metadata - sizes_(calculate_sizes(image, memory_layout)), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(), - axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(), - numel_(utils::multiply_integers(sizes_)), - hashed_layout_(create_hashed_layout( - dim_order_, - axis_map_, - packed_dim_, - utils::kTexture3D)), - // Related to tensor metadata UBOs - min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, - max_ubo_nbytes_{ - calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)}, - uniforms_(), - buffer_meta_(), - // Construct Tensor storage - storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - numel_, - sizes_, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 
calculate_logical_limits(storage_->image_extents_, axis_map_)}); -} - -vTensor::vTensor(vTensor& other) - : dtype_(other.dtype_), - // Copy tensor size metadata - sizes_(other.sizes_.begin(), other.sizes_.end()), - packed_dim_{other.packed_dim_}, - dim_order_(other.dim_order_.begin(), other.dim_order_.end()), - axis_map_(other.axis_map_.begin(), other.axis_map_.end()), - strides_(other.strides_.begin(), other.strides_.end()), - numel_(other.numel_), - hashed_layout_(other.hashed_layout_), - min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, - max_ubo_nbytes_{other.max_ubo_nbytes_}, - uniforms_(), - buffer_meta_(), - // Copy Tensor storage - storage_(other.storage_) { - uniform_data_ = std::make_shared(*other.get_uniform_data()); -} - -vTensor::vTensor( - vTensor& other, - const std::vector& sizes, - const std::vector& dim_order) - : dtype_(other.dtype_), - // Copy tensor size metadata - sizes_(sizes.begin(), sizes.end()), - packed_dim_(other.packed_dim_), - dim_order_(dim_order.begin(), dim_order.end()), - axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), - strides_(calculate_strides(sizes_, dim_order_)), - numel_(other.numel_), - hashed_layout_(create_hashed_layout( - dim_order_, - axis_map_, - packed_dim_, - other.storage_type())), - min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, - max_ubo_nbytes_{other.max_ubo_nbytes_}, - uniforms_(), - buffer_meta_(), - // Copy Tensor storage - storage_(other.storage_) { - uniform_data_ = std::make_shared(UniformData{ - static_cast(utils::multiply_integers(sizes_)), - sizes_, - dim_order_, - strides_, - other.logical_limits()}); - - VK_CHECK_COND( - dim_order_is_valid(dim_order_), "new dim order provided is invalid"); -} - -vTensor::UniformData::UniformData( - const size_t numel_ll, - const std::vector& sizes, - const std::vector& dim_order, - const std::vector& strides, - const utils::uvec3& limits) - : numel(utils::safe_downcast(numel_ll)), - sizes_v(flip_and_unsqueeze_ivec4(sizes, kTensorSizes, numel_ll)), - dim_order_v( - flip_and_unsqueeze_ivec4(dim_order, kTensorDimOrder, numel_ll)), - strides_v(flip_and_unsqueeze_ivec4(strides, kTensorStrides, numel_ll)), - logical_limits(limits) {} - -uint32_t vTensor::UniformData::write_attribute( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size, - const Attribute attr) { -#define WRITE_ATTRIBUTE_CASE(enum_name, member_name) \ - case vTensor::Attribute::enum_name: { \ - VK_CHECK_COND( \ - (dst_offset + sizeof(member_name)) <= max_dst_size, \ - "Attempting to write tensor attribute outside data boundary."); \ - memcpy((uint8_t*)dst + dst_offset, &member_name, sizeof(member_name)); \ - return sizeof(member_name); \ - } - switch (attr) { - WRITE_ATTRIBUTE_CASE(NUMEL, numel); - WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); - WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, dim_order_v); - WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); - WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); - default: - VK_THROW("Invalid Attribute"); - } -#undef WRITE_ATTRIBUTE_CASE - return 0; -} - -vTensor::BufferMetadata::BufferMetadata( - std::vector& src_sizes, - std::vector& src_dim_order, - std::vector& src_strides, - size_t src_numel) { - update(src_sizes, src_dim_order, src_strides, src_numel); -} - -void vTensor::BufferMetadata::update( - std::vector& src_sizes, - std::vector& src_dim_order, - std::vector& src_strides, - size_t src_numel) { - int32_t fixed_ndim = utils::safe_downcast(kTensorDimLimit); - - std::vector fu_sizes = flip_and_unsqueeze( - src_sizes, kTensorSizes, src_numel, fixed_ndim); - std::vector 
fu_dim_order = flip_and_unsqueeze( - src_dim_order, kTensorDimOrder, src_numel, fixed_ndim); - std::vector fu_strides = flip_and_unsqueeze( - src_strides, kTensorStrides, src_numel, fixed_ndim); - - for (int i = 0; i < fixed_ndim; ++i) { - sizes[i] = fu_sizes.at(i); - dim_order[i] = fu_dim_order.at(i); - strides[i] = fu_strides.at(i); - } - - ndim = utils::safe_downcast(src_sizes.size()); - numel = utils::safe_downcast(src_numel); -} - -vkapi::VulkanImage& vTensor::image( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage) & { - storage_->transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); - return storage_->image_; -} - -vkapi::VulkanImage& vTensor::image( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage, - const vkapi::MemoryAccessFlags access) & { - storage_->transition(pipeline_barrier, stage, access); - return storage_->image_; -} - -vkapi::VulkanBuffer& vTensor::buffer( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage) & { - storage_->transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); - return storage_->buffer_; -} - -vkapi::VulkanBuffer& vTensor::buffer( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage, - const vkapi::MemoryAccessFlags access) & { - storage_->transition(pipeline_barrier, stage, access); - return storage_->buffer_; -} - -utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { - switch (packed_dim_) { - case WHCN::kWidthDim: - return utils::kWidthPacked; - case WHCN::kHeightDim: - return utils::kHeightPacked; - case WHCN::kChannelsDim: - return utils::kChannelsPacked; - default: - VK_THROW("Invalid packed dim"); - } -} - -bool vTensor::is_contiguous() const { - if (storage_type() != utils::kBuffer) { - return false; - } - for (size_t i = 0; i < dim_order_.size(); ++i) { - if (dim_order_.at(i) != i) { - return false; - } - } - return true; -} - -size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { - // For texture backed tensors, the metadata fields needed are: - // sizes, logical limits - size_t max_metadata_field_count = 2u; - if (storage_type() == utils::kBuffer) { - // sizes, strides, dim order, numel - max_metadata_field_count = 4u; - } - return max_metadata_field_count * nbytes_per_ubo; -} - -const vkapi::BufferBindInfo vTensor::sizes_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl(&sizes_uniform_offset_, uniform_data_->sizes_v); -} - -const vkapi::BufferBindInfo vTensor::dim_order_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl( - &dim_order_uniform_offset_, uniform_data_->dim_order_v); -} - -const vkapi::BufferBindInfo vTensor::strides_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl(&strides_uniform_offset, uniform_data_->strides_v); -} - -const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl( - &logical_limits_uniform_offset_, uniform_data_->logical_limits); -} - -const vkapi::BufferBindInfo vTensor::numel_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel); -} - -const vkapi::BufferBindInfo vTensor::buffer_meta_ubo() { - size_t ubo_nbytes = sizeof(BufferMetadata); - if (!buffer_meta_.buffer()) { - BufferMetadata data(sizes_, dim_order_, strides_, numel_); - buffer_meta_ = ParamsBuffer(storage_->context_, data); - } - return 
vkapi::BufferBindInfo(buffer_meta_.buffer(), 0, ubo_nbytes); -} - -VkMemoryRequirements vTensor::get_memory_requirements() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_->buffer_.get_memory_requirements(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_->image_.get_memory_requirements(); - } - return {}; -} - -bool vTensor::memory_is_bound() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_->buffer_.has_memory(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_->image_.has_memory(); - } -} - -void vTensor::bind_allocation(const vkapi::Allocation& allocation) { - switch (storage_type()) { - case utils::kBuffer: - storage_->buffer_.bind_allocation(allocation); - break; - case utils::kTexture2D: - case utils::kTexture3D: - storage_->image_.bind_allocation(allocation); - break; - } -} - -void vTensor::acquire_allocation(vkapi::Allocation&& allocation) { - switch (storage_type()) { - case utils::kBuffer: - storage_->buffer_.acquire_allocation(std::move(allocation)); - break; - case utils::kTexture2D: - case utils::kTexture3D: - storage_->image_.acquire_allocation(std::move(allocation)); - break; - } -} - -void vTensor::update_metadata() { - numel_ = utils::multiply_integers(sizes_); - strides_ = calculate_strides(sizes_, dim_order_); - - // Update uniform data if it has been modified - if (sizes_.size() <= 4) { - uniform_data_->numel = utils::safe_downcast(numel_); - uniform_data_->sizes_v = - flip_and_unsqueeze_ivec4(sizes_, kTensorSizes, numel_); - uniform_data_->dim_order_v = - flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_); - uniform_data_->strides_v = - flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); - uniform_data_->logical_limits.limits = - calculate_logical_limits(sizes_, axis_map_, packed_dim_); - - if (sizes_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); - } - if (dim_order_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->dim_order_v, dim_order_uniform_offset_); - } - if (strides_uniform_offset != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); - } - if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel_, numel_uniform_offset_); - } - if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update( - uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); - } - } - - if (buffer_meta_.buffer()) { - BufferMetadata data(sizes_, dim_order_, strides_, numel_); - buffer_meta_.update(data); - } -} - -void vTensor::check_sizes(const std::vector& sizes) const { - if (storage_type() != utils::kBuffer) { - // For texture storage check that the current texture is large enough for - // the new sizes of the tensor. - utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); - - bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; - valid_resize = - valid_resize && virtual_extents[1] <= storage_->image_extents_[1]; - valid_resize = - valid_resize && virtual_extents[2] <= storage_->image_extents_[2]; - - VK_CHECK_COND( - valid_resize, - "tensor sizes requires a larger texture than the current one."); - } else { - // For buffer storage check that the current buffer is large enough for the - // new sizes of the tensor. 
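The size check above reduces to: a texture-backed tensor may only be virtually resized if the extents implied by the new sizes fit inside the already-allocated texture, and a buffer-backed tensor only if the new element count plus the view offset fits in the allocated buffer length. A toy version of that validation:

```cpp
#include <array>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Texture case: every implied extent must fit in the allocated extent.
bool texture_resize_ok(const std::array<uint32_t, 3>& needed,
                       const std::array<uint32_t, 3>& allocated) {
  return needed[0] <= allocated[0] && needed[1] <= allocated[1] &&
         needed[2] <= allocated[2];
}

// Buffer case: the new element count plus the view's offset must fit in the
// allocated buffer length (both measured in elements).
bool buffer_resize_ok(const std::vector<int64_t>& new_sizes,
                      int64_t buffer_offset, int64_t buffer_length) {
  const int64_t numel = std::accumulate(new_sizes.begin(), new_sizes.end(),
                                        int64_t{1}, std::multiplies<int64_t>());
  return numel + buffer_offset <= buffer_length;
}

int main() {
  std::cout << std::boolalpha;
  std::cout << texture_resize_ok({8, 8, 2}, {16, 8, 4}) << "\n";  // true
  std::cout << texture_resize_ok({8, 16, 2}, {16, 8, 4}) << "\n"; // false: Y grew

  std::cout << buffer_resize_ok({2, 3, 4}, /*buffer_offset=*/0, /*buffer_length=*/32) << "\n"; // true
  std::cout << buffer_resize_ok({4, 3, 4}, /*buffer_offset=*/0, /*buffer_length=*/32) << "\n"; // false
}
```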
- int64_t numel = utils::multiply_integers(sizes); - bool valid_resize = - numel + storage_->buffer_offset_ <= storage_->buffer_length_; - VK_CHECK_COND( - valid_resize, - "tensor sizes requires a larger buffer than the current one."); - } -} - -void vTensor::virtual_reconfigure( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - VK_CHECK_COND( - storage_type() == utils::kBuffer, - "virtual_reconfigure is only applicable for buffer backed tensors"); - VK_CHECK_COND(new_sizes.size() == new_dim_order.size()); - VK_CHECK_COND(dim_order_is_valid(new_dim_order)); - - check_sizes(new_sizes); - sizes_ = new_sizes; - dim_order_ = new_dim_order; - - // Update the hashed layout because dim order is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); - - update_metadata(); -} - -void vTensor::virtual_clone(const vTensor& other) { - VK_CHECK_COND(is_view_of(other)); - sizes_ = other.sizes_; - dim_order_ = other.dim_order_; - axis_map_ = other.axis_map_; - packed_dim_ = other.packed_dim_; - hashed_layout_ = other.hashed_layout_; - - *uniform_data_ = *other.get_uniform_data(); -} - -void vTensor::virtual_resize(const std::vector& new_sizes) { - VK_CHECK_COND( - new_sizes.size() == dim_order_.size(), - "new sizes cannot modify the dimensionality of the tensor "); - - check_sizes(new_sizes); - sizes_ = new_sizes; - update_metadata(); -} - -/* - * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped - * their "identities", so we need to swap the values of dim0 and dim1 wherever - * they appear in the dim order vector. Compare this to just swapping the - * elements at dim0 and dim1 in the `sizes` vectors. - */ -void transpose_dim_order_inplace( - std::vector& dim_order, - const int64_t dim0, - const int64_t dim1) { - for (int i = 0; i < dim_order.size(); ++i) { - if (dim_order[i] == dim0) { - dim_order[i] = dim1; - } else if (dim_order[i] == dim1) { - dim_order[i] = dim0; - } - } -} - -void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { - std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - - const int dim0_whcn = sizes_.size() - 1 - dim0; - const int dim1_whcn = sizes_.size() - 1 - dim1; - if (packed_dim_ == dim0_whcn) { - packed_dim_ = dim1_whcn; - } else if (packed_dim_ == dim1_whcn) { - packed_dim_ = dim0_whcn; - } - - if (storage_type() == utils::kBuffer) { - transpose_dim_order_inplace(dim_order_, dim0, dim1); - } else { - // Cannot transpose batch dimension for texture storage - VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3); - std::iter_swap( - axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn); - // Update the "identity" of the concatted dimension - if (axis_map_.at(3) == dim0_whcn) { - axis_map_.at(3) = dim1_whcn; - } else if (axis_map_.at(3) == dim1_whcn) { - axis_map_.at(3) = dim0_whcn; - } - } - - // Update the hashed layout because dim order / axis mpa is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); - - update_metadata(); -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h deleted file mode 100644 index 66c1fd1e4da..00000000000 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
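`transpose_dim_order_inplace` above swaps the identities of the two dims wherever they appear in the dim order, in contrast to the sizes vector, where the elements at positions dim0 and dim1 are swapped. A standalone restatement with a worked 4-D example:

```cpp
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Transposing swaps the *identities* of dim0 and dim1, so in the dim order we
// replace every occurrence of dim0 with dim1 and vice versa, rather than
// swapping the elements stored at positions dim0 and dim1 (which is what
// happens to the sizes vector).
void transpose_dim_order_inplace(std::vector<int64_t>& dim_order,
                                 int64_t dim0, int64_t dim1) {
  for (auto& d : dim_order) {
    if (d == dim0) {
      d = dim1;
    } else if (d == dim1) {
      d = dim0;
    }
  }
}

int main() {
  // Contiguous 4-D tensor, then transpose dims 1 (C) and 3 (W).
  std::vector<int64_t> sizes = {2, 3, 4, 5};
  std::vector<int64_t> dim_order = {0, 1, 2, 3};

  std::swap(sizes[1], sizes[3]);                // sizes become {2, 5, 4, 3}
  transpose_dim_order_inplace(dim_order, 1, 3); // dim order becomes {0, 3, 2, 1}

  for (auto s : sizes) std::cout << s << " ";
  std::cout << "\n";
  for (auto d : dim_order) std::cout << d << " ";
  std::cout << "\n";
}
```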
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -namespace vkcompute { -namespace api { - -static constexpr size_t kTensorDimLimit = 8; - -/* - * Given a GPUMemoryLayout value, produce a dim order vector that matches the - * given memory layout. The produced dim order vector will be in the NCHW - * dimension order - */ -std::vector calculate_dim_order( - const size_t ndim, - const int32_t packed_dim); - -/* - * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) - * dimension order, calculate the strides of the tensor. - */ -std::vector calculate_strides( - const std::vector& sizes, - const std::vector& dim_order); - -/* - * When stored on the GPU, tensor data is stored using texels (i.e. a vector of - * 4 scalar values) in order to take advantage of the GPU's native vectorization - * capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4 - * types. - * - * To accommodate these vectorized types, the sizes of a tensor will be modified - * for GPU storage in the following ways: - * - * 1. The dimensionality of the tensor will be padded to a multiple of 4. - * 2. The size of the packed dimension will be padded to a multiple of 4. - * - * The "packed dimension" is determined based on the utils::GPUMemoryLayout - * argument. - */ -std::vector calculate_padded_sizes( - const std::vector& sizes, - const int32_t packed_dim); - -/* - * Calculate the image extents required of a texture backed tensor. - */ -utils::uvec3 calculate_image_extents( - const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim); - -struct LastAccess { - vkapi::PipelineStageFlags stage; - vkapi::MemoryAccessFlags access; - - LastAccess() - : stage{vkapi::PipelineStage::NO_STAGE}, - access{vkapi::MemoryAccessType::NONE} {} - - LastAccess( - vkapi::PipelineStageFlags stage_flags, - vkapi::MemoryAccessFlags access_flags) - : stage{stage_flags}, access{access_flags} {} -}; - -/* - * Calculate the number of elements that a GPU buffer would require to store the - * contents of a tensor. This will depend on the storage type and dtype of the - * tensor, as well as the features available on the device. 
- */ -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype); - -class vTensorStorage final { - public: - // Do not allow empty vTensorStorage construction - vTensorStorage() = default; - - vTensorStorage( - Context* context, - const utils::StorageType storage_type, - const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory = true); - - vTensorStorage(Context* const context, const vkapi::VulkanImage& image); - - public: - vTensorStorage(vTensorStorage& other) = delete; - vTensorStorage& operator=(const vTensorStorage& other) = delete; - - vTensorStorage(vTensorStorage&& other) = default; - vTensorStorage& operator=(vTensorStorage&& other) = default; - - ~vTensorStorage(); - - friend class vTensor; - - private: - // Context - Context* context_{}; - - utils::StorageType storage_type_; - - // Resource sizings - utils::uvec3 image_extents_{}; - int64_t buffer_length_{}; - int64_t buffer_offset_{}; - - // GPU Storage - mutable vkapi::VulkanImage image_; - mutable vkapi::VulkanBuffer buffer_; - - // Last Access - used to insert memory barriers - LastAccess last_access_; - - private: - // Registers underlying memory for cleanup - void flush(); - - // Memory barrier insertion - void transition( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags, - const vkapi::MemoryAccessFlags); - - // Validation - void verify() const; - - public: - inline size_t buffer_len() const { - return utils::safe_downcast(buffer_length_); - } - - inline VkFormat texture_format() { - return image_.format(); - } -}; - -class vTensor final { - struct TextureLimits { - // Alignment is required to conform with Vulkan specification; a 3 or 4 - // component vector with components of size N must have base alignment of - // 4N. - alignas(16) utils::ivec3 limits; - - TextureLimits(const utils::uvec3& ulimits) : limits{ulimits} {} - }; - - public: - explicit vTensor( - Context* context, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type = utils::kTexture3D, - const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, - const bool allocate_memory = true, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - vTensor(const vTensor& other) = delete; - - explicit vTensor( - Context* context, - const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * This constructor allows for the creation of a vTensor that references the - * same buffer resource of another vTensor, with the same sizes and strides - * metadata. The created vTensor will not own the underlying resource. This is - * only applicable for buffer backed tensors at the moment. - * - * Once created, the sizes and strides of the aliased vTensor can be changed - * using the `virtual_reconfigure` member function. - */ - vTensor(vTensor& other); - - /* - * This constructor allows for the creation of a vTensor that references the - * same buffer resource of another vTensor, but with different sizes and - * strides metatdata. The created vTensor will not own the underlying - * resource. This is only applicable for buffer backed tensors at the moment. 
- * - * Note that dim order is used as the source of truth regarding the strides, - * and the new strides are computed from the new sizes and new dim order. - * Thus only the dim order is provided as an argument to this function. - * - * The offset_numel argument allows the aliased tensor's memory region to - * begin at an offset of N elements from the start of the original tensor's - * buffer. - */ - vTensor( - vTensor& other, - const std::vector& sizes, - const std::vector& dim_order); - - // To discourage making copies, the copy assignment operator is still deleted - vTensor& operator=(const vTensor& other) = delete; - - vTensor(vTensor&& other) = default; - vTensor& operator=(vTensor&& other) = default; - - ~vTensor() = default; - - enum class Attribute : uint8_t { - SIZES, - WHCN_DIM_ORDER, - STRIDES, - LOGICAL_LIMITS, - NUMEL, - }; - - class UniformData { - // Contains the number of elements in the tensor according to the canonical - // sizes. - int32_t numel; - utils::ivec4 sizes_v; - utils::ivec4 dim_order_v; - utils::ivec4 strides_v; - // See the comments documenting logical_limits() for more context. - TextureLimits logical_limits; - - friend class vTensor; - - UniformData( - const size_t numel_ll, - const std::vector& sizes, - const std::vector& dim_order, - const std::vector& strides, - const utils::uvec3& limits); - - public: - /* - * Write tensor's metadata into dst, at the given dst_offset. max_dst_size - * is the size of dst and is used to avoid out of bounds writes. - */ - uint32_t write_attribute( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size, - const Attribute attr); - }; - - struct BufferMetadata { - uint32_t sizes[kTensorDimLimit]; - uint32_t dim_order[kTensorDimLimit]; - uint32_t strides[kTensorDimLimit]; - uint32_t ndim; - uint32_t numel; - - BufferMetadata( - std::vector& sizes, - std::vector& dim_order, - std::vector& strides, - size_t numel); - - void update( - std::vector& sizes, - std::vector& dim_order, - std::vector& strides, - size_t numel); - }; - - private: - /* - * "Core" tensor metadata. They are the minimum amount of information required - * to construct a tensor. - */ - - // Whether the tensor has elements of type float, int, etc. - vkapi::ScalarType dtype_; - // sizes of the tensor in NCHW dimension order - std::vector sizes_; - // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for - // width, 1 for height, etc.). For texture backed tensors, this describes - // which dimension is packed along a texel. For buffer backed tensors, this - // describes which dimension has a stride of 1 (i.e. is last in the dim - // order). - int32_t packed_dim_; - - /* - * "Layout" metadata. These describe with further detail how tensor data is - * laid out in memory. However, they are considered secondary to the "core" - * metadata members above because defaults can be assumed based on a given - * memory layout. When permuting the tensor without performing a copy, these - * metadata members are the ones that will be changed. All other metadata is - * derived from a combination of sizes, memory layout, and the below members. - */ - - // dim order of the tensor; dimension indices are in NCHW dimension order - // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger - // strides precede the dims with smaller strides in the dim order. The last - // dim is always the fastest moving dim with a stride of 1. 
- std::vector dim_order_; - // Describes which axis of an image texture each dimension of the tensor maps - // to. The axis mapping allows texture based tensors to be permuted and - // transposed without modifying the underlying texture storage. For a more in - // depth explanation of axis mapping, see the `default_axis_map()` - // function. - std::vector axis_map_; - - /* - * The below can be consider "layout" metadata as well, but are derived from - * the above data members. - */ - - // strides of the tensor in NCHW dimension order - std::vector strides_; - - // number of elements based on the canonical sizes - size_t numel_; - - // For texture backed tensors, this int32 contains the axis map data packed - // into a single int32. For buffer backed tensors, this int32 contains the - // wchn dim order data packed into a single int32. - int32_t hashed_layout_; - - // Pre-compute these quantities to avoid frequent re-computation - size_t min_nbytes_per_ubo_; - size_t max_ubo_nbytes_; - - /* - * Utility GPU buffer that can be passed to shaders in order to convey tensor - * metadata. Uniform buffer will be initialized only the first time a ubo is - * requested. Buffer offsets will be initialized the first time they are - * accessed via the corresponding *_ubo() function. Uniform buffer's contents - * will be updated whenever virtual_resize() is called. - * - * Refer to the comments for the corresponding *_ubo() functions for more - * context about the data contained in each buffer. - */ - ParamsBuffer uniforms_; - - /* - * Used to store data for BufferMetadata to pass to shaders as buffer_meta_ubo - */ - ParamsBuffer buffer_meta_; - - uint32_t uniforms_size_ = 0u; - uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; - uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; - uint32_t strides_uniform_offset = kUniformOffsetUnset; - uint32_t numel_uniform_offset_ = kUniformOffsetUnset; - uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; - - // Initial value of uniform buffer offsets. 1 is selected as it is essentially - // impossible for a ubo to have an offset of 1. - constexpr static uint32_t kUniformOffsetUnset = 1; - - std::shared_ptr storage_; - - std::shared_ptr uniform_data_; - - public: - /* - Texture Access - */ - - inline vkapi::VulkanImage& image() const& { - return storage_->image_; - } - - vkapi::VulkanImage& image( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags) &; - - vkapi::VulkanImage& image( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags, - const vkapi::MemoryAccessFlags) &; - - inline vkapi::VulkanBuffer& buffer() const& { - return storage_->buffer_; - } - - vkapi::VulkanBuffer& buffer( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags) &; - - vkapi::VulkanBuffer& buffer( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags, - const vkapi::MemoryAccessFlags) &; - - /* - Metadata - */ - - inline utils::StorageType storage_type() const { - return storage_->storage_type_; - } - - inline bool has_buffer_storage() const { - return storage_->storage_type_ == utils::kBuffer; - } - - public: - /* - * The logical limits of the tensor are derived from the image extents of the - * image texture used to store the tensor, but with two key differences. - * - * First, the image extents are permuted according to the axis map. 
This - * makes it so that the first element of the logical limit is the limit of the - * texture axis corresponding to the width dimension of the tensor, the next - * element is the limit of the texture axis corresponding to the height - * dimension and the last element is the limit of the texture axis that - * corresponds to the channels dimension of the tensor. - * - * Second, the logical limits may use smaller extents than the actual image - * extents of the image texture. This is due to dynamic shape; if the tensor's - * `virtual_resize()` function is called, then the logical limits will reflect - * the extents that would be needed to support a tensor with the updated sizes - * instead of the original sizes. - */ - inline const utils::ivec3& logical_limits() const { - return uniform_data_->logical_limits.limits; - } - - /* - * Extract an `vkapi::ScalarType` from the TensorOptions member - */ - inline vkapi::ScalarType dtype() const { - return dtype_; - } - - /* - * Provide a "best guess" of a memory layout that can be used to construct a - * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this - * tensor. In some scenarios, the exact layout of the tensor may not be able - * to be replicated due to calling `virtual_*()` functions after construction; - * however, this function will provide a memory layout that will produce the - * same `packed_dim_` as this tensor. - */ - utils::GPUMemoryLayout estimate_memory_layout() const; - - inline int32_t packed_dim() const { - return packed_dim_; - } - - /* - * Returns the WHCN index of the dimension that is used to concatenate batches - * as an int32_t. - */ - inline int32_t concat_dim() const { - return utils::safe_downcast(axis_map_.at(3)); - } - - inline const std::vector& sizes() const { - return sizes_; - } - - inline const int64_t size(size_t dim) const { - return sizes().at(dim); - } - - inline const int64_t dim() const { - return sizes_.size(); - } - - inline const std::vector& dim_order() const { - return dim_order_; - } - - inline const std::vector& strides() const { - return strides_; - } - - inline size_t numel() const { - return numel_; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - inline const std::vector& axis_map() const { - return axis_map_; - } - - /* - * For texture backed tensors, this function return a int32_t that contains - * the axis map + packed dimension. Each element of the axis map occupies 4 - * bits of the int32. - * - * For buffer backed tensors, the int32_t contains the WHCN dim order, where - * each element of the dim order array occupies 4 bits of the int32. - * - * This int32 is typically consumed as a specialization constant in compute - * shaders where it is subsequently unpacked. The layout data of a vTensor - * instance is typically static once created, which is why this method is - * appropriate. - */ - inline int32_t hashed_layout() const { - return hashed_layout_; - } - - /* - * Return true if the tensor's axis map is {0, 1, 2, concat_dim}. This means - * that the width dim is mapped to the width axis of the texture, the height - * dim is mapped to the height axis of the texture, the channels dim is mapped - * to the depth axis of the texture. - */ - inline bool has_standard_axis_map() const { - return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; - } - - /* - * Return true if a buffer backed tensor's dim order matches that of a - * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. 
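The hashed layout described above packs a handful of small values into one int32 at 4 bits per element. The exact bit ordering consumed by the shaders is not reproduced here; the sketch below only illustrates the general nibble-packing idea in Python, with made-up helper names and example values.

```python
def pack_nibbles(values):
    """Pack small non-negative values (each < 16) into one integer,
    4 bits per element; element i occupies bits [4*i, 4*i + 4)."""
    packed = 0
    for i, v in enumerate(values):
        assert 0 <= v < 16
        packed |= (v & 0xF) << (4 * i)
    return packed

def unpack_nibble(packed, i):
    """Extract element i (4 bits) from a packed integer."""
    return (packed >> (4 * i)) & 0xF

# e.g. an axis map of {0, 1, 2, 2} followed by a packed dim of 0:
layout = pack_nibbles([0, 1, 2, 2, 0])
print(hex(layout))                                    # 0x2210
print([unpack_nibble(layout, i) for i in range(5)])   # [0, 1, 2, 2, 0]
```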
- * Returns false for texture backed tensors. - */ - bool is_contiguous() const; - - private: - inline size_t nbytes_per_ubo() const { - return storage_->context_->adapter_ptr()->min_ubo_alignment(); - } - - size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; - - template - const vkapi::BufferBindInfo metadata_ubo_impl( - uint32_t* param_buffer_offset, - const T& data) { - if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); - } - size_t ubo_nbytes = utils::align_up(sizeof(data), min_nbytes_per_ubo_); - if (*param_buffer_offset == kUniformOffsetUnset) { - VK_CHECK_COND( - (uniforms_size_ + ubo_nbytes) <= max_ubo_nbytes_, - "Uniform data allocation has exceeded Tensor uniform buffer size"); - *param_buffer_offset = uniforms_size_; - uniforms_size_ += ubo_nbytes; - uniforms_.update(data, *param_buffer_offset); - } - return vkapi::BufferBindInfo( - uniforms_.buffer(), *param_buffer_offset, ubo_nbytes); - } - - public: - /* - * The functions below return the buffer binding info for a UBO that contains - * some metadata of the tensor, which can be used to pass in tensor metadata - * to a compute shader. The other method of passing in tensor metadata is via - * push constants. The trade-off between each is that push constants may be - * slightly more performant and memory efficient; however, to update the - * values in a push constant due to i.e. a tensor resize between inferences, - * the command buffer must be re-encoded. On the other hand, UBOs can update - * their data by writing to their mapped memory without requiring a command - * buffer re-encode. - */ - - const vkapi::BufferBindInfo sizes_ubo(); - - const vkapi::BufferBindInfo dim_order_ubo(); - - const vkapi::BufferBindInfo strides_ubo(); - - const vkapi::BufferBindInfo logical_limits_ubo(); - - const vkapi::BufferBindInfo numel_ubo(); - - const vkapi::BufferBindInfo buffer_meta_ubo(); - - public: - inline size_t staging_buffer_numel() const { - return storage_->buffer_len(); - } - - inline size_t staging_buffer_nbytes() const { - return element_size(dtype()) * staging_buffer_numel(); - } - - /* - * Return the VmaAllocationCreateInfo of the underlying resource - */ - VmaAllocationCreateInfo get_allocation_create_info() const; - - /* - * Checks if the tensor's underlying buffer or image resource is bound to a - * memory allocation. - */ - bool memory_is_bound() const; - - /* - * Return the VkMemoryRequirements of the underlying resource - */ - VkMemoryRequirements get_memory_requirements() const; - - /* - * Binds the underlying resource to the given memory allocation - */ - void bind_allocation(const vkapi::Allocation& allocation); - - /* - * Binds and acquires a rvalue memory allocation - */ - void acquire_allocation(vkapi::Allocation&& allocation); - - private: - /* - * Assuming sizes, dim order, or axis mapping was modified, recompute all - * derived metadata and update metadata UBO with new values. - */ - void update_metadata(); - - /* - * Check that tensor sizes are valid given the current storage resource's - * limits. - */ - void check_sizes(const std::vector& sizes) const; - - public: - /* - * Change how the tensor should be interpreted by compute shaders via updating - * the size and dim order of the tensor. The new sizes and dim order may have - * different dimensionality than the current dimensionality of the tensor. - * - * This function can only be used for buffer-backed tensors, since texture - * backed buffers cannot change dimensionality or memory layout. 
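For context on the uniform-buffer sub-allocation performed by `metadata_ubo_impl` above: each metadata entry is rounded up to the device's minimum UBO alignment and assigned the next free offset within one shared uniform buffer. The Python below models only that arithmetic (buffer creation and the Vulkan bindings are omitted), and the class name is invented for illustration.

```python
def align_up(n, alignment):
    """Round n up to the next multiple of alignment."""
    return ((n + alignment - 1) // alignment) * alignment

class UboSuballocator:
    """Hands out aligned offsets within a single fixed-size uniform buffer."""

    def __init__(self, max_nbytes, min_alignment):
        self.max_nbytes = max_nbytes
        self.min_alignment = min_alignment
        self.used = 0

    def allocate(self, nbytes):
        entry_nbytes = align_up(nbytes, self.min_alignment)
        if self.used + entry_nbytes > self.max_nbytes:
            raise RuntimeError("Uniform data allocation exceeded buffer size")
        offset = self.used
        self.used += entry_nbytes
        return offset, entry_nbytes

# e.g. a 256-byte buffer on a device whose minimum UBO alignment is 64 bytes:
ubos = UboSuballocator(max_nbytes=256, min_alignment=64)
print(ubos.allocate(16))   # (0, 64): a 16-byte ivec4 still occupies 64 bytes
print(ubos.allocate(48))   # (64, 64)
```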
- * - * TODO(ssjia): delete this API. prefer functions such as virtual_transpose - * instead. - */ - void virtual_reconfigure( - const std::vector& new_sizes, - const std::vector& new_dim_order); - - /* - * Set all metadata of this tensor to match the metadata of another tensor. - */ - void virtual_clone(const vTensor& other); - - /* - * Perform a virtual resize of the vTensor by modifying the size metadata that - * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. The new sizes cannot - * modify the dimensionality of the tensor. - */ - void virtual_resize(const std::vector& new_sizes); - - /* - * Transpose the tensor in-place by updating its metadata. - */ - void virtual_transpose(const int64_t dim0, const int64_t dim1); - - /* - * Check if this vTensor instance is a view of another vTensor instance - */ - inline bool is_view_of(const vTensor& other) const { - return storage_.get() == other.storage_.get(); - } - - const std::shared_ptr& get_uniform_data() const { - VK_CHECK_COND(sizes_.size() <= 4); - return uniform_data_; - } -}; - -static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; -static constexpr vTensor::Attribute kTensorDimOrder = - vTensor::Attribute::WHCN_DIM_ORDER; -static constexpr vTensor::Attribute kTensorStrides = - vTensor::Attribute::STRIDES; -static constexpr vTensor::Attribute kTensorLogicalLimits = - vTensor::Attribute::LOGICAL_LIMITS; -static constexpr vTensor::Attribute kTensorNumel = vTensor::Attribute::NUMEL; - -/* - * Prepare tensor metadata vector for consumption on the GPU: - * 1. Convert NCHW dim order and indexes to WCHN dim order and indexes - * 2. Unsqueeze to the next multiple of 4 dims - * 3. Convert to requested output dtype - */ -template < - typename T, - typename std::enable_if::value, int>::type = 0> -std::vector flip_and_unsqueeze( - const std::vector& tensor_metadata, - const vTensor::Attribute metadata_type, - const size_t numel, - const int32_t fixed_ndim = -1) { - const size_t ndim = tensor_metadata.size(); - size_t ndim_up4 = - std::max(utils::align_up_4(tensor_metadata.size()), size_t(4)); - - if (fixed_ndim > 0) { - VK_CHECK_COND(fixed_ndim >= ndim); - ndim_up4 = static_cast(fixed_ndim); - } - - std::vector flipped_metadata(ndim_up4); - - for (int flipped_i = 0; flipped_i < ndim; ++flipped_i) { - T val_at_dim = - utils::safe_downcast(tensor_metadata.at(ndim - 1 - flipped_i)); - if (metadata_type == kTensorDimOrder) { - val_at_dim = utils::safe_downcast(ndim - 1 - val_at_dim); - } - flipped_metadata.at(flipped_i) = val_at_dim; - } - - switch (metadata_type) { - case kTensorStrides: - for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) { - flipped_metadata.at(unsqueezed_i) = utils::safe_downcast(numel); - } - break; - case kTensorDimOrder: - for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) { - flipped_metadata.at(unsqueezed_i) = - utils::safe_downcast(unsqueezed_i); - } - break; - // Default: unsqueeze with ones - default: - for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) { - flipped_metadata.at(unsqueezed_i) = utils::safe_downcast(1); - } - break; - } - - return flipped_metadata; -} - -/* - * Same as flip and unsqueeze, but returns the metadata as an `ivec4`. 
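Because the padding value differs per metadata type, a worked example of the flip-and-unsqueeze transformation helps. The snippet below is an illustrative Python re-expression of the template above, not the C++ implementation itself.

```python
def flip_and_unsqueeze(metadata, kind, numel, fixed_ndim=-1):
    """Flip NCHW-ordered metadata to WHCN order and pad to a multiple of 4.

    kind is one of "sizes", "dim_order", "strides"; dim-order entries are
    also re-indexed so that they refer to WHCN dimension indices.
    """
    ndim = len(metadata)
    ndim_up4 = max(-(-ndim // 4) * 4, 4)  # align ndim up to a multiple of 4
    if fixed_ndim > 0:
        assert fixed_ndim >= ndim
        ndim_up4 = fixed_ndim

    out = [0] * ndim_up4
    for i in range(ndim):
        val = metadata[ndim - 1 - i]
        if kind == "dim_order":
            val = ndim - 1 - val
        out[i] = val
    # Padding: strides pad with numel, dim order with the next index, sizes with 1
    for i in range(ndim, ndim_up4):
        out[i] = {"strides": numel, "dim_order": i}.get(kind, 1)
    return out

print(flip_and_unsqueeze([2, 3, 4], "sizes", 24))       # [4, 3, 2, 1]
print(flip_and_unsqueeze([0, 1, 2], "dim_order", 24))   # [0, 1, 2, 3]
print(flip_and_unsqueeze([12, 4, 1], "strides", 24))    # [1, 4, 12, 24]
```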
- */ -utils::ivec4 flip_and_unsqueezed_ivec4( - const std::vector& tensor_metadata, - const vTensor::Attribute metadata_type, - const size_t numel); - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py deleted file mode 100644 index 3f2d616b428..00000000000 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ /dev/null @@ -1,1450 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import argparse -import array -import codecs -import copy -import glob -import hashlib -import io -import os -import re -import shutil -import sys -from itertools import product -from multiprocessing.pool import ThreadPool - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -import subprocess -import textwrap -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Set, Tuple, Union - -import yaml -from yaml.constructor import ConstructorError -from yaml.nodes import MappingNode - -try: - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader # type: ignore[assignment, misc] - -CPP_H_NAME = "spv.h" -CPP_SRC_NAME = "spv.cpp" - -# Basic configuration settings for shaders -DEFAULT_ENV: Dict[str, Any] = { - "PRECISION": "highp", - # B is shorthand for "binding". This is used to automatically increment the - # layout binding index when declaring layout bindings. Note that a container - # type is used because integers are immutable in Python. - "B": [0], - # C is shorthand for "constant_id". This is used to automatically increment the - # constant_id index for specialization constants. - # Note that it starts at 3, as 0-2 are reserved for local workgroup size ids. 
- "C": [3], -} - -# Establishes relationships between different tensor types and different GLSL types -TYPE_MAPPINGS: Dict[str, Any] = { - "IMAGE_T": { - 3: { - "double": "image3D", - "float": "image3D", - "half": "image3D", - # integer dtypes - "int8": "iimage3D", - "uint8": "uimage3D", - "int16": "iimage3D", - "uint16": "uimage3D", - "int32": "iimage3D", - "uint32": "uimage3D", - "int64": "iimage3D", - "uint64": "uimage3D", - # common dtype aliases - "bool": "uimage3D", - "int": "iimage3D", - "uint": "uimage3D", - }, - 2: { - "double": "image2D", - "float": "image2D", - "half": "image2D", - # integer dtypes - "int8": "iimage2D", - "uint8": "uimage2D", - "int16": "iimage2D", - "uint16": "uimage2D", - "int32": "iimage2D", - "uint32": "uimage2D", - "int64": "iimage2D", - "uint64": "uimage2D", - # common dtype aliases - "bool": "uimage2D", - "int": "iimage2D", - "uint": "uimage2D", - }, - }, - "SAMPLER_T": { - 3: { - "double": "sampler3D", - "float": "sampler3D", - "half": "sampler3D", - # integer dtypes - "int8": "isampler3D", - "uint8": "usampler3D", - "int16": "isampler3D", - "uint16": "usampler3D", - "int32": "isampler3D", - "uint32": "usampler3D", - "int64": "isampler3D", - "uint64": "usampler3D", - # common dtype aliases - "bool": "usampler3D", - "int": "isampler3D", - "uint": "usampler3D", - }, - 2: { - "double": "sampler2D", - "float": "sampler2D", - "half": "sampler2D", - # integer dtypes - "int8": "isampler2D", - "uint8": "usampler2D", - "int16": "isampler2D", - "uint16": "usampler2D", - "int32": "isampler2D", - "uint32": "usampler2D", - "int64": "isampler2D", - "uint64": "usampler2D", - # common dtype aliases - "bool": "usampler2D", - "int": "isampler2D", - "uint": "usampler2D", - }, - }, - "IMAGE_FORMAT": { - "double": "rgba32f", - "float": "rgba32f", - "half": "rgba16f", - # integer dtypes - "int8": "rgba8i", - "uint8": "rgba8ui", - "int16": "rgba16i", - "uint16": "rgba16ui", - "int32": "rgba32i", - "uint32": "rgba32ui", - "int64": "rgba32i", - "uint64": "rgba32ui", - # common dtype aliases - "bool": "rgba8ui", - "int": "rgba32i", - "uint": "rgba32ui", - }, -} - - -def define_variable(name: str) -> str: - if name in locals(): - return f"#define {name} {locals()[name]}" - elif name in globals(): - return f"#define {name} {globals()[name]}" - else: - raise RuntimeError(f"{name} is not defined") - - -def buffer_scalar_type(dtype: str) -> str: - if dtype == "half": - return "float16_t" - elif dtype == "float": - return "float" - elif dtype == "double": - return "float64_t" - # integer dtype alias conversion - elif dtype == "bool": - return "uint8_t" - # we don't want to append _t for int32 or uint32 as int is already 32bit - elif dtype == "int32" or dtype == "uint32": - return "int" if dtype == "int32" else "uint" - elif dtype[-1].isdigit(): - return dtype + "_t" - return dtype - - -def buffer_gvec_type(dtype: str, n: int) -> str: - if n == 1: - return buffer_scalar_type(dtype) - - dtype_map = { - "half": f"f16vec{n}", - "float": f"vec{n}", - "double": f"vec{n}", # No 64bit image format support in GLSL - "int8": f"i8vec{n}", - "uint8": f"u8vec{n}", - "int16": f"i16vec{n}", - "uint16": f"u16vec{n}", - "int32": f"ivec{n}", - "int": f"ivec{n}", - "uint32": f"uvec{n}", - "uint": f"uvec{n}", - "int64": f"ivec{n}", # No 64bit image format support in GLSL - "uint64": f"uvec{n}", # No 64bit image format support in GLSL - "bool": f"u8vec{n}", - } - - vector_type = dtype_map.get(dtype) - if vector_type is None: - raise AssertionError(f"Invalid dtype: {dtype}") - - return vector_type - - -def 
texel_type(dtype: str) -> str: - image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] - if image_format[-1:] == "f": - return "vec4" - elif image_format[-2:] == "ui": - return "uvec4" - elif image_format[-1:] == "i": - return "ivec4" - raise AssertionError(f"Invalid image format: {image_format}") - - -def gvec_type(dtype: str, n: int) -> str: - gvec4_type = texel_type(dtype) - return gvec4_type[:-1] + str(n) - - -def texel_component_type(dtype: str) -> str: - vec4_type = texel_type(dtype) - if vec4_type[:3] == "vec": - return "float" - elif vec4_type[:4] == "ivec": - return "int" - elif vec4_type[:4] == "uvec": - return "uint" - raise AssertionError(f"Invalid vec4 type: {vec4_type}") - - -def texel_load_type(dtype: str, storage_type: str) -> str: - if storage_type.lower() == "buffer": - return buffer_gvec_type(dtype, 4) - else: - return texel_type(dtype) - - -def texel_load_component_type(dtype: str, storage_type: str) -> str: - if storage_type.lower() == "buffer": - return buffer_scalar_type(dtype) - else: - return texel_component_type(dtype) - - -def get_access_qualifier(access_type: Optional[str]) -> str: - if access_type is None: - return "" - if access_type.lower() == "r": - return "readonly" - if access_type.lower() == "w": - return "writeonly" - if access_type.lower() == "rw": - return "" - - raise AssertionError(f"Invalid access type: {access_type}") - - -def get_slot_val(slot: Union[int, List[int]]) -> int: - if isinstance(slot, list): - return slot[0] - return slot - - -def layout_declare_buffer( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - precision: str = "PRECISION", - is_scalar_array: bool = True, -) -> str: - array_type = buffer_gvec_type(dtype, 4) - if is_scalar_array: - array_type = buffer_scalar_type(dtype) - - out_str = f""" -layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ - {array_type} {var_name}[]; -}}; -""" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return out_str - - -def layout_declare_image( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - precision: str = "PRECISION", - image_ndim: int = 3, -) -> str: - image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] - image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - - ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return ret_str - - -def layout_declare_sampler( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - precision: str = "PRECISION", - access_qualifier: Optional[str] = None, - image_ndim: int = 3, -) -> str: - sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - - ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return ret_str - - -def layout_declare_tensor( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - storage_type: str, - is_scalar_array: bool = True, - precision: str = "PRECISION", -) -> str: - assert storage_type.lower() in ["buffer", "texture3d", "texture2d"] - - image_ndim = 3 - if storage_type.lower() == "texture2d": - image_ndim = 2 - - # Create buffer binding - if storage_type.lower() == "buffer": - return layout_declare_buffer( - slot, - access_type, - var_name, - 
dtype, - precision, - is_scalar_array=is_scalar_array, - ) - - # Create image/sampler binding - if access_type.lower() == "r": - return layout_declare_sampler( - slot, access_type, var_name, dtype, precision, image_ndim=image_ndim - ) - else: - return layout_declare_image( - slot, access_type, var_name, dtype, precision, image_ndim=image_ndim - ) - - -def layout_declare_ubo( - slot: Union[int, List[int]], *args, precision: str = "PRECISION" -) -> str: - assert len(args) % 2 == 0 - - var_list = list(zip(args[::2], args[1::2])) - - ubo_name = "" - for _, var_name in var_list: - ubo_name += var_name + "_" - - out_str = f""" -layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ -""" - for type_name, var_name in var_list: - out_str += f" {type_name} {var_name};\n" - out_str += "};" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return out_str - - -def layout_declare_spec_const( - slot: Union[int, List[int]], - type_name: str, - var_name: str, - initial_val: Optional[str] = None, -) -> str: - assert type_name in ["int", "uint", "float", "bool"] - - out_str = f"layout(constant_id = {get_slot_val(slot)}) const {type_name} {var_name}" - if initial_val is not None: - out_str += f" = {initial_val}" - out_str += ";" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return out_str - - -def define_active_storage_type(storage_type: str): - if storage_type.lower() == "buffer": - return "#define USING_BUFFER" - elif storage_type.lower() == "texture3d": - return "#define USING_TEXTURE3D" - elif storage_type.lower() == "texture2d": - return "#define USING_TEXTURE2D" - else: - raise AssertionError(f"Invalid storage type: {storage_type}") - - -def define_required_extensions(dtypes: Union[str, List[str]]): - out_str = "\n" - dtype_list = dtypes if isinstance(dtypes, list) else [dtypes] - - for dtype in dtype_list: - nbit = None - glsl_type = None - if dtype == "half": - nbit = "16bit" - glsl_type = "float16" - elif dtype == "double": - # We only need to allow float64_t type usage - glsl_type = "float64" - elif dtype in ["int8", "uint8", "bool"]: - nbit = "8bit" - glsl_type = "int8" - elif dtype in ["int16", "uint16"]: - nbit = "16bit" - glsl_type = "int16" - elif dtype in ["int64", "uint64"]: - # We only need to allow int64_t and uint64_t type usage - glsl_type = "int64" - - if nbit is not None: - out_str += f"#extension GL_EXT_shader_{nbit}_storage : require\n" - if glsl_type is not None: - out_str += f"#extension GL_EXT_shader_explicit_arithmetic_types_{glsl_type} : require\n" - - return out_str - - -UTILITY_FNS: Dict[str, Any] = { - "macro_define": define_variable, - "get_pos": { - 3: lambda pos: pos, - 2: lambda pos: f"{pos}.xy", - }, - "buffer_scalar_type": buffer_scalar_type, - "buffer_gvec_type": buffer_gvec_type, - "texel_type": texel_type, - "gvec_type": gvec_type, - "texel_component_type": texel_component_type, - "texel_load_type": texel_load_type, - "texel_load_component_type": texel_load_component_type, - "layout_declare_buffer": layout_declare_buffer, - "layout_declare_image": layout_declare_image, - "layout_declare_sampler": layout_declare_sampler, - "layout_declare_tensor": layout_declare_tensor, - "layout_declare_ubo": layout_declare_ubo, - "layout_declare_spec_const": layout_declare_spec_const, - "define_active_storage_type": define_active_storage_type, - "define_required_extensions": define_required_extensions, -} - - -def extract_filename(path: str, keep_ext: bool = True) -> Any: - if keep_ext: - return os.path.basename(path) 
- else: - return os.path.basename(path).split(".")[0] - - -def extract_extension(path: str) -> str: - return os.path.splitext(extract_filename(path))[1][1:] - - -############################ -# SPIR-V Code Generation # -############################ - - -# https://gist.github.com/pypt/94d747fe5180851196eb -class UniqueKeyLoader(Loader): - def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] - if not isinstance(node, MappingNode): - raise ConstructorError( - None, - None, - f"expected a mapping node, but found {node.id}", - node.start_mark, - ) - mapping = {} - for key_node, value_node in node.value: - key = self.construct_object(key_node, deep=deep) # type: ignore[no-untyped-call] - try: - hash(key) - except TypeError as e: - raise ConstructorError( - "while constructing a mapping", - node.start_mark, - "found unacceptable key ", - key_node.start_mark, - ) from e - # check for duplicate keys - if key in mapping: - raise ConstructorError( - "while constructing a mapping", - node.start_mark, - "found duplicate key", - key_node.start_mark, - ) - value = self.construct_object(value_node, deep=deep) # type: ignore[no-untyped-call] - mapping[key] = value - return mapping - - -# https://github.com/google/XNNPACK/blob/master/tools/xngen.py -def extract_leading_whitespace(line: str) -> str: - match = re.match(r"\s*", line) - return match.group(0) if match else "" - - -# https://github.com/google/XNNPACK/blob/master/tools/xngen.py -def escape(line: str) -> str: - output_parts = [] - while "${" in line: - start_pos = line.index("${") - end_pos = line.index("}", start_pos + 2) - if start_pos != 0: - output_parts.append('"' + line[:start_pos].replace('"', '\\"') + '"') - output_parts.append("str(" + line[start_pos + 2 : end_pos] + ")") - line = line[end_pos + 1 :] - if line: - output_parts.append('"' + line.replace('"', '\\"') + '"') - return " + ".join(output_parts) - - -# https://github.com/google/XNNPACK/blob/master/tools/xngen.py -def preprocess( - input_text: str, variables: Dict[str, Any], input_path: str = "codegen" -) -> str: - # Workaround to handle source files using \ to extend mecros to a new line - input_text = re.sub(r"\\$", r"\\\\", input_text, flags=re.MULTILINE) - - input_lines = input_text.splitlines() - python_lines = [] - - blank_lines = 0 - - last_indent = "" - - # List of tuples (total_index, python_indent) - indent_stack = [("", "")] - - # Indicates whether this is the first line inside Python - # code block (i.e. for, while, if, elif, else) - python_block_start = True - for input_line in input_lines: - if input_line == "": - blank_lines += 1 - continue - # Skip lint markers. 
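The `${...}` interpolation performed by `escape()` above is the heart of the shader template preprocessor: literal text becomes quoted strings, each `${expr}` becomes `str(expr)`, and the resulting expression is later executed with the codegen parameters in scope. The simplified, self-contained sketch below (the `interpolate` name is made up) shows that effect end to end.

```python
import io

def interpolate(line):
    """Simplified version of escape(): turn one template line containing
    ${...} spans into a Python expression string."""
    parts = []
    while "${" in line:
        start = line.index("${")
        end = line.index("}", start + 2)
        if start != 0:
            parts.append('"' + line[:start].replace('"', '\\"') + '"')
        parts.append("str(" + line[start + 2 : end] + ")")
        line = line[end + 1 :]
    if line:
        parts.append('"' + line.replace('"', '\\"') + '"')
    return " + ".join(parts)

expr = interpolate("#define PRECISION ${PRECISION}")
print(expr)  # "#define PRECISION " + str(PRECISION)

# preprocess() wraps such expressions in print(..., file=OUT_STREAM) and
# exec()s them with the shader's codegen parameters in scope:
out = io.StringIO()
exec(f"print({expr}, file=OUT)", {"PRECISION": "highp", "OUT": out})
print(out.getvalue().strip())  # #define PRECISION highp
```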
- if "LINT" in input_line: - continue - - input_indent = extract_leading_whitespace(input_line) - if python_block_start: - assert input_indent.startswith(last_indent) - extra_python_indent = input_indent[len(last_indent) :] - python_indent = indent_stack[-1][1] + extra_python_indent - indent_stack.append((input_indent, python_indent)) - assert input_indent.startswith(indent_stack[-1][0]) - else: - while not input_indent.startswith(indent_stack[-1][0]): - del indent_stack[-1] - python_block_start = False - - python_indent = indent_stack[-1][1] - stripped_input_line = input_line.strip() - if stripped_input_line.startswith("$") and not stripped_input_line.startswith( - "${" - ): - if stripped_input_line.endswith(":"): - python_block_start = True - while blank_lines != 0: - python_lines.append(python_indent + "print(file=OUT_STREAM)") - blank_lines -= 1 - python_lines.append(python_indent + stripped_input_line.replace("$", "")) - else: - assert input_line.startswith(python_indent) - while blank_lines != 0: - python_lines.append(python_indent + "print(file=OUT_STREAM)") - blank_lines -= 1 - python_lines.append( - python_indent - + "print(%s, file=OUT_STREAM)" - % escape(input_line[len(python_indent) :]) - ) - last_indent = input_indent - - while blank_lines != 0: - python_lines.append(python_indent + "print(file=OUT_STREAM)") - blank_lines -= 1 - - exec_globals = dict(variables) - output_stream = io.StringIO() - exec_globals["OUT_STREAM"] = output_stream - - python_bytecode = compile("\n".join(python_lines), input_path, "exec") - exec(python_bytecode, exec_globals) - - return output_stream.getvalue() - - -class SPVGenerator: - def __init__( - self, - src_dir_paths: Union[str, List[str]], - env: Dict[Any, Any], - glslc_path: Optional[str], - glslc_flags: str = "", - replace_u16vecn: bool = False, - ) -> None: - if isinstance(src_dir_paths, str): - self.src_dir_paths = [src_dir_paths] - else: - self.src_dir_paths = src_dir_paths - - self.env = env - self.glslc_path = glslc_path - self.glslc_flags = glslc_flags.split() - self.glslc_flags_no_opt = self.glslc_flags.copy() - if "-O" in self.glslc_flags_no_opt: - self.glslc_flags_no_opt.remove("-O") - if "-Os" in self.glslc_flags_no_opt: - self.glslc_flags_no_opt.remove("-Os") - self.replace_u16vecn = replace_u16vecn - - self.src_files: Dict[str, str] = {} - self.template_yaml_files: List[str] = [] - - self.addSrcAndYamlFiles(self.src_dir_paths) - self.shader_template_params: Dict[Any, Any] = {} - for yaml_file in self.template_yaml_files: - self.parseTemplateYaml(yaml_file) - - self.output_file_map: Dict[str, Tuple[str, Dict[str, str]]] = {} - self.constructOutputMap() - - def addSrcAndYamlFiles(self, src_dir_paths: List[str]) -> None: - for src_path in src_dir_paths: - # Collect glsl source files - src_files_list = glob.glob( - os.path.join(src_path, "**", "*.[gh]lsl*"), recursive=True - ) + glob.glob(os.path.join(src_path, "**", "*.h"), recursive=True) - for file in src_files_list: - if len(file) > 1: - self.src_files[extract_filename(file, keep_ext=False)] = file - # Collect template yaml files - yaml_files = glob.glob( - os.path.join(src_path, "**", "*.yaml"), recursive=True - ) - for file in yaml_files: - if len(file) > 1: - self.template_yaml_files.append(file) - - def generateVariantCombinations( - self, - iterated_params: Dict[str, Any], - exclude_params: Optional[Set[str]] = None, - ) -> List[Any]: - if exclude_params is None: - exclude_params = set() - all_iterated_params = [] - for param_name, value_list in iterated_params.items(): - if 
param_name not in exclude_params: - param_values = [] - for value in value_list: - if "RANGE" in value: - value_range = value["RANGE"] - suffix = value.get("SUFFIX", "") - if isinstance(value_range, list) and len(value_range) == 2: - for i in range(value_range[0], value_range[1] + 1): - curr_suffix = ( - suffix + "_" + str(i) if suffix else str(i) - ) - param_values.append((param_name, curr_suffix, i)) - else: - raise ValueError( - f"{value['RANGE']} is not a valid range. Must be in format [start, end] (inclusive)." - ) - - elif "VALUE" in value: - suffix = value.get("SUFFIX", value["VALUE"]) - if value["VALUE"] in ["int", "uint"]: - raise ValueError( - f"Use int32 or uint32 instead of {value['VALUE']}" - ) - param_values.append((param_name, suffix, value["VALUE"])) - - else: - raise KeyError( - "Parameter must be 'VALUE: string' or 'RANGE: [a, b]'" - ) - - all_iterated_params.append(param_values) - - return list(product(*all_iterated_params)) - - def parseTemplateYaml(self, yaml_file: str) -> None: - with open(yaml_file) as f: - contents = yaml.load(f, Loader=UniqueKeyLoader) - for template_name, params_dict in contents.items(): - if template_name in self.shader_template_params: - raise KeyError(f"{template_name} params file is defined twice") - - default_params = params_dict["parameter_names_with_default_values"] - default_params["YAML_SRC_FULLPATH"] = yaml_file - params_names = set(default_params.keys()).union({"NAME"}) - - self.shader_template_params[template_name] = [] - - default_iterated_params = params_dict.get( - "generate_variant_forall", None - ) - - for variant in params_dict["shader_variants"]: - default_iterated_params_names = set( - default_iterated_params.keys() - if default_iterated_params is not None - else {} - ) - variant_params_names = set(variant.keys()) - - invalid_keys = ( - variant_params_names - - default_iterated_params_names - - params_names - - {"generate_variant_forall"} - ) - assert len(invalid_keys) == 0 - - iterated_params = variant.get( - "generate_variant_forall", default_iterated_params - ) - - if iterated_params is not None: - variant_combinations = self.generateVariantCombinations( - iterated_params, variant_params_names - ) - - for combination in variant_combinations: - default_params_copy = copy.deepcopy(default_params) - for key in variant: - if key != "generate_variant_forall": - default_params_copy[key] = variant[key] - - variant_name = variant["NAME"] - for param_value in combination: - default_params_copy[param_value[0]] = param_value[2] - if len(str(param_value[1])) > 0: - variant_name = f"{variant_name}_{param_value[1]}" - - default_params_copy["NAME"] = variant_name - default_params_copy["VARIANT_NAME"] = variant["NAME"] - - self.shader_template_params[template_name].append( - default_params_copy - ) - else: - default_params_copy = copy.deepcopy(default_params) - for key in variant: - default_params_copy[key] = variant[key] - - self.shader_template_params[template_name].append( - default_params_copy - ) - - def create_shader_params( - self, variant_params: Optional[Dict[str, Any]] = None - ) -> Dict[str, str]: - if variant_params is None: - variant_params = {} - shader_params = copy.deepcopy(self.env) - for key, value in variant_params.items(): - shader_params[key] = value - - return shader_params - - def constructOutputMap(self) -> None: - for src_name, params in self.shader_template_params.items(): - for variant in params: - src_file_fullpath = self.src_files[src_name] - - self.output_file_map[variant["NAME"]] = ( - src_file_fullpath, - 
self.create_shader_params(variant), - ) - - for src_name, src_file_fullpath in self.src_files.items(): - if src_name not in self.shader_template_params: - self.output_file_map[src_name] = ( - src_file_fullpath, - self.create_shader_params(), - ) - - def maybe_replace_u16vecn(self, input_text: str) -> str: - """ - There is a latency benefit to using u16vecn variables to store texture position - variables instead of ivecn, likely due to reduced register pressure. However, - SwiftShader does not support 16 bit integer types in shaders, so this is a crude - way to fallback to using ivecn to store texture positions so that testing with - SwiftShader is still possible. - """ - if not self.replace_u16vecn: - return input_text - if "codegen-nosub" in input_text: - return input_text - - # Remove extension requirement so that generated ShaderInfo does not mark it - input_text = input_text.replace( - "#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require", "" - ) - input_text = input_text.replace("u16vec", "ivec") - input_text = input_text.replace("uint16_t", "int") - return input_text - - def get_md5_checksum(self, file_path: str) -> str: - # Use a reasonably sized buffer for better performance with large files - BUF_SIZE = 65536 # 64kb chunks - - md5 = hashlib.md5() - - with open(file_path, "rb") as f: - while True: - data = f.read(BUF_SIZE) - if not data: - break - md5.update(data) - - # Get the hexadecimal digest and compare - file_md5 = md5.hexdigest() - return file_md5 - - def generateSPV( # noqa: C901 - self, - output_dir: str, - cache_dir: Optional[str] = None, - force_rebuild: bool = False, - ) -> Dict[str, str]: - # The key of this dictionary is the full path to a generated source file. The - # value is a tuple that contains 3 entries: - # - # 1. A bool indicationg if the file has changed since the last compilation; this - # is determined by comparing against the cached version. - # 2. List of other source files included by the generated file. - gen_file_meta: Dict[str, Tuple[bool, List[str], str]] = {} - - # Return value of the function mapping the abspath of compiled SPIR-V binaries - # to the abspath of the generated GLSL file they were compiled from. - spv_to_glsl_map: Dict[str, str] = {} - - # Convert output_dir to absolute path - assert os.path.exists(output_dir) - output_dir = os.path.abspath(output_dir) - - if cache_dir is not None: - assert os.path.exists(cache_dir) - - def get_glsl_includes(glsl_text): - """ - Parse GLSL text content and return a list of included files. 
- - Args: - glsl_text: String containing the GLSL file content to analyze - - Returns: - List of included file names (e.g., ["random.h"]) - """ - includes = [] - for line in glsl_text.splitlines(): - # Look for #include directives with quoted filenames - # Matches: #include "filename.h" or #include - include_match = re.match( - r'^\s*#include\s+[<"]([^>"]+)[>"]', line.strip() - ) - if include_match: - includes.append(include_match.group(1)) - - return includes - - def file_has_changed(gen_file_path, cached_file_path): - # If the file does not exist in the cache, then return True - if not os.path.exists(cached_file_path): - return True - current_checksum = self.get_md5_checksum(gen_file_path) - cached_checksum = self.get_md5_checksum(cached_file_path) - return current_checksum != cached_checksum - - def any_sources_changed(gen_file_path, output_dir): - """ - Given the path to a generated source file, check the gen_file_meta dict to - determine if the ANY of the source files contributing to the compilation of - this file were changed since the last successful compilation. - """ - gen_file_changed, includes_list = gen_file_meta[gen_file_path] - any_changed = gen_file_changed - for included_file in includes_list: - included_file_path = os.path.join(output_dir, included_file) - any_changed = any_changed or any_sources_changed( - included_file_path, output_dir - ) - - return any_changed - - def generate_src_file(shader_paths_pair) -> Tuple[bool, List[str]]: - """ - Given an input tuple containing the following items: - (src_file_name, (template_file_path, codegen_params)) - - This function generates src_file_name by processing - template_file_path with the Python preprocessor using the - parameters specified by codegen_params. - - Then, it returns a tuple containing: - 1. The path of the generated source file - 2. A bool indicating if the generated source file has changed since the last - compilation. - 3. A list of files included by the generated source file - """ - # name of .glsl, .glslh, or .h file to be generated - src_file_name = shader_paths_pair[0] - # path of template file used for codegen - template_file_path = shader_paths_pair[1][0] - # args to be used for codegen - codegen_params = shader_paths_pair[1][1] - - # Assume that generated files will have the same file extension as the - # source template file. 
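The include-scanning regex shown above drives the dependency tracking for the shader cache. Here is a small standalone check of what it matches; the GLSL text and file names are just examples.

```python
import re

INCLUDE_RE = re.compile(r'^\s*#include\s+[<"]([^>"]+)[>"]')

glsl_text = """\
#version 450
#include "indexing_utils.h"
  #include <common.glslh>
void main() {}
"""

includes = []
for line in glsl_text.splitlines():
    match = INCLUDE_RE.match(line.strip())
    if match:
        includes.append(match.group(1))

print(includes)  # ['indexing_utils.h', 'common.glslh']
```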
- out_file_ext = extract_extension(template_file_path) - - # Construct generated file name - gen_out_path = os.path.join(output_dir, f"{src_file_name}.{out_file_ext}") - # Construct path of cached generated file - cached_gen_out_path = os.path.join( - cache_dir, f"{src_file_name}.{out_file_ext}" - ) - - # Execute codegen to generate the output file - with codecs.open(template_file_path, "r", encoding="utf-8") as input_file: - input_text = input_file.read() - input_text = self.maybe_replace_u16vecn(input_text) - output_text = preprocess(input_text, codegen_params) - - included_files = get_glsl_includes(output_text) - - with codecs.open(gen_out_path, "w", encoding="utf-8") as output_file: - output_file.write(output_text) - - file_changed = ( - file_has_changed(gen_out_path, cached_gen_out_path) or force_rebuild - ) - - # Save the generated file to cache so it can be used for future checks - if cache_dir is not None and file_changed: - shutil.copyfile(gen_out_path, cached_gen_out_path) - - return gen_out_path, file_changed, included_files - - def compile_spirv(shader_paths_pair) -> Tuple[str, str]: - """ - Given an input tuple containing the following items: - (src_file_name, (template_file_path, codegen_params)) - - Infer the path of the GLSL source file generated by generate_src_file and - compile a SPIR-V binary from it. Returns the path of the compiled SPIR-V - binary and the path of the source file used to compile it. - - This function also utilizes a caching mechanism; if generate_src_file - reported that the source file was unchanged since the last successful - compilation, AND if the SPIR-V from the last successful compilation was - stored in the cache, then directly use the cached SPIR-V without triggering - a re-compilation. - """ - # name of generated .glsl, .glslh, or .h from generate_src_file - src_file_name = shader_paths_pair[0] - # path of template file used for codegen - template_file_path = shader_paths_pair[1][0] - # args used for codegen - codegen_params = shader_paths_pair[1][1] - - # Assume that generated files will have the same file extension as the - # source template file. - out_file_ext = extract_extension(template_file_path) - - # Infer name of generated file (created by generate_src_file) - gen_out_path = os.path.join(output_dir, f"{src_file_name}.{out_file_ext}") - - # Only proceed if GLSL -> SPIR-V compilation is required for this file - if out_file_ext != "glsl": - return (None, gen_out_path) - - # Validate that the source file actually exists - assert os.path.exists(gen_out_path) and gen_out_path in gen_file_meta - - # Construct name of SPIR-V file to be compiled - spv_out_path = os.path.join(output_dir, f"{src_file_name}.spv") - - if cache_dir is not None: - # Construct the file names of cached SPIR-V file to check if they exist - # in the cache. 
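The caching protocol sketched by `generate_src_file` and `compile_spirv` boils down to: regenerate the GLSL, compare it by checksum against the cached copy from the last build, and reuse the cached SPIR-V binary only if nothing feeding into the shader changed. The Python below is a simplified illustration with made-up names; it omits the #include graph walk done by `any_sources_changed()`.

```python
import hashlib
import os
import shutil

def md5(path, chunk=65536):
    """Checksum a file in 64 KB chunks, as get_md5_checksum does above."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        while data := f.read(chunk):
            h.update(data)
    return h.hexdigest()

def can_reuse_cached_spv(gen_glsl, cached_glsl, cached_spv):
    """True if the regenerated GLSL is byte-identical to the cached copy and a
    cached SPIR-V binary exists; only the direct source is checked here."""
    if not (os.path.exists(cached_glsl) and os.path.exists(cached_spv)):
        return False
    return md5(gen_glsl) == md5(cached_glsl)

# Typical usage after codegen (paths are illustrative):
# if can_reuse_cached_spv("out/add.glsl", "cache/add.glsl", "cache/add.spv"):
#     shutil.copyfile("cache/add.spv", "out/add.spv")
# else:
#     ... invoke glslc and refresh the cache ...
```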
- cached_spv_out_path = os.path.join(cache_dir, f"{src_file_name}.spv") - - can_use_cached = not any_sources_changed(gen_out_path, output_dir) - if can_use_cached and os.path.exists(cached_spv_out_path): - shutil.copyfile(cached_spv_out_path, spv_out_path) - return (spv_out_path, gen_out_path) - - vk_version = codegen_params.get("VK_VERSION", "1.1") - # Only proceed if a GLSL compiler was specified - if self.glslc_path is not None: - cmd_base = [ - self.glslc_path, - "-fshader-stage=compute", - gen_out_path, - "-o", - spv_out_path, - "--target-env=vulkan{}".format(vk_version), - "-Werror", - "-I", - output_dir, - ] - cmd = cmd_base + self.glslc_flags - - try: - subprocess.run(cmd, check=True, capture_output=True, text=True) - except subprocess.CalledProcessError as e: - opt_fail = "compilation succeeded but failed to optimize" - err_msg_base = f"Failed to compile {os.getcwd()}/{gen_out_path}: " - if opt_fail in e.stderr or opt_fail in e.stdout: - cmd_no_opt = cmd_base + self.glslc_flags_no_opt - try: - subprocess.run(cmd_no_opt, check=True, capture_output=True) - except subprocess.CalledProcessError as e_no_opt: - # Delete any existing cached SPIR-V file if it exists - if os.path.exists(cached_spv_out_path): - os.remove(cached_spv_out_path) - - raise RuntimeError( - f"{err_msg_base} {e_no_opt.stderr}" - ) from e_no_opt - - else: - # Delete any existing cached SPIR-V file if it exists - if os.path.exists(cached_spv_out_path): - os.remove(cached_spv_out_path) - - raise RuntimeError(f"{err_msg_base} {e.stderr}") from e - - # If compilation was successful, store the compiled SPIR-V file in the - # cache for future use. - if cache_dir is not None: - shutil.copyfile(spv_out_path, cached_spv_out_path) - - return (spv_out_path, gen_out_path) - - # Run codegen serially to ensure that all .glsl, .glslh, and .h files are up to - # date before compilation - for generated_file_tuple in self.output_file_map.items(): - gen_out_path, file_changed, include_list = generate_src_file( - generated_file_tuple - ) - gen_file_meta[gen_out_path] = (file_changed, include_list) - - # Parallelize SPIR-V compilation to optimize build time - with ThreadPool(os.cpu_count()) as pool: - for spv_out_path, glsl_out_path in pool.map( - compile_spirv, self.output_file_map.items() - ): - spv_to_glsl_map[spv_out_path] = glsl_out_path - - return spv_to_glsl_map - - -############################################## -# Shader Info and Shader Registry Handling # -############################################## - - -@dataclass -class ShaderInfo: - tile_size: List[int] - layouts: List[str] - weight_storage_type: str = "" - bias_storage_type: str = "" - register_for: Optional[Tuple[str, List[str]]] = None - requires_shader_int16_ext: bool = False - requires_16bit_storage_ext: bool = False - requires_8bit_storage_ext: bool = False - requires_integer_dot_product_ext: bool = False - - -def getName(filePath: str) -> str: - return os.path.basename(filePath).replace("/", "_").replace(".", "_") - - -def isDescriptorLine(lineStr: str) -> bool: - descriptorLineId = r"^layout\(set" - return re.search(descriptorLineId, lineStr) is not None - - -def isTileSizeLine(lineStr: str) -> bool: - tile_size_id = r"^ \* TILE_SIZE = \(" - return re.search(tile_size_id, lineStr) is not None - - -def findTileSizes(lineStr: str) -> List[int]: - tile_size_id = r"^ \* TILE_SIZE = \(([0-9]+), ([0-9]+), ([0-9]+)\)" - matches = re.search(tile_size_id, lineStr) - if matches is None: - raise AssertionError("matches is None in findTileSizes") - return 
[int(matches.group(1)), int(matches.group(2)), int(matches.group(3))] - - -def isWeightStorageTypeLine(lineStr: str) -> bool: - weight_storage_id = r"^ \* WEIGHT_STORAGE = " - return re.search(weight_storage_id, lineStr) is not None - - -def getWeightStorageType(lineStr: str) -> str: - weight_storage_id = r"^ \* WEIGHT_STORAGE = ([a-zA-Z]+_\dD)" - matches = re.search(weight_storage_id, lineStr) - if matches is None: - raise AssertionError("matches is None in getWeightStorageType") - return matches.group(1) - - -def isBiasStorageTypeLine(lineStr: str) -> bool: - weight_storage_id = r"^ \* BIAS_STORAGE = " - return re.search(weight_storage_id, lineStr) is not None - - -def getBiasStorageType(lineStr: str) -> str: - weight_storage_id = r"^ \* BIAS_STORAGE = ([a-zA-Z]+_\dD)" - matches = re.search(weight_storage_id, lineStr) - if matches is None: - raise AssertionError("matches is None in getBiasStorageType") - return matches.group(1) - - -def isRegisterForLine(lineStr: str) -> bool: - # Check for Shader Name and a list of at least one Registry Key - register_for_id = ( - r"^ \* REGISTER_FOR = \('([A-Za-z0-9_]+)'\s*,\s*\['([A-Za-z0-9_]+)'.*\]\)" - ) - return re.search(register_for_id, lineStr) is not None - - -def findRegisterFor(lineStr: str) -> Tuple[str, List[str]]: - register_for_pattern = r"'([A-Za-z0-9_]+)'" - matches = re.findall(register_for_pattern, lineStr) - if matches is None: - raise AssertionError("matches is None in getBiasStorageType") - matches_list = list(matches) - return (matches_list[0], matches_list[1:]) - - -def isExtensionRequireLine(lineStr: str) -> bool: - extension_require_id = r"^#extension ([A-Za-z0-9_]+)\s*:\s*require" - return re.search(extension_require_id, lineStr) is not None - - -typeIdMapping = { - r"image[123]D\b": "VK_DESCRIPTOR_TYPE_STORAGE_IMAGE", - r"sampler[123]D\b": "VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER", - r"\bbuffer\b": "VK_DESCRIPTOR_TYPE_STORAGE_BUFFER", - r"\buniform\b": "VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER", -} - - -def determineDescriptorType(lineStr: str) -> str: - for identifier, typeNum in typeIdMapping.items(): - if re.search(identifier, lineStr): - return typeNum - raise AssertionError( - "No matching descriptor type for " + lineStr + " in determineDescriptorType" - ) - - -def getShaderInfo(srcFilePath: str) -> ShaderInfo: - shader_info = ShaderInfo([], [], "") - with open(srcFilePath) as srcFile: - for line in srcFile: - if isDescriptorLine(line): - shader_info.layouts.append(determineDescriptorType(line)) - if isTileSizeLine(line): - shader_info.tile_size = findTileSizes(line) - if isWeightStorageTypeLine(line): - shader_info.weight_storage_type = getWeightStorageType(line) - if isBiasStorageTypeLine(line): - shader_info.bias_storage_type = getBiasStorageType(line) - if isRegisterForLine(line): - shader_info.register_for = findRegisterFor(line) - if isExtensionRequireLine(line): - if "GL_EXT_shader_explicit_arithmetic_types_int16" in line: - shader_info.requires_shader_int16_ext = True - if "GL_EXT_shader_16bit_storage" in line: - shader_info.requires_16bit_storage_ext = True - if "GL_EXT_shader_8bit_storage" in line: - shader_info.requires_8bit_storage_ext = True - if "GL_EXT_integer_dot_product" in line: - shader_info.requires_integer_dot_product_ext = True - - return shader_info - - -########################## -# C++ File Generation # -######################### - -cpp_template = """ -#include -#include -#include - -using namespace vkcompute; - -namespace at {{ -namespace native {{ -namespace vulkan {{ - -namespace {{ - 
-{spv_bin_arrays} - -}} - -static void register_fn() {{ - -{register_shader_infos} - -{shader_info_registry} - -}} - -static const api::ShaderRegisterInit register_shaders(®ister_fn); - -}} -}} -}} - -""" - - -def generateSpvBinStr(spvPath: str, name: str) -> Tuple[int, str]: - with open(spvPath, "rb") as fr: - next_bin = array.array("I", fr.read()) - sizeBytes = 4 * len(next_bin) - spv_bin_str = "const uint32_t {}_bin[] = {{\n{}\n}};".format( - name, - textwrap.indent(",\n".join(str(x) for x in next_bin), " "), - ) - - return sizeBytes, spv_bin_str - - -def generateShaderInfoStr(shader_info: ShaderInfo, name: str, sizeBytes: int) -> str: - tile_size = ( - f"{{{', '.join(str(x) for x in shader_info.tile_size)}}}" - if (len(shader_info.tile_size) > 0) - else "{1, 1, 1}" - ) - - shader_info_layouts = "{{{}}}".format(",\n ".join(shader_info.layouts)) - - def to_cpp_str(val: bool): - return "true" if val else "false" - - shader_info_args = [ - f'"{name}"', - f"{name}_bin", - str(sizeBytes), - shader_info_layouts, - tile_size, - to_cpp_str(shader_info.requires_shader_int16_ext), - to_cpp_str(shader_info.requires_16bit_storage_ext), - to_cpp_str(shader_info.requires_8bit_storage_ext), - to_cpp_str(shader_info.requires_integer_dot_product_ext), - ] - - shader_info_str = textwrap.indent( - "api::shader_registry().register_shader(\n vkapi::ShaderInfo(\n{args}));\n".format( - args=textwrap.indent(",\n".join(shader_info_args), " "), - ), - " ", - ) - - return shader_info_str - - -def generateShaderDispatchStr(shader_info: ShaderInfo, name: str) -> str: - if shader_info.register_for is None: - return "" - - (op_name, registry_keys) = shader_info.register_for - shader_dispatch_str = "" - for registry_key in registry_keys: - shader_dispatch_str = textwrap.indent( - f'api::shader_registry().register_op_dispatch("{op_name}", api::DispatchKey::{registry_key.upper()}, "{name}");', - " ", - ) - - return shader_dispatch_str - - -def genCppFiles( - spv_files: Dict[str, str], cpp_header_path: str, cpp_src_file_path: str -) -> None: - spv_bin_strs = [] - register_shader_info_strs = [] - shader_registry_strs = [] - - for spvPath, srcPath in spv_files.items(): - if spvPath is None: - continue - - name = getName(spvPath).replace("_spv", "") - - sizeBytes, spv_bin_str = generateSpvBinStr(spvPath, name) - spv_bin_strs.append(spv_bin_str) - - shader_info = getShaderInfo(srcPath) - - register_shader_info_strs.append( - generateShaderInfoStr(shader_info, name, sizeBytes) - ) - - if shader_info.register_for is not None: - shader_registry_strs.append(generateShaderDispatchStr(shader_info, name)) - - spv_bin_arrays = "\n".join(spv_bin_strs) - register_shader_infos = "\n".join(register_shader_info_strs) - shader_info_registry = "\n".join(shader_registry_strs) - - cpp = cpp_template.format( - spv_bin_arrays=spv_bin_arrays, - register_shader_infos=register_shader_infos, - shader_info_registry=shader_info_registry, - ) - - with open(cpp_src_file_path, "w") as fw: - fw.write(cpp) - - -########## -# Main # -########## - - -def parse_arg_env(items: Dict[Any, Any]) -> Dict[Any, Any]: - d = {} - if items: - for item in items: - tokens = item.split("=") - key = tokens[0].strip() - value = tokens[1].strip() - d[key] = value - return d - - -def main(argv: List[str]) -> int: - parser = argparse.ArgumentParser(description="") - parser.add_argument( - "-i", - "--glsl-paths", - nargs="+", - help='List of paths to look for GLSL source files, separated by spaces. 
Ex: --glsl-paths "path1 path2 path3"', - default=["."], - ) - parser.add_argument("-c", "--glslc-path", required=True, help="") - parser.add_argument( - "-t", "--tmp-dir-path", required=True, help="/tmp/vulkan_shaders/" - ) - parser.add_argument("-o", "--output-path", required=True, help="") - parser.add_argument("-f", "--force-rebuild", action="store_true", default=False) - parser.add_argument("--replace-u16vecn", action="store_true", default=False) - parser.add_argument("--optimize_size", action="store_true", help="") - parser.add_argument("--optimize", action="store_true", help="") - parser.add_argument("--spv_debug", action="store_true", default=False) - parser.add_argument( - "--env", metavar="KEY=VALUE", nargs="*", help="Set a number of key-value pairs" - ) - options = parser.parse_args() - - env = DEFAULT_ENV - env.update(TYPE_MAPPINGS) - env.update(UTILITY_FNS) - - for key, value in parse_arg_env(options.env).items(): - env[key] = value - - if not os.path.exists(options.output_path): - os.makedirs(options.output_path) - - if not os.path.exists(options.tmp_dir_path): - os.makedirs(options.tmp_dir_path) - - glslc_flags = [] - if options.optimize_size: - glslc_flags.append("-Os") - elif options.optimize: - glslc_flags.append("-O") - - if options.spv_debug: - glslc_flags.append("-g") - - glslc_flags_str = " ".join(glslc_flags) - - shader_generator = SPVGenerator( - options.glsl_paths, - env, - options.glslc_path, - glslc_flags=glslc_flags_str, - replace_u16vecn=options.replace_u16vecn, - ) - output_spv_files = shader_generator.generateSPV( - options.output_path, options.tmp_dir_path, options.force_rebuild - ) - - genCppFiles( - output_spv_files, - f"{options.output_path}/{CPP_H_NAME}", - f"{options.output_path}/{CPP_SRC_NAME}", - ) - - return 0 - - -def invoke_main() -> None: - sys.exit(main(sys.argv)) - - -if __name__ == "__main__": - invoke_main() # pragma: no cover diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp deleted file mode 100644 index 6609298b0d8..00000000000 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ /dev/null @@ -1,1116 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
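To round out the codegen script, its final C++ embedding step (`generateSpvBinStr`) reads each compiled .spv file as 32-bit words and emits them as a C array. The standalone sketch below mirrors that idea; the helper name and the demo file are made up, and the two words written are simply the SPIR-V magic number and a version word.

```python
import array
import textwrap

def spv_to_c_array(spv_path, name):
    """Read a SPIR-V binary as 32-bit words and format a C array definition."""
    with open(spv_path, "rb") as f:
        words = array.array("I", f.read())
    body = textwrap.indent(",\n".join(str(w) for w in words), "  ")
    return 4 * len(words), f"const uint32_t {name}_bin[] = {{\n{body}\n}};"

# Write a tiny stand-in binary just to exercise the helper; real inputs are
# the .spv files produced by glslc.
with open("demo.spv", "wb") as f:
    f.write(array.array("I", [0x07230203, 0x00010300]).tobytes())

nbytes, c_src = spv_to_c_array("demo.spv", "demo")
print(nbytes)   # 8
print(c_src)
```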
- */ - -// @lint-ignore-every CLANGTIDY -// facebook-security-vulnerable-integer-sign-conversion - -#include - -#include - -#include - -namespace vkcompute { - -// -// VTensorPtr -// - -#define VALUE_PTR_CLASS_IMPL(classname, ctype, type_name) \ - classname::classname(ComputeGraph* const graph, const ValueRef idx) \ - : graph_(graph), ptr_(&(graph_->values_.at(idx).to##type_name())) { \ - graph_->values_in_use_++; \ - } \ - ctype* classname::operator->() const { \ - return ptr_; \ - } \ - ctype& classname::operator*() const { \ - return *ptr_; \ - } \ - classname::~classname() { \ - graph_->values_in_use_--; \ - } - -VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) -VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) -VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) -VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) -VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) -VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) -VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) - -#undef VALUE_PTR_CLASS_IMPL - -// -// TmpTensor -// - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor( - sizes, - dtype, - storage_type, - memory_layout, - sobj_idx)) {} - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor(sizes, dtype, storage_type, sobj_idx)) {} - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx)) {} - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor(sizes, dtype, sobj_idx)) {} - -TmpTensor::~TmpTensor() { - // Lifetime of this temporary tensor is expired; return the shared object to - // the pool, as long as the sobj index is valid - if (sobj_idx >= 0) { - graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); - } -} - -int64_t TmpTensor::get_sobj_idx() { - int64_t sobj_idx; - // If no available temporary shared objects, request a new one to be created - if (graph_p->tmp_shared_object_idxs_.empty()) { - sobj_idx = graph_p->shared_objects_.size(); - } else { - // Get the first available shared object idx - sobj_idx = graph_p->tmp_shared_object_idxs_.top(); - graph_p->tmp_shared_object_idxs_.pop(); - } - return sobj_idx; -} - -// -// ComputeGraph -// - -ComputeGraph::ComputeGraph(GraphConfig config) - : config_{config}, - prepack_descriptor_counts_{}, - execute_descriptor_counts_{}, - context_{new api::Context( - config.external_adapter ? 
config.external_adapter - : vkapi::runtime()->get_adapter_p(), - config_.context_config)}, - shared_objects_{}, - values_{}, - param_ubos_{}, - prepack_nodes_{}, - execute_nodes_{}, - inputs_{}, - outputs_{} { - // Ensure that descriptor counts are initialized to 0 - prepack_descriptor_counts_.descriptor_pool_max_sets = 0; - prepack_descriptor_counts_.descriptor_uniform_buffer_count = 0; - prepack_descriptor_counts_.descriptor_storage_buffer_count = 0; - prepack_descriptor_counts_.descriptor_combined_sampler_count = 0; - prepack_descriptor_counts_.descriptor_storage_image_count = 0; - - execute_descriptor_counts_.descriptor_pool_max_sets = 0; - execute_descriptor_counts_.descriptor_uniform_buffer_count = 0; - execute_descriptor_counts_.descriptor_storage_buffer_count = 0; - execute_descriptor_counts_.descriptor_combined_sampler_count = 0; - execute_descriptor_counts_.descriptor_storage_image_count = 0; - - // If certain graph config variables are not specified, then set them - // automatically. - if (config_.prepack_threshold_nbytes == 0) { - config_.prepack_threshold_nbytes = 10 * MB; - config_.prepack_initial_threshold_nbytes = 10 * MB; - } - if (config_.execute_threshold_node_count == 0) { - config_.execute_threshold_node_count = 128; - config_.execute_initial_threshold_node_count = 64; - } - - // Check if the underlying GPU can access accelerated integer dot product - // instructions - can_use_int8_dot_product_ = - context_->adapter_ptr()->supports_int8_dot_product(); -} - -ComputeGraph::~ComputeGraph() { - values_.clear(); - - prepack_nodes_.clear(); - execute_nodes_.clear(); - clear_deferred_cmds(); - - context_->flush(); -} - -std::vector ComputeGraph::extract_int_or_symint_list( - const ValueRef idx) { - const Value& val = values_.at(idx); - std::vector result; - - if (val.isIntList()) { - // If it's an IntList, return a copy of the list - return val.toConstIntList(); - } else if (val.isValueList()) { - // If it's a ValueList, extract each element as an Int or SymInt - const std::vector& value_list = val.toConstValueList(); - result.reserve(value_list.size()); - - for (const ValueRef& ref : value_list) { - const Value& element = values_.at(ref); - if (element.isInt()) { - result.push_back(element.toInt()); - } else if (element.isSymInt()) { - result.push_back(read_symint(ref)); - } else { - VK_THROW( - "ValueList element is neither Int nor SymInt, but has type ", - element.type()); - } - } - return result; - } - - VK_THROW( - "Cannot extract int or symint list from Value with type ", val.type()); -} - -utils::StorageType ComputeGraph::suggested_storage_type() { - if (config_.enable_storage_type_override) { - return config_.storage_type_override; - } - return utils::kTexture3D; -} - -bool ComputeGraph::was_value_updated(const ValueRef idx) const noexcept { - if (!is_valid_value_idx(idx)) { - return false; - } - - // Check if this ValueRef itself was updated - if (updated_values_.find(idx) != updated_values_.end()) { - return true; - } - - // If this is a ValueList, check each ValueRef in the list - if (val_is_value_list(idx)) { - const auto& value_list = values_.at(idx).toConstValueList(); - for (const auto& nested_idx : value_list) { - if (was_value_updated(nested_idx)) { - return true; - } - } - } - - return false; -} - -utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout( - const std::vector& sizes) { - if (config_.enable_memory_layout_override) { - return config_.memory_layout_override; - } - if (sizes.size() < 3) { - return utils::kWidthPacked; - } - // For 3 
dimensional tensors that only have a channels dimension of 1, still - // prefer width packed. - if (utils::val_at(-3, sizes) == 1) { - return utils::kWidthPacked; - } - return utils::kChannelsPacked; -} - -bool ComputeGraph::device_name_contains(const char* substr) { - return context_->adapter_ptr()->device_name().find(substr) != - std::string::npos; -} - -void ComputeGraph::check_no_active_value_ptrs() { - VK_CHECK_COND( - values_in_use_ == 0, - "Make sure that there are no pointers stored from the return values of " - "`ComputeGraph::get_*()` functions in scope before adding Values to the " - "graph. Modifying the graph's values may cause existing pointers to be " - "invalidated."); -} - -bool ComputeGraph::is_valid_value_idx(const ValueRef idx) const noexcept { - return idx >= 0 && idx < static_cast(values_.size()); -} - -std::vector ComputeGraph::sizes_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().sizes(); - } else if (val.isTensorRef()) { - return val.toConstTensorRef().sizes; - } - VK_THROW("Could not get sizes of value with type ", val.type()); -} - -int64_t ComputeGraph::dim_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().dim(); - } else if (val.isTensorRef()) { - return val.toConstTensorRef().sizes.size(); - } - VK_THROW("Could not get dim of value with type ", val.type()); -} - -std::vector ComputeGraph::dim_order_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().dim_order(); - } - VK_THROW("Could not get dim order of value with type ", val.type()); -} - -std::vector ComputeGraph::strides_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().strides(); - } - VK_THROW("Could not get strides of value with type ", val.type()); -} - -vkapi::ScalarType ComputeGraph::dtype_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().dtype(); - } else if (val.isTensorRef()) { - return val.toConstTensorRef().dtype; - } else if (val.isBool()) { - return vkapi::ScalarType::Bool; - } else if (val.isDouble()) { - // We downcast anyway in the shader and we want to avoid having to - // write special cases there. 
- return vkapi::ScalarType::Float; - } else if (val.isInt()) { - return vkapi::ScalarType::Int; - } - VK_THROW("Could not get dtype of value with type ", val.type()); -} - -bool ComputeGraph::is_contiguous_buffer_tensor(const ValueRef idx) const { - if (!val_is_tensor(idx)) { - return false; - } - if (!is_buffer_storage(idx)) { - return false; - } - return is_contiguous(idx); -} - -bool ComputeGraph::is_contiguous_texture_tensor(const ValueRef idx) const { - if (!val_is_tensor(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return false; - } - return has_standard_axis_map(idx) && packed_dim_of(idx) == 0; -} - -bool ComputeGraph::is_standard_channels_packed_texture_tensor( - const ValueRef idx) const { - if (!val_is_tensor(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return false; - } - return has_standard_axis_map(idx) && packed_dim_of(idx) == 2; -} - -bool ComputeGraph::is_2d_matrix(const ValueRef idx) const { - std::vector sizes = sizes_of(idx); - const size_t ndim = sizes.size(); - if (sizes.size() < 2) { - return false; - } - if (sizes.size() == 2) { - return true; - } - - // Check that outermost dims have size of 1 - for (int d = 0; d < ndim - 2; d++) { - if (sizes[d] != 1) { - return false; - } - } - - return true; -} - -bool ComputeGraph::is_vectorizable_contiguous_2d_matrix( - const ValueRef idx) const { - if (!is_2d_matrix(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return is_contiguous_buffer_tensor(idx) && - size_at(-1, idx) % 4 == 0; - } - return is_contiguous_texture_tensor(idx); -} - -bool ComputeGraph::is_vectorizable_width_packed_tensor( - const ValueRef idx) const { - // Not a tensor - return false - if (!val_is_tensor(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return is_contiguous_buffer_tensor(idx) && - size_at(-1, idx) % 4 == 0; - } - - return is_standard_channels_packed_texture_tensor(idx); -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(api::vTensor( - context(), - sizes, - dtype, - storage_type, - memory_layout, - false, - axis_map_layout)); - - if (shared_object_idx >= 0) { - get_shared_object(shared_object_idx).add_user(this, idx); - } - return idx; -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes, - dtype, - storage_type, - suggested_memory_layout(sizes), - shared_object_idx, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes, - dtype, - suggested_storage_type(), - memory_layout, - shared_object_idx, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor_like( - const ValueRef idx, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes_of(idx), - dtype_of(idx), - storage_type, - memory_layout, - -1, - axis_map_layout); -} - -ValueRef 
ComputeGraph::add_tensor_like( - const ValueRef idx, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes_of(idx), - dtype_of(idx), - storage_type_of(idx), - memory_layout, - -1, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes, - dtype, - suggested_memory_layout(sizes), - shared_object_idx, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor(const vkapi::VulkanImage& image) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(api::vTensor(context(), image)); - return idx; -} - -ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { - const vTensorPtr t = get_tensor(vref); - ValueRef idx(static_cast(values_.size())); - values_.emplace_back(api::vTensor(*t)); - return idx; -} - -ValueRef ComputeGraph::add_tensor_view( - const ValueRef vref, - const std::vector& sizes, - const std::vector& strides) { - const vTensorPtr t = get_tensor(vref); - ValueRef idx(static_cast(values_.size())); - values_.emplace_back(api::vTensor(*t, sizes, strides)); - return idx; -} - -ValueRef ComputeGraph::add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const void* const data) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(TensorRef(sizes, dtype, data)); - total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes(); - return idx; -} - -ValueRef ComputeGraph::add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - executorch::runtime::FreeableBuffer&& buffer) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(TensorRef(sizes, dtype, std::move(buffer))); - total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes(); - return idx; -} - -ValueRef ComputeGraph::add_staging( - const vkapi::ScalarType dtype, - const size_t numel) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); - return idx; -} - -ValueRef ComputeGraph::add_none() { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(); - return idx; -} - -ValueRef ComputeGraph::add_value_list(std::vector&& value) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(std::move(value)); - return idx; -} - -ValueRef ComputeGraph::add_string(std::string&& str) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(std::move(str)); - return idx; -} - -ValueRef ComputeGraph::add_symint(const int32_t val) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(SymInt(context(), val)); - return idx; -} - -ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) { - for (int i = 0; i < values_.size(); ++i) { - if (values_.at(i).isInt() && values_.at(i).toInt() == val) { - return i; - } - } - return add_scalar(val); -} - -ValueRef ComputeGraph::set_input_tensor( - const ValueRef idx, - const bool use_staging) { - if (use_staging) { - vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - // For texture storage, the buffer size needs to account for the zero - // padding applied by unused texel elements. 
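// Illustrative example (sizes assumed): roughly speaking, the packed (width)
// dimension is rounded up to a whole number of 4-wide texels, so a width
// packed {1, 3, 5, 5} texture tensor stages 1*3*5*8 = 120 elements even
// though numel() is only 75.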
- size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); - add_staging_to_tensor_node(*this, staging_idx, idx); - inputs_.push_back({idx, staging_idx}); - return staging_idx; - } - inputs_.push_back({idx, kDummyValueRef}); - return idx; -} - -ValueRef ComputeGraph::set_output_tensor( - const ValueRef idx, - const bool use_staging) { - if (use_staging) { - vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - // For texture storage, the buffer size needs to account for the zero - // padding applied by unused texel elements. - size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); - // We only run this when the tensor is non-empty. When the underlying - // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to - // tensor, we will not be able to bind the node for execution. - if (buf_numel > 0) { - add_tensor_to_staging_node(*this, idx, staging_idx); - } - outputs_.push_back({idx, staging_idx}); - return staging_idx; - } - outputs_.push_back({idx, kDummyValueRef}); - return idx; -} - -ValueRef ComputeGraph::set_output_value(const ValueRef idx) { - if (values_.at(idx).isTensor()) { - return set_output_tensor(idx); - } - outputs_.push_back({idx, kDummyValueRef}); - return idx; -} - -vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( - const ValueRef idx) { - if (values_.at(idx).isInt()) { - const int32_t val = extract_scalar(idx); - return create_params_buffer(val); - } else if (values_.at(idx).isSymInt()) { - SymIntPtr symint = get_symint(idx); - return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); - } - VK_THROW("Cannot create a int param buffer for the given value"); -} - -vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( - const ValueRef idx, - const int32_t default_val) { - if (values_.at(idx).isNone()) { - return create_params_buffer(default_val); - } else { - return get_or_create_int_param_buffer(idx); - } -} - -void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { - int32_t cur_val = read_symint(idx); - if (cur_val != val) { - get_symint(idx)->set(val); - // Track that this ValueRef was updated - updated_values_.insert(idx); - } -} - -int32_t ComputeGraph::read_symint(const ValueRef idx) { - return get_symint(idx)->get(); -} - -SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { - if (idx >= shared_objects_.size()) { - shared_objects_.resize(static_cast(idx + 1)); - } - return shared_objects_.at(idx); -} - -void ComputeGraph::create_dedicated_allocation_for(const ValueRef idx) { - vTensorPtr tensor = get_tensor(idx); - if (!tensor->memory_is_bound()) { - VmaAllocationCreateInfo alloc_create_info = - context()->adapter_ptr()->vma().gpuonly_resource_create_info(); - tensor->acquire_allocation( - context()->adapter_ptr()->vma().create_allocation( - tensor->get_memory_requirements(), alloc_create_info)); - } -} - -void ComputeGraph::update_descriptor_counts( - const vkapi::ShaderInfo& shader_info, - bool execute) { - vkapi::DescriptorPoolConfig* config = - execute ? 
&execute_descriptor_counts_ : &prepack_descriptor_counts_; - - config->descriptor_pool_max_sets += 1; - for (const VkDescriptorType arg_type : shader_info.kernel_layout) { - switch (arg_type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - config->descriptor_uniform_buffer_count += 1; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - config->descriptor_storage_buffer_count += 1; - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - config->descriptor_combined_sampler_count += 1; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - config->descriptor_storage_image_count += 1; - break; - default: - VK_THROW("Unsupported descriptor type!"); - } - } -} - -void ComputeGraph::register_pipeline_to_create( - const vkapi::ShaderInfo& shader_info, - const utils::WorkgroupSize& local_workgroup_size, - const vkapi::SpecVarList& spec_vars, - const std::vector& push_constants) { - VkDescriptorSetLayout shader_layout = - context()->shader_layout_cache().retrieve(shader_info.kernel_layout); - - uint32_t pc_offset = 0; - std::array pc_data; - for (const auto& pc : push_constants) { - pc_offset += pc.write(pc_data.data(), pc_offset, kMaxPushConstantSize); - } - - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(spec_vars); - - const vkapi::ComputePipelineCache::Key desc = { - context()->pipeline_layout_cache().retrieve(shader_layout, pc_offset), - context()->shader_cache().retrieve(shader_info), - spec_constants}; - - if (context_->pipeline_cache().contains(desc)) { - return; - } - auto it = pipeline_descriptors_.find(desc); - if (it != pipeline_descriptors_.cend()) { - return; - } - pipeline_descriptors_.insert(desc); -} - -utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { - if (is_buffer_storage(idx)) { - return {uint32_t(numel_of(idx)), 1u, 1u}; - } - return logical_limits_of(idx); -} - -utils::uvec3 ComputeGraph::create_local_wg_size( - const utils::uvec3 global_wg_size) { - if (config_.enable_local_wg_size_override) { - return config_.local_wg_size_override; - } - - // array containing axis index and global workgroup size - std::pair global_wg_size_desc[] = { - {0u, global_wg_size[0]}, - {1u, global_wg_size[1]}, - {2u, global_wg_size[2]}}; - - // sort the global workgroup size in descending order - if (global_wg_size_desc[0].second < global_wg_size_desc[1].second) { - std::swap(global_wg_size_desc[0], global_wg_size_desc[1]); - } - if (global_wg_size_desc[1].second < global_wg_size_desc[2].second) { - std::swap(global_wg_size_desc[1], global_wg_size_desc[2]); - } - if (global_wg_size_desc[0].second < global_wg_size_desc[1].second) { - std::swap(global_wg_size_desc[0], global_wg_size_desc[1]); - } - - utils::uvec3 local_group_size = { - 8, - std::max(1u, std::min(4u, global_wg_size_desc[1].second)), - std::max(1u, std::min(2u, global_wg_size_desc[2].second))}; - - if (global_wg_size_desc[2u].second == 1) { - if (global_wg_size_desc[1u].second == 1) { - local_group_size[0u] = 64; - local_group_size[1u] = 1; - } else if (global_wg_size_desc[1u].second % 4 == 0) { - local_group_size[0u] = 16; - local_group_size[1u] = 4; - } else { - local_group_size[0u] = 32; - local_group_size[1u] = 2; - } - } - - return { - local_group_size[global_wg_size_desc[0].first], - local_group_size[global_wg_size_desc[1].first], - local_group_size[global_wg_size_desc[2].first]}; -} - -utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) { - return 
create_local_wg_size(create_global_wg_size(idx)); -} - -void ComputeGraph::bind_tensor_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags access_type, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx) { - vTensorPtr tensor = get_tensor(ref); - if (tensor->buffer()) { - vkapi::VulkanBuffer& buffer = tensor->buffer( - pipeline_barrier, vkapi::PipelineStage::COMPUTE, access_type); - descriptor_set.bind(idx, buffer); - } else { - vkapi::VulkanImage& image = tensor->image( - pipeline_barrier, vkapi::PipelineStage::COMPUTE, access_type); - descriptor_set.bind(idx, image); - } -} - -void ComputeGraph::bind_value_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags access_type, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx) { - if (val_is_tensor(ref)) { - bind_tensor_to_descriptor_set( - ref, pipeline_barrier, access_type, descriptor_set, idx); - } else if (val_is_staging(ref)) { - descriptor_set.bind(idx, get_staging(ref)->buffer()); - } -} - -void ComputeGraph::copy_into_staging( - const ValueRef idx, - const void* data, - const size_t numel) { - StagingPtr staging = get_staging(idx); - size_t nbytes = numel * vkapi::element_size(staging->dtype()); - staging->copy_from(data, nbytes); -} - -void ComputeGraph::copy_from_staging( - const ValueRef idx, - void* data, - const size_t numel) { - StagingPtr staging = get_staging(idx); - size_t nbytes = numel * vkapi::element_size(staging->dtype()); - staging->copy_to(data, nbytes); -} - -void ComputeGraph::prepare() { -#define MERGE_FIELD(field) \ - static_cast(std::ceil( \ - std::max( \ - execute_descriptor_counts_.field, \ - prepack_descriptor_counts_.field) * \ - config_.descriptor_pool_safety_factor)) - - uint32_t max_sets = MERGE_FIELD(descriptor_pool_max_sets); - vkapi::DescriptorPoolConfig config{ - max_sets, - std::max(MERGE_FIELD(descriptor_uniform_buffer_count), max_sets), - std::max(MERGE_FIELD(descriptor_storage_buffer_count), max_sets), - std::max(MERGE_FIELD(descriptor_combined_sampler_count), max_sets), - std::max(MERGE_FIELD(descriptor_storage_image_count), max_sets), - 1u, - }; - - if (!context_->descriptor_pool()) { - context_->descriptor_pool().init(config); - } -#undef MERGE_FIELD - - if (config_.enable_querypool) { - context_->initialize_querypool(); - } - - // Calculate the threshold at which a new command buffer should be created - // during execute() - const size_t total_node_count = execute_nodes_.size(); - size_t init_threshold = config_.execute_initial_threshold_node_count; - size_t count_threshold = config_.execute_threshold_node_count; - - // If max command buffer count is set, we need to adjust the thresholds to - // accommodate execution within the limit, if total command buffers with - // current thresholds would exceed execute_max_cmds - if (config_.execute_max_cmds > 0) { - // Worse case scenario we have one command buffer for nodes before init - // threshold and config_.execute_max_cmds - 1 command buffers for the rest - // of dispatches - - // If command buffers created after offsetting init_threshold would exceed - // max command buffer count, we need to adjust init and count thresholds - const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) > - count_threshold * (config_.execute_max_cmds - 1); - if (total_node_count > init_threshold && slicing_exceeds_max_cmds) { - // Increase count threshold so remaining nodes after offsetting init fits - 
// in config_.execute_max_cmds - 1 - count_threshold = static_cast(ceil( - (total_node_count - init_threshold) / - double(config_.execute_max_cmds - 1))); - } - } - - execute_threshold_node_count_ = count_threshold; -} - -void ComputeGraph::prepare_pipelines() { - for (std::unique_ptr& node : prepack_nodes_) { - node->prepare_pipelines(this); - } - for (std::unique_ptr& node : execute_nodes_) { - node->prepare_pipelines(this); - } - context_->pipeline_cache().create_pipelines(pipeline_descriptors_); - - pipeline_descriptors_ = std::unordered_set< - vkapi::ComputePipelineCache::Key, - vkapi::ComputePipelineCache::Hasher>(); -} - -void ComputeGraph::submit_current_cmd(const bool final_use) { - context_->submit_cmd_to_gpu(VK_NULL_HANDLE, final_use); -} - -void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) { - vkapi::VulkanFence fence = context_->fences().get_fence(); - context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use); - fence.wait(); - context_->fences().return_fence(fence); -} - -void ComputeGraph::submit_cmd(vkapi::CommandBuffer& cmd_buf, VkFence fence) { - if (cmd_buf) { - cmd_buf.end(); - context_->adapter_ptr()->submit_cmd( - context_->queue(), cmd_buf.get_submit_handle(false), fence); - } -} - -void ComputeGraph::submit_deferred_cmds_and_wait() { - vkapi::VulkanFence fence = context_->fences().get_fence(); - - for (uint32_t i = 0; i < deferred_cmd_list_.size(); i++) { - auto& cmd = deferred_cmd_list_[i]; - - submit_cmd( - cmd, - i == (deferred_cmd_list_.size() - 1) ? fence.get_submit_handle() - : VK_NULL_HANDLE); - } - fence.wait(); - context_->fences().return_fence(fence); -} - -void ComputeGraph::clear_deferred_cmds() { - for (auto& cmd : deferred_cmd_list_) { - if (cmd) { - cmd.end(); - cmd.invalidate(); - } - } - deferred_cmd_list_.clear(); -} - -void ComputeGraph::prepack() { - int i = 0; - bool submitted = false; - const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB; - // int count = 0; - context_->set_cmd(); - for (std::unique_ptr& node : prepack_nodes_) { - // Do not trigger on the first or last prepack node. - const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1); - size_t threshold = submitted ? config_.prepack_threshold_nbytes - : config_.prepack_initial_threshold_nbytes; - if (not_terminal && staging_nbytes_in_cmd_ > threshold) { - // If reducing peak memory usage, wait for the current command buffer to - // finish executing and flush to recycle the staging memory. This will - // reduce peak memory usage, but will slightly increase load latency. - // Otherwise, just submit the current command buffer for execution and - // proceed. This results in lower load latency at the cost of higher peak - // memory usage. 
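// Illustrative numbers: when the thresholds are left unset, the constructor
// defaults both prepack_threshold_nbytes and prepack_initial_threshold_nbytes
// to 10 MB, so the first submission happens once roughly 10 MB of weight data
// has been staged and every further ~10 MB triggers another one.
// reduce_peak_memory is true only when the model's constants total more than
// 500 MB (see above).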
- if (reduce_peak_memory) { - submit_current_cmd_and_wait(); - context_->flush(); - } else { - submit_current_cmd(); - } - staging_nbytes_in_cmd_ = 0; - context_->set_cmd(); - submitted = true; - } - - node->encode(this); - i++; - } - submit_current_cmd_and_wait(/*final_use=*/true); - context_->flush(); - staging_nbytes_in_cmd_ = 0; - - // Initialize allocations for intermediate tensors - for (SharedObject& shared_object : shared_objects_) { - shared_object.allocate(this); - shared_object.bind_users(this); - } - // Make sure all remaining tensors have allocations - for (int i = 0; i < values_.size(); i++) { - if (values_.at(i).isTensor()) { - create_dedicated_allocation_for(i); - } - } -} - -void ComputeGraph::execute() { - if (deferred_cmd_list_.empty()) { - context_->flush(); - context_->set_cmd(/*reusable = */ true); - - context_->cmd_reset_querypool(); - const size_t total_node_count = execute_nodes_.size(); - uint32_t encoded_node_count = 0; - - for (std::unique_ptr& node : execute_nodes_) { - node->encode(this); - encoded_node_count++; - - // Threshold is reached when the node count reached - // execute_initial_threshold_node_count or if its a multiple of - // execute_threshold_node_count. - const bool reached_threshold = - encoded_node_count >= config_.execute_initial_threshold_node_count && - ((encoded_node_count - config_.execute_initial_threshold_node_count) % - execute_threshold_node_count_ == - 0); - - // Create a new command buffer when threashold is reached - // But avoid it if this is the last node, since last cmd buf is submitted - // after the loop - if (reached_threshold && encoded_node_count != total_node_count) { - context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false); - deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd())); - context_->set_cmd(true); - } - } - - vkapi::VulkanFence fence = context_->fences().get_fence(); - context_->submit_cmd_to_gpu(fence.get_submit_handle(), false); - fence.wait(); - context_->fences().return_fence(fence); - deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd())); - } else { - submit_deferred_cmds_and_wait(); - } - - execute_count_++; - - // Clear the set of updated values at the end of inference - updated_values_.clear(); - - // Reset the re-encoding flag at the end of inference - requires_reencode_ = false; -} - -void ComputeGraph::virtual_clone(const ValueRef dst, const ValueRef src) { - get_tensor(dst)->virtual_clone(*get_tensor(src)); -} - -void ComputeGraph::virtual_transpose( - const ValueRef tensor, - const int64_t dim0, - const int64_t dim1) { - get_tensor(tensor)->virtual_transpose(dim0, dim1); -} - -void ComputeGraph::resize_input( - const int64_t idx, - const std::vector& new_sizes) { - IOValueRef io_val = inputs_.at(idx); - virtual_resize(io_val.value, new_sizes); - updated_values_.insert(io_val.staging); -} - -void ComputeGraph::virtual_resize( - const ValueRef idx, - const std::vector& new_sizes) { - std::vector cur_sizes = sizes_of(idx); - if (cur_sizes != new_sizes) { - get_tensor(idx)->virtual_resize(new_sizes); - // Track that this ValueRef was updated - updated_values_.insert(idx); - } -} - -void ComputeGraph::propagate_resize() { - for (std::unique_ptr& node : execute_nodes_) { - node->trigger_resize(this); - } - // A command buffer re-encode will be needed if: - // 1. Any push constant data (used for tensor metadata) was updated - // 2. Compute shader dispatch parameters (i.e. 
compute shader, global and - // local work group sizes) were updated - if (requires_reencode_) { - clear_deferred_cmds(); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h deleted file mode 100644 index 23b5517fd22..00000000000 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ /dev/null @@ -1,1099 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include -#include - -#include - -#include - -#include -#include - -#include -#include -#include -#include - -namespace vkcompute { - -// Define valid scalar types that the Value class can -// accept -template -struct is_valid_scalar_type : std::false_type {}; - -template <> -struct is_valid_scalar_type : std::true_type {}; - -template <> -struct is_valid_scalar_type : std::true_type {}; - -template <> -struct is_valid_scalar_type : std::true_type {}; - -// -// Guarded Pointer Classes -// - -class ComputeGraph; - -#define DECL_VALUE_PTR_CLASS(classname, ctype) \ - class classname final { \ - ComputeGraph* const graph_; \ - ctype* ptr_; \ - \ - public: \ - explicit classname(ComputeGraph* const graph, const ValueRef idx); \ - ctype* operator->() const; \ - ctype& operator*() const; \ - ~classname(); \ - }; - -DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) -DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) -DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) -DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) -DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) -DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) -DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); - -#undef DECL_VALUE_PTR_CLASS - -// -// TmpTensor -// - -/* - * This struct is used to recycle the memory of temporary tensors that are - * created during the execution of a node. Upon construction, this struct will - * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance - * if any shared objects are available; if not, then a new one is created. A - * tensor value is then added to the `ComputeGraph` instance with the requested - * specifications. Upon destruction, the shared object index of the temporary - * tensor is returned to `tmp_shared_object_idxs_`. - * - * Note that instances of this struct can be used as if they were `ValueRef` due - * to implementation of a custom casting operator. - * - * This class should only be used to create tensors whose lifetimes exist only - * in a well defined scope (i.e. within a function). 
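 *
 * A hypothetical usage sketch (the function and tensor names below are
 * illustrative assumptions, not taken from the operator library):
 *
 *   void add_example_node(ComputeGraph& graph, const ValueRef in) {
 *     // Reuses a pooled shared object index if one is available.
 *     TmpTensor scratch(&graph, graph.sizes_of(in), graph.dtype_of(in));
 *     // TmpTensor converts implicitly to ValueRef, so `scratch` can be
 *     // passed to anything expecting one. When it goes out of scope, its
 *     // shared object index is returned to the pool for reuse.
 *   }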
- */ -struct TmpTensor { - ComputeGraph* graph_p; - int64_t sobj_idx; - ValueRef vref; - - // - // Match all available overloads of `add_tensor` - // - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout); - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type); - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout); - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype); - - // No copy construction or assignment - TmpTensor(TmpTensor& other) = delete; - TmpTensor& operator=(TmpTensor& other) = delete; - - // No move construction or assignment - TmpTensor(TmpTensor&& other) = delete; - TmpTensor& operator=(TmpTensor&& other) = delete; - - // Custom cast to ValueRef - operator ValueRef() const { - return vref; - }; - - ~TmpTensor(); - - private: - // Helper function to get first available shared object index or request a new - // one to be created. - int64_t get_sobj_idx(); -}; - -// -// ComputeGraph -// - -/* - * This is the core data structure used to execute Vulkan models in graph mode. - * As opposed to ATen/eager mode where a command buffer is encoded every - * inference (since ops are executed with the model), in graph mode the ops that - * compose the model are intended to be parsed only once, upon which a command - * buffer will be encoded. Model inference will then execute the cached command - * buffer without needing to encode a new one. - */ -class ComputeGraph final { - public: - explicit ComputeGraph(GraphConfig config); - - ComputeGraph(ComputeGraph&&) = default; - ComputeGraph& operator=(ComputeGraph&&) = default; - - ~ComputeGraph(); - - private: - GraphConfig config_; - vkapi::DescriptorPoolConfig prepack_descriptor_counts_; - vkapi::DescriptorPoolConfig execute_descriptor_counts_; - - std::unique_ptr context_; - - std::vector shared_objects_; - // This stack is used by `TmpTensor` instances to recycle shared objects - // for temporary tensors. See the comments of `TmpTensor` for more details - std::stack tmp_shared_object_idxs_; - - std::vector values_; - std::vector param_ubos_; - - std::vector> prepack_nodes_; - std::vector> execute_nodes_; - - std::vector inputs_; - std::vector outputs_; - - std::unordered_set< - vkapi::ComputePipelineCache::Key, - vkapi::ComputePipelineCache::Hasher> - pipeline_descriptors_; - - // Utility constexpr to express byte quantities - constexpr static size_t MB = 1024 * 1024; - - // List of command buffers deferred for submission - std::vector deferred_cmd_list_; - - // Set to track which ValueRefs were updated during inference - std::unordered_set updated_values_; - - // Flag to indicate if re-encoding is required - bool requires_reencode_ = false; - - protected: - size_t values_in_use_ = 0; - size_t execute_count_ = 0; - - // Total number of bytes needed to store model weights - size_t total_constant_nbytes_ = 0; - - // Represents the amount of staging buffer data that will be copied if the - // current Context's command buffer is submitted now. - size_t staging_nbytes_in_cmd_ = 0; - - // Represents the nodes to wait before submitting commands. 
- // If command buffers created with config.execute_threshold_node_count exceeds - // config.execute_max_cmds, then execute_threshold_node_count will be - // increased to fit command buffers within the limit. Otherwise, - // execute_threshold_node_count will be set to - // config.execute_threshold_node_count. - size_t execute_threshold_node_count_ = 0; - - // Whether the underlying GPU support accelerated integer dot product - // extensions - bool can_use_int8_dot_product_ = false; - - public: - // - // Accessors - // - - inline api::Context* context() { - return context_.get(); - } - - inline std::vector& inputs() { - return inputs_; - } - - inline std::vector& outputs() { - return outputs_; - } - - inline std::vector>& prepack_nodes() { - return prepack_nodes_; - } - - inline std::vector>& execute_nodes() { - return execute_nodes_; - } - - inline GraphConfig& graphconfig() { - return config_; - } - - // Check if the ComputeGraph has a value at the specified index - bool is_valid_value_idx(const ValueRef idx) const noexcept; - - // - // Value Extraction - // - -#define GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ptr_type, short_name, type_name) \ - inline ptr_type get_##short_name(const ValueRef idx) { \ - return ptr_type(this, idx); \ - } \ - inline bool val_is_##short_name(const ValueRef idx) const { \ - return values_.at(idx).is##type_name(); \ - } - - protected: - inline vTensorPtr get_tensor(const ValueRef idx) { - return vTensorPtr(this, idx); - } - - public: - inline bool val_is_tensor(const ValueRef idx) const { - return values_.at(idx).isTensor(); - } - - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(TensorRefPtr, tref, TensorRef) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(StagingPtr, staging, Staging) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(IntListPtr, int_list, IntList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); - -#undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS - -#define GET_AND_CHECK_VAL_AS_TYPE_FNS(ctype, short_name, type_name) \ - inline ctype get_##short_name(const ValueRef idx) { \ - return values_.at(idx).to##type_name(); \ - } \ - inline bool val_is_##short_name(const ValueRef idx) { \ - return values_.at(idx).is##type_name(); \ - } - - GET_AND_CHECK_VAL_AS_TYPE_FNS(int64_t, int, Int) - GET_AND_CHECK_VAL_AS_TYPE_FNS(double, double, Double) - GET_AND_CHECK_VAL_AS_TYPE_FNS(bool, bool, Bool) - GET_AND_CHECK_VAL_AS_TYPE_FNS(std::string, string, String) - -#undef GET_AND_CHECK_VAL_AS_TYPE_FNS - - inline bool val_is_none(const ValueRef idx) { - return idx == kDummyValueRef ? true : values_.at(idx).isNone(); - } - - inline bool val_is_not_none(const ValueRef idx) { - return !val_is_none(idx); - } - - inline TypeTag get_val_type(const ValueRef idx) { - return values_.at(idx).type(); - } - - // - // Tensor Properties Accessors - // - - std::vector sizes_of(const ValueRef idx) const; - - /* - * Returns the size of the tensor at `idx` along the specified dimension. - * Negative indexing is allowed. 
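 *
 * For example (illustrative sizes), for a tensor with sizes {1, 3, 224, 224}:
 *   size_at<int32_t>(-1, idx) == 224  // innermost (width) dim
 *   size_at<int32_t>(-3, idx) == 3    // channels dim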
- */ - template - T size_at(const int64_t dim, const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return static_cast(utils::val_at(dim, val.toConstTensor().sizes())); - } else if (val.isTensorRef()) { - return static_cast(utils::val_at(dim, val.toConstTensorRef().sizes)); - } - VK_THROW("Could not get sizes of value with type ", val.type()); - } - - int64_t dim_of(const ValueRef idx) const; - - std::vector dim_order_of(const ValueRef idx) const; - - std::vector strides_of(const ValueRef idx) const; - - vkapi::ScalarType dtype_of(const ValueRef idx) const; - - inline const utils::ivec3& logical_limits_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().logical_limits(); - } - - inline int32_t numel_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().numel(); - } - - inline size_t staging_buffer_numel_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().staging_buffer_numel(); - } - - inline utils::StorageType storage_type_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().storage_type(); - } - - inline bool is_buffer_storage(const ValueRef idx) const { - return values_.at(idx).toConstTensor().has_buffer_storage(); - } - - inline bool is_texture_storage(const ValueRef idx) const { - return !is_buffer_storage(idx); - } - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` has buffer storage - * 3. The buffer backed tensor at `idx` has a contiguous memory layout - */ - bool is_contiguous_buffer_tensor(const ValueRef idx) const; - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` has texture storage - * 3. The texture backed tensor at `idx` has a standard axis mapping - * 4. The texture backed tensor at `idx` is width packed - */ - bool is_contiguous_texture_tensor(const ValueRef idx) const; - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` has texture storage - * 3. The texture backed tensor at `idx` has a standard axis mapping - * 4. The texture backed tensor at `idx` is channels packed - */ - bool is_standard_channels_packed_texture_tensor(const ValueRef idx) const; - - /* - * Checks that the value at `idx` is either a 2D tensor, or if the tensor has - * more than 2 dims, the outermost dims have size of 1, i.e. can be squeezed - * to be a 2D tensor. - */ - bool is_2d_matrix(const ValueRef idx) const; - - /* - * Same as the above, but also requires that the tensor is a contiguous - * buffer with a width divisible by 4 or a standard width packed texture. - */ - bool is_vectorizable_contiguous_2d_matrix(const ValueRef idx) const; - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` is width packed - * 3. 
The tensor at `idx` has a standard axis mapping or is a contiguous - * buffer - */ - bool is_vectorizable_width_packed_tensor(const ValueRef idx) const; - - inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base) - const { - return values_.at(maybe_view) - .toConstTensor() - .is_view_of(values_.at(base).toConstTensor()); - } - - inline utils::GPUMemoryLayout estimate_memory_layout_of( - const ValueRef idx) const { - return values_.at(idx).toConstTensor().estimate_memory_layout(); - } - - inline int32_t hashed_layout_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().hashed_layout(); - } - - inline int32_t packed_dim_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().packed_dim(); - } - - inline int32_t concat_dim_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().concat_dim(); - } - - inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().sizes_ubo(); - } - - inline vkapi::BufferBindInfo buffer_meta_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().buffer_meta_ubo(); - } - - inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().strides_ubo(); - } - - inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().dim_order_ubo(); - } - - inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().numel_ubo(); - } - - inline bool has_standard_axis_map(const ValueRef idx) const { - return values_.at(idx).toTensor().has_standard_axis_map(); - } - - inline bool is_contiguous(const ValueRef idx) const { - return values_.at(idx).toTensor().is_contiguous(); - } - - inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().logical_limits_ubo(); - } - - inline PushConstantDataInfo sizes_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), - api::kTensorDimOrder); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), - api::kTensorStrides); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo logical_limits_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), - api::kTensorLogicalLimits); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo numel_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorNumel); - pc_data.set_value(idx); - return pc_data; - } - - // - // Scalar Value Extraction - // - - bool is_scalar_or_none(const ValueRef idx) const { - const Value& value = values_.at(idx); - return value.isInt() || value.isDouble() || value.isBool() || - value.isNone(); - } - - template - T extract_scalar(const ValueRef idx) { - Value& value = values_.at(idx); - if (value.isInt()) { - return static_cast(value.toInt()); - } - if (value.isDouble()) { - 
return static_cast(value.toDouble()); - } - if (value.isBool()) { - return static_cast(value.toBool()); - } - VK_THROW("Cannot extract scalar from Value with type ", value.type()); - } - - template - T extract_scalar_or(const ValueRef idx, const T default_value) { - Value& value = values_.at(idx); - if (value.isNone()) { - return default_value; - } - return extract_scalar(idx); - } - - template - std::optional extract_optional_scalar(const ValueRef idx) { - if (val_is_none(idx)) { - return ::std::nullopt; - } else if (val_is_symint(idx)) { - return utils::safe_downcast(read_symint(idx)); - } else { - return extract_scalar(idx); - } - } - - template - T extract_optional_scalar(const ValueRef idx, const T default_val) { - if (val_is_none(idx)) { - return default_val; - } else if (val_is_symint(idx)) { - return utils::safe_downcast(read_symint(idx)); - } else { - return extract_scalar(idx); - } - } - - std::string extract_string(const ValueRef idx) { - return values_.at(idx).toString(); - } - - /* - * Utility function to extract a list of integers from a ValueRef. - * If the ValueRef is an IntList, returns a copy of the list. - * If the ValueRef is a ValueList, extracts each element as an Int or SymInt - * and returns the resulting list. - * Throws an error if the ValueRef is neither an IntList nor a ValueList. - */ - std::vector extract_int_or_symint_list(const ValueRef idx); - - template < - typename T, - typename std::enable_if< - std::is_integral::value && std::is_signed::value, - int>::type = 0> - T extract_whcn_dim(const ValueRef idx, const int64_t ndim) { - T dim = extract_scalar(idx); - // Normalize dim to account for negative indexing - dim = (dim % ndim + ndim) % ndim; - // Assume original value is NCHW ordering, obtain the WHCN ordering - return ndim - 1 - dim; - } - - // - // Utility functions - // - - /* - * Returns a suggested storage type (i.e. buffer or texture) that can be used - * to construct `api::vTensor`s. The storage type is typically determined by - * the GPU reported by the Vulkan context, unless a storage type override is - * defined in the graph configuration. Some GPU architectures work better with - * buffer storage, and others with texture storage. Current only texture - * storage is supported. - */ - utils::StorageType suggested_storage_type(); - - /* - * Returns a suggested memory layout (i.e. channels, width, or height packed) - * that can be used to construct `api::vTensor`s. The memory layout impacts - * which dimension will be treated as the vectorized dimension. For texture - * storage, elements along the vectorized dimension are packed into texels. - * The suggested memory layout is determined based on the sizes of the tensor, - * unless a memory layout override is defined in the graph configuration. - */ - utils::GPUMemoryLayout suggested_memory_layout( - const std::vector& sizes); - - inline bool device_is_adreno() { - return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO; - } - const std::string& device_name() { - return context()->adapter_ptr()->device_name(); - } - - bool device_name_contains(const char* substr); - - // - // Graph Building - // - - private: - void check_no_active_value_ptrs(); - - public: - /* - * Add a `api::vTensor` value to the graph with the specified properties. - * There are various convenience overloads of this function that may be used - * instead. 
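 *
 * e.g. (hypothetical sizes and settings):
 *   ValueRef t = graph.add_tensor(
 *       {1, 64, 32, 32}, vkapi::ScalarType::Float,
 *       utils::kTexture3D, utils::kChannelsPacked);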
- */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified properties. The - * suggested memory layout will be used to construct the `api::vTensor`. - */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified properties. The - * suggested storage type will be used to construct the `api::vTensor`. - */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified properties. The - * suggested storage type and memory layout will be used to construct the - * `api::vTensor`. - */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified image. - */ - ValueRef add_tensor(const vkapi::VulkanImage& image); - - /* - * Add a `api::vTensor` value to the graph with the properties of `vref`. - */ - ValueRef add_tensor_like( - const ValueRef vref, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the properties of `vref`. The - * suggested storage type will be used to construct the `api::vTensor`. - */ - ValueRef add_tensor_like( - const ValueRef vref, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Use the copy constructor of `api::vTensor` to create a "view" of the - * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for - * more details. - */ - ValueRef add_tensor_view(const ValueRef vref); - - /* - * Use the copy constructor of `api::vTensor` to create a "view" of the - * `vTensor` value at `vref` with different sizes and dim order. See the copy - * constructor of `api::vTensor` for more details. - */ - ValueRef add_tensor_view( - const ValueRef vref, - const std::vector& sizes, - const std::vector& dim_order); - - /* - * Add a `TensorRef` value to the graph with the specific properties. A - * `TensorRef` is a reference to a `api::vTensor` whose data is stored in an - * external CPU buffer. - */ - ValueRef add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const void* const data); - - /* - * Add a `TensorRef` value to the graph with the specific properties. A - * `TensorRef` is a reference to a `api::vTensor` whose data is stored in a - * FreeableBuffer. The TensorRef will take ownership of the FreeableBuffer. - */ - ValueRef add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - executorch::runtime::FreeableBuffer&& buffer); - - /* - * Add a staging buffer to the graph. 
Staging buffers are data buffers that - * use memory that is visible to both the CPU and GPU, and therefore is used - * as a intermediary when transferring data between the CPU and GPU. - */ - ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel); - - ValueRef add_none(); - - template - typename std::enable_if::value, ValueRef>::type - add_scalar(T value); - - template - typename std::enable_if::value, ValueRef>::type - add_scalar_list(std::vector&& value); - - ValueRef add_value_list(std::vector&& value); - - ValueRef add_string(std::string&& str); - - ValueRef add_symint(const int32_t val); - - /* - * Searches the graph's value list for a Int value with the specified value. - * If one is found, returns the index of the value. Otherwise, add a new value - * and return the index of the new value. - */ - ValueRef get_or_add_value_for_int(const int64_t val); - - ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); - ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); - - ValueRef set_output_value(const ValueRef idx); - - template - vkapi::BufferBindInfo create_params_buffer(const Block& data) { - param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); - return vkapi::BufferBindInfo(param_ubos_.back().buffer()); - } - - /* - * Given a ValueRef, do the following depending on the type of the Value: - * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object - * backing the SymInt. - * - If it is a regular Int, create a new ParamsBuffer using the integer value - * and return the BufferBindInfo of the created ParamsBuffer. - */ - vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); - - vkapi::BufferBindInfo get_or_create_int_param_buffer( - const ValueRef idx, - const int32_t default_value); - - void set_symint(const ValueRef idx, const int32_t val); - - int32_t read_symint(const ValueRef idx); - - inline void set_val_as_input(const ValueRef idx) { - inputs_.push_back({idx, kDummyValueRef}); - } - - inline void set_val_as_output(const ValueRef idx) { - outputs_.push_back({idx, kDummyValueRef}); - } - - /* - * Convenience function to add an input tensor along with its staging buffer - */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor(sizes, dtype, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - /* - * Convenience function to add an input tensor with a specific memory layout - * along with its staging buffer - */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor(sizes, dtype, memory_layout, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - /* - * Convenience function to add an input tensor with a specific storage type - * along with its staging buffer - */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor(sizes, dtype, storage_type, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - /* - * Add an input tensor with the specified properties along with its staging - * buffer. 
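 *
 * A hypothetical usage sketch (the sizes and dtype are assumed values):
 *
 *   IOValueRef in = graph.add_input_tensor(
 *       {1, 3, 224, 224}, vkapi::ScalarType::Float,
 *       utils::kTexture3D, utils::kWidthPacked);
 *   // in.value is the GPU tensor; in.staging is the CPU-visible staging
 *   // buffer that copy_into_staging() fills before execute().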
- */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor( - sizes, dtype, storage_type, memory_layout, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - SharedObject& get_shared_object(const int64_t idx); - - /* - * Creates a dedicated memory allocation for a vTensor value, and have the - * tensor acquire the allocation object. If the tensor is already bound to a - * memory allocation, this function will be a no-op. - */ - void create_dedicated_allocation_for(const ValueRef idx); - - // - // Graph Preparation - // - - void update_descriptor_counts( - const vkapi::ShaderInfo& shader_info, - bool execute); - - void register_pipeline_to_create( - const vkapi::ShaderInfo& shader_info, - const utils::WorkgroupSize& local_workgroup_size, - const vkapi::SpecVarList& spec_vars, - const std::vector& push_constants); - - void prepare(); - - void prepare_pipelines(); - - // - // Dispatch Utilities - // - - /* - * Create a global workgroup size for a given `api::vTensor` value assuming - * that every shader invocation calculates one texel element of the output - * tensor. - * - * For tensors that use texture storage, the image extents of the - * `api::vTensor` will be used to set the global workgroup size. - * - * For tensor that use buffer storage, the number of texels in the texel - * buffer will be used to set the x component of the global workgroup size. - * All other components will be set to 1 (i.e. {ntexels, 1, 1} will be - * returned). - */ - utils::uvec3 create_global_wg_size(const ValueRef idx); - - /* - * Suggest a local workgroup size for a given global workgroup size. - * - * The local workgroup size will be formed to try and minimize the number of - * inactive invocations. - * - * Currently, the local workgroup size is hard-coded to contain a total of 64 - * shader invocations. In the future, this value can be configured. - */ - utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size); - - /* - * Convenience function to suggest a local workgroup size for a given - * `api::vTensor` value, assuming that every shader invocation calculates one - * texel element of the output tensor. - */ - utils::uvec3 create_local_wg_size(const ValueRef idx); - - void bind_tensor_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags accessType, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx); - - void bind_value_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags access_type, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx); - - // - // Input/Output - // - - void - copy_into_staging(const ValueRef idx, const void* data, const size_t numel); - void copy_from_staging(const ValueRef idx, void* data, const size_t numel); - - protected: - // Command Buffer Management - - /* - * Submits the current command buffer in the Context to the GPU for execution. - */ - void submit_current_cmd(const bool final_use = false); - - /* - * Submits the current command buffer in the Context to the GPU for execution, - * and wait for it to complete before returning. - */ - void submit_current_cmd_and_wait(const bool final_use = false); - - /* - * Submit one command buffer to the GPU. 
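 * Passing VK_NULL_HANDLE as `fence` submits without anything to wait on;
 * submit_deferred_cmds_and_wait() attaches a fence only to the last deferred
 * command buffer for this reason.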
- */ - void submit_cmd(vkapi::CommandBuffer& cmd_buf, VkFence fence); - - /* - * Submits all the commands gathered in deferred_cmd_bufs_ to the GPU. - */ - void submit_deferred_cmds_and_wait(); - - /* - * Ends and invalidates all deferred commands. - */ - void clear_deferred_cmds(); - - public: - // - // Graph Prepacking - // - - inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) { - staging_nbytes_in_cmd_ += staging_bytes; - } - - /* - * Executes prepacking operations to transfer model weight data from the CPU - * to GPU. - */ - void prepack(); - - // - // Graph Execution - // - - void execute(); - - // - // Tensor View - // - - void virtual_clone(const ValueRef dst, const ValueRef src); - - void virtual_transpose( - const ValueRef tensor, - const int64_t dim0, - const int64_t dim1); - - // - // Dynamic Shape support - // - - void resize_input(const int64_t idx, const std::vector& new_sizes); - - void virtual_resize( - const ValueRef idx, - const std::vector& new_sizes); - - void propagate_resize(); - - // Check if a specific ValueRef (or ValueList) was updated, with recursive - // handling - bool was_value_updated(const ValueRef idx) const noexcept; - - // Set the flag to indicate that re-encoding is required - inline void set_requires_reencode() noexcept { - requires_reencode_ = true; - } - - // - // Miscellaneous Utilities - // - - inline bool int16_shader_types_enabled() const { - return context_->adapter_ptr()->supports_int16_shader_types(); - } - - inline size_t execute_count() const { - return execute_count_; - } - - inline bool can_use_int8_dot_product() const { - return can_use_int8_dot_product_; - } - - /* - * Check whether the GPU supports 8 bit buffers. - */ - inline bool int8_buffers_enabled() const { - return context_->adapter_ptr()->has_full_int8_buffers_support(); - } - - // - // Debug support (implemented in Logging.cpp) - // - - void print_readable(); - - // - // Friend classes - // - - friend class vTensorPtr; - friend class TensorRefPtr; - friend class StagingPtr; - friend class IntListPtr; - friend class DoubleListPtr; - friend class BoolListPtr; - friend class ValueListPtr; - friend class SymIntPtr; - - friend struct TmpTensor; - friend struct SharedObject; - friend class BlitNode; -}; - -template -inline typename std::enable_if::value, ValueRef>::type -ComputeGraph::add_scalar(T value) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(value); - return idx; -} - -template -inline typename std::enable_if::value, ValueRef>::type -ComputeGraph::add_scalar_list(std::vector&& value) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(std::move(value)); - return idx; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp deleted file mode 100644 index da5efbf8342..00000000000 --- a/backends/vulkan/runtime/graph/GraphConfig.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -GraphConfig::GraphConfig() { - // No automatic submissions - const uint32_t cmd_submit_frequency = UINT32_MAX; - - // Only one command buffer will be encoded at a time - const vkapi::CommandPoolConfig cmd_config{ - 1u, // cmd_pool_initial_size - 1u, // cmd_pool_batch_size - }; - - // Use lazy descriptor pool initialization by default; the graph runtime will - // tally up the number of descriptor sets needed while building the graph and - // trigger descriptor pool initialization with exact sizes before encoding the - // command buffer. - const vkapi::DescriptorPoolConfig descriptor_pool_config{ - 0u, // descriptor_pool_max_sets - 0u, // descriptor_uniform_buffer_count - 0u, // descriptor_storage_buffer_count - 0u, // descriptor_combined_sampler_count - 0u, // descriptor_storage_image_count - 0u, // descriptor_pile_sizes - }; - - const vkapi::QueryPoolConfig query_pool_config{}; - - context_config = { - cmd_submit_frequency, - cmd_config, - descriptor_pool_config, - query_pool_config, - }; - - // Empirically selected safety factor. If descriptor pools start running out - // of memory, increase this safety factor. - descriptor_pool_safety_factor = 1.25; - - // For now, force kTexture3D storage as we are still developing shader support - // for buffer storage type. - enable_storage_type_override = true; - storage_type_override = utils::kTexture3D; - - // For now, force kWidthPacked memory layout by default as we are still - // developing support for other memory layouts. In the future memory layout - // settings will be serialized as part of the graph. - enable_memory_layout_override = true; - memory_layout_override = utils::kWidthPacked; - - // QueryPool objects are used to measure execution times of individual shader - // dispatches. By default, this functionality is disabled. - enable_querypool = false; - - enable_local_wg_size_override = false; - local_wg_size_override = {}; - - expect_dynamic_shapes = false; - - external_adapter = nullptr; -} - -void GraphConfig::set_storage_type_override(utils::StorageType storage_type) { - enable_storage_type_override = true; - storage_type_override = storage_type; -} - -void GraphConfig::set_memory_layout_override( - utils::GPUMemoryLayout memory_layout) { - enable_memory_layout_override = true; - memory_layout_override = memory_layout; -} - -void GraphConfig::set_local_wg_size_override( - const utils::uvec3& local_wg_size) { - enable_local_wg_size_override = true; - local_wg_size_override = local_wg_size; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h deleted file mode 100644 index aa5cd8f8c4e..00000000000 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -struct GraphConfig final { - api::ContextConfig context_config; - - // Creating a descriptor pool with exactly the number of descriptors tallied - // by iterating through the shader layouts of shaders used in the graph risks - // the descriptor pool running out of memory, therefore apply a safety factor - // to descriptor counts when creating the descriptor pool to mitigate this - // risk. 
- float descriptor_pool_safety_factor; - - bool enable_storage_type_override; - utils::StorageType storage_type_override; - - bool enable_memory_layout_override; - utils::GPUMemoryLayout memory_layout_override; - - bool enable_querypool; - - bool enable_local_wg_size_override; - utils::uvec3 local_wg_size_override; - - // Whether or not the ComputeGraph should expect input shapes to be dynamic - bool expect_dynamic_shapes; - - // Execution properties that determine specifics re: how command buffer - // submission is handled, etc. 0 means this field is not set. - - // During prepacking, once this threshold is reached, submit the current - // command buffer for execution. This allows the work to be distributed over - // multiple command buffer submissions, which can improve model load - // performance and prevent crashes when loading large models. - size_t prepack_threshold_nbytes = 0; - // Threshold used for the first command buffer submission during prepacking. - // This can be set to be lower than prepack_submission_threshold_nbytes to - // submit a command buffer for execution earlier which can improve performance - // by taking more advantage of parallelism between the CPU and GPU. - size_t prepack_initial_threshold_nbytes = 0; - - // During execute, once this node count is reached, submit the current - // command buffer for execution. This allows the work to be distributed over - // multiple command buffer submissions, which can improve execution - // performance. - size_t execute_threshold_node_count = 0; - // Execute node count used for the first command buffer submission during - // execute. This can be set to be lower than execute_threshold_nbytes to - // submit a command buffer for execution earlier which can improve performance - // by taking more advantage of parallelism between the CPU and GPU. - size_t execute_initial_threshold_node_count = 0; - - // If this number is greater than 0 then, during execute create at most this - // many command buffers. - size_t execute_max_cmds = 0; - - vkapi::Adapter* external_adapter; - - // Generate a default graph config with pre-configured settings - explicit GraphConfig(); - - void set_storage_type_override(utils::StorageType storage_type); - void set_memory_layout_override(utils::GPUMemoryLayout memory_layout); - void set_local_wg_size_override(const utils::uvec3& local_wg_size); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp deleted file mode 100644 index 081083e3a63..00000000000 --- a/backends/vulkan/runtime/graph/Logging.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
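As a rough sketch of how the fields and setters above fit together, the function below builds a `GraphConfig`, overrides the storage type and memory layout, and opts into split command buffer submissions during prepacking. The include path and the specific threshold values are assumptions chosen for illustration, not recommended defaults.

```cpp
#include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>  // assumed path

vkcompute::GraphConfig make_config() {
  vkcompute::GraphConfig config;  // starts from the defaults set in GraphConfig.cpp

  // Force buffer storage and a specific memory layout instead of the defaults.
  config.set_storage_type_override(vkcompute::utils::kBuffer);
  config.set_memory_layout_override(vkcompute::utils::kWidthPacked);

  // Split prepacking into multiple command buffer submissions: submit after
  // roughly 64 MB of staging data, with an earlier first submission at 16 MB.
  config.prepack_threshold_nbytes = 64u * 1024u * 1024u;
  config.prepack_initial_threshold_nbytes = 16u * 1024u * 1024u;

  // Expect dynamic input shapes and enable per-dispatch timing queries.
  config.expect_dynamic_shapes = true;
  config.enable_querypool = true;

  return config;
}
```

A lower initial threshold trades a little submission overhead for earlier CPU/GPU overlap, which is the rationale stated in the field comments above.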
- */ - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void ComputeGraph::print_readable() { - std::set input_set; - for (const IOValueRef& io_val : inputs()) { - input_set.insert(io_val.value); - } - - std::set output_set; - for (const IOValueRef& io_val : outputs()) { - output_set.insert(io_val.value); - } - - std::set prepack_set; - for (const std::unique_ptr& node : prepack_nodes()) { - prepack_set.insert(node->tref_); - prepack_set.insert(node->packed_); - } - - std::map value_ref_to_shared_object_idx; - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Shared Object List " << std::right - << std::setfill(' ') << std::endl; - - std::cout << std::setw(6) << "idx" << std::setw(20) << "sizes" - << std::setw(24) << "users" << std::endl; - - size_t so_idx = 0; - for (const SharedObject& shared_object : shared_objects_) { - std::cout << std::setw(6) << so_idx; - { - std::stringstream ss; - ss << shared_object.aggregate_memory_requirements.size; - std::cout << std::setw(20) << ss.str(); - } - - { - std::stringstream ss; - ss << shared_object.users; - std::cout << std::setw(24) << ss.str(); - } - std::cout << std::endl; - - for (const ValueRef& user : shared_object.users) { - value_ref_to_shared_object_idx[user] = so_idx; - } - - so_idx++; - } - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Value List " << std::right - << std::setfill(' ') << std::endl; - - std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) - << "sizes" << std::setw(10) << "node_type" << std::setw(15) - << "storage_bytes" << std::setw(10) << "so_idx" << std::endl; - - size_t value_idx = 0; - for (Value& val : values_) { - std::cout << std::setw(6) << value_idx << std::setw(10) << val.type(); - - // sizes - std::cout << std::setw(20); - if (val.isTensor()) { - const api::vTensor& v_tensor = val.toTensor(); - std::stringstream ss; - ss << v_tensor.sizes(); - std::cout << ss.str(); - } else if (val.isTensorRef()) { - const TensorRef& tensor_ref = val.toTensorRef(); - std::stringstream ss; - ss << tensor_ref.sizes; - std::cout << ss.str(); - } else { - std::cout << ""; - } - - // Node type - std::cout << std::setw(10); - { - if (input_set.count(value_idx) > 0) { - std::cout << "INPUT"; - } else if (output_set.count(value_idx) > 0) { - std::cout << "OUTPUT"; - } else if (prepack_set.count(value_idx) > 0) { - std::cout << "PREPACK"; - } else { - std::cout << ""; - } - } - - // Actual storage bytes used - std::cout << std::setw(15); - if (val.isTensor()) { - const api::vTensor& v_tensor = val.toTensor(); - auto memory_reqs = v_tensor.get_memory_requirements(); - std::cout << memory_reqs.size; - } else { - std::cout << ""; - } - - std::cout << std::setw(10); - if (value_ref_to_shared_object_idx.count(value_idx) > 0) { - size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); - std::cout << shared_obj_idx; - } else { - std::cout << ""; - } - - std::cout << std::endl; - value_idx++; - } - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Prepack Node List " << std::right - << std::setfill(' ') << std::endl; - std::cout << std::setw(6) << "idx" << std::setw(32) << "shader_name" - << std::setw(8) << "tref" << std::setw(8) << "packed" << std::endl; - - size_t prepack_node_idx = 0; - for (const std::unique_ptr& node : prepack_nodes()) { - std::cout << std::setw(6) << prepack_node_idx << std::setw(32) - << 
node->shader_.kernel_name << std::setw(8) << node->tref_ - << std::setw(8) << node->packed_ << std::endl; - - prepack_node_idx++; - } - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Execute Node List " << std::right - << std::setfill(' ') << std::endl; - - std::cout << std::setw(6) << "idx" << std::setw(32) << "shader_name" - << std::setw(24) << "READ_arg" << std::setw(24) << "WRITE_arg" - << std::endl; - - size_t node_idx = 0; - for (const std::unique_ptr& node : execute_nodes()) { - std::cout << std::setw(6) << node_idx; - std::cout << std::setw(32) << node->name(); - - std::stringstream read_s; - for (const ArgGroup& arg_group : node->args_) { - if (arg_group.access != vkapi::MemoryAccessType::READ) { - continue; - } - read_s << arg_group.refs; - } - std::cout << std::setw(24) << read_s.str(); - - std::stringstream write_s; - for (const ArgGroup& arg_group : node->args_) { - if (arg_group.access != vkapi::MemoryAccessType::WRITE) { - continue; - } - write_s << arg_group.refs; - } - std::cout << std::setw(24) << write_s.str(); - - std::cout << std::endl; - - node_idx++; - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h deleted file mode 100644 index fb2f66e2d6f..00000000000 --- a/backends/vulkan/runtime/graph/Logging.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include - -namespace vkcompute { - -template -inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) { - os << '['; - for (const auto& elem : vec) { - os << elem << ','; - } - os << ']'; - return os; // Return the ostream to allow chaining -} - -inline std::ostream& operator<<(std::ostream& os, const utils::uvec3& v) { - return utils::operator<<(os, v); -} - -inline std::ostream& operator<<(std::ostream& os, const utils::uvec4& v) { - return utils::operator<<(os, v); -} - -inline std::ostream& operator<<(std::ostream& os, const utils::ivec3& v) { - return utils::operator<<(os, v); -} - -inline std::ostream& operator<<(std::ostream& os, const utils::ivec4& v) { - return utils::operator<<(os, v); -} - -template -inline std::ostream& operator<<(std::ostream& os, const std::optional& opt) { - os << "["; - if (opt) { - os << opt.value(); - } - os << "]"; - return os; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Constant.cpp b/backends/vulkan/runtime/graph/containers/Constant.cpp deleted file mode 100644 index 4dc2cdda8f5..00000000000 --- a/backends/vulkan/runtime/graph/containers/Constant.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -TensorRef::TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - const void* const t_data) - : sizes{}, dtype{t_dtype}, data{t_data}, buffer{} { - size_t ndim = t_sizes.size(); - sizes.resize(ndim); - for (int i = 0; i < ndim; ++i) { - sizes[i] = t_sizes.at(i); - } -} - -TensorRef::TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - executorch::runtime::FreeableBuffer&& t_buffer) - : sizes{}, - dtype{t_dtype}, - data{t_buffer.data()}, - buffer{std::move(t_buffer)} { - size_t ndim = t_sizes.size(); - sizes.resize(ndim); - for (int i = 0; i < ndim; ++i) { - sizes[i] = t_sizes.at(i); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h deleted file mode 100644 index a18c284a219..00000000000 --- a/backends/vulkan/runtime/graph/containers/Constant.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/* - * Represents a reference to a tensor that has been - * serialized with the model, such as a serialized weight - * tensor. It contains some metadata as well as a raw - * pointer to the data of the tensor, which is assumed to - * be contiguous. - */ -struct TensorRef final { - std::vector sizes; - vkapi::ScalarType dtype; - const void* data; - - // Optional FreeableBuffer for managing memory lifecycle - // This will be empty (default constructed) for the raw pointer constructor - executorch::runtime::FreeableBuffer buffer; - - explicit TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - const void* const t_data); - - // Constructor that takes ownership of a FreeableBuffer - explicit TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - executorch::runtime::FreeableBuffer&& t_buffer); - - inline size_t nbytes() const { - return utils::multiply_integers(sizes) * vkapi::element_size(dtype); - } - - // Manually free the buffer if needed (though it will be freed automatically - // on destruction) - void free_buffer() { - buffer.Free(); - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/PushConstantData.cpp b/backends/vulkan/runtime/graph/containers/PushConstantData.cpp deleted file mode 100644 index 7999118443b..00000000000 --- a/backends/vulkan/runtime/graph/containers/PushConstantData.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
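A small usage sketch for the `TensorRef` container above, showing the raw-pointer constructor and the `nbytes()` arithmetic (product of sizes times element size). The include path and the `vkapi::kFloat` dtype constant name are assumptions here.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/Constant.h>  // assumed path
#include <cassert>
#include <cstdint>
#include <vector>

void tensor_ref_example() {
  // A 2 x 3 x 4 float tensor serialized with the model.
  std::vector<int64_t> sizes = {2, 3, 4};
  std::vector<float> weights(2 * 3 * 4, 0.5f);

  vkcompute::TensorRef tref(sizes, vkcompute::vkapi::kFloat, weights.data());

  // nbytes() = product of sizes * element size = 24 * 4 = 96 bytes.
  assert(tref.nbytes() == 96);
}
```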
- */ - -#include - -namespace vkcompute { - -uint32_t PushConstantDataInfo::write( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size) const { - if (tensorUniformData != nullptr) { - return tensorUniformData->write_attribute( - dst, dst_offset, max_dst_size, payload_.attr); - } - - VK_CHECK_COND( - (dst_offset + payload_.dataSize) <= max_dst_size, - "Attempting to write push constant data outside data boundary."); - memcpy((uint8_t*)dst + dst_offset, payload_.data, payload_.dataSize); - return payload_.dataSize; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/PushConstantData.h b/backends/vulkan/runtime/graph/containers/PushConstantData.h deleted file mode 100644 index c86232983ea..00000000000 --- a/backends/vulkan/runtime/graph/containers/PushConstantData.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -constexpr uint32_t kMaxPushConstantSize = 128; -/* - * Represents a push constant data entry - * Which is either shared pointer to a tensor's uniform data with an attribute - * Or data with a maximum size of 16 bytes - */ -class PushConstantDataInfo { - std::shared_ptr tensorUniformData; - union Payload { - struct { - api::vTensor::Attribute attr; - }; - struct { - uint8_t data[16]; - uint32_t dataSize; - }; - }; - - Payload payload_; - // The value in a compute graph that this push constant data is associated - // with, if any. - ValueRef value_ = kDummyValueRef; - - public: - explicit PushConstantDataInfo( - const std::shared_ptr& tensorUniformData, - api::vTensor::Attribute attr) - : tensorUniformData(tensorUniformData) { - payload_.attr = attr; - } - - explicit PushConstantDataInfo( - const void* data, - uint32_t dataLen, - uint32_t pushConstantLen = 0) - : tensorUniformData(nullptr) { - VK_CHECK_COND( - dataLen <= 16, "Single push constant data size must be <= 16 bytes"); - payload_.dataSize = pushConstantLen ? pushConstantLen : dataLen; - memcpy(payload_.data, data, dataLen); - } - - /* - * Function writes push constant data to the destination buffer - */ - uint32_t write( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size) const; - - inline bool is_tensor_metadata() const noexcept { - return tensorUniformData != nullptr; - } - - inline void set_value(ValueRef value) noexcept { - value_ = value; - } - - inline ValueRef value() const noexcept { - return value_; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp deleted file mode 100644 index 10ddd6f2ca3..00000000000 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
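The sketch below shows how `PushConstantDataInfo` entries carrying raw data are packed back-to-back into a fixed 128-byte block, mirroring the loop that `DispatchNode::write_push_constant_data` runs further down in this diff. The include path is assumed.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/PushConstantData.h>  // assumed path
#include <array>
#include <cstdint>
#include <vector>

void pack_push_constants() {
  using vkcompute::kMaxPushConstantSize;
  using vkcompute::PushConstantDataInfo;

  const int32_t some_flag = 1;
  const float scale = 0.125f;

  // Each entry may carry at most 16 bytes of raw data.
  std::vector<PushConstantDataInfo> push_constants = {
      PushConstantDataInfo(&some_flag, sizeof(some_flag)),
      PushConstantDataInfo(&scale, sizeof(scale)),
  };

  // Fixed-size block later supplied to the shader dispatch; 128 bytes at most.
  std::array<uint8_t, kMaxPushConstantSize> data{};
  uint32_t offset = 0;
  for (const auto& pc : push_constants) {
    offset += pc.write(data.data(), offset, kMaxPushConstantSize);
  }
  // offset is now 8: two 4-byte entries written back to back.
}
```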
- */ - -#include - -#include - -namespace vkcompute { - -bool SharedObject::has_user(const ValueRef idx) const { - return std::find(users.begin(), users.end(), idx) != users.end(); -} - -void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { - vTensorPtr t = graph->get_tensor(idx); - - // Aggregate Memory Requirements - const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); - aggregate_memory_requirements.size = - std::max(mem_reqs.size, aggregate_memory_requirements.size); - aggregate_memory_requirements.alignment = - std::max(mem_reqs.alignment, aggregate_memory_requirements.alignment); - aggregate_memory_requirements.memoryTypeBits |= mem_reqs.memoryTypeBits; - - users.emplace_back(idx); -} - -void SharedObject::allocate(ComputeGraph* const graph) { - if (aggregate_memory_requirements.size == 0) { - return; - } - - VmaAllocationCreateInfo alloc_create_info = - graph->context()->adapter_ptr()->vma().gpuonly_resource_create_info(); - - allocation = graph->context()->adapter_ptr()->vma().create_allocation( - aggregate_memory_requirements, alloc_create_info); -} - -void SharedObject::bind_users(ComputeGraph* const graph) { - if (users.empty()) { - return; - } - for (const ValueRef idx : users) { - graph->get_tensor(idx)->bind_allocation(allocation); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h deleted file mode 100644 index f9b16e6c202..00000000000 --- a/backends/vulkan/runtime/graph/containers/SharedObject.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -struct SharedObject { - friend class ComputeGraph; - - explicit SharedObject() = default; - - VkMemoryRequirements aggregate_memory_requirements; - std::vector users; - vkapi::Allocation allocation; - - bool has_user(const ValueRef idx) const; - void add_user(ComputeGraph* const graph, const ValueRef idx); - void allocate(ComputeGraph* const graph); - void bind_users(ComputeGraph* const graph); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp deleted file mode 100644 index a59a2d40141..00000000000 --- a/backends/vulkan/runtime/graph/containers/SymInt.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
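To make the aggregation rule in `SharedObject::add_user` concrete, the standalone snippet below applies the same max/OR arithmetic to two hypothetical sets of memory requirements. It uses a plain struct rather than `VkMemoryRequirements` so it compiles without the Vulkan headers; it is an illustration, not part of the runtime.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

struct MemReqs {  // mirrors the VkMemoryRequirements fields used above
  uint64_t size;
  uint64_t alignment;
  uint32_t memoryTypeBits;
};

int main() {
  MemReqs aggregate{0, 0, 0};
  const MemReqs users[] = {
      {4096, 256, 0b0011},   // tensor A
      {10240, 1024, 0b0110}, // tensor B
  };
  // Same rule as add_user: max the size and alignment, OR the type bits.
  for (const MemReqs& r : users) {
    aggregate.size = std::max(aggregate.size, r.size);
    aggregate.alignment = std::max(aggregate.alignment, r.alignment);
    aggregate.memoryTypeBits |= r.memoryTypeBits;
  }
  // -> size 10240, alignment 1024, memoryTypeBits 0b0111
  std::cout << aggregate.size << " " << aggregate.alignment << " "
            << aggregate.memoryTypeBits << "\n";
  return 0;
}
```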
- */ - -#include - -namespace vkcompute { - -SymInt::SymInt(api::Context* context_p, const int32_t val) - : gpu_buffer(context_p, val){}; - -void SymInt::set(const int32_t val) { - gpu_buffer.update(val); -} - -int32_t SymInt::get() { - return gpu_buffer.read(); -} - -void SymInt::operator=(const int32_t val) { - gpu_buffer.update(val); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h deleted file mode 100644 index bd361aabe5a..00000000000 --- a/backends/vulkan/runtime/graph/containers/SymInt.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/* - * Represents a symbolic integer whose value can be variable. It is implemented - * as a thin wrapper around a `ParamsBuffer` object that holds the value of the - * integer. The `ParamsBuffer` object allows the value of the symbolic integer - * to be changed from the CPU and have those changes be visible to all shaders - * that use the symbolic integer; it also allows the value of the symbolic - * integer to be the result of a compute shader. - * - * Regular scalar types represented by `TypeTag::INT` cannot be used for - * symbolic integers because their value is assumed to be constant; therefore - * the `Value` instance holding the value of the scalar does not contain - * any reference to the GPU buffers used to pass its value into compute shaders. - * Therefore, updating the value of the scalar does not impact the value seen - * by compute shaders. - */ -struct SymInt final { - api::ParamsBuffer gpu_buffer; - - explicit SymInt(api::Context* context_p, const int32_t val); - - void set(const int32_t val); - - int32_t get(); - - void operator=(const int32_t val); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp deleted file mode 100644 index e7a8951a552..00000000000 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -#define PRINT_CASE(name) \ - case TypeTag::name: \ - out << #name; \ - break; - -std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { - switch (tag) { - PRINT_CASE(NONE) - PRINT_CASE(INT) - PRINT_CASE(DOUBLE) - PRINT_CASE(BOOL) - PRINT_CASE(TENSOR) - PRINT_CASE(STAGING) - PRINT_CASE(TENSORREF) - PRINT_CASE(INTLIST) - PRINT_CASE(DOUBLELIST) - PRINT_CASE(BOOLLIST) - PRINT_CASE(VALUELIST) - PRINT_CASE(STRING) - PRINT_CASE(SYMINT) - } - return out; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h deleted file mode 100644 index 48232179e06..00000000000 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * Copyright 2025 Arm Limited and/or its affiliates. 
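A short usage sketch for `SymInt` as declared above: because the value lives in a GPU `ParamsBuffer`, updating it from the CPU is visible to shaders that bind that buffer, which is exactly what a plain `TypeTag::INT` scalar cannot do. The include path is assumed.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/SymInt.h>  // assumed path
#include <cstdint>

void symint_example(vkcompute::api::Context* context) {
  vkcompute::SymInt seq_len(context, /*val=*/128);

  // ... encode shaders that bind seq_len's ParamsBuffer ...

  // Later, e.g. when the input sequence length changes:
  seq_len.set(256);                  // update the GPU buffer from the CPU
  int32_t current = seq_len.get();   // read it back (256)
  (void)current;
}
```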
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/* - * This class is modelled after c10::IValue; however, it - * is simplified and does not support as many types. - * However, the core design is the same; it is a tagged - * union over the types supported by the Vulkan Graph - * type. - */ -enum class TypeTag : uint32_t { - NONE, - // Scalar types - INT, - DOUBLE, - BOOL, - // Tensor and tensor adjacent types - TENSOR, - STAGING, - TENSORREF, - // Scalar lists - INTLIST, - DOUBLELIST, - BOOLLIST, - // Special Type - VALUELIST, - STRING, - SYMINT, -}; - -std::ostream& operator<<(std::ostream& out, const TypeTag& tag); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h deleted file mode 100644 index b73684307b2..00000000000 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include -#include - -namespace vkcompute { - -using ValueRef = int32_t; - -constexpr ValueRef kDummyValueRef = -1; - -inline bool is_valid(ValueRef value_ref) { - return value_ref >= 0; -} - -struct IOValueRef { - ValueRef value; - ValueRef staging; - - // Custom cast to ValueRef - operator ValueRef() const { - return value; - }; -}; - -/* - * This class is modelled after c10::IValue; however, it is simplified and does - * not support as many types. However, the core design is the same; it is a - * tagged union over the types supported by the Vulkan Graph type. - */ -struct Value final { - private: - /* - * The union type which is used to store the value of the Value. - */ - union Payload { - /* - * Similar to IValue::Payload, trivially copyable types are nested in their - * own union. - */ - union TriviallyCopyablePayload { - TriviallyCopyablePayload() : as_int(0) {} - int64_t as_int; - double as_double; - bool as_bool; - } u; - - std::unique_ptr as_tensor; - std::unique_ptr as_staging; - TensorRef as_tensorref; - - std::vector as_int_list; - std::vector as_double_list; - std::vector as_bool_list; - - // The below is a special type that is used to represent a list of other - // values stored in the graph. One application of the type is to represent - // a list of tensors or a list of optional tensors. - std::vector as_value_list; - - std::string as_string; - - std::unique_ptr as_symint; - - Payload() : u() {} - // NOLINTNEXTLINE - ~Payload(){}; - }; - - public: - // - // Copy constructor and assignment (disabled) - // - - Value(const Value& rhs) = delete; - Value& operator=(const Value&) = delete; - - // - // Move constructor and assignment; Move assignment is disabled but - // construction is implemented to allow for use in container types. 
- // - - Value& operator=(Value&&) = delete; - -#define CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(type_tag, member_name) \ - case type_tag: \ - payload.u.member_name = rhs.payload.u.member_name; \ - break; - -#define CASE_MOVE_MOVEABLE_TYPE(type_tag, type, member_name, dtor_name) \ - case type_tag: \ - new (&payload.member_name) type(std::move(rhs.payload.member_name)); \ - rhs.payload.member_name.~dtor_name(); \ - break; - -#define CASE_MOVE_UNIQUE_PTR_TYPE(type_tag, member_name) \ - case type_tag: \ - payload.member_name = std::move(rhs.payload.member_name); \ - break; - - Value(Value&& rhs) noexcept : tag(rhs.tag) { - switch (tag) { - // Scalar types - CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::INT, as_int); - CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::DOUBLE, as_double); - CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::BOOL, as_bool); - // Tensor adjacent type - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); - // Scalar lists - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::INTLIST, std::vector, as_int_list, vector); - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::DOUBLELIST, std::vector, as_double_list, vector); - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::BOOLLIST, std::vector, as_bool_list, vector); - // Special types - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::VALUELIST, std::vector, as_value_list, vector); - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STRING, std::string, as_string, basic_string); - // Tensor type - CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::TENSOR, as_tensor); - // Small tensor adjacent types - CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::STAGING, as_staging); - // Large tensor adjacent types - CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::SYMINT, as_symint); - - case TypeTag::NONE: - clearToNone(); - break; - } - rhs.clearToNone(); - } - -#undef CASE_MOVE_TRIVIALLY_COPYABLE_TYPE -#undef CASE_MOVE_MOVEABLE_TYPE -#undef CASE_MOVE_UNIQUE_PTR_TYPE - - // - // Accessors - // - - inline TypeTag type() const { - return tag; - } - - // - // Destructor - // - - ~Value() { - switch (tag) { - case TypeTag::TENSORREF: - payload.as_tensorref.~TensorRef(); - break; - case TypeTag::INTLIST: - payload.as_int_list.~vector(); - break; - case TypeTag::DOUBLELIST: - payload.as_double_list.~vector(); - break; - case TypeTag::BOOLLIST: - payload.as_bool_list.~vector(); - break; - case TypeTag::VALUELIST: - payload.as_value_list.~vector(); - break; - case TypeTag::STRING: - payload.as_string.~basic_string(); - break; - case TypeTag::STAGING: - payload.as_staging.reset(); - break; - case TypeTag::SYMINT: - payload.as_symint.reset(); - break; - case TypeTag::TENSOR: - payload.as_tensor.reset(); - break; - // Manually list out the types so that if a type here is added later and - // not handled the compiler can catch it. 
- case TypeTag::NONE: - case TypeTag::INT: - case TypeTag::DOUBLE: - case TypeTag::BOOL: - break; - } - } - - // - // Constructors, isType(), toType() - // - - Value() : tag(TypeTag::NONE) {} - - inline bool isNone() const { - return tag == TypeTag::NONE; - } - -#define SUPPORT_TRIVIALLY_COPYABLE_TYPE( \ - type, type_name, type_tag, member_name) \ - explicit Value(type t) : tag(type_tag) { \ - payload.u.member_name = t; \ - } \ - inline bool is##type_name() const { \ - return tag == type_tag; \ - } \ - inline const type& to##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return payload.u.member_name; \ - } - - SUPPORT_TRIVIALLY_COPYABLE_TYPE(int64_t, Int, TypeTag::INT, as_int); - SUPPORT_TRIVIALLY_COPYABLE_TYPE(double, Double, TypeTag::DOUBLE, as_double); - SUPPORT_TRIVIALLY_COPYABLE_TYPE(bool, Bool, TypeTag::BOOL, as_bool); - -#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE - -#define SUPPORT_TRIVIALLY_MOVEABLE_TYPE( \ - type, type_name, type_tag, member_name) \ - explicit Value(type&& t) : tag(type_tag) { \ - new (&payload.member_name) type(std::move(t)); \ - } \ - inline bool is##type_name() const { \ - return tag == type_tag; \ - } \ - inline type& to##type_name() { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return payload.member_name; \ - } \ - inline const type& toConst##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return payload.member_name; \ - } - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - TensorRef, - TensorRef, - TypeTag::TENSORREF, - as_tensorref); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - IntList, - TypeTag::INTLIST, - as_int_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - DoubleList, - TypeTag::DOUBLELIST, - as_double_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - BoolList, - TypeTag::BOOLLIST, - as_bool_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - ValueList, - TypeTag::VALUELIST, - as_value_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::string, - String, - TypeTag::STRING, - as_string); - -#undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE - -#define SUPPORT_UNIQUE_PTR_TYPE(type, type_name, type_tag, member_name) \ - explicit Value(type t) : tag(type_tag) { \ - payload.member_name = std::make_unique(std::move(t)); \ - } \ - inline bool is##type_name() const { \ - return tag == type_tag; \ - } \ - inline type& to##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return *payload.member_name; \ - } \ - inline const type& toConst##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return *payload.member_name; \ - } - - SUPPORT_UNIQUE_PTR_TYPE(api::vTensor, Tensor, TypeTag::TENSOR, as_tensor); - - SUPPORT_UNIQUE_PTR_TYPE( - api::StagingBuffer, - Staging, - TypeTag::STAGING, - as_staging); - - SUPPORT_UNIQUE_PTR_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); - -#undef SUPPORT_UNIQUE_PTR_TYPE - - private: - Payload payload; - TypeTag tag; - - // - // Utility Functions - // - - inline void clearToNone() noexcept { - payload.u.as_int = -1; - tag = TypeTag::NONE; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/BlitNode.cpp 
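A usage sketch for the `Value` tagged union defined just above (before the BlitNode diff that follows). It exercises the scalar and list constructors generated by the macros, plus the move behaviour: after a move the source is reset to `NONE`. The include path is assumed.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/Value.h>  // assumed path
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

void value_example() {
  using vkcompute::TypeTag;
  using vkcompute::Value;

  // Scalars live inline in the trivially copyable payload.
  Value scalar(static_cast<int64_t>(42));
  assert(scalar.isInt() && scalar.toInt() == 42);

  // Lists are move-constructed into the union.
  Value list(std::vector<int64_t>{1, 2, 3});
  assert(list.isIntList() && list.toIntList().size() == 3);

  // Value is move-only; the moved-from Value is cleared to NONE.
  Value moved(std::move(list));
  assert(moved.isIntList());
  assert(list.type() == TypeTag::NONE);
}
```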
b/backends/vulkan/runtime/graph/ops/BlitNode.cpp deleted file mode 100644 index de1ad596069..00000000000 --- a/backends/vulkan/runtime/graph/ops/BlitNode.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace vkcompute { - -BlitNode::BlitNode( - ComputeGraph& graph, - ValueRef src, - ValueRef dst, - // const vkapi::ScalarType& dtype, - const ResizeFunction& resize_fn, - const std::vector& resize_args) - : ExecuteNode(resize_fn, resize_args, {}, "Blit Node"), - src_(src), - dst_(dst) { - (void)graph; -} - -void BlitNode::encode(ComputeGraph* graph) { - VK_CHECK_COND( - graph->storage_type_of(src_) != utils::kBuffer && - graph->storage_type_of(dst_) != utils::kBuffer, - "BlitNode: Only texture backed tensors are supported."); - - api::Context* const context = graph->context(); - vkapi::PipelineBarrier pipeline_barrier{}; - - std::unique_lock cmd_lock = context->dispatch_lock(); - - // Hack to get timing data for non shader op - std::string kernel_name("Blit_"); - kernel_name.reserve(32); - kernel_name += vkapi::to_string(graph->dtype_of(src_)); - kernel_name += "_to_"; - kernel_name += vkapi::to_string(graph->dtype_of(dst_)); - - context->report_shader_dispatch_start( - kernel_name, utils::uvec3(), utils::WorkgroupSize(), node_id_); - - context->register_blit( - pipeline_barrier, - graph->get_tensor(src_)->image( - pipeline_barrier, vkapi::PipelineStage::TRANSFER, vkapi::kRead), - graph->get_tensor(dst_)->image( - pipeline_barrier, vkapi::PipelineStage::TRANSFER, vkapi::kWrite)); - - context->report_shader_dispatch_end(); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/BlitNode.h b/backends/vulkan/runtime/graph/ops/BlitNode.h deleted file mode 100644 index 98d187b166a..00000000000 --- a/backends/vulkan/runtime/graph/ops/BlitNode.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include - -#include - -namespace vkcompute { - -/* - * Represents a tensor blit execution op in a ML model. - */ -class BlitNode final : public ExecuteNode { - friend class ComputeGraph; - - public: - explicit BlitNode( - ComputeGraph& graph, - ValueRef src, - ValueRef dst, - /*const vkapi::ScalarType& dtype,*/ - const ResizeFunction& resize_fn = nullptr, - const std::vector& resize_args = {}); - - ~BlitNode() override = default; - - void encode(ComputeGraph* graph) override; - - protected: - ValueRef src_; - ValueRef dst_; - // const vkapi::ScalarType &dtype_; -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp deleted file mode 100644 index 898a3415b7e..00000000000 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include - -namespace vkcompute { - -DispatchNode::DispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn) - : ExecuteNode(resize_fn, resize_args, args, shader.kernel_name), - shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - params_(params), - spec_vars_(spec_vars), - push_constants_(push_constants) { - graph.update_descriptor_counts(shader, /*execute = */ true); -} - -void DispatchNode::prepare_pipelines(ComputeGraph* graph) { - graph->register_pipeline_to_create( - shader_, local_workgroup_size_, spec_vars_, push_constants_); -} - -void DispatchNode::encode(ComputeGraph* graph) { - if (!shader_) { - return; - } - api::Context* const context = graph->context(); - vkapi::PipelineBarrier pipeline_barrier{}; - - context->check_device_capabilities(shader_); - - std::unique_lock cmd_lock = context->dispatch_lock(); - - write_push_constant_data(); - - context->report_shader_dispatch_start( - shader_.kernel_name, - global_workgroup_size_, - local_workgroup_size_, - node_id_); - - vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - shader_, local_workgroup_size_, spec_vars_, push_constants_offset_); - - uint32_t idx = 0; - idx = bind_values_to_descriptor_set( - graph, args_, pipeline_barrier, descriptor_set, idx); - - bind_params_to_descriptor_set(params_, descriptor_set, idx); - - context->register_shader_dispatch( - descriptor_set, - pipeline_barrier, - shader_, - global_workgroup_size_, - push_constants_data_.data(), - push_constants_offset_); - - context->report_shader_dispatch_end(); -} - -void DispatchNode::write_push_constant_data() { - push_constants_offset_ = 0; - for (const auto& push_constant : push_constants_) { - push_constants_offset_ += push_constant.write( - push_constants_data_.data(), - push_constants_offset_, - kMaxPushConstantSize); - } -} - -bool DispatchNode::trigger_resize(ComputeGraph* graph) { - const bool any_arg_updated = ExecuteNode::trigger_resize(graph); - - if (any_arg_updated) { - // If this shader uses push constants, and the tensor metadata associated - // with the push constants has changed, then the command buffer needs to be - // re-encoded since push constants cannot be updated. - for (const auto& push_constant : push_constants_) { - if (push_constant.is_tensor_metadata() && - graph->was_value_updated(push_constant.value())) { - graph->set_requires_reencode(); - } - } - } - return any_arg_updated; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h deleted file mode 100644 index 89d24a77d6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a single shader execution op in a ML model. 
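As an illustration of how the `DispatchNode` constructor above is typically fed, the sketch below assembles one for a hypothetical elementwise shader: the output tensor drives the global workgroup size, `create_local_wg_size` picks the local size, and the args list pairs each tensor with its access flag. The kernel name, the relevant includes, and the empty `params` list are assumptions, and the helper that appends the node to the graph's execute list is not shown in this diff.

```cpp
void make_unary_dispatch(
    vkcompute::ComputeGraph& graph,
    const vkcompute::ValueRef in,
    const vkcompute::ValueRef out) {
  using namespace vkcompute;

  // Pick a shader variant based on the output dtype (hypothetical kernel).
  std::string kernel_name = "unary_op";
  add_dtype_suffix(kernel_name, graph.dtype_of(out));

  // One invocation per output texel; 64-invocation local workgroup.
  const utils::uvec3 global_wg = graph.create_global_wg_size(out);
  const utils::uvec3 local_wg = graph.create_local_wg_size(global_wg);

  DispatchNode node(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      global_wg,
      local_wg,
      // Shader arguments grouped by access: output is written, input is read.
      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
      /*params=*/{});
  (void)node;
}
```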
- */ -class DispatchNode : public ExecuteNode { - friend class ComputeGraph; - - public: - explicit DispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants = {}, - const vkapi::SpecVarList& spec_vars = {}, - const std::vector& resize_args = {}, - const ResizeFunction& resize_fn = nullptr); - - ~DispatchNode() override = default; - - void prepare_pipelines(ComputeGraph* graph) override; - - void encode(ComputeGraph* graph) override; - - bool trigger_resize(ComputeGraph* graph) override; - - protected: - vkapi::ShaderInfo shader_; - utils::uvec3 global_workgroup_size_; - utils::WorkgroupSize local_workgroup_size_; - const vkapi::ParamsBindList params_; - const vkapi::SpecVarList spec_vars_; - const std::vector push_constants_; - - // For push constants - std::array push_constants_data_{}; - uint32_t push_constants_offset_ = 0; - - void write_push_constant_data(); - - public: - operator bool() const { - return shader_; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp deleted file mode 100644 index 5a88bba88c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace vkcompute { - -DynamicDispatchNode::DynamicDispatchNode( - ComputeGraph& graph, - const PickShaderFn& pick_shader_fn, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn) - : DispatchNode( - graph, - pick_shader_fn(&graph, args, resize_args), - {1u, 1u, 1u}, - {8u, 8u, 1u}, - args, - params, - push_constants, - spec_vars, - resize_args, - resize_fn), - pick_shader_fn_(pick_shader_fn), - pick_global_wg_fn_(pick_global_wg_fn), - pick_local_wg_fn_(pick_local_wg_fn) { - global_workgroup_size_ = - pick_global_wg_fn(&graph, shader_, args, resize_args); - local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn( - &graph, shader_, global_workgroup_size_, args, resize_args)); - - // Calculate dispatch grid similar to Context.cpp register_shader_dispatch - wg_dispatch_grid_ = { - utils::div_up(global_workgroup_size_[0], local_workgroup_size_[0]), - utils::div_up(global_workgroup_size_[1], local_workgroup_size_[1]), - utils::div_up(global_workgroup_size_[2], local_workgroup_size_[2])}; -} - -DynamicDispatchNode::DynamicDispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn) - : DispatchNode( - graph, - shader, - {1u, 1u, 1u}, - {8u, 8u, 1u}, - args, - params, - push_constants, - spec_vars, - resize_args, - resize_fn), - pick_shader_fn_{nullptr}, - 
pick_global_wg_fn_(pick_global_wg_fn), - pick_local_wg_fn_(pick_local_wg_fn) { - global_workgroup_size_ = - pick_global_wg_fn(&graph, shader_, args, resize_args); - local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn( - &graph, shader_, global_workgroup_size_, args, resize_args)); - // Calculate the work group grid that will be dispatched - wg_dispatch_grid_ = { - utils::div_up(global_workgroup_size_[0], local_workgroup_size_[0]), - utils::div_up(global_workgroup_size_[1], local_workgroup_size_[1]), - utils::div_up(global_workgroup_size_[2], local_workgroup_size_[2])}; -} - -bool DynamicDispatchNode::trigger_resize(ComputeGraph* graph) { - // DispatchNode::trigger_resize() will return true if any of the values - // participating in this operation were updated. - const bool any_arg_updated = DispatchNode::trigger_resize(graph); - // Only re-compute the shader, global workgroup size, and local workgroup size - // if any of the values participating in this operation were updated. - // Otherwise, assume that these will not have changed. - if (!any_arg_updated) { - return false; - } - - // Indicates if the shader dispatch should be changed since the last time the - // command buffer was encoded. - bool dispatch_params_changed = false; - - if (pick_shader_fn_) { - vkapi::ShaderInfo new_shader = pick_shader_fn_(graph, args_, resize_args_); - // Compare shader kernel names as a proxy for shader equality - if (shader_.kernel_name != new_shader.kernel_name) { - shader_ = new_shader; - dispatch_params_changed = true; - } - } - if (pick_global_wg_fn_) { - // Note that if global workgroup size changes, then the dispatch params - // may not actually be different. The actual value to check is the - // work group grid size that will be dispatched, which is calculated - // below. - global_workgroup_size_ = - pick_global_wg_fn_(graph, shader_, args_, resize_args_); - } - if (pick_local_wg_fn_) { - utils::uvec3 new_local_wg_uvec3 = pick_local_wg_fn_( - graph, shader_, global_workgroup_size_, args_, resize_args_); - utils::WorkgroupSize new_local_wg = - utils::WorkgroupSize(new_local_wg_uvec3); - if (local_workgroup_size_ != new_local_wg) { - local_workgroup_size_ = new_local_wg; - dispatch_params_changed = true; - } - } - - // Always recompute the new dispatch grid and check if it's different - utils::uvec3 new_wg_dispatch_grid = { - utils::div_up(global_workgroup_size_[0], local_workgroup_size_[0]), - utils::div_up(global_workgroup_size_[1], local_workgroup_size_[1]), - utils::div_up(global_workgroup_size_[2], local_workgroup_size_[2])}; - - // Check if the new dispatch grid is different from the old one - if (wg_dispatch_grid_ != new_wg_dispatch_grid) { - dispatch_params_changed = true; - } - wg_dispatch_grid_ = new_wg_dispatch_grid; - - // If any of the dispatch params have changed, then the command buffer must - // be re-encoded. - if (dispatch_params_changed) { - graph->set_requires_reencode(); - } - - return true; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h deleted file mode 100644 index d3b82968eb2..00000000000 --- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
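The `pick_*` callbacks accepted by the constructors above are usually small free functions or lambdas supplied by the op implementation. The pair below is a sketch with parameter types reconstructed from the call sites (the typedefs lost their template arguments in this diff), reusing the `create_global_wg_size`/`create_local_wg_size` helpers declared on `ComputeGraph`. Treating `args[0]` as the written output is an assumption about argument ordering, and the necessary headers are assumed to be included.

```cpp
namespace vkcompute {

utils::uvec3 pick_global_wg(
    ComputeGraph* graph,
    const vkapi::ShaderInfo& shader,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)shader;
  (void)resize_args;
  // One invocation per texel of the output tensor (the WRITE arg).
  return graph->create_global_wg_size(args.at(0).refs.at(0));
}

utils::uvec3 pick_local_wg(
    ComputeGraph* graph,
    const vkapi::ShaderInfo& shader,
    const utils::uvec3& global_wg,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)shader;
  (void)args;
  (void)resize_args;
  // Derive a 64-invocation local size from the global size.
  return graph->create_local_wg_size(global_wg);
}

} // namespace vkcompute
```

These would be passed to the first constructor alongside a `PickShaderFn`, or to the second constructor when the shader is fixed and only the dispatch sizes need to be re-derived after a resize.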
- */ - -#pragma once - -#include - -#include -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a single shader execution op in a ML model. - */ -class DynamicDispatchNode final : public DispatchNode { - friend class ComputeGraph; - - public: - using PickShaderFn = const std::function&, - const std::vector&)>; - using PickGlobalFn = const std::function&, - const std::vector&)>; - using PickLocalFn = const std::function&, - const std::vector&)>; - - explicit DynamicDispatchNode( - ComputeGraph& graph, - const PickShaderFn& pick_shader_fn, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn = nullptr); - - explicit DynamicDispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn = nullptr); - - ~DynamicDispatchNode() override = default; - - bool trigger_resize(ComputeGraph* graph) override; - - protected: - const PickShaderFn pick_shader_fn_; - const PickGlobalFn pick_global_wg_fn_; - const PickLocalFn pick_local_wg_fn_; - - utils::uvec3 wg_dispatch_grid_{1u, 1u, 1u}; - - public: - operator bool() const { - return shader_; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp deleted file mode 100644 index 953f15e7b4d..00000000000 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -namespace vkcompute { -ExecuteNode::ExecuteNode( - const ResizeFunction& resize_fn, - const std::vector& resize_args, - const std::vector& args, - const std::string& name) - : resize_fn_(resize_fn), - resize_args_(resize_args), - args_(args), - name_(name) {} - -bool ExecuteNode::trigger_resize(ComputeGraph* graph) { - const bool any_arg_updated = was_any_arg_updated(graph); - if (resize_fn_ && any_arg_updated) { - resize_fn_(graph, args_, resize_args_); - } - return any_arg_updated; -} - -bool ExecuteNode::was_any_arg_updated(const ComputeGraph* const graph) const { - // Check all ValueRefs in ArgGroups - for (const auto& arg_group : args_) { - for (const auto& value_ref : arg_group.refs) { - if (graph->was_value_updated(value_ref)) { - return true; - } - } - } - - // Check all ValueRefs in resize_args - for (const auto& value_ref : resize_args_) { - if (graph->was_value_updated(value_ref)) { - return true; - } - } - - return false; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h deleted file mode 100644 index 323036cef90..00000000000 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a group of shader arguments (images and/or buffers), with a common - * access permission. - */ -struct ArgGroup { - ArgGroup(const ValueRef ref, const vkapi::MemoryAccessFlags access) - : refs{ref}, access(access) {} - - ArgGroup( - const std::vector& refs, - const vkapi::MemoryAccessFlags access) - : refs(refs), access(access) {} - - const std::vector refs; - const vkapi::MemoryAccessFlags access; -}; - -/* - * Represents a single execution op in a ML model. In graph mode, ops will be - * implemented in a derived class that implements encode, which will implement - * encoding of the shader corresponding to the op into the command buffer of a - * ComputeGraph. - */ -class ExecuteNode { - friend class ComputeGraph; - - public: - using ResizeFunction = std::function&, - const std::vector&)>; - - /* - * This overload of the DispatchNode constructor is used to register ops which - * update a tensor view. No shader is dispatched, but the node still needs to - * update the view's sizes and strides after a resize. - */ - explicit ExecuteNode( - const ResizeFunction& resize_fn = nullptr, - const std::vector& resize_args = {}, - const std::vector& args = {}, - const std::string& name = "Graph Node"); - - virtual ~ExecuteNode() = default; - - virtual void prepare_pipelines(ComputeGraph* graph) { - (void)graph; - } - - virtual void encode(ComputeGraph* graph) { - (void)graph; - } - - virtual bool trigger_resize(ComputeGraph* graph); - - bool was_any_arg_updated(const ComputeGraph* const graph) const; - - inline void set_node_id(uint32_t node_id) { - node_id_ = node_id; - } - - inline const std::string& name() const { - return name_; - } - - protected: - uint32_t node_id_; - const ResizeFunction resize_fn_; - const std::vector resize_args_; - const std::vector args_; - const std::string name_; -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp deleted file mode 100644 index 4d1f749830c..00000000000 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -bool OperatorRegistry::has_op(const std::string& name) { - return table_.count(name) > 0; -} - -OperatorRegistry::OpFunction& OperatorRegistry::get_op_fn( - const std::string& name) { - const auto it = table_.find(name); - VK_CHECK_COND(it != table_.end(), "Could not find operator with name ", name); - return it->second; -} - -void OperatorRegistry::register_op(const std::string& name, OpFunction& fn) { - table_.insert(std::make_pair(name, fn)); -} - -OperatorRegistry& operator_registry() { - static OperatorRegistry registry; - return registry; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h b/backends/vulkan/runtime/graph/ops/OperatorRegistry.h deleted file mode 100644 index 9d41d48afb9..00000000000 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
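A sketch of the kind of `ResizeFunction` an op might register through the `ExecuteNode` constructor above, so that `trigger_resize` can propagate new input shapes to the output. Parameter types are reconstructed from the call site `resize_fn_(graph, args_, resize_args_)`, the ordering (WRITE arg first, READ arg second) is an assumption, and the necessary headers are assumed to be included.

```cpp
namespace vkcompute {

void resize_unary_op_out(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)resize_args;
  const ValueRef out = args.at(0).refs.at(0);  // WRITE arg (assumed ordering)
  const ValueRef in = args.at(1).refs.at(0);   // READ arg
  // Make the output tensor's view match the (possibly resized) input sizes.
  graph->virtual_resize(out, graph->sizes_of(in));
}

} // namespace vkcompute
```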
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include -#include - -#define VK_HAS_OP(name) ::vkcompute::operator_registry().has_op(name) - -#define VK_GET_OP_FN(name) ::vkcompute::operator_registry().get_op_fn(name) - -#define VK_REGISTER_OP(name, function) \ - ::vkcompute::operator_registry().register_op( \ - #name, \ - std::bind(&function, std::placeholders::_1, std::placeholders::_2)) - -#define REGISTER_OPERATORS \ - static void register_ops(); \ - static const OperatorRegisterInit reg(®ister_ops); \ - static void register_ops() - -namespace vkcompute { - -/* - * The Vulkan operator registry maps ATen operator names - * to their Vulkan delegate function implementation. It is - * a simplified version of - * executorch/runtime/kernel/operator_registry.h that uses - * the C++ Standard Library. - */ -class OperatorRegistry final { - using OpFunction = - const std::function&)>; - using OpTable = std::unordered_map; - - OpTable table_; - - public: - /* - * Check if the registry has an operator registered under the given name - */ - bool has_op(const std::string& name); - - /* - * Given an operator name, return the Vulkan delegate function - */ - OpFunction& get_op_fn(const std::string& name); - - /* - * Register a function to a given operator name - */ - void register_op(const std::string& name, OpFunction& fn); -}; - -class OperatorRegisterInit final { - using InitFn = void(); - - public: - explicit OperatorRegisterInit(InitFn* init_fn) { - init_fn(); - } -}; - -// The Vulkan operator registry is global. It is retrieved using this function, -// where it is declared as a static local variable. -OperatorRegistry& operator_registry(); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp deleted file mode 100644 index 62e1dc86f43..00000000000 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
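The macros above are how op implementation files register themselves with the global registry via static initialization. The sketch below shows the typical pattern; the operator name is hypothetical, and the `OpFunction` signature used here (a `ComputeGraph&` plus the op's argument `ValueRef`s) is inferred from the two `std::bind` placeholders rather than spelled out in this diff.

```cpp
namespace vkcompute {

// Builds the DispatchNode(s) implementing the op and appends them to the graph.
void add_my_custom_op_node(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  (void)graph;
  (void)args;
  // ... create tensors, pick shaders, construct DispatchNodes ...
}

REGISTER_OPERATORS {
  // VK_REGISTER_OP stringifies its first argument, so it is passed unquoted.
  VK_REGISTER_OP(aten.my_custom_op.default, add_my_custom_op_node);
}

} // namespace vkcompute

// At graph build time, the delegate can then look the op up by name:
//   if (VK_HAS_OP("aten.my_custom_op.default")) {
//     VK_GET_OP_FN("aten.my_custom_op.default")(graph, args);
//   }
```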
- */ - -#include - -#include - -#include -#include -#include - -namespace vkcompute { - -vkapi::ShaderInfo get_noop_shader(ComputeGraph& graph, const ValueRef packed) { - std::string noop_shader_name("no_op"); - add_dtype_suffix(noop_shader_name, graph.dtype_of(packed)); - add_storage_type_suffix(noop_shader_name, graph.storage_type_of(packed)); - return VK_KERNEL_FROM_STR(noop_shader_name); -} - -PrepackNode::PrepackNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const ValueRef tref, - const ValueRef packed, - const vkapi::ParamsBindList& params, - const vkapi::SpecVarList& spec_vars, - const std::vector& push_constants) - : shader_(shader), - noop_shader_(get_noop_shader(graph, packed)), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - tref_(tref), - packed_(packed), - params_(params), - spec_vars_(spec_vars), - push_constants_(push_constants) { - graph.update_descriptor_counts(shader, /*execute = */ false); - graph.update_descriptor_counts(noop_shader_, /*execute = */ false); -} - -api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { - // If no TensorRef is provided, create a staging buffer of zeros based on the - // Tensor metadata. - if (graph->val_is_none(tref_)) { - const std::vector packed_sizes = graph->sizes_of(packed_); - size_t numel = utils::multiply_integers(packed_sizes); - api::StagingBuffer staging( - graph->context(), graph->dtype_of(packed_), numel); - staging.set_staging_zeros(); - return staging; - } - - TensorRefPtr tref = graph->get_tref(tref_); - size_t numel = utils::multiply_integers(tref->sizes); - api::StagingBuffer staging(graph->context(), tref->dtype, numel); - graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t()); - size_t nbytes = numel * vkapi::element_size(tref->dtype); - staging.copy_from(tref->data, nbytes); - // Once the staging buffer is copied, if the TensorRef owns a FreeableBuffer, - // it can be freed. - tref->free_buffer(); - return staging; -} - -void PrepackNode::prepare_pipelines(ComputeGraph* graph) { - graph->register_pipeline_to_create( - shader_, local_workgroup_size_, spec_vars_, push_constants_); - graph->register_pipeline_to_create( - noop_shader_, utils::WorkgroupSize(1, 1, 1), {}, {}); -} - -void PrepackNode::encode(ComputeGraph* graph) { - api::Context* const context = graph->context(); - - context->check_device_capabilities(shader_); - - api::StagingBuffer staging = create_staging_buffer(graph); - - std::unique_lock cmd_lock = context->dispatch_lock(); - - std::array push_constants_data; - uint32_t push_constants_offset = 0; - - for (const auto& push_constant : push_constants_) { - push_constants_offset += push_constant.write( - push_constants_data.data(), - push_constants_offset, - kMaxPushConstantSize); - } - - { - // If the vTensor is not yet bound to a memory allocation, create a new one - // and aquire it. 
- graph->create_dedicated_allocation_for(packed_); - - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - shader_, local_workgroup_size_, spec_vars_, push_constants_offset); - - uint32_t idx = 0; - graph->bind_tensor_to_descriptor_set( - packed_, - pipeline_barrier, - vkapi::MemoryAccessType::WRITE, - descriptor_set, - idx++); - bind_staging_to_descriptor_set(staging, descriptor_set, idx++); - bind_params_to_descriptor_set(params_, descriptor_set, idx); - - context->register_shader_dispatch( - descriptor_set, - pipeline_barrier, - shader_, - global_workgroup_size_, - push_constants_data.data(), - push_constants_offset); - } - - // Submit a compute shader that performs a no-op with the packed tensor in - // order to trigger an image layout transition from GENERAL to - // READ_ONLY_OPTIMAL. This ensures that future uses of the tensor will be - // bound with the correct image layout. - { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - noop_shader_, utils::WorkgroupSize(1, 1, 1)); - - graph->bind_tensor_to_descriptor_set( - packed_, - pipeline_barrier, - vkapi::MemoryAccessType::READ, - descriptor_set, - 0); - - context->register_shader_dispatch( - descriptor_set, pipeline_barrier, noop_shader_, {1, 1, 1}); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h deleted file mode 100644 index 8ce8ac9f773..00000000000 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a single prepacking op in a ML model. In graph mode, ops will be - * implemented in a derived class that implements encode, which will implement - * encoding of shaders transferring necessary data (such as weights and biases) - * to the GPU. 
- */ -class PrepackNode final { - friend class ComputeGraph; - - public: - PrepackNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const ValueRef tref, - const ValueRef packed, - const vkapi::ParamsBindList& params, - const vkapi::SpecVarList& spec_vars = {}, - const std::vector& push_constants = {}); - - ~PrepackNode() = default; - - void prepare_pipelines(ComputeGraph* graph); - - void encode(ComputeGraph* graph); - - inline void set_node_id(uint32_t node_id) { - node_id_ = node_id; - } - - protected: - uint32_t node_id_; - const vkapi::ShaderInfo shader_; - vkapi::ShaderInfo noop_shader_; - const utils::uvec3 global_workgroup_size_; - const utils::WorkgroupSize local_workgroup_size_; - const ValueRef tref_; - const ValueRef packed_; - const vkapi::ParamsBindList params_; - const vkapi::SpecVarList spec_vars_; - const std::vector push_constants_; - - private: - api::StagingBuffer create_staging_buffer(ComputeGraph* graph); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h deleted file mode 100644 index 2ba0ccc467d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/activations.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -float hardswish(float x) { - if (x <= -3) { - return 0; - } else if (x >= 3) { - return x; - } else { - return x * (x + 3) / 6; - } -} - -vec4 hardswish(vec4 tex) { - return vec4( - hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w)); -} - -float hardshrink(float x, float lambda, float neg_lambda) { - return x * (float(x > lambda) + float(x < neg_lambda)); -} - -vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { - return tex * - (vec4(greaterThan(tex, vec4(lambda))) + - vec4(lessThan(tex, vec4(neg_lambda)))); -} - -float hardsigmoid(float x) { - return mix(float(x >= 0.0), x / 6 + 0.5, float(abs(x) <= 3.0)); -} - -vec4 hardsigmoid(vec4 tex) { - return vec4( - hardsigmoid(tex.x), - hardsigmoid(tex.y), - hardsigmoid(tex.z), - hardsigmoid(tex.w)); -} - -float leaky_relu(float x, float negative_slope) { - return x * (float(x > 0.0) + negative_slope * float(x <= 0.0)); -} - -vec4 leaky_relu(vec4 tex, float negative_slope) { - return vec4( - leaky_relu(tex.x, negative_slope), - leaky_relu(tex.y, negative_slope), - leaky_relu(tex.z, negative_slope), - leaky_relu(tex.w, negative_slope)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.glsl deleted file mode 100644 index 1f3061ea100..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.glsl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
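Reviewer note: the GLSL helpers in `activations.h` above apply scalar activation math per texel component. A CPU-side C++ check of the same formulas, useful when validating shader output; this is an illustrative reference, not part of the deleted sources.

```cpp
// CPU-side reference for the scalar formulas in activations.h.
#include <algorithm>
#include <cassert>

float hardswish(float x) {
  if (x <= -3.f) return 0.f;
  if (x >= 3.f) return x;
  return x * (x + 3.f) / 6.f;
}

float hardsigmoid(float x) {
  // Equivalent to the GLSL mix()-based form: clamp(x / 6 + 0.5, 0, 1).
  return std::min(1.f, std::max(0.f, x / 6.f + 0.5f));
}

float leaky_relu(float x, float negative_slope) {
  return x > 0.f ? x : negative_slope * x;
}

int main() {
  assert(hardswish(4.f) == 4.f);
  assert(hardswish(-4.f) == 0.f);
  assert(hardsigmoid(0.f) == 0.5f);
  assert(leaky_relu(-2.f, 0.1f) == 0.1f * -2.f);
}
```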
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if HAS_BIAS: - #define HAS_BIAS - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_mat2", DTYPE, "buffer")} -$if HAS_BIAS: - ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_strides")} -${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_strides")} -${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_strides")} -${layout_declare_ubo(B, "int", "out_numel")} -$if HAS_BIAS: - ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "mat2_is_transposed", "0")} - -void main() { - const ivec4 out_tidx = ivec4( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y, - gl_GlobalInvocationID.z % out_sizes.z, - gl_GlobalInvocationID.z / out_sizes.z); - - if (any(greaterThanEqual(out_tidx, out_sizes))) { - return; - } - - int mat1_bufi = tidx_to_bufi( - ivec4(0, out_tidx.y, out_tidx.z, out_tidx.w), mat1_strides); - int mat2_bufi; - if (mat2_is_transposed > 0) { - mat2_bufi = tidx_to_bufi( - ivec4(0, out_tidx.x, 0, 0), mat2_strides); - } else { - mat2_bufi = tidx_to_bufi( - ivec4(out_tidx.x, 0, out_tidx.z, out_tidx.w), mat2_strides); - } - - int mat2_stride; - if (mat2_is_transposed > 0) { - mat2_stride = mat2_strides.x; - } else { - mat2_stride = mat2_strides.y; - } - - T sum = T(0.0); - for (int i = 0; i < mat1_sizes.x; ++i) { - sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - - mat1_bufi += mat1_strides.x; - mat2_bufi += mat2_stride; - } - - const int out_bufi = tidx_to_bufi(out_tidx, out_strides); -#ifdef HAS_BIAS - t_out[out_bufi] = T(alpha) * T(sum) + T(beta) * t_bias[out_tidx.x]; -#else - t_out[out_bufi] = T(sum); -#endif // HAS_BIAS -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.yaml deleted file mode 100644 index b093d0c80b2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -addmm_naive_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - HAS_BIAS: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_naive_buffer - - NAME: addmm_naive_buffer - HAS_BIAS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl deleted file mode 100644 index a4ed494fe6d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
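Reviewer note: for the naive addmm shaders above, the per-element math is `out = beta * bias + alpha * (mat1 @ mat2)`. A minimal host-side reference of that computation (2-D, row-major, no batching and no transposed mat2) is sketched below; it is illustrative only and intended for checking shader results on small inputs.

```cpp
// Illustrative CPU reference for the per-element math of the naive addmm shaders:
// out[m][n] = beta * bias[n] + alpha * sum_k mat1[m][k] * mat2[k][n].
#include <vector>

std::vector<float> addmm_reference(
    const std::vector<float>& bias,  // size N
    const std::vector<float>& mat1,  // M x K, row-major
    const std::vector<float>& mat2,  // K x N, row-major
    int M, int K, int N,
    float alpha, float beta) {
  std::vector<float> out(static_cast<size_t>(M) * N, 0.f);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float sum = 0.f;
      for (int k = 0; k < K; ++k) {
        sum += mat1[m * K + k] * mat2[k * N + n];
      }
      out[m * N + n] = beta * bias[n] + alpha * sum;
    }
  }
  return out;
}
```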
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if HAS_BIAS: - #define HAS_BIAS - -${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} -$if HAS_BIAS: - ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -$if HAS_BIAS: - ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "mat1_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat1_axis_map = unhash_axis_map(mat1_layout); -const lowp int mat1_packed_dim = unhash_packed_dim(mat1_layout); - -${layout_declare_spec_const(C, "int", "mat2_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat2_axis_map = unhash_axis_map(mat2_layout); -const lowp int mat2_packed_dim = unhash_packed_dim(mat2_layout); - -$if HAS_BIAS: - ${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} - const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); - const lowp int bias_packed_dim = unhash_packed_dim(bias_layout); - -#ifdef HAS_BIAS -vec4 get_bias_texel_W_packed(ivec3 logical_pos) { - ivec3 bias_pos = ivec3(0); - if (bias_sizes.y == 1) { - bias_pos[bias_axis_map.y] = 0; - } else { - bias_pos[bias_axis_map.y] = logical_pos.y; - } - if (bias_sizes.x == 1) { - bias_pos[bias_axis_map.x] = 0; - vec4 bias_texel = texelFetch(bias_tensor, bias_pos, 0); - // Only the first value is valid, the rest is 0 padding - return vec4(bias_texel.x); - } else { - bias_pos[bias_axis_map.x] = logical_pos.x; - } - - return texelFetch(bias_tensor, bias_pos, 0); -} -#endif // HAS_BIAS - -vec4 matmul_naive_k_dim_packed(const ivec3 out_lpos) { - ivec3 mat1_pos; - mat1_pos[mat1_axis_map.x] = 0; - mat1_pos[mat1_axis_map.y] = out_lpos.y; - mat1_pos[mat1_axis_map.z] = out_lpos.z; -#ifdef MAT2_IS_TRANSPOSED - const int mat2_k_axis = mat2_axis_map.x; - const int mat2_row_axis = mat2_axis_map.y; -#else - const int mat2_k_axis = mat2_axis_map.y; - const int mat2_row_axis = mat2_axis_map.x; -#endif // MAT2_IS_TRANSPOSED - - vec4 texel = vec4(0); - const int K = divup4(mat1_sizes.x); - - for (int i = 0; i < K; ++i) { - const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); - - vec4 sums; - for (int r = 0; r < 4; ++r) { - // On-demand construction of mat2_pos appears to provide the lowest - // latency. Surprisingly, this doesn't translate to mat1_pos. 
- ivec3 mat2_pos = ivec3(0); - mat2_pos[mat2_k_axis] = i; - mat2_pos[mat2_row_axis] = out_lpos.x * 4 + r; -#ifndef MAT2_IS_TRANSPOSED - mat2_pos[mat2_axis_map.z] = out_lpos.z; -#endif // MAT2_IS_TRANSPOSED - sums[r] = dot(mat1_tex, texelFetch(mat2_tensor, mat2_pos, 0)); - } - - texel += sums; - - mat1_pos[mat1_axis_map.x]++; - } - - return texel; -} - -vec4 matmul_naive_k_dim_packed_row_dim_packed(const ivec3 out_lpos) { - ivec3 mat1_pos; - mat1_pos[mat1_axis_map.x] = 0; - mat1_pos[mat1_axis_map.y] = out_lpos.y; - mat1_pos[mat1_axis_map.z] = out_lpos.z; - - ivec3 mat2_pos; - mat2_pos[mat2_axis_map.x] = out_lpos.x; - mat2_pos[mat2_axis_map.y] = 0; - mat2_pos[mat2_axis_map.z] = out_lpos.z; - - ivec3 mat2_pos_offset = ivec3(0); - mat2_pos_offset[mat2_axis_map.y] = 1; - - const int mat2_y_axis = mat2_axis_map.y; - - vec4 texel = vec4(0); - const int K = divup4(mat1_sizes.x); - - for (int i = 0; - i < K; - ++i, mat1_pos[mat1_axis_map.x]++, mat2_pos[mat2_axis_map.y]+=4) { - const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); - - for (int r = 0; r < 4; ++r) { - // On-demand construction of mat2_pos appears to provide the lowest - // latency. Surprisingly, this doesn't translate to mat1_pos. - ivec3 mat2_pos = ivec3(0); - mat2_pos[mat2_axis_map.x] = out_lpos.x; - mat2_pos[mat2_axis_map.y] = 4 * i + r; - mat2_pos[mat2_axis_map.z] = out_lpos.z; - - vec4 mat1_comp_vec = vec4(mat1_tex[r]); - texel = fma(mat1_comp_vec, texelFetch(mat2_tensor, mat2_pos, 0), texel); - } - } - - return texel; -} - -void main() { - const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_lpos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - -#ifdef MAT2_IS_TRANSPOSED - if (mat2_packed_dim == W_DIM) { - texel = matmul_naive_k_dim_packed(out_lpos); - } else { - texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); - } -#else - if (mat2_packed_dim == W_DIM) { - texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); - } else { - texel = matmul_naive_k_dim_packed(out_lpos); - } -#endif // MAT2_IS_TRANSPOSED - -#ifdef HAS_BIAS - vec4 bias_texel = get_bias_texel_W_packed(out_lpos); - texel = beta * bias_texel + alpha * texel; -#endif // HAS_BIAS - - write_texel_lpos(out_tensor, out_lpos, texel, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml deleted file mode 100644 index 33b617eed13..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -addmm_naive_texture3d: - parameter_names_with_default_values: - DTYPE: float - MAT2_IS_TRANSPOSED: false - HAS_BIAS: true - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: addmm_naive_texture3d - - NAME: matmul_naive_texture3d - HAS_BIAS: false - - NAME: linear_naive_texture3d - MAT2_IS_TRANSPOSED: true - - NAME: matmul_transposed_naive_texture3d - MAT2_IS_TRANSPOSED: true - HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl deleted file mode 100644 index 05c227f302c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if HAS_BIAS: - #define HAS_BIAS - -${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} -$if HAS_BIAS: - ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -$if HAS_BIAS: - ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "mat1_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat1_axis_map = unhash_axis_map(mat1_layout); - -${layout_declare_spec_const(C, "int", "mat2_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat2_axis_map = unhash_axis_map(mat2_layout); - -$if HAS_BIAS: - ${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} - const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); - -// To convince the SPIR-V compiler to unroll the loops optimally, need this -// macro -#define FOUR 4 - -#define TILE_ROWS ${TILE_ROWS} - -// we avoid mat4 and vec4 usage here as they compile to much less efficient -// SPIR-V -struct FloatMatrix_2d { - float data[TILE_ROWS][FOUR]; -}; - -struct FloatMatrix_3d { - float data[TILE_ROWS][FOUR][FOUR]; -}; - -#ifdef BATCH_MODE - #define FloatMatrix FloatMatrix_3d -#else - #define FloatMatrix FloatMatrix_2d -#endif // BATCH_MODE - -#ifdef HAS_BIAS -// get texel from self tensor (channel_packed) in addmm -vec4 get_texel_C_packed(const ivec2 idx) { - ivec3 bias_pos = ivec3(0); - if (bias_sizes.x > 1) { - bias_pos[bias_axis_map.x] = idx.x; - } - if (bias_sizes.y > 1) { - bias_pos[bias_axis_map.y] = idx.y; - } - - return texelFetch(bias_tensor, bias_pos, 0); -} -#endif // HAS_BIAS - -FloatMatrix matmul_partial(const ivec4 out_idx_tl) { - FloatMatrix results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { -#ifdef BATCH_MODE - for (int k = 0; k < FOUR; k++) { - results.data[i][j][k] = 0.0f; - } -#else - 
results.data[i][j] = 0.0f; -#endif // BATCH_MODE - } - } - vec4 mat1_tensor_partial_load[TILE_ROWS]; - vec4 mat2_tensor_partial_load[FOUR]; - -#ifdef MAT2_IS_TRANSPOSED - const int mat2_k_axis = mat2_axis_map.x; - const int mat2_row_axis = mat2_axis_map.y; -#else - const int mat2_k_axis = mat2_axis_map.y; - const int mat2_row_axis = mat2_axis_map.x; -#endif // MAT2_IS_TRANSPOSED - -#ifdef BATCH_MODE - for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { - if (out_idx_tl.z + batch_idx >= out_sizes.z) { - break; - } -#endif // BATCH_MODE - for (int k = 0; k < mat1_sizes.x; k+=4) { - const int k_div4 = k >> 2; - // read and cache (4 x TILE_ROWS) tile of mat1 - for (int r = 0; r < TILE_ROWS; r++) { - ivec3 mat1_pos = ivec3(0); - mat1_pos[mat1_axis_map.x] = k_div4; - mat1_pos[mat1_axis_map.y] = out_idx_tl.y + r; -#ifdef BATCH_MODE - mat1_pos[mat1_axis_map.z] = out_idx_tl.z + batch_idx; -#endif // BATCH_MODE - - mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); - } - - // read and cache (4 x 4) tile of mat2 - for (int r = 0; r < FOUR; ++r) { - ivec3 mat2_pos = ivec3(0); - mat2_pos[mat2_k_axis] = k_div4; - mat2_pos[mat2_row_axis] = out_idx_tl.x + r; -#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) - mat2_pos[mat2_axis_map.z] = out_idx_tl.z + batch_idx; -#endif // BATCH_MODE - - mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); - } - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { -#ifdef BATCH_MODE - results.data[out_row][out_col][batch_idx] += -#else - results.data[out_row][out_col] += -#endif // BATCH_MODE - dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); - } - } - } -#ifdef BATCH_MODE - } -#endif // BATCH_MODE - - return results; -} - -// -// Write result matrix to output (3D matmul) -// - -void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { - ivec3 out_pos = tidx_to_pos( - out_idx_tl, out_sizes, out_axis_map, out_packed_dim); - - for (int tile_c = 0; - tile_c < TILE_ROWS; - tile_c++, out_pos[out_axis_map.y]++) { - out_pos[out_axis_map.x] = out_idx_tl.x; - - for (int tile_r = 0; - tile_r < FOUR; - tile_r++, out_pos[out_axis_map.x]++) { - -#ifdef HAS_BIAS - ivec2 bias_idx; - bias_idx[bias_axis_map.x] = out_pos[out_axis_map.x]; - bias_idx[bias_axis_map.y] = out_pos[out_axis_map.y]; - float bias_val = get_texel_C_packed(bias_idx).x; -#ifdef BATCH_MODE - vec4 bias_texel = vec4(bias_val); -#else - vec4 bias_texel = vec4(bias_val, 0, 0, 0); -#endif // BATCH_MODE -#endif // HAS_BIAS - -#ifdef BATCH_MODE - vec4 out_texel = vec4( - results.data[tile_c][tile_r][0], - results.data[tile_c][tile_r][1], - results.data[tile_c][tile_r][2], - results.data[tile_c][tile_r][3]); -#else - vec4 out_texel = vec4( - results.data[tile_c][tile_r], - 0.0, - 0.0, - 0.0); -#endif // BATCH_MODE - -#ifdef HAS_BIAS - imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); -#else - imageStore(out_tensor, out_pos, out_texel); -#endif // HAS_BIAS - } - } -} - -void main() { - // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of - // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) - // tile of output elements will be computed. Note the sizes are written in - // (W x H x C) format. 
- const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); - - // Calculate the tensor index of the top left element in the output tile - const ivec4 out_idx_topleft = ivec4( - tile_idx.x * 4, - tile_idx.y * TILE_ROWS, -#ifdef BATCH_MODE - tile_idx.z * 4, -#else - tile_idx.z, -#endif // BATCH_MODE - 0); - - // If the top left element is already out of range, then skip - if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { - return; - } - - FloatMatrix results = matmul_partial(out_idx_topleft); - - write_results_C_packed(out_idx_topleft, results); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml deleted file mode 100644 index c82c2003d20..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -addmm_optimized: - parameter_names_with_default_values: - DTYPE: float - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROWS: 4 - HAS_BIAS: true - generate_variant_forall: - TILE_ROWS: - - VALUE: 4 - SUFFIX: tile_row_4 - - VALUE: 2 - SUFFIX: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: addmm_optimized - - NAME: matmul_optimized - HAS_BIAS: false - - NAME: linear_optimized - MAT2_IS_TRANSPOSED: true - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - HAS_BIAS: false - - NAME: batch_addmm_optimized - BATCH_MODE: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - HAS_BIAS: false - - NAME: batch_linear_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true - HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange.glsl deleted file mode 100644 index 8b1841888ad..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/arange.glsl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(1, "ivec4", "sizes")} -${layout_declare_ubo(2, "float", "start")} -${layout_declare_ubo(3, "float", "step")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (pos_out_of_bounds(pos, sizes, packed_dim)) { - return; - } - - VEC4_T outtex = VEC4_T(start + pos.x * step, 0, 0, 0); - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange.yaml b/backends/vulkan/runtime/graph/ops/glsl/arange.yaml deleted file mode 100644 index 37b2027db85..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/arange.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -arange: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: int32 - STORAGE: texture3d - PACKING: C_packed - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: arange diff --git a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.glsl deleted file mode 100644 index 2db9f842d75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.glsl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec3", "out_limits")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(5, "int", "divisor_override", "int", "count_include_pad")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const ivec2 ipos = pos.xy * stride - padding; - - const ivec2 start = max(ivec2(0), ipos); - const ivec2 end = min(ipos + kernel_size, ivec2(in_sizes.xy)); - - VEC4_T sum = VEC4_T(0); - for (int y = start.y; y < end.y; ++y) { - for (int x = start.x; x < end.x; ++x) { - sum += texelFetch(t_in, ivec3(x, y, pos.z), 0); - } - } - - int div; - if (divisor_override > 0) { - div = divisor_override; - } else if (count_include_pad > 0) { - ivec2 empty = max(ipos + kernel_size - padding - ivec2(in_sizes.xy), ivec2(0)); - div = (kernel_size.y - empty.y) * (kernel_size.x - empty.x); - } else { - div = (end.y - start.y) * (end.x - start.x); - } - imageStore(t_out, pos, sum / div); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.yaml deleted file mode 100644 index b1e16dec8d6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -avg_pool2d: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: avg_pool2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl deleted file mode 100644 index c2fc5a56754..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
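Reviewer note: the subtlest part of `avg_pool2d.glsl` above is choosing the divisor for the window average. A scalar sketch of that selection logic follows, assuming the same meaning for the window bounds as in the shader (start/end are clamped to the input, ipos is the unclamped window origin); it is illustrative, not part of the deleted sources.

```cpp
// Sketch of the divisor selection in avg_pool2d.glsl.
#include <algorithm>

int pool_divisor(
    int divisor_override, bool count_include_pad,
    int kernel_w, int kernel_h,
    int start_x, int start_y, int end_x, int end_y,
    int ipos_x, int ipos_y, int pad_x, int pad_y,
    int in_w, int in_h) {
  if (divisor_override > 0) {
    return divisor_override;  // caller forces a fixed divisor
  }
  if (count_include_pad) {
    // Count padded positions too, except the part of the window that runs past
    // the padded input on the bottom/right ("empty" in the shader).
    const int empty_x = std::max(ipos_x + kernel_w - pad_x - in_w, 0);
    const int empty_y = std::max(ipos_y + kernel_h - pad_y - in_h, 0);
    return (kernel_h - empty_y) * (kernel_w - empty_x);
  }
  // Default: divide by the number of input elements actually summed.
  return (end_y - start_y) * (end_x - start_x);
}
```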
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "weight_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "mean_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "var_in", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "float", "eps")} -${layout_declare_ubo(B, "int", "num_texel_per_batch")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - VEC4_T v = VEC4_T(load_texel(t_in, pos)); - - ivec3 param_pos = ivec3(pos.z % num_texel_per_batch, 0, 0); - - VEC4_T weight = VEC4_T(load_texel(weight_in, param_pos)); - VEC4_T bias = VEC4_T(load_texel(bias_in, param_pos)); - VEC4_T mean = VEC4_T(load_texel(mean_in, param_pos)); - VEC4_T var = VEC4_T(load_texel(var_in, param_pos)); - - v = ((v - mean) / sqrt(var + eps)) * weight + bias; - - write_texel(t_out, pos, v); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml deleted file mode 100644 index 116773c816a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml +++ /dev/null @@ -1,11 +0,0 @@ -batchnorm: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: batchnorm diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl deleted file mode 100644 index 6f2a93667ea..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -// Binary comparison ops require that the output is boolean and not the same as input. 
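Reviewer note: `batchnorm.glsl` above normalizes each texel with per-channel statistics. The scalar form of that update, written as a plain C++ reference for comparison against shader output:

```cpp
// Scalar reference for the normalization in batchnorm.glsl:
// v' = ((v - mean) / sqrt(var + eps)) * weight + bias
#include <cmath>

float batchnorm_ref(
    float v, float mean, float var, float weight, float bias, float eps) {
  return ((v - mean) / std::sqrt(var + eps)) * weight + bias;
}
```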
-$IS_COMPARISON_OP = (any([name in VARIANT_NAME for name in ["binary_eq", "binary_lt", "binary_le", "binary_gt", "binary_ge"]])) - -#define NAME ${VARIANT_NAME} - -#define VEC4_T ${texel_type(DTYPE)} -$if IS_COMPARISON_OP: - #define T ${buffer_scalar_type("uint8")} - #define VEC4_OUT_T ${texel_type("uint8")} -$else: - #define T ${buffer_scalar_type(DTYPE)} - #define VEC4_OUT_T VEC4_T - -#define op(X, Y, A) ${OPERATOR} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} - - -$if IS_COMPARISON_OP: - ${define_required_extensions("uint8")} - -layout(std430) buffer; - -#include "indexing.glslh" - -$if IS_COMPARISON_OP: - ${layout_declare_tensor(B, "w", "t_out", "uint8", STORAGE)} -$else: - ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} - -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "BufferMetadata", "outp")} - ${layout_declare_ubo(B, "BufferMetadata", "inp")} - ${layout_declare_ubo(B, "BufferMetadata", "other")} - - layout(push_constant) uniform restrict Block { - float alpha; - }; -$else: - layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 other_sizes; - ivec2 broadcast_params; - float alpha; - }; - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} - -$if STORAGE == "buffer": - const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); -$else: - const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - const lowp int packed_dim = unhash_packed_dim(out_layout); - - const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - - const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); - -#ifdef USING_BUFFER - -void main() { - const uint out_bufi = gl_GlobalInvocationID.x; - if (out_bufi >= numel(outp)) { - return; - } - - // Simple case; no broadcasting - if (are_equal(inp, other)) { - t_out[out_bufi] = T(op(t_in[out_bufi], t_other[out_bufi], T(alpha))); - return; - } - - TensorIndex outp_tidx; - linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx); - - TensorIndex inp_tidx = outp_tidx; - clamp_tensor_idx(inp, inp_tidx); - - TensorIndex other_tidx = outp_tidx; - clamp_tensor_idx(other, other_tidx); - - uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); - uint other_bufi = tensor_idx_to_linear_idx(other, other_tidx); - - t_out[out_bufi] = T(op(t_in[inp_bufi], t_other[other_bufi], T(alpha))); -} - -#else // USING_TEXTURE - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); - - if (any(greaterThanEqual(tidx, out_sizes))) { - return; - } - - // broadcast on logical sizes - ivec4 in_idx = broadcast_indices(tidx, in_sizes); - VEC4_T in_texel = VEC4_T(load_texel( - t_in, - // read axis mapped texel - tidx_to_pos(in_idx, in_sizes, in_axis_map, packed_dim))); - - // broadcast on logical sizes - ivec4 other_idx = broadcast_indices(tidx, other_sizes); - VEC4_T other_texel = VEC4_T(load_texel( - t_other, - // read axis mapped texel - tidx_to_pos(other_idx, other_sizes, other_axis_map, packed_dim))); - - // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. 
- if (broadcast_params.x > 0) { - in_texel = in_texel.xxxx; - } - if (broadcast_params.y > 0) { - other_texel = other_texel.xxxx; - } - - write_texel_lpos( - t_out, - lpos, - VEC4_OUT_T(op(in_texel, other_texel, alpha)), - out_axis_map); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml deleted file mode 100644 index 70793628d80..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -binary_op: - parameter_names_with_default_values: - OPERATOR: X + A * Y - NDIM: 3 - DTYPE: float - PACKING: C_packed - generate_variant_forall: - STORAGE: - - VALUE: texture3d - - VALUE: buffer - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: binary_add - - NAME: binary_sub - OPERATOR: X - A * Y - - NAME: binary_mul - OPERATOR: X * Y - - NAME: binary_div - OPERATOR: X / Y - - NAME: binary_pow - OPERATOR: pow(X, Y) - - NAME: binary_floor_divide - OPERATOR: floor(X / Y) - - NAME: binary_minimum - OPERATOR: min(X, Y) - - NAME: binary_eq_int32 - OPERATOR: X == Y - DTYPE: int32 - - NAME: binary_eq_buffer - OPERATOR: abs(X - Y) < 1e-5 - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - NAME: binary_eq_texture3d - OPERATOR: all(lessThanEqual(abs(X - Y), VEC4_T(1e-5))) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - NAME: binary_lt_buffer - OPERATOR: X < Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_lt_texture3d - OPERATOR: all(lessThan(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_le_buffer - OPERATOR: X <= Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_le_texture3d - OPERATOR: all(lessThanEqual(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_gt_buffer - OPERATOR: X > Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_gt_texture3d - OPERATOR: all(greaterThan(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_ge_buffer - OPERATOR: X >= Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_ge_texture3d - OPERATOR: all(greaterThanEqual(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl deleted file mode 100644 index ac39dd36fc3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -#extension GL_EXT_control_flow_attributes : require - -${layout_declare_buffer(B, "w", "nchw_out", "int")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 tensor_sizes; - int out_numel; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "tensor_sizes")} - ${layout_declare_ubo(B, "int", "out_numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -void main() { - const int out_buf_idx = int(gl_GlobalInvocationID.x); - // On the CPU, the number of elements is determined based on a buffer of int8 - // elements. However, on the GPU, since the int8 data type is not supported - // each group of 4 elements is interepreted as 1 int32 element. Thus each - // thread is actually writing to 4 output elements from the perspective of the - // CPU. - if (out_buf_idx * 4 >= out_numel) { - return; - } - - ivec4 values; - int in_buf_idx = 4 * out_buf_idx; - - [[unroll]] for (int i = 0; i < 4; ++i) { - const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes); - const ivec4 texture_pos = to_texture_elem_pos( - tidx, tensor_sizes, packed_dim); - values[i] = ivec4(load_texel(t_in, texture_pos.xyz))[texture_pos.w]; - in_buf_idx++; - } - - // Manually pack 4x 8-bit integers into a 32 bit integer. Note that little - // endian is assumed, since most processors use little endian. Thus the - // "later" values are placed in most significant bytes. - int packed = ((values[3] & 0xFF) << 24) - | ((values[2] & 0xFF) << 16) - | ((values[1] & 0xFF) << 8) - | ((values[0] & 0xFF)); - - nchw_out[out_buf_idx] = packed; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml deleted file mode 100644 index 0386c261203..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -bitw8_image_to_nchw_nobitw8buffer: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: int8 - USE_PUSH_CONST: True - generate_variant_forall: - STORAGE: - - VALUE: texture2d - - VALUE: texture3d - DTYPE: - - VALUE: int8 - - VALUE: uint8 - shader_variants: - - NAME: bitw8_image_to_nchw_nobitw8buffer - - NAME: bitw8_image_to_nchw_nobitw8buffer_no_pc - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h deleted file mode 100644 index 840e98a25ed..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
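Reviewer note: the bitw8 shader above packs four 8-bit values into one 32-bit word because the GPU path has no int8 buffer type. A small standalone check of that little-endian packing (later elements land in the more significant bytes, so a byte-wise copy on a little-endian host restores the original order):

```cpp
// Illustrative check of the 4x int8 -> int32 packing used by
// bitw8_image_to_nchw_nobitw8buffer.glsl.
#include <cassert>
#include <cstdint>
#include <cstring>

int32_t pack4_le(const int8_t v[4]) {
  // Widen through uint32_t to avoid signed-shift pitfalls; same byte layout
  // as the shader's ((v3 & 0xFF) << 24) | ... | (v0 & 0xFF).
  const uint32_t b0 = static_cast<uint8_t>(v[0]);
  const uint32_t b1 = static_cast<uint8_t>(v[1]);
  const uint32_t b2 = static_cast<uint8_t>(v[2]);
  const uint32_t b3 = static_cast<uint8_t>(v[3]);
  return static_cast<int32_t>((b3 << 24) | (b2 << 16) | (b1 << 8) | b0);
}

int main() {
  const int8_t vals[4] = {1, 2, 3, 4};
  const int32_t packed = pack4_le(vals);
  int8_t round_trip[4];
  std::memcpy(round_trip, &packed, 4);  // little-endian host assumed
  for (int i = 0; i < 4; ++i) assert(round_trip[i] == vals[i]);
}
```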
- */ - -ivec4 broadcast_indices(const ivec4 out_idx, const ivec4 in_sizes) { - ivec4 in_idx = out_idx; - for (int i = 0; i < 4; ++i) { - if (out_idx[i] >= in_sizes[i]) { - in_idx[i] = 0; - } - } - return in_idx; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl deleted file mode 100644 index 9d4b18f0d10..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "out_buf", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "in_buf", DTYPE, STORAGE)} -${layout_declare_ubo(2, "int", "numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - int tid = int(gl_GlobalInvocationID.x); - if (tid >= numel) { - return; - } - out_buf[tid] = in_buf[tid]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml deleted file mode 100644 index e8bb86dbf6a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -buffer_to_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: buffer_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl deleted file mode 100644 index 6d164ae2645..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl +++ /dev/null @@ -1,36 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "nchw_buf", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "BufferMetadata", "inp")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// This constant is unused in this shader but is kept so that the signature is -// consistent with image_to_nchw. -${layout_declare_spec_const(C, "int", "unused", "0")} - -void main() { - uint inp_bufi = gl_GlobalInvocationID.x; - if (inp_bufi>= numel(inp)) { - return; - } - - TensorIndex inp_tidx; - linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx); - - uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx); - - nchw_buf[nchwi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml deleted file mode 100644 index 929108cca5e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
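Reviewer note: `broadcast_indices()` in `broadcasting_utils.h` above implements broadcasting by resetting an output coordinate to 0 wherever the input dimension has size 1. A standalone analogue for reference:

```cpp
// Standalone analogue of broadcast_indices() in broadcasting_utils.h.
#include <array>

std::array<int, 4> broadcast_indices(
    const std::array<int, 4>& out_idx, const std::array<int, 4>& in_sizes) {
  std::array<int, 4> in_idx = out_idx;
  for (int i = 0; i < 4; ++i) {
    if (out_idx[i] >= in_sizes[i]) {
      in_idx[i] = 0;  // broadcast dimension of size 1: always read element 0
    }
  }
  return in_idx;
}
```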
- -buffer_to_nchw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: buffer_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh deleted file mode 100644 index cfe5baa9c1d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CHOOSE_QPARAMS_GLSLH -#define CHOOSE_QPARAMS_GLSLH - -// mapping_type : 0 = ASYM, 1 = SYM, 2 = SYM_NO_CLIP -void calc_scale_zp( - float lo, float hi, - int qmin, int qmax, - int mapping_type, - float eps, - out float scale, out int zp) { - // Handle case where lo and hi are +/-INF (no valid values found) - if (isinf(lo) || isinf(hi)) { - lo = 0.0; - hi = 0.0; - } - - float minv = min(lo, 0.0); - float maxv = max(hi, 0.0); - - if (mapping_type == 0) { // asymmetric - scale = (maxv - minv) / float(qmax - qmin); - - // Handle zero or very small scale - if (scale == 0.0 || isinf(1.0/scale)) { - scale = eps; - } - - if (scale < eps) { - float org_scale = scale; - scale = eps; - - // Adjust min and max based on new scale to maintain proper quantization range - if (minv == 0.0) { - maxv = eps * float(qmax - qmin); - } else if (maxv == 0.0) { - minv = -eps * float(qmax - qmin); - } else { - float amplifier = eps / org_scale; - minv *= amplifier; - maxv *= amplifier; - } - } - - // Calculate zero_point (matching reference implementation) - float initial_zero_point = float(qmin) - round(minv / scale); - zp = int(clamp(initial_zero_point, float(qmin), float(qmax))); - } else { // symmetric -- centred - float scale_sym; - if (mapping_type == 1) { // SYM - float M = max(abs(minv), abs(maxv)); - scale_sym = M / (float(qmax - qmin) * 0.5); - } else { // SYM_NO_CLIP - float smin = abs(minv) / max(abs(float(qmin)), 1.0); // Avoid division by zero - float smax = maxv / max(float(qmax), 1.0); // Avoid division by zero - scale_sym = max(smin, smax); - } - - // Handle zero or very small scale - if (scale_sym == 0.0 || isinf(1.0/scale_sym)) { - scale_sym = eps; - } - - scale = max(scale_sym, eps); - zp = int((qmax + qmin + 1) >> 1); // mid-point – always fits - } -} - -#endif // CHOOSE_QPARAMS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl deleted file mode 100644 index 7e21bcf0eba..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
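Reviewer note: `calc_scale_zp()` in `choose_qparams.glslh` above maps an observed float range to a scale and zero point. A host-side reference covering only the asymmetric and symmetric mappings (the eps-rescaling and infinity handling of the shader are omitted) is sketched below; it reproduces the int4 worked example used later in the shader documentation, range [-3.5, 10.2] mapped to [-8, 7].

```cpp
// Host-side reference for the asymmetric/symmetric branches of calc_scale_zp().
#include <algorithm>
#include <cmath>
#include <cstdio>

void calc_scale_zp_ref(
    float lo, float hi, int qmin, int qmax, bool symmetric, float eps,
    float& scale, int& zp) {
  const float minv = std::min(lo, 0.f);
  const float maxv = std::max(hi, 0.f);
  if (!symmetric) {
    scale = std::max((maxv - minv) / float(qmax - qmin), eps);
    const float initial_zp = float(qmin) - std::round(minv / scale);
    zp = int(std::min(std::max(initial_zp, float(qmin)), float(qmax)));
  } else {
    const float max_abs = std::max(std::fabs(minv), std::fabs(maxv));
    scale = std::max(max_abs / (float(qmax - qmin) * 0.5f), eps);
    zp = (qmax + qmin + 1) >> 1;  // mid-point of the quantized range
  }
}

int main() {
  float scale; int zp;
  calc_scale_zp_ref(-3.5f, 10.2f, -8, 7, /*symmetric=*/false, 1e-5f, scale, zp);
  std::printf("asym: scale=%.4f zp=%d\n", scale, zp);  // ~0.9133, -4
  calc_scale_zp_ref(-3.5f, 10.2f, -8, 7, /*symmetric=*/true, 1e-5f, scale, zp);
  std::printf("sym:  scale=%.4f zp=%d\n", scale, zp);  // ~1.3600, 0
}
```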
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define SCALE_OUT_T ${buffer_scalar_type(SCALE_OUT_DTYPE)} -#define ZP_OUT_T ${buffer_scalar_type(ZP_OUT_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("buffer")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(SCALE_OUT_DTYPE)} -${define_required_extensions(ZP_OUT_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} - -$if MODE == "per_tensor": - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - float eps; - }; -$if MODE == "per_token": - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - layout(push_constant) uniform BlockPC { - ivec4 blockSize; // WHCN (>=1) - ivec4 numBlocks; // #blocks along W,H,C,N - ivec4 blockStride; // {1, #W, #W * #H, #W * #H * #C} - int mapping_type; // 0=ASYM, 1=SYM, 2=SYM_NO_CLIP - int quant_min; - int quant_max; - float eps; - }; - -${layout_declare_ubo(B, "ivec4", "t_in_sizes")} -${layout_declare_ubo(B, "ivec4", "t_in_strides")} -${layout_declare_ubo(B, "ivec4", "t_scale_sizes")} -${layout_declare_ubo(B, "ivec4", "t_scale_strides")} -${layout_declare_ubo(B, "ivec4", "t_zero_point_sizes")} -${layout_declare_ubo(B, "ivec4", "t_zero_point_strides")} - -#include "indexing_utils.h" -#include "choose_qparams.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NWORKERS 64 - -// Shared memory for reduction - must match local work group size -shared float shared_min[NWORKERS]; -shared float shared_max[NWORKERS]; - -/* - Quantization Parameter Computation Shader (Buffer Storage) - This shader computes quantization parameters (scale and zero_point) for converting - floating-point tensors to n-bit integer representations while preserving the - original data range as much as possible. The computed parameters enable efficient - quantization by mapping the continuous floating-point range to discrete integer values. - - Important Considerations: - (+) The input tensor is assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - - Workgroup Configuration: - - choose_qparams_per_tensor - This mode computes a single set of quantization parameters for the entire tensor. - Uses parallel reduction across all threads to find global min/max values. - - (*) global_wg_size: {1, 1, 1} (single workgroup processes entire tensor) - (*) local_wg_size: {64, 1, 1} (matches NWORKERS for shared memory) - - - choose_qparams_per_token - This mode computes separate quantization parameters for each token in the tensor. - Each workgroup processes one token independently to find token-specific min/max. - - (*) global_wg_size: {num_tokens, 1, 1} (one workgroup per token) - (*) local_wg_size: {1, 1, 1} (single thread per token) - - - choose_qparams_block_wise - This mode computes quantization parameters for each block of elements, allowing - fine-grained control over quantization granularity within the tensor. Each block - is processed independently to find its own min/max values and compute corresponding - scale and zero_point parameters. 
- - (*) global_wg_size: {nBlocks, 1u, 1u} (one workgroup per block) - (*) local_wg_size: {1, 1, 1} (single thread per block) - - Block-wise quantization supports multiple mapping types for scale/zero_point calculation: - - - mapping_type = 0 (ASYMMETRIC): - Uses asymmetric quantization where the full floating-point range [min, max] is - mapped to the quantized range [quant_min, quant_max]. This preserves the original - data distribution but may not center zero optimally. - - Calculation: - scale = (max - min) / (quant_max - quant_min) - zero_point = quant_min - round(min / scale) - - Example: For range [-3.5, 10.2] mapping to int4 [-8, 7]: - scale = (10.2 - (-3.5)) / (7 - (-8)) = 13.7 / 15 = 0.913 - zero_point = -8 - round(-3.5 / 0.913) = -8 - (-4) = -4 - - - mapping_type = 1 (SYMMETRIC): - Uses symmetric quantization where the range is centered around zero. The scale - is computed based on the maximum absolute value, ensuring zero is exactly - representable in the quantized domain. - - Calculation: - max_abs = max(abs(min), abs(max)) - scale = max_abs / ((quant_max - quant_min) / 2) - zero_point = (quant_max + quant_min + 1) / 2 // midpoint - - Example: For range [-3.5, 10.2] mapping to int4 [-8, 7]: - max_abs = max(3.5, 10.2) = 10.2 - scale = 10.2 / ((7 - (-8)) / 2) = 10.2 / 7.5 = 1.36 - zero_point = (-8 + 7 + 1) / 2 = 0 - - - mapping_type = 2 (SYMMETRIC_NO_CLIPPING_ERR): - A variant of symmetric quantization that minimizes clipping errors by computing - separate scales for positive and negative ranges, then using the maximum. This - reduces quantization error on the dominant range while ensuring no values are - clipped. - - Calculation: - smin = abs(min) / abs(quant_min) // scale for negative range - smax = max / quant_max // scale for positive range - scale = max(smin, smax) // use larger scale to avoid clipping - zero_point = (quant_max + quant_min + 1) / 2 // midpoint - - Example: For range [-3.5, 10.2] mapping to int4 [-8, 7]: - smin = 3.5 / 8 = 0.4375 - smax = 10.2 / 7 = 1.457 - scale = max(0.4375, 1.457) = 1.457 // use smax to avoid clipping positives - zero_point = (-8 + 7 + 1) / 2 = 0 - - Tree Reduction Algorithm for Min/Max Finding: - The shader uses a parallel tree reduction algorithm to efficiently find minimum and - maximum values across multiple threads. This approach reduces the number of memory - accesses and synchronization points compared to sequential scanning. - - Example with 8 threads processing values [10, 1, 8, 1, 0, 2, 3, 5]: - - Step 1 - Initial Population: - Each thread loads its assigned value into shared memory arrays. - shared_min: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - shared_max: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - Thread ID: | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - - Step 2 - Stride 1 (Compare Adjacent Pairs): - Threads 0,2,4,6 compare with threads 1,3,5,7 respectively. - shared_min: | 1 | | 1 | | 0 | | 3 | | (min(10,1), min(8,1), min(0,2), min(3,5)) - shared_max: | 10 | | 8 | | 2 | | 5 | | (max(10,1), max(8,1), max(0,2), max(3,5)) - Active: | 0 | | 2 | | 4 | | 6 | | - - Step 3 - Stride 2 (Compare Pairs of Pairs): - Threads 0,4 compare with threads 2,6 respectively. - shared_min: | 1 | | | | 0 | | | | (min(1,1), min(0,3)) - shared_max: | 10 | | | | 5 | | | | (max(10,8), max(2,5)) - Active: | 0 | | | | 4 | | | | - - Step 4 - Stride 4 (Final Comparison): - Thread 0 compares with thread 4 to get final result. 
- shared_min: | 0 | | | | | | | | (min(1,0) = 0) - shared_max: | 10 | | | | | | | | (max(10,5) = 10) - Active: | 0 | | | | | | | | - - Final Result: global_min = 0, global_max = 10 (stored in shared_min[0], shared_max[0]) - - The tree reduction completes in log_2(N) steps where N is the number of threads, - providing O(log N) time complexity instead of O(N) for sequential reduction. - - Quantization Parameter Calculation: - Once min/max values are determined, the shader computes: - - scale = (max - min) / (quant_max - quant_min) - - zero_point = quantization offset to map floating-point zero to integer range - - Mode-Specific Behavior: - - Per-Tensor: Single workgroup with strided access across entire tensor - - Per-Token: Multiple workgroups, each processing one token independently - - Block-Wise: Each thread processes assigned blocks using nested loops over block dimensions -*/ - -#ifdef per_tensor - -void choose_qparams_per_tensor() { - uint global_id = gl_GlobalInvocationID.x; - uint local_id = gl_LocalInvocationID.x; - uint total_threads = gl_NumWorkGroups.x * gl_WorkGroupSize.x; - - uint total_elements = uint(t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w); - - // Each thread processes multiple elements with stride - float thread_min = 1.0/0.0; // +infinity - float thread_max = -1.0/0.0; // -infinity - bool found_valid = false; - - for (uint i = global_id; i < total_elements; i += total_threads) { - float val = t_in[i]; - if (!isnan(val) && !isinf(val)) { - if (!found_valid) { - thread_min = val; - thread_max = val; - found_valid = true; - } else { - thread_min = min(thread_min, val); - thread_max = max(thread_max, val); - } - } - } - - // Intra-group reduction using shared memory - shared_min[local_id] = thread_min; - shared_max[local_id] = thread_max; - barrier(); - - // Tree reduction within work group - for (uint stride = gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) { - if (local_id < stride) { - float other_min = shared_min[local_id + stride]; - float other_max = shared_max[local_id + stride]; - - if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) { - shared_min[local_id] = other_min; - } - if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) { - shared_max[local_id] = other_max; - } - } - barrier(); - } - - // Final result calculation (single workgroup only) - if (local_id == 0) { - float global_min = shared_min[0]; - float global_max = shared_max[0]; - - float scale_val; - int zero_point_val; - // Use default values: mapping_type=0 (ASYMMETRIC), eps from push constant - calc_scale_zp(global_min, global_max, quant_min, quant_max, 0, eps, scale_val, zero_point_val); - - t_scale[0] = SCALE_OUT_T(scale_val); - t_zero_point[0] = ZP_OUT_T(zero_point_val); - } -} - -#elif defined(per_token) - -void choose_qparams_per_token() { - uint total_elements = uint(t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w); - uint token_size = total_elements / uint(num_tokens); - - const uint TOTAL_TOKENS = uint(num_tokens); - - /* each invocation handles token-ids: id, id+STRIDE, id+2·STRIDE … */ - const uint STRIDE = gl_WorkGroupSize.x * gl_NumWorkGroups.x; - for (uint token_id = gl_GlobalInvocationID.x; token_id < TOTAL_TOKENS; token_id += STRIDE) { - // Calculate the start and end indices for this token - uint token_start = token_id * token_size; - uint token_end = token_start + token_size; - - // Each thread processes the entire token - float lo = 1.0/0.0; // +INF - float hi = -1.0/0.0; // -INF 
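  // (Editor's note, added comment:) The two divisions above are intended to
  // evaluate to +INF and -INF so that they act as identity values for the
  // running min/max; the first finite, non-NaN element encountered replaces
  // them via the found_valid flag below.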
- bool found_valid = false; - - // Process all elements in this token - for (uint i = token_start; i < token_end; i++) { - float val = t_in[i]; - if (!isnan(val) && !isinf(val)) { - if (!found_valid) { - lo = hi = val; - found_valid = true; - } else { - lo = min(lo, val); - hi = max(hi, val); - } - } - } - - if (!found_valid) { - // If no valid values were found, use default values - lo = 0.0; - hi = 0.0; - } - - // Calculate scale and zero point directly - float scale_val; - int zero_point_val; - // Use default values: mapping_type=0 (ASYMMETRIC), eps=1e-5 - calc_scale_zp(lo, hi, quant_min, quant_max, 0, 1e-5, scale_val, zero_point_val); - - // Write results - t_scale[token_id] = SCALE_OUT_T(scale_val); - t_zero_point[token_id] = ZP_OUT_T(zero_point_val); - } -} - -#elif defined(block_wise) - -ivec4 block_id_to_coord(uint bid) { - ivec4 bc; - bc.w = int(bid) / blockStride.w; - - int r = int(bid) - bc.w * blockStride.w; - bc.z = r / blockStride.z; - - r -= bc.z * blockStride.z; - bc.y = r / blockStride.y; - - r -= bc.y * blockStride.y; - bc.x = r; - return bc; -} - -void choose_qparams_block_wise() { - const uint TOTAL_BLOCKS = uint(numBlocks.x * numBlocks.y * numBlocks.z * numBlocks.w); - - // each invocation handles block-ids: id, id+STRIDE, id+2·STRIDE - const uint STRIDE = gl_WorkGroupSize.x * gl_NumWorkGroups.x; - for (uint block_id = gl_GlobalInvocationID.x; block_id < TOTAL_BLOCKS; block_id += STRIDE) { - // block -> WHCN coordinate - ivec4 bc = block_id_to_coord(block_id); - ivec4 blockStart = bc * blockSize; // first element (inclusive) - ivec4 blockEnd = blockStart + blockSize; // last element (exclusive) - - // min / max scan over the block - float lo = 1.0/0.0; // +INF - float hi = -1.0/0.0; // -INF - bool found_valid = false; - - // Calculate actual block dimensions - ivec4 actualBlockSize = blockEnd - blockStart; - int blockElements = actualBlockSize.x * actualBlockSize.y * actualBlockSize.z * actualBlockSize.w; - - // Linear iteration over block elements - for (int elemIdx = 0; elemIdx < blockElements; ++elemIdx) { - // Convert linear index to 4D coordinates within block - int remaining = elemIdx; - int dn = remaining / (actualBlockSize.x * actualBlockSize.y * actualBlockSize.z); - remaining -= dn * (actualBlockSize.x * actualBlockSize.y * actualBlockSize.z); - int dc = remaining / (actualBlockSize.x * actualBlockSize.y); - remaining -= dc * (actualBlockSize.x * actualBlockSize.y); - int dh = remaining / actualBlockSize.x; - int dw = remaining - dh * actualBlockSize.x; - - ivec4 tidx = blockStart + ivec4(dw, dh, dc, dn); - uint idx = tidx_to_bufi(tidx, t_in_strides); - float v = t_in[idx]; - - if (!isnan(v) && !isinf(v)) { - if (!found_valid) { - lo = hi = v; - found_valid = true; - } else { - lo = min(lo, v); - hi = max(hi, v); - } - } - } - - // Handle the case where no valid values were found in the block - if (!found_valid) { - lo = 0.0; - hi = 0.0; - } - - float scale_val; - int zero_point_val; - calc_scale_zp(lo, hi, quant_min, quant_max, mapping_type, eps, scale_val, zero_point_val); - - t_scale[block_id] = SCALE_OUT_T(scale_val); - t_zero_point[block_id] = ZP_OUT_T(zero_point_val); - } -} - -#endif - -void main() { - choose_qparams_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml deleted file mode 100644 index 8459b043baa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml +++ /dev/null @@ -1,22 +0,0 @@ 
-choose_qparams_buffer: - parameter_names_with_default_values: - IN_DTYPE: float - SCALE_OUT_DTYPE: float - ZP_OUT_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: float - SCALE_OUT_DTYPE: - - VALUE: float - ZP_OUT_DTYPE: - - VALUE: int32 - - VALUE: int8 - - VALUE: float - shader_variants: - - NAME: choose_qparams_tensor_buffer - MODE: per_tensor - - NAME: choose_qparams_per_token_asymmetric_buffer - MODE: per_token - - NAME: choose_qparams_block_wise_buffer - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl deleted file mode 100644 index 653b0a251c0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${texel_load_component_type(DTYPE, STORAGE)} - -#define NUM_OUTPUTS_PER_WG ${NUM_OUTPUTS_PER_WG} -#define NUM_WORKERS_PER_OUTPUT ${NUM_WORKERS_PER_OUTPUT} - -// Maximum total threads in a work group -#define MAX_THREADS 256 - -${define_active_storage_type(STORAGE)} -${define_required_extensions("int8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_scales", "float", "buffer")} -${layout_declare_tensor(B, "w", "t_zps", "int", "buffer")} -${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(push_constant) uniform PushConstants { - int quant_min; - int quant_max; -}; - -// Shared memory for cooperative min/max finding -shared T shared_min[NUM_OUTPUTS_PER_WG][NUM_WORKERS_PER_OUTPUT]; -shared T shared_max[NUM_OUTPUTS_PER_WG][NUM_WORKERS_PER_OUTPUT]; - -const float SMALL_SCALE_THRESHOLD = 6.1e-5; - -void calculate_scale_and_zero_point( - float min_val, - float max_val, - int qmin, - int qmax, - out float scale, - out int8_t zero_point) { - - // Extend the [min, max] interval to ensure it contains 0 - min_val = min(min_val, 0.0); - max_val = max(max_val, 0.0); - - // Calculate scale - scale = (max_val - min_val) / float(qmax - qmin); - - // Handle special cases for scale - if (scale == 0.0 || isinf(1.0 / scale)) { - scale = 0.1; - } - - // Cut off small scale - if (scale < SMALL_SCALE_THRESHOLD) { - float org_scale = scale; - scale = SMALL_SCALE_THRESHOLD; - // Adjust the min and max based on the new scale - if (min_val == 0.0) { - max_val = SMALL_SCALE_THRESHOLD * float(qmax - qmin); - } else if (max_val == 0.0) { - min_val = -SMALL_SCALE_THRESHOLD * float(qmax - qmin); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Zero-point computation - float zero_point_from_min = float(qmin) - min_val / scale; - float zero_point_from_max = float(qmax) - max_val / scale; - float zero_point_from_min_error = abs(float(qmin)) - abs(min_val / scale); - float zero_point_from_max_error = abs(float(qmax)) - abs(max_val / scale); - - float initial_zero_point = zero_point_from_min_error < zero_point_from_max_error - ? 
zero_point_from_min - : zero_point_from_max; - - // Nudge zero point to be an integer - int nudged_zero_point; - if (initial_zero_point < float(qmin)) { - nudged_zero_point = qmin; - } else if (initial_zero_point > float(qmax)) { - nudged_zero_point = qmax; - } else { - nudged_zero_point = int(round(initial_zero_point)); - } - - zero_point = int8_t(nudged_zero_point); -} - -#ifdef USING_BUFFER - -VEC4_T load_input_x4(const int x4, const int y, const int ntexels_x) { - return t_input[(y * ntexels_x) + x4]; -} - -#else // USING_TEXTURE - -VEC4_T load_input_x4(const int x4, const int y, const int ntexels_x) { - return texelFetch(t_input, ivec3(x4, y, 0), 0); -} - -#endif // USING_BUFFER - -void main() { - const int worker_id = int(gl_LocalInvocationID.x); - const int output_id = int(gl_LocalInvocationID.y); - - const int output_y = int(gl_GlobalInvocationID.y); - - if (output_y >= input_sizes.y) { - return; - } - - // Input is 2D tensor (height x width), width-packed - // Each channel corresponds to a row in the tensor - const int X4 = div_4(input_sizes.x); - - // Initialize thread-local min/max - float local_min = 1e30; - float local_max = -1e30; - - // Each thread processes elements along their assigned output_id with stride - // NUM_WORKERS_PER_OUTPUT - for (int x4 = worker_id; x4 < X4; x4 += NUM_WORKERS_PER_OUTPUT) { - VEC4_T in_texel = load_input_x4(x4, output_y, X4); - for (int i = 0; i < 4; i++) { - local_min = min(local_min, in_texel[i]); - local_max = max(local_max, in_texel[i]); - } - } - - // Store thread-local results in shared memory - shared_min[output_id][worker_id] = local_min; - shared_max[output_id][worker_id] = local_max; - - memoryBarrierShared(); - barrier(); - - // Tree reduction to compute the overall result - for (int i = NUM_WORKERS_PER_OUTPUT / 2; i > 0; i >>= 1) { - if (worker_id < i) { - shared_min[output_id][worker_id] = min( - shared_min[output_id][worker_id], - shared_min[output_id][worker_id + i]); - shared_max[output_id][worker_id] = max( - shared_max[output_id][worker_id], - shared_max[output_id][worker_id + i]); - } - memoryBarrierShared(); - barrier(); - } - - // Only first thread will write out result - if (worker_id == 0) { - local_min = shared_min[output_id][0]; - local_max = shared_max[output_id][0]; - - float scale; - int8_t zero_point; - calculate_scale_and_zero_point( - local_min, local_max, quant_min, quant_max, scale, zero_point); - - t_scales[output_y] = scale; - t_zps[output_y] = zero_point; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml deleted file mode 100644 index 3608f7193bf..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -choose_qparams_per_row: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - NUM_OUTPUTS_PER_WG: 1 - NUM_WORKERS_PER_OUTPUT: 64 - generate_variant_forall: - STORAGE: - - VALUE: texture3d - - VALUE: buffer - DTYPE: - - VALUE: float - shader_variants: - - NAME: choose_qparams_per_row_o1w64 - - NAME: choose_qparams_per_row_o4w16 - NUM_OUTPUTS_PER_WG: 4 - NUM_WORKERS_PER_OUTPUT: 16 diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl deleted file mode 100644 index a17a3ae41dd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl +++ /dev/null @@ -1,533 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} -#define SCALE_OUT_T ${buffer_scalar_type(SCALE_OUT_DTYPE)} -#define ZP_OUT_T ${buffer_scalar_type(ZP_OUT_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("texture3d")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(SCALE_OUT_DTYPE)} -${define_required_extensions(ZP_OUT_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -$if MODE != "block_wise": - ${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "texture3d")} - ${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "texture3d")} -$else: - ${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "buffer")} - ${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "buffer")} - -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} - -$if MODE == "per_tensor": - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - float eps; - }; -$if MODE == "per_token": - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - layout(push_constant) uniform BlockPC { - ivec4 blockSize; // WHCN (>=1) - ivec4 numBlocks; // #blocks along W,H,C,N - ivec4 blockStride; // {1, #W, #W * #H, #W * #H * #C} - int mapping_type; // 0=ASYM, 1=SYM, 2=SYM_NO_CLIP - int quant_min; - int quant_max; - float eps; - }; - -${layout_declare_ubo(B, "ivec3", "t_in_limits")} -$if MODE != "block_wise": - ${layout_declare_ubo(B, "ivec3", "t_scale_limits")} - ${layout_declare_ubo(B, "ivec3", "t_zero_point_limits")} -$else: - ${layout_declare_ubo(B, "ivec4", "t_scale_sizes")} - ${layout_declare_ubo(B, "ivec4", "t_scale_strides")} - ${layout_declare_ubo(B, "ivec4", "t_zero_point_sizes")} - ${layout_declare_ubo(B, "ivec4", "t_zero_point_strides")} - - -#include "indexing_utils.h" -#include "choose_qparams.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NWORKERS 64 - -// Shared memory for reduction - must match local work group size -shared float shared_min[NWORKERS]; -shared float shared_max[NWORKERS]; - -/* - Quantization Parameter Computation Shader (Texture Storage) - This shader computes quantization parameters (scale and zero_point) for converting - floating-point tensors to n-bit integer representations while preserving the - original data range as much as possible.
The computed parameters enable efficient - quantization by mapping the continuous floating-point range to discrete integer values. - - Important Considerations: - (+) The input tensor is assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - - Workgroup Configuration: - - choose_qparams_per_tensor - This mode computes a single set of quantization parameters for the entire tensor. - Uses parallel reduction across all threads to find global min/max values. - - (*) global_wg_size: default - (*) local_wg_size: default - - - choose_qparams_per_token - This mode computes separate quantization parameters for each token in the tensor. - Each workgroup processes one token independently to find token-specific min/max. - - (*) global_wg_size: default - (*) local_wg_size: {1, 1, 1} - - - choose_qparams_block_wise - This mode computes quantization parameters for each block of elements, allowing - fine-grained control over quantization granularity within the tensor. Each block - is processed independently to find its own min/max values and compute corresponding - scale and zero_point parameters. - - NOTE: This mode currently only supports buffer storage for the output. - - (*) global_wg_size: {nBlocks, 1u, 1u} (one workgroup per block) - (*) local_wg_size: {1, 1, 1} (single thread per block) - - Tree Reduction Algorithm for Min/Max Finding: - The shader uses a parallel tree reduction algorithm to efficiently find minimum and - maximum values across multiple threads. This approach reduces the number of memory - accesses and synchronization points compared to sequential scanning. - - Example with 8 threads processing values [10, 1, 8, 1, 0, 2, 3, 5]: - - Step 1 - Initial Population: - Each thread loads its assigned value into shared memory arrays. - shared_min: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - shared_max: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - Thread ID: | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - - Step 2 - Stride 1 (Compare Adjacent Pairs): - Threads 0,2,4,6 compare with threads 1,3,5,7 respectively. - shared_min: | 1 | | 1 | | 0 | | 3 | | (min(10,1), min(8,1), min(0,2), min(3,5)) - shared_max: | 10 | | 8 | | 2 | | 5 | | (max(10,1), max(8,1), max(0,2), max(3,5)) - Active: | 0 | | 2 | | 4 | | 6 | | - - Step 3 - Stride 2 (Compare Pairs of Pairs): - Threads 0,4 compare with threads 2,6 respectively. - shared_min: | 1 | | | | 0 | | | | (min(1,1), min(0,3)) - shared_max: | 10 | | | | 5 | | | | (max(10,8), max(2,5)) - Active: | 0 | | | | 4 | | | | - - Step 4 - Stride 4 (Final Comparison): - Thread 0 compares with thread 4 to get final result. - shared_min: | 0 | | | | | | | | (min(1,0) = 0) - shared_max: | 10 | | | | | | | | (max(10,5) = 10) - Active: | 0 | | | | | | | | - - Final Result: global_min = 0, global_max = 10 (stored in shared_min[0], shared_max[0]) - - The tree reduction completes in log_2(N) steps where N is the number of threads, - providing O(log N) time complexity instead of O(N) for sequential reduction. 
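  (Editor's note: the following condensed sketch is not part of the original file.
  It restates the reduction loop described above for the NWORKERS = 64 local work
  group size that this shader uses; the NaN/Inf filtering performed by the real
  kernels below is omitted for brevity.)

  shared float shared_min[64];
  shared float shared_max[64];

  void reduce_min_max(const uint local_id) {
    // Six halving steps for 64 workers: strides 32, 16, 8, 4, 2, 1 (log2(64) = 6).
    for (uint stride = 32u; stride > 0u; stride >>= 1) {
      if (local_id < stride) {
        shared_min[local_id] = min(shared_min[local_id], shared_min[local_id + stride]);
        shared_max[local_id] = max(shared_max[local_id], shared_max[local_id + stride]);
      }
      barrier(); // every halving step must finish before the next one starts
    }
    // shared_min[0] / shared_max[0] now hold the work group's min and max.
  }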
- - Quantization Parameter Calculation: - Once min/max values are determined, the shader computes: - - scale = (max - min) / (quant_max - quant_min) - - zero_point = quantization offset to map floating-point zero to integer range - - Mode-Specific Behavior: - - Per-Tensor: Single workgroup with strided access across entire tensor - - Per-Token: Multiple workgroups, each processing one token independently -*/ - -#ifdef per_tensor - -void choose_qparams_per_tensor() { - uint global_id = gl_GlobalInvocationID.x; - uint local_id = gl_LocalInvocationID.x; - uint group_id = gl_WorkGroupID.x; - uint total_threads = gl_NumWorkGroups.x * gl_WorkGroupSize.x; - - uint total_texels = uint(t_in_limits.x * t_in_limits.y * t_in_limits.z); - - // Each thread processes multiple texels with stride - float thread_min = 1.0/0.0; // +infinity - float thread_max = -1.0/0.0; // -infinity - bool found_valid = false; - - // Process texels with stride across all threads - for (uint texel_idx = global_id; texel_idx < total_texels; texel_idx += total_threads) { - // Convert linear texel index to 3D coordinates - uint z = texel_idx / uint(t_in_limits.x * t_in_limits.y); - uint remainder = texel_idx % uint(t_in_limits.x * t_in_limits.y); - uint y = remainder / uint(t_in_limits.x); - uint x = remainder % uint(t_in_limits.x); - ivec3 texel_pos = ivec3(int(x), int(y), int(z)); - - FVEC4_T texel_data = load_texel(t_in, texel_pos); - - // For texture storage, we assume width-packed (packed_dim = 0) - // Calculate number of valid elements in this texel (handle padding) - int packed_dim = 0; // Width dimension is packed - ivec4 sizes = ivec4(t_in_limits, 1); // Convert limits to sizes format - ivec4 tensor_coord = to_tensor_idx(texel_pos, sizes, packed_dim); - - // Calculate total tensor elements to determine padding - int total_elements = t_in_limits.x * t_in_limits.y * t_in_limits.z * 4; - int linear_tensor_idx = tensor_coord.x + tensor_coord.y * sizes.x + - tensor_coord.z * sizes.x * sizes.y; - int remaining_elements = total_elements - (linear_tensor_idx); - int valid_elements = min(4, remaining_elements); - - // Find min/max within this texel, considering only valid elements - if (valid_elements >= 1 && !isnan(texel_data.x) && !isinf(texel_data.x)) { - if (!found_valid) { - thread_min = texel_data.x; - thread_max = texel_data.x; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.x); - thread_max = max(thread_max, texel_data.x); - } - } - - if (valid_elements >= 2 && !isnan(texel_data.y) && !isinf(texel_data.y)) { - if (!found_valid) { - thread_min = texel_data.y; - thread_max = texel_data.y; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.y); - thread_max = max(thread_max, texel_data.y); - } - } - - if (valid_elements >= 3 && !isnan(texel_data.z) && !isinf(texel_data.z)) { - if (!found_valid) { - thread_min = texel_data.z; - thread_max = texel_data.z; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.z); - thread_max = max(thread_max, texel_data.z); - } - } - - if (valid_elements >= 4 && !isnan(texel_data.w) && !isinf(texel_data.w)) { - if (!found_valid) { - thread_min = texel_data.w; - thread_max = texel_data.w; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.w); - thread_max = max(thread_max, texel_data.w); - } - } - } - - // Intra-workgroup reduction using shared memory - shared_min[local_id] = thread_min; - shared_max[local_id] = thread_max; - barrier(); - - // Tree reduction within work group - for (uint stride = 
gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) { - if (local_id < stride) { - float other_min = shared_min[local_id + stride]; - float other_max = shared_max[local_id + stride]; - - if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) { - shared_min[local_id] = other_min; - } - if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) { - shared_max[local_id] = other_max; - } - } - barrier(); - } - - // Final result calculation (single workgroup only for reliability) - if (local_id == 0 && group_id == 0) { - float global_min = shared_min[0]; - float global_max = shared_max[0]; - - float scale_val; - int zero_point_val; - calc_scale_zp(global_min, global_max, quant_min, quant_max, 0, eps, scale_val, zero_point_val); - - write_texel(t_scale, ivec3(0, 0, 0), vec4(SCALE_OUT_T(scale_val), 0.0, 0.0, 0.0)); - write_texel(t_zero_point, ivec3(0, 0, 0), ivec4(ZP_OUT_T(zero_point_val), 0, 0, 0)); - } -} - -#elif defined(per_token) - -void choose_qparams_per_token() { - // Each token is processed by multiple workgroups for parallel reduction - uint local_id = gl_LocalInvocationID.x; - uint group_id = gl_WorkGroupID.x; - uint total_workgroups = gl_NumWorkGroups.x; - - uint total_texels = uint(t_in_limits.x * t_in_limits.y * t_in_limits.z); - - // Calculate texels per token (assuming last dimension contains the token data) - // For per-token quantization, we assume tokens are along the last dimension - uint texels_per_token = total_texels / uint(num_tokens); - - // Calculate how many tokens each workgroup should process - uint tokens_per_workgroup = (uint(num_tokens) + total_workgroups - 1) / total_workgroups; - - // Calculate which tokens this workgroup is responsible for - uint start_token = group_id * tokens_per_workgroup; - uint end_token = min(start_token + tokens_per_workgroup, uint(num_tokens)); - - // Process each token assigned to this workgroup - for (uint token_id = start_token; token_id < end_token; token_id++) { - // Calculate the texel range for this token - uint token_start_texel = token_id * texels_per_token; - uint token_end_texel = token_start_texel + texels_per_token; - - // Each thread processes multiple texels within the token - float thread_min = 1.0/0.0; // +infinity - float thread_max = -1.0/0.0; // -infinity - bool found_valid = false; - - // Process texels within this token only - for (uint texel_idx = token_start_texel + local_id; texel_idx < token_end_texel; texel_idx += gl_WorkGroupSize.x) { - // Convert linear texel index to 3D coordinates - uint z = texel_idx / uint(t_in_limits.x * t_in_limits.y); - uint remainder = texel_idx % uint(t_in_limits.x * t_in_limits.y); - uint y = remainder / uint(t_in_limits.x); - uint x = remainder % uint(t_in_limits.x); - ivec3 texel_pos = ivec3(int(x), int(y), int(z)); - - FVEC4_T texel_data = load_texel(t_in, texel_pos); - - // For texture storage, we assume width-packed (packed_dim = 0) - // Calculate number of valid elements in this texel (handle padding) - int packed_dim = 0; // Width dimension is packed - ivec4 sizes = ivec4(t_in_limits, 1); // Convert limits to sizes format - ivec4 tensor_coord = to_tensor_idx(texel_pos, sizes, packed_dim); - - // Calculate total tensor elements to determine padding - int total_elements = t_in_limits.x * t_in_limits.y * t_in_limits.z * 4; - int linear_tensor_idx = tensor_coord.x + tensor_coord.y * sizes.x + - tensor_coord.z * sizes.x * sizes.y; - int remaining_elements = total_elements - (linear_tensor_idx); - int 
valid_elements = min(4, remaining_elements); - - // Find min/max within this texel, considering only valid elements - if (valid_elements >= 1 && !isnan(texel_data.x) && !isinf(texel_data.x)) { - if (!found_valid) { - thread_min = texel_data.x; - thread_max = texel_data.x; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.x); - thread_max = max(thread_max, texel_data.x); - } - } - - if (valid_elements >= 2 && !isnan(texel_data.y) && !isinf(texel_data.y)) { - if (!found_valid) { - thread_min = texel_data.y; - thread_max = texel_data.y; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.y); - thread_max = max(thread_max, texel_data.y); - } - } - - if (valid_elements >= 3 && !isnan(texel_data.z) && !isinf(texel_data.z)) { - if (!found_valid) { - thread_min = texel_data.z; - thread_max = texel_data.z; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.z); - thread_max = max(thread_max, texel_data.z); - } - } - - if (valid_elements >= 4 && !isnan(texel_data.w) && !isinf(texel_data.w)) { - if (!found_valid) { - thread_min = texel_data.w; - thread_max = texel_data.w; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.w); - thread_max = max(thread_max, texel_data.w); - } - } - } - - // Intra-workgroup reduction using shared memory - shared_min[local_id] = thread_min; - shared_max[local_id] = thread_max; - barrier(); - - // Tree reduction within work group - for (uint stride = gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) { - if (local_id < stride) { - float other_min = shared_min[local_id + stride]; - float other_max = shared_max[local_id + stride]; - - // Handle infinity values properly - if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) { - shared_min[local_id] = other_min; - } - if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) { - shared_max[local_id] = other_max; - } - } - barrier(); - } - - // Final calculation for this token - if (local_id == 0) { - float token_min = shared_min[0]; - float token_max = shared_max[0]; - - float scale_val; - int zero_point_val; - calc_scale_zp(token_min, token_max, quant_min, quant_max, 0, 1e-5, scale_val, zero_point_val); - - // Convert token_id to 3D coordinates for output texture - // Assuming output tensors have the same layout as input but with different dimensions - uint out_z = token_id / uint(t_scale_limits.x * t_scale_limits.y); - uint out_remainder = token_id % uint(t_scale_limits.x * t_scale_limits.y); - uint out_y = out_remainder / uint(t_scale_limits.x); - uint out_x = out_remainder % uint(t_scale_limits.x); - ivec3 out_pos = ivec3(int(out_x), int(out_y), int(out_z)); - - write_texel(t_scale, out_pos, vec4(SCALE_OUT_T(scale_val), 0.0, 0.0, 0.0)); - write_texel(t_zero_point, out_pos, ivec4(ZP_OUT_T(zero_point_val), 0, 0, 0)); - } - - // Synchronize before processing next token - barrier(); - } -} - -#elif defined(block_wise) - -ivec4 block_id_to_coord(uint bid) { - ivec4 bc; - bc.w = int(bid) / blockStride.w; - - int r = int(bid) - bc.w * blockStride.w; - bc.z = r / blockStride.z; - - r -= bc.z * blockStride.z; - bc.y = r / blockStride.y; - - r -= bc.y * blockStride.y; - bc.x = r; - return bc; -} - -void choose_qparams_block_wise() { - const uint T = uint(numBlocks.x * numBlocks.y * numBlocks.z * numBlocks.w); - const uint STRIDE = gl_WorkGroupSize.x * gl_NumWorkGroups.x; - - // tensor full size in WHCN order - const ivec4 tensorSz = blockSize * numBlocks; - - // 
Process blocks with stride for better parallelization - for (uint blkIdx = gl_GlobalInvocationID.x; blkIdx < T; blkIdx += STRIDE) { - // block index in WHCN - const ivec4 b4d = block_id_to_coord(blkIdx); - const ivec4 blockStart = b4d * blockSize; - const ivec4 blockEnd = blockStart + blockSize; - - // scan all elements inside the block - float vmin = 3.402823e38; // +FLT_MAX - float vmax = -3.402823e38; // -FLT_MAX - bool found_valid = false; - - // Calculate total elements in block for linear iteration - const int blockElements = blockSize.x * blockSize.y * blockSize.z * blockSize.w; - - // Linear iteration over block elements (more cache-friendly) - for (int elemIdx = 0; elemIdx < blockElements; ++elemIdx) { - // Convert linear index to 4D coordinates within block - int remaining = elemIdx; - int dn = remaining / (blockSize.x * blockSize.y * blockSize.z); - remaining -= dn * (blockSize.x * blockSize.y * blockSize.z); - int dc = remaining / (blockSize.x * blockSize.y); - remaining -= dc * (blockSize.x * blockSize.y); - int dh = remaining / blockSize.x; - int dw = remaining - dh * blockSize.x; - - ivec4 tidx = blockStart + ivec4(dw, dh, dc, dn); - - // skip padding when tensor size is not an exact multiple of block - if (any(greaterThanEqual(tidx, tensorSz))) { continue; } - - // tensor index -> (x,y,z,component) inside input texture - ivec4 posi = to_texture_elem_pos(tidx, tensorSz, 0); // 0 = W_DIM (width packed) - - // fetch texel and pick the element inside it - FVEC4_T texl = load_texel(t_in, posi.xyz); - float v; - if (posi.w == 0) v = texl.x; - else if (posi.w == 1) v = texl.y; - else if (posi.w == 2) v = texl.z; - else v = texl.w; - - if (!isnan(v) && !isinf(v)) { - if (!found_valid) { - vmin = vmax = v; - found_valid = true; - } else { - vmin = min(vmin, v); - vmax = max(vmax, v); - } - } - } - - // Handle case where no valid values were found - if (!found_valid) { - vmin = 0.0; - vmax = 0.0; - } - - // compute scale / zero‑point (same maths as buffer kernel) - float scale; - int zp; - calc_scale_zp(vmin, vmax, quant_min, quant_max, mapping_type, eps, scale, zp); - - // Write the scalar values directly to buffer using linear index - t_scale[blkIdx] = SCALE_OUT_T(scale); - t_zero_point[blkIdx] = ZP_OUT_T(zp); - } -} - -#endif - -void main() { - choose_qparams_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml deleted file mode 100644 index 12228822d4b..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml +++ /dev/null @@ -1,22 +0,0 @@ -choose_qparams_texture: - parameter_names_with_default_values: - IN_DTYPE: float - SCALE_OUT_DTYPE: float - ZP_OUT_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: float - SCALE_OUT_DTYPE: - - VALUE: float - ZP_OUT_DTYPE: - - VALUE: int32 - - VALUE: int8 - - VALUE: float - shader_variants: - - NAME: choose_qparams_tensor_texture3d - MODE: per_tensor - - NAME: choose_qparams_per_token_asymmetric_texture3d - MODE: per_token - - NAME: choose_qparams_block_wise_texture3d - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.glsl b/backends/vulkan/runtime/graph/ops/glsl/clone.glsl deleted file mode 100644 index 3bd1af8bb0c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/clone.glsl +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - imageStore(t_out, pos, load_texel(t_in, pos)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.yaml b/backends/vulkan/runtime/graph/ops/glsl/clone.yaml deleted file mode 100644 index 1fdbf506bfd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/clone.yaml +++ /dev/null @@ -1,11 +0,0 @@ -clone: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: clone diff --git a/backends/vulkan/runtime/graph/ops/glsl/col2im.glsl b/backends/vulkan/runtime/graph/ops/glsl/col2im.glsl deleted file mode 100644 index c105ef18719..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/col2im.glsl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -#define TILE_M4 1 -#define TILE_N4 1 -#define TILE_K4 1 - -#define TILE_M 4 -#define TILE_N 4 -#define TILE_K 4 - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -// Sizes of the convolution output image -${layout_declare_ubo(B, "ivec4", "output_sizes")} -// Sizes of the convolution input image -${layout_declare_ubo(B, "ivec4", "input_sizes")} -// Sizes of the im2col matrix of the convolution output -${layout_declare_ubo(B, "ivec4", "matrix_sizes")} - -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "conv2d_fp_im2col_block_store.glslh" - -#ifdef INPUT_BUFFER - -void load_matrix_tile( - out FPOutTile tile, - const int n4, - const int m_start, - const int N4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - tile.data[m][0] = t_input[(m_start + m) * N4 + n4]; - } -} - -#else // INPUT_TEXTURE - -void load_matrix_tile( - out FPOutTile tile, - const int n4, - const int m_start, - const int N4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - tile.data[m][0] = texelFetch( - t_input, ivec3(n4, m_start + m, 0), 0); - } -} - -#endif // INPUT_BUFFER - -void main() { - // Each thread loads and writes a 4 wide x 4 high block of the matrix - const int n4 = int(gl_GlobalInvocationID.x); - const int m4 = int(gl_GlobalInvocationID.y); - - const int n = mul_4(n4); - const int m = mul_4(m4); - - if 
(n >= matrix_sizes.x || m >= matrix_sizes.y) { - return; - } - - FPOutTile tile; - - const int N4 = div_4(matrix_sizes.x); - load_matrix_tile(tile, n4, m, N4); - write_im2col_tile_as_image(tile, n4, m); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/col2im.yaml b/backends/vulkan/runtime/graph/ops/glsl/col2im.yaml deleted file mode 100644 index b6d0972271a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/col2im.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -col2im: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - INPUT_STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: col2im_texture3d_buffer - - NAME: col2im_texture3d_texture3d - INPUT_STORAGE: texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh deleted file mode 100644 index 732b7006c2c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef COMMON_GLSLH -#define COMMON_GLSLH - -#define mul_2(x) ((x) << 1) -#define mul_4(x) ((x) << 2) -#define mul_8(x) ((x) << 3) - -#define div_2(x) ((x) >> 1) -#define div_4(x) ((x) >> 2) -#define div_8(x) ((x) >> 3) - -#define div_up_2(x) (((x) + 1) >> 1) -#define div_up_4(x) (((x) + 3) >> 2) -#define div_up_8(x) (((x) + 7) >> 3) - -#define align_up_2(x) ((x + 1) & -2) -#define align_up_4(x) ((x + 3) & -4) -#define align_up_8(x) ((x + 7) & -8) - -#define mod_2(x) ((x) & 1) -#define mod_4(x) ((x) & 3) -#define mod_8(x) ((x) & 7) - -struct TensorIndex4D { - ivec4 data; -}; - -#ifdef DEBUG_MODE - -#extension GL_EXT_debug_printf : require - -void printTensorIndex4D(const TensorIndex4D index) { - debugPrintfEXT( - "tensor_idx: %d, %d, %d, %d\\n", - index.data.x, - index.data.y, - index.data.z, - index.data.w); -} - -#endif // DEBUG_MODE - -#endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl deleted file mode 100644 index e34ecaf8309..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "rw", "t_out", DTYPE, "buffer")} - -$for i in range(NUM_INPUTS): - ${layout_declare_tensor(B, "r", "t_inp" + str(i), DTYPE, "buffer")} - -${layout_declare_tensor(B, "r", "t_concat_offset", "int", "buffer")} - -${layout_declare_ubo(B, "int", "concat_dim")} - -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_strides")} - -$for i in range(NUM_INPUTS): - ${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_sizes")} - ${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_strides")} - -${layout_declare_ubo(B, "int", "out_numel")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NUM_INPUTS ${NUM_INPUTS} - -#include "concat_utils.glslh" - -/* - * This shader template concatenates up to NUM_INPUT input tensors to the - * output tensor along the concat_dim. Elements from the input tensor will - * be inserted along the output's concat_dim starting at concat_offset. - */ -void main() { - const int tid = ivec3(gl_GlobalInvocationID).x; - - // The 1-3 input tensors are interpreted as one concatenated tensor ("volume") - // along the concat_dim for the purposes of tensor indexing. Each thread is - // responsible for reading one item from this volume and writing it to the - // appropriate output location. - ivec4 inp_volume_sizes = out_sizes; - inp_volume_sizes[concat_dim] = total_concat_dim_numel(); - - // Account for 0 size input tensors - if (any(lessThanEqual(inp_volume_sizes, ivec4(0)))) { - return; - } - - ivec4 inp_volume_tidx = nchwi_to_tidx(tid, inp_volume_sizes); - - // bounds check - if (any(greaterThanEqual(inp_volume_tidx, inp_volume_sizes))) { - return; - } - - int concat_offset = t_concat_offset[0]; - - ivec4 out_tidx = inp_volume_tidx; - out_tidx[concat_dim] += concat_offset; - - const uint out_bufi = tidx_to_bufi(out_tidx, out_strides); - - // Go through the list of input tensors, and find which input this output - // element should be read from. 
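  // (Editor's note: the worked example below was added for clarity and is not
  //  part of the original source; the sizes are hypothetical.)
  // Suppose two inputs are concatenated along concat_dim with sizes 3 and 5.
  // An element whose volume index along concat_dim is 6 fails the first check
  // (6 >= 3), so 3 is subtracted and index 3 is read from the second input.
  // The unrolled checks generated below perform exactly this walk, subtracting
  // each input's size along concat_dim until the index falls inside one of the
  // inputs.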
- $for i in range(NUM_INPUTS): - if (inp_volume_tidx[concat_dim] < inp${i}_sizes[concat_dim]) { - int inp_bufi = tidx_to_bufi(inp_volume_tidx, inp${i}_strides); - t_out[out_bufi] = t_inp${i}[inp_bufi]; - return; - } - else { - inp_volume_tidx[concat_dim] -= inp${i}_sizes[concat_dim]; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml deleted file mode 100644 index 39f96df5e90..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml +++ /dev/null @@ -1,14 +0,0 @@ -concat_buffer: - parameter_names_with_default_values: - DTYPE: float - NUM_INPUTS: 2 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: concat_1_buffer - NUM_INPUTS: 1 - - NAME: concat_2_buffer - - NAME: concat_3_buffer - NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl deleted file mode 100644 index afab0c524d6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -#define USING_TEXTURE3D - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "rw", "t_out", DTYPE, "texture3d")} - -$for i in range(NUM_INPUTS): - ${layout_declare_tensor(B, "r", "t_inp" + str(i), DTYPE, "texture3d")} - -${layout_declare_tensor(B, "r", "t_concat_offset", "int", "buffer")} - -${layout_declare_ubo(B, "int", "concat_dim")} - -$in_metadata = "" -$for i in range(NUM_INPUTS): - $in_metadata += "ivec4 inp" + str(i) + "_sizes;\n" - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ${in_metadata} -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -$for i in range(NUM_INPUTS): - ${layout_declare_spec_const(C, "int", "inp" + str(i) + "_layout", "DEFAULT_LAYOUT")} - const lowp ivec4 inp${i}_axis_map = unhash_axis_map(inp${i}_layout); - const lowp int inp${i}_packed_dim = unhash_packed_dim(inp${i}_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NUM_INPUTS ${NUM_INPUTS} - -#include "concat_utils.glslh" - -/* - * This shader template concatenates up to NUM_INPUT input tensors to the - * output tensor along the concat_dim. Elements from the input tensor will - * be inserted along the output's concat_dim starting at concat_offset. - * - * Each thread is responsible for writing out one output texel. The data - * required for the output texel may be read from multiple input texels of one - * input tensor. - */ -void main() { - const int tid = ivec3(gl_GlobalInvocationID).x; - - // Sum of the sizes of all input tensors along the concat_dim - const int concat_numel = total_concat_dim_numel(); - - // The 1-3 input tensors are interpreted as one concatenated tensor ("volume") - // along the concat_dim for the purposes of tensor indexing. 
Each thread is - // responsible for writing out 4 elements along the packed dim of the output - // tensor by reading the source data from the input tensor(s). - ivec4 inp_volume_sizes = out_sizes; - inp_volume_sizes[concat_dim] = total_concat_dim_numel(); - - // Reconstruct inp_volume_texel_sizes from Concat.cpp - ivec4 inp_volume_texel_sizes = inp_volume_sizes; - inp_volume_texel_sizes[out_packed_dim] = DIV_UP_4( - inp_volume_texel_sizes[out_packed_dim] - ) + 1; - - // tensor index of the first element that will be read from the input volume - ivec4 inp_volume_start_tidx = nchwi_to_tidx(tid, inp_volume_texel_sizes); - inp_volume_start_tidx[out_packed_dim] = MUL_4( - inp_volume_start_tidx[out_packed_dim] - ); - - int concat_offset = t_concat_offset[0]; - - // tensor index of the first element that will be written to the output tensor - ivec4 out_write_start_tidx = inp_volume_start_tidx; - out_write_start_tidx[concat_dim] += concat_offset; - - // To write to the the desired output element, we will need to load the texel - // to which the element belongs. Calculate the tensor index of the first - // element of that texel. - ivec4 out_read_start_tidx = out_write_start_tidx; - out_read_start_tidx[out_packed_dim] = ALIGN_DOWN_4( - out_write_start_tidx[out_packed_dim]); - - // bounds check - if (any(greaterThanEqual(out_read_start_tidx, out_sizes))) { - return; - } - - ivec3 out_pos = tidx_to_pos( - out_read_start_tidx, - out_sizes, - out_axis_map, - out_packed_dim - ); - - VEC4_T out_texel = imageLoad(t_out, out_pos); - - VEC4_T test_texel = VEC4_T(-1.0); - - for (int comp = 0; comp < 4; ++comp) { - ivec4 out_tidx = out_read_start_tidx; - out_tidx[out_packed_dim] += comp; - - - // It's possible that the current texel element has been written to as part - // of the previous input batch; if so, then don't overwrite this texel - // element - if (out_tidx[concat_dim] < concat_offset) { - test_texel[comp] = -5.0; - continue; - } - - // Calculate the tidx of the input volume that corresponds to this output - // element - ivec4 inp_volume_tidx = out_tidx; - inp_volume_tidx[concat_dim] -= concat_offset; - - // go through the list of input tensors, and figure out which input this - // output element should be read from. - $for i in range(NUM_INPUTS): - if (inp_volume_tidx[concat_dim] < inp${i}_sizes[concat_dim]) { - // Special fast path case if, for the first output texel element, the - // corresponding input element is at the start of the texel it belongs - // to. In this case, the input texel can be written as-is to the output - // texel. Also require that The entire input texel is valid and does not - // contain any padding elements. 
- if (comp == 0 && - out_tidx[out_packed_dim] % 4 == 0 && - inp_volume_tidx[inp${i}_packed_dim] % 4 == 0 && - inp_volume_tidx[inp${i}_packed_dim] + 3 < inp${i}_sizes[inp${i}_packed_dim]) { - const ivec3 in_pos = tidx_to_pos( - inp_volume_tidx, - inp${i}_sizes, - inp${i}_axis_map, - inp${i}_packed_dim); - - out_texel = texelFetch(t_inp${i}, in_pos, 0); - break; - } - - // Otherwise, locate the specific input element required - const ivec4 in_posi = tidx_to_posi( - inp_volume_tidx, - inp${i}_sizes, - inp${i}_axis_map, - inp${i}_packed_dim); - - out_texel[comp] = texelFetch(t_inp${i}, in_posi.xyz, 0)[in_posi.w]; - test_texel[comp] = out_texel[comp]; - continue; - } - else { - inp_volume_tidx[concat_dim] -= inp${i}_sizes[concat_dim]; - } - } - - imageStore(t_out, out_pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml deleted file mode 100644 index ed5003382a1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml +++ /dev/null @@ -1,14 +0,0 @@ -concat_texture: - parameter_names_with_default_values: - DTYPE: float - NUM_INPUTS: 2 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: concat_1_texture3d - NUM_INPUTS: 1 - - NAME: concat_2_texture3d - - NAME: concat_3_texture3d - NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/concat_utils.glslh deleted file mode 100644 index 000b86a7fce..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_utils.glslh +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONCAT_UTILS_H -#define CONCAT_UTILS_H - - -/********************************** - * Concatenation utililty functions - * - */ - -/* - * Returns the total number of elements along the concatenation dim that will - * be concatenated in this input batch. - */ -$for N in range(1, 4): - #if NUM_INPUTS == ${N} - int total_concat_dim_numel() { - int total = 0; - $for i in range(N): - total += inp${i}_sizes[concat_dim]; - - return total; - } - #endif - -#endif // CONCAT_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl deleted file mode 100644 index 4e3b91e6c49..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")} - -${layout_declare_ubo(B, "float", "out_min", "float", "out_max")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "kernel_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 kernel_axis_map = unhash_axis_map(kernel_layout); - -${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); - -// Let us define -// -// input = (N, in_C, in_L), -// output = (N, out_C, out_L), -// groups = G, -// kernel = K, -// -// which results in shapes -// -// weight = (out_C, in_C / G, K), -// bias = (out_C,). -// -// This implementation performs N x out_C x out_L shader invocations, where each invocation -// calculates the rolling kernel of the length dimension for each batch, i.e., -// computes out_L results. -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // "out_c" is the output's channel index where we write our result. - // Across shader invocations, this is the only value that varies. - const int out_c = lpos.y; - - // "in_c" tracks the input's channel start index. - // We iterate over the input group that corresponds to the output group. - const int c_start = (out_c / out_group_size) * in_group_size; - const int c_end = c_start + in_group_size; - - // "out_l" tracks the output's length index where we write our result. - const int out_l = lpos.x; - - // "N" is the batch index - const int N = lpos.z; - - // "in_l" tracks the input's length start index for our input-kernel overlay - // region. - const int in_l = out_l * stride - padding; - VEC4_T sum = VEC4_T(0); - - const int out_c_packed_index = out_c >> 2; - const int out_c_packed_lane = out_c & 0x3; - - for (int in_c = c_start; in_c < c_end; ++in_c) { - // "k" tracks the kernel's index for our input-kernel computation. - // It reads out-of-bound zeros, but trying to avoid them complicates - // for-loop conditions, which results in worse performance. - - // The weight tensor is channel-packed. This may not be the obvious choice - // for performance, since it requires more data fetches. The reason is that - // for some sequence models, we found that the weight tensor - // (out_channel, in_channel / group, kernel) often has - // out_channel >> kernel, leading to non-optimal use of memory as the - // weight tensor gets very deep. As a mitigation, we use channel-packing - // for the weight tensor, yielding a 75% reduction in weight-tensor - // memory.
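  // (Editor's note: the illustration below was added for clarity; the concrete
  //  sizes are hypothetical.)
  // With channel packing, each weight texel holds 4 consecutive output
  // channels. For a weight of shape (out_C = 8, in_C / G = 4, K = 3), accesses
  // span x = K = 3, y = in_C / G = 4, z = ceil(out_C / 4) = 2, and output
  // channel out_c = 6 reads the texel at (k, in_c % in_group_size, 6 >> 2 = 1)
  // and selects lane 6 & 3 = 2, matching the w_lpos / out_c_packed_lane
  // indexing below.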
- - // It is possible to further reduce the memory footprint by swapping the - // dimensions, using x extent for out_channel, and y for kernel. - for (int k = 0; k < kernel_size; k++) { - const ivec3 w_lpos = ivec3(k, in_c % in_group_size, out_c_packed_index); - const VEC4_T weight_texel = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map); - VEC4_T weight = VEC4_T(weight_texel[out_c_packed_lane]); - - const ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, N), in_axis_map); - sum = fma(weight, load_texel(t_in, in_pos), sum); - } - } - - const VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c_packed_index, 0, 0), bias_axis_map); - const ivec3 out_lpos = ivec3(out_l, out_c, N); - write_texel_lpos(t_out, out_lpos, op(sum + bias[out_c_packed_lane], out_min, out_max), out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml deleted file mode 100644 index 2266649d2b9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv1d: - parameter_names_with_default_values: - OPERATOR: X - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv1d - - NAME: conv1d_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl deleted file mode 100644 index 0f5dbc41273..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -/* - * Computes a 2D convolution. Each shader invocation calculates the output at - * a single output location. - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. 
Padding is assumed - // to be constant 0 padding, so reads from the padding region are skipped. - ivec2 start = ipos; - if (start.x < 0) { - // number of "steps" to get to >= zero is div_up(-start, dilation) - int num_steps = ((-ipos.x) + dilation.x - 1) / dilation.x; - start.x = ipos.x + num_steps * dilation.x; - } - if (start.y < 0) { - // number of "steps" to get to >= zero is div_up(-start, dilation) - int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y; - start.y = ipos.y + num_steps * dilation.y; - } - const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); - // Compute the start of the kernel based on how far we are skipping ahead when - // reading the input. Note that these are "canonical" indices. - ivec2 kstart = (start - ipos) / dilation; - // During prepacking, the weight tensor was rearranged in order to optimize - // for data access linearity in this shader. Therefore we need to adjust the - // canonical coordinates to the corresponding index in the rearranged weight - // tensor. The x-coordinate is multipled by 4 since each group of 4 channels - // is folded into the X axis. The y-coordinate is offset based on the z- - // coordinate because the 2D planes were stacked atop each other vertically. - kstart.x *= 4; - kstart.y += pos.z * kernel_size.y; - - // Perform the convolution by iterating over the overlay region. - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - const int ic4 = in_group_size / 4; - - int z_start = 0; - int z_end = ic4; - if (ngroups > 1) { - const int group_size = (out_limits.z) / ngroups; - const int group_idx = pos.z / group_size; - - z_start = ic4 * group_idx; - z_end = z_start + ic4; - } - - for (int z4 = z_start; z4 < z_end; ++z4, kstart.x += kernel_size.x * 4) { - for (int y = start.y, ky = kstart.y; y < end.y; y += dilation.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += dilation.x, kx += 4) { - const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, z4), 0); - const ivec4 kxs = kx + ivec4(0, 1, 2, 3); - - // To explain the calculation below, the contents of in_texel and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_texel t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B1 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is expressed in the following statements. 
- - sum = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(kxs.x, ky), 0), sum); - sum = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(kxs.y, ky), 0), sum); - sum = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(kxs.z, ky), 0), sum); - sum = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(kxs.w, ky), 0), sum); - } - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml deleted file mode 100644 index 1a5ed58876c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d - - NAME: conv2d_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh deleted file mode 100644 index 41825cba867..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_COMMON_GLSLH -#define CONV2D_COMMON_GLSLH - -#include "common.glslh" - -struct Conv2DParams { - ivec2 kernel_size; - ivec2 stride; - ivec2 padding; - ivec2 dilation; - int groups; - int out_channels_per_group; - int in_channels_per_group; - int logical_K_per_group; - int K_per_group; - int K4_per_group; - int logical_K; - int K; - int K4; -}; - -#ifdef DEBUG_MODE - -void printConv2DParams(const Conv2DParams params) { - debugPrintfEXT("Conv2DParams: \\n"); - debugPrintfEXT( - " kernel_size: %d, %d\\n", params.kernel_size.x, params.kernel_size.y); - debugPrintfEXT(" stride: %d, %d\\n", params.stride.x, params.stride.y); - debugPrintfEXT(" padding: %d, %d\\n", params.padding.x, params.padding.y); - debugPrintfEXT(" dilation: %d, %d\\n", params.dilation.x, params.dilation.y); - debugPrintfEXT(" groups: %d\\n", params.groups); - debugPrintfEXT( - " out_channels_per_group: %d\\n", params.out_channels_per_group); - debugPrintfEXT( - " in_channels_per_group: %d\\n", params.in_channels_per_group); - debugPrintfEXT(" logical_K_per_group: %d\\n", params.logical_K_per_group); - debugPrintfEXT(" K_per_group: %d\\n", params.K_per_group); - debugPrintfEXT(" K4_per_group: %d\\n", params.K4_per_group); -} - -#endif // DEBUG_MODE - -#endif // CONV2D_COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl deleted file mode 100644 index 02fbef29b75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
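The `Conv2DParams` struct above carries both "logical" and texel-aligned im2col sizes, but their relationship is not spelled out in the header itself. The sketch below shows one consistent interpretation, inferred from how the fields are used by the im2col helpers later in this diff (group lookup via `K_per_group`, bounds checks via `logical_K_per_group`); it is an assumption for illustration, not the actual host-side computation:

```
// Hypothetical sketch of how the derived Conv2DParams fields plausibly relate
// to the basic convolution attributes. The real values are filled in on the
// host; this only illustrates one consistent set of definitions.
int align_up_4(const int n) {
  return (n + 3) & ~3;
}

void fill_derived_sizes(inout Conv2DParams p) {
  // im2col columns contributed by one group (true, unpadded count)
  p.logical_K_per_group =
      p.in_channels_per_group * p.kernel_size.x * p.kernel_size.y;
  // padded so that each group starts on a texel (multiple-of-4) boundary
  p.K_per_group  = align_up_4(p.logical_K_per_group);
  p.K4_per_group = p.K_per_group / 4;
  // totals across all groups
  p.logical_K = p.logical_K_per_group * p.groups;
  p.K  = p.K_per_group * p.groups;
  p.K4 = p.K4_per_group * p.groups;
}
```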
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -/* - * Computes a depthwise convolution. Each shader invocation calculates the - * output at a single output location. - */ -void main() { - const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; - const ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits.x, - div_by_x % out_limits.y, - div_by_x / out_limits.y); - - if (pos.z >= out_limits.z) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so reads from the padding region are skipped. - const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - int kx = 0; - for (int y = start.y; y < end.y; y += dilation.y) { - for (int x = start.x; x < end.x; x += dilation.x) { - // The weight kernel was rearranged such that every NxN filter is - // flattened to fit in one row. Each filter was then stacked on top of - // each other vertically. - const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); - sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); - ++kx; - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml deleted file mode 100644 index 5202cddba76..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_dw: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw - - NAME: conv2d_dw_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl deleted file mode 100644 index 19250419baf..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
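Two indexing conventions in conv2d_dw.glsl above are worth restating. First, the flat `gl_GlobalInvocationID.x` is unpacked into an (x, y, packed-channel) output position with div/mod against `out_limits`. Second, the depthwise weight was prepacked with every KhxKw filter flattened into a single texture row, so the column index `kx` simply counts kernel taps in row-major (ky, kx) order. Both mappings as small illustrative helpers (same semantics as the shader above, not part of the original file):

```
// Restates the index math used by conv2d_dw.glsl. Illustrative only.
ivec3 unpack_output_pos(const uint flat_id, const ivec3 out_limits) {
  const uint div_by_x = flat_id / uint(out_limits.x);
  return ivec3(
      int(flat_id % uint(out_limits.x)),   // output x
      int(div_by_x % uint(out_limits.y)),  // output y
      int(div_by_x / uint(out_limits.y))); // packed output channel (texel) index
}

// Column of the prepacked depthwise weight texture for kernel tap (kx, ky):
// each KhxKw filter occupies one texture row, flattened row-major.
int dw_kernel_column(const int kx, const int ky, const ivec2 kernel_size) {
  return ky * kernel_size.x + kx;
}
```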
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE ${TILE_SIZE} - -#define BATCH_SIZE_X ${BATCH_SIZE_X} - -#define BATCH_SIZE_Y ${BATCH_SIZE_Y} - -#define LOCAL_WG_SIZE 64 - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec4 in_sizes; - ivec2 kernel_size; - ivec2 stride; - ivec2 padding; - ivec2 dilation; - ivec2 overlay_region; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * Computes a depthwise convolution. Each shader invocation calculates the - * output at a single output location. - */ - -void main() { - // x and y are divided by batch size to determine 3d position - // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z - const ivec2 out_limits_xy_scaled = (out_limits.xy + ivec2(BATCH_SIZE_X, BATCH_SIZE_Y) - 1) / ivec2(BATCH_SIZE_X, BATCH_SIZE_Y); - - const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x; - ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits_xy_scaled.x, - div_by_x, - gl_GlobalInvocationID.y); - - // do not process if top pixel does not fit within the output range - if (pos.y >= out_limits_xy_scaled.y || pos.z >= out_limits.z) { - return; - } - - // scale pos.xy by batch sizes, because that's the top pixel to be processed - pos.x *= BATCH_SIZE_X; - pos.y *= BATCH_SIZE_Y; - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so any reads from the padding region is skipped. 
- const ivec2 start = ipos; - - // sum outputs - VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X]; - - for (int i = 0; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++) { - sum[i] = VEC4_T(0); - } - - // array to store input texels - VEC4_T in_texels[TILE_SIZE + BATCH_SIZE_X - 1]; - - // array to store kernel data of previous y - VEC4_T prev_kernel_line[TILE_SIZE]; - - int kx = 0; - for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) { - for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) { - in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0); - } - - // from 2nd iteration onwards accumulate dot product in 2nd sum - // based on kernel line data fetched in previous iteration and input texel from this iteration - if (i > 0) { - for (int j = 0; j < TILE_SIZE; j++) { - for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]); - } - } - } - - // accumulate dot product in 1st sum only until tile size - if (i < TILE_SIZE) { - for (int j = 0; j < TILE_SIZE; j++, kx++) { - prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0); - for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]); - } - } - } - } - - const VEC4_T bias = texelFetch(t_bias, ivec2(pos.z, 0), 0); - for (int y = 0; y < BATCH_SIZE_Y; y++) { - for (int x = 0; x < BATCH_SIZE_X; x++) { - const ivec3 out_pos = ivec3(pos.x + x, pos.y + y, pos.z); - if (all(lessThan(out_pos.xy, out_limits.xy))) { - imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max)); - } - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml deleted file mode 100644 index 9cf6c22c6ca..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_dw_output_tile: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE: 3 - BATCH_SIZE_X: 4 - BATCH_SIZE_Y: 2 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw_output_tile_3x3 - - NAME: conv2d_dw_output_tile_3x3_clamp - OPERATOR: clamp(X, A, B) - - NAME: conv2d_dw_output_tile_5x5 - TILE_SIZE: 5 - - NAME: conv2d_dw_output_tile_5x5_clamp - OPERATOR: clamp(X, A, B) - TILE_SIZE: 5 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl deleted file mode 100644 index f5361d40b66..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
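In the tiled depthwise shader above, each invocation owns a BATCH_SIZE_X x BATCH_SIZE_Y block of output pixels, so the dispatch grid is the output extent divided (rounding up) by the tile shape, and every store is bounds-checked because the last tile can hang past the edge. A compact sketch of that decomposition (illustrative only; `ceil_div` and `tile_origin` are hypothetical helpers that mirror the arithmetic above):

```
ivec2 ceil_div(const ivec2 a, const ivec2 b) {
  return (a + b - 1) / b;
}

// Top-left output pixel covered by tile (tile_x, tile_y).
ivec2 tile_origin(const ivec2 tile_idx, const ivec2 tile_size) {
  return tile_idx * tile_size;
}

// Example: a 7x5 output with 4x2 tiles needs ceil_div(ivec2(7, 5), ivec2(4, 2))
// == ivec2(2, 3) tiles per channel plane; the tile at (1, 2) starts at pixel
// (4, 4), and its out-of-range pixels are skipped by the lessThan() check
// before imageStore, exactly as in the shader above.
```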
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} -#define SCALAR_T ${texel_component_type(DTYPE)} - -#include "indexing_utils.h" - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; -layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - BUF_T buffer_in[]; -}; - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 sizes; - ivec4 original_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes special prepacking for a depthwise convolution. Each shader invocation - * calculates the input buffer location to read into the desired texel. This - * packing was originally developed on CPU here: - * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L58-L118 - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - // Map tensor_idx to normal buffer_i - const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); - - // Compute modified tensor_idx by inverting the CPU function - const int N = original_sizes.w; - const int C = original_sizes.z; - const int H = original_sizes.y; - const int W = original_sizes.x; - const int Y = sizes.y; - - const ivec4 p1 = p0 / W; - const ivec4 p2 = p1 / H; - - const ivec4 n = (p2 % Y) * 4 + (p2 / Y); - const ivec4 h = p1 % H; - const ivec4 w = p0 % W; - - // Map modified tensor_idx to modifed buffer_i - // Zero out if modified tensor idx is out of bounds - const ivec4 buf_i = n * C*H*W + h * W + w; - const bvec4 mask = bvec4(lessThan(n, ivec4(N))); - - VEC4_T texel = VEC4_T(0); - if (mask.x) { - texel.x = SCALAR_T(buffer_in[buf_i.x]); - } - if (mask.y) { - texel.y = SCALAR_T(buffer_in[buf_i.y]); - } - if (mask.z) { - texel.z = SCALAR_T(buffer_in[buf_i.z]); - } - if (mask.w) { - texel.w = SCALAR_T(buffer_in[buf_i.w]); - } - - imageStore(image_out, pos.xy, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml deleted file mode 100644 index 33342145a82..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_dw_prepack_weights: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl deleted file mode 100644 index f161c1ba460..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
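The prepacking shader above (and the regular conv2d prepacking shader later in this diff) builds every output texel with the same pattern: compute four candidate buffer indices, mask out the lanes whose recovered tensor index falls outside the original (unpadded) sizes, and gather element by element. That pattern, pulled out as a generic sketch (hypothetical helper; assumes the same `buffer_in`, `VEC4_T`, and `SCALAR_T` declarations as the shaders above):

```
// Generic masked gather used by the weight-prepacking shaders: read four
// scalars from the staging buffer, substituting zero for lanes that map
// outside the original tensor extents. Illustrative only.
VEC4_T masked_gather(const ivec4 buf_i, const bvec4 mask) {
  VEC4_T texel = VEC4_T(0);
  if (mask.x) texel.x = SCALAR_T(buffer_in[buf_i.x]);
  if (mask.y) texel.y = SCALAR_T(buffer_in[buf_i.y]);
  if (mask.z) texel.z = SCALAR_T(buffer_in[buf_i.z]);
  if (mask.w) texel.w = SCALAR_T(buffer_in[buf_i.w]);
  return texel;
}
```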
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE ${TILE_SIZE} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec4 in_sizes; - ivec2 kernel_size; - ivec2 stride; - ivec2 padding; - ivec2 dilation; - ivec2 overlay_region; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * Computes a depthwise convolution. Each shader invocation calculates the - * output at a single output location. - */ - -void main() { - const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; - const ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits.x, - div_by_x, - gl_GlobalInvocationID.y); - - // do not process if top pixel does not fit within the output range - if (pos.y >= out_limits.y || pos.z >= out_limits.z) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so any reads from the padding region is skipped. - const ivec2 start = ipos; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - int kx = 0; - for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) { - for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) { - // The weight kernel was rearranged such that every NxN filter is - // flattened to fit in one row. Each filter was then stacked on top of - // each other vertically. - const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); - sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); - kx++; - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml deleted file mode 100644 index f2ece8fa0f9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -conv2d_dw_sned_output_tile: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw_sned_output_tile_3x3 - - NAME: conv2d_dw_sned_output_tile_3x3_clamp - OPERATOR: clamp(X, A, B) - - NAME: conv2d_dw_sned_output_tile_5x5 - TILE_SIZE: 5 - - NAME: conv2d_dw_sned_output_tile_5x5_clamp - OPERATOR: clamp(X, A, B) - TILE_SIZE: 5 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block.glslh deleted file mode 100644 index 7add8c4cd16..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block.glslh +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_FP_IM2COL_BLOCK -#define CONV2D_FP_IM2COL_BLOCK - -/* - * Defines utilities to convert between (col, row) indices of an im2col matrix - * and 4-dimension tensor indices of image tensors. - * - * Requires: - * - output_sizes to be defined in the shader layout, corresponding to the sizes - * of the output image of the convolution op. - * - image_sizes to be defined in the shader layout, corresponding to the sizes - * of the input image of the convolution op. - * - conv2d_params to be defined in the shader layout - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "common.glslh" -#include "conv2d_common.glslh" - -struct Im2ColMatrixIdx { - int row; - int col; - // Relevant for grouped convolution. This indicates the column index relative - // to the first column in the group. - int col_idx_in_group; - int group_idx; -}; - -void unwrap_m(out TensorIndex4D out_tidx_base, const int m) { - out_tidx_base.data[3] = m / (output_sizes.y * output_sizes.x); - out_tidx_base.data[1] = (m / output_sizes.x) % output_sizes.y; - out_tidx_base.data[0] = m % output_sizes.x; - - // Initialize channels to 0; assume it will be set later on - out_tidx_base.data[2] = 0; -} - -void im2col_tidx_to_output_tidx( - out TensorIndex4D output_tidx, - const Im2ColMatrixIdx im2col_tidx) { - unwrap_m(output_tidx, im2col_tidx.row); - // Set channels - output_tidx.data.z = im2col_tidx.col; -} - -/* - * Converts im2col matrix position to corresponding 4D tensor index, accounting - * for grouped convolutions. The conversion should ensure that all data within - * the same group occupy a contiguous block in memory. 
- */ -void im2col_idx_to_input_tidx( - out TensorIndex4D input_tidx, - const Im2ColMatrixIdx im2col_idx) { - TensorIndex4D output_tidx; - unwrap_m(output_tidx, im2col_idx.row); - - const int in_channels_per_group = conv2d_params.in_channels_per_group; - // Determine the corresponding position within the convolution window based - // on the col index (more specifically, the col index within the group) - const int channel_within_group = - im2col_idx.col_idx_in_group % in_channels_per_group; - const int kernel_x = (im2col_idx.col_idx_in_group / in_channels_per_group) % - conv2d_params.kernel_size.x; - const int kernel_y = im2col_idx.col_idx_in_group / - (in_channels_per_group * conv2d_params.kernel_size.x); - - // Calculate the actual input channel index - const int channel_idx = - im2col_idx.group_idx * conv2d_params.in_channels_per_group + - channel_within_group; - - // Calculate corresponding input coordinates based on output position - // associated with the row index. - const int input_y = int(output_tidx.data.y * conv2d_params.stride.y) - - int(conv2d_params.padding.y) + int(kernel_y * conv2d_params.dilation.y); - const int input_x = int(output_tidx.data.x * conv2d_params.stride.x) - - int(conv2d_params.padding.x) + int(kernel_x * conv2d_params.dilation.x); - - input_tidx.data = ivec4(input_x, input_y, channel_idx, output_tidx.data.w); -} - -// 4x4 block of the im2col matrix -struct FPIm2ColBlock { - VEC4_T data[4]; -}; - -#endif // CONV2D_FP_IM2COL_BLOCK diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_load.glslh deleted file mode 100644 index c02b070e17e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_load.glslh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_FP_IM2COL_BLOCK_LOAD -#define CONV2D_FP_IM2COL_BLOCK_LOAD - -/* - * Defines utilities to load data for a 4x4 im2col matrix block from an - * input image and store the data as a FPInputTile. - * - * Requires: - * - t_input to be defined in the shader layout, representing the texture of the - * source image - * - conv2d_params to be defined in the shader layout - */ - -#extension GL_EXT_control_flow_attributes : require - -#extension GL_EXT_debug_printf : require - -#include "common.glslh" -#include "conv2d_common.glslh" -#include "conv2d_fp_im2col_block.glslh" -#include "linear_fp_input_tile.glslh" - -VEC4_T load_input_texel(const TensorIndex4D tidx) { - // Assumes batch size is 1 and channels packing - return texelFetch( - t_input, ivec3(tidx.data.x, tidx.data.y, div_4(tidx.data.z)), 0); -} - -T load_input_texel_element(const TensorIndex4D tidx) { - const int channels_texel_idx = div_4(tidx.data.z); - const int texel_comp = mod_4(tidx.data.z); - // Assumes batch size is 1 and channels packing - return texelFetch( - t_input, - ivec3(tidx.data.x, tidx.data.y, channels_texel_idx), - 0)[texel_comp]; -} - -// k4 -> group of 4 input channels idx -// m -> flattened batch, output width, output height dim idx -/* - * Fast impl for when the input image's channels per group is a multiple of 4. - * In this case, it is guaranteed that a texel loaded from the input can be - * stored directly to the output without any additional filtering. 
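The helpers above fix the im2col coordinate system: row m enumerates (batch, out_y, out_x) with out_x varying fastest, so M = out_W * out_H * batches, and the within-group column index enumerates (kernel_y, kernel_x, in_channel within group) with the channel varying fastest. The sketch below restates that decomposition and the resulting input coordinate for a single (row, column-within-group, group) triple; it is a free-standing illustrative function that assumes the same `output_sizes` and `conv2d_params` declarations as this header:

```
// Restates the im2col coordinate math from the helpers above. Sketch only.
ivec4 im2col_to_input_coord(
    const int row,          // m: enumerates (batch, out_y, out_x), x fastest
    const int col_in_group, // enumerates (kernel_y, kernel_x, channel), channel fastest
    const int group_idx) {
  // Decompose the row into an output position
  const int out_x = row % output_sizes.x;
  const int out_y = (row / output_sizes.x) % output_sizes.y;
  const int batch = row / (output_sizes.x * output_sizes.y);

  // Decompose the column into a kernel tap and an input channel
  const int icpg     = conv2d_params.in_channels_per_group;
  const int channel  = group_idx * icpg + (col_in_group % icpg);
  const int kernel_x = (col_in_group / icpg) % conv2d_params.kernel_size.x;
  const int kernel_y = col_in_group / (icpg * conv2d_params.kernel_size.x);

  // Standard convolution input coordinate for that output position and tap
  const int in_x = out_x * conv2d_params.stride.x - conv2d_params.padding.x
      + kernel_x * conv2d_params.dilation.x;
  const int in_y = out_y * conv2d_params.stride.y - conv2d_params.padding.y
      + kernel_y * conv2d_params.dilation.y;

  // (W, H, C, N) order, matching TensorIndex4D in the helpers above
  return ivec4(in_x, in_y, channel, batch);
}
```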
- */ -void load_im2col_block_fast( - out FPIm2ColBlock block, - const int k4, - const int m4, - const int logical_K, - const int M) { - Im2ColMatrixIdx im2col_idx; - im2col_idx.col = mul_4(k4); // k - im2col_idx.row = mul_4(m4); // m - - // Due to the assumption that in_channels_per_group % 4 == 0, it is - // guaranteed that the next 4 columns (including this one) is part of the - // same group. - im2col_idx.group_idx = im2col_idx.col / conv2d_params.K_per_group; - im2col_idx.col_idx_in_group = im2col_idx.col % conv2d_params.K_per_group; - - [[unroll]] for (int m_off = 0; m_off < 4; ++m_off) { - if (im2col_idx.row >= M) { - block.data[m_off] = VEC4_T(0); - continue; - } - - TensorIndex4D input_tidx; - im2col_idx_to_input_tidx(input_tidx, im2col_idx); - - // Load the texel - block.data[m_off] = load_input_texel(input_tidx); - - im2col_idx.row++; - } -} - -/* - * If input image channels is not a multiple of 4, then it is likely that for - * some matrix texels, the source data is split between different texels of the - * source image. In this case it's better to retreive each element individually. - */ -void load_im2col_block_slow( - out FPIm2ColBlock block, - const int k4, - const int m4, - const int logical_K, - const int M) { - Im2ColMatrixIdx im2col_idx_base; - im2col_idx_base.col = mul_4(k4); - im2col_idx_base.row = mul_4(m4); - - im2col_idx_base.group_idx = im2col_idx_base.col / conv2d_params.K_per_group; - im2col_idx_base.col_idx_in_group = - im2col_idx_base.col % conv2d_params.K_per_group; - - [[unroll]] for (int m_off = 0; m_off < 4; ++m_off) { - [[unroll]] for (int k_off = 0; k_off < 4; ++k_off) { - Im2ColMatrixIdx im2col_idx = im2col_idx_base; - im2col_idx.row += m_off; - im2col_idx.col_idx_in_group += k_off; - - // bounds checking - if (im2col_idx.col_idx_in_group >= conv2d_params.logical_K_per_group || - im2col_idx.row >= M) { - block.data[m_off][k_off] = T(0); - continue; - } - - TensorIndex4D input_tidx; - im2col_idx_to_input_tidx(input_tidx, im2col_idx); - - block.data[m_off][k_off] = load_input_texel_element(input_tidx); - } - } -} - -void load_im2col_block( - out FPIm2ColBlock block, - const int k4, - const int m4, - const int logical_K, - const int M) { - if (mod_4(conv2d_params.in_channels_per_group) == 0) { - load_im2col_block_fast(block, k4, m4, logical_K, M); - } else { - load_im2col_block_slow(block, k4, m4, logical_K, M); - } -} - -void load_input_im2col_tile( - out FPInputTile tile, - const int k4_start, - const int m4_start, - const int logical_K, - const int M) { - FPIm2ColBlock block; -#if TILE_K4 == 1 - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - load_im2col_block(block, k4_start, m4_start + m4, logical_K, M); - for (int row = 0; row < 4; ++row) { - const int m = mul_4(m4) + row; - tile.data[m][0] = block.data[row]; - } - } - -#else - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - load_im2col_block(block, k4_start + k4, m4_start + m4, logical_K, M); - for (int row = 0; row < 4; ++row) { - const int m = mul_4(m4) + row; - tile.data[m][k4] = block.data[row]; - } - } - } - -#endif -} - -#endif // CONV2D_FP_IM2COL_BLOCK_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_store.glslh deleted file mode 100644 index 2171d75c628..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_store.glslh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. 
and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_FP_IM2COL_BLOCK_STORE -#define CONV2D_FP_IM2COL_BLOCK_STORE - -/* - * Defines utilities to store data for a 4x4 im2col output matrix block computed - * from matrix multiplication to an output image. - * - * Requires: - * - t_output to be defined in the shader layout, representing the texture of - * the output image - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "common.glslh" -#include "conv2d_common.glslh" -#include "conv2d_fp_im2col_block.glslh" -#include "linear_fp_output_tile.glslh" - -// TODO: implement buffer support -void write_output_texel(const VEC4_T out_texel, const TensorIndex4D tidx) { - // Assume batch size is 1 - imageStore( - t_output, ivec3(tidx.data.x, tidx.data.y, div_4(tidx.data.z)), out_texel); -} - -void write_im2col_tile_as_image( - const FPOutTile tile, - const int n4_start, - const int m_start) { - Im2ColMatrixIdx im2col_tidx; - im2col_tidx.col = mul_4(n4_start); - im2col_tidx.row = m_start; -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - TensorIndex4D output_tidx; - im2col_tidx_to_output_tidx(output_tidx, im2col_tidx); - - if (any(greaterThanEqual(output_tidx.data, output_sizes))) { - continue; - } - write_output_texel(tile.data[m][0], output_tidx); - im2col_tidx.row++; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - TensorIndex4D output_tidx; - im2col_tidx_to_output_tidx(output_tidx, im2col_tidx); - - write_output_texel(tile.data[m][k4], output_tidx); - im2col_tidx.row++; - } - } - -#endif -} - -#endif // CONV2D_FP_IM2COL_BLOCK_STORE diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl deleted file mode 100644 index d2f3f615f74..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} -#define SCALAR_T ${texel_component_type(DTYPE)} - -#include "indexing_utils.h" - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; -layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - BUF_T buffer_in[]; -}; - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 sizes; - ivec4 original_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes special prepacking for a 2D convolution. Each shader invocation - * calculates the input buffer locations to read into the desired texel. 
This - * packing was originally developed on CPU here: - * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211 - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - // Map tensor_idx to normal buffer_i - const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); - - // Compute modified tensor_idx by inverting the CPU function - const int N = original_sizes.w; - const int C = original_sizes.z; - const int H = original_sizes.y; - const int W = original_sizes.x; - const int J = sizes.x / (4*W); - const int K = sizes.y / H; - - const ivec4 p1 = p0 / 4; - const ivec4 p2 = p1 / W; - const ivec4 p3 = p2 / J; - const ivec4 p4 = p3 / H; - - const ivec4 n = (p4 % K) * 4 + (p4 / K); - const ivec4 c = (p2 % J) * 4 + (p0 % 4); - const ivec4 h = p3 % H; - const ivec4 w = p1 % W; - - // Map modified tensor_idx to modified buffer_i - // Zero out if modified tensor idx is out of bounds - const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w; - const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C)))); - - VEC4_T texel = VEC4_T(0); - if (mask.x) { - texel.x = SCALAR_T(buffer_in[buf_i.x]); - } - if (mask.y) { - texel.y = SCALAR_T(buffer_in[buf_i.y]); - } - if (mask.z) { - texel.z = SCALAR_T(buffer_in[buf_i.z]); - } - if (mask.w) { - texel.w = SCALAR_T(buffer_in[buf_i.w]); - } - - imageStore(image_out, pos.xy, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml deleted file mode 100644 index 28cf63dc163..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_prepack_weights: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl deleted file mode 100644 index 4c6031152ee..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE_X ${TILE_SIZE_X} -#define TILE_SIZE_Y ${TILE_SIZE_Y} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec2 stride; - ivec2 padding; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -#extension GL_EXT_control_flow_attributes : require - -/* - * Computes a 2D pointwise convolution of an NxN output tile. Calculating an - * output tile for pointwise convolution is more efficient because the kernel - * size is only 1x1, making it easier to re-use loaded texels from t_kernel. - */ -void main() { - const int out_limits_scaled[2] = - {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, - (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - - const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; - - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { - return; - } - - // Output position for TILE_SIZE = 2 - // +--------+--------+ - // | pos[0] | pos[1] | - // +--------+--------+ - // | pos[2] | pos[3] | - // +--------+--------+ - int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { - for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i * 2] = out_pos[0] * TILE_SIZE_X + x; - pos[i * 2 + 1] = out_pos[1] * TILE_SIZE_Y + y; - i++; - } - } - - // Compute the index of the input texture that needs to be loaded for each - // output position. Note that negative indices can be produced indicating that - // the top-left element is in a region added by padding. - int ipos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - ipos[i * 2] = pos[i * 2] * stride.x - padding.x; - ipos[i * 2 + 1] = pos[i * 2 + 1] * stride.y - padding.y; - } - - // Final output array where each element is a tensor value. - // Tuple of consecutive 4 elements represents a single output texel. - float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - - const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); - - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { - sum[i] = bias.x; - sum[i + 1] = bias.y; - sum[i + 2] = bias.z; - sum[i + 3] = bias.w; - } - - int z4 = 0; - // Since the kernel is 1x1, we only have to loop over the depth dimension. - for (int z = 0; z < in_group_size; z += 4, ++z4) { - // During prepacking, the weight tensor has been permuted so that the - // channel (IC) dim is along the x-axis, and the batch (OC) dim is along - // the z-axis. 
- float kernel_values[4 * 4]; // 4 channels, 4 elements per channel - - // Load kernel values from texels to array - [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); - kernel_values[i * 4 + 0] = k_tex.x; - kernel_values[i * 4 + 1] = k_tex.y; - kernel_values[i * 4 + 2] = k_tex.z; - kernel_values[i * 4 + 3] = k_tex.w; - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i * 2], ipos[i * 2 + 1], z4), 0); - // Load the input texel into an array - float tex_values[4]; - tex_values[0] = in_tex.x; - tex_values[1] = in_tex.y; - tex_values[2] = in_tex.z; - tex_values[3] = in_tex.w; - - // For 2x2 tile size algorithm works as follows. - // To explain the calculations below, the contents of one in_tex and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_tex t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B2 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is what is expressed in the following calculations. This is done - // for each output position. - for (int j = 0; j < 4; ++j) { - sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; - } - } - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); - if (all(lessThan(pos_l, out_limits.xyz))) { - imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml deleted file mode 100644 index d4cb69d7648..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
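The long diagrammed comment in conv2d_pw.glsl above describes, for each group of four input channels, a 4x4 multiply-accumulate: each component of the input texel is multiplied against the matching kernel texel and added into the four output channels of one pixel. Stripped of the tiling and array bookkeeping, the per-pixel update is just the following (illustrative only; assumes the same `t_kernel` layout as the shader, with `z` the first of the four input channels and `oc4` the output-channel texel index):

```
// One 4-input-channel step of the pointwise convolution for a single output
// pixel: a 4x4 kernel block times a 4-vector input texel. Sketch only.
vec4 pw_accumulate_step(vec4 acc, const vec4 in_texel, const int z, const int oc4) {
  acc = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(z + 0, oc4), 0), acc);
  acc = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(z + 1, oc4), 0), acc);
  acc = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(z + 2, oc4), 0), acc);
  acc = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(z + 3, oc4), 0), acc);
  return acc;
}
```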
- -conv2d_pw: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE_X: 1 - TILE_SIZE_Y: 4 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_pw - - NAME: conv2d_pw_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl deleted file mode 100644 index 9f84afeb1a1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE_X uint16_t(${TILE_SIZE_X}) -#define TILE_SIZE_Y uint16_t(${TILE_SIZE_Y}) - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec2 stride; - ivec2 padding; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -#extension GL_EXT_control_flow_attributes : require - -/* - * Computes a 2D pointwise convolution of an NxN output tile. Calculating an - * output tile for pointwise convolution is more efficient because the kernel - * size is only 1x1, making it easier to re-use loaded texels from t_kernel. - */ -void main() { - const int out_limits_scaled[2] = - {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, - (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - - const uint16_t div_by_x = uint16_t(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const uint16_t out_pos_xy[2] = {uint16_t(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x}; - const int out_pos_z = int(gl_GlobalInvocationID.y); - - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (out_pos_xy[1] >= out_limits_scaled[1] || out_pos_z >= out_limits.z) { - return; - } - - // Output position for TILE_SIZE = 2 - // +--------+--------+ - // | pos[0] | pos[1] | - // +--------+--------+ - // | pos[2] | pos[3] | - // +--------+--------+ - uint16_t pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (uint16_t y = uint16_t(0), i = uint16_t(0); y < TILE_SIZE_Y; ++y) { - for (uint16_t x = uint16_t(0); x < TILE_SIZE_X; ++x) { - pos[i * 2] = out_pos_xy[0] * TILE_SIZE_X + x; - pos[i * 2 + 1] = out_pos_xy[1] * TILE_SIZE_Y + y; - i++; - } - } - - // Final output array where each element is a tensor value. - // Tuple of consecutive 4 elements represents a single output texel. - float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i++) { - sum[i] = 0; - } - - int z4 = 0; - // Since the kernel is 1x1, we only have to loop over the depth dimension. 
- for (int z = 0; z < in_group_size; z += 4, ++z4) { - // During prepacking, the weight tensor has been permuted so that the - // channel (IC) dim is along the x-axis, and the batch (OC) dim is along - // the z-axis. - float kernel_values[4 * 4]; // 4 channels, 4 elements per channel - - // Load kernel values from texels to array - [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos_z), 0); - kernel_values[i * 4 + 0] = k_tex.x; - kernel_values[i * 4 + 1] = k_tex.y; - kernel_values[i * 4 + 2] = k_tex.z; - kernel_values[i * 4 + 3] = k_tex.w; - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(pos[i * 2], pos[i * 2 + 1], z4), 0); - // Load the input texel into an array - float tex_values[4]; - tex_values[0] = in_tex.x; - tex_values[1] = in_tex.y; - tex_values[2] = in_tex.z; - tex_values[3] = in_tex.w; - - // For 2x2 tile size algorithm works as follows. - // To explain the calculations below, the contents of one in_tex and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_tex t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B2 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is what is expressed in the following calculations. This is done - // for each output position. - for (int j = 0; j < 4; ++j) { - sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; - } - } - } - - const vec4 bias = texelFetch(t_bias, ivec2(out_pos_z, 0), 0); - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos_z); - if (all(lessThan(pos_l.xy, out_limits.xy))) { - const vec4 out_sum = vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]); - imageStore(t_out, pos_l, op(out_sum + bias, out_min, out_max)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml deleted file mode 100644 index ebfee11c405..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
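The practical difference between this s1p0 variant and the general conv2d_pw.glsl earlier in the diff is that stride = 1 and padding = 0 are baked in: the input position then equals the output position, so the per-tile `ipos` array disappears and `t_in` is fetched directly at `pos` (with the bias folded in after the accumulation loop instead of before it). The specialization, spelled out as a tiny illustrative helper:

```
// With stride == (1, 1) and padding == (0, 0), the general input-position
// formula used by conv2d_pw.glsl collapses to the identity, which is why the
// s1p0 variant indexes t_in directly at the output pixel. Sketch only.
ivec2 pw_input_pos(const ivec2 out_pos, const ivec2 stride, const ivec2 padding) {
  return out_pos * stride - padding; // == out_pos when stride == 1, padding == 0
}
```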
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_pw_s1p0: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE_X: 1 - TILE_SIZE_Y: 4 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_pw_s1p0 - - NAME: conv2d_pw_s1p0_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.glsl deleted file mode 100644 index e2b239800a8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.glsl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "1")} - -#include "linear_fp_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_bias_load.glslh" -#include "linear_fp_output_tile_fp_int8_compute.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "conv2d_fp_im2col_block_store.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = int(out_tile_x * TILE_N); - const int m = int(out_tile_y * TILE_M); - - const int n4 = div_4(n); - const int m4 = div_4(m); - - // M = flattened output width, height, batches dims - const int M = output_sizes.x * output_sizes.y * output_sizes.w; - // N = output channels - const int N = output_sizes.z; - - if (n >= N || m >= M) { - return; - } - - const int group_idx = n / conv2d_params.out_channels_per_group; - const int input_k4_offset = conv2d_params.K4_per_group * group_idx; - - const int K4 = conv2d_params.K4; - const int N4 = div_up_4(N); - - FPOutTile out_tile; - initialize(out_tile); - - FPInputTile in_tile; - 
Int8WeightTile int8_weight_tile; - - const bool dont_check_bounds = (M - m) >= TILE_M; - - if (dont_check_bounds) { - for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { - load_input_tile_no_checks(in_tile, k4 + input_k4_offset, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } else { - for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { - load_input_tile_with_checks(in_tile, k4 + input_k4_offset, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, n4); - - apply_scales_and_biases(out_tile, weight_scales_tile, bias_tile); - } - else { - apply_scales(out_tile, weight_scales_tile); - } - - write_im2col_tile_as_image(out_tile, n4, m); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.yaml deleted file mode 100644 index 9b3b5aa2c0a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_q8csw_linear_tiled: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: conv2d_q8csw_linear_tiled_texture3d_buffer_texture2d - - NAME: conv2d_q8csw_linear_tiled_texture3d_buffer_buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.glsl deleted file mode 100644 index f74a1311095..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.glsl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
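Both quantized conv2d shaders in this diff treat the convolution as a tiled matrix multiply over the im2col matrix: M spans the flattened (batch, out_y, out_x) dimension, N spans output channels, K spans the padded im2col columns, and grouped convolution is handled by offsetting the K range to the group that the output-channel tile belongs to. In the weight-only-quantized variant above, the int8 weights are dequantized after accumulation by applying the per-output-channel scales (and optional bias) once per tile. A sketch of the per-thread tile bookkeeping, mirroring the code above (illustrative only; assumes the same `output_sizes`, `conv2d_params`, and TILE_M / TILE_N definitions):

```
// Per-thread tile coordinates shared by the conv2d_q8* shaders above. Sketch only.
void compute_tile_coords(
    const uvec2 thread_id,
    out int m, out int n,          // top-left element of this thread's output tile
    out int group_idx,             // which convolution group the tile belongs to
    out int input_k4_offset) {     // first K4 index owned by that group
  n = int(thread_id.x) * TILE_N;   // output-channel dim
  m = int(thread_id.y) * TILE_M;   // flattened (batch, out_y, out_x) dim

  // Bounds, as in the shaders above:
  //   M = output_sizes.x * output_sizes.y * output_sizes.w  (W * H * batches)
  //   N = output_sizes.z                                    (output channels)
  group_idx = n / conv2d_params.out_channels_per_group;
  input_k4_offset = conv2d_params.K4_per_group * group_idx;
}
```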
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if PACKED_INT8_INPUT_STORAGE == "buffer": - #define PACKED_INT8_INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -#extension GL_EXT_integer_dot_product : require - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(push_constant) uniform restrict Block { - float input_scale; - int input_zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "1")} - -#include "linear_int8_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_int_weight_sums_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_output_tile_int8_int8_compute.glslh" -#include "linear_fp_bias_load.glslh" -#include "conv2d_fp_im2col_block_store.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = int(out_tile_x * TILE_N); - const int m = int(out_tile_y * TILE_M); - - const int n4 = div_4(n); - const int m4 = div_4(m); - - // M = flattened output width, height, batches dims - const int M = output_sizes.x * output_sizes.y * output_sizes.w; - // N = output channels - const int N = output_sizes.z; - - if (n >= N || m >= M) { - return; - } - - const int group_idx = n / conv2d_params.out_channels_per_group; - const int input_k4_offset = conv2d_params.K4_per_group * group_idx; - - const int K4 = conv2d_params.K4; - const int N4 = div_up_4(N); - - Int32Accum out_accum; - initialize(out_accum); - - Int8InputTile int8_in_tile; - Int8WeightTile int8_weight_tile; - - for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { - load_int8_input_tile(int8_in_tile, k4 + input_k4_offset, m4, K4); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - - int_accumulate_with_int8_weight(out_accum, int8_in_tile, int8_weight_tile); - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - IntPerOutChannelParams weight_sums_tile; - load_weight_sums_tile(weight_sums_tile, n4); - - FPOutTile out_tile; - initialize(out_tile); - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, int(n4)); - - accumulate_out_tile_with_int_accum( - out_tile, - 
out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile, - bias_tile); - } - else { - accumulate_out_tile_with_int_accum( - out_tile, - out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile); - } - - write_im2col_tile_as_image(out_tile, n4, m); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.yaml deleted file mode 100644 index 629001765c1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_q8ta_q8csw_linear_tiled: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - PACKED_INT8_INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: conv2d_q8ta_q8csw_linear_tiled_texture3d_buffer_texture2d - - NAME: conv2d_q8ta_q8csw_linear_tiled_texture3d_buffer_buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl deleted file mode 100644 index 740fe10e048..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes a 2D transpose convolution. Each shader invocation calculates the - * output at a single output location. For details, refer to conv2d.glsl which - * uses a similar approach. 
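 * The nested loops below visit only those input positions whose strided
 * kernel footprint overlaps this output location (the start/end bounds are
 * derived from kernel_size, stride and padding), accumulating kernel-weighted
 * input texels into sum before the fused output operator is applied.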
- */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - ivec2 ipos = pos.xy + padding; - - const ivec2 start = max( - ivec2(0), - ivec2(ceil((vec2(ipos) - kernel_size + 1) / vec2(stride)))); - const ivec2 end = - min(ivec2(in_sizes.xy), - ivec2(floor(vec2(ipos) / vec2(stride))) + 1); - - const int ic = in_group_size; - const int kx_stride = ic * (stride.x - 1); - - int ky_start = overlay_region.y - 1 - (ipos.y - stride.y * start.y) + pos.z * kernel_size.y; - int kx_start = (overlay_region.x - 1 - (ipos.x - stride.x * start.x)) * ic; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - for (int y = start.y, ky = ky_start; y < end.y; ++y, ky += stride.y) { - for (int x = start.x, kx = kx_start; x < end.x; ++x, kx += kx_stride) { - for (int z4 = 0; z4 < ic / 4; ++z4, kx += 4) { - const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, z4), 0); - const ivec4 kxs = kx + ivec4(0, 1, 2, 3); - - sum = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(kxs.x, ky), 0), sum); - sum = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(kxs.y, ky), 0), sum); - sum = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(kxs.z, ky), 0), sum); - sum = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(kxs.w, ky), 0), sum); - } - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml deleted file mode 100644 index 0940444bf7d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv_transpose2d: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv_transpose2d - - NAME: conv_transpose2d_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl deleted file mode 100644 index 0b10683cee4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} -#define SCALAR_T ${texel_component_type(DTYPE)} - -#include "indexing_utils.h" - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; -layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - BUF_T buffer_in[]; -}; - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 sizes; - ivec4 original_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes special prepacking for a 2D transpose convolution. 
Each shader - * invocation calculates the input buffer locations to read into the desired - * texel. This packing was originally developed on CPU here: - * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211 - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - // Map tensor_idx to normal buffer_i - const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); - - // Compute modified tensor_idx by inverting the CPU function - const int N = original_sizes.w; - const int C = original_sizes.z; - const int H = original_sizes.y; - const int W = original_sizes.x; - const int J = sizes.y / H; - const int K = sizes.x / (4*W); - - const ivec4 p1 = p0 / (4*K); - const ivec4 p2 = p1 / W; - const ivec4 p3 = p2 / H; - - const ivec4 n = p0 % (4*K); - const ivec4 c = (p3 % J) * 4 + (p3 / J); - const ivec4 h = H-1 - p2 % H; - const ivec4 w = W-1 - p1 % W; - - // Map modified tensor_idx to modifed buffer_i - // Zero out if modified tensor idx is out of bounds - const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w; - const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C)))); - - VEC4_T texel = VEC4_T(0); - if (mask.x) { - texel.x = SCALAR_T(buffer_in[buf_i.x]); - } - if (mask.y) { - texel.y = SCALAR_T(buffer_in[buf_i.y]); - } - if (mask.z) { - texel.z = SCALAR_T(buffer_in[buf_i.z]); - } - if (mask.w) { - texel.w = SCALAR_T(buffer_in[buf_i.w]); - } - - imageStore(image_out, pos.xy, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml deleted file mode 100644 index d933cd097aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv_transpose2d_prepack_weights: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv_transpose2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl deleted file mode 100644 index 39aa9b11a0d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - // Operates on (x, y, z) logical extents. - // channel_range is stored in range.w - ivec4 range; - // Analogus to range variable in copy. It defines the # of channel being - // copied. 
- // dst channel offset is stored in dst_offset.w - ivec4 dst_offset; - int src_channel_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - // Note: Unlike other shaders, the range is often not equal to the destination - // texture extent. - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range.xyz))) { - return; - } - - const ivec3 out_lpos = lpos + dst_offset.xyz; - - const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); - - // First read the existing values to make sure the boundary values stay. - VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map); - - ivec4 in_tidx = out_tidx; - for (int i=0; i<4; i++) { - - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; - - // Handle the partial update for begining of channel in an existing tensor. - // If the source channel index is below zero or exceeds the range, we skip - // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { - continue; - } - - // Readjust for the source offset. - in_tidx[packed_dim] += src_channel_offset; - - ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim); - v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w]; - } - - write_texel_lpos(t_out, out_lpos, v, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml deleted file mode 100644 index 984d9a09d43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_channel_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl deleted file mode 100644 index 178814a90c3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec3 range; - // xyz is source offset w is channel size - ivec4 src_offset; - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "batch_index_function", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range))) { - return; - } - - ivec3 in_pos = pos + src_offset.xyz; - ivec3 out_pos = pos + dst_offset.xyz; - if (src_offset.w > 0) { - if (batch_index_function == 1) { - // batch index is calculated using source channel size - const int channel_index = pos.z % src_offset.w; - const int batch_index = pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; - } else if (batch_index_function == 2) { - // batch index is calculated using destination channel size - const int channel_index = pos.z % dst_offset.w; - const int batch_index = pos.z / dst_offset.w; - in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; - } - } - - write_texel_lpos( - t_out, - out_pos, - load_texel_lpos(t_in, in_pos, in_axis_map), - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml deleted file mode 100644 index 09f5ca36ea4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml +++ /dev/null @@ -1,17 +0,0 @@ -copy_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - shader_variants: - - NAME: copy_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl deleted file mode 100644 index 3100565d08a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - - // xyz is source offset w is channel size - ivec4 src_offset; - - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // Position in input tensor - ivec3 in_pos = pos + src_offset.xyz; - in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); - - // Read input value mapping to this output texel - VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); - - // Starting offset to read from a texel - const int src_lane_offset = src_offset[packed_dim] & 0x3; - const bool has_src_lane_offset = src_lane_offset != 0; - - // If input lane offset is non zero i.e packed texel is composed from multiple sources - if (has_src_lane_offset) { - // Boundary values will come from next input texel in the packed dim. - ivec3 next_in_pos = in_pos; - next_in_pos[packed_dim] = in_pos[packed_dim] + 1; - VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); - - // Keep input values from the end of current input pixel based on src_lane_offset - // offset 1 means the first lane of current input texel is not a part of the output texel - // offset 2 means first 2 lanes are not and so on - // Copy next texel's values towards the end of input texel, based on lane offset - // offset 1 means the first lane from next texel is part of the input texel - // offset 2 means first 2 lanes from next texel is part of the input texel and so on - if (src_lane_offset == 1) { - in_value = ivec4(in_value.yzw, next_value.x); - } else if (src_lane_offset == 2) { - in_value = ivec4(in_value.zw, next_value.xy); - } else { - in_value = ivec4(in_value.w, next_value.xyz); - } - } - - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_dst_lane_offset = out_lane_offset != 0; - - ivec3 out_pos = pos + dst_offset.xyz; - out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); - - VEC4_T out_value; - - // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_dst_lane_offset) { - // When position in packed dim is > 0 - if (pos[packed_dim] > 0) { - // Boundary values will come from previous input texel in the packed dim. 
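      // As a concrete example, with dst_offset[packed_dim] == 6 the output
      // texel index advances by 6 >> 2 == 1 and out_lane_offset == 6 & 3 == 2,
      // so lanes .xy of the output texel come from the previous input texel
      // and lanes .zw come from the current one.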
- ivec3 prev_in_pos = in_pos; - prev_in_pos[packed_dim] = in_pos[packed_dim] - 1; - VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map); - - // Shift values toward the beginning based on out_lane_offset - // offset 1 means the last lane from the previous texel is a part of the output texel - // offset 2 means last 2 lanes and so on - if (out_lane_offset == 1) { - out_value.x = prev_value.w; - } else if (out_lane_offset == 2) { - out_value.xy = prev_value.zw; - } else { - out_value.xyz = prev_value.yzw; - } - } else { - // When position in packed dim is == 0 - // Boundary values will be the previous texel values. - out_value = load_texel_lpos(existing_out, out_pos, out_axis_map); - } - - // Copy input values towards the end of output array, based on lane offset - // offset 1 means the first lane from previous texel is part of the output texel starting at offset - // offset 2 means first 2 lanes from the previous texel is part of the output texel and so on - if (out_lane_offset == 1) { - out_value.yzw = in_value.xyz; - } else if (out_lane_offset == 2) { - out_value.zw = in_value.xy; - } else { - out_value.w = in_value.x; - } - } else { - out_value = in_value; - } - - write_texel_lpos( - t_out, - out_pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml deleted file mode 100644 index 6e55876cb28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_packed_dim_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_packed_dim_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glslh b/backends/vulkan/runtime/graph/ops/glsl/dequantize.glslh deleted file mode 100644 index 7194bebda35..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glslh +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef DEQUANTIZE_GLSLH -#define DEQUANTIZE_GLSLH - -OUT_T dequantize_val(IN_T qvalue, float scale_val, int zero_point_val) { - return OUT_T(float(int(qvalue) - zero_point_val) * scale_val); -} - -#endif // DEQUANTIZE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl deleted file mode 100644 index 57dc2d53fff..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("buffer")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - ivec4 blockSize; // bW, bH, bC, bN - ivec4 numBlocks; // tW/bW, tH/bH, tC/bC, tN/bN - ivec4 blockStride; // pre-computed linear strides for the block grid - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "int", "out_numel")} -${layout_declare_ubo(B, "ivec4", "t_in_sizes")} -${layout_declare_ubo(B, "ivec4", "t_in_strides")} -${layout_declare_ubo(B, "ivec4", "t_out_sizes")} -${layout_declare_ubo(B, "ivec4", "t_out_strides")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -#include "dequantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); -const lowp ivec4 in_dim_order = unhash_dim_order(in_layout); - -/* - Dequantization Shader (Buffer Storage) - This shader converts n-bit integer tensor values back to floating-point representations - using pre-computed quantization parameters (scale and zero_point). The dequantization - reconstructs the original floating-point values from their discrete integer representations - with minimal precision loss. - - Important Considerations: - (+) All input tensors are assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - (+) The axis map layout is assumed to be a standard layout for scales and zero_points - (++) The scale and zero_point tensors must be implemented as buffers - - Workgroup Configuration: - - dequantize_per_tensor - This mode reverses the uniform quantization applied across the entire tensor by using the - single scale and zero_point values to convert quantized integer values back to their original - floating-point representation. 
- - (*) global_wg_size: default - (*) local_wg_size: default - - - dequantize_per_token - This mode reverses the quantization applied individually to each token (or element) in the - input by using separate scale and zero_point values for each token. For a tensor of shape - [B, S, H], it applies the inverse transformation token-wise across the B*S tokens, converting - quantized values back to their original floating-point representation for each group of H - elements independently. - - (*) global_wg_size: default - (*) local_wg_size: default - - - dequantize_per_channel - This mode reverses the quantization applied separately to each channel of the input tensor - by using distinct scale and zero_point values for each channel. For a tensor of shape - [B, C, H, W] with axis = 1, it applies the inverse transformation channel-wise across the C - channels, converting quantized values back to their original floating-point representation - independently for each channel. - - (*) global_wg_size: default - (*) local_wg_size: default - - - dequantize_block_wise - This mode reverses the block-wise quantization applied to groups of elements by using separate - scale and zero_point values for each block. Equivalent to dequantize_affine, it applies the - inverse affine transformation per block to convert quantized values back to their original - floating-point representation. For example, if the tensor shape is [6, 9, 4] and - blockSize = [3, 3, 2], the tensor is divided into 12 blocks, each containing 18 elements, - and dequantization is performed independently on each block. - - (*) global_wg_size: default - (*) local_wg_size: default - - Dequantization Formula: - value = (qvalue - zero_point) * scale -*/ - -#ifdef per_tensor - -void dequantize_per_tensor() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - OUT_T value = dequantize_val(qvalue, float(t_scale[0]), int(t_zero_point[0])); - - t_out[out_bufi] = value; -} - -#elif defined(per_token) - -void dequantize_per_token() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - - int token_idx = 0; - - if (t_out_sizes.w > 1) { - // 4D tensor - token_idx = out_tidx.w * (t_out_sizes.z * t_out_sizes.y) + out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.z > 1) { - // 3D tensor - token_idx = out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.y > 1) { - // 2D tensor - token_idx = out_tidx.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - OUT_T value = dequantize_val(qvalue, float(t_scale[token_idx]), int(t_zero_point[token_idx])); - - t_out[out_bufi] = value; -} - -#elif defined(per_channel) - -void dequantize_per_channel() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - - // Calculate channel index based on the dequantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate 
system: - // axis 0 -> W dimension (tidx.x) - // axis 1 -> H dimension (tidx.y) - // axis 2 -> C dimension (tidx.z) - // axis 3 -> N dimension (tidx.w) - int channel_idx = 0; - - if (axis == 0) { - channel_idx = out_tidx.x; - } else if (axis == 1) { - channel_idx = out_tidx.y; - } else if (axis == 2) { - channel_idx = out_tidx.z; - } else if (axis == 3) { - channel_idx = out_tidx.w; - } - - channel_idx = min(channel_idx, num_channels - 1); - - OUT_T value = dequantize_val(qvalue, float(t_scale[channel_idx]), int(t_zero_point[channel_idx])); - - t_out[out_bufi] = value; -} - -#else // block_wise - -void dequantize_block_wise() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - - const ivec4 bcoord = out_tidx / blockSize; - - const int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - const OUT_T value = dequantize_val(qvalue, float(t_scale[block_id]), int(t_zero_point[block_id])); - - t_out[out_bufi] = value; -} - -#endif - -void main() { - dequantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml deleted file mode 100644 index a4375038a75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml +++ /dev/null @@ -1,31 +0,0 @@ -dequantize_buffer: - parameter_names_with_default_values: - IN_DTYPE: int32 - OUT_DTYPE: float - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: uint8 - - VALUE: int8 - - VALUE: int32 - OUT_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: dequantize_per_tensor_buffer - MODE: per_tensor - - NAME: dequantize_per_token_buffer - MODE: per_token - - NAME: dequantize_per_channel_buffer - MODE: per_channel - - NAME: dequantize_block_wise_buffer - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl deleted file mode 100644 index 19276cd8f7f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define IVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} - -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define FVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("texture3d")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - ivec4 blockSize; // bW, bH, bC, bN - ivec4 numBlocks; // tW/bW, tH/bH, tC/bC, tN/bN - ivec4 blockStride; // pre-computed linear strides for the block grid - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "ivec3", "t_in_limits")} -${layout_declare_ubo(B, "ivec3", "t_out_limits")} - -#include "indexing_utils.h" -#include "dequantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * DEQUANTIZATION SHADER (TEXTURE STORAGE) - * - * This shader converts n-bit integer tensor values back to floating-point representations - * using pre-computed quantization parameters (scale and zero_point). The dequantization - * reconstructs the original floating-point values from their discrete integer representations - * with minimal precision loss. - * - * ALGORITHM: - * 1. Load quantized integer texel (4 values) from 3D texture - * 2. Apply dequantization formula to each component: value = (qvalue - zero_point) * scale - * 3. 
Store reconstructed floating-point texel to output texture - * - * WORKGROUP CONFIGURATION: - * - Per-Tensor Mode: - * - Global WG Size: {W, H, C/4} for input size (W, H, C) with width-packing - * - Local WG Size: Default (typically {8, 8, 1} or based on global WG size) - * - Per-Token Mode: - * - Global WG Size: {W, H, C/4} for input size (W, H, C) with width-packing - * - Local WG Size: Default (typically {8, 8, 1} or based on global WG size) - * - * SUPPORTED CONFIGURATIONS: - * - Texture Storage: Uses 3D texture indexing with texel-based processing - * - Assumes width-packed layout (packed_dim = 0) for input/output textures - * - Handles texel padding for non-multiple-of-4 tensor dimensions - * - For per-token mode: scale/zero_point tensors must use buffer storage - * - Input/output textures: Must use standard axis mapping for per-token mode - * - * DEQUANTIZATION FORMULA VISUALIZATION: - * For integer range [quant_min, quant_max] mapped back to [min_val, max_val]: - * - * Integer Domain: Floating Point Domain: - * quant_min ──────────────► min_val - * │ │ - * │ scale = (max_val - min_val) / (quant_max - quant_min) - * │ zero_point = quant_min - round(min_val / scale) - * │ │ - * quant_max ──────────────► max_val - * - * Texel Dequantization Process: - * Input Texel: [-103, -128, -123, -96] (int4) - * Per-component dequantization with scale=0.1, zero_point=-128: - * Component 0: (-103 - (-128)) * 0.1 = 25 * 0.1 = 2.5 - * Component 1: (-128 - (-128)) * 0.1 = 0 * 0.1 = 0.0 - * Component 2: (-123 - (-128)) * 0.1 = 5 * 0.1 = 0.5 - * Component 3: (-96 - (-128)) * 0.1 = 32 * 0.1 = 3.2 - * Output Texel: [2.5, 0.0, 0.5, 3.2] (float4) - * - * PER-TENSOR DEQUANTIZATION: - * - Single scale and zero_point values for entire tensor - * - All texel components use same dequantization parameters - * - Parameters passed as push constants for efficiency - * - Each thread processes one texel (4 elements) independently - * - Formula: value[i] = (qvalue[i] - zero_point) * scale - * - * PER-TOKEN DEQUANTIZATION: - * - Separate scale and zero_point for each token - * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements) - * - Parameters stored in buffer arrays indexed by token_id - * - Each thread calculates token_id from its 3D texture position - * - Scale/zero_point buffers accessed directly (not as textures) - * - Formula: value[i] = (qvalue[i] - zero_point[token_id]) * scale[token_id] - * - * Token ID calculation for texel at position (x, y, z): - * - 3D tensor: token_id = z * texture_height + y - * - 2D tensor: token_id = y - * - 1D tensor: token_id = 0 - */ - -#ifdef per_tensor - -void dequantize_per_tensor() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Skip if out of bounds - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - IVEC4_T intex = load_texel(t_in, pos); - FVEC4_T outtex; - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, float(t_scale[0]), int(t_zero_point[0])); - - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - write_texel(t_out, pos, outtex); -} - -#elif defined(per_token) - -void dequantize_per_token() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - IVEC4_T intex = load_texel(t_in, pos); - - int token_idx = 0; - ivec3 dims = t_in_limits; - - if (dims.z > 1) { - // 3D tensor - token_idx = pos.z * dims.y + pos.y; - } else if (dims.y > 1) { - // 2D 
tensor - token_idx = pos.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - // Scale and zero_point are prepacked as buffers, so direct access - float scale_val = float(t_scale[token_idx]); - int zero_point_val = int(t_zero_point[token_idx]); - - FVEC4_T outtex; - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - - write_texel(t_out, pos, outtex); -} - -#elif defined(per_channel) - -void dequantize_per_channel() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - IVEC4_T intex = load_texel(t_in, pos); - FVEC4_T outtex; - - // Calculate channel index based on the dequantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate system: - // axis 0 -> W dimension (pos.x) - // axis 1 -> H dimension (pos.y) - // axis 2 -> C dimension (pos.z) - // axis 3 -> N dimension (batch folding in texture storage) - - if (axis == 0) { - // Width dimension - each texel component has different channel index - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - int channel_idx = pos.x * 4 + i; - channel_idx = min(channel_idx, num_channels - 1); - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } else if (axis == 1) { - int channel_idx = pos.y; - channel_idx = min(channel_idx, num_channels - 1); - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } else if (axis == 2) { - // Channel dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual channel index from the folded dimension - int folded_idx = pos.z; - int channel_idx = folded_idx % num_channels; - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } else if (axis == 3) { - // Batch dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual channel index from the folded dimension - int folded_idx = pos.z; - // In this case num_channels actually corresponds to the number of channels - // the C dimension N(C)HW - int channel_idx = folded_idx / num_channels; - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } - - 
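  // To make the folded Z handling above concrete: for a width-packed NCHW
  // tensor with N = 2 and num_channels = 3, pos.z enumerates the 6 folded
  // (batch, channel) slices, so axis == 2 recovers the channel as
  // pos.z % num_channels while axis == 3 recovers the batch as
  // pos.z / num_channels.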
write_texel(t_out, pos, outtex); -} - -#else // block_wise - -void dequantize_block_wise() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) - return; - - IVEC4_T intex = load_texel(t_in, pos); - FVEC4_T outtex; - - ivec4 base_tidx = ivec4(pos.x * 4, pos.y, pos.z, 0); - int foldedZ = pos.z; - - int C_total = numBlocks.z * blockSize.z; - - [[unroll]] for (int i = 0; i < 4; ++i) { - ivec4 tidx = ivec4(base_tidx.x + i, base_tidx.y, (foldedZ % C_total), (foldedZ / C_total)); - - ivec4 bcoord = tidx / blockSize; - int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, float(t_scale[block_id]), int(t_zero_point[block_id])); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - - write_texel(t_out, pos, outtex); -} - -#endif - -void main() { - dequantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml deleted file mode 100644 index 7a58e9410d3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml +++ /dev/null @@ -1,31 +0,0 @@ -dequantize_texture: - parameter_names_with_default_values: - IN_DTYPE: int32 - OUT_DTYPE: float - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: uint8 - - VALUE: int8 - - VALUE: int32 - OUT_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: dequantize_per_tensor_texture3d - MODE: per_tensor - - NAME: dequantize_per_token_texture3d - MODE: per_token - - NAME: dequantize_per_channel_texture3d - MODE: per_channel - - NAME: dequantize_block_wise_texture3d - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl deleted file mode 100644 index 73a444cd84d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} -${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "sizes")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "weight_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 weight_axis_map = unhash_axis_map(weight_layout); - -void main() { - const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); - const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim); - if (any(greaterThanEqual(out_tidx, sizes))) { - return; - } - VEC4_T out_texel; - - // Consider optimizing via W-packing format for t_in and t_weight. - for (int i = 0; i < 4; ++i) { - // Read input tensor for embedding index. - const ivec3 in_lpos = ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4); - const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4]; - - // Read weight tensor for embedding, it is height-packed. - const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem / 4, 0); - out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map)[in_texel_elem % 4]; - } - - write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml b/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml deleted file mode 100644 index 0e7b491c433..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -embedding: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: embedding diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl deleted file mode 100644 index ce433040b66..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")} - -${layout_declare_ubo(B, "BufferMetadata", "outp")} -${layout_declare_ubo(B, "BufferMetadata", "inp")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const uint outp_bufi = gl_GlobalInvocationID.x; - if (outp_bufi >= numel(outp)) { - return; - } - - TensorIndex outp_tidx; - linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx); - - // Map output tensor index to input tensor index by taking modulo - // with input tensor sizes for each dimension - TensorIndex inp_tidx = outp_tidx; - for (int d = 0; d < ndim(inp); ++d) { - uint inp_size = size_at(inp, d); - uint outp_idx = idx_at(outp_tidx, d); - inp_tidx.data[div_4(d)][mod_4(d)] = outp_idx % inp_size; - } - - const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); - // Copy data from input to output - t_outp[outp_bufi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml deleted file mode 100644 index 6d90e1fa8b1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml +++ /dev/null @@ -1,10 +0,0 @@ -expand_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: expand_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.glsl deleted file mode 100644 index 8509fdf1f49..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.glsl +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define T ${buffer_scalar_type(DTYPE)} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -// Flash Attention inputs: Query, Key, Value tensors -${layout_declare_tensor(B, "rw", "t_O", DTYPE, "buffer")} -${layout_declare_tensor(B, "rw", "t_l", "float", "buffer")} -${layout_declare_tensor(B, "rw", "t_m", "float", "buffer")} -${layout_declare_tensor(B, "r", "t_Q", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_K", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_V", DTYPE, "buffer")} - -${layout_declare_ubo(B, "ivec4", "Q_sizes")} // [B, H, N, D] -${layout_declare_ubo(B, "ivec4", "K_sizes")} -${layout_declare_ubo(B, "ivec4", "V_sizes")} -${layout_declare_ubo(B, "ivec4", "O_sizes")} - -${layout_declare_ubo(B, "ivec3", "l_sizes")} // [B, H, N] -${layout_declare_ubo(B, "ivec3", "m_sizes")} // [B, H, N] - -${layout_declare_ubo(B, "float", "scale")} -${layout_declare_ubo(B, "int", "block_size_r")} // Br (num rows in Q block) -${layout_declare_ubo(B, "int", "block_size_c")} // Bc (num cols in K/V block) -${layout_declare_ubo(B, "int", "input_pos")} // Starting position for causal masking -${layout_declare_ubo(B, "int", "num_heads")} // Number of query heads -${layout_declare_ubo(B, "int", "num_kv_heads")} // Number of key/value heads -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Maximum block sizes to prevent array overflow -#define MAX_BR 64 -#define MAX_BC 128 - -void main() { - // Each thread processes one row block - const int thread_id = int(gl_GlobalInvocationID.x); - - // Tensor dimensions: Q_sizes = [D, H, N, B] from graph.sizes_ubo() - // The UBO layout is different from the PyTorch tensor layout - const int head_dim = Q_sizes.x; // D (head dim) - const int num_heads = Q_sizes.y; // H (num heads) - const int seq_len = Q_sizes.z; // N (sequence length) - const int batch_size = Q_sizes.w; // B (batch) - - // Block sizes - const int Br = block_size_r; - const int Bc = block_size_c; - - const int Tr = (seq_len + Br - 1) / Br; // Number of row blocks - const int total_row_blocks = batch_size * num_heads * Tr; - - if (thread_id >= total_row_blocks) { - return; - } - - // Decode thread_id to (batch, head, row_block) - const int batch = thread_id / (num_heads * Tr); - const int remaining = thread_id % (num_heads * Tr); - const int head = remaining / Tr; - const int row_block = remaining % Tr; - - // Calculate row range for this block - const int row_start = row_block * Br; - const int row_end = min(row_start + Br, seq_len); - const int actual_Br = row_end - row_start; - - // Base indices for this batch - const int q_base = batch * (seq_len * num_heads * head_dim); - const int k_base = batch * (seq_len * num_heads * head_dim); - const int v_base = batch * (seq_len * num_heads * head_dim); - const int o_base = batch * (seq_len * num_heads * head_dim); - const int lm_base = batch * (seq_len * num_heads); - - // STEP 2: Initialize O = 0, l = 0, m = -inf for this row block - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - const int lm_idx = lm_base + head * seq_len + seq_pos; - - t_l[lm_idx] = 0.0; - t_m[lm_idx] = -1.0 / 0.0; // -infinity - - for (int dim = 0; dim < head_dim; dim++) { - const int o_idx = o_base + seq_pos * (num_heads * head_dim) + head * head_dim + dim; - t_O[o_idx] = T(0.0); - } - } - - // STEP 5: Outer loop over column blocks (For K, V tensors) - const int Tc = (seq_len + Bc - 1) / Bc; // Number of 
column blocks - for (int j = 0; j < Tc; j++) { - const int col_start = j * Bc; - const int col_end = min(col_start + Bc, seq_len); - const int actual_Bc = col_end - col_start; - - // STEP 6-8 done implicitly below - - // Load current statistics for all rows in this block - float m_i[MAX_BR]; - float l_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - const int lm_idx = lm_base + head * seq_len + seq_pos; - m_i[r] = t_m[lm_idx]; - l_i[r] = t_l[lm_idx]; - } - - // STEP 9: Compute Sij = Qi * Kj^T - T S_block[MAX_BR][MAX_BC]; // Use MAX_BR and MAX_BC constants - float m_tilde_ij[MAX_BR]; // Row maxes (float to match l/m) - float l_tilde_ij[MAX_BR]; // Row sums (float to match l/m) - - // Initialize row statistics - for (int r = 0; r < actual_Br; r++) { - m_tilde_ij[r] = -1.0 / 0.0; // -infinity - l_tilde_ij[r] = 0.0; - } - - // Compute attention scores Sij = Qi @ Kj^T - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - for (int c = 0; c < actual_Bc; c++) { - const int global_col = col_start + c; - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads; - - // Dot product: Q[seq_pos, :] · K[col_pos, :] - T score = T(0.0); - for (int dim = 0; dim < head_dim; dim++) { - const int q_idx = q_base + global_row * (num_heads * head_dim) + head * head_dim + dim; - const int k_idx = k_base + global_col * (num_kv_heads * head_dim) + kv_head * head_dim + dim; - score += t_Q[q_idx] * t_K[k_idx]; - } - score *= scale; - - - // Apply causal masking: mask if global_col > global_row + input_pos - if (global_col > global_row + input_pos) { - score = T(-1.0 / 0.0); // Set to negative infinity - } - - S_block[r][c] = score; - - // Track row maximum (after masking) - m_tilde_ij[r] = max(m_tilde_ij[r], float(score)); - } - } - - // STEP 10: Compute P'ij = exp(Sij − m'ij) and l'ij = rowsum(P'ij) - for (int r = 0; r < actual_Br; r++) { - // Handle the case where all scores are -inf (fully masked row) - if (isinf(m_tilde_ij[r]) && m_tilde_ij[r] < 0.0) { - // All scores are -inf, so all probabilities are 0 - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = T(0.0); - } - l_tilde_ij[r] = 0.0; - } else { - // Normal case: compute softmax - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = exp(S_block[r][c] - T(m_tilde_ij[r])); - l_tilde_ij[r] += float(S_block[r][c]); - } - } - } - - // STEP 11: Softmax update - float m_new_i[MAX_BR]; - float l_new_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - m_new_i[r] = max(m_i[r], m_tilde_ij[r]); - - l_new_i[r] = exp(m_i[r] - m_new_i[r]) * l_i[r] + exp(m_tilde_ij[r] - m_new_i[r]) * l_tilde_ij[r]; - } - - // STEP 12: Update Oi - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - float alpha = exp(m_i[r] - m_new_i[r]); - float beta = exp(m_tilde_ij[r] - m_new_i[r]); - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads; - - for (int dim = 0; dim < head_dim; dim++) { - const int o_idx = o_base + global_row * (num_heads * head_dim) + head * head_dim + dim; - - // Compute P'ij @ Vj for this dimension - T pv_sum = T(0.0); - for (int c = 0; c < actual_Bc; c++) { - const int global_col = col_start + c; - const int v_idx = v_base + global_col * (num_kv_heads * head_dim) + kv_head * head_dim + dim; - pv_sum += S_block[r][c] * t_V[v_idx]; - } - - // Check for division by zero before updating output - if (l_new_i[r] <= 0.0) { - t_O[o_idx] = T(0.0); // Set to 
zero to avoid NaN - } else { - // Oi = (alpha * l_i * Oi + beta * P'ij @ Vj) / l_new_i - t_O[o_idx] = (T(alpha) * T(l_i[r]) * t_O[o_idx] + T(beta) * pv_sum) / T(l_new_i[r]); - } - } - } - - // STEP 13: Update li, mi - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - const int lm_idx = lm_base + head * seq_len + seq_pos; - t_l[lm_idx] = l_new_i[r]; - t_m[lm_idx] = m_new_i[r]; - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.yaml deleted file mode 100644 index 795ab906caa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -flash_attention_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: flash_attention_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.glsl deleted file mode 100644 index 1f72a583410..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.glsl +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define T ${buffer_scalar_type(DTYPE)} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -// Flash Attention inputs: Query, Key, Value tensors using texture storage -${layout_declare_tensor(B, "rw", "t_O", DTYPE, "texture3d")} -${layout_declare_tensor(B, "rw", "t_l", "float", "texture3d")} -${layout_declare_tensor(B, "rw", "t_m", "float", "texture3d")} -${layout_declare_tensor(B, "r", "t_Q", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_K", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_V", DTYPE, "texture3d")} - -${layout_declare_ubo(B, "ivec4", "Q_sizes")} // [B, H, N, D] -${layout_declare_ubo(B, "ivec4", "K_sizes")} -${layout_declare_ubo(B, "ivec4", "V_sizes")} -${layout_declare_ubo(B, "ivec4", "O_sizes")} - -${layout_declare_ubo(B, "ivec3", "l_sizes")} // [B, H, N] -${layout_declare_ubo(B, "ivec3", "m_sizes")} // [B, H, N] - -${layout_declare_ubo(B, "float", "scale")} -${layout_declare_ubo(B, "int", "block_size_r")} // Br (num rows in Q block) -${layout_declare_ubo(B, "int", "block_size_c")} // Bc (num cols in K/V block) -${layout_declare_ubo(B, "int", "input_pos")} // Starting position for causal masking -${layout_declare_ubo(B, "int", "num_heads")} // Number of query heads -${layout_declare_ubo(B, "int", "num_kv_heads")} // Number of key/value heads - -// Axis mapping setup for proper texture indexing -${layout_declare_spec_const(C, "int", "Q_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 Q_axis_map = unhash_axis_map(Q_layout); -const lowp int Q_packed_dim = unhash_packed_dim(Q_layout); - -${layout_declare_spec_const(C, "int", "K_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 K_axis_map = unhash_axis_map(K_layout); -const lowp int K_packed_dim = unhash_packed_dim(K_layout); - -${layout_declare_spec_const(C, 
"int", "V_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 V_axis_map = unhash_axis_map(V_layout); -const lowp int V_packed_dim = unhash_packed_dim(V_layout); - -${layout_declare_spec_const(C, "int", "O_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 O_axis_map = unhash_axis_map(O_layout); -const lowp int O_packed_dim = unhash_packed_dim(O_layout); - -${layout_declare_spec_const(C, "int", "l_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 l_axis_map = unhash_axis_map(l_layout); -const lowp int l_packed_dim = unhash_packed_dim(l_layout); - -${layout_declare_spec_const(C, "int", "m_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 m_axis_map = unhash_axis_map(m_layout); -const lowp int m_packed_dim = unhash_packed_dim(m_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Maximum block sizes to prevent array overflow -#define MAX_BR 64 -#define MAX_BC 128 - -// Texture access helper functions using proper axis mapping -// Q_sizes, K_sizes, V_sizes, O_sizes are [D, H, N, B] (UBO layout) -// l_sizes, m_sizes are [B, H, N] (UBO layout) -T load_tensor_Q(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, Q_sizes, Q_axis_map, Q_packed_dim); - int component = tidx[Q_packed_dim] % 4; - vec4 texel = texelFetch(t_Q, pos, 0); - return T(texel[component]); -} - -T load_tensor_K(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, K_sizes, K_axis_map, K_packed_dim); - int component = tidx[K_packed_dim] % 4; - vec4 texel = texelFetch(t_K, pos, 0); - return T(texel[component]); -} - -T load_tensor_V(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, V_sizes, V_axis_map, V_packed_dim); - int component = tidx[V_packed_dim] % 4; - vec4 texel = texelFetch(t_V, pos, 0); - return T(texel[component]); -} - -T load_tensor_O(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, O_sizes, O_axis_map, O_packed_dim); - int component = tidx[O_packed_dim] % 4; - vec4 texel = imageLoad(t_O, pos); - return T(texel[component]); -} - -void store_tensor_O(int batch, int seq_pos, int head, int dim, T value) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, O_sizes, O_axis_map, O_packed_dim); - int component = tidx[O_packed_dim] % 4; - vec4 texel = imageLoad(t_O, pos); - texel[component] = float(value); - imageStore(t_O, pos, texel); -} - -float load_tensor_l(int batch, int head, int seq_pos) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(l_sizes, 1), l_axis_map, l_packed_dim); - int component = tidx[l_packed_dim] % 4; - vec4 texel = imageLoad(t_l, pos); - return texel[component]; -} - -void store_tensor_l(int batch, int head, int seq_pos, float value) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(l_sizes, 1), l_axis_map, l_packed_dim); - int component = tidx[l_packed_dim] % 4; - vec4 texel = imageLoad(t_l, pos); - texel[component] = value; - imageStore(t_l, pos, texel); -} - -float load_tensor_m(int batch, int head, int seq_pos) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // 
Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(m_sizes, 1), m_axis_map, m_packed_dim); - int component = tidx[m_packed_dim] % 4; - vec4 texel = imageLoad(t_m, pos); - return texel[component]; -} - -void store_tensor_m(int batch, int head, int seq_pos, float value) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(m_sizes, 1), m_axis_map, m_packed_dim); - int component = tidx[m_packed_dim] % 4; - vec4 texel = imageLoad(t_m, pos); - texel[component] = value; - imageStore(t_m, pos, texel); - -} - -void main() { - // Each thread processes one row block - same as buffer version - const int thread_id = int(gl_GlobalInvocationID.x); - - // Tensor dimensions: Q_sizes = [D, H, N, B] - const int head_dim = Q_sizes.x; // D (head dim) - const int num_heads_val = Q_sizes.y; // H (num heads) - const int seq_len = Q_sizes.z; // N (sequence length) - const int batch_size = Q_sizes.w; // B (batch) - - // Block sizes - const int Br = block_size_r; - const int Bc = block_size_c; - - const int Tr = (seq_len + Br - 1) / Br; // Number of row blocks - const int total_row_blocks = batch_size * num_heads_val * Tr; - - if (thread_id >= total_row_blocks) { - return; - } - - // Decode thread_id to (batch, head, row_block) - const int batch = thread_id / (num_heads_val * Tr); - const int remaining = thread_id % (num_heads_val * Tr); - const int head = remaining / Tr; - const int row_block = remaining % Tr; - - // Calculate row range for this block - const int row_start = row_block * Br; - const int row_end = min(row_start + Br, seq_len); - const int actual_Br = row_end - row_start; - - // STEP 1: Initialize only this thread's row block - // Each thread initializes its own rows to avoid cross-workgroup synchronization issues - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - - // Initialize l and m textures for this row block's positions - ivec4 l_tidx = ivec4(batch, head, seq_pos, 0); - ivec3 l_pos = tidx_to_pos(l_tidx, ivec4(l_sizes, 1), l_axis_map, l_packed_dim); - vec4 l_texel = vec4(0.0); - imageStore(t_l, l_pos, l_texel); - - ivec4 m_tidx = ivec4(batch, head, seq_pos, 0); - ivec3 m_pos = tidx_to_pos(m_tidx, ivec4(m_sizes, 1), m_axis_map, m_packed_dim); - vec4 m_texel = vec4(-1e10); - imageStore(t_m, m_pos, m_texel); - - // Initialize output tensor for this row block - for (int dim = 0; dim < head_dim; dim++) { - store_tensor_O(batch, seq_pos, head, dim, T(0.0)); - } - } - - // STEP 5: Outer loop over column blocks (For K, V tensors) - const int Tc = (seq_len + Bc - 1) / Bc; // Number of column blocks - for (int j = 0; j < Tc; j++) { - const int col_start = j * Bc; - const int col_end = min(col_start + Bc, seq_len); - const int actual_Bc = col_end - col_start; - - // Load current statistics for all rows in this block - float m_i[MAX_BR]; - float l_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - m_i[r] = load_tensor_m(batch, head, seq_pos); - l_i[r] = load_tensor_l(batch, head, seq_pos); - } - - // STEP 9: Compute Sij = Qi * Kj^T - T S_block[MAX_BR][MAX_BC]; - float m_tilde_ij[MAX_BR]; // Row maxes - float l_tilde_ij[MAX_BR]; // Row sums - - // Initialize row statistics - for (int r = 0; r < actual_Br; r++) { - m_tilde_ij[r] = -1.0 / 0.0; // -infinity - l_tilde_ij[r] = 0.0; - } - - // Compute attention scores Sij = Qi @ Kj^T - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - for (int c = 0; c < actual_Bc; c++) { 
- const int global_col = col_start + c; - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads_val; - - // Dot product: Q[seq_pos, :] · K[col_pos, :] - T score = T(0.0); - for (int dim = 0; dim < head_dim; dim++) { - T q_val = load_tensor_Q(batch, global_row, head, dim); - T k_val = load_tensor_K(batch, global_col, kv_head, dim); - score += q_val * k_val; - } - score *= scale; - - - // Apply causal masking: mask if global_col > global_row + input_pos - bool masked = (global_col > global_row + input_pos); - if (masked) { - score = T(-1.0 / 0.0); // Set to negative infinity - } - - S_block[r][c] = score; - - - // Track row maximum (after masking) - m_tilde_ij[r] = max(m_tilde_ij[r], float(score)); - } - } - - // STEP 10: Compute P'ij = exp(Sij − m'ij) and l'ij = rowsum(P'ij) - for (int r = 0; r < actual_Br; r++) { - // Handle the case where all scores are -inf (fully masked row) - if (isinf(m_tilde_ij[r]) && m_tilde_ij[r] < 0.0) { - // All scores are -inf, so all probabilities are 0 - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = 0.0; - } - l_tilde_ij[r] = 0.0; - } else { - // Normal case: compute softmax - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = exp(S_block[r][c] - T(m_tilde_ij[r])); - l_tilde_ij[r] += float(S_block[r][c]); - } - } - } - - // STEP 11: Softmax update - float m_new_i[MAX_BR]; - float l_new_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - m_new_i[r] = max(m_i[r], m_tilde_ij[r]); - l_new_i[r] = exp(m_i[r] - m_new_i[r]) * l_i[r] + exp(m_tilde_ij[r] - m_new_i[r]) * l_tilde_ij[r]; - - } - - // STEP 12: Update Oi - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - float alpha = exp(m_i[r] - m_new_i[r]); - float beta = exp(m_tilde_ij[r] - m_new_i[r]); - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads_val; - - for (int dim = 0; dim < head_dim; dim++) { - // Compute P'ij @ Vj for this dimension - T pv_sum = T(0.0); - for (int c = 0; c < actual_Bc; c++) { - const int global_col = col_start + c; - T v_val = load_tensor_V(batch, global_col, kv_head, dim); - pv_sum += S_block[r][c] * v_val; - } - - // Check for division by zero before updating output - if (l_new_i[r] <= 0.0) { - store_tensor_O(batch, global_row, head, dim, T(0.0)); - } else { - // Oi = (alpha * l_i * Oi + beta * P'ij @ Vj) / l_new_i - T current_o = load_tensor_O(batch, global_row, head, dim); - T new_o = (T(alpha) * T(l_i[r]) * current_o + T(beta) * pv_sum) / T(l_new_i[r]); - store_tensor_O(batch, global_row, head, dim, new_o); - - } - } - } - - // STEP 13: Update li, mi - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - store_tensor_l(batch, head, seq_pos, l_new_i[r]); - store_tensor_m(batch, head, seq_pos, m_new_i[r]); - } - - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.yaml deleted file mode 100644 index 909b8bfd3a9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
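
The buffer and texture3d flash_attention shaders above implement the same block-wise online-softmax update; they differ only in addressing (direct buffer indices versus `tidx_to_pos`/`texelFetch` texture access). As a reading aid, here is an illustrative NumPy sketch of that per-row-block update (the STEP 9–13 comments in the shaders). It is not part of the deleted sources: it assumes a single head with no query-to-KV head mapping, and `NEG_INF` stands in for the true `-inf` plus the explicit guards the shaders use.

```python
# Hypothetical reference for the online-softmax row-block update sketched in
# the shaders above; single head, plain NumPy arrays.
import numpy as np

NEG_INF = -1.0e30  # the shaders use a true -inf plus explicit guards


def flash_attention_row_block(Q_block, K, V, Bc, scale, row_start=0, input_pos=0):
    """Q_block: [Br, D] query rows; K, V: [N, D]. Returns O_block: [Br, D]."""
    Br, D = Q_block.shape
    N = K.shape[0]
    O = np.zeros((Br, D))
    l_i = np.zeros(Br)            # running row sums  (t_l in the shaders)
    m_i = np.full(Br, NEG_INF)    # running row maxes (t_m in the shaders)

    for col_start in range(0, N, Bc):            # STEP 5: loop over K/V column blocks
        Kj = K[col_start:col_start + Bc]
        Vj = V[col_start:col_start + Bc]

        S = (Q_block @ Kj.T) * scale             # STEP 9: Sij = Qi @ Kj^T
        rows = row_start + np.arange(Br)[:, None]
        cols = col_start + np.arange(Kj.shape[0])[None, :]
        S = np.where(cols > rows + input_pos, NEG_INF, S)   # causal mask

        m_tilde = S.max(axis=1)                              # block row maxes
        fully_masked = m_tilde <= NEG_INF / 2
        P = np.where(fully_masked[:, None], 0.0,
                     np.exp(S - m_tilde[:, None]))           # STEP 10: P'ij
        l_tilde = P.sum(axis=1)

        m_new = np.maximum(m_i, m_tilde)                     # STEP 11: merge statistics
        alpha = np.exp(m_i - m_new)
        beta = np.exp(m_tilde - m_new)
        l_new = alpha * l_i + beta * l_tilde

        safe_l = np.where(l_new > 0.0, l_new, 1.0)           # STEP 12: rescale + accumulate
        O = (alpha[:, None] * l_i[:, None] * O + beta[:, None] * (P @ Vj)) / safe_l[:, None]
        O = np.where(l_new[:, None] > 0.0, O, 0.0)           # fully-masked rows stay zero

        l_i, m_i = l_new, m_new                              # STEP 13: persist l, m
    return O
```

Each GPU thread in the shaders runs the equivalent of one call to this function for one `(batch, head, row_block)` triple, with the running `l`/`m` statistics stored in the `t_l`/`t_m` tensors instead of local variables.
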
- -flash_attention_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: flash_attention_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/flip.glsl b/backends/vulkan/runtime/graph/ops/glsl/flip.glsl deleted file mode 100644 index 2291d1b6e4f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flip.glsl +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "dims")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - VEC4_T out_texel = VEC4_T(0); - uint src_x = pos.x; - uint src_y = pos.y; - uint src_z = pos.z; - - int flattened_channels = int(ceil(out_sizes.z / 4.0)); - - // Width - if (dims.x == 1) { - src_x = out_sizes.x - 1 - pos.x; - } - // Height - if (dims.y == 1) { - src_y = out_sizes.y - 1 - pos.y; - } - // Batch - if (dims.w == 1) { - uint n = pos.z / flattened_channels; - uint src_n = out_sizes.w - 1 - n; - uint c4 = pos.z - n * flattened_channels; - src_z = src_n * flattened_channels + c4; - } - - uint prev_src_z = src_z; - for (int p = 0; p < 4; ++p) { - uint src_p = p; - - // Channel - if (dims.z == 1) { - uint nc = (pos.z / flattened_channels) * flattened_channels; - uint c4 = pos.z - nc; - uint c = c4 * 4 + p; - uint src_c = out_sizes.z - 1 - c; - - src_z = (dims.w == 1) - ? prev_src_z - c4 + src_c / 4 // Batch and Channel - : nc + src_c / 4; // Channel only - src_p = src_c % 4; - } - - VEC4_T in_texel = VEC4_T(texelFetch(t_in, ivec3(src_x, src_y, src_z), 0)); - out_texel[p] = in_texel[src_p]; - } - imageStore(t_out, pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flip.yaml b/backends/vulkan/runtime/graph/ops/glsl/flip.yaml deleted file mode 100644 index f5e7c874773..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flip.yaml +++ /dev/null @@ -1,14 +0,0 @@ -flip: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: flip diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.glsl b/backends/vulkan/runtime/graph/ops/glsl/full.glsl deleted file mode 100644 index 81f1f182cdf..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/full.glsl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define POS ${get_pos[NDIM]("pos")} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - VEC4_T outtex = VEC4_T(fill_value); - const int packed_dim_size = sizes[packed_dim]; - int packed_idx = idx[packed_dim]; - - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, POS, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml deleted file mode 100644 index eff78a7938d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -full: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: full diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl deleted file mode 100644 index 93a2c53e013..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl +++ /dev/null @@ -1,38 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(1, "ivec4", "in_sizes")} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "int", "stride", "float", "offset")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - int width = in_sizes.x; - VEC4_T outtex; - if (pos.x == 0) { - float value = (pos.y % width + offset) * stride; - outtex = VEC4_T(value, 0, 0, 0); - } else if (pos.x == 1) { - float value = (pos.y / width + offset) * stride; - outtex = VEC4_T(value, 0, 0, 0); - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml deleted file mode 100644 index 654edca6108..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml +++ /dev/null @@ -1,12 +0,0 @@ -grid_priors: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: grid_priors diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl deleted file mode 100644 index 70fdf2bae17..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_mean", DTYPE, "buffer")} -${layout_declare_tensor(B, "w", "t_rstd", DTYPE, "buffer")} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} - -${layout_declare_ubo(B, "ivec4", "mean_strides")} -${layout_declare_ubo(B, "int", "mean_numel")} -${layout_declare_ubo(B, "ivec3", "in_limits")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -layout(push_constant) uniform PRECISION restrict Block { - int group; - float epsilon; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "mean_layout", "DEFAULT_DIM_ORDER")} -const lowp ivec4 mean_dim_order = unhash_dim_order(mean_layout); - -#define LOCAL_WORK_GROUP_SIZE 64 -shared float shared_sum[LOCAL_WORK_GROUP_SIZE]; -shared float shared_sum_sq[LOCAL_WORK_GROUP_SIZE]; - -/* - * Computes the mean and standard deviation of one group of channels of the - * input tensor for the group normalization operator. - * - * Given a tensor of shape [W, H, C, N] the mean and standard deviation tensors - * will have a shape of [G, N] where G = C / group. - * - * The input tensor is assumed to be a channels-packed texture tensor with the - * standard axis mapping. The output tensors are assumed to be contiguous buffer - * tensors. - * - * Algorithm: - * 1. Each shader invocation corresponds to one group in one batch - * 2. The local work group cooperatively reduces over all spatial locations (H×W) - * and all channels within the group (C/group channels) - * 3. Uses shared memory for efficient parallel reduction - * 4. Main thread (local ID 0) writes the final mean and rstd to buffer - * - * Global work group size: {N, 1, 1} - * N is the number of elements in the tensor buffer; each thread computes one - * output element. - * - * Local work group size: {1, float, 1} - * float should be a power of 2, recommended 64 or 128 threads. This allows - * efficient tree-based reduction in shared memory. Each local group will - * cooperate to compute the output element. - * - * Each shader invocation will compute the mean and standard deviation for one - * channel group in the input, and write out the corresponding result. 
- */ -void group_norm_reduce_C_packed() { - const int global_idx = int(gl_GlobalInvocationID.x); - const int local_idx = int(gl_LocalInvocationID.y); - - // Calculate group dimensions - const int D = in_sizes.z / group; // channels per group - const int HxW = in_sizes.y * in_sizes.x; // spatial size - const int group_size = D * HxW; // total elements per group - - // Convert global index to (group_idx, batch_idx) - const ivec4 mean_tidx = bufi_to_tidx(global_idx, mean_strides, mean_dim_order); - - // Initialize local sums - float local_sum = 0.0; - float local_sum_sq = 0.0; - int local_count = 0; - - // Calculate the range of channels for this group - const int group_start_channel = mean_tidx.x * D; - const int group_end_channel = group_start_channel + D; - - // Calculate the range of texels that contain channels from this group - const int start_texel_idx = group_start_channel / 4; - const int end_texel_idx = divup4(group_end_channel); - const int texels_in_group = end_texel_idx - start_texel_idx; - - // Total texels to process across all spatial locations - const int total_texels = texels_in_group * HxW; - - // Each thread processes a subset of texels - const int texels_per_thread = (total_texels + LOCAL_WORK_GROUP_SIZE - 1) / LOCAL_WORK_GROUP_SIZE; - const int start_texel = local_idx * texels_per_thread; - const int end_texel = min(start_texel + texels_per_thread, total_texels); - - // Process assigned texels - for (int texel_idx = start_texel; texel_idx < end_texel; texel_idx++) { - // Convert texel index to spatial and channel coordinates - const int spatial_idx = texel_idx / texels_in_group; - const int texel_in_group = texel_idx % texels_in_group; - - // Convert to spatial coordinates - const int w = spatial_idx % in_sizes.x; - const int h = spatial_idx / in_sizes.x; - - // Calculate the global texel index - const int global_texel_idx = start_texel_idx + texel_in_group; - - // Convert to texture position using default axis mapping - ivec3 tex_pos = ivec3(w, h, global_texel_idx); - - // Adjust for batch dimension if needed - if (in_sizes.w > 1) { - // default axis mapping means channels is the batch concat dim - tex_pos.z += mean_tidx.y * divup4(in_sizes.z); - } - - // Check bounds and load texel - if (all(lessThan(tex_pos, in_limits))) { - const vec4 texel_val = load_texel(t_in, tex_pos); - - // Process all components of the texel that belong to this group - const int texel_start_channel = global_texel_idx * 4; - for (int comp = 0; comp < 4; comp++) { - const int current_channel = texel_start_channel + comp; - - // Check if this component belongs to the current group - if (current_channel >= group_start_channel && current_channel < group_end_channel) { - const float val = texel_val[comp]; - local_sum += val; - local_sum_sq += val * val; - local_count++; - } - } - } - } - - // Store local results in shared memory - shared_sum[local_idx] = local_sum; - shared_sum_sq[local_idx] = local_sum_sq; - - // Synchronize threads - memoryBarrierShared(); - barrier(); - - // Perform tree-based reduction in shared memory - for (int stride = LOCAL_WORK_GROUP_SIZE / 2; stride > 0; stride /= 2) { - if (local_idx < stride) { - shared_sum[local_idx] += shared_sum[local_idx + stride]; - shared_sum_sq[local_idx] += shared_sum_sq[local_idx + stride]; - } - memoryBarrierShared(); - barrier(); - } - - // Main thread writes the result - if (local_idx == 0 && global_idx < mean_numel) { - const float total_sum = shared_sum[0]; - const float total_sum_sq = shared_sum_sq[0]; - const float count = 
float(group_size); - - // Calculate mean and reciprocal standard deviation - const float mean_val = total_sum / count; - const float variance = (total_sum_sq / count) - (mean_val * mean_val); - const float rstd_val = 1.0 / sqrt(variance + epsilon); - - // Write to buffer-backed tensors - t_mean[global_idx] = BUF_T(mean_val); - t_rstd[global_idx] = BUF_T(rstd_val); - } -} - -void main() { - group_norm_reduce_C_packed(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.yaml deleted file mode 100644 index 00c357a1d6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -group_norm_reduce_texture: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: group_norm_reduce_texture diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.glsl deleted file mode 100644 index 8440481963a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.glsl +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_weight", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_mean", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_rstd", DTYPE, "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec3", "weight_limits")} -${layout_declare_ubo(B, "ivec4", "mean_strides")} - -layout(push_constant) uniform PRECISION restrict Block { - int group; - float epsilon; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * Applies group normalization to t_in, and write the results to t_out. The mean - * and rstd of the input tensor are precomputed and passed in as t_mean and - * t_rstd. - * - * Given an input tensor t_in of shape [N, C, H, W], the mean and rstd will have - * shape [N, C / ngroup], and the output will have the same shape as t_in. The - * weight and bias tensor will have a shape of [C]. - * - * In this implementation, the input and output tensors are assumed to be - * channels packed textures with standard axis mapping. - * - * The weight and bias tensors are assumed to be width packed textures with - * standard axis mapping. - * - * The mean and rstd tensors are assumed to be contiguous buffer-backed tensors. 
- */ -void apply_group_norm() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Check bounds - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // Convert texture position to tensor coordinates using default axis mapping - // and channels packing - ivec4 out_tidx = ivec4(pos.x, pos.y, mul4(pos.z), 0); - - // Handle batch dimension if batches > 1 - if (out_sizes.w > 1) { - const int C_aligned = alignup4(out_sizes.z); - // default axis mapping means channels is the batch concatenation dim - const int batch_idx = out_tidx.z / C_aligned; - out_tidx.w = batch_idx; - out_tidx.z = out_tidx.z % C_aligned; - } - - // Load input texel (contains 4 consecutive channels) - const vec4 input_texel = load_texel(t_in, pos); - - // Load weight and bias texels, which are width-packed; each element along the - // width dim corresponds to a channel in the input tensor. - const ivec3 weight_pos = ivec3(out_tidx.z / 4, 0, 0); - const vec4 weight_texel = load_texel(t_weight, weight_pos); - const vec4 bias_texel = load_texel(t_bias, weight_pos); - - // Calculate which channels this texel represents - // For channels-packed layout: texel at position z contains channels [z, z+1, z+2, z+3] - const int base_channel = out_tidx.z; - - // Calculate buffer indices for mean/rstd lookup - // Mean/rstd tensors have shape [G, N] where G = C/group - const int batch_idx = out_tidx.w; - const int channels_per_group = out_sizes.z / group; - - vec4 bias; - // Process each element of the output texel individually, since each element - // may belong to a different channel group - for (int i = 0; i < 4; ++i) { - const int channel_idx = base_channel + i; - // Handle case where padding channels are added - if (channel_idx >= out_sizes.z) { - bias[i] = input_texel[i]; - continue; - } - - // Calculate group index for this channel - const int group_idx = channel_idx / channels_per_group; - - // Create tensor index for mean/rstd buffer access - const ivec4 mean_tidx = ivec4(group_idx, batch_idx, 0, 0); - const int mean_bufi = tidx_to_bufi(mean_tidx, mean_strides); - - // Load mean and rstd values for this channel - const float mean_val = t_mean[mean_bufi]; - const float rstd_val = t_rstd[mean_bufi]; - - // Apply group normalization with weight and bias: ((input - mean) * rstd) * weight + bias - const float normalized = (input_texel[i] - mean_val) * rstd_val; - bias[i] = normalized * weight_texel[i] + bias_texel[i]; - } - - // Write result to output texture - write_texel(t_out, pos, bias); -} - -void main() { - apply_group_norm(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.yaml deleted file mode 100644 index b50853be3b0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
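
For orientation, the two group-norm shaders above split the work into a per-(batch, group) mean/rstd reduction followed by a per-channel affine normalization. The NumPy sketch below is illustrative only (not from the deleted files); it ignores the texel packing and buffer layouts and simply shows the math the pair computes, using the same E[x²] − E[x]² variance formulation as the reduction shader.

```python
# Hypothetical reference for the reduce + apply pair of group_norm shaders above.
import numpy as np


def group_norm_reference(x, weight, bias, group, epsilon):
    """x: [N, C, H, W]; weight, bias: [C]. Returns (out, mean, rstd) with mean/rstd per (batch, group)."""
    N, C, H, W = x.shape
    D = C // group                                   # channels per group
    xg = x.reshape(N, group, D * H * W)
    mean = xg.mean(axis=2)                           # [N, G], as in group_norm_reduce_texture
    var = (xg * xg).mean(axis=2) - mean * mean       # E[x^2] - E[x]^2
    rstd = 1.0 / np.sqrt(var + epsilon)              # [N, G]

    # Broadcast the per-group statistics back to per-channel values and apply
    # the affine transform, as in group_norm_texture.
    m = mean[:, :, None].repeat(D, axis=2).reshape(N, C, 1, 1)
    r = rstd[:, :, None].repeat(D, axis=2).reshape(N, C, 1, 1)
    out = (x - m) * r * weight.reshape(1, C, 1, 1) + bias.reshape(1, C, 1, 1)
    return out, mean, rstd
```

In the shaders, the reduction additionally tiles the group's texels across a local workgroup and tree-reduces the partial sums in shared memory before thread 0 writes the mean/rstd buffers consumed by the apply shader.
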
- -group_norm_texture: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: group_norm_texture diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/im2col.glsl deleted file mode 100644 index f045d4e9702..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/im2col.glsl +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#extension GL_EXT_debug_printf : enable - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -#define TILE_M4 1 -#define TILE_N4 1 -#define TILE_K4 1 - -#define TILE_M 4 -#define TILE_N 4 -#define TILE_K 4 - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -// Sizes of the im2col matrix of the convolution input -${layout_declare_ubo(B, "ivec4", "matrix_sizes")} -// Sizes of the input image -${layout_declare_ubo(B, "ivec4", "input_sizes")} -// Sizes of the output image -${layout_declare_ubo(B, "ivec4", "output_sizes")} - -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "conv2d_fp_im2col_block_load.glslh" - -#ifdef OUTPUT_BUFFER - -void write_tile( - const FPInputTile in_tile, - const int k4, - const int m_start, - const int K4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - t_output[(m_start + m) * K4 + k4] = in_tile.data[m][0]; - } -} - -#else // OUTPUT_TEXTURE - -void write_tile( - const FPInputTile in_tile, - const int k4, - const int m_start, - const int K4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - imageStore(t_output, ivec3(k4, m_start + m, 0), vec4(in_tile.data[m][0])); - } -} - -#endif // OUTPUT_BUFFER - -void main() { - // Each thread writes out a 4 wide x 4 high block of the output matrix. The - // thread position corresponds to the block index. - const int k4 = int(gl_GlobalInvocationID.x); - const int m4 = int(gl_GlobalInvocationID.y); - - // Convert block idx to tensor idx - const int k = mul_4(k4); - const int m = mul_4(m4); - - const int in_channels_per_group = input_sizes.z / conv2d_params.groups; - - // Logical K dim size (unpadded) - const int logical_K = conv2d_params.logical_K; - // Physical K dim, which contains padding elements - const int K = matrix_sizes.x; - - // M dim, which represents the number of flattened output width, height, - // batches. Unlike K, there is no difference between the physical and logical - // sizes. 
- const int M = matrix_sizes.y; - - if (k >= K || m >= M) { - return; - } - - FPInputTile in_tile; - load_input_im2col_tile(in_tile, k4, m4, logical_K, M); - - // Number of texels in the x dim of the output matrix - const int K4 = div_4(K); - write_tile(in_tile, k4, m, K4); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col.yaml b/backends/vulkan/runtime/graph/ops/glsl/im2col.yaml deleted file mode 100644 index dd486b0e1a6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/im2col.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -im2col: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: buffer - INPUT_STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: im2col_buffer_texture3d - - NAME: im2col_texture3d_texture3d - OUTPUT_STORAGE: texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl deleted file mode 100644 index d7bef9f0163..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_buffer(B, "w", "buf_out", DTYPE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 sizes; - $if not TO_STAGING: - ivec4 buf_strides; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "sizes")} - $if not TO_STAGING: - ${layout_declare_ubo(B, "ivec4", "buf_strides")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -void write_out_texel(VEC4_T texel, ivec4 tidx) { - $if TO_STAGING: - const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim); - $else: - const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim); - - if (tidx[packed_dim] < sizes[packed_dim]) { - buf_out[buf_indices.x] = BUF_T(texel.x); - } - if (tidx[packed_dim] + 1 < sizes[packed_dim]) { - buf_out[buf_indices.y] = BUF_T(texel.y); - } - if (tidx[packed_dim] + 2 < sizes[packed_dim]) { - buf_out[buf_indices.z] = BUF_T(texel.z); - } - if (tidx[packed_dim] + 3 < sizes[packed_dim]) { - buf_out[buf_indices.w] = BUF_T(texel.w); - } -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - - if (any(greaterThanEqual(tidx, sizes))) { - return; - } - - const VEC4_T intex = load_texel(t_in, lpos_to_pos(lpos, axis_map)); - write_out_texel(intex, tidx); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml deleted file mode 100644 index 
646d8f1be81..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -image_to_nchw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - TO_STAGING: True - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: image_to_nchw_texture3d - - NAME: image_to_nchw_texture2d - STORAGE: texture2d - - NAME: clone_image_to_buffer - TO_STAGING: False - - NAME: image_to_nchw_no_pc_texture3d - USE_PUSH_CONST: False - - NAME: image_to_nchw_no_pc_texture2d - STORAGE: texture2d - USE_PUSH_CONST: False - - NAME: clone_image_to_buffer_no_pc - TO_STAGING: False - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select.glsl deleted file mode 100644 index 4500d43b932..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select.glsl +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_idx", "int", STORAGE)} -${layout_declare_ubo(3, "ivec4", "sizes")} -${layout_declare_ubo(4, "int", "gpu_dim", "int", "stride")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, sizes, packed_dim)) { - return; - } - - const int out_idx = out_pos[gpu_dim] / stride; - const int within_stride = out_pos[gpu_dim] % stride; - const int in_idx = texelFetch(t_idx, ivec3(out_idx, 0, 0), 0).x; - - ivec3 in_pos = out_pos; - in_pos[gpu_dim] = in_idx * stride + within_stride; - - imageStore(t_out, out_pos, texelFetch(t_in, in_pos, 0)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_select.yaml deleted file mode 100644 index abef2225cd9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select.yaml +++ /dev/null @@ -1,12 +0,0 @@ -index_select: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: index_select diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl deleted file mode 100644 index 76ec540838c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_idx", "int", STORAGE)} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, out_sizes, packed_dim)) { - return; - } - - const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - const ivec4 buffer_ixs = tidx_to_nchwi(idx, out_sizes, packed_dim); - - VEC4_T out_texel; - for (int i = 0; i < 4; ++i) { - const ivec4 out_tidx = nchwi_to_tidx(buffer_ixs[i], out_sizes); - int out_channel = out_tidx.z; - int in_channel = texelFetch(t_idx, ivec3(out_channel, 0, 0), 0).x; - - ivec4 in_tidx = out_tidx; - in_tidx.z = in_channel; - - ivec4 in_elem_pos = to_texture_elem_pos(in_tidx, in_sizes, packed_dim); - - VEC4_T in_texel = texelFetch(t_in, in_elem_pos.xyz, 0); - - out_texel[i] = in_texel[in_elem_pos.w]; - } - imageStore(t_out, out_pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.yaml deleted file mode 100644 index a306e3ce47d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.yaml +++ /dev/null @@ -1,12 +0,0 @@ -index_select_channel: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: index_select_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh deleted file mode 100644 index 81783422ab4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef INDEXING_GLSLH -#define INDEXING_GLSLH - -#define DIMLIMIT 8 -#define DIMLIMIT_DIV4 2 - -#define mul_4(x) ((x) << 2) -#define div_4(x) ((x) >> 2) - -#define mod_4(x) ((x) & 3) - -// -// BufferMetadata -// - -struct BufferMetadata { - uvec4 sizes[DIMLIMIT_DIV4]; - uvec4 dim_order[DIMLIMIT_DIV4]; - uvec4 strides[DIMLIMIT_DIV4]; - uvec2 ndim_numel; -}; - -uint ndim(const BufferMetadata meta) { - return meta.ndim_numel[0]; -} - -int int_ndim(const BufferMetadata meta) { - return int(meta.ndim_numel[0]); -} - -uint numel(const BufferMetadata meta) { - return meta.ndim_numel[1]; -} - -uint dim_order_at(const BufferMetadata meta, const int dim) { - return meta.dim_order[div_4(dim)][mod_4(dim)]; -} - -uint dim_order_at(const BufferMetadata meta, const uint dim) { - return meta.dim_order[div_4(dim)][mod_4(dim)]; -} - -uint stride_at(const BufferMetadata meta, const int dim) { - return meta.strides[div_4(dim)][mod_4(dim)]; -} - -uint stride_at(const BufferMetadata meta, const uint dim) { - return meta.strides[div_4(dim)][mod_4(dim)]; -} - -uint size_at(const BufferMetadata meta, const int dim) { - return meta.sizes[div_4(dim)][mod_4(dim)]; -} - -uint size_at(const BufferMetadata meta, const uint dim) { - return meta.sizes[div_4(dim)][mod_4(dim)]; -} - -bool are_equal(const BufferMetadata meta1, const BufferMetadata meta2) { - // sizes and strides must be the same to be considered equal - if (meta1.sizes[0] != meta2.sizes[0]) { - return false; - } - if (meta1.sizes[1] != meta2.sizes[1]) { - return false; - } - if (meta1.strides[0] != meta2.strides[0]) { - return false; - } - if (meta1.strides[1] != meta2.strides[1]) { - return false; - } - return true; -} - -// -// TensorIndex -// - -struct TensorIndex { - uvec4 data[DIMLIMIT_DIV4]; -}; - -void initialize(out TensorIndex tidx) { - tidx.data[0] = uvec4(0); - tidx.data[1] = uvec4(0); -} - -uint idx_at(const TensorIndex tidx, const int dim) { - return tidx.data[div_4(dim)][mod_4(dim)]; -} - -void permute(inout TensorIndex tidx, const ivec4 permute_order[DIMLIMIT_DIV4]) { - TensorIndex new_tidx = tidx; - for (int d = 0; d < DIMLIMIT; ++d) { - int src_dim = permute_order[div_4(d)][mod_4(d)]; - new_tidx.data[div_4(d)][mod_4(d)] = idx_at(tidx, src_dim); - } - tidx = new_tidx; -} - -// -// Index Conversions -// - -void contiguous_idx_to_tensor_idx( - const BufferMetadata meta, - uint contiguous_idx, - out TensorIndex tidx) { - initialize(tidx); - int dim = int_ndim(meta); - int i = 0; - - uint contiguous_strides[DIMLIMIT]; - contiguous_strides[0] = 1; - for (int d = 1; d < DIMLIMIT; ++d) { - contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1]; - } - - for (int d = max(dim - 1, 0); d >= 0; d--) { - uint dim_stride = contiguous_strides[d]; - - tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / dim_stride; - contiguous_idx = contiguous_idx % dim_stride; - } -} - -uint tensor_idx_to_contiguous_idx( - const BufferMetadata meta, - const TensorIndex tidx) { - uint contiguous_strides[DIMLIMIT]; - contiguous_strides[0] = 1; - for (int d = 1; d < DIMLIMIT; ++d) { - contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1]; - } - - uint contig_idx = 0; - for (int d = 0; d < ndim(meta); ++d) { - contig_idx += contiguous_strides[d] * idx_at(tidx, d); - } - return contig_idx; -} - -void linear_idx_to_tensor_idx( - const BufferMetadata meta, - uint linear_idx, - out TensorIndex tidx) { - initialize(tidx); - int dim = int_ndim(meta); - int i = 0; - for (int d = max(dim - 1, 0); d >= 0; d--) { - uint dim_idx = 
dim_order_at(meta, d); - uint dim_stride = stride_at(meta, dim_idx); - - tidx.data[div_4(dim_idx)][mod_4(dim_idx)] = linear_idx / dim_stride; - linear_idx = linear_idx % dim_stride; - } -} - -uint tensor_idx_to_linear_idx( - const BufferMetadata meta, - const TensorIndex tidx) { - uint lin_idx = 0; - for (int d = 0; d < ndim(meta); ++d) { - lin_idx += stride_at(meta, d) * idx_at(tidx, d); - } - return lin_idx; -} - -void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) { - tidx.data[0] = min(tidx.data[0], meta.sizes[0] - 1); - tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1); -} - -// -// Debug utilities -// - -#ifdef DEBUG_MODE - -void printTensorIndex(const TensorIndex tidx) { - debugPrintfEXT( - "TensorIndex: tidx=[%u %u %u %u %u %u %u %u]\\n", - tidx.data[0][0], tidx.data[0][1], tidx.data[0][2], tidx.data[0][3], - tidx.data[1][0], tidx.data[1][1], tidx.data[1][2], tidx.data[1][3] - ); -} - -void printBufferMetadata(const BufferMetadata meta) { - debugPrintfEXT( - "BufferMetadata: ndim=%u numel=%u\\n sizes=[%u %u %u %u %u %u %u %u]\\n dim_order=[%u %u %u %u %u %u %u %u]\\n strides=[%u %u %u %u %u %u %u %u]\\n", - meta.ndim_numel[0], meta.ndim_numel[1], - meta.sizes[0][0], meta.sizes[0][1], meta.sizes[0][2], meta.sizes[0][3], - meta.sizes[1][1], meta.sizes[1][1], meta.sizes[1][2], meta.sizes[1][3], - meta.dim_order[0][0], meta.dim_order[0][1], - meta.dim_order[0][2], meta.dim_order[0][3], - meta.dim_order[1][0], meta.dim_order[1][1], - meta.dim_order[1][2], meta.dim_order[1][3], - meta.strides[0][0], meta.strides[0][1], - meta.strides[0][2], meta.strides[0][3], - meta.strides[1][1], meta.strides[1][1], - meta.strides[1][2], meta.strides[1][3] - ); -} - -#endif - -#endif // INDEXING_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h deleted file mode 100644 index fdb6f514a3e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef INDEXING_UTILS_H -#define INDEXING_UTILS_H - -/* - * The functions defined in this header file use the following shorthand to - * represent tensor related data structures. - * - * tidx - ivec4 tensor indices, listed in WHCN order. - * - * pos - ivec3 texel position, used to fetch from an image texture via the - * texelFetch(image, pos, lod) GLSL function. - * posi - ivec4 texel element position. It is the same as pos, except with an - * additional component of the index of an element within the texel. - * lpos - ivec3 logical position, listed in WHC order. This is a permutation of - * texture position based on a tensor's axis_map. lpos.x is the position - * component that corresponds to the tensor's width dimension, lpos.y is - * the position component that corresponds to the tensor's height dim, - * and so on. - * - * bufi - int index into a GPU buffer that backs a tensor. - * nchwi - int index into a staging buffer for a tensor. The data in the - * staging buffer is stored in contiguous data layout, irrespective of - * the tensor's strides. 
- */ - -// Width Dim Index, assuming WHCN order -#define W_DIM 0 -// Height, assuming WHCN order -#define H_DIM 1 -// Channels, assuming WHCN order -#define C_DIM 2 - -/* - * Fast division by 4 using bit shifting - */ -#define div4(x) ((x) >> 2) - -/* - * Fast multiplication by 4 using bit shifting - */ -#define mul4(x) ((x) << 2) - -/* - * Divides input and rounds up to 4 - */ -#define divup4(x) (((x) + 3) >> 2) - -/* - * Divides input by denominator and rounds up - */ -#define divup(x, d) (((x) + (d) - 1) / (d)) - -/* - * Aligns input to the next multiple of 4 - */ -#define alignup4(x) (((x) + 3) & -4) - -/* - * Fast modulo by 4 using bit masking - */ -#define mod4(x) ((x) & 3) - -#define ALIGN_DOWN_4(x) ((x) & ~3) - -#define ALIGN_UP_4(x) (((x) + 3) & ~3) - -#define DIV_UP_8(x) (((x) + 7) >> 3) -#define DIV_UP_4(x) (((x) + 3) >> 2) - -#define DIV_4(x) ((x) >> 2) -#define DIV_2(x) ((x) >> 1) - -#define MUL_8(x) ((x) << 3) -#define MUL_4(x) ((x) << 2) -#define MUL_2(x) ((x) << 1) - -/* - * Get the staging buffer indices that contain the data of the texel that - * corresponds to the provided tensor index. Since the texel have 4 elements, - * 4 buffer indices will be retrieved. - */ -ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { - ivec4 strides = - ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - - int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + - tidx.w * strides.w; - - return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; -} - -/* - * Get the buffer indices that contain the data of the texel that corresponds to - * to the provided tensor index. Since the texel have 4 elements, 4 buffer - * indices will be retrieved. - */ -ivec4 tidx_to_4bufi( - const ivec4 tidx, - const ivec4 strides, - const int packed_dim) { - int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + - tidx.w * strides.w; - - return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; -} - -/* - * Given a buffer index to a contiguous tensor and the tensor's sizes, return - * the tensor index that corresponds to the buffer index. 
- */ -ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { - const int nchwi_div_x = nchwi / sizes.x; - const int nchwi_div_y = nchwi_div_x / sizes.y; - return ivec4( - nchwi % sizes.x, - nchwi_div_x % sizes.y, - nchwi_div_y % sizes.z, - nchwi_div_y / sizes.z); -} - -int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { - const int sizes_xy = sizes.x * sizes.y; - return tidx.w * sizes_xy * sizes.z + tidx.z * sizes_xy + tidx.y * sizes.x + - tidx.x; -} - -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { - ivec4 idx; - for (int i = 3; i >= 0; i--) { - int dim = dim_order[i]; - idx[dim] = bufi / strides[dim]; - bufi %= strides[dim]; - } - return idx; -} - -/* - * bufi_to_tidx but assumes that the tensor is contiguous - */ -ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) { - ivec4 idx; - for (int i = 3; i >= 0; i--) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } - return idx; -} - -int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { - return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + - tidx.w * strides.w; -} - -ivec4 lpos_to_tidx( - ivec3 lpos, - ivec4 sizes, - const int batch_inner_dim, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - // Moving 1 texel along the packed dim traverses 4 tensor elements - lpos[packed_dim] *= 4; - - ivec4 tidx = ivec4(lpos, 0); - - if (sizes.w > 1) { - tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim]; - tidx[batch_inner_dim] %= sizes[batch_inner_dim]; - } - return tidx; -} - -ivec3 tidx_to_lpos( - ivec4 tidx, - ivec4 sizes, - const int batch_inner_dim, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 lpos = tidx.xyz; - - // Adjust batch inner dim by batch index if needed - if (sizes.w > 1) { - lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim]; - } - // Fast division by 4, since moving 1 texel along the packed dim traverses 4 - // tensor elements. - lpos[packed_dim] >>= 2; - return lpos; -} - -ivec3 tidx_to_pos( - ivec4 tidx, - ivec4 sizes, - const ivec4 axis_map, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 pos; - for (int dim = 0; dim < 3; ++dim) { - pos[axis_map[dim]] = tidx[dim]; - } - - // Adjust batch inner dim by batch index if needed - if (sizes.w > 1) { - pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w]; - } - // Fast division by 4, since moving 1 texel along the packed dim traverses 4 - // tensor elements. 
- pos[axis_map[packed_dim]] >>= 2; - return pos; -} - -ivec4 tidx_to_posi( - ivec4 tidx, - ivec4 sizes, - const ivec4 axis_map, - const int packed_dim) { - return ivec4( - tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4); -} - -ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { - ivec3 pos; - pos[axis_map.x] = lpos.x; - pos[axis_map.y] = lpos.y; - pos[axis_map.z] = lpos.z; - return pos; -} - -#ifdef USING_BUFFER -#define load_texel(buf, idx) buf[idx] -#elif defined(USING_TEXTURE2D) -#define load_texel(im, pos) texelFetch(im, pos.xy, 0) -#define load_texel_lpos(im, lpos, axis_map) \ - texelFetch(im, lpos_to_pos(lpos, axis_map).xy, 0) -#else // defined(USING_TEXTURE3D) -#define load_texel(im, pos) texelFetch(im, pos, 0) -#define load_texel_lpos(im, lpos, axis_map) \ - texelFetch(im, lpos_to_pos(lpos, axis_map), 0) -#endif - -#ifdef USING_BUFFER -#define write_texel(buf, idx, texel) buf[idx] = texel -#elif defined(USING_TEXTURE2D) -#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) -#define write_texel_lpos(im, lpos, texel, axis_map) \ - imageStore(im, lpos_to_pos(lpos, axis_map).xy, texel) -#else // defined(USING_TEXTURE3D) -#define write_texel(im, pos, texel) imageStore(im, pos, texel) -#define write_texel_lpos(im, lpos, texel, axis_map) \ - imageStore(im, lpos_to_pos(lpos, axis_map), texel) -#endif - -/* - * Converts hashed layout to a ivec4 containing the axis map data and an int - * containing the packed dim respectively. Each value takes up 4 bits in the - * packed int, and values are read from least significant half byte (right-most) - * to most significant half byte (left-most). - * e.g. 0x20122, 2 -> ivec4(0, 1, 2, 2) - * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1) - */ -#define unhash_axis_map(hash) \ - (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) - -/* - * - */ -#define unhash_dim_order(hash) \ - (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) - -#define unhash_packed_dim(hash) int(hash >> 16 & 0xf) - -#define DEFAULT_LAYOUT 0x02210 - -#define DEFAULT_DIM_ORDER 0x03210 - -#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3) - -/************************ - * Deprecated Functions * - ************************/ - -// The below functions and macros are in the process of being deprecated in -// favor of newer indexing functions that account for axis mapping and have more -// explicit function names and more updated terminology. - -/* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. - */ -#define BATCH_AXIS 2 - -// -// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion -// - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim - * is packed along a texel - * Output: Whether the texel position is outside the bounds of the image texture - * given the size and packed dimension of the tensor. 
- */ -bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 max_pos = sizes.xyz; - max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS]; - max_pos[packed_dim] /= 4; - return (any(greaterThanEqual(pos, max_pos))); -} - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, - * which dim is packed along a texel - * Returns: the (w, h, c, n) tensor index cooresponding to the first element of - * the texel at the specified position - */ -ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // Packed dim contains 4 elements per texel - pos[packed_dim] *= 4; - // Construct the initial tensor index via swizzling -#if BATCH_AXIS == 2 - ivec4 tensor_idx = pos.xyzz; -#endif -#if BATCH_AXIS == 1 - ivec4 tensor_idx = pos.xyzy; -#endif -#if BATCH_AXIS == 0 - ivec4 tensor_idx = pos.xyzx; -#endif - // Adjust the axis that the batch dim runs along - tensor_idx[3] /= sizes[BATCH_AXIS]; - tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS]; - - return tensor_idx; -} - -/* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Returns: the (x, y, z) texture position containing element of the tensor at - * the specified index - */ -ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 pos = idx.xyz; - pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; - pos[packed_dim] /= 4; - return pos; -} - -/* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim - * is packed along a texel - * Returns: the (x, y, z, i) texture position containing the element of the - * tensor at the specified index, where i is the component within the - * texel to which the element belongs - */ -ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // pos[4] is set to a placeholder value - ivec4 pos = idx.xyzx; - pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; - pos[packed_dim] >>= 2; - pos.w = idx[packed_dim] & 0x3; - return pos; -} - -// -// Miscellaneous Utility Functions and Macros -// - -// Given a buffer(1-D) index cur, compute a new index where the corresponding -// tensor(N-D)'s adjacent dimensions are swapped. The parameters x,y and plane -// describe sizes. As an example, let's say we want to swap dimensions 0,1 for a -// tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and -// plane=2*24=48. -#define swap_adj_dims(cur, x, y, plane) \ - cur + \ - plane * \ - ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \ - (x - 1) * ((cur % (y * plane)) / plane)) - -// Return the x, y, z and index value the channel-packed 3D tensor from the {n, -// c, h, w}-index. 
-ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) { - int aligned_c = alignup4(sizes.y); - int c_stride = aligned_c / 4; - - return ivec4(nchw.w, nchw.z, nchw.x * c_stride + nchw.y / 4, nchw.y % 4); -} - -#endif // INDEXING_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl deleted file mode 100644 index 8028362c3e5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl +++ /dev/null @@ -1,80 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "cache", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "projected", DTYPE, STORAGE)} -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "int", "projected_numel")} - ${layout_declare_ubo(B, "ivec4", "cache_strides")} - ${layout_declare_ubo(B, "int", "input_pos")} -$else: - ${layout_declare_ubo(B, "ivec3", "projected_limits")} - ${layout_declare_ubo(B, "int", "input_pos")} - - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * t_cache will have sizes of (max_batch_size, max_seq_len, n_heads, head_dim). - * t_projected will have sizes of (batch_size, seq_len, n_heads, head_dim). - * - * The cache update inserts the values of t_projected into t_cache at the index - * specified by input_pos at the seq_len dimension. It is equivalent to calling - - * t_cache = t_cache.slice_scatter( - * t_projected, dim=1, start=input_pos, end=input_pos+seq_len) - * - * Note that this shader is implemented assuming that max_batch_size is 1. - */ - -#ifdef USING_BUFFER - -/*************************** - ** Buffer Implementation ** - ***************************/ - -void main() { - int projected_bufi = int(gl_GlobalInvocationID.x); - // Bump cache index forward by input_pos elements along the seq_len dimension. - // cache_strides contains the strides of the cache tensor. - int cache_bufi = input_pos * cache_strides.z + projected_bufi; - if (projected_bufi >= projected_numel) { - return; - } - cache[cache_bufi] = projected[projected_bufi]; -} - -#else - -/**************************** - ** Texture Implementation ** - ****************************/ - -// Note that this shader assumes the that tensors are width packed, i.e. -// packed_dim = 0 -void main() { - const ivec3 projected_pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(projected_pos, projected_limits))) { - return; - } - - const ivec3 cache_pos = ivec3( - projected_pos.x, - projected_pos.y, - projected_pos.z + input_pos); - - write_texel(cache, cache_pos, load_texel(projected, projected_pos)); -} - -#endif // USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml deleted file mode 100644 index e2a96234465..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
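
The buffer path of kv_cache_update.glsl above amounts to a slice_scatter along the seq_len dimension. A hedged C++ rendering of that copy, assuming max_batch_size == 1; names and sizes are illustrative, not taken from the deleted shader.

```cpp
#include <cstdio>
#include <vector>

// Copies a flattened (seq_len, n_heads, head_dim) projection into the cache
// starting at input_pos along seq_len, mirroring
//   cache_bufi = input_pos * cache_strides.z + projected_bufi
void kv_cache_update(std::vector<float>& cache,
                     const std::vector<float>& projected,
                     int seq_len_stride,  // elements per sequence position
                     int input_pos) {
  const int offset = input_pos * seq_len_stride;
  for (size_t i = 0; i < projected.size(); ++i) {
    cache[offset + i] = projected[i];
  }
}

int main() {
  const int max_seq_len = 8, n_heads = 2, head_dim = 4;
  const int seq_len_stride = n_heads * head_dim;
  std::vector<float> cache(max_seq_len * seq_len_stride, 0.0f);
  std::vector<float> projected(2 * seq_len_stride, 1.0f);  // seq_len = 2
  kv_cache_update(cache, projected, seq_len_stride, /*input_pos=*/3);
  std::printf("cache[pos 3, first element] = %f\n", cache[3 * seq_len_stride]);
  return 0;
}
```
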
- -kv_cache_update: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - STORAGE: - - VALUE: buffer - - VALUE: texture3d - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: kv_cache_update diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh deleted file mode 100644 index da326b26e93..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines common functions and structs to be used across matrix multiplication - * operators. - */ - -#ifndef LINEAR_COMMON_GLSLH -#define LINEAR_COMMON_GLSLH - -#include "common.glslh" - -int sign_extend_8bit(const int val) { - if ((val & 0x80) != 0) { - return val | (~0xFF); - } - return val; -} - -int extract_8bit_from_packed_int_le(const int packed, const int i) { - // account for little endian - int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF); - return byte; -} - -// Extract a 4-bit value from a packed int (little endian) -// It is assumed that the 4-bit value is in the range [0, 15] -int extract_4bit_from_packed_int_le(const int packed, const int col) { - // Extract the 4-bit value from the 8-bit value - int val = packed >> (4 * col) & 0xF; - return val; -} - -// Convenience overload for packed uint -int extract_4bit_from_packed_uint_le(const uint packed, const int col) { - // Extract the 4-bit value from the 8-bit value - int val = int(packed >> (4 * col)) & 0xF; - return val; -} - -#endif // LINEAR_COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_bias_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_bias_load.glslh deleted file mode 100644 index f3d32be8b3d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_bias_load.glslh +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_FP_BIAS_LOAD_GLSLH -#define LINEAR_FP_BIAS_LOAD_GLSLH - -#include "linear_fp_per_out_channel_params.glslh" - -VEC4_T load_bias_x4(const int n4) { - return t_bias[n4]; -} - -void load_bias_tile(out FPPerOutChannelParams bias, const int n4_start) { -#if TILE_N4 == 1 - bias.data[0] = load_bias_x4(n4_start); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - bias.data[n4] = load_bias_x4(n4_start + n4); - } - -#endif -} - -#endif // LINEAR_FP_BIAS_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile.glslh deleted file mode 100644 index 68eee57a132..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile.glslh +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
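
A small host-side C++ check of the packed little-endian extraction helpers defined in linear_common.glslh above. This is a sketch for illustration; the test value is arbitrary and the function names only mirror the GLSL ones.

```cpp
#include <cstdint>
#include <cstdio>

// Sign-extend an 8-bit value stored in the low byte of an int.
int sign_extend_8bit(int val) {
  return (val & 0x80) != 0 ? (val | ~0xFF) : val;
}

// Byte i (0 = least significant) of a packed 32-bit word, as a signed int8.
int extract_8bit_le(uint32_t packed, int i) {
  return sign_extend_8bit(static_cast<int>((packed >> (8 * i)) & 0xFFu));
}

// Nibble col (0 = least significant) of a packed word, in [0, 15].
int extract_4bit_le(uint32_t packed, int col) {
  return static_cast<int>((packed >> (4 * col)) & 0xFu);
}

int main() {
  // 0x80FF017F packs the bytes 0x7F, 0x01, 0xFF, 0x80 (little endian),
  // i.e. the signed values 127, 1, -1, -128.
  const uint32_t packed = 0x80FF017Fu;
  for (int i = 0; i < 4; ++i) {
    std::printf("byte %d = %d\n", i, extract_8bit_le(packed, i));
  }
  // The same word holds eight 4-bit values, read from the low nibble upward.
  for (int col = 0; col < 8; ++col) {
    std::printf("nibble %d = %d\n", col, extract_4bit_le(packed, col));
  }
  return 0;
}
```
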
- */ - -#ifndef LINEAR_FP_INPUT_TILE_GLSLH -#define LINEAR_FP_INPUT_TILE_GLSLH - -/* - * Defines the FPInputTile struct, which is used to represent a tile of the - * input matrix of a matrix multiplication operation. - * - * Settings: - * - TILE_M: number of rows in the tile - * - TILE_K4: number of (groups of 4) columns in the tile - */ - -#extension GL_EXT_control_flow_attributes : require - -struct FPInputTile { - VEC4_T data[TILE_M][TILE_K4]; -}; - -#ifdef DEBUG_MODE - -void printFPInputTile(const FPInputTile in_tile) { - debugPrintfEXT("input_tile: \\n"); - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - debugPrintfEXT( - " %f, %f, %f, %f, \\n", - in_tile.data[m][k4].x, - in_tile.data[m][k4].y, - in_tile.data[m][k4].z, - in_tile.data[m][k4].w); - } - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_INPUT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile_load.glslh deleted file mode 100644 index 6697003935f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile_load.glslh +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to load a FPInputTile from input buffer/texture. - * - * Requires: - * - t_input to be declared in the shader layout (input buffer/texture) - * - * Settings: - * - INPUT_BUFFER to indicate input resource is a buffer, otherwise texture is - * assumed. - */ - -#ifndef LINEAR_FP_INPUT_TILE_LOAD_GLSLH -#define LINEAR_FP_INPUT_TILE_LOAD_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_fp_input_tile.glslh" - -#ifdef INPUT_BUFFER - -VEC4_T load_input_x4(const int k4, const int m, const int ntexels_k) { - return t_input[(m * ntexels_k) + k4]; -} - -#else - -VEC4_T load_input_x4(const int k4, const int m, const int ntexels_k) { - return texelFetch(t_input, ivec3(k4, m, 0), 0); -} - -#endif // INPUT_BUFFER - -// To be used if (M - m_start >= TILE_M) || (K4 - k4_start >= TILE_K4) -void load_input_tile_no_checks( - out FPInputTile in_tile, - const int k4_start, - const int m_start, - const int K4, - const int M) { -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - in_tile.data[m][0] = load_input_x4(k4_start, m_start + m, K4); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - in_tile.data[m][k4] = load_input_x4(k4_start + k4, m_start + m, K4); - } - } -#endif -} - -// To be used if near tensor boundaries -void load_input_tile_with_checks( - out FPInputTile in_tile, - const int k4_start, - const int m_start, - const int K4, - const int M) { -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - if (m_start + m < M) { - in_tile.data[m][0] = load_input_x4(k4_start, m_start + m, K4); - } else { - in_tile.data[m][0] = VEC4_T(0.0); - } - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - if (m_start + m < M && k4_start + k4 < K4) { - in_tile.data[m][k4] = load_input_x4(k4_start + k4, m_start + m, K4); - } else { - in_tile.data[m][k4] = VEC4_T(0.0); - } - } - } -#endif -} - -#endif // LINEAR_FP_INPUT_TILE_LOAD_GLSLH diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile.glslh deleted file mode 100644 index dd571229a9c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile.glslh +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines the FPOutTile struct, which is used to represent a tile of the output - * matrix of a matrix multiplication operation. - * - * Settings: - * - TILE_M: number of rows in the output tile - * - TILE_N4: number of (groups of 4) columns in the output tile - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -struct FPOutTile { - VEC4_T data[TILE_M][TILE_N4]; -}; - -void initialize(out FPOutTile out_tile) { -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - out_tile.data[m][0] = VEC4_T(0); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - out_tile.data[m][n4] = VEC4_T(0); - } - } -#endif -} - -void add(inout FPOutTile out_tile, const FPOutTile other_out_tile) { -#if TILE_M > 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - out_tile.data[m][0] += other_out_tile.data[m][0]; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - out_tile.data[m][n4] += other_out_tile.data[m][n4]; - } - } -#endif -} - -#ifdef DEBUG_MODE - -void printFPOutTile(const FPOutTile tile) { - debugPrintfEXT("output_tile: \\n"); - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %f, %f, %f, %f,", - tile.data[m][n4].x, - tile.data[m][n4].y, - tile.data[m][n4].z, - tile.data[m][n4].w); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_OUTPUT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh deleted file mode 100644 index ee50ad87f74..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using fp input and weight tiles. - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_FP_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_FP_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" -#include "linear_fp_input_tile.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_fp_per_out_channel_params.glslh" -#include "linear_fp_weight_tile.glslh" - -/* - * Accumulates floating point input tile and floating point weight tile into - * floating point output tile. 
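
A plain C++ sketch (scalars instead of vec4 texels) of the tile accumulation that fp_accumulate_with_fp_weight performs: out[m][n] += sum_k in[m][k] * w[k][n] over a small register tile. The tile sizes here are arbitrary illustration values, not the shader's configured TILE_* settings.

```cpp
#include <array>
#include <cstdio>

constexpr int TILE_M = 4;
constexpr int TILE_K = 4;
constexpr int TILE_N = 8;

using InTile = std::array<std::array<float, TILE_K>, TILE_M>;
using WTile = std::array<std::array<float, TILE_N>, TILE_K>;
using OutTile = std::array<std::array<float, TILE_N>, TILE_M>;

void fp_accumulate(OutTile& accum, const InTile& in, const WTile& w) {
  for (int m = 0; m < TILE_M; ++m) {
    for (int k = 0; k < TILE_K; ++k) {
      // One input scalar is broadcast against a row of the weight tile,
      // which is what the per-component fma(...) calls in the shader do.
      for (int n = 0; n < TILE_N; ++n) {
        accum[m][n] += in[m][k] * w[k][n];
      }
    }
  }
}

int main() {
  OutTile accum{};  // zero-initialized, like initialize()
  InTile in{};
  WTile w{};
  in[0][0] = 2.0f;
  w[0][3] = 1.5f;
  fp_accumulate(accum, in, w);
  std::printf("accum[0][3] = %f\n", accum[0][3]);  // expected 3.0
  return 0;
}
```
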
- */ -void fp_accumulate_with_fp_weight( - inout FPOutTile accum, - FPInputTile in_tile, - FPWeightTile w_tile) { -#if TILE_N4 == 1 && TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][0]), - w_tile.data[mul_4(0)][0], - accum.data[m][0]); - - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][1]), - w_tile.data[mul_4(0) + 1][0], - accum.data[m][0]); - - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][2]), - w_tile.data[mul_4(0) + 2][0], - accum.data[m][0]); - - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][3]), - w_tile.data[mul_4(0) + 3][0], - accum.data[m][0]); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - const int n = mul_4(n4); - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][0]), - w_tile.data[mul_4(k4)][n4], - accum.data[m][n4]); - - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][1]), - w_tile.data[mul_4(k4) + 1][n4], - accum.data[m][n4]); - - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][2]), - w_tile.data[mul_4(k4) + 2][n4], - accum.data[m][n4]); - - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][3]), - w_tile.data[mul_4(k4) + 3][n4], - accum.data[m][n4]); - } - } - } - -#endif -} - -/* - * Applies per output channel weight scales to the output tile. - */ -void apply_scales(inout FPOutTile tile, const FPPerOutChannelParams scales) { -#if TILE_M > 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - tile.data[m][0] = tile.data[m][0] * scales.data[0]; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - tile.data[m][n4] = tile.data[m][n4] * scales.data[n4]; - } - } -#endif -} - -/* - * Applies per output channel weight scales and per output channel biases to the - * output tile. - */ -void apply_scales_and_biases( - inout FPOutTile tile, - const FPPerOutChannelParams scales, - const FPPerOutChannelParams bias) { -#if TILE_M > 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - tile.data[m][0] = tile.data[m][0] * scales.data[0] + bias.data[0]; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - tile.data[m][n4] = tile.data[m][n4] * scales.data[n4] + bias.data[n4]; - } - } -#endif -} - -void accumulate_out_tile_with_out_tile( - inout FPOutTile accum, - const FPOutTile other) { - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - accum.data[m][n4] = accum.data[m][n4] + other.data[m][n4]; - } - } -} - -#endif // LINEAR_FP_OUTPUT_TILE_FP_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int4_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int4_compute.glslh deleted file mode 100644 index 0606759e393..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int4_compute.glslh +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using fp input and weight tiles. 
- */ - -#ifndef LINEAR_FP_OUTPUT_TILE_FP_INT4_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_FP_INT4_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" -#include "linear_fp_input_tile.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_fp_per_out_channel_params.glslh" -#include "linear_int4_weight_tile.glslh" - -// Unpacks a int containing 4 packed 8-bit integers into a vec4 containing each -// of the 4 unpacked 8-bit integers. -VEC4_T unpack_packed_4xint4(const int int8x4, const int n4_group) { - return VEC4_T( - extract_4bit_from_packed_int_le(int8x4, n4_group + 0), - extract_4bit_from_packed_int_le(int8x4, n4_group + 2), - extract_4bit_from_packed_int_le(int8x4, n4_group + 4), - extract_4bit_from_packed_int_le(int8x4, n4_group + 6)); -} - -T extract_4bit_from_weight_block( - const ivec4 block, - const int col, - const int row) { - return T(((block[row] >> (4 * col)) & 0xF) - 8); -} - -void fp_accumulate_with_int4_weight( - inout FPOutTile accum, - FPInputTile in_tile, - Int4WeightTile w_tile, - FPPerOutChannelParams scales_tile, - FPPerOutChannelParams zeros_tile) { - // Accum tile is indexed as accum[m][n4][n4i] - // -> gives fp accumulator for output tile element at (x = n, y = m) - // Input tile is indexed as in_tile.data[m][k4] - // -> gives vec4 containing the fp inputs at index - // (k, m), (k + 1, m), (k + 2, m), (k + 3, m) - // Weight tile is indexed as w_tile.data[k4][n8][n4i] - // -> gives packed integer containing the 8x 4-bit quantized values at index - // (n, k), (n, k + 1), (n, k + 2), (n, k + 3), - // (n + 4, k), (n + 4, k + 1), (n + 4, k + 2), (n + 4, k + 3) - VEC4_T weight_texels[2]; -#if TILE_K4 == 1 && TILE_N8 == 1 - [[unroll]] for (int k = 0; k < 4; ++k) { - const int base_col_1 = mul_2(k); - const int base_col_2 = base_col_1 + 1; - weight_texels[0] = VEC4_T( - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 0), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 1), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 2), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 3)); - weight_texels[1] = VEC4_T( - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 0), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 1), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 2), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 3)); - - weight_texels[0] = - fma(weight_texels[0], scales_tile.data[0], zeros_tile.data[0]); - weight_texels[1] = - fma(weight_texels[1], scales_tile.data[1], zeros_tile.data[1]); - - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - accum.data[m][0] = fma( - VEC4_T(in_tile.data[m][0][k]), weight_texels[0], accum.data[m][0]); - accum.data[m][1] = fma( - VEC4_T(in_tile.data[m][0][k]), weight_texels[1], accum.data[m][1]); - } - } - -#else - // TODO(ssjia): Implement generic case - not implemented - -#endif -} - -#endif // LINEAR_FP_OUTPUT_TILE_FP_INT4_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh deleted file mode 100644 index b2ab64a1573..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using fp input and weight tiles. - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_FP_INT8_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_FP_INT8_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" -#include "linear_fp_input_tile.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_int8_weight_tile.glslh" - -// Unpacks a int containing 4 packed 8-bit integers into a vec4 containing each -// of the 4 unpacked 8-bit integers. -VEC4_T unpack_packed_4xint8(int int8x4) { - return VEC4_T( - extract_8bit_from_packed_int_le(int8x4, 0), - extract_8bit_from_packed_int_le(int8x4, 1), - extract_8bit_from_packed_int_le(int8x4, 2), - extract_8bit_from_packed_int_le(int8x4, 3)); -} - -void fp_accumulate_with_int8_weight( - inout FPOutTile accum, - FPInputTile in_tile, - Int8WeightTile w_tile) { - // Accum tile is indexed as accum[m][n4][n4i] - // -> gives fp accumulator for output tile element at (x = n, y = m) - // Input tile is indexed as in_tile.data[m][k4] - // -> gives vec4 containing the fp inputs at index - // (k, m), (k + 1, m), (k + 2, m), (k + 3, m) - // Weight tile is indexed as w_tile.data[k4][n4][n4i] - // -> gives packed integer containing the 4x 8-bit quantized values at index - // (n, k), (n, k + 1), (n, k + 2), (n, k + 3) - VEC4_T weight_texel; -#if TILE_K4 == 1 && TILE_N4 == 1 - [[unroll]] for (int k = 0; k < 4; ++k) { - // Unpack one column of weights - weight_texel = VEC4_T( - extract_8bit_from_packed_int_le(w_tile.data[0][0][0], k), - extract_8bit_from_packed_int_le(w_tile.data[0][0][1], k), - extract_8bit_from_packed_int_le(w_tile.data[0][0][2], k), - extract_8bit_from_packed_int_le(w_tile.data[0][0][3], k)); - - for (int m = 0; m < TILE_M; ++m) { - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][k]), weight_texel, accum.data[m][0]); - } - } - -#else - // TODO(ssjia): implement the general case - not implemented - -#endif -} - -#endif // LINEAR_FP_OUTPUT_TILE_FP_INT8_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh deleted file mode 100644 index b04074eba75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using int8 input and weight tiles. - * - * Settings: - * - TILE_M: The number of rows in the output tile. - * - TILE_N4: The number of (groups of 4) columns in the output tile. - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require -#extension GL_EXT_integer_dot_product : require - -#include "linear_common.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_fp_per_out_channel_params.glslh" -#include "linear_int8_input_tile.glslh" -#include "linear_int8_weight_tile.glslh" -#include "linear_int_per_out_channel_params.glslh" - -// Stores integer accumulators for an output tile. 
-struct Int32Accum { - ivec4 data[TILE_M][TILE_N4]; -}; - -// Initialize values to 0 -void initialize(out Int32Accum out_accum) { -#if TILE_N4 == 1 - [[unroll]] for (int y = 0; y < TILE_M; ++y) { - out_accum.data[y][0] = ivec4(0); - } - -#else - [[unroll]] for (int y = 0; y < TILE_M; ++y) { - [[unroll]] for (int x4 = 0; x4 < TILE_K4; ++x4) { - out_accum.data[y][x4] = ivec4(0); - } - } -#endif -} - -// Accumulate int8 input and weight tiles into integer accumulator tile -void int_accumulate_with_int8_weight( - inout Int32Accum accum, - Int8InputTile in_tile, - Int8WeightTile w_tile) { - // Accum tile is indexed as accum[m][n4][n4i] - // -> gives integer accumulator for output tile element at (x = n, y = m) - // Input tile is indexed as in_tile.data[m4][k4][m4i] - // -> gives packed integer containing the 4x 8-bit quantized values at index - // (k, m), (k + 1, m), (k + 2, m), (k + 3, m) - // Weight tile is indexed as w_tile.data[k4][n4][n4i] - // -> gives packed integer containing the 4x 8-bit quantized values at index - // (n, k), (n, k + 1), (n, k + 2), (n, k + 3) -#if TILE_M4 == 1 && TILE_K4 == 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - // n = 0 - accum.data[m][0][0] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][0], accum.data[m][0][0]); - // n = 1 - accum.data[m][0][1] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][1], accum.data[m][0][1]); - // n = 2 - accum.data[m][0][2] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][2], accum.data[m][0][2]); - // n = 3 - accum.data[m][0][3] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][3], accum.data[m][0][3]); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - const int m4 = div_4(m); - const int m4i = mod_4(m); - [[unroll]] for (int n = 0; n < TILE_N; ++n) { - const int n4 = div_4(n); - const int n4i = mod_4(n); - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - accum.data[m][n4][n4i] = dotPacked4x8AccSatEXT( - in_tile.data[m4][k4][m4i], - w_tile.data[k4][n4][n4i], - accum.data[m][n4][n4i]); - } - } - } - -#endif -} - -/* - * Computes final weight matrix output tile using: - * - int8 accumulator tile - * - per output channel weight sums - * - per output channel scales - */ -void accumulate_out_tile_with_int_accum( - inout FPOutTile out_tile, - const Int32Accum accum, - const float input_q_scale, - const int input_q_zp, - const IntPerOutChannelParams weight_sums, - const FPPerOutChannelParams weight_scales) { - ivec4 input_zp_vec = ivec4(-input_q_zp); -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - // Unfortunately fma doesn't work with ivec4. Prefer to preserve integer - // format for as long as possible to avoid precision loss. 
- ivec4 accum_adjusted = - input_zp_vec * weight_sums.data[0] + accum.data[m][0]; - out_tile.data[m][0] = - fma(VEC4_T(accum_adjusted), - input_q_scale * weight_scales.data[0], - out_tile.data[m][0]); - } - -#else - // TODO(ssjia): Implement the general case - not implemented - -#endif -} - -void accumulate_out_tile_with_int_accum( - inout FPOutTile out_tile, - const Int32Accum accum, - const float input_q_scale, - const int input_q_zp, - const IntPerOutChannelParams weight_sums, - const FPPerOutChannelParams weight_scales, - const FPPerOutChannelParams bias) { - ivec4 input_zp_vec = ivec4(-input_q_zp); -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - // Apply scale and zero points to the int accumulator - ivec4 accum_adjusted = - input_zp_vec * weight_sums.data[0] + accum.data[m][0]; - out_tile.data[m][0] = - fma(VEC4_T(accum_adjusted), - input_q_scale * weight_scales.data[0], - out_tile.data[m][0]); - out_tile.data[m][0] += bias.data[0]; - } - -#else - // TODO(ssjia): Implement the general case - not implemented - -#endif -} - -#ifdef DEBUG_MODE - -void printInt32Accum(const Int32Accum tile) { - debugPrintfEXT("int accum: \\n"); - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %d, %d, %d, %d,", - tile.data[m][n4].x, - tile.data[m][n4].y, - tile.data[m][n4].z, - tile.data[m][n4].w); - } - debugPrintfEXT("\\n"); - } -} - -#endif - -#endif // LINEAR_FP_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_store.glslh deleted file mode 100644 index a4019204cc3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_store.glslh +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions store a FpOutTile to output buffer/texture. - * - * Requires: - * - t_output to be declared in the shader layout - * - * Settings: - * - OUTPUT_BUFFER to indicate t_output is a vec4 buffer, otherwise texture - * storage is assumed. 
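
The dequantization step in accumulate_out_tile_with_int_accum above corrects the raw int32 dot product by the input zero point (via per-channel weight sums) before scaling. A hedged C++ sketch for one output channel; the numbers are illustrative only.

```cpp
#include <cstdint>
#include <cstdio>

float dequantize_accum(int32_t accum,       // sum_k q_in[k] * q_w[k]
                       int32_t weight_sum,  // sum_k q_w[k] for this channel
                       float input_scale,
                       int32_t input_zp,
                       float weight_scale,
                       float bias) {
  // (q_in - zp) . q_w == q_in . q_w - zp * sum(q_w)
  const int32_t adjusted = accum - input_zp * weight_sum;
  return static_cast<float>(adjusted) * (input_scale * weight_scale) + bias;
}

int main() {
  // Two-element toy example: real inputs {1.0, 2.0}, real weights {0.5, -0.5}.
  // With input_scale = 0.1 and input_zp = 3: q_in = {13, 23};
  // with weight_scale = 0.25: q_w = {2, -2}.
  const int32_t accum = 13 * 2 + 23 * -2;  // -20
  const int32_t weight_sum = 2 + -2;       // 0
  const float out = dequantize_accum(accum, weight_sum, 0.1f, 3, 0.25f, 0.0f);
  std::printf("dequantized = %f (reference: 1.0*0.5 + 2.0*(-0.5) = -0.5)\n", out);
  return 0;
}
```
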
- */ - -#ifndef LINEAR_FP_OUTPUT_TILE_STORE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_STORE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_fp_output_tile.glslh" - -#ifdef OUTPUT_BUFFER - -void write_output_x4( - const VEC4_T out_texel, - const int n4, - const int m, - const int N4) { - t_output[m * N4 + n4] = out_texel; -} - -#else - -void write_output_x4( - const VEC4_T out_texel, - const int n4, - const int m, - const int N4) { - imageStore(t_output, ivec3(n4, m, 0), out_texel); -} - -#endif // OUTPUT_BUFFER - -void write_output_tile( - const FPOutTile out_tile, - const int n4_start, - const int m_start, - const int N4) { -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - write_output_x4(out_tile.data[m][0], n4_start, m_start + m, N4); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - write_output_x4(out_tile.data[m][n4], n4_start + n4, m_start + m, N4); - } - } -#endif -} - -// To be used if M - m >= TILE_M && N4 - n4 >= TILE_N4 -void write_output_tile_no_checks( - const FPOutTile out_tile, - const int n4_start, - const int m_start, - const int N4, - const int M) { -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - write_output_x4(out_tile.data[m][0], n4_start, m_start + m, N4); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - write_output_x4(out_tile.data[m][n4], n4_start + n4, m_start + m, N4); - } - } -#endif -} - -// To be used if close to tensor boundaries -void write_output_tile_with_checks( - const FPOutTile out_tile, - const int n4_start, - const int m_start, - const int N4, - const int M) { -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - if (m_start + m < M) { - write_output_x4(out_tile.data[m][0], n4_start, m_start + m, N4); - } - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - if (m_start + m < M && n4_start + n4 < N4) { - write_output_x4(out_tile.data[m][n4], n4_start + n4, m_start + m, N4); - } - } - } -#endif -} - -#endif // LINEAR_FP_OUTPUT_TILE_STORE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_per_out_channel_params.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_per_out_channel_params.glslh deleted file mode 100644 index 72b22988414..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_per_out_channel_params.glslh +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines common functions and structs to be used across matrix multiplication - * operators. - */ - -#ifndef LINEAR_FP_PER_OUT_CHANNEL_PARAMS_GLSLH -#define LINEAR_FP_PER_OUT_CHANNEL_PARAMS_GLSLH - -#include "common.glslh" - -#extension GL_EXT_control_flow_attributes : require - -// Represents floating point parameter tensors where each element is associated -// with an output channel, such as weight scales, biases, etc. 
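
An illustrative C++ analogue of write_output_tile_with_checks above: a register tile is written back row-major, skipping elements that fall outside the output matrix when the tile overlaps the right or bottom edge. Tile and matrix sizes are made up for the example.

```cpp
#include <cstdio>
#include <vector>

constexpr int TILE_M = 4;
constexpr int TILE_N = 4;

void write_output_tile_with_checks(std::vector<float>& out,  // M x N, row-major
                                   const float tile[TILE_M][TILE_N],
                                   int m_start, int n_start, int M, int N) {
  for (int m = 0; m < TILE_M; ++m) {
    for (int n = 0; n < TILE_N; ++n) {
      if (m_start + m < M && n_start + n < N) {
        out[(m_start + m) * N + (n_start + n)] = tile[m][n];
      }
    }
  }
}

int main() {
  const int M = 6, N = 6;  // deliberately not multiples of the tile size
  std::vector<float> out(M * N, 0.0f);
  float tile[TILE_M][TILE_N];
  for (int m = 0; m < TILE_M; ++m)
    for (int n = 0; n < TILE_N; ++n)
      tile[m][n] = 1.0f;
  // Bottom-right tile: only a 2x2 corner of it lies inside the 6x6 output.
  write_output_tile_with_checks(out, tile, /*m_start=*/4, /*n_start=*/4, M, N);
  std::printf("out[5][5] = %f\n", out[5 * N + 5]);
  return 0;
}
```
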
-struct FPPerOutChannelParams { - VEC4_T data[TILE_N4]; -}; - -#ifdef DEBUG_MODE - -void printFPPerOutChannelParams(const FPPerOutChannelParams params) { - debugPrintfEXT("per_out_channel_params: \\n"); - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %f, %f, %f, %f, \\n", - params.data[n4].x, - params.data[n4].y, - params.data[n4].z, - params.data[n4].w); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_PER_OUT_CHANNEL_PARAMS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_scales_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_scales_load.glslh deleted file mode 100644 index 1286c1d082f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_scales_load.glslh +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_FP_WEIGHT_SCALES_LOAD_GLSLH -#define LINEAR_FP_WEIGHT_SCALES_LOAD_GLSLH - -#include "linear_fp_per_out_channel_params.glslh" - -VEC4_T load_weight_scale_x4(const int n4) { - return t_weight_scales[n4]; -} - -VEC4_T load_scale_x4(const int n4, const int quant_group_idx, const int N4) { - return t_weight_scales[quant_group_idx * N4 + n4]; -} - -void load_weight_scales_tile( - out FPPerOutChannelParams scales, - const int n4_start) { -#if TILE_N4 == 1 - scales.data[0] = load_weight_scale_x4(n4_start); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - scales.data[n4] = load_weight_scale_x4(n4_start + n4); - } - -#endif -} - -void load_weight_scales_tile_for_group( - out FPPerOutChannelParams scales, - const int n4_start, - const int quant_group_idx, - const int N4) { -#if TILE_N4 == 1 - scales.data[0] = load_scale_x4(n4_start, quant_group_idx, N4); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - scales.data[n4] = load_scale_x4(n4_start + n4, quant_group_idx, N4); - } - -#endif -} - -#endif // LINEAR_FP_WEIGHT_SCALES_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_tile.glslh deleted file mode 100644 index f44bbbc1565..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_tile.glslh +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines the FPWeightTile struct, which is used to represent a fp tile of a - * weight matrix in matrix multiplication. 
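
An illustrative C++ version of the grouped scale lookup in linear_fp_weight_scales_load.glslh above: scales are stored as one row of N4 texels per quantization group, so group g, texel n4 lives at index g * N4 + n4. The struct and sizes here are stand-ins, not the shader's declarations.

```cpp
#include <cstdio>
#include <vector>

struct Vec4 { float x, y, z, w; };

Vec4 load_scale_x4(const std::vector<Vec4>& weight_scales,
                   int n4, int quant_group_idx, int N4) {
  return weight_scales[quant_group_idx * N4 + n4];
}

int main() {
  const int N4 = 4;          // N = 16 output channels -> 4 vec4 texels per row
  const int num_groups = 3;  // e.g. K = 96 with a quantization group size of 32
  std::vector<Vec4> scales(num_groups * N4);
  for (int g = 0; g < num_groups; ++g)
    for (int n4 = 0; n4 < N4; ++n4)
      scales[g * N4 + n4] = {static_cast<float>(g), static_cast<float>(n4), 0, 0};
  const Vec4 s = load_scale_x4(scales, /*n4=*/2, /*quant_group_idx=*/1, N4);
  std::printf("group %g, texel %g\n", s.x, s.y);
  return 0;
}
```
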
- * - * Settings: - * - TILE_K: number of rows in the output tile - * - TILE_N4: number of (groups of 4) columns in the output tile - */ - -#ifndef LINEAR_FP_WEIGHT_TILE_GLSLH -#define LINEAR_FP_WEIGHT_TILE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "common.glslh" - -struct FPWeightTile { - VEC4_T data[TILE_K][TILE_N4]; -}; - -#ifdef LINEAR_INT8_WEIGHT_TILE_GLSLH - -int sign_extend(const int val) { - if ((val & 0x80) != 0) { - return val | (~0xFF); - } - return val; -} - -T extract_8bit_value(const Int8WeightTile w_tile, const int k, const int n) { -#if TILE_K4 == 1 && TILE_N4 == 1 - const int k4i = k; - const int n4i = n; - ivec4 block = w_tile.data[0][0]; - -#else - const int k4 = div_4(k); - const int k4i = mod_4(k); - - const int n4 = div_4(n); - const int n4i = mod_4(n); - - ivec4 block = w_tile.data[k4][n4]; -#endif - - int col = block[n4i]; - int val = (col >> (k4i * 8)) & 0xFF; - - return T(sign_extend(val)); -} - -void unpack(out FPWeightTile fp_w_tile, const Int8WeightTile w_tile) { -#if TILE_K > 1 && TILE_N4 == 1 - [[unroll]] for (int k = 0; k < TILE_K; ++k) { - fp_w_tile.data[k][0][0] = extract_8bit_value(w_tile, k, 0); - fp_w_tile.data[k][0][1] = extract_8bit_value(w_tile, k, 1); - fp_w_tile.data[k][0][2] = extract_8bit_value(w_tile, k, 2); - fp_w_tile.data[k][0][3] = extract_8bit_value(w_tile, k, 3); - } - -#else - [[unroll]] for (int k = 0; k < TILE_M; ++k) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - const int n = mul_4(n4); - fp_w_tile.data[k][n4][0] = extract_8bit_value(w_tile, k, n); - fp_w_tile.data[k][n4][1] = extract_8bit_value(w_tile, k, n + 1); - fp_w_tile.data[k][n4][2] = extract_8bit_value(w_tile, k, n + 2); - fp_w_tile.data[k][n4][3] = extract_8bit_value(w_tile, k, n + 3); - } - } -#endif -} - -#endif // LINEAR_INT8_WEIGHT_TILE_GLSLH - -#ifdef DEBUG_MODE - -void printFPWeightTile(const FPWeightTile tile) { - debugPrintfEXT("weight_tile: \\n"); - [[unroll]] for (int k = 0; k < TILE_K; ++k) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %f, %f, %f, %f, ", - tile.data[k][n4].x, - tile.data[k][n4].y, - tile.data[k][n4].z, - tile.data[k][n4].w); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_WEIGHT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_block.glslh deleted file mode 100644 index d813224c3aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_block.glslh +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT4_WEIGHT_BLOCK_GLSLH -#define LINEAR_INT4_WEIGHT_BLOCK_GLSLH - -/* - * This file defines utilties to perform weight prepacking of quantized int4 - * matrix multiplation weights. It also defines utilities to load source - * weight data from inputbuffer, and write out a packed weight block to output - * texture/buffer. - * - * Note: 2 4-bit values are packed into each 8-bit value in the source data. - * - * Requires: - * - t_packed_int4_weight to be defined in shader layout (output texture/buffer) - * - t_int4_weight to be defined in shader layout (input buffer) - * - * Settings: - * - USING_BUFFER to indicate if output resource is a buffer. Otherwise texture - * is assumed. 
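
A C++ sketch of the nibble packing used by the int4 weight prepacking described here (pack_8x4bit_signed_into_int, defined just below): eight signed 4-bit values go into one 32-bit word with value 0 in the least significant nibble, and can be recovered by shifting and masking. Illustration only; function names and the sample values are not from the deleted sources.

```cpp
#include <cstdint>
#include <cstdio>

uint32_t pack_8x4bit_signed(const int vals[8]) {
  uint32_t packed = 0;
  for (int i = 0; i < 8; ++i) {
    packed |= (static_cast<uint32_t>(vals[i]) & 0xFu) << (4 * i);
  }
  return packed;
}

int unpack_4bit_unsigned(uint32_t packed, int col) {
  return static_cast<int>((packed >> (4 * col)) & 0xFu);  // in [0, 15]
}

int main() {
  // Signed values in [-8, 7]; -8 maps to nibble 0x8, which is also why
  // 0x88888888 is used as padding for out-of-range columns.
  const int vals[8] = {-8, -1, 0, 1, 7, -4, 3, -2};
  const uint32_t packed = pack_8x4bit_signed(vals);
  std::printf("packed = 0x%08X\n", static_cast<unsigned>(packed));
  for (int col = 0; col < 8; ++col) {
    const int u = unpack_4bit_unsigned(packed, col);
    const int s = (u >= 8) ? u - 16 : u;  // recover the signed value
    std::printf("col %d: raw nibble %d, signed %d\n", col, u, s);
  }
  return 0;
}
```
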
- */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" - -// Represents source data for 2 8Kx4N block of the weight matrix read from the -// input buffer. Each int element contains 8 packed 4-bit values along the K -// dimension. Overall the data represents 8Kx8N block. -struct Int4Weight2xBlockSourceData { - uint data[8]; -}; - -// Represents data for a packed 4Kx8N block of the weight matrix to be written -// out to output texture/buffer. An individual block was originally a 4Kx8N -// block in the original weight tensor, and then the top and bottom halves are -// concatenated along the width dim. -struct Int4WeightBlockPacked { - ivec4 data; -}; - -void load_block_source_data_no_checks( - out Int4Weight2xBlockSourceData src_data, - const int k8, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 8; ++n) { - src_data.data[n] = t_int4_weight[(n_start + n) * ntexels_K + k8]; - } -} - -// To be used if K - k_start < 4 -void load_block_source_data_with_checks( - out Int4Weight2xBlockSourceData src_data, - const int k8, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 8; ++n) { - if (n_start + n < N) { - src_data.data[n] = t_int4_weight[(n_start + n) * ntexels_K + k8]; - } else { - src_data.data[n] = 0x88888888; - } - } -} - -int pack_8x4bit_signed_into_int( - const int val0, - const int val1, - const int val2, - const int val3, - const int val4, - const int val5, - const int val6, - const int val7) { - return int( - ((val7 & 0xF) << 28) | ((val6 & 0xF) << 24) | ((val5 & 0xF) << 20) | - ((val4 & 0xF) << 16) | ((val3 & 0xF) << 12) | ((val2 & 0xF) << 8) | - ((val1 & 0xF) << 4) | ((val0 & 0xF))); -} - -void create_packed_blocks( - out Int4WeightBlockPacked block1, - out Int4WeightBlockPacked block2, - const Int4Weight2xBlockSourceData src_data) { - [[unroll]] for (int row = 0; row < 4; ++row) { - const int row_idx_1 = row; - const int row_idx_2 = row + 4; - block1.data[row] = pack_8x4bit_signed_into_int( - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 0), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 0), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 1), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 1), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 2), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 2), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 3), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 3)); - - block2.data[row] = pack_8x4bit_signed_into_int( - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 4), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 4), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 5), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 5), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 6), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 6), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 7), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 7)); - } -} - -#ifdef USING_BUFFER - -void write_packed_block( - const Int4WeightBlockPacked block, - const int k4, - const int n8, - const int nblocks_K) { - t_packed_int4_weight[n8 * nblocks_K + k4] = block.data; -} - -#else // USING_TEXTURE - -void write_packed_block( - const Int4WeightBlockPacked block, - const int k4, - const int n8, - const int nblocks_K) { - imageStore(t_packed_int4_weight, 
ivec2(k4, n8), block.data); -} - -#endif // USING_BUFFER - -#ifdef DEBUG_MODE - -void printInt4Weight2xBlockSourceData( - const Int4Weight2xBlockSourceData src_data) { - debugPrintfEXT("int4_weight_block_source_data: \\n"); - [[unroll]] for (int row = 0; row < 8; ++row) { - debugPrintfEXT("row %i (raw: %u): ", row, src_data.data[row]); - // Extract and print individual 4-bit values directly from packed int - [[unroll]] for (int col = 0; col < 8; ++col) { - int val_4bit = extract_4bit_from_packed_uint_le(src_data.data[row], col); - debugPrintfEXT("[%i] ", val_4bit); - } - debugPrintfEXT("\\n"); - } -} - -void printInt4WeightBlockPacked(const Int4WeightBlockPacked block) { - debugPrintfEXT("int4_weight_block_packed: \\n"); - // Print unpacked 4-bit values for each int in block.data - [[unroll]] for (int i = 0; i < 4; ++i) { - debugPrintfEXT("block.data[%i] 4-bit values: ", i); - [[unroll]] for (int col = 0; col < 8; ++col) { - int val_4bit = extract_4bit_from_packed_int_le(block.data[i], col); - debugPrintfEXT("[%i] ", val_4bit); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT4_WEIGHT_BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile.glslh deleted file mode 100644 index 559459f14a8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile.glslh +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT4_WEIGHT_TILE_GLSLH -#define LINEAR_INT4_WEIGHT_TILE_GLSLH - -#include "linear_common.glslh" -#include "linear_fp_weight_tile.glslh" - -/* - * Defines the Int4WeightTile struct, which is used to represent a tile of the - * quantized int4 weight matrix of a quantized matrix multiplication operation. 
- * - * Settings: - * - TILE_K4: number of (groups of 4) rows in the weight tile - * - TILE_N8: number of (groups of 8) columns in the weight tile - */ - -#extension GL_EXT_control_flow_attributes : require - -struct Int4WeightTile { - ivec4 data[TILE_K4][TILE_N8]; -}; - -void unpack_int4_weight_tile( - out FPWeightTile int8_tile, - const Int4WeightTile int4_tile) { -#if TILE_K4 == 1 && TILE_N8 == 1 - for (int k = 0; k < TILE_K; ++k) { - const int col_idx_1 = 2 * k; - const int col_idx_2 = 2 * k + 1; - int8_tile.data[k][0][0] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][0], col_idx_1)); - int8_tile.data[k][0][1] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][1], col_idx_1)); - int8_tile.data[k][0][2] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][2], col_idx_1)); - int8_tile.data[k][0][3] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][3], col_idx_1)); - - // n4 = 1 - int8_tile.data[k][1][0] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][0], col_idx_2)); - int8_tile.data[k][1][1] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][1], col_idx_2)); - int8_tile.data[k][1][2] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][2], col_idx_2)); - int8_tile.data[k][1][3] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][3], col_idx_2)); - } - -#else - for (int k = 0; k < TILE_K; ++k) { - const int k4 = div_4(k); - const int k4i = mod_4(k); - for (int n8 = 0; n8 < TILE_N8; ++n8) { - const int n4 = mul_2(n8); - const int col_idx_1 = 2 * k4i; - const int col_idx_2 = 2 * k4i + 1; - int8_tile.data[k][n4][0] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][0], col_idx_1)); - int8_tile.data[k][n4][1] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][1], col_idx_1)); - int8_tile.data[k][n4][2] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][2], col_idx_1)); - int8_tile.data[k][n4][3] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][3], col_idx_1)); - - int8_tile.data[k][n4 + 1][0] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][0], col_idx_2)); - int8_tile.data[k][n4 + 1][1] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][1], col_idx_2)); - int8_tile.data[k][n4 + 1][2] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][2], col_idx_2)); - int8_tile.data[k][n4 + 1][3] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][3], col_idx_2)); - } - } - -#endif -} - -#ifdef DEBUG_MODE - -void printInt4WeightTile(const Int4WeightTile block) { - debugPrintfEXT("int4_weight_tile: \\n"); - // Print unpacked 4-bit values for each int in block.data - [[unroll]] for (int i = 0; i < TILE_K; ++i) { - const int k4 = div_4(i); - const int k4i = mod_4(i); - debugPrintfEXT("block.data[%i] 4-bit values: ", i); - [[unroll]] for (int col = 0; col < TILE_N; ++col) { - int val_4bit = - extract_4bit_from_packed_int_le(block.data[k4][0][k4i], col); - debugPrintfEXT("[%i] ", val_4bit); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT4_WEIGHT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile_load.glslh deleted file mode 100644 index 033e0082436..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile_load.glslh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT4_WEIGHT_TILE_LOAD_GLSLH -#define LINEAR_INT4_WEIGHT_TILE_LOAD_GLSLH - -/* - * Defines functions to load a Int4WeightTile from input buffer/texture. - * - * Requires: - * - t_packed_int4_weight to be declared in the shader layout (input - * buffer/texture) - * - * Settings: - * - WEIGHT_BUFFER to indicate t_packed_int4_weight is a buffer, otherwise - * texture storage is assumed. - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_int4_weight_tile.glslh" - -#ifdef WEIGHT_BUFFER - -ivec4 load_int4_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return t_packed_int4_weight[(block_y * nblocks_x) + block_x]; -} - -#else // WEIGHT_TEXTURE - -ivec4 load_int4_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return texelFetch(t_packed_int4_weight, ivec2(block_x, block_y), 0); -} - -#endif // WEIGHT_BUFFER - -void load_int4_weight_tile( - out Int4WeightTile weight_tile, - const int block_x, - const int block_y, - const int nblocks_x) { -#if TILE_K4 == 1 && TILE_N8 == 1 - weight_tile.data[0][0] = load_int4_weight_block(block_x, block_y, nblocks_x); - -#elif TILE_K4 == 1 && TILE_N8 > 1 - [[unroll]] for (int x = 0; x < TILE_N8; ++x) { - weight_tile.data[0][x] = - load_int4_weight_block(block_x + x, block_y, nblocks_x); - } - -#elif TILE_K4 > 1 && TILE_N8 == 1 - [[unroll]] for (int y = 0; y < TILE_K4; ++y) { - weight_tile.data[y][0] = - load_int4_weight_block(block_x, block_y + y, nblocks_x); - } - -#else - [[unroll]] for (int y = 0; y < TILE_K4; ++y) { - [[unroll]] for (int x = 0; x < TILE_N8; ++x) { - weight_tile.data[y][x] = - load_int4_weight_block(block_x + x, block_y + y, nblocks_x); - } - } -#endif -} - -#endif // LINEAR_INT4_WEIGHT_TILE_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh deleted file mode 100644 index 9535de21f7b..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * This file defines utilties to perform int8 quantization and block packing of - * matrix multiplation inputs. It also defines utilities to store packed block - * data to an output buffer or texture. - * - * Requires: - * - t_packed_int8_input to be defined in shader layout (output buffer/texture) - * - * Settings: - * - OUTPUT_BUFFER to indicate if output resource is a buffer. Otherwise texture - * is assumed. 
- */ - -#ifndef LINEAR_INT8_INPUT_BLOCK_GLSLH -#define LINEAR_INT8_INPUT_BLOCK_GLSLH - -#define TILE_M 4 -#define TILE_K4 1 - -#include "linear_fp_input_tile.glslh" - -struct Int8InputBlock { - ivec4 data; -}; - -ivec4 quantize( - const VEC4_T val, - const float q_inv_scale, - const int q_zero_point) { - vec4 quantized = round(vec4(val) * q_inv_scale) + q_zero_point; - // hard-code 8 bit quantization range - return clamp(ivec4(quantized), -128, 127); -} - -int pack_into_int32(const ivec4 quant_vals) { - int packed = ((quant_vals[0] & 0xFF) << 0) | ((quant_vals[1] & 0xFF) << 8) | - ((quant_vals[2] & 0xFF) << 16) | ((quant_vals[3] & 0xFF) << 24); - - return packed; -} - -void quantize_and_pack( - out Int8InputBlock packed, - const FPInputTile in_block, - const float q_inv_scale, - const int q_zero_point) { - for (int row = 0; row < 4; ++row) { - ivec4 quantized_inputs = - quantize(in_block.data[row][0], q_inv_scale, q_zero_point); - packed.data[row] = pack_into_int32(quantized_inputs); - } -} - -#ifdef OUTPUT_BUFFER - -void write_block( - const Int8InputBlock block, - const int block_x, - const int block_y, - const int nblocks_x) { - t_packed_int8_input[block_y * nblocks_x + block_x] = block.data; -} - -#else // OUTPUT_TEXTURE - -void write_block( - const Int8InputBlock block, - const int block_x, - const int block_y, - const int nblocks_x) { - imageStore(t_packed_int8_input, ivec3(block_x, block_y, 0), block.data); -} - -#endif // OUTPUT_BUFFER - -#endif // LINEAR_INT8_INPUT_BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh deleted file mode 100644 index 89a7e1b3f89..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines the Int8InputTile struct, which is used to represent a tile of the - * quantized int8 input matrix of a quantized matrix multiplication operation. 
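
A hedged C++ sketch of the input quantization and packing in linear_int8_input_block.glslh above: four floats are quantized to int8 with an inverse scale and zero point, then packed into one 32-bit word, low byte first. Values and names are illustrative.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int8_t quantize_one(float val, float inv_scale, int zero_point) {
  const int q = static_cast<int>(std::lround(val * inv_scale)) + zero_point;
  return static_cast<int8_t>(std::clamp(q, -128, 127));  // 8-bit range
}

uint32_t pack_4xint8(const int8_t q[4]) {
  return (static_cast<uint32_t>(static_cast<uint8_t>(q[0])) << 0) |
         (static_cast<uint32_t>(static_cast<uint8_t>(q[1])) << 8) |
         (static_cast<uint32_t>(static_cast<uint8_t>(q[2])) << 16) |
         (static_cast<uint32_t>(static_cast<uint8_t>(q[3])) << 24);
}

int main() {
  const float scale = 0.05f;
  const float inv_scale = 1.0f / scale;  // 20
  const int zero_point = 0;
  const float vals[4] = {0.1f, -0.25f, 1.0f, -7.0f};  // -7.0 saturates to -128
  int8_t q[4];
  for (int i = 0; i < 4; ++i) {
    q[i] = quantize_one(vals[i], inv_scale, zero_point);
    std::printf("q[%d] = %d\n", i, q[i]);
  }
  std::printf("packed = 0x%08X\n", static_cast<unsigned>(pack_4xint8(q)));
  return 0;
}
```
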
- * - * Settings: - * - TILE_M4: number of (groups of 4) rows in the tile - * - TILE_K4: number of (groups of 4) columns in the tile - */ - -#ifndef LINEAR_INT8_INPUT_TILE_GLSLH -#define LINEAR_INT8_INPUT_TILE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -struct Int8InputTile { - ivec4 data[TILE_M4][TILE_K4]; -}; - -#ifdef DEBUG_MODE - -#include "linear_common.glslh" - -void printInt8InputTile(const Int8InputTile tile) { - debugPrintfEXT( - "Int8InputTile [TILE_M4=%d][TILE_K4=%d]:\\n", TILE_M4, TILE_K4); - - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, k4); - - // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit - // values - [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) { - int packed_int = tile.data[m4][k4][vec_idx]; - debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int); - - // Extract 4 8-bit values from this packed integer - [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) { - int val = extract_8bit_from_packed_int_le(packed_int, byte_idx); - if (byte_idx < 3) { - debugPrintfEXT("%d, ", val); - } else { - debugPrintfEXT("%d] ", val); - } - } - } - debugPrintfEXT("\\n"); - } - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT8_INPUT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile_load.glslh deleted file mode 100644 index c79badab6c6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile_load.glslh +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to load a Int8InputTile from input buffer/texture. - * - * Requires: - * - t_packed_int8_input to be declared in the shader layout - * - * Settings: - * - PACKED_INT8_INPUT_BUFFER to indicate resource is a buffer, otherwise - * texture storage is assumed. 
- */ - -#ifndef LINEAR_INT8_INPUT_TILE_LOAD_GLSLH -#define LINEAR_INT8_INPUT_TILE_LOAD_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_int8_input_tile.glslh" - -#ifdef PACKED_INT8_INPUT_BUFFER - -ivec4 load_int8_input_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return t_packed_int8_input[(block_y * nblocks_x) + block_x]; -} - -#else - -ivec4 load_int8_input_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return texelFetch(t_packed_int8_input, ivec3(block_x, block_y, 0), 0); -} - -#endif // PACKED_INT8_INPUT_BUFFER - -void load_int8_input_tile( - out Int8InputTile in_tile, - const int block_x, - const int block_y, - const int nblocks_x) { -#if TILE_M4 == 1 && TILE_K4 == 1 - in_tile.data[0][0] = load_int8_input_block(block_x, block_y, nblocks_x); - -#elif TILE_M4 == 1 && TILE_K4 > 1 - [[unroll]] for (int x = 0; x < TILE_K4; ++x) { - in_tile.data[0][x] = load_int8_input_block(block_x + x, block_y, nblocks_x); - } - -#elif TILE_M4 > 1 && TILE_K4 == 1 - [[unroll]] for (int y = 0; y < TILE_M4; ++y) { - in_tile.data[y][0] = load_int8_input_block(block_x, block_y + y, nblocks_x); - } - -#else - [[unroll]] for (int y = 0; y < TILE_M4; ++y) { - [[unroll]] for (int x = 0; x < TILE_K4; ++x) { - in_tile.data[y][x] = - load_int8_input_block(block_x + x, block_y + y, nblocks_x); - } - } -#endif -} - -#endif // LINEAR_INT8_INPUT_TILE_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_block.glslh deleted file mode 100644 index 6e98caea49e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_block.glslh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT8_WEIGHT_BLOCK_GLSLH -#define LINEAR_INT8_WEIGHT_BLOCK_GLSLH - -/* - * This file defines utilties to perform weight prepacking of quantized int8 - * matrix multiplation weights. It also defines utilities to load source - * weight data from inputbuffer, and write out a packed weight block to output - * texture/buffer. - * - * Requires: - * - t_packed_int8_weight to be defined in shader layout (output texture/buffer) - * - t_int8_weight to be defined in shader layout (input buffer) - * - * Settings: - * - USING_BUFFER to indicate if output resource is a buffer. Otherwise texture - * is assumed. - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" - -// Represents data for a 4x4 block of the weight matrix read from the input -// buffer. 
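For reference, the input-packing headers above quantize floating-point values to int8 and pack four 8-bit values into one 32-bit word, and the weight-prepacking header below gathers four such packed words (one per output channel) into a 4x4 block. The following sketch mirrors `quantize()` and `pack_into_int32()` in plain Python to make the byte layout concrete; it is an illustration only, with arbitrary example values.

```python
def quantize(vals, q_inv_scale, q_zero_point):
    # Mirror of the shader's quantize(): scale, round, shift by the zero point,
    # then clamp to the signed 8-bit range.
    return [max(-128, min(127, round(v * q_inv_scale) + q_zero_point)) for v in vals]

def pack_into_int32(q):
    # Mirror of pack_into_int32(): four int8 values packed little-endian
    # (element 0 in the lowest byte) into one 32-bit word.
    packed = 0
    for i, v in enumerate(q):
        packed |= (v & 0xFF) << (8 * i)
    return packed

q = quantize([0.5, -1.0, 2.0, -3.5], q_inv_scale=4.0, q_zero_point=0)
print(q, hex(pack_into_int32(q)))  # [2, -4, 8, -14] 0xf208fc02
```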
-struct Int8WeightBlock { - ivec4 data; -}; - -void load_block_data_no_checks( - out Int8WeightBlock block, - const int k4, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 4; ++n) { - block.data[n] = t_int8_weight[(n_start + n) * ntexels_K + k4]; - } -} - -void load_block_data_with_checks( - out Int8WeightBlock block, - const int k4, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 4; ++n) { - if (n_start + n < N) { - block.data[n] = t_int8_weight[(n_start + n) * ntexels_K + k4]; - } else { - block.data[n] = 0; - } - } -} - -#ifdef USING_BUFFER - -void write_weight_block( - const Int8WeightBlock block, - const int n4, - const int k4, - const int ntexels_N) { - t_packed_int8_weight[k4 * ntexels_N + n4] = block.data; -} - -#else // USING_TEXTURE - -void write_weight_block( - const Int8WeightBlock block, - const int n4, - const int k4, - const int ntexels_N) { - imageStore(t_packed_int8_weight, ivec2(n4, k4), block.data); -} - -#endif // USING_BUFFER - -#ifdef DEBUG_MODE - -void printInt8WeightBlock(const Int8WeightBlockPacked block) { - debugPrintfEXT("int8_weight_block_packed: \\n"); - debugPrintfEXT( - "%i %i %i %i \\n", - block.data[0], - block.data[1], - block.data[2], - block.data[3]); -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT8_WEIGHT_BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile.glslh deleted file mode 100644 index f312db543db..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile.glslh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT8_WEIGHT_TILE_GLSLH -#define LINEAR_INT8_WEIGHT_TILE_GLSLH - -/* - * Defines the Int8WeightTile struct, which is used to represent a tile of the - * quantized int8 weight matrix of a quantized matrix multiplication operation. 
- * - * Settings: - * - TILE_K4: number of (groups of 4) rows in the weight tile - * - TILE_N4: number of (groups of 4) columns in the weight tile - */ - -#extension GL_EXT_control_flow_attributes : require - -struct Int8WeightTile { - ivec4 data[TILE_K4][TILE_N4]; -}; - -#ifdef DEBUG_MODE - -void printInt8WeightTile(const Int8WeightTile tile) { - debugPrintfEXT( - "Int8WeightTile [TILE_K4=%d][TILE_N4=%d]:\\n", TILE_K4, TILE_N4); - - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, k4); - - // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit - // values - [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) { - int packed_int = tile.data[m4][k4][vec_idx]; - debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int); - - // Extract 4 8-bit values from this packed integer - [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) { - int val = extract_8bit_from_packed_int_le(packed_int, byte_idx); - if (byte_idx < 3) { - debugPrintfEXT("%d, ", val); - } else { - debugPrintfEXT("%d] ", val); - } - } - } - debugPrintfEXT("\\n"); - } - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT8_WEIGHT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile_load.glslh deleted file mode 100644 index fe16d3469b3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile_load.glslh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT8_WEIGHT_TILE_LOAD_GLSLH -#define LINEAR_INT8_WEIGHT_TILE_LOAD_GLSLH - -/* - * Defines functions to load a Int8WeightTile from input buffer/texture. - * - * Requires: - * - t_packed_int8_weight to be declared in the shader layout (input - * buffer/texture) - * - * Settings: - * - WEIGHT_BUFFER to indicate t_packed_int8_weight is a buffer, otherwise - * texture storage is assumed. 
- */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_int8_weight_tile.glslh" - -#ifdef WEIGHT_BUFFER - -ivec4 load_int8_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return t_packed_int8_weight[(block_y * nblocks_x) + block_x]; -} - -#else // WEIGHT_TEXTURE - -ivec4 load_int8_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return texelFetch(t_packed_int8_weight, ivec2(block_x, block_y), 0); -} - -#endif // WEIGHT_BUFFER - -void load_int8_weight_tile( - out Int8WeightTile weight_tile, - const int block_x, - const int block_y, - const int nblocks_x) { -#if TILE_K4 == 1 && TILE_N4 == 1 - weight_tile.data[0][0] = load_int8_weight_block(block_x, block_y, nblocks_x); - -#elif TILE_K4 == 1 && TILE_N4 > 1 - [[unroll]] for (int x = 0; x < TILE_N4; ++x) { - weight_tile.data[0][x] = - load_int8_weight_block(block_x + x, block_y, nblocks_x); - } - -#elif TILE_K4 > 1 && TILE_N4 == 1 - [[unroll]] for (int y = 0; y < TILE_M4; ++y) { - weight_tile.data[y][0] = - load_int8_weight_block(block_x, block_y + y, nblocks_x); - } - -#else - [[unroll]] for (int y = 0; y < TILE_K4; ++y) { - [[unroll]] for (int x = 0; x < TILE_N4; ++x) { - weight_tile.data[y][x] = - load_int8_weight_block(block_x + x, block_y + y, nblocks_x); - } - } -#endif -} - -#endif // LINEAR_INT8_WEIGHT_TILE_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int_per_out_channel_params.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int_per_out_channel_params.glslh deleted file mode 100644 index ca29fd52780..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int_per_out_channel_params.glslh +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines common functions and structs to be used across matrix multiplication - * operators. - */ - -#ifndef LINEAR_INT_PER_OUT_CHANNEL_PARAMS_GLSLH -#define LINEAR_INT_PER_OUT_CHANNEL_PARAMS_GLSLH - -#include "common.glslh" - -#extension GL_EXT_control_flow_attributes : require - -// Represents floating point parameter tensors where each element is associated -// with an output channel, such as weight scales, biases, etc. -struct IntPerOutChannelParams { - ivec4 data[TILE_N4]; -}; - -#ifdef DEBUG_MODE - -void printIntPerOutChannelParams(const IntPerOutChannelParams params) { - debugPrintfEXT("per_out_channel_params: \\n"); - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %d, %d, %d, %d, ", - params.data[n4].x, - params.data[n4].y, - params.data[n4].z, - params.data[n4].w); - } - debugPrintfEXT("\\n"); -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT_PER_OUT_CHANNEL_PARAMS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int_weight_sums_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int_weight_sums_load.glslh deleted file mode 100644 index 1a17f99ea4e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int_weight_sums_load.glslh +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef LINEAR_FP_WEIGHT_SUMS_LOAD_GLSLH -#define LINEAR_FP_WEIGHT_SUMS_LOAD_GLSLH - -#include "linear_int_per_out_channel_params.glslh" - -ivec4 load_weight_sum_x4(const int n4) { - return ivec4(t_weight_sums[n4]); -} - -void load_weight_sums_tile( - out IntPerOutChannelParams sums, - const int n4_start) { -#if TILE_N4 == 1 - sums.data[0] = load_weight_sum_x4(n4_start); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - sums.data[n4] = load_weight_sum_x4(n4_start + n4); - } - -#endif -} - -#endif // LINEAR_FP_WEIGHT_SUMS_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl deleted file mode 100644 index 6f0d890a9c4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} - -$if IO_STORAGE == "buffer": - #define OUTPUT_BUFFER - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_N8 ${TILE_N8} - -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N8 * 2} - -#define TILE_M ${TILE_M} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N8 * 8} - -#define WGS ${WGS} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "0")} -${layout_declare_spec_const(C, "int", "K4_per_group", "0")} - -#include "common.glslh" -#include "linear_fp_input_tile_load.glslh" -#include "linear_int4_weight_tile_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_output_tile_fp_int4_compute.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "linear_fp_output_tile_store.glslh" -#include "linear_fp_bias_load.glslh" - -shared FPOutTile partial_sums[WGS]; - -void main() { - const int lid = int(gl_LocalInvocationID.x); - const int n8 = int(gl_GlobalInvocationID.y); - - // The output tensor will have a shape of [n, 1, 1, 1]. Each thread computes - // 8 output elements, so each thread will write to 8 elements starting at the - // tensor index (gid.x * 8, 0, 0, 0). 
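The co-op shaders split the reduction (K) dimension across the work group: each invocation accumulates every WGS-th texel of the input row, and the partial results are combined at the end. A minimal sketch of that split, ignoring the 4-wide texel granularity and the group-wise dequantization (WGS matches the shader's default of 64; the other names are illustrative):

```python
import numpy as np

WGS = 64  # work group size, matching the WGS specialization constant

def coop_dot(x, w_col, wgs=WGS):
    # Each local invocation id accumulates a strided slice of the K dimension;
    # the per-invocation partial sums are then added together.
    partials = np.array([np.dot(x[lid::wgs], w_col[lid::wgs]) for lid in range(wgs)])
    return partials.sum()

rng = np.random.default_rng(0)
x, w_col = rng.standard_normal(512), rng.standard_normal(512)
assert np.isclose(coop_dot(x, w_col), np.dot(x, w_col))
```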
- const int n = mul_8(n8); - const int n4 = mul_2(n8); - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); - - const int group_size = mul_4(K4_per_group); - - if (n >= output_sizes.x) { - return; - } - - FPOutTile out_tile; - initialize(out_tile); - - FPInputTile in_tile; - Int4WeightTile int4_weight_tile; - - FPPerOutChannelParams weight_scales_tile; - FPPerOutChannelParams weight_zeros_tile; - weight_zeros_tile.data[0] = VEC4_T(0.0); - weight_zeros_tile.data[1] = VEC4_T(0.0); - - // initialize the group index to a value larger than the largest possible - int cur_group_idx = input_sizes.x; - - for (int k4 = lid; k4 < div_up_4(input_sizes.x); k4 += WGS) { - const int group_idx = k4 / K4_per_group; - - // Only update the scales/zeros if the current iteration is now working on a - // new quantization group. - if (group_idx != cur_group_idx) { - load_weight_scales_tile_for_group(weight_scales_tile, n4, group_idx, N4); - cur_group_idx = group_idx; - } - - load_input_tile_no_checks(in_tile, k4, 0, K4, 1); - load_int4_weight_tile(int4_weight_tile, k4, n8, K4); - - fp_accumulate_with_int4_weight( - out_tile, - in_tile, - int4_weight_tile, - weight_scales_tile, - weight_zeros_tile); - } - - partial_sums[lid] = out_tile; - - memoryBarrierShared(); - barrier(); - - // Tree reduction to compute the overall result. - for (int i = WGS / 2; i > 0; i /= 2) { - if (lid < i) { - accumulate_out_tile_with_out_tile( - partial_sums[lid], partial_sums[lid + i]); - } - memoryBarrierShared(); - barrier(); - } - - // Only the first thread will write out result - if (lid == 0) { - out_tile = partial_sums[0]; - write_output_tile_with_checks(out_tile, n4, 0, N4, 1); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml deleted file mode 100644 index bb5f44d4086..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q4gsw_coop: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - TILE_M: 1 - TILE_K4: 1 - TILE_N8: 1 - WGS: 64 - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: linear_q4gsw_coop_texture3d_texture2d - - NAME: linear_q4gsw_coop_texture3d_buffer - WEIGHT_STORAGE: buffer - - NAME: linear_q4gsw_coop_buffer_texture2d - IO_STORAGE: buffer - - NAME: linear_q4gsw_coop_buffer_buffer - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl deleted file mode 100644 index 0ad91643219..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} - -$if IO_STORAGE == "buffer": - #define OUTPUT_BUFFER - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_N8 ${TILE_N8} - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N8 * 2} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N8 * 8} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "0")} -${layout_declare_spec_const(C, "int", "K4_per_group", "0")} - -#include "linear_fp_input_tile_load.glslh" -#include "linear_int4_weight_tile_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_bias_load.glslh" -#include "linear_fp_output_tile_fp_int4_compute.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "linear_fp_output_tile_store.glslh" - -void main() { - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = out_tile_x * TILE_N; - const int m = out_tile_y * TILE_M; - - const int n8 = div_8(n); - const int n4 = div_4(n); - const int m4 = div_4(m); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const int M = input_sizes.y; - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); // number of texels in each row - const int N8 = div_up_8(output_sizes.x); // number of texels in each row - - bool should_print = (n8 == 0) && (m4 == 0); - should_print = false; - - // VEC4_T out_texels[4][2]; - FPOutTile out_tile; - initialize(out_tile); - - FPInputTile in_tile; - Int4WeightTile int4_weight_tile; - - FPPerOutChannelParams weight_scales_tile; - FPPerOutChannelParams weight_zeros_tile; - weight_zeros_tile.data[0] = VEC4_T(0.0); - weight_zeros_tile.data[1] = VEC4_T(0.0); - - const int num_groups = K4 / K4_per_group; - - for (int group_i = 0; group_i < num_groups; ++group_i) { - // Load quantization scales and zeros for the current group - load_weight_scales_tile_for_group(weight_scales_tile, n4, group_i, N4); - - for (int k4_inner = 0; k4_inner < K4_per_group; k4_inner++) { - const int k4 = group_i * K4_per_group + k4_inner; - - load_input_tile_no_checks(in_tile, k4, m, K4, M); - load_int4_weight_tile(int4_weight_tile, k4, n8, K4); - - fp_accumulate_with_int4_weight( - out_tile, - in_tile, - int4_weight_tile, - weight_scales_tile, - weight_zeros_tile); - } - } - - write_output_tile_with_checks(out_tile, n4, m, N4, M); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.yaml deleted file mode 100644 index 5a6bcb711bb..00000000000 --- 
a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q4gsw_tiled: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_K4: 1 - TILE_N8: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: linear_q4gsw_tiled_texture3d_texture2d - - NAME: linear_q4gsw_tiled_texture3d_buffer - WEIGHT_STORAGE: buffer - - NAME: linear_q4gsw_tiled_buffer_texture2d - IO_STORAGE: buffer - WEIGHT_STORAGE: texture2d - - NAME: linear_q4gsw_tiled_buffer_buffer - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.glsl deleted file mode 100644 index b6d932f0015..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.glsl +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} - -$if IO_STORAGE == "buffer": - #define OUTPUT_BUFFER - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "uint", "apply_bias", "0")} - -#include "linear_fp_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_fp_weight_tile.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "linear_fp_output_tile_fp_int8_compute.glslh" -#include "linear_fp_output_tile_store.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_bias_load.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = out_tile_x * TILE_N; - const int m = out_tile_y * TILE_M; - - const int n4 = div_4(n); - const int m4 = div_4(m); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const int M = input_sizes.y; - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); - - FPOutTile out_tile; - 
initialize(out_tile); - - FPInputTile in_tile; - Int8WeightTile int8_weight_tile; - - const bool dont_check_bounds = (M - m) >= TILE_M; - if (dont_check_bounds) { - for (int k4 = 0; k4 < K4; k4 += TILE_K4) { - load_input_tile_no_checks(in_tile, k4, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } else { - for (int k4 = 0; k4 < K4; k4 += TILE_K4) { - load_input_tile_with_checks(in_tile, k4, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, n4); - - apply_scales_and_biases(out_tile, weight_scales_tile, bias_tile); - } - else { - apply_scales(out_tile, weight_scales_tile); - } - - if (dont_check_bounds) { - write_output_tile_no_checks(out_tile, n4, m, N4, M); - } else { - write_output_tile_with_checks(out_tile, n4, m, N4, M); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.yaml deleted file mode 100644 index 242c4471b3d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q8csw_tiled: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: linear_q8csw_tiled_texture3d_texture2d - - NAME: linear_q8csw_tiled_texture3d_buffer - WEIGHT_STORAGE: buffer - - NAME: linear_q8csw_tiled_buffer_texture2d - IO_STORAGE: buffer - WEIGHT_STORAGE: texture2d - - NAME: linear_q8csw_tiled_buffer_buffer - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.glsl deleted file mode 100644 index 9f7e00e3317..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.glsl +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T int - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if PACKED_INT8_INPUT_STORAGE == "buffer": - #define PACKED_INT8_INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_spec_const(C, "int", "apply_bias", "0")} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(push_constant) uniform restrict Block { - float input_scale; - int input_zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "linear_int8_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_fp_output_tile_int8_int8_compute.glslh" -#include "linear_fp_output_tile_store.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_int_weight_sums_load.glslh" -#include "linear_fp_bias_load.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = out_tile_x * TILE_N; - const int m = out_tile_y * TILE_M; - - const int n4 = div_4(n); - const int m4 = div_4(m); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const int M = output_sizes.y; - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); - - Int32Accum out_accum; - initialize(out_accum); - - Int8InputTile int8_in_tile; - Int8WeightTile int8_weight_tile; - - // No checks are needed since packed input and weight are structured in units - // of 4x4 blocks. 
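Since the weights are quantized symmetrically per output channel while the activations carry a zero point, the int32 accumulator can be converted to a floating-point output with the help of the per-channel weight sums: sum(x * w) = input_scale * w_scale * (sum(q_x * q_w) - input_zp * sum(q_w)). The helper `accumulate_out_tile_with_int_accum` used below is defined in an included header that is not part of this diff, but it presumably applies this standard identity, which is also why `t_weight_sums` is precomputed rather than re-summed at inference time. The sketch below only checks the identity numerically with arbitrary values.

```python
import numpy as np

rng = np.random.default_rng(0)
K = 64
input_scale, input_zp = 0.05, 3   # per-tensor activation quantization (example values)
w_scale = 0.02                    # per-output-channel weight scale (one channel shown)

q_x = rng.integers(-128, 128, size=K)   # int8 activations, quantized with a zero point
q_w = rng.integers(-128, 128, size=K)   # int8 weights, symmetric (no zero point)

int_accum = int(np.dot(q_x, q_w))       # what the shader accumulates in int32
weight_sum = int(q_w.sum())             # what t_weight_sums stores for this channel

reference = np.dot(input_scale * (q_x - input_zp), w_scale * q_w)
corrected = input_scale * w_scale * (int_accum - input_zp * weight_sum)
assert np.isclose(reference, corrected)
```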
- for (int k4 = 0; k4 < K4; k4 += TILE_K4) { - load_int8_input_tile(int8_in_tile, k4, m4, K4); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - - int_accumulate_with_int8_weight(out_accum, int8_in_tile, int8_weight_tile); - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - IntPerOutChannelParams weight_sums_tile; - load_weight_sums_tile(weight_sums_tile, n4); - - FPOutTile out_tile; - initialize(out_tile); - - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, n4); - - accumulate_out_tile_with_int_accum( - out_tile, - out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile, - bias_tile); - } - else { - accumulate_out_tile_with_int_accum( - out_tile, - out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile); - } - - if (M - m >= TILE_M) { - write_output_tile_no_checks(out_tile, n4, m, N4, M); - } else { - write_output_tile_with_checks(out_tile, n4, m, N4, M); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml deleted file mode 100644 index aa1de3077fc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q8ta_q8csw_tiled: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - PACKED_INT8_INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: linear_q8ta_q8csw_tiled_texture3d_buffer_texture2d - - NAME: linear_q8ta_q8csw_tiled_buffer_buffer_texture2d - OUTPUT_STORAGE: buffer - PACKED_INT8_INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl deleted file mode 100644 index 4dd83f0d4ed..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define FLOAT_T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions(DTYPE)} -$if STORAGE == "buffer": - ${define_required_extensions("int8")} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_qmat2", "int8", STORAGE)} -${layout_declare_tensor(3, "r", "t_scales", DTYPE, STORAGE)} - -$if STORAGE == "buffer": - layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 out_strides; - ivec4 mat1_sizes; - ivec4 mat1_strides; - ivec4 qmat2_strides; - ivec4 scales_strides; - int out_numel; - }; -$else: - layout(push_constant) uniform restrict Block { - ivec3 out_limits; - ivec4 mat1_sizes; - }; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// This header file must be defined after the layout descriptors have been -// declared because the functions in the header assume some variables have been -// declared as layout descriptors. - -#ifdef USING_BUFFER - -#ifndef FLOAT_T -#define FLOAT_T float -#endif - -void main() { - const int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides); - - const FLOAT_T scale = t_scales[out_tidx.x]; - - FLOAT_T outval = FLOAT_T(0.0); - - int mat1_offset = out_tidx.y * mat1_strides.y + out_tidx.z * qmat2_strides.z; - int qmat2_offset = out_tidx.x; - - // TODO(ssjia): optimize memory access pattern by traversing mat1 x in inner loop - for (int i = 0; i < mat1_sizes.x; i++) { - const FLOAT_T mat1_val = t_mat1[mat1_offset]; - const FLOAT_T mat2_val = FLOAT_T(t_qmat2[qmat2_offset]); - - outval += mat1_val * mat2_val; - - mat1_offset++; - qmat2_offset += qmat2_strides.y; - } - - t_out[out_bufi] = outval * scale; -} - -#else // USING_TEXTURE - -void main() { - const ivec2 out_pos = ivec2( - gl_GlobalInvocationID.x % out_limits.x, - gl_GlobalInvocationID.x / out_limits.x); - - if (out_pos.y >= out_limits.y) { - return; - } - - const int qmat2_pos_x = out_pos.x; - - VEC4_T outtex = VEC4_T(0); - - const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0)); - - VEC4_T mat1_tex; - VEC4_T mat2_tex[4]; - for ( - int i = 0, x = 0; - i < mat1_sizes.x; - i += 4, x++) - { - mat1_tex = load_texel(t_mat1, ivec3(x, out_pos.y, 0)); - - mat2_tex[0] = load_texel(t_qmat2, ivec3(out_pos.x, i, 0)); - mat2_tex[1] = load_texel(t_qmat2, ivec3(out_pos.x, i + 1, 0)); - mat2_tex[2] = load_texel(t_qmat2, ivec3(out_pos.x, i + 2, 0)); - mat2_tex[3] = load_texel(t_qmat2, ivec3(out_pos.x, i + 3, 0)); - - outtex += mat1_tex.x * mat2_tex[0] + mat1_tex.y * mat2_tex[1] + mat1_tex.z * mat2_tex[2] + mat1_tex.w * mat2_tex[3]; - } - - outtex *= scales; - write_texel(t_out, ivec3(out_pos, 0), outtex); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.yaml deleted file mode 100644 index 800007406f0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -linear_qcsnw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MAT1_PACKING: W_packed - MAT2_PACKING: W_packed - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - STORAGE: - - VALUE: texture3d - - VALUE: buffer - shader_variants: - - NAME: linear_qcs8w_W_packed_W_packed - - NAME: linear_qcs8w_W_packed_H_packed - MAT2_PACKING: H_packed diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl deleted file mode 100644 index c766a3cd7d0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} -#define TILE_TXCOLS ${TILE_TXCOLS} - -#define NGROUPS 8 -#define NWORKERS 8 - -${define_required_extensions(DTYPE)} - -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("int8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)} -$if QUANT_NBITS == 4: - ${layout_declare_tensor(B, "r", "t_weight", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -$else: - ${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 weight_sizes; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -shared VEC4_T partial_sums[NGROUPS][NWORKERS][TILE_ROWS][TILE_TXCOLS]; - -void main() { - // txcol stands for "texel column". One txcol corresponds to 4 scalar columns. 
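The index math at the top of `main()` in these shaders maps the flat invocation index to a (texel column, row) tile origin: one texel column covers 4 scalar output columns, and `divup` determines how many column tiles each output row needs. A small sketch of that mapping for the default TILE_TXCOLS=1, TILE_ROWS=4 configuration (function and parameter names are illustrative):

```python
def divup(a, b):
    return (a + b - 1) // b

def decode_invocation(gid_x, out_width, tile_txcols=1, tile_rows=4):
    # Mirror of the index math at the top of the qcsnw shaders: map a flat
    # invocation index to the (texel column, row) origin of its output tile.
    global_wg_x = divup(out_width, 4 * tile_txcols)   # column tiles per output row
    out_txcol = (gid_x % global_wg_x) * tile_txcols   # 1 txcol == 4 scalar columns
    out_row = (gid_x // global_wg_x) * tile_rows
    return out_txcol, out_row

print(decode_invocation(0, out_width=40))   # (0, 0)
print(decode_invocation(9, out_width=40))   # (9, 0) -> scalar columns 36..39
print(decode_invocation(10, out_width=40))  # (0, 4) -> wraps to the next row tile
```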
- $if TILE_TXCOLS > 1: - const uint global_wg_x = uint(divup(out_sizes.x, 4 * TILE_TXCOLS)); - const uint out_txcol = uint( - (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS); - $else: - const uint global_wg_x = uint(divup4(out_sizes.x)); - const uint out_txcol = uint(gl_GlobalInvocationID.x % global_wg_x); - - const uint out_row = uint( - (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS); - - $if QUANT_NBITS == 4: - const uint weight_txcol = uint(out_txcol / 2); - - const int gid = int(gl_LocalInvocationID.x); // group id - const int wid = int(gl_LocalInvocationID.z); // worker id - - if (out_row >= out_sizes.y) { - return; - } - - VEC4_T mat1[TILE_ROWS]; - VEC4_T qmat2[4][TILE_TXCOLS]; - VEC4_T local_sums[TILE_ROWS][TILE_TXCOLS]; - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - local_sums[r][${c}] = VEC4_T(0.0); - } - - VEC4_T scales[TILE_TXCOLS]; - $for c in range(TILE_TXCOLS): - $if SCALES_STORAGE == "buffer": - scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]); - $else: - scales[${c}] = VEC4_T( - texelFetch(t_scales, ivec2(out_txcol + ${c}, 0), 0)); - - for (int pos = (4 * wid), txpos = wid; - pos < in_sizes.x; - pos += (4 * NWORKERS), txpos += NWORKERS) { - $if WEIGHT_STORAGE == "buffer": - uint qmat2_bufi; - uint weight_row_txstride = div4(weight_sizes.x); - - // Preload weight tensor - [[unroll]] for (int r = 0; r < 4; r++) { - $if QUANT_NBITS == 4: - $for c in range(0, TILE_TXCOLS, 2): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol; - const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}] - $else: - const uvec4 packed_weight_tex = texelFetch( - t_weight, ivec2(weight_txcol + ${c}, pos + r), 0); - - qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0); - qmat2[r][${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0); - $else: - $for c in range(TILE_TXCOLS): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol; - qmat2[r][${c}] = t_weight[qmat2_bufi + ${c}]; - $else: - qmat2[r][${c}] = VEC4_T( - texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0)); - } - - $if IN_STORAGE == "buffer": - uint in_row_txstride = div4(in_sizes.x); - - // Preload input tensor - [[unroll]] for (int i = 0; i < TILE_ROWS; i++) { - $if IN_STORAGE == "buffer": - mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos]; - $else: - mat1[i] = VEC4_T( - texelFetch(t_in, ivec3(txpos, out_row + i, 0), 0)); - } - - // Accumulate partial output - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - local_sums[r][${c}] += mat1[r].x * qmat2[0][${c}] + - mat1[r].y * qmat2[1][${c}] + - mat1[r].z * qmat2[2][${c}] + - mat1[r].w * qmat2[3][${c}]; - } - } - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - partial_sums[gid][wid][r][${c}] = local_sums[r][${c}]; - } - - memoryBarrierShared(); - barrier(); - - if (wid != 0) { - return; - } - - VEC4_T sums[TILE_ROWS][TILE_TXCOLS]; - - for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] = VEC4_T(0.0); - - [[unroll]] for (int worker = 0; worker < NWORKERS; ++worker) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] += partial_sums[gid][worker][r][${c}]; - } - } - - $if OUT_STORAGE == "buffer": - uint out_bufi; - uint out_row_txstride = div4(out_sizes.x); - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - $if OUT_STORAGE == "buffer": - if (out_row + r < out_sizes.y) { - out_bufi = (out_row + r) 
* out_row_txstride + out_txcol; - t_out[out_bufi + ${c}] = sums[r][${c}] * scales[${c}]; - } - $else: - imageStore( - t_out, - ivec3(out_txcol + ${c}, out_row + r, 0), - sums[r][${c}] * scales[${c}]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml deleted file mode 100644 index 3dff6855142..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qcsnw_coop: - parameter_names_with_default_values: - DTYPE: float - IN_STORAGE: texture3d - OUT_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - SCALES_STORAGE: texture2d - TILE_ROWS: 4 - TILE_TXCOLS: 1 - QUANT_NBITS: 8 - generate_variant_forall: - TILE_ROWS: - - VALUE: 1 - SUFFIX: o4x1 - shader_variants: - - NAME: linear_qcs8w_coop_texture3d_texture3d_texture2d_texture2d_float - - NAME: linear_qcs8w_coop_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - - NAME: linear_qcs8w_coop_buffer_buffer_buffer_buffer_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer - SCALES_STORAGE: buffer - - NAME: linear_qcs4w_coop_texture3d_texture3d_texture2d_texture2d_float - TILE_TXCOLS: 2 - QUANT_NBITS: 4 - - NAME: linear_qcs4w_coop_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - TILE_TXCOLS: 2 - QUANT_NBITS: 4 diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl deleted file mode 100644 index f6f05aab7ca..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} -#define TILE_TXCOLS ${TILE_TXCOLS} - -${define_required_extensions(DTYPE)} - -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("int8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)} -$if QUANT_NBITS == 4: - ${layout_declare_tensor(B, "r", "t_weight", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -$else: - ${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)} - - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 weight_sizes; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - -void main() { - // txcol stands for "texel column". One txcol corresponds to 4 scalar columns. 
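When QUANT_NBITS is 4, the cooperative shader above and the tiled shader below store two weights per byte and recover them with `((b & 0xF0) >> 4) - 8` and `(b & 0x0F) - 8`, i.e. unsigned nibbles 0..15 are shifted down to the signed range -8..7. A small sketch of that unpacking (illustration only):

```python
def unpack_q4_byte(b):
    # Mirror of the 4-bit unpacking in the qcs4w shader variants: the high
    # nibble feeds the even texel column, the low nibble the odd one.
    hi = ((b & 0xF0) >> 4) - 8
    lo = (b & 0x0F) - 8
    return hi, lo

print(unpack_q4_byte(0x00))  # (-8, -8)
print(unpack_q4_byte(0xFF))  # (7, 7)
print(unpack_q4_byte(0x8A))  # (0, 2)
```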
- $if TILE_TXCOLS > 1: - const uint16_t global_wg_x = uint16_t(divup(out_sizes.x, 4 * TILE_TXCOLS)); - const uint16_t out_txcol = uint16_t( - (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS); - $else: - const uint16_t global_wg_x = uint16_t(divup4(out_sizes.x)); - const uint16_t out_txcol = uint16_t(gl_GlobalInvocationID.x % global_wg_x); - - const uint16_t out_row = uint16_t( - (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS); - - $if QUANT_NBITS == 4: - const uint16_t weight_txcol = uint16_t(out_txcol / 2); - - if (out_row >= uint16_t(out_sizes.y)) { - return; - } - - VEC4_T mat1[TILE_ROWS]; - VEC4_T qmat2[4][TILE_TXCOLS]; - VEC4_T sums[TILE_ROWS][TILE_TXCOLS]; - - VEC4_T scales[TILE_TXCOLS]; - $for c in range(TILE_TXCOLS): - $if SCALES_STORAGE == "buffer": - scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]); - $else: - scales[${c}] = VEC4_T( - texelFetch(t_scales, u16vec2(out_txcol + ${c}, 0), 0)); - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] = VEC4_T(0.0); - } - - for (uint16_t pos = uint16_t(0), txpos = uint16_t(0); - pos < uint16_t(in_sizes.x); - pos += uint16_t(4), txpos += uint16_t(1)) { - $if WEIGHT_STORAGE == "buffer": - uint qmat2_bufi; - uint weight_row_txstride = div4(weight_sizes.x); - - // Preload weight tensor - [[unroll]] for (int r = 0; r < 4; r++) { - $if QUANT_NBITS == 4: - $for c in range(0, TILE_TXCOLS, 2): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol; - const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}] - $else: - const uvec4 packed_weight_tex = texelFetch( - t_weight, u16vec2(weight_txcol + ${c}, pos + r), 0); - - qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0); - qmat2[r][${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0); - $else: - $for c in range(TILE_TXCOLS): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol; - qmat2[r][${c}] = t_weight[qmat2_bufi + ${c}]; - $else: - qmat2[r][${c}] = VEC4_T( - texelFetch(t_weight, u16vec2(out_txcol + ${c}, pos + r), 0)); - } - - $if IN_STORAGE == "buffer": - uint in_row_txstride = div4(in_sizes.x); - - // Preload input tensor - [[unroll]] for (int i = 0; i < TILE_ROWS; i++) { - $if IN_STORAGE == "buffer": - mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos]; - $else: - mat1[i] = VEC4_T( - texelFetch(t_in, u16vec3(txpos, out_row + i, 0), 0)); - } - - // Accumulate output - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] += mat1[r].x * qmat2[0][${c}] + - mat1[r].y * qmat2[1][${c}] + - mat1[r].z * qmat2[2][${c}] + - mat1[r].w * qmat2[3][${c}]; - } - } - - // Store to output tensor - $if OUT_STORAGE == "buffer": - uint out_bufi; - uint out_row_txstride = div4(out_sizes.x); - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - $if OUT_STORAGE == "buffer": - if (out_row + r < out_sizes.y) { - out_bufi = (out_row + r) * out_row_txstride + out_txcol; - t_out[out_bufi + ${c}] = sums[r][${c}] * scales[${c}]; - } - $else: - imageStore( - t_out, - ivec3(out_txcol + ${c}, out_row + r, 0), - sums[r][${c}] * scales[${c}]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml deleted file mode 100644 index 1c9ec4e524a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. 
and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qcsnw_tiled: - parameter_names_with_default_values: - DTYPE: float - IN_STORAGE: texture3d - OUT_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - SCALES_STORAGE: texture2d - TILE_ROWS: 4 - TILE_TXCOLS: 1 - QUANT_NBITS: 8 - generate_variant_forall: - TILE_ROWS: - - VALUE: 1 - SUFFIX: o4x1 - - VALUE: 2 - SUFFIX: o4x2 - - VALUE: 4 - SUFFIX: o4x4 - shader_variants: - - NAME: linear_qcs8w_tiled_texture3d_texture3d_texture2d_texture2d_float - - NAME: linear_qcs8w_tiled_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - - NAME: linear_qcs8w_tiled_buffer_buffer_buffer_buffer_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer - SCALES_STORAGE: buffer - - NAME: linear_qcs4w_tiled_texture3d_texture3d_texture2d_texture2d_float - TILE_TXCOLS: 2 - QUANT_NBITS: 4 - - NAME: linear_qcs4w_tiled_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - TILE_TXCOLS: 2 - QUANT_NBITS: 4 diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.glsl deleted file mode 100644 index 150efeef1ad..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.glsl +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} - -#define WGS ${WGS} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} - -layout(push_constant) uniform restrict Block { - ivec4 output_sizes; - ivec4 input_sizes; - ivec4 weight_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -shared VEC4_T partial_sums[WGS][2]; - -$if IO_STORAGE == "buffer": - #define BUFFER_IO -$if WEIGHT_STORAGE == "buffer": - #define BUFFER_WEIGHT - -#include "qlinear_utils.glslh" - -void main() { - const uint lid = gl_LocalInvocationID.x; - const uint n8 = gl_GlobalInvocationID.y; - // The output tensor will have a shape of [n, 1, 1, 1]. Each thread computes - // 8 output elements, so each thread will write to 8 elements starting at the - // tensor index (gid.x * 8, 0, 0, 0). - const uint n = MUL_8(n8); - const uint K4 = DIV_UP_4(input_sizes.x); - - if (n >= output_sizes.x) { - return; - } - - VEC4_T out_texels[2]; - out_texels[0] = VEC4_T(0); - out_texels[1] = VEC4_T(0); - - // initialize the group index to a value larger than the largest possible - uint cur_group_idx = input_sizes.x; - - // Each thread in the work group accumulates a partial result. 
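In the loop that follows, the per-group quantization parameters are read from `t_qparams`, whose 4-wide texels interleave (scale, zero) pairs for two adjacent output channels; the `.xz`/`.yw` swizzles then de-interleave four texels into scale and zero vectors covering 8 channels. The sketch below mirrors that decoding; the flattened buffer layout is inferred from the `qparams_bufi` index math, and the helper name is illustrative.

```python
import numpy as np

def load_group_qparams(qparams, n, group_idx, N):
    # Decode (scale, zero) for 8 adjacent output channels of one quantization
    # group. qparams is assumed flattened to [num_groups * N/2, 4] texels, each
    # texel holding (scale_c, zero_c, scale_c+1, zero_c+1) for two channels.
    base = group_idx * (N // 2) + n // 2
    texels = qparams[base:base + 4]           # 4 texels -> 8 output channels
    scales = texels[:, [0, 2]].reshape(-1)    # the .xz components
    zeros = texels[:, [1, 3]].reshape(-1)     # the .yw components
    return scales, zeros

# Tiny example: 8 output channels, 1 group; channel c has scale c+1 and zero -c.
N = 8
qparams = np.array([[c + 1, -c, c + 2, -(c + 1)] for c in range(0, N, 2)], dtype=float)
scales, zeros = load_group_qparams(qparams, n=0, group_idx=0, N=N)
print(scales)  # [1. 2. 3. 4. 5. 6. 7. 8.]
print(zeros)   # [ 0. -1. -2. -3. -4. -5. -6. -7.]
```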
- for (uint k4 = lid; k4 < DIV_UP_4(input_sizes.x); k4 += WGS) { - const uint k = MUL_4(k4); - const uint group_idx = k / group_size; - - VEC4_T scales[2]; - VEC4_T zeros[2]; - - // Only update the scales/zeros if the current iteration is now working on a - // new quantization group. - if (group_idx != cur_group_idx) { - // The qparams tensor contains the quantization scales and zeros, with - // shape [2, N, K / group_size, 1]. - // Loading a texel from the qparams tensor will return 2 scales and 2 - // zeros for 2 adjacent output channels. - uint qparams_bufi = group_idx * DIV_2(output_sizes.x) + DIV_2(n); - VEC4_T scales_zeros_texels[4]; - $for comp in range(4): - scales_zeros_texels[${comp}] = t_qparams[qparams_bufi++]; - - scales[0] = VEC4_T(scales_zeros_texels[0].xz, scales_zeros_texels[1].xz); - zeros[0] = VEC4_T(scales_zeros_texels[0].yw, scales_zeros_texels[1].yw); - - scales[1] = VEC4_T(scales_zeros_texels[2].xz, scales_zeros_texels[3].xz); - zeros[1] = VEC4_T(scales_zeros_texels[2].yw, scales_zeros_texels[3].yw); - - cur_group_idx = group_idx; - } - // The input tensor will have a shape of [K, 1, 1, 1]; in each iteration, - // load 4 elements starting from the tensor index (k, 0, 0, 0). - VEC4_T in_texel = load_input_texel_1d(k4); - // Extract each element of the in_texel into a separate vectorized variable; - // these are used to "broadcast" the input values in subsequent fma calls. - VEC4_T in_texel_val[4]; - $for comp in range(4): - in_texel_val[${comp}] = VEC4_T(in_texel[${comp}]); - - uvec4 packed_weight_block = load_transposed_weight_block(k4, n8, K4); - - VEC4_T weight_texels[2]; - $for comp in range(4): - { - weight_texels[0].x = extract_4bit_from_transposed_block(packed_weight_block, 0, ${comp}); - weight_texels[0].y = extract_4bit_from_transposed_block(packed_weight_block, 1, ${comp}); - weight_texels[0].z = extract_4bit_from_transposed_block(packed_weight_block, 2, ${comp}); - weight_texels[0].w = extract_4bit_from_transposed_block(packed_weight_block, 3, ${comp}); - - weight_texels[1].x = extract_4bit_from_transposed_block(packed_weight_block, 4, ${comp}); - weight_texels[1].y = extract_4bit_from_transposed_block(packed_weight_block, 5, ${comp}); - weight_texels[1].z = extract_4bit_from_transposed_block(packed_weight_block, 6, ${comp}); - weight_texels[1].w = extract_4bit_from_transposed_block(packed_weight_block, 7, ${comp}); - - weight_texels[0] = fma(weight_texels[0], scales[0], zeros[0]); - weight_texels[1] = fma(weight_texels[1], scales[1], zeros[1]); - - out_texels[0] = fma(in_texel_val[${comp}], weight_texels[0], out_texels[0]); - out_texels[1] = fma(in_texel_val[${comp}], weight_texels[1], out_texels[1]); - } - } - - partial_sums[lid][0] = out_texels[0]; - partial_sums[lid][1] = out_texels[1]; - - memoryBarrierShared(); - barrier(); - - // Tree reduction to compute the overall result. 
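The reduction that follows repeatedly folds the upper half of the shared-memory partial sums into the lower half, so the combined result is ready in the first slot after log2(WGS) steps; this assumes the work-group size is a power of two, which holds for the default WGS of 64. A minimal sketch of the pattern:

```python
def tree_reduce(partials):
    # Mirror of the shared-memory reduction in the co-op shaders: each step,
    # the first half of the slots absorbs the second half, halving the count.
    vals = list(partials)
    i = len(vals) // 2
    while i > 0:
        for lid in range(i):
            vals[lid] += vals[lid + i]
        i //= 2
    return vals[0]

print(tree_reduce([1, 2, 3, 4, 5, 6, 7, 8]))  # 36, after log2(8) = 3 folding steps
```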
- for (int i = WGS / 2; i > 0; i /= 2) { - if (lid < i) { - partial_sums[lid][0] = partial_sums[lid][0] + partial_sums[lid + i][0]; - partial_sums[lid][1] = partial_sums[lid][1] + partial_sums[lid + i][1]; - } - memoryBarrierShared(); - barrier(); - } - - // Only the first thread will write out result - if (lid == 0) { - out_texels[0] = partial_sums[0][0]; - out_texels[1] = partial_sums[0][1]; - - uint n4 = DIV_4(n); - write_output_texel_1d(out_texels[0], n4); - if (n + 4 < output_sizes.x) { - write_output_texel_1d(out_texels[1], n4 + 1); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.yaml deleted file mode 100644 index 04e803a2e94..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qga4w_coop: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - WGS: 64 - shader_variants: - - NAME: linear_qga4w_coop_texture3d_texture3d_texture2d_float - - NAME: linear_qga4w_coop_buffer_buffer_texture2d_float - IO_STORAGE: buffer - - NAME: linear_qga4w_coop_buffer_buffer_buffer_float - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.glsl deleted file mode 100644 index 97327ea5818..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.glsl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} - -layout(push_constant) uniform restrict Block { - ivec4 output_sizes; - ivec4 input_sizes; - ivec4 weight_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -$if IO_STORAGE == "buffer": - #define BUFFER_IO -$if WEIGHT_STORAGE == "buffer": - #define BUFFER_WEIGHT - -#include "qlinear_utils.glslh" - -void main() { - // Each thread writes out a 8 wide x 4 high tile of output values - const uint n8 = gl_GlobalInvocationID.x; - const uint m4 = gl_GlobalInvocationID.y; - - const uint n = MUL_8(n8); // output col idx - const uint m = MUL_4(m4); // output row idx - const uint n4 = MUL_2(n8); // output col texel idx - - const uint group_num = input_sizes.x / group_size; - const uint group_ntexels = DIV_UP_4(group_size); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const uint K4 = DIV_UP_4(input_sizes.x); - const uint N4 = DIV_UP_4(output_sizes.x); // number of texels in each row - - VEC4_T out_texels[4][2]; - // Initialize to 0 - $for row_i in range(4): - $for col_i in range(2): - out_texels[${row_i}][${col_i}] = VEC4_T(0.00); - - for (uint group_i = 0; group_i < group_num; ++group_i) { - // Load quantization scales and zeros for the current group - VEC4_T scales[2]; - VEC4_T zeros[2]; - { - uint qparams_bufi = group_i * DIV_2(output_sizes.x) + DIV_2(n); - - VEC4_T scales_zeros_texels[4]; - $for comp in range(4): - scales_zeros_texels[${comp}] = t_qparams[qparams_bufi++]; - - scales[0] = VEC4_T(scales_zeros_texels[0].xz, scales_zeros_texels[1].xz); - zeros[0] = VEC4_T(scales_zeros_texels[0].yw, scales_zeros_texels[1].yw); - - scales[1] = VEC4_T(scales_zeros_texels[2].xz, scales_zeros_texels[3].xz); - zeros[1] = VEC4_T(scales_zeros_texels[2].yw, scales_zeros_texels[3].yw); - } - - for (uint inner_k4 = 0; inner_k4 < group_ntexels; inner_k4++) { - const uint k4 = group_i * group_ntexels + inner_k4; - - // Load 4x4 block of the input tensor, with the top left corner of the - // block at (k, m) - VEC4_T in_texels[4]; - $for comp in range(4): - in_texels[${comp}] = load_input_texel_2d(k4, m + ${comp}, K4); - - uvec4 packed_weight_block = load_transposed_weight_block(k4, n8, K4); - - VEC4_T weight_texels[2]; - $for tile_k in range(4): - // Process weight row k + comp - { - // Weight columns n + 0, 1, 2, 3 - weight_texels[0].x = extract_4bit_from_transposed_block(packed_weight_block, 0, ${tile_k}); - weight_texels[0].y = extract_4bit_from_transposed_block(packed_weight_block, 1, ${tile_k}); - weight_texels[0].z = extract_4bit_from_transposed_block(packed_weight_block, 2, ${tile_k}); - weight_texels[0].w = extract_4bit_from_transposed_block(packed_weight_block, 3, ${tile_k}); - - // Weight colums n + 4, 5, 6, 7 - weight_texels[1].x = extract_4bit_from_transposed_block(packed_weight_block, 4, ${tile_k}); - weight_texels[1].y = 
extract_4bit_from_transposed_block(packed_weight_block, 5, ${tile_k}); - weight_texels[1].z = extract_4bit_from_transposed_block(packed_weight_block, 6, ${tile_k}); - weight_texels[1].w = extract_4bit_from_transposed_block(packed_weight_block, 7, ${tile_k}); - - weight_texels[0] = fma(weight_texels[0], scales[0], zeros[0]); - weight_texels[1] = fma(weight_texels[1], scales[1], zeros[1]); - - $for tile_m in range(4): - out_texels[${tile_m}][0] = fma(VEC4_T(in_texels[${tile_m}][${tile_k}]), weight_texels[0], out_texels[${tile_m}][0]); - out_texels[${tile_m}][1] = fma(VEC4_T(in_texels[${tile_m}][${tile_k}]), weight_texels[1], out_texels[${tile_m}][1]); - } - } - } - - for (uint row_i = 0; row_i < 4 && m + row_i < output_sizes.y; ++row_i) { - write_output_texel_2d(out_texels[row_i][0], n4, m + row_i, N4); - if (n + 4 < output_sizes.x) { - write_output_texel_2d(out_texels[row_i][1], n4 + 1, m + row_i, N4); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.yaml deleted file mode 100644 index 94d10dcf978..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qga4w_tiled: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - shader_variants: - - NAME: linear_qga4w_tiled_texture3d_texture3d_texture2d_float - - NAME: linear_qga4w_tiled_buffer_buffer_texture2d_float - IO_STORAGE: buffer - - NAME: linear_qga4w_tiled_buffer_buffer_buffer_float - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.glsl deleted file mode 100644 index 174ea1cc9bb..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.glsl +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} - -#define NGROUPS 8 -#define NWORKERS 8 - -${define_required_extensions(DTYPE)} -$if IN_STORAGE == "buffer": - ${define_required_extensions("int8")} -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("uint8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_mat1", "int8", IN_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", "float", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_zeros", "int", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input_scale", "float", PARAMS_STORAGE, is_scalar_array=True)} -${layout_declare_tensor(B, "r", "t_input_zero_point", "int", PARAMS_STORAGE, is_scalar_array=True)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 mat1_sizes; - ivec4 qmat2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -shared vec4 partial_results[NGROUPS][NWORKERS][TILE_ROWS][2]; - -/* - * This shader computes a linear operator between a quantized int8 input matrix - * x and a weights matrix that is quantized to 4 bits, producing a float output. - * - * This shader implements a co-operative algorithm to compute the output. The - * work group size is {NGROUPS, 1, NWORKERS}, and each group of NWORKERS threads - * cooperates to compute TILE_ROWS * 2 output texels. Therefore, - * NGROUPS * TILE_ROWS * 2 output texels are computed across one work group. - * - * The threads co-operate by each thread computing a partial reduction along the - * K dimension. To illustrate the computation, consider a scalar variant of the - * algorithm that computes the dot product of 2 vectors. Also assume that - * NWORKERS is 8. - * - * Thread 1 in each group will compute: - * (mat1[0] * mat2[0]) + (mat1[8] * mat2[8]) + (mat1[16] * mat2[16]) + ... - * - * Thread 2 in each group will compute: - * (mat1[1] * mat2[1]) + (mat1[9] * mat2[9]) + (mat1[17] * mat2[17]) + ... - * - * Thread 3 in each group will compute: - * (mat1[2] * mat2[2]) + (mat1[10] * mat2[10]) + (mat1[18] * mat2[18]) + ... - * - * The partial accumulations are structured such that memory accesses in each - * loop iteration can be coalesced. - * - * Then, at the end, the first thread in each group will accumulate the partial - * accumulations computed by each thread to obtain the final result. - * - * Note that this shader assumes that all tensors are width packed.
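- *
- * For reference, the per-block accumulation below relies on the factorization
- * (writing s_x / x_zp for the per-token input scale and zero point, and
- * s_w / w_zp for the per-group weight scale and zero point; the constant 8
- * re-centers the unsigned 4-bit weight values):
- *
- *   sum_k (x_q[k] - x_zp) * s_x * ((w_q[k] - 8) - w_zp) * s_w
- *     = s_x * s_w * (sum_k (x_q[k] - x_zp) * (w_q[k] - 8)
- *                    - w_zp * sum_k (x_q[k] - x_zp))
- *
- * so the inner loop only needs to track integer dot products (int32_sums) and
- * the sum of the centered inputs (input_sums), while the float scales are
- * applied once per quantization group.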
- */ - -void main() { - const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; - const uint out_col = gl_GlobalInvocationID.x << 3; - const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; - - const uint gid = gl_LocalInvocationID.x; // group id - const uint wid = gl_LocalInvocationID.z; // worker id - - if (out_col >= out_sizes.x || out_row >= out_sizes.y) { - return; - } - - const int num_blocks = mat1_sizes.x / group_size; - - ivec4 mat1_quantized[TILE_ROWS]; - ivec4 qmat2_quantized[4][2]; - vec4 final_result[TILE_ROWS][2]; - - // Initialize accumulators - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - final_result[r][0] = vec4(0.0); - final_result[r][1] = vec4(0.0); - } - - vec4 scales[2]; - vec4 zeros[2]; - - $if WEIGHT_STORAGE == "buffer": - const int qmat2_stride = qmat2_sizes.x >> 2; - $if PARAMS_STORAGE == "buffer": - const int qparams_stride = out_sizes.x >> 2; - - for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { - $if PARAMS_STORAGE == "buffer": - scales[0] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx]; - scales[1] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx + 1]; - - zeros[0] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx]); - zeros[1] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx + 1]); - $else: - scales[0] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx, block_idx, 0), 0); - scales[1] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx + 1, block_idx, 0), 0); - - zeros[0] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx, block_idx, 0), 0)); - zeros[1] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx + 1, block_idx, 0), 0)); - - ivec4 int32_sums[TILE_ROWS][2]; - int input_sums[TILE_ROWS]; - - // Initialize accumulators for this block - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - int32_sums[r][0] = ivec4(0); - int32_sums[r][1] = ivec4(0); - input_sums[r] = 0; - } - - for (int g_idx = 4 * int(wid); g_idx < group_size; g_idx += (4 * NWORKERS)) { - const int k = block_idx * group_size + g_idx; - - // Preload B (weights) - keep as quantized integers - [[unroll]] for (int r = 0; r < 4; ++r) { - $if WEIGHT_STORAGE == "buffer": - const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; - $else: - const uvec4 packed_weight_tex = texelFetch( - t_qmat2, - ivec2(gl_GlobalInvocationID.x, k + r), - 0); - - // Unpack 4-bit weights to integers and subtract zero point (8 for 4-bit) - qmat2_quantized[r][0] = ivec4((packed_weight_tex & 0xF0) >> 4) - 8; - qmat2_quantized[r][1] = ivec4(packed_weight_tex & 0x0F) - 8; - } - - // Preload A (quantized input) - keep as quantized integers - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if IN_STORAGE == "buffer": - mat1_quantized[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2] - t_input_zero_point[int(out_row) + r]; - $else: - mat1_quantized[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0) - t_input_zero_point[int(out_row) + r]; - } - - // Accumulate in integer arithmetic: (input_quantized - input_zero_point) * (weight_quantized - weight_zero_point) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - input_sums[r] += mat1_quantized[r].x + mat1_quantized[r].y + mat1_quantized[r].z + mat1_quantized[r].w; - - int32_sums[r][0] += mat1_quantized[r].x * qmat2_quantized[0][0] - + mat1_quantized[r].y * qmat2_quantized[1][0] - + mat1_quantized[r].z * qmat2_quantized[2][0] - + mat1_quantized[r].w * qmat2_quantized[3][0]; - - int32_sums[r][1] += 
mat1_quantized[r].x * qmat2_quantized[0][1] - + mat1_quantized[r].y * qmat2_quantized[1][1] - + mat1_quantized[r].z * qmat2_quantized[2][1] - + mat1_quantized[r].w * qmat2_quantized[3][1]; - } - } - - // Incorporates this block's results into the final accumulation - // Following proper quantization paradigm: result = input_scale * weight_scale * - // Sum((input_quantized - input_zero) * (weight_quantized - weight_zero)) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - if (out_row + r >= out_sizes.y) { - continue; - } - - float input_scale = t_input_scale[int(out_row) + r]; - float input_sum_scalar = float(input_sums[r]); - - // Apply proper quantization paradigm: input_scale * weight_scale * (accumulator - weight_zero * input_sum) - final_result[r][0] += input_scale * scales[0] * (vec4(int32_sums[r][0]) - zeros[0] * input_sum_scalar); - final_result[r][1] += input_scale * scales[1] * (vec4(int32_sums[r][1]) - zeros[1] * input_sum_scalar); - } - } - - // Store worker results in shared memory - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - partial_results[gid][wid][r][0] = final_result[r][0]; - partial_results[gid][wid][r][1] = final_result[r][1]; - } - - memoryBarrierShared(); - barrier(); - - // Only the first worker in each group accumulates and writes output - if (wid != 0) { - return; - } - - vec4 cooperative_result[TILE_ROWS][2]; - - for (int r = 0; r < TILE_ROWS; ++r) { - cooperative_result[r][0] = vec4(0.0); - cooperative_result[r][1] = vec4(0.0); - [[unroll]] for (int worker = 0; worker < NWORKERS; ++worker) { - cooperative_result[r][0] += partial_results[gid][worker][r][0]; - cooperative_result[r][1] += partial_results[gid][worker][r][1]; - } - } - - // Apply final output quantization - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if OUT_STORAGE == "buffer": - t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = cooperative_result[r][0]; - t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = cooperative_result[r][1]; - $else: - imageStore(t_out, ivec3(out_col_texel_idx, out_row + r, 0), cooperative_result[r][0]); - imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), cooperative_result[r][1]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.yaml deleted file mode 100644 index 9f6db77094a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -linear_qta8a_qga4w_coop: - parameter_names_with_default_values: - DTYPE: float - OUT_STORAGE: texture3d - IN_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - PARAMS_STORAGE: buffer - TILE_ROWS: 1 - shader_variants: - - NAME: linear_qta8a_qga4w_coop_texture3d_texture3d_texture2d_float - - NAME: linear_qta8a_qga4w_coop_buffer_buffer_texture2d_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - - NAME: linear_qta8a_qga4w_coop_buffer_buffer_buffer_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - WEIGHT_STORAGE: buffer - - NAME: linear_qta8a_qga4w_coop_buffer_texture2d_buffer_float - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.glsl deleted file mode 100644 index dbb7da998f4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.glsl +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} - -${define_required_extensions(DTYPE)} -$if IN_STORAGE == "buffer": - ${define_required_extensions("int8")} -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("uint8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_mat1", "int8", IN_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", "float", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_zeros", "int", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input_scale", "float", "buffer", is_scalar_array=True)} -${layout_declare_tensor(B, "r", "t_input_zero_point", "int", "buffer", is_scalar_array=True)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 mat1_sizes; - ivec4 qmat2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -/* - * This shader computes a linear operator between a quantized int8 input matrix - * x and a weights matrix that is quantized to 4 bits, producing a float output. - * - * The (W, H, C) shape of each tensor is: - * - x: (K, M) - quantized int8 input with per-token quantization - * - weights: (N / 2, K) - * - The weights tensor has a data type of `uint8`. Each element in the tensor - * contains 2 4-bit values packed into a uint8. - * - See the pack_int4_linear_weight_transposed_interleave shader to see more - * details on how the weight tensor is stored. - * - qparams: (2, N, number_of_groups) - * - This tensor contains the scales and zeros quantization parameters for the - * weights tensor. The weight tensor is quantized group-wise, which means - * that every `group_size` elements along the K dimension of the weights - * tensor has independent quantization parameters. Along the width dim, the - * first value contains the scale for the group and the second value - * contains the zero point for the group. 
- * - input_scale: (num_tokens,) - per-token scale values for input quantization - * - input_zero_point: (num_tokens,) - per-token zero points for input quantization - * - output: (N, M) - float output - * - * Each thread computes a tile of TILE_ROWS * 2 texels of the output tensor. - * - * Note that this shader assumes that all tensors are width packed. - */ - -void main() { - const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; - const uint out_col = gl_GlobalInvocationID.x << 3; - const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; - - if (out_col >= out_sizes.x || out_row >= out_sizes.y) { - return; - } - - const int num_blocks = mat1_sizes.x / group_size; - - ivec4 mat1_quantized[TILE_ROWS]; - ivec4 qmat2_quantized[4][2]; - vec4 final_result[TILE_ROWS][2]; - - // Initialize accumulators - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - final_result[r][0] = vec4(0.0); - final_result[r][1] = vec4(0.0); - } - - vec4 scales[2]; - vec4 zeros[2]; - - $if WEIGHT_STORAGE == "buffer": - const int qmat2_stride = qmat2_sizes.x >> 2; - $if PARAMS_STORAGE == "buffer": - const int qparams_stride = out_sizes.x >> 2; - - for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { - $if PARAMS_STORAGE == "buffer": - scales[0] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx]; - scales[1] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx + 1]; - - zeros[0] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx]); - zeros[1] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx + 1]); - $else: - scales[0] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx, block_idx, 0), 0); - scales[1] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx + 1, block_idx, 0), 0); - - zeros[0] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx, block_idx, 0), 0)); - zeros[1] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx + 1, block_idx, 0), 0)); - - ivec4 int32_sums[TILE_ROWS][2]; - int input_sums[TILE_ROWS]; - - // Initialize accumulators - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - int32_sums[r][0] = ivec4(0); - int32_sums[r][1] = ivec4(0); - input_sums[r] = 0; - } - - for (int g_idx = 0; g_idx < group_size; g_idx += 4) { - const int k = block_idx * group_size + g_idx; - - // Preload B (weights) - keep as quantized integers - [[unroll]] for (int r = 0; r < 4; ++r) { - $if WEIGHT_STORAGE == "buffer": - const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; - $else: - const uvec4 packed_weight_tex = texelFetch( - t_qmat2, - ivec2(gl_GlobalInvocationID.x, k + r), - 0); - - // Unpack 4-bit weights to integers (subtract 8 as the 4-bit zero point) - qmat2_quantized[r][0] = ivec4((packed_weight_tex & 0xF0) >> 4) - 8; - qmat2_quantized[r][1] = ivec4(packed_weight_tex & 0x0F) - 8; - } - - // Preload A (quantized input) - keep as quantized integers - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if IN_STORAGE == "buffer": - mat1_quantized[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2] - t_input_zero_point[int(out_row) + r]; - $else: - mat1_quantized[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0) - t_input_zero_point[int(out_row) + r]; - } - - // Accumulate in integer arithmetic: (input_quantized - input_zero_point) * (weight_quantized - weight_zero_point) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - input_sums[r] += mat1_quantized[r].x + mat1_quantized[r].y + mat1_quantized[r].z + mat1_quantized[r].w; - - int32_sums[r][0] += mat1_quantized[r].x *
qmat2_quantized[0][0] - + mat1_quantized[r].y * qmat2_quantized[1][0] - + mat1_quantized[r].z * qmat2_quantized[2][0] - + mat1_quantized[r].w * qmat2_quantized[3][0]; - - int32_sums[r][1] += mat1_quantized[r].x * qmat2_quantized[0][1] - + mat1_quantized[r].y * qmat2_quantized[1][1] - + mat1_quantized[r].z * qmat2_quantized[2][1] - + mat1_quantized[r].w * qmat2_quantized[3][1]; - } - } - - // Incorporates this block's results into the final accumulation - // Following proper quantization paradigm: result = input_scale * weight_scale * - // Sum((input_quantized - input_zero) * (weight_quantized - weight_zero)) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - if (out_row + r >= out_sizes.y) { - continue; - } - - float input_scale = t_input_scale[int(out_row) + r]; - float input_sum_scalar = float(input_sums[r]); - - // Apply proper quantization paradigm: input_scale * weight_scale * (accumulator - weight_zero * input_sum) - final_result[r][0] += input_scale * scales[0] * (vec4(int32_sums[r][0]) - zeros[0] * input_sum_scalar); - final_result[r][1] += input_scale * scales[1] * (vec4(int32_sums[r][1]) - zeros[1] * input_sum_scalar); - } - } - - // Apply ALL scaling at the very end - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if OUT_STORAGE == "buffer": - if (out_row + r < out_sizes.y) { - t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = final_result[r][0]; - t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = final_result[r][1]; - } - $else: - imageStore(t_out, ivec3(out_col_texel_idx, out_row + r, 0), final_result[r][0]); - imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), final_result[r][1]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.yaml deleted file mode 100644 index c96d693834b..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qta8a_qga4w_tiled: - parameter_names_with_default_values: - DTYPE: float - OUT_STORAGE: texture3d - IN_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - PARAMS_STORAGE: buffer - TILE_ROWS: 3 - shader_variants: - - NAME: linear_qta8a_qga4w_tiled_texture3d_texture3d_texture2d_float - - NAME: linear_qta8a_qga4w_tiled_buffer_buffer_texture2d_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - - NAME: linear_qta8a_qga4w_tiled_buffer_buffer_buffer_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - WEIGHT_STORAGE: buffer - - NAME: linear_qta8a_qga4w_tiled_buffer_texture2d_buffer_float - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl deleted file mode 100644 index 28afe5a822f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define FLT_MIN -3.402823466e+38 - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "w", "t_idx", "int", STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "write_indices", "1")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const ivec2 ipos = pos.xy * stride - padding; - - const ivec2 start = ipos; - const ivec2 end = ipos + kernel_size * dilation; - - vec4 out_texel = vec4(FLT_MIN); - ivec4 idx_texel = ivec4(0); - - for (int y = start.y; y < end.y; y += dilation.y) { - for (int x = start.x; x < end.x; x += dilation.x) { - if ((x >= 0 && x < in_sizes.x) && (y >= 0 && y < in_sizes.y)) { - const vec4 cur_texel = load_texel(t_in, ivec3(x, y, pos.z)); - - // Set idx if value is greatest in the pool; else, keep the existing idx. - ivec4 cur_idx = ivec4(x + int(in_sizes.x) * y); - ivec4 mask = ivec4(greaterThan(cur_texel, out_texel)); - idx_texel = ivec4(mix(idx_texel, cur_idx, mask)); - - out_texel = max(cur_texel, out_texel); - } - } - } - - imageStore(t_out, pos, out_texel); - if (write_indices > 0) { - imageStore(t_idx, pos, idx_texel); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml deleted file mode 100644 index d8e3aa599f5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -max_pool2d: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: max_pool2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl deleted file mode 100644 index 7897f0e8133..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define T ${texel_component_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "w", "t_mean", DTYPE, STORAGE)} -${layout_declare_tensor(B, "w", "t_rstd", DTYPE, STORAGE)} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE)} - -layout(push_constant) uniform PRECISION restrict Block { - ivec3 out_limits; - ivec4 sizes; - float epsilon; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -#define MAX_WORKGROUP_SIZE 64 - -// Shared memory factor increases shared memory allocation by a scale that should be either 1 or a power of 2. -// -// Increasing the factor allows more data to be stored in shared memory and increases thread utilization during reduction. -// Why? Because when performing reduction, the number of active threads is halved in each iteration. -// Increasing the scaling factor increases thread occupancy and hence utilizes the GPU better. -// e.g. -// If the local thread size in the x dimension is 32, and SHARED_MEMORY_FACTOR is 1, 32 elements will be loaded into shared memory. -// First iteration of reduce will have 16 threads sum up 32 elements. -// Second iteration will have 8 threads sum up 16 elements from the previous iteration and so on. -// So thread utilization starts at 50%. -// -// By contrast, if the local thread size in the x dimension is 32, and SHARED_MEMORY_FACTOR is 2, 64 elements will be loaded into shared memory. -// First iteration of reduce will have 32 threads sum up 64 elements. -// Second iteration will have 16 threads sum up 32 elements from the previous iteration and so on. -// Thus thread utilization starts at 100%.
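-//
-// The tradeoff is a proportionally larger shared memory allocation: the
-// shared_input array below holds MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR
-// texels (plus the padding added by offset_pos_index), e.g. 64 texels for a
-// factor of 1 versus 128 texels for a factor of 2.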
-#define SHARED_MEMORY_FACTOR 1 - -#define offset_pos_index(index) ((index) + ((index) >> 3)) - -shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)]; - -// Function to reduce input data in workgroup's x dimension -// -// The implementation resembles reduction as depicted below -// | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | 2 | 3 | 2 | 7 | 0 | 11 | 0 | 2 | current_stride -> 1 -// | / | / | / | / | / | / | / | / -// | / | / | / | / | / | / | / | / -// | / | / | / | / | / | / | / | / -// | 11 | 1 | 9 | 1 | 2 | 2 | 8 | 5 | 5 | 3 | 9 | 7 | 11 | 11 | 2 | 2 | current_stride -> 2 -// | / | / | / | / -// | / | / | / | / -// | / | / | / | / -// | 20 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |14 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 4 -// | / | / -// | / | / -// | / | / -// | / | / -// | / | / -// | 30 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 8 -// | / -// | / -// | / -// | / -// | / -// | / -// | / -// | / -// | / -// | 57 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride = -> 16 -// -// Threads access shared index in following pattern -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 1 -// Shared Index | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 | X | X | X | X | X | X | X | X | index *= 1 -// -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 2 -// Shared Index | 0 | 4 | 8 | 12 | X | X | X | X | X | X | X | X | X | X | X | X | index *= 2 -// -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 4 -// Shared Index | 0 | 8 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 4 -// -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 8 -// Shared Index | 0 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 8 - -void reduce_input(const int width_stride, const int shared_idx_offset) { - // wait for all shared memory writes to finish - memoryBarrierShared(); - barrier(); - - // loop log(width_stride) times - for (int current_stride = 1, index = int(gl_LocalInvocationID.x << 1); current_stride < width_stride; current_stride *= 2, index <<= 1) { - // if the index at this thread is within the width stride - if (index < width_stride) { - const int local_shared_idx = shared_idx_offset + index; - // add the value at current stride to this thread's value - shared_input[offset_pos_index(local_shared_idx)] += shared_input[offset_pos_index(local_shared_idx + current_stride)]; - } - - memoryBarrierShared(); - barrier(); - } -} - -void reduce_non_packed_dim() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const int width = int(sizes.x); - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - - // width batch read stride - const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; - - // local memory starting offset for this thread - const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); - - // local memory index for this thread - const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); - - VEC4_T mean = VEC4_T(0); - VEC4_T var = VEC4_T(0); - - // Loop over the width in stride increments - for (int width_offset = 0; width_offset < width; width_offset += width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - in_pos[in_axis_map.x] = width_offset + 
int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - - VEC4_T in_val = VEC4_T(0); - if (all(lessThan(in_pos, out_limits))) { - in_val = load_texel(t_in, in_pos); - } - mean += in_val; - } - } - - shared_input[offset_pos_index(shared_idx)] = mean; - reduce_input(width_stride, shared_idx_offset); - mean = shared_input[offset_pos_index(shared_idx_offset)] / width; - - memoryBarrierShared(); - barrier(); - - // Loop over the width in stride increments - for (int width_offset = 0; width_offset < width; width_offset += width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - - VEC4_T in_val = mean; - if (all(lessThan(in_pos, out_limits))) { - in_val = load_texel(t_in, in_pos); - } - - const VEC4_T delta = in_val - mean; - var += delta * delta; - } - } - - shared_input[offset_pos_index(shared_idx)] = var; - reduce_input(width_stride, shared_idx_offset); - var = shared_input[offset_pos_index(shared_idx_offset)] / width; - - VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); - VEC4_T offset = -rstd * mean; - - VEC4_T v = load_texel(t_in, lpos); - VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)).xxxx; - VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)).xxxx; - VEC4_T outtex = (v * rstd + offset) * weight + bias; - - if (all(lessThan(lpos, out_limits))) { - write_texel_lpos(t_out, lpos, outtex, out_axis_map); - } - - if (gl_GlobalInvocationID.x == 0) { - write_texel(t_mean, lpos, mean); - write_texel(t_rstd, lpos, rstd); - } -} - -void reduce_packed_dim() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const int width = int(sizes.x); - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - - // width batch read stride - const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; - - // local memory starting offset for this thread - const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); - - // local memory index for this thread - const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); - - const int last_packed_width_index = divup4(width) - 1; - T mean = T(0); - T var = T(0); - const int remain = width & 3; - - const int in_pos_x_limit = out_limits[in_axis_map.x]; - - VEC4_T accum = VEC4_T(0); - // Loop over the width in stride increments - for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - in_pos[in_axis_map.x] = in_pos_x; - - VEC4_T in_val = VEC4_T(0); - if (in_pos_x < in_pos_x_limit) { - in_val = load_texel(t_in, in_pos); - } - - if (in_pos_x == last_packed_width_index && remain != 0) { - const int remain_inv = 4 - remain; - in_val.y = mix(in_val.y, T(0), remain_inv > 2); - in_val.z = mix(in_val.z, T(0), remain_inv > 1); - in_val.w = mix(in_val.w, T(0), remain_inv > 0); - } - accum += in_val; - } - } - - shared_input[offset_pos_index(shared_idx)] = accum; - reduce_input(width_stride, shared_idx_offset); - VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)]; - mean = (val.x + val.y + val.z + val.w) / width; - - memoryBarrierShared(); - barrier(); - - VEC4_T delta2 = VEC4_T(0); - - // Loop over the width in stride increments - for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += 
width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - in_pos[in_axis_map.x] = in_pos_x; - - VEC4_T in_val = VEC4_T(mean); - if (in_pos_x < in_pos_x_limit) { - in_val = load_texel(t_in, in_pos); - } - - if (in_pos_x == last_packed_width_index && remain != 0) { - const int remain_inv = 4 - remain; - in_val.y = mix(in_val.y, mean.x, remain_inv > 2); - in_val.z = mix(in_val.z, mean.x, remain_inv > 1); - in_val.w = mix(in_val.w, mean.x, remain_inv > 0); - } - - const VEC4_T delta = in_val - mean; - delta2 += delta * delta; - } - } - - shared_input[offset_pos_index(shared_idx)] = delta2; - reduce_input(width_stride, shared_idx_offset); - val = shared_input[offset_pos_index(shared_idx_offset)]; - var = (val.x + val.y + val.z + val.w) / width; - - T rstd = pow(var + epsilon, T(-0.5)); - T offset = -rstd * mean; - - VEC4_T v = load_texel(t_in, lpos); - VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)); - VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)); - VEC4_T outtex = (v * rstd + offset) * weight + bias; - - if (all(lessThan(lpos, out_limits))) { - write_texel_lpos(t_out, lpos, outtex, out_axis_map); - } - - if (gl_GlobalInvocationID.x == 0) { - write_texel(t_mean, lpos, VEC4_T(mean)); - write_texel(t_rstd, lpos, VEC4_T(rstd)); - } -} - -void main() { - // if packed dimension width - if (in_packed_dim != W_DIM) { - reduce_non_packed_dim(); - } else { - reduce_packed_dim(); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml deleted file mode 100644 index ac478599f8a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -native_layer_norm: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: native_layer_norm diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl deleted file mode 100644 index 1a2c257baec..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -#extension GL_EXT_control_flow_attributes : require - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(B, "r", "nchw_in", "int")} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 sizes; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "transpose_hw", "0")} - -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -/* - * Extends sign of int8 - */ -int extend_sign(int x) { - return x | mix(0, 0xFFFFFF00, x >= (1 << 7)); -} - -ivec4 read_texel(ivec4 tidx) { - const ivec4 tidx_to_use = ivec4(mix(tidx.xy, tidx.yx, bvec2(transpose_hw == 1)), tidx.zw); - const ivec4 sizes_to_use = ivec4(mix(sizes.xy, sizes.yx, bvec2(transpose_hw == 1)), sizes.zw); - const int packed_dim_to_use = mix(packed_dim, packed_dim ^ transpose_hw, packed_dim < 2); - - const ivec4 buf_indices = tidx_to_nchwi( - tidx_to_use, sizes_to_use, packed_dim_to_use); - - const int mask = (1 << 8) - 1; - - ivec4 out_tex = ivec4(0); - - [[unroll]] for (int i = 0; i < 4; ++i) { - if (tidx[packed_dim] + i < sizes[packed_dim]) { - const int in_texel = nchw_in[buf_indices[i] >> 2]; - int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3))) & mask; - extracted_val = extend_sign(extracted_val); - out_tex[i] = extracted_val; - } - } - - return out_tex; -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - - if (any(greaterThanEqual(tidx, sizes))) { - return; - } - - write_texel(t_out, lpos_to_pos(lpos, axis_map), VEC4_T(read_texel(tidx))); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml deleted file mode 100644 index 0b8bbecb7bd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -nchw_to_bitw8_image_nobitw8buffer: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: int8 - USE_PUSH_CONST: True - generate_variant_forall: - STORAGE: - - VALUE: texture2d - - VALUE: texture3d - DTYPE: - - VALUE: int8 - - VALUE: uint8 - shader_variants: - - NAME: nchw_to_bitw8_image_nobitw8buffer - - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl deleted file mode 100644 index 074624dc37e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ /dev/null @@ -1,48 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "BufferMetadata", "outp")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// This constant is unused in this shader but is kept so that the signature is -// consistent with nchw_to_image. -${layout_declare_spec_const(C, "int", "unused", "0")} -${layout_declare_spec_const(C, "int", "transpose_hw", "0")} - -void main() { - const uint outp_bufi = int(gl_GlobalInvocationID.x); - if (outp_bufi >= numel(outp)) { - return; - } - - TensorIndex outp_tidx; - uint nchwi; - - linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx); - - if (transpose_hw == 1) { - BufferMetadata transposed_meta = outp; - transposed_meta.sizes[0].xy = transposed_meta.sizes[0].yx; - outp_tidx.data[0].xy = outp_tidx.data[0].yx; - nchwi = tensor_idx_to_contiguous_idx(transposed_meta, outp_tidx); - } - // Normal case - else { - nchwi = tensor_idx_to_contiguous_idx(outp, outp_tidx); - } - - t_outp[outp_bufi] = nchw_in[nchwi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml deleted file mode 100644 index 9d6c3aa76a9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -nchw_to_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: nchw_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl deleted file mode 100644 index f3f604e10cd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define SCALAR_T ${texel_load_component_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(B, "r", "buf_in", DTYPE)} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 sizes; - $if not FROM_STAGING: - ivec4 buf_strides; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "sizes")} - $if not FROM_STAGING: - ${layout_declare_ubo(B, "ivec4", "buf_strides")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "transpose_hw", "0")} - -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -VEC4_T read_texel(ivec4 tidx) { - ivec4 tidx_to_use = tidx; - ivec4 sizes_to_use = sizes; - int packed_dim_to_use = packed_dim; - if (transpose_hw == 1) { - sizes_to_use.xy = sizes_to_use.yx; - tidx_to_use.xy = tidx.yx; - - if (packed_dim == 1) { - packed_dim_to_use = 0; - } - if (packed_dim == 0) { - packed_dim_to_use = 1; - } - } - - $if FROM_STAGING: - const ivec4 buf_indices = tidx_to_nchwi(tidx_to_use, sizes_to_use, packed_dim_to_use); - $else: - const ivec4 buf_indices = tidx_to_4bufi(tidx_to_use, buf_strides, packed_dim_to_use); - - VEC4_T texel = VEC4_T(0); - if (tidx[packed_dim] < sizes[packed_dim]) { - texel.x = SCALAR_T(buf_in[buf_indices.x]); - } - if (tidx[packed_dim] + 1 < sizes[packed_dim]) { - texel.y = SCALAR_T(buf_in[buf_indices.y]); - } - if (tidx[packed_dim] + 2 < sizes[packed_dim]) { - texel.z = SCALAR_T(buf_in[buf_indices.z]); - } - if (tidx[packed_dim] + 3 < sizes[packed_dim]) { - texel.w = SCALAR_T(buf_in[buf_indices.w]); - } - return texel; -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tidx, sizes))) { - return; - } - - $if DTYPE == "double" and DTYPE == "int64": - VEC4_T texel = read_texel(tidx); - write_texel(t_out, lpos_to_pos(lpos, axis_map), texel); - $else: - write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml deleted file mode 100644 index 85119c8d508..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -nchw_to_image: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: float - FROM_STAGING: True - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: nchw_to_image_texture3d - - NAME: nchw_to_image_texture2d - STORAGE: texture2d - - NAME: clone_buffer_to_image - FROM_STAGING: False - - NAME: nchw_to_image_no_pc_texture3d - USE_PUSH_CONST: False - - NAME: nchw_to_image_no_pc_texture2d - STORAGE: texture2d - USE_PUSH_CONST: False - - NAME: clone_buffer_to_image_no_pc - FROM_STAGING: False - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl deleted file mode 100644 index 325635a5716..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_required_extensions(DTYPE)} - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "r", "t_out", DTYPE, STORAGE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() {} diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml deleted file mode 100644 index f888e8661d3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -no_op: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: uint32 - - VALUE: int8 - - VALUE: uint8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - - VALUE: buffer - shader_variants: - - NAME: no_op diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.glsl deleted file mode 100644 index e42cf05dd7f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.glsl +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_qmat2", "uint", STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", "uint", "buffer")} - -layout(push_constant) uniform restrict Block { - ivec4 qmat2_sizes; - ivec2 orig_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -$if STORAGE == "buffer": - #define BUFFER_WEIGHT - -#include "qlinear_weight_pack_utils.glslh" - -#define extract_4bit(input_block_data, col, row) \ - (extract_4bit_from_packed_uint_le(input_block_data[row], col)) - -/* - * This shader packs the weight tensor into blocks for efficient consumption. - * - * The input tensor has shape [K/2, N] where each element is a uint8 containing - * 2 packed 4-bit values. The logical tensor shape is [K, N] of 4-bit values. - * - * The transformation partitions the tensor into blocks of size 4x8 (4-bit values) - * and transposes each block to 8x4, then packs the result so that each uvec4 - * contains an entire transposed block. - * - * Original block (4x8 4-bit values, shown as 2x8 uint8 values): - * w00|w10, w20|w30, - * w01|w11, w21|w31, - * w02|w12, w22|w32, - * w03|w13, w23|w33, - * w04|w14, w24|w34, - * w05|w15, w25|w35, - * w06|w16, w26|w36, - * w07|w17, w27|w37, - * - * Transposed block (8x4 4-bit values, packed into uvec4): - * w00|w01, w02|w03, w04|w05, w06|w07 - * w10|w11, w12|w13, w14|w15, w16|w17 - * w20|w21, w22|w23, w24|w25, w26|w27 - * w30|w31, w32|w33, w34|w35, w36|w37 - */ -void main() { - // Each thread writes out 2 adjacent 8 wide x 4 high transposed blocks. Each - // block is packed as one uvec4. - ivec2 block_pos = ivec2( - MUL_2(gl_GlobalInvocationID.x), - gl_GlobalInvocationID.y); - - // There are K wide x N high 4-bit values in the original weight tensor - const int input_width = orig_sizes.x; // K - const int input_height = orig_sizes.y; // N - - const int input_width_uint = DIV_UP_8(input_width); - - // Original block spans 4 wide x 8 high 4-bit values. Since uint is used to - // read the input tensor, each block spans 0.5 wide x 8 high uint values. - const ivec2 block_start = ivec2( - DIV_2(block_pos.x), - MUL_8(block_pos.y)); - - // Check bounds - if (block_start.x >= input_width_uint || block_start.y >= input_height) { - return; - } - - // Read input block. Note that this block will contain the source data for - // both output blocks, as it contains 1 wide x 8 high uint values, which is - // equivalent to 8 wide x 8 high 4-bit values. - uint input_block_data[8]; - - // Read in 8 rows along the same column of uints, each uint contains 8 4-bit - // values. This will be the source data for the transposed block.
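- // Each extract_4bit(input_block_data, col, row) call below extracts the
- // 4-bit value at column col of row row, and pack_8x4bit_into_uint packs a
- // full original column (8 rows) into a single uint, so each uvec4 written
- // out is one 8 wide x 4 high transposed block.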
- for (int i = 0; i < 8; ++i) { - uint input_bufi = (block_start.y + i) * input_width_uint + block_start.x; - input_block_data[i] = t_input[input_bufi]; - } - - for (int col_offset = 0; col_offset <= 4; col_offset+=4) { - uvec4 output_block; - - output_block.x = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset, 0), - extract_4bit(input_block_data, col_offset, 1), - extract_4bit(input_block_data, col_offset, 2), - extract_4bit(input_block_data, col_offset, 3), - extract_4bit(input_block_data, col_offset, 4), - extract_4bit(input_block_data, col_offset, 5), - extract_4bit(input_block_data, col_offset, 6), - extract_4bit(input_block_data, col_offset, 7)); - - output_block.y = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset + 1, 0), - extract_4bit(input_block_data, col_offset + 1, 1), - extract_4bit(input_block_data, col_offset + 1, 2), - extract_4bit(input_block_data, col_offset + 1, 3), - extract_4bit(input_block_data, col_offset + 1, 4), - extract_4bit(input_block_data, col_offset + 1, 5), - extract_4bit(input_block_data, col_offset + 1, 6), - extract_4bit(input_block_data, col_offset + 1, 7)); - - output_block.z = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset + 2, 0), - extract_4bit(input_block_data, col_offset + 2, 1), - extract_4bit(input_block_data, col_offset + 2, 2), - extract_4bit(input_block_data, col_offset + 2, 3), - extract_4bit(input_block_data, col_offset + 2, 4), - extract_4bit(input_block_data, col_offset + 2, 5), - extract_4bit(input_block_data, col_offset + 2, 6), - extract_4bit(input_block_data, col_offset + 2, 7)); - - output_block.w = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset + 3, 0), - extract_4bit(input_block_data, col_offset + 3, 1), - extract_4bit(input_block_data, col_offset + 3, 2), - extract_4bit(input_block_data, col_offset + 3, 3), - extract_4bit(input_block_data, col_offset + 3, 4), - extract_4bit(input_block_data, col_offset + 3, 5), - extract_4bit(input_block_data, col_offset + 3, 6), - extract_4bit(input_block_data, col_offset + 3, 7)); - - const uint qmat2_texel_stride_x = DIV_UP_4(qmat2_sizes.x); - write_transposed_weight_block( - output_block, - block_pos.x, - block_pos.y, - qmat2_texel_stride_x); - - if (MUL_8(block_start.x) + 4 >= input_width) { - return; - } - // Otherwise, implement the block position to write to the next block in the - // following iteration. - block_pos.x += 1; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.yaml deleted file mode 100644 index c72a2cc1df6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -pack_int4_linear_weight_transposed_block_4x8: - parameter_names_with_default_values: - STORAGE: buffer - shader_variants: - - NAME: pack_int4_linear_weight_transposed_block_4x8_buffer - STORAGE: buffer - - NAME: pack_int4_linear_weight_transposed_block_4x8_texture2d - STORAGE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl deleted file mode 100644 index 0079526c248..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if not NO_INT8_BUFFERS: - ${define_required_extensions("uint8")} -$if STORAGE == "buffer": - ${define_required_extensions("int8")} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_qmat2", "uint8", STORAGE, is_scalar_array=False)} -$if NO_INT8_BUFFERS: - ${layout_declare_tensor(B, "r", "nchw_4x2", "uint", "buffer")} -$else: - ${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")} - -layout(push_constant) uniform restrict Block { - ivec4 qmat2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -$if NO_INT8_BUFFERS: - #define BUF_T uint -$else: - #define BUF_T uint8_t - -$if STORAGE == "buffer": - #define UVEC4_T u8vec4 -$else: - #define UVEC4_T uvec4 - -uint get_first(const BUF_T packed) { - return (packed & 0xF0) >> 4; -} - -uint get_second(const BUF_T packed) { - return packed & 0x0F; -} - -uint combine(const uint first, const uint second) { - return (first << 4 | second); -} - -$if NO_INT8_BUFFERS: - uint extract_comp(const uint packed4, const uint idx) { - return (packed4 >> (idx * 8)) & 0xFF; - } - -/* - * This shader packs the weight tensor into a texture. - * - * The original tensor has a (W, H) shape of (K / 2, N) and each scalar element - * is a uint8_t, which contains 2 packed 4 bit uint values. - * - * The transform performed by this shader is to first transpose the tensor, so - * the shape of the packed tensor becomes (N / 2, K). Then, the 4 bit integers - * are re-packed in groups of 8. For each 4 uint8_t values, the "left" 4-bits - * of each value contain the 0, 1, 2, 3 4-bit values, and the "right" 4-bits of - * each value contain the 4, 5, 6, 7 4-bit values. - * - * As a concrete example, consider the following weight tensor. The | demarks - * the packing boundary, so 1| 2 represents a single uint8_t value with 1 in the - * leftmost 4 bits and 2 in the rightmost 4 bits. - * - * 1| 2, 3| 4, 5| 6, 7| 8, - * 9|10, 11|12, 13|14, 15|16, - * 17|18, 19|20, 21|22, 23|24, - * 25|26, 27|28, 29|30, 31|32, - * 33|34, 35|36, 37|38, 39|40, - * 41|42, 43|44, 45|46, 47|48, - * 49|50, 51|52, 53|54, 55|56, - * 57|58, 59|60, 61|62, 63|64, - * - * After packing, the packed tensor would contain - * - * 1|33, 9|41, 17|49, 25|57, - * 2|34, 10|42, 18|50, 26|58, - * 3|35, 11|43, 19|51, 27|59, - * 4|36, 12|44, 20|52, 28|60, - * 5|37, 13|45, 21|53, 29|61, - * 6|38, 14|46, 22|54, 30|62, - * 7|39, 15|47, 23|55, 31|63, - * 8|40, 16|48, 24|56, 32|64, - * - * The purpose of interleaving is to make it easier to extract the unpacked - * values in order using the u8vec4 vectorized type. 
With the packing in place, - * The 4-bit values can be extracted via - * - * u8vec4 packed; - * u8vec4 vals_0123 = (packed & 0xF0) >> 4; - * u8vec4 vals_4567 = (packed | 0x0F); - */ -void main() { - // Each thread writes 2 output texels along the height axis - ivec2 packed_pos = ivec2( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y << 1); - - // The packed tensor is width packed - if ((packed_pos.x << 2) >= qmat2_sizes.x || packed_pos.y >= qmat2_sizes.y) { - return; - } - - int out_col = packed_pos.x << 3; - int out_row = packed_pos.y; - - int in_col = out_row; - int in_int8_col = in_col >> 1; - int in_row = out_col; - - int in_numrows = qmat2_sizes.x << 1; - int in_numcols = qmat2_sizes.y; - int in_num_int8_cols = qmat2_sizes.y >> 1; - - uint in_vals[8][2]; - for (int r = 0; r < 8; ++r) { - if (in_row + r < in_numrows) { - uint scalar_idx = (in_row + r) * in_num_int8_cols + in_int8_col; - $if NO_INT8_BUFFERS: - BUF_T in_val_packed_texel = nchw_4x2[scalar_idx >> 2]; - const uint packed_idx = scalar_idx % 4; - uint in_val_packed = extract_comp(in_val_packed_texel, packed_idx); - $else: - BUF_T in_val_packed = nchw_4x2[scalar_idx]; - - in_vals[r][0] = get_first(in_val_packed); - in_vals[r][1] = get_second(in_val_packed); - } else { - in_vals[r][0] = uint(0); - in_vals[r][1] = uint(0); - } - } - - UVEC4_T out_tex_1 = UVEC4_T( - combine(in_vals[0][0], in_vals[4][0]), - combine(in_vals[1][0], in_vals[5][0]), - combine(in_vals[2][0], in_vals[6][0]), - combine(in_vals[3][0], in_vals[7][0])); - - UVEC4_T out_tex_2 = UVEC4_T( - combine(in_vals[0][1], in_vals[4][1]), - combine(in_vals[1][1], in_vals[5][1]), - combine(in_vals[2][1], in_vals[6][1]), - combine(in_vals[3][1], in_vals[7][1])); - - $if STORAGE == "buffer": - int stride = qmat2_sizes.x >> 2; - t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1; - t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2; - $else: - imageStore(t_qmat2, packed_pos.xy, out_tex_1); - imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml deleted file mode 100644 index 145f4301f14..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -pack_int4_linear_weight_transposed_interleaved: - parameter_names_with_default_values: - STORAGE: texture2d - NO_INT8_BUFFERS: false - shader_variants: - - NAME: pack_int4_linear_weight_transposed_interleaved_texture2d - - NAME: pack_int4_linear_weight_transposed_interleaved_buffer - STORAGE: buffer - - NAME: pack_int4_linear_weight_transposed_interleaved_nobitw8buffer_texture2d - NO_INT8_BUFFERS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.glsl deleted file mode 100644 index b9f5c994910..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-${define_active_storage_type(STORAGE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "t_packed_int4_weight", "int", STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_int4_weight", "uint", "buffer")}
-
-layout(push_constant) uniform restrict Block {
-  ivec4 qmat2_sizes;
-  ivec2 orig_sizes;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-#include "common.glslh"
-#include "linear_int4_weight_block.glslh"
-
-void main() {
-  const int k8 = int(gl_GlobalInvocationID.x);
-  const int n8 = int(gl_GlobalInvocationID.y);
-
-  const int K = orig_sizes.x;
-  const int N = orig_sizes.y;
-
-  // Each shader invocation processes a 4x8 block of the input data.
-  const int K4 = div_up_4(K);
-  const int K8 = div_up_8(K);
-  const int N8 = div_up_8(N);
-
-  // Check bounds
-  if (n8 >= N8 || k8 >= K8) {
-    return;
-  }
-
-  Int4Weight2xBlockSourceData src_data;
-  const int n = mul_8(n8);
-  if (N - n >= 8) {
-    load_block_source_data_no_checks(src_data, k8, n, K8, N);
-  } else {
-    load_block_source_data_with_checks(src_data, k8, n, K8, N);
-  }
-
-  // An 8Kx8N block of the weight matrix is loaded into memory. This will be
-  // split into two blocks, each holding 4Kx8N worth of data.
-  // The first block contains data for k + (0, 1, 2, 3), i.e. the first 4 columns
-  // of the loaded weight block.
-  Int4WeightBlockPacked packed_block_1;
-  // The second block contains data for k + (4, 5, 6, 7), i.e. the second 4 cols
-  // of the loaded weight block.
-  Int4WeightBlockPacked packed_block_2;
-  create_packed_blocks(packed_block_1, packed_block_2, src_data);
-
-  const int k4 = mul_2(k8);
-  write_packed_block(packed_block_1, k4, n8, K4);
-  write_packed_block(packed_block_2, k4 + 1, n8, K4);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.yaml
deleted file mode 100644
index 7a145ec95d7..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-pack_q4_linear_weight:
-  parameter_names_with_default_values:
-    STORAGE: buffer
-  shader_variants:
-    - NAME: pack_q4_linear_weight_buffer
-      STORAGE: buffer
-    - NAME: pack_q4_linear_weight_texture2d
-      STORAGE: texture2d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.glsl
deleted file mode 100644
index f2c74b67283..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.glsl
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-${define_active_storage_type(STORAGE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "t_packed_int8_weight", "int", STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_int8_weight", "int", "buffer")}
-
-layout(push_constant) uniform restrict Block {
-  ivec4 qmat2_sizes;
-  ivec2 orig_sizes;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-#include "common.glslh"
-#include "linear_int8_weight_block.glslh"
-
-void main() {
-  // The size of the source weight tensor is [W=K, H=N]. Each shader invocation
-  // processes a 4x4 block. The thread position corresponds to the block index.
-  int n4 = int(gl_GlobalInvocationID.x);
-  int k4 = int(gl_GlobalInvocationID.y);
-
-  const int K = orig_sizes.x;
-  const int N = orig_sizes.y;
-
-  // Determine the total number of blocks and check bounds
-  const int N4 = div_up_4(N);
-  const int K4 = div_up_4(K);
-  if (n4 >= N4 || k4 >= K4) {
-    return;
-  }
-
-  // Each block is represented as an ivec4. Each int corresponds to a row, i.e.
-  // the N dim of the weight tensor, and contains data for 4 columns, i.e. the K dim.
-  Int8WeightBlock block;
-  const int n = mul_4(n4);
-  if (N - n >= 4) {
-    load_block_data_no_checks(block, k4, n, K4, N);
-  } else {
-    load_block_data_with_checks(block, k4, n, K4, N);
-  }
-
-  // The weight blocks are stored in a transposed manner, such that weight blocks
-  // are indexed like packed_weight[k4][n4]. This is to optimize memory
-  // coalescing when computing tiled GEMM.
-  write_weight_block(block, n4, k4, N4);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.yaml
deleted file mode 100644
index 13e6d43b2c5..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
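# Template for the shaders that repack an 8-bit weight matrix into 4x4 blocks
# (one ivec4 per block), stored transposed so that blocks are indexed as
# packed_weight[k4][n4] for coalesced reads during tiled GEMM. A buffer and a
# texture2d variant are generated via the STORAGE parameter.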
- -pack_q8_linear_weight: - parameter_names_with_default_values: - STORAGE: buffer - shader_variants: - - NAME: pack_q8_linear_weight_buffer - STORAGE: buffer - - NAME: pack_q8_linear_weight_texture2d - STORAGE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl deleted file mode 100644 index 8c01ebef897..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl +++ /dev/null @@ -1,80 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")} -${layout_declare_ubo(5, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - - VEC4_T outtex = VEC4_T(fill_value); - // mask_z/y/x is used to determine whether need to fecth data from input tensor - bool mask_z = (idx.z + 3) < pad_front || idx.z > (pad_front + in_sizes.z - 1); - bool mask_y = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1; - bool mask_x = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1; - - if (!mask_z && mask_y && mask_x) { - // channel_mask is to determine the situation that when padding channel dimension, - // in one texel, some elements are filled vaule and some value are from input tensor - ivec4 c_ind = ivec4(idx.z) + ivec4(0, 1, 2, 3); - ivec4 channel_mask = ivec4(lessThan(c_ind, ivec4(pad_front))) + ivec4(greaterThan(c_ind, ivec4(pad_front + in_sizes.z - 1))); - - ivec4 in_idx = idx; - in_idx.x -= pad_left; - in_idx.y -= pad_top; - in_idx.z -= divup4(pad_front) * 4; - const int shift = pad_front % 4; - VEC4_T cur_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - VEC4_T next_in_texel; - // When shift is not 0, we need to read 2 texels from input tensor to write into output - // for example: - // input texel is [[1 2 3 4], [5 6 x x]] and front_pad = 2 - // output texel is [[p p 1 2], [3 4 5 6]], where p is the filled value then need to fetch 2 texels to fill [3 4 5 6]. 
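    // Illustrative note: shift == pad_front % 4. The first `shift` elements of
    // the gathered texel come from the tail of cur_in_texel, and the remaining
    // (4 - shift) elements come from the head of next_in_texel.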
- if (shift != 0) { - in_idx.z += 4; - next_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - } else { - next_in_texel = cur_in_texel; - } - - VEC4_T inter_texel; - for (int i = 0; i < 4; i++) { - if (i < shift) { - inter_texel[i] = cur_in_texel[4-shift+i]; - } else { - inter_texel[i] = next_in_texel[i-shift]; - } - } - outtex = inter_texel * (VEC4_T(1) - channel_mask) + outtex * channel_mask; - } - - int packed_idx = idx[packed_dim]; - const int packed_dim_size = out_sizes[packed_dim]; - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml deleted file mode 100644 index 02afc3846a2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pad_channel: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: pad_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl deleted file mode 100644 index c5b2c692bdc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl +++ /dev/null @@ -1,50 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")} -${layout_declare_ubo(5, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - - bool mask_height = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1; - bool mask_width = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1; - - VEC4_T outtex = VEC4_T(fill_value); - if (mask_height && mask_width) { - ivec4 in_idx = idx; - in_idx.x -= pad_left; - in_idx.y -= pad_top; - outtex = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - } - - int packed_idx = idx[packed_dim]; - const int packed_dim_size = out_sizes[packed_dim]; - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml deleted file mode 100644 index dd74ec9cc28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pad_height_width: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - 
generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: pad_height_width diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl deleted file mode 100644 index 3447ab07552..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")} - -${layout_declare_ubo(B, "BufferMetadata", "outp")} -${layout_declare_ubo(B, "BufferMetadata", "inp")} - -${layout_declare_ubo(B, "ivec4[DIMLIMIT_DIV4]", "permute_order")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const uint inp_bufi = gl_GlobalInvocationID.x; - if (inp_bufi >= numel(inp)) { - return; - } - - TensorIndex inp_tidx; - linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx); - - TensorIndex outp_tidx = inp_tidx; - permute(outp_tidx, permute_order); - - const uint outp_bufi = tensor_idx_to_linear_idx(outp, outp_tidx); - // Copy data from input to output - t_outp[outp_bufi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml deleted file mode 100644 index 81675ae8917..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml +++ /dev/null @@ -1,10 +0,0 @@ -permute_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: permute_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl deleted file mode 100644 index 274077f4181..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("texture3d")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Convert output tensor index to input tensor index based on permutation -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx; - - // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] - in_tidx[permute_dims.x] = out_tidx.x; - in_tidx[permute_dims.y] = out_tidx.y; - in_tidx[permute_dims.z] = out_tidx.z; - in_tidx[permute_dims.w] = out_tidx.w; - - return in_tidx; -} - -// Check if we can use the fast path where texels from the input tensor can be -// copied directly into the output tensor. This occurs when the packed dimension -// is preserved in the permutation, i.e. reading a texel from the output tensor -// produces 4 texels along the same dimension as reading a texel from the input -// tensor. -bool can_use_fast_path() { - // Fast path is possible when the packed dimension is preserved in the permutation - // This means permute_dims[out_packed_dim] == in_packed_dim - return permute_dims[out_packed_dim] == in_packed_dim; -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); - - if (any(greaterThanEqual(out_tidx, out_sizes))) { - return; - } - - if (can_use_fast_path()) { - // Fast path: packed dimension is preserved, so we can copy texels directly - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - // Slow path: packed dimension is not preserved, so each element of the - // output texel may be "sourced" from a different texel in the input tensor. - // Therefore each output texel element is processed individually. 
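    // Example (illustrative): transposing the W and H dims of a width-packed
    // tensor. The output's packed (width) dim then maps to the input's height
    // dim, so the 4 elements of one output texel come from 4 different input
    // texels and must be gathered one element at a time.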
-    VEC4_T out_texel = VEC4_T(0);
-
-    for (int texel_i = 0; texel_i < 4; ++texel_i) {
-      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
-      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
-      int element_idx = in_tidx[in_packed_dim] % 4;
-
-      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
-      T selected_value = T(in_texel[element_idx]);
-
-      out_texel[texel_i] = selected_value;
-
-      out_tidx[out_packed_dim]++;
-    }
-
-    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml
deleted file mode 100644
index f68b8dcdd3d..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-permute_texture:
-  parameter_names_with_default_values:
-    DTYPE: float
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-      - VALUE: int32
-  shader_variants:
-    - NAME: permute_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/qlinear_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/qlinear_utils.glslh
deleted file mode 100644
index 80ec44c153a..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/qlinear_utils.glslh
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#ifndef QLINEAR_UTILS_H
-#define QLINEAR_UTILS_H
-
-/***********************************
- * Packed Weight data read/write functions
- *
- * These functions assume that t_qmat2 is declared in the shader layout as a storage
- * buffer or storage image.
- */
-
-#ifdef BUFFER_WEIGHT
-
-uvec4 load_transposed_weight_block(const uint k4, const uint n8, const uint K4) {
-  return t_qmat2[n8 * K4 + k4];
-}
-
-#else // TEXTURE_WEIGHT
-
-uvec4 load_transposed_weight_block(const uint k4, const uint n8, const uint K4) {
-  return texelFetch(t_qmat2, ivec2(k4, n8), 0);
-}
-
-#endif // BUFFER_WEIGHT
-
-/***********************************
- * Packed weight data extraction functions
- */
-
-/*
- * uvec4 block contains a packed 4 high x 8 wide matrix of 4-bit signed integers. This
- * function extracts the 4-bit values at the given column and row index.
- *
- * Each uint in the uvec4 corresponds to one row; thus the desired row can be extracted
- * via block[row]. From there, column 0 is packed in bits 28-31, column 1 is packed into
- * bits 24-27, column 2 is packed into bits 20-23, and so on. To extract the desired
- * value:
- *
- * 1. First, shift the row uint by 4 * (7 - col) bits
- * 2. Apply a mask of 0b1111 = 15
- *
- * Finally, convert the masked value to int and subtract 8 from it to obtain the desired
- * signed integer.
- */
-T extract_4bit_from_transposed_block(const uvec4 block, const uint col, const uint row) {
-  return T(int((block[row] >> (4 * (7 - col))) & 15) - 8);
-}
-
-/***********************************
- * Input/Output read/write functions
- *
- * These functions assume that t_input and t_output are declared in the shader layout as
- * storage buffers or storage images.
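 * When BUFFER_IO is defined, the buffer-backed variants below are used;
 * otherwise the texture-backed variants are used. Both sets share the same
 * signatures, so calling shaders can remain storage-agnostic.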
- */ - -#ifdef BUFFER_IO - -VEC4_T load_input_texel_1d(const uint k4) { - return t_input[k4]; -} - -VEC4_T load_input_texel_2d( - const uint k4, - const uint m, - const uint K4) { - return t_input[(m * K4) + k4]; -} - -void write_output_texel_1d(const VEC4_T out_texel, const uint n4) { - t_output[n4] = out_texel; -} - -void write_output_texel_2d( - const VEC4_T out_texel, - const uint n4, - const uint m, - const uint N4) { - t_output[m * N4 + n4] = out_texel; -} - -#else // TEXTURE_IO - -VEC4_T load_input_texel_1d(const uint k4) { - return texelFetch(t_input, ivec3(k4, 0, 0), 0); -} - -VEC4_T load_input_texel_2d( - const uint k4, - const uint m, - const uint K4) { - return texelFetch(t_input, ivec3(k4, m, 0), 0); -} - - -void write_output_texel_1d(const VEC4_T out_texel, const uint n4) { - imageStore(t_output, ivec3(n4, 0, 0), out_texel); -} - -void write_output_texel_2d( - const VEC4_T out_texel, - const uint n4, - const uint m, - const uint N4) { - imageStore(t_output, ivec3(n4, m, 0), out_texel); -} - -#endif // BUFFER_IO - -#endif // QLINEAR_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/qlinear_weight_pack_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/qlinear_weight_pack_utils.glslh deleted file mode 100644 index 1f481f4f859..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/qlinear_weight_pack_utils.glslh +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef QLINEAR_WEIGHT_PACK_UTILS_H -#define QLINEAR_WEIGHT_PACK_UTILS_H - -/*********************************** - * Packed Weight data write functions - * - * These functions assume that t_qmat2 has been defined in the shader layout as either - * a storage buffer or a storage image. - */ - -#ifdef BUFFER_WEIGHT - -void write_transposed_weight_block(const uvec4 block, const uint k4, const uint n8, const uint K4) { - t_qmat2[n8 * K4 + k4] = block; -} - -#else // TEXTURE_WEIGHT - -void write_transposed_weight_block(const uvec4 block, const uint k4, const uint n8, const uint K4) { - imageStore(t_qmat2, ivec2(k4, n8), block); -} - -#endif // BUFFER_WEIGHT - -/*********************************** - * Utilities for packing weight data - */ - -uint extract_4bit_from_packed_uint_le(const uint packed, const uint i) { - // account for little endian - uint byte = packed >> (8 * (i / 2)) & 255; - return (byte >> (4 - 4 * (i % 2))) & 15; -} - -uint pack_8x4bit_into_uint( - const uint val0, - const uint val1, - const uint val2, - const uint val3, - const uint val4, - const uint val5, - const uint val6, - const uint val7) { - return uint( - (val0 << 28) | (val1 << 24) | (val2 << 20) | (val3 << 16) | (val4 << 12) | - (val5 << 8) | (val6 << 4) | val7 - ); -} - -#endif // QLINEAR_WEIGHT_PACK_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh b/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh deleted file mode 100644 index cde72e41ac7..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef QUANTIZE_GLSLH -#define QUANTIZE_GLSLH - -OUT_T quantize_val(IN_T value, float scale_val, int zero_point_val) { - float inv_scale = 1.0 / scale_val; - - float rounded_float = round(inv_scale * float(value)); - - int qvalue = zero_point_val + int(rounded_float); - - qvalue = max(qvalue, quant_min); - qvalue = min(qvalue, quant_max); - - return OUT_T(qvalue); -} - -#endif // QUANTIZE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.glsl deleted file mode 100644 index 450d6376537..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.glsl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -#define TILE_M4 1 -#define TILE_N4 1 -#define TILE_K4 1 - -#define TILE_M 4 -#define TILE_N 4 -#define TILE_K 4 - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -// Sizes of the im2col matrix of the convolution input -${layout_declare_ubo(B, "ivec4", "matrix_sizes")} -// Sizes of the input image -${layout_declare_ubo(B, "ivec4", "input_sizes")} -// Sizes of the output image -${layout_declare_ubo(B, "ivec4", "output_sizes")} - -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(push_constant) uniform restrict Block { - float inv_scale; - int zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "conv2d_fp_im2col_block_load.glslh" -#include "linear_int8_input_block.glslh" - -void main() { - // The quantized and packed im2col matrix can be conceptualized as a 2D matrix - // with K/4 columns and M/4 rows. Each element of the matrix is a ivec4 which - // contains packed data for a 4 wide x 4 high block of the original im2col - // matrix. Each shader invocation works on writing out one ivec4, i.e. one - // block of the quantized and packed matrix. - - // Thread id corresponds to the block index - const int k4 = int(gl_GlobalInvocationID.x); - const int m4 = int(gl_GlobalInvocationID.y); - - // Convert block idx to tensor idx - const int k = mul_4(k4); - const int m = mul_4(m4); - - const int logical_K = conv2d_params.logical_K; - // Similarly, compute the logical size of the M dim. 
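  // (One row of the im2col matrix corresponds to one output spatial position
  // per batch element, hence logical_M = out_W * out_H * N.)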
- const int logical_M = output_sizes.x * output_sizes.y * output_sizes.w; - - // Check if tensor indices are out of bounds - if (k >= logical_K || m >= logical_M) { - return; - } - - FPInputTile in_tile; - load_input_im2col_tile(in_tile, k4, m4, logical_K, logical_M); - - Int8InputBlock packed_block; - quantize_and_pack(packed_block, in_tile, inv_scale, zp); - - // Number of texels in the x dim of the output matrix - const int K4 = div_4(matrix_sizes.x); - write_block(packed_block, k4, m4, K4); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.yaml deleted file mode 100644 index 93f8269d607..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -quantize_and_pack_im2col: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: buffer - INPUT_STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: quantize_and_pack_im2col_buffer_texture3d - - NAME: quantize_and_pack_im2col_texture3d_texture3d - OUTPUT_STORAGE: texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.glsl deleted file mode 100644 index 6ba9343f10d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.glsl +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -$if GRANULARITY == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", DTYPE, "buffer")} - -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(push_constant) uniform restrict Block { - float inv_scale; - int zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "linear_int8_input_block.glslh" -#include "linear_fp_input_tile_load.glslh" - -void main() { - // Each input block contains 4x4 int8 quantized values, which are packed into - // a ivec4. k4 and m4 represent the "block index" of the current block being - // processed. - int k4 = int(gl_GlobalInvocationID.x); - int m4 = int(gl_GlobalInvocationID.y); - - const int K = input_sizes.x; - const int M = input_sizes.y; - - // K4 and M4 represent the number of blocks in each dimension. - const int K4 = div_up_4(K); - const int M4 = div_up_4(M); - - if (k4 >= K4 || m4 >= M4) { - return; - } - - // row of the input tensor to start loading from. 
Note the input tensor is - // interpreted as a t - const int m = mul_4(m4); - - const bool dont_check_bounds = (M - m) >= 4; - - FPInputTile in_tile; - if (dont_check_bounds) { - load_input_tile_no_checks(in_tile, k4, m, K4, M); - } else { - load_input_tile_with_checks(in_tile, k4, m, K4, M); - } - - Int8InputBlock packed_block; - quantize_and_pack(packed_block, in_tile, inv_scale, zp); - - write_block(packed_block, k4, m4, K4); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.yaml deleted file mode 100644 index 37721db1ba8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -quantize_and_pack_linear_input: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - INPUT_STORAGE: texture3d - STORAGE: texture3d - GRANULARITY: per_tensor - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: quantize_and_pack_linear_input_per_tensor_texture3d_texture3d - - NAME: quantize_and_pack_linear_input_per_tensor_buffer_texture3d - OUTPUT_STORAGE: buffer - - NAME: quantize_and_pack_linear_input_per_tensor_buffer_buffer - OUTPUT_STORAGE: buffer - INPUT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl deleted file mode 100644 index 7bf3a932c6c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("buffer")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - ivec4 blockSize; // bW, bH, bC, bN - ivec4 numBlocks; // tW/bW, tH/bH, tC/bC, tN/bN - ivec4 blockStride; // pre-computed linear strides for the block grid - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "int", "out_numel")} -${layout_declare_ubo(B, "ivec4", "t_in_sizes")} -${layout_declare_ubo(B, "ivec4", "t_in_strides")} -${layout_declare_ubo(B, "ivec4", "t_out_sizes")} -${layout_declare_ubo(B, "ivec4", "t_out_strides")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -#include "quantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); -const lowp ivec4 in_dim_order = unhash_dim_order(in_layout); - -/* - Quantization Shader (Buffer Storage) - This shader converts floating-point tensor values to n-bit integer representations - using pre-computed quantization parameters (scale and zero_point). The quantization - maps floating-point values to a discrete integer range while preserving the original - data distribution as much as possible. - - Important Considerations: - (+) All input tensors are assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - (+) The axis map layout is assumed to be a standard layout for scales and zero_points - (++) The scale and zero_point tensors must be implemented as buffers - - Workgroup Configuration: - - quantize_per_tensor - This mode applies uniform quantization across the entire tensor using a single scale - and zero_point value. 
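    For example, with scale = 0.05 and zero_point = 0, an input value of 1.26
    maps to round(1.26 / 0.05) + 0 = 25, before clamping to [quant_min, quant_max].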
- - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_token - This mode applies quantization individually to each token (or element) in the input, - using separate scale and zero_point values for each token. For instance if we have - a tensor of shape [B, S, H] then we have B*S tokens (and s+zp pairs) of H elements each. - - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_channel - This mode applies quantization separately to each channel of the input tensor, using - distinct scale and zero_point values for each channel. For example, if the tensor shape - is [B, C, H, W] and axis = 1, quantization parameters are computed per channel C, allowing - each channel to be quantized independently. - - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_block_wise - This mode applies quantization in blocks or groups of elements, allowing different scale - and zero_point values for each block. It is equivalent to quantize_affine, where quantization - parameters are affine transformations applied per block. For example, if the tensor shape - is [6, 9, 4] and blockSize = [3, 3, 2], then we have 12 blocks each with 18 elements. - - (*) global_wg_size: default - (*) local_wg_size: default - - Quantization Formula: - qvalue = clamp(round(value / scale) + zero_point, quant_min, quant_max). -*/ - -#ifdef per_tensor - -void quantize_per_tensor() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - OUT_T qvalue = quantize_val(value, float(t_scale[0]), int(t_zero_point[0])); - - t_out[out_bufi] = qvalue; -} - -#elif defined(per_token) - -void quantize_per_token() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - - int token_idx = 0; - - if (t_out_sizes.w > 1) { - // 4D tensor - token_idx = out_tidx.w * (t_out_sizes.z * t_out_sizes.y) + out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.z > 1) { - // 3D tensor - token_idx = out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.y > 1) { - // 2D tensor - token_idx = out_tidx.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - OUT_T qvalue = quantize_val(value, float(t_scale[token_idx]), int(t_zero_point[token_idx])); - - t_out[out_bufi] = qvalue; -} - -#elif defined(per_channel) - -void quantize_per_channel() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - - // Calculate channel index based on the quantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate system: - // axis 0 -> W dimension (tidx.x) - // axis 1 -> H dimension (tidx.y) - // axis 2 -> C dimension (tidx.z) - // axis 3 -> N dimension (tidx.w) - int channel_idx = 0; - - if (axis == 0) { - channel_idx = out_tidx.x; - } else if (axis == 1) { - channel_idx = out_tidx.y; - } else if (axis == 2) { - channel_idx = out_tidx.z; - } else if (axis == 3) { - 
channel_idx = out_tidx.w; - } - - channel_idx = min(channel_idx, num_channels - 1); - - OUT_T qvalue = quantize_val(value, float(t_scale[channel_idx]), int(t_zero_point[channel_idx])); - - t_out[out_bufi] = qvalue; -} - -#else // block_wise - -void quantize_block_wise() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - - const ivec4 bcoord = out_tidx / blockSize; - - const int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - const OUT_T qvalue = quantize_val(value, float(t_scale[block_id]), int(t_zero_point[block_id])); - - t_out[out_bufi] = qvalue; -} - -#endif - -void main() { - quantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml deleted file mode 100644 index fb5853ecd20..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml +++ /dev/null @@ -1,31 +0,0 @@ -quantize_buffer: - parameter_names_with_default_values: - IN_DTYPE: float - OUT_DTYPE: int32 - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - OUT_DTYPE: - - VALUE: uint8 - - VALUE: int8 - - VALUE: int32 - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: quantize_per_tensor_buffer - MODE: per_tensor - - NAME: quantize_per_token_buffer - MODE: per_token - - NAME: quantize_per_channel_buffer - MODE: per_channel - - NAME: quantize_block_wise_buffer - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl deleted file mode 100644 index 12e5769f50d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} - -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("texture3d")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict BlockPC { - ivec4 blockSize; // WHCN - ivec4 numBlocks; // (#W,#H,#C,#N) - ivec4 blockStride; // {1, #W, #W * #H, #W * #H * #C} - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "ivec3", "t_in_limits")} -${layout_declare_ubo(B, "ivec3", "t_out_limits")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -#include "quantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - Quantization Shader (Texture Storage) - This shader converts floating-point tensor values to n-bit integer representations - using pre-computed quantization parameters (scale and zero_point). The quantization - maps floating-point values to a discrete integer range while preserving the original - data distribution as much as possible. - - Important Considerations: - (+) All input tensors are assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - (+) The axis map layout is assumed to be a standard layout for scales and zero_points - (++) The scale and zero_point tensors must be implemented as buffers - - Workgroup Configuration: - - quantize_per_tensor - This mode applies uniform quantization across the entire tensor using a single scale - and zero_point value. - - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_token - This mode applies quantization individually to each token (or element) in the input, - using separate scale and zero_point values for each token. For instance if we have - a tensor of shape [B, S, H] then we have B*S tokens (and s+zp pairs) of H elements each. 
- - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_channel - This mode applies quantization separately to each channel of the input tensor, using - distinct scale and zero_point values for each channel. For example, if the tensor shape - is [B, C, H, W] and axis = 1, quantization parameters are computed per channel C, allowing - each channel to be quantized independently. - - (*) global_wg_size: default - (*) local_wg_size: Default with special handling for batch dimension. When quantizing along - the batch axis, Z dimension is set to 1 to ensure correct workgroup dispatching. Otherwise, - uses standard workgroup size derived from global workgroup dimensions. - - - quantize_block_wise - This mode applies quantization in blocks or groups of elements, allowing different scale - and zero_point values for each block. It is equivalent to quantize_affine, where quantization - parameters are affine transformations applied per block. For example, if the tensor shape - is [6, 9, 4] and blockSize = [3, 3, 2], then we have 12 blocks each with 18 elements. - - (*) global_wg_size: default - (*) local_wg_size: Default with special handling for batch dimension. When quantizing along - the batch axis, Z dimension is set to 1 to ensure correct workgroup dispatching. Otherwise, - uses standard workgroup size derived from global workgroup dimensions. - - Quantization Formula: - qvalue = clamp(round(value / scale) + zero_point, quant_min, quant_max). -*/ - -#ifdef per_tensor - -void quantize_per_tensor() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - FVEC4_T intex = load_texel(t_in, pos); - IVEC4_T outtex; - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, float(t_scale[0]), int(t_zero_point[0])); - outtex[i] = qvalue; - } - write_texel(t_out, pos, outtex); -} - -#elif defined(per_token) - -void quantize_per_token() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - FVEC4_T intex = load_texel(t_in, pos); - - int token_idx = 0; - ivec3 dims = t_in_limits; - - if (dims.z > 1) { - // 3D tensor - token_idx = pos.z * dims.y + pos.y; - } else if (dims.y > 1) { - // 2D tensor - token_idx = pos.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - // Scale and zero_point are prepacked as buffers, so direct access - float scale_val = float(t_scale[token_idx]); - int zero_point_val = int(t_zero_point[token_idx]); - - IVEC4_T outtex; - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - - write_texel(t_out, pos, outtex); -} - -#elif defined(per_channel) - -void quantize_per_channel() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - FVEC4_T intex = load_texel(t_in, pos); - IVEC4_T outtex; - - // Calculate channel index based on the quantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate system: - // axis 0 -> W dimension (pos.x for texture, but width-packed so pos.x * 4 + component) - // axis 1 -> H dimension (pos.y) - // axis 2 -> C dimension (pos.z / C), but for 4D tensors this includes batch-channel folding - // axis 3 -> N dimension (pos.z / N), but for 4D tensors this includes batch-channel folding - - if (axis == 0) { - 
// Width dimension - each texel component has different channel index - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - int channel_idx = pos.x * 4 + i; - channel_idx = min(channel_idx, num_channels - 1); - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } else if (axis == 1) { - // Height dimension - all texel components use same channel index - int channel_idx = pos.y; - channel_idx = min(channel_idx, num_channels - 1); - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } else if (axis == 2) { - // Channel dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual channel index from the folded dimension - int folded_idx = pos.z; - int channel_idx = folded_idx % num_channels; - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } else if (axis == 3) { - // Batch dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual batch index from the folded dimension - int folded_idx = pos.z; - int batch_idx = folded_idx / num_channels; - - float scale_val = float(t_scale[batch_idx]); - int zero_point_val = int(t_zero_point[batch_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } - - write_texel(t_out, pos, outtex); -} - -#else // block_wise - -void quantize_block_wise() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) - return; - - FVEC4_T intex = load_texel(t_in, pos); - IVEC4_T outtex; - - ivec4 base_tidx = ivec4(pos.x * 4, pos.y, pos.z, 0); - int foldedZ = pos.z; - - int C_total = numBlocks.z * blockSize.z; - - [[unroll]] for (int i = 0; i < 4; ++i) { - ivec4 tidx = ivec4(base_tidx.x + i, base_tidx.y, (foldedZ % C_total), (foldedZ / C_total)); - - ivec4 bcoord = tidx / blockSize; - int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, float(t_scale[block_id]), int(t_zero_point[block_id])); - outtex[i] = qvalue; - } - - write_texel(t_out, pos, outtex); -} - -#endif - -void main() { - quantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml deleted file mode 100644 index 03d418ff2f7..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml +++ /dev/null @@ -1,31 +0,0 @@ -quantize_texture: - parameter_names_with_default_values: - IN_DTYPE: float - OUT_DTYPE: int32 - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - OUT_DTYPE: - - VALUE: uint8 - 
- VALUE: int8 - - VALUE: int32 - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: quantize_per_tensor_texture3d - MODE: per_tensor - - NAME: quantize_per_token_texture3d - MODE: per_token - - NAME: quantize_per_channel_texture3d - MODE: per_channel - - NAME: quantize_block_wise_texture3d - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl deleted file mode 100644 index 7a6263d9f55..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. -#define MAX_NTHREADS 16 - - -shared vec4 shared_vecs[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -/* - * The functions below compute reduction along a single dimension for a tensor. - * The shader template generalize reduction by abstracting the initial value of - * the accumulator, the calculation used to update the accumulator with new - * values, and a postprocessing calculation that can be used to modify the - * accumulator before writing to output. - * - * This shader also utilize shared memory to have multiple threads help compute - * the max and sum reduction operations. A total of NGROUPS x NWORKERS threads - * are expected to be launched. Each group works on a unique reduction "row", and - * within a group NWORKERS threads co-operate to compute the max and sum of one - * "row". Each worker in the group is responsible for computing a partial output - * of the "row" and uploading it to shared memory; the overall reduction output - * can then be determined by aggregating the partial outputs stored in shared - * memory. - * - * As a caveat, this shader does not currently support cases where `batch` > 1 - * and the reduce dim happens to also be the batch concatenation dim. To support - * this, there will need to be additional logic to set the starting value of - * `scan_pos[reduce_dim]`. 
Since this is not expected to be a common use-case, - * supporting this case is left as an exercise for when it is required. - */ - -// Initializing the accumulator accepts the first value in the reduction row, -// since some reduction operations (i.e. amax, amin) prefer to initialize with -// a data point instead of a static value. -#define INIT_ACCUM(first_val) ${INIT_ACCUM} -#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM} -// Useful for operators such as mean which want to perform a final calculation -// with the accumulator. -#define POSTPROCESS(accum) ${POSTPROCESS} - -/* - * Computes reduction where the reduction dim is orthogonal to the packed dim. - * This case is simpler because each element of a texel belongs to a separate - * reduction "group", meaning we don't have to perform reduction along a texel. - */ -void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos)); - - scan_pos[reduce_dim] = tid.x; - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of - // the reduction row - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); - } - // Write partial output to shared memory and synchronize work group - shared_vecs[smi] = accum; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - // Iterate over the partial outputs to obtain the overall output - int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - accum[i] = 0; - } - } - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, POSTPROCESS(accum)); - } -} - -/* - * Compute reduction where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. - */ -void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); - // Only reduce up to the last "complete" texel. The last texel will need to be - // handled specially if it has padding elements. 
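
The cooperative scheme described in the comments above — NWORKERS threads each accumulate a strided slice of a reduction row, write their partial result to shared memory, and the group's main thread combines the partials — can be sketched on the host side. A minimal NumPy sketch, with a generic `update` callable standing in for UPDATE_ACCUM; the function and names are illustrative only, not part of the shader library:

```python
import numpy as np

NWORKERS = 4  # mirrors the NWORKERS define above

def cooperative_reduce(row, update, init_from_first=False):
    # Each worker accumulates elements i, i + NWORKERS, i + 2*NWORKERS, ...
    # and writes its partial result into a "shared memory" slot.
    partials = []
    for worker in range(NWORKERS):
        acc = row[0] if init_from_first else 0.0
        for x in row[worker::NWORKERS]:
            acc = update(acc, x)
        partials.append(acc)
    # The "main" thread of the group then aggregates the partial outputs.
    out = partials[0]
    for p in partials[1:]:
        out = update(out, p)
    return out

row = np.arange(10, dtype=np.float32)
assert np.isclose(cooperative_reduce(row, lambda a, b: a + b), row.sum())
assert np.isclose(cooperative_reduce(row, max, init_from_first=True), row.max())
```
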
- const int reduce_len = tin_sizes[packed_dim] - nspill; - - scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); - - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of - // the reduction row - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const vec4 intex = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; i++) { - accum.x = UPDATE_ACCUM(accum.x, intex[i]); - } - } - // Write partial output to shared memory and synchronize work group - shared_vecs[smi] = accum; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - // Iterate over the partial maximums to obtain the overall maximum - int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); - } - // Each element of the texel is itself a partial maximum; iterate over the - // texel to find the actual maximum - float accum_final = accum.x; - [[unroll]] for (int i = 1; i < 4; i++) { - accum_final = UPDATE_ACCUM(accum[i], accum_final); - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, POSTPROCESS(vec4(accum_final, 0, 0, 0))); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - reduce_nonpacked_dim(tid, scan_pos); - } else { - reduce_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml deleted file mode 100644 index 21a7132b8db..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -reduce: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - INIT_ACCUM: VEC4_T(0) - UPDATE_ACCUM: accum + new_val - POSTPROCESS: accum - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: sum - - NAME: mean - POSTPROCESS: (accum / tin_sizes[reduce_dim]) - - NAME: amax - INIT_ACCUM: first_val - UPDATE_ACCUM: max(accum, new_val) - POSTPROCESS: accum - - NAME: amin - INIT_ACCUM: first_val - UPDATE_ACCUM: min(accum, new_val) - POSTPROCESS: accum diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl deleted file mode 100644 index 98370a9bcde..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim1 = 0; -layout(constant_id = 5) const int reduce_dim2 = 1; -layout(constant_id = 6) const int group_dim = 2; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. -#define MAX_NTHREADS 16 - - -shared vec4 shared_vecs[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -// Initializing the accumulator accepts the first value in the reduction row, -// since some reduction operations (i.e. amax, amin) prefer to initialize with -// a data point instead of a static value. -#define INIT_ACCUM(first_val) ${INIT_ACCUM} -#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM} -// Useful for operators such as mean which want to perform a final calculation -// with the accumulator. 
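
From the reduce.yaml entries above, each reduction variant is fully determined by how INIT_ACCUM, UPDATE_ACCUM and POSTPROCESS are filled in. A small Python sketch of that template pattern; the triples paraphrase the YAML, and the function itself is illustrative rather than part of the runtime:

```python
# (init, update, postprocess) triples mirroring the shader_variants in reduce.yaml
VARIANTS = {
    "sum":  (lambda first: 0.0,   lambda a, x: a + x,     lambda a, n: a),
    "mean": (lambda first: 0.0,   lambda a, x: a + x,     lambda a, n: a / n),
    "amax": (lambda first: first, lambda a, x: max(a, x), lambda a, n: a),
    "amin": (lambda first: first, lambda a, x: min(a, x), lambda a, n: a),
}

def reduce_row(row, variant):
    init, update, post = VARIANTS[variant]
    acc = init(row[0])
    for x in row:
        acc = update(acc, x)
    return post(acc, len(row))

assert reduce_row([1.0, 2.0, 3.0], "mean") == 2.0
assert reduce_row([1.0, 5.0, 3.0], "amax") == 5.0
```

reduce2d applies the same triple across two dimensions, which is why its mean2d variant divides by the product of both reduced sizes.
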
-#define POSTPROCESS(accum) ${POSTPROCESS} - -void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - scan_pos[reduce_dim1] = 0; - scan_pos[reduce_dim2] = 0; - vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos)); - - // First dimension reduction - scan_pos[reduce_dim1] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim1]; - i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) { - - // Second dimension reduction - scan_pos[reduce_dim2] = 0; - for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); - } - } - - // Write partial output to shared memory and synchronize - shared_vecs[smi] = accum; - barrier(); - - // Main thread aggregates results - if (tid.x == 0) { - // Iterate over the partial outputs to obtain the overall output - int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - accum[i] = 0; - } - } - scan_pos[reduce_dim1] = 0; - scan_pos[reduce_dim2] = 0; - write_texel(tout, scan_pos, POSTPROCESS(accum)); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim1] = 0; - scan_pos[reduce_dim2] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim1], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - reduce_2d_non_packed_dim(tid, scan_pos); -} \ No newline at end of file diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.yaml deleted file mode 100644 index fdc5eb9f105..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -reduce2d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - INIT_ACCUM: VEC4_T(0) - UPDATE_ACCUM: accum + new_val - POSTPROCESS: accum - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: sum2d - - NAME: mean2d - POSTPROCESS: (accum / (tin_sizes[reduce_dim1] * tin_sizes[reduce_dim2])) - - NAME: amax2d - INIT_ACCUM: first_val - UPDATE_ACCUM: max(accum, new_val) - POSTPROCESS: accum - - NAME: amin2d - INIT_ACCUM: first_val - UPDATE_ACCUM: min(accum, new_val) - POSTPROCESS: accum diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl deleted file mode 100644 index 441cd57c17d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - // source tensor sizes in WHCB dims respectively - ivec4 src_dims; - // destination tensor repeats in WHCB dims respectively - ivec4 dst_repeats; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // expand position in packed dim - pos[packed_dim] <<= 2; - - // channel size aligned by 4 when tensors are channel packed raw value otherwise - const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z); - - // find input texel's WHCB index - const int width_index = pos.x % src_dims.x; - const int height_index = pos.y % src_dims.y; - int channel_index; - int batch_index; - - // if tensors are channel packed - if (packed_dim == C_DIM) { - // the output channels in a batch will be channel size * channel repetitions aligned by 4 - const int out_channel_size = alignup4(src_dims.z * dst_repeats.z); - - // batch index in the output - const int out_pos_batch_index = pos.z / out_channel_size; - - // source batch index for based on current output pos - batch_index = out_pos_batch_index % src_dims.w; - - // batch repetition count for current output pos - const int batch_repetition_index = out_pos_batch_index / src_dims.w; - - // calculate input channel index based on current output pos and batch index - // its done this way because we want source channel to restart from zero when a batch index increments - // also batch_index will reset to zero after hitting batch repetition count - // so track the current repetition in batch_repetition_index so it can be used for determining current_index - channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z; - } else { - // the output channels in a batch will be channel size * channel repetitions - const int out_channel_size = src_dims.z * dst_repeats.z; - - // source batch index for based on current output pos - batch_index = (pos.z / out_channel_size) % src_dims.w; - - // source channel index is current output pos wrapped based on channel count - channel_index = pos.z % src_dims.z; - } - - // input texel's WCB position - const ivec3 in_pos = ivec3(width_index, height_index, channel_index); - - // squeeze position in packed dim - pos[packed_dim] >>= 2; - - // packed dim index of texel last fetched - int fetched_in_pos_packed_dim = -1; - - // fetched input texel - VEC4_T in_value; - - // output texel value - VEC4_T out_value = VEC4_T(0); - - int src_lane_offset = in_pos[packed_dim]; - - for (int i=0; i<4; i++) { - if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) { - fetched_in_pos_packed_dim = (src_lane_offset >> 2); - - ivec3 
curr_in_pos = in_pos; - curr_in_pos[packed_dim] = src_lane_offset; - curr_in_pos.z = curr_in_pos.z + batch_index * channel_size; - curr_in_pos[packed_dim] >>= 2; - - in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map)); - } - - out_value[i] = in_value[src_lane_offset & 0x3]; - - src_lane_offset++; - // if packed index exceeded source packed dim round to zero - src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]); - } - - write_texel_lpos( - t_out, - pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml deleted file mode 100644 index f40d94142e1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml +++ /dev/null @@ -1,14 +0,0 @@ -repeat: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 - shader_variants: - - NAME: repeat diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl deleted file mode 100644 index 42c7f86aea8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict RepeatArgs { - // With input_size (n, c_i, h, w) and repeat r - // out_size == (n, c_i * r, h, w) - ivec4 out_sizes; - ivec4 in_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim); - - if (any(greaterThanEqual(out_whcn, out_sizes))) { - return; - } - - VEC4_T v; - // Loop over the 4 elements in texel, calculate the corresponding elem, and - // fetch. Not most efficient algorithm because likely we fetch same texel - // multiple times in this loop. 
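
As the comments above note, the repeat shaders resolve every output element back to a source element; ignoring texel packing and alignment, that mapping is simply a modulo by the source size along each WHCB dimension. A hedged NumPy reference of that element-level relationship (a host-side check, not the shaders' texel-aligned implementation):

```python
import numpy as np

def repeat_reference(src, repeats):
    # Every output index maps to the source index modulo the source size
    # along each dimension, which is the relationship the shaders compute per texel.
    out_shape = tuple(s * r for s, r in zip(src.shape, repeats))
    out = np.empty(out_shape, dtype=src.dtype)
    for idx in np.ndindex(out_shape):
        src_idx = tuple(i % s for i, s in zip(idx, src.shape))
        out[idx] = src[src_idx]
    return out

x = np.arange(6, dtype=np.float32).reshape(1, 2, 3, 1)
assert np.array_equal(repeat_reference(x, (2, 1, 2, 1)), np.tile(x, (2, 1, 2, 1)))
```
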
- - for (int i=0; i<4;i++) { - ivec4 in_whcn = out_whcn; - in_whcn.z = (out_whcn.z + i) % in_sizes.z; - - ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim); - - v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w]; - } - - imageStore(image_out, out_pos, v); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml deleted file mode 100644 index 4147e82965a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repeat_channel: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: repeat_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl deleted file mode 100644 index 1a8e677a38f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "tin_limits")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "tout_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 tout_axis_map = unhash_axis_map(tout_layout); - -${layout_declare_spec_const(C, "int", "tin_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 tin_axis_map = unhash_axis_map(tin_layout); - -${layout_declare_spec_const(C, "int", "nrepeats", "1")} -${layout_declare_spec_const(C, "int", "repeat_dim", "1")} - -void main() { - const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(tin_lpos, tin_limits))) { - return; - } - - const VEC4_T intex = load_texel_lpos(tin, tin_lpos, tin_axis_map); - - ivec3 tout_lpos = tin_lpos; - tout_lpos[repeat_dim] *= nrepeats; - - for (int i = 0; i < nrepeats; ++i, tout_lpos[repeat_dim]++) { - write_texel_lpos(tout, tout_lpos, intex, tout_axis_map); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml deleted file mode 100644 index 5c284a580c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repeat_interleave: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: repeat_interleave diff --git a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.glsl deleted file mode 100644 index 30375728921..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.glsl +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
-
-${define_required_extensions(DTYPE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "xqout", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "w", "xkout", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "xq", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "xk", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "freqs_cos", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "freqs_sin", DTYPE, STORAGE)}
-${layout_declare_ubo(B, "ivec3", "xqout_limits")}
-${layout_declare_ubo(B, "ivec3", "xkout_limits")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = 0;
-
-#include "indexing_utils.h"
-
-/*
- * This shader computes rotary positional embeddings which are used in the Llama
- * model architecture. There are 4 input tensors with the following shapes.
- * Note that head_dim = embedding_dim / num_heads
- *
- * 1. xq (batch_size, sequence_len, num_heads, head_dim)
- * 2. xk (batch_size, sequence_len, num_kv_heads, head_dim)
- * 3. freqs_cos (sequence_len, head_dim / 2)
- * 4. freqs_sin (sequence_len, head_dim / 2)
- *
- * Two output tensors are produced, with the same shapes as xq and xk
- * respectively.
- *
- * The computation of rotary positional embeddings can be summarized with the
- * following equations:
- *
- * xq_out[2i] = xq[2i] * freqs_cos[i] - xq[2i + 1] * freqs_sin[i]
- * xq_out[2i + 1] = xq[2i] * freqs_sin[i] + xq[2i + 1] * freqs_cos[i]
- *
- * Essentially, taking each row along head_dim of the xq and xk tensors, each
- * row is split into even and odd elements (xq[2i] and xq[2i + 1] respectively).
- * The even components of the output multiply the even components of the inputs
- * with the freqs_cos tensor, and the odd components of the inputs with the
- * freqs_sin tensor. The odd components of the output swap this. Throughout the
- * implementation the even components have the _r suffix and the odd components
- * have the _i suffix; this is a reference to complex numbers which can be used
- * to represent rotations.
- *
- * Note that this implementation assumes that all input tensors have the width
- * dim as the packed dim.
- */
-void main() {
-  // Each thread will write to two output locations to maximize data re-use.
-  // One texel loaded from the freqs_cos/freqs_sin tensors can be used to
-  // calculate two output texels.
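
The equations in the comment above can be checked directly on the host. A minimal NumPy sketch of the even/odd pairing, assuming a single position's freqs_cos/freqs_sin row; the code is illustrative and not part of the library:

```python
import numpy as np

def apply_rotary(x, freqs_cos, freqs_sin):
    # x: (..., head_dim); freqs_cos/freqs_sin: (head_dim // 2,) for one position.
    x_r = x[..., 0::2]                      # even elements, the "_r" parts
    x_i = x[..., 1::2]                      # odd elements, the "_i" parts
    out_r = x_r * freqs_cos - x_i * freqs_sin
    out_i = x_r * freqs_sin + x_i * freqs_cos
    out = np.empty_like(x)
    out[..., 0::2] = out_r
    out[..., 1::2] = out_i
    return out

head_dim = 8
x = np.random.randn(2, head_dim).astype(np.float32)
theta = np.random.randn(head_dim // 2).astype(np.float32)
y = apply_rotary(x, np.cos(theta), np.sin(theta))
# Each (even, odd) pair is rotated by the same angle, so per-row norms are preserved.
assert np.allclose(np.linalg.norm(y, axis=-1), np.linalg.norm(x, axis=-1), atol=1e-5)
```
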
- const ivec3 x_pos_1 = ivec3( - gl_GlobalInvocationID.x * 2, gl_GlobalInvocationID.yz); - const ivec3 x_pos_2 = ivec3(x_pos_1.x + 1, x_pos_1.yz); - - if (any(greaterThanEqual(x_pos_2, xqout_limits))) { - return; - } - - const ivec3 freqs_pos = ivec3(gl_GlobalInvocationID.xz, 0); - - VEC4_T cos_tex = load_texel(freqs_cos, freqs_pos); - VEC4_T sin_tex = load_texel(freqs_sin, freqs_pos); - - // Compute xqout - - VEC4_T x_tex_1 = load_texel(xq, x_pos_1); - VEC4_T x_tex_2 = load_texel(xq, x_pos_2); - - // Separate into even and odd elements - VEC4_T x_r = VEC4_T(x_tex_1.xz, x_tex_2.xz); - VEC4_T x_i = VEC4_T(x_tex_1.yw, x_tex_2.yw); - - VEC4_T xout_r = x_r * cos_tex - x_i * sin_tex; - VEC4_T xout_i = x_r * sin_tex + x_i * cos_tex; - - VEC4_T xout_tex_1 = VEC4_T(xout_r.x, xout_i.x, xout_r.y, xout_i.y); - VEC4_T xout_tex_2 = VEC4_T(xout_r.z, xout_i.z, xout_r.w, xout_i.w); - - write_texel(xqout, x_pos_1, xout_tex_1); - write_texel(xqout, x_pos_2, xout_tex_2); - - // n_heads will be greater than or equal to n_kv_heads, therefore xq and xqout - // may have a larger height dim than xk and xkout. Only compute xkout if this - // invocation is still within bounds. - if (any(greaterThanEqual(x_pos_2, xkout_limits))) { - return; - } - - // Compute xkout - - x_tex_1 = load_texel(xk, x_pos_1); - x_tex_2 = load_texel(xk, x_pos_2); - - x_r = VEC4_T(x_tex_1.xz, x_tex_2.xz); - x_i = VEC4_T(x_tex_1.yw, x_tex_2.yw); - - xout_r = x_r * cos_tex - x_i * sin_tex; - xout_i = x_r * sin_tex + x_i * cos_tex; - - xout_tex_1 = VEC4_T(xout_r.x, xout_i.x, xout_r.y, xout_i.y); - xout_tex_2 = VEC4_T(xout_r.z, xout_i.z, xout_r.w, xout_i.w); - - write_texel(xkout, x_pos_1, xout_tex_1); - write_texel(xkout, x_pos_2, xout_tex_2); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.yaml b/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.yaml deleted file mode 100644 index a81fd564d10..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.yaml +++ /dev/null @@ -1,10 +0,0 @@ -rotary_embedding: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: rotary_embedding diff --git a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.glsl b/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.glsl deleted file mode 100644 index 09857451f7c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} -${define_required_extensions(SCALAR_VALUE_TYPE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(B, buffer_scalar_type(SCALAR_VALUE_TYPE), "scalar_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#ifdef USING_BUFFER - -void main() { - const int i = int(gl_GlobalInvocationID.x); - - if (i > 0) { - return; - } - - t_out[i] = BUF_T(scalar_value); -} - -# else // !USING_BUFFER - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Scalar tensor is a special case where the packed dim is always 1. - if (any(greaterThanEqual(pos, ivec3(1)))) { - return; - } - - VEC4_T outtex = VEC4_T(scalar_value); - write_texel(t_out, pos, outtex); -} - -#endif // !USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.yaml b/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.yaml deleted file mode 100644 index cd45b80c4dc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -scalar_tensor: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - SCALAR_VALUE_TYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - STORAGE: - - VALUE: texture3d - - VALUE: buffer - SCALAR_VALUE_TYPE: - - VALUE: float - - VALUE: int32 - - VALUE: bool - shader_variants: - - NAME: scalar_tensor diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl deleted file mode 100644 index 1e854bf7f85..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "rw", "attn_weight", DTYPE, STORAGE)} - -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "ivec4", "attn_weight_sizes")} - ${layout_declare_ubo(B, "ivec4", "attn_weight_strides")} -$else: - ${layout_declare_ubo(B, "ivec3", "attn_weight_limits")} - -${layout_declare_ubo(B, "int", "input_pos")} -${layout_declare_ubo(B, "float", "scale")} - - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Negative infinity is represented by having sign bit be 1, all exponent bits -// be 1, all mantissa bits be 0. 
-#define NEGATIVE_INF_BITS 0xFF800000
-const float negative_infinity = NEGATIVE_INF_BITS;
-
-#ifdef USING_BUFFER
-
-/*
- * This implementation applies a scale and mask to the attention weight tensor
- * of an SDPA block. The size of the attention weight is
- * (batch_size, n_heads, seq_len, input_pos + seq_len)
- * Conceptually the weights represent the relationship between each token in the
- * sequence with each token preceding it.
- *
- * The scale applied is 1.0 / sqrt(head_dim_length)
- *
- * The mask applied is a bit more complicated. Imagine you create a square
- * matrix of size (input_pos + seq_len, input_pos + seq_len), and then set the
- * upper triangular section of the matrix to -inf. Then, slice the matrix along
- * the row dimension starting from input_pos to input_pos + seq_len. You end up
- * with a partial mask with size (seq_len, input_pos + seq_len). This is the
- * mask that is applied to the attention weight.
- *
- * In the shader, instead of generating the mask, the index of the element is
- * inspected to determine if it would have been masked. Given an element at
- * tensor index (n, c, h, w), it would be masked if w > h + input_pos.
- */
-
-/***************************
- ** Buffer Implementation **
- ***************************/
-
-void main() {
-  const ivec4 attn_weight_idx = ivec4(
-      gl_GlobalInvocationID.x,
-      gl_GlobalInvocationID.y,
-      gl_GlobalInvocationID.z,
-      0);
-
-  if (any(greaterThanEqual(attn_weight_idx, attn_weight_sizes))) {
-    return;
-  }
-
-  const T scale_conv = T(scale);
-
-  const int attn_weight_id = tidx_to_bufi(attn_weight_idx, attn_weight_strides);
-  if (attn_weight_idx.x <= attn_weight_idx.y + input_pos) {
-    attn_weight[attn_weight_id] = attn_weight[attn_weight_id] * scale_conv;
-  } else {
-    attn_weight[attn_weight_id] = T(negative_infinity);
-  }
-}
-
-#else
-
-/****************************
- ** Texture Implementation **
- ****************************/
-
-/*
- * This implementation assumes that the attention weight is width packed, i.e.
- * the packed dim of the attn_weight is 0.
- */
-void main() {
-  const ivec3 attn_weight_pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(attn_weight_pos, attn_weight_limits))) {
-    return;
-  }
-
-  vec4 outtex = imageLoad(attn_weight, attn_weight_pos) * scale;
-
-  // Mask out the upper triangular of attn_weight to -inf
-  [[unroll]] for (int i = 0; i < 4; ++i) {
-    if (attn_weight_pos.x * 4 + i > attn_weight_pos.y + input_pos) {
-      outtex[i] = negative_infinity;
-    }
-  }
-
-  write_texel(attn_weight, attn_weight_pos, outtex);
-}
-
-#endif // USING_BUFFER
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml
deleted file mode 100644
index ca8806fe000..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-sdpa_attn_weight_scale_and_mask:
-  parameter_names_with_default_values:
-    DTYPE: float
-    STORAGE: buffer
-  generate_variant_forall:
-    STORAGE:
-      - VALUE: buffer
-      - VALUE: texture3d
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: sdpa_attn_weight_scale_and_mask
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
deleted file mode 100644
index 6509015b4b6..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
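
A NumPy sketch of the scale-and-mask semantics documented above, materializing the mask for clarity where the shader instead tests each element's index; all names here are illustrative:

```python
import numpy as np

def scale_and_mask(attn_weight, head_dim, input_pos):
    # attn_weight: (batch, n_heads, seq_len, input_pos + seq_len)
    scale = 1.0 / np.sqrt(head_dim)
    seq_len, ctx_len = attn_weight.shape[-2:]
    h = np.arange(seq_len)[:, None]          # query (row) index
    w = np.arange(ctx_len)[None, :]          # key (column) index
    out = attn_weight * scale
    out[..., w > h + input_pos] = -np.inf    # mask positions in the query's future
    return out

aw = np.random.randn(1, 2, 3, 5).astype(np.float32)
masked = scale_and_mask(aw, head_dim=64, input_pos=2)
assert np.isinf(masked[0, 0, 0, 4]) and not np.isinf(masked[0, 0, 0, 2])
```
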
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef SELECT_GLSLH -#define SELECT_GLSLH - -#ifndef USING_BUFFER - -/* - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: - * - * 1. The input and output textures have the same packed dimension. - * 2. The selected_dim must not be the packed dimension of the input. - * 3. The packed dimension of the input must "map" to the packed dimension of - * the output. This occurs if selected_dim is greater than the packed dimension - * of the input. - */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (selected_dim <= in_packed_dim) { - return false; - } - return true; -} - -#endif // USING_BUFFER - -/* - * Given an output tensor index, return the corresponding input tensor index for - * the select operator. This is done by "inserting" the select index at the - * selected_dim in the input tensor index. - * - * A simple example is (note all tensor index are in WHCN order): - * out_tidx = [7, 5, 9] - * selected_dim = 2 - * index = 3 - * in_tidx = [7, 3, 5, 9] - * - * This function assumes that the following variables are defined in the layout: - * - in_sizes - * - selected_dim - * - index - */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = ivec4(0); - - int adjusted_index = index; - if (index < 0) { - adjusted_index = index + in_sizes[selected_dim]; - } - - // Handle different dimensions for selection - if (selected_dim == 0) { - // Select from width dimension - in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z); - } else if (selected_dim == 1) { - // Select from height dimension - in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z); - } else if (selected_dim == 2) { - // Select from channel dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z); - } else if (selected_dim == 3) { - // Select from batch dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index); - } - - return in_tidx; -} - -#endif // SELECT_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/set_zero.glsl b/backends/vulkan/runtime/graph/ops/glsl/set_zero.glsl deleted file mode 100644 index d01780b9e30..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/set_zero.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
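
The two out_tidx_to_in_tidx helpers above (select.glslh and slice.glslh) perform small index rewrites that are easy to mirror on the host. A sketch of both mappings, assuming WHCN index tuples; the helper names are reused only for readability and the code is not part of the runtime:

```python
def select_out_to_in(out_tidx, selected_dim, index, in_sizes):
    # Insert the (possibly negative) select index at selected_dim.
    if index < 0:
        index += in_sizes[selected_dim]
    in_tidx = list(out_tidx)
    in_tidx.insert(selected_dim, index)
    return tuple(in_tidx)

def slice_out_to_in(out_tidx, selected_dim, start, step, in_sizes):
    # Offset the sliced dim by start (wrapped if negative) and stride by step.
    if start < 0:
        start += in_sizes[selected_dim]
    in_tidx = list(out_tidx)
    in_tidx[selected_dim] = start + out_tidx[selected_dim] * step
    return tuple(in_tidx)

assert select_out_to_in((2, 4, 6), 1, -1, (8, 5, 7, 3)) == (2, 4, 4, 6)
assert slice_out_to_in((2, 4, 6, 0), 1, 1, 2, (8, 16, 7, 3)) == (2, 9, 6, 0)
```
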
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} - -${layout_declare_ubo(B, "int", "out_numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { - return; - } - - t_out[out_bufi] = T(0); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/set_zero.yaml b/backends/vulkan/runtime/graph/ops/glsl/set_zero.yaml deleted file mode 100644 index cee87c468b1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/set_zero.yaml +++ /dev/null @@ -1,8 +0,0 @@ -set_zero: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: int32 - shader_variants: - - NAME: set_zero diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh deleted file mode 100644 index 87325754f4d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef SLICE_GLSLH -#define SLICE_GLSLH - -#ifndef USING_BUFFER - -/** - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: - * - * 1. The input and output textures have the same packed dimension. - * 2. The select_dim must not be the packed dimension of the input. - */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (in_packed_dim == selected_dim) { - return false; - } - return true; -} - -#endif // USING_BUFFER - -/* - * Converts output tensor indices to input tensor indices for the slice operation. - * This function maps the output indices to the corresponding input indices based on - * the slice parameters (start, step, selected_dim). - * - * Parameters assumed to be defined in the layout specifier: - * - in_sizes - * - selected_dim - * - start - * - step - */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = out_tidx; - - int adjusted_start = start; - if (start < 0) { - adjusted_start = start + in_sizes[selected_dim]; - } - - in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step; - - return in_tidx; -} - -#endif // SLICE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl deleted file mode 100644 index d35492bc367..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define op1(X) ${OPERATOR1} - -#define op2(X, Y) ${OPERATOR2} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tout_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. -#define MAX_NTHREADS 16 - -shared vec4 shared_vecs[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -/* - * The shaders below compute softmax for a tensor. Softmax is an interesting mix - * between a reduction operator and a unary elementwise operator, defined as - * exp(x) / (sum of exp(x)). The general flow of the computation is: - * - * First, find the maximum element along the reduction dim. The maximum element - * is used to preserve numerical stability, since division of exponents is - * translation invariant. - * - * Next, compute the sum of exp(x - max_element) along the reduction dim. - * - * Finally, for each element along the reduction dim, we compute the output as - * exp(x - max_element) / sum_of_exponents. - * - * The shaders below also utilize shared memory to have multiple threads help - * compute the max and sum reduction operations. A total of NGROUPS x NWORKERS - * threads are launched. Each group works on a unique reduction "row", and - * within a group NWORKERS threads co-operate to compute the max and sum of one - * "row". Each worker in the group is responsible for computing a partial output - * of the "row" and uploading it to shared memory; the overall reduction output - * can then be determined by aggregating the partial outputs stored in shared - * memory. - * - * As a caveat, this shader does not currently support cases where `batch` > 1 - * and the reduce dim happens to also be the batch concatenation dim. To support - * this, there will need to be additional logic to set the starting value of - * `scan_pos[reduce_dim]`. Since this is not expected to be a common use-case, - * supporting this case is left as an exercise for when it is required. - * - * As a final note, log softmax is supported with this shader as well since via - * the op1 and op2 macro definitions. See the corresponding YAML file for more - * details. - */ - -/* - * Computes softmax where the reduction dim is orthogonal to the packed dim. - * This case is simpler because each element of a texel belongs to a separate - * reduction dim, meaning we don't have to perform reduction along a texel. 
- */ -void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - // used to iterate over all shared memory in the group - int group_i; - - scan_pos[reduce_dim] = tid.x; - vec4 max_elements = load_texel(tin, scan_pos); - // This thread computes a partial maximum - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - max_elements = max(max_elements, load_texel(tin, scan_pos)); - } - shared_vecs[smi] = max_elements; - barrier(); - // Iterate over the partial maximums to obtain the overall maximum - group_i = tid.y * NWORKERS; - max_elements = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); - } - - scan_pos[reduce_dim] = tid.x; - vec4 denominators = vec4(0); - // Compute partial sum - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - denominators += exp(load_texel(tin, scan_pos) - max_elements); - } - shared_vecs[smi] = denominators; - barrier(); - // Iterate over the partial sums to obtain the overall sum - group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tout_limits[packed_dim] - 1); - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - const vec4 numerators = op1(load_texel(tin, scan_pos) - max_elements); - vec4 outtex = op2(numerators, denominators); - // For the last texel in the packed dim, make sure that the padding elements - // are explicitly set to 0. Otherwise, they may influence computations later - // down the line. - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; ++i) { - outtex[i] = 0; - } - } - write_texel(tout, scan_pos, outtex); - } -} - -/* - * Compute softmax where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. 
- */ -void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - // used to iterate over all shared memory in the group - int group_i; - - const int nspill = mod4(tin_sizes[packed_dim]); - const int reduce_len = tin_sizes[packed_dim] - nspill; - - scan_pos[reduce_dim] = tid.x; - vec4 max_elements = vec4(load_texel(tin, scan_pos).x); - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - max_elements = max(max_elements, load_texel(tin, scan_pos)); - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1 && nspill > 0) { - const vec4 intex = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; ++i) { - max_elements.x = max(intex[i], max_elements.x); - } - } - shared_vecs[smi] = max_elements; - barrier(); - // Iterate over the partial maximums to obtain the overall maximum - group_i = tid.y * NWORKERS; - max_elements = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); - } - // Each element of the texel is itself a partial maximum; iterate over the - // texel to find the actual maximum - float max_element = max_elements.x; - [[unroll]] for (int i = 1; i < 4; ++i) { - max_element = max(max_elements[i], max_element); - } - - scan_pos[reduce_dim] = tid.x; - vec4 denominators = vec4(0); - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - denominators += exp(load_texel(tin, scan_pos) - max_element); - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) { - const vec4 intex = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; ++i) { - denominators.x += exp(intex[i] - max_element); - } - } - shared_vecs[smi] = denominators; - barrier(); - // Iterate over the partial sums to obtain the overall sum - group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; - } - // Reduce over the accumulated texel to find the overall sum - float denominator = 0; - [[unroll]] for (int i = 0; i < 4; ++i) { - denominator += denominators[i]; - } - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - const vec4 numerators = op1(load_texel(tin, scan_pos) - max_element); - write_texel(tout, scan_pos, op2(numerators, denominator)); - } - // For the last texel in the dim, if there are padding elements then the - // padding elements need to be set to 0 explicitly, otherwise they may - // influence subsequent operations. 
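
Both the packed-dim reduction earlier and softmax_packed_dim below hinge on the same guard: when the packed dim is not a multiple of 4, the last texel carries padding lanes that must not contribute to the result. A small NumPy sketch of that guard for a max reduction (host-side and illustrative only):

```python
import numpy as np

def packed_max(row):
    row = np.asarray(row, dtype=np.float32)
    nspill = len(row) % 4                     # mod4(size): valid lanes in the last texel
    pad = (4 - nspill) % 4
    texels = np.pad(row, (0, pad)).reshape(-1, 4)
    acc = np.full(4, row[0])                  # INIT_ACCUM(first_val)
    full = texels[:-1] if nspill else texels  # whole texels reduce lane-wise
    for t in full:
        acc = np.maximum(acc, t)
    if nspill:                                # partial texel: only touch the valid lanes
        for lane in texels[-1][:nspill]:
            acc[0] = max(acc[0], lane)
    return float(acc.max())

vals = [-3.0, -7.5, -2.0, -1.0, -4.0]         # all negative: padding zeros must be ignored
assert packed_max(vals) == -1.0
```
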
- if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) { - const vec4 numerator = op1(load_texel(tin, scan_pos) - max_element); - vec4 outtex = op2(numerator, denominator); - [[unroll]] for (int i = nspill; i < 4; ++i) { - outtex[i] = 0; - } - write_texel(tout, scan_pos, outtex); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tout_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - softmax_nonpacked_dim(tid, scan_pos); - } else { - softmax_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml b/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml deleted file mode 100644 index d50bbb85f33..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -softmax: - parameter_names_with_default_values: - OPERATOR1: exp(X) - OPERATOR2: X / Y - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: softmax - - NAME: log_softmax - OPERATOR1: X - OPERATOR2: X - log(Y) diff --git a/backends/vulkan/runtime/graph/ops/glsl/tan.glsl b/backends/vulkan/runtime/graph/ops/glsl/tan.glsl deleted file mode 100644 index 876cd43ad08..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/tan.glsl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
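
The three-pass flow the softmax shader above implements (max for numerical stability, sum of shifted exponents, then normalize) has a direct host-side analogue, with log_softmax expressed through the same op1/op2 hooks named in its YAML variants. A minimal NumPy reference; the function is illustrative, not the shader itself:

```python
import numpy as np

def softmax_ref(x, dim, log=False):
    m = np.max(x, axis=dim, keepdims=True)    # pass 1: max, for numerical stability
    e = np.exp(x - m)
    s = np.sum(e, axis=dim, keepdims=True)    # pass 2: denominators
    if log:                                   # log_softmax: op1(X) = X, op2(X, Y) = X - log(Y)
        return (x - m) - np.log(s)
    return e / s                              # softmax:     op1(X) = exp(X), op2(X, Y) = X / Y

x = np.random.randn(3, 5).astype(np.float32)
assert np.allclose(softmax_ref(x, dim=1).sum(axis=1), 1.0, atol=1e-6)
assert np.allclose(np.exp(softmax_ref(x, dim=1, log=True)), softmax_ref(x, dim=1), atol=1e-6)
```
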
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -$if STORAGE == "buffer": - ${layout_declare_ubo(2, "int", "numel")} -$else: - ${layout_declare_ubo(2, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "activations.h" - -#ifdef USING_BUFFER - -void main() { - const int i = int(gl_GlobalInvocationID.x); - if (i >= numel) { - return; - } - - float in_val = float(t_in[i]); - t_out[i] = T(tan(in_val)); -} - -#else - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - VEC4_T in_texel = texelFetch(t_in, pos, 0); - imageStore(t_out, pos, VEC4_T(tan(in_texel))); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/tan.yaml b/backends/vulkan/runtime/graph/ops/glsl/tan.yaml deleted file mode 100644 index ad0755bfad0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/tan.yaml +++ /dev/null @@ -1,13 +0,0 @@ -tan: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - STORAGE: - - VALUE: texture3d - - VALUE: buffer - shader_variants: - - NAME: tan diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl deleted file mode 100644 index 7605c59c72f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define UBO_PARAMS ${UBO_PARAMS} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} - -$if UBO_PARAMS: - $if OP_NAME == "slice": - ${layout_declare_ubo(B, "int", "start")} - ${layout_declare_ubo(B, "int", "step")} - - $if OP_NAME == "select": - ${layout_declare_ubo(B, "int", "index")} - -layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 out_strides; - ivec4 in_strides; - int out_numel; - int selected_dim; - $if not UBO_PARAMS: - $if OP_NAME == "slice": - int start; - int step; - - $if OP_NAME == "select": - int index; -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "${OP_NAME}.glslh" - -void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); - t_out[out_bufi] = t_in[in_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml deleted file mode 100644 index f68b2bd1250..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml +++ /dev/null @@ -1,21 +0,0 @@ -transfer_buffer: - parameter_names_with_default_values: - DTYPE: float - OP_NAME: select - UBO_PARAMS: False - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: select_buffer - OP_NAME: select - - NAME: slice_buffer - OP_NAME: slice - - NAME: select_ubo_buffer - OP_NAME: select - UBO_PARAMS: True - - NAME: slice_ubo_buffer - OP_NAME: slice - UBO_PARAMS: True diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl deleted file mode 100644 index 0f34713cb43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
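
transfer_buffer.glsl above walks out_bufi → out_tidx → in_tidx → in_bufi using the stride helpers from indexing_utils.h, which is not part of this diff. A hedged sketch of that round trip under two assumptions — that a buffer index is the dot product of the tensor index with the strides, and that dim_order lists dimensions from largest stride to smallest — both of which are stated here because the header itself is not shown:

```python
import numpy as np

def tidx_to_bufi(tidx, strides):
    return int(np.dot(tidx, strides))         # linear index = dot(tidx, strides)

def bufi_to_tidx(bufi, strides, dim_order):
    # Peel off dimensions from the largest stride inward (assumed dim_order meaning).
    tidx = [0, 0, 0, 0]
    for d in dim_order:
        tidx[d], bufi = divmod(bufi, strides[d])
    return tidx

strides = [1, 4, 12, 24]                      # contiguous WHCN strides for sizes (4, 3, 2, 1)
for bufi in range(24):
    assert tidx_to_bufi(bufi_to_tidx(bufi, strides, (3, 2, 1, 0)), strides) == bufi
```
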
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define UBO_PARAMS ${UBO_PARAMS} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("texture3d")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} - -$if UBO_PARAMS: - $if OP_NAME == "slice": - ${layout_declare_ubo(B, "int", "start")} - ${layout_declare_ubo(B, "int", "step")} - - $if OP_NAME == "select": - ${layout_declare_ubo(B, "int", "index")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - int selected_dim; - $if not UBO_PARAMS: - $if OP_NAME == "slice": - int start; - int step; - - $if OP_NAME == "select": - int index; -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "${OP_NAME}.glslh" - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); - - if (any(greaterThanEqual(out_tidx, out_sizes))) { - return; - } - - if (can_use_fast_path()) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; - - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); - - out_texel[texel_i] = selected_value; - - out_tidx[out_packed_dim]++; - } - - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml deleted file mode 100644 index 6922f120e49..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml +++ /dev/null @@ -1,21 +0,0 @@ -transfer_texture: - parameter_names_with_default_values: - DTYPE: float - OP_NAME: select - UBO_PARAMS: False - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: select_texture3d - OP_NAME: select - - NAME: slice_texture3d - OP_NAME: slice - - NAME: select_ubo_texture3d - OP_NAME: select - UBO_PARAMS: True - - NAME: slice_ubo_texture3d - OP_NAME: slice - UBO_PARAMS: True diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl deleted file mode 100644 index bb7ce482a7a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${buffer_scalar_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { -$if STORAGE == "buffer": - int numel; -$else: - ivec4 out_limits; -float minimum; -float maximum; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "activations.h" - -#ifdef USING_BUFFER - -void main() { - const int i = int(gl_GlobalInvocationID.x); - if (i >= numel) { - return; - } - - float in_val = float(t_in[i]); - t_out[i] = T(op(in_val, minimum, maximum)); -} - -#else - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits.xyz))) { - return; - } - - VEC4_T in_texel = texelFetch(t_in, pos, 0); - imageStore(t_out, pos, VEC4_T(op(in_texel, minimum, maximum))); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml deleted file mode 100644 index 47f538aee6c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ /dev/null @@ -1,48 +0,0 @@ -unary_op: - parameter_names_with_default_values: - OPERATOR: clamp(X, A, B) - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - STORAGE: - - VALUE: texture3d - - VALUE: buffer - shader_variants: - - NAME: abs - OPERATOR: abs(X) - - NAME: clamp - OPERATOR: clamp(X, A, B) - - NAME: clamp_int32 - OPERATOR: clamp(X, A, B) - DTYPE: int32 - - NAME: cos - OPERATOR: cos(X) - - NAME: exp - OPERATOR: exp(X) - - NAME: gelu - OPERATOR: 0.5 * X * (1 + tanh(sqrt(2 / 3.141593) * (X + 0.044715 * X * X * X))) - - NAME: neg - OPERATOR: -X - - NAME: sigmoid - OPERATOR: 1 / (1 + exp(-1 * X)) - - NAME: sin - OPERATOR: sin(X) - - NAME: sqrt - OPERATOR: sqrt(X) - - NAME: rsqrt - OPERATOR: (1 / sqrt(X)) - - NAME: tanh - OPERATOR: tanh(clamp(X, -15.0, 15.0)) - - NAME: hardshrink - OPERATOR: hardshrink(X, A, B) - - NAME: hardswish - OPERATOR: hardswish(X) - - NAME: hardsigmoid - OPERATOR: hardsigmoid(X) - - NAME: leaky_relu - OPERATOR: leaky_relu(X, A) - - NAME: round - OPERATOR: round(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.glsl deleted file mode 100644 index ba02da1c301..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.glsl +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "concat_offset", DTYPE, "buffer")} - -${layout_declare_ubo(B, "int", "concat_dim")} - -$for i in range(NUM_INPUTS): - ${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - // Only one thread needs to update the offset - if (gl_GlobalInvocationID.x != 0) { - return; - } - - // Sum up the sizes along the concat dimension for all input tensors - int total_size = 0; - $for i in range(NUM_INPUTS): - total_size += in${i+1}_sizes[concat_dim]; - - // Add to the current offset - concat_offset[0] += T(total_size); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.yaml deleted file mode 100644 index 35e8740e0a3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.yaml +++ /dev/null @@ -1,13 +0,0 @@ -update_concat_offset: - parameter_names_with_default_values: - DTYPE: float - NUM_INPUTS: 2 - generate_variant_forall: - DTYPE: - - VALUE: int32 - shader_variants: - - NAME: update_concat_offset_1 - NUM_INPUTS: 1 - - NAME: update_concat_offset_2 - - NAME: update_concat_offset_3 - NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.glsl deleted file mode 100644 index 85b63ad20ba..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.glsl +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec3", "in_limits")} -${layout_declare_ubo(B, "vec2", "recip_scales")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int align_corners = 0; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - ivec2 max_in_xy = in_limits.xy - 1; - vec2 scaled_xy; - - if (align_corners == 1) { - scaled_xy = pos.xy * recip_scales; - } else { - scaled_xy = (pos.xy + 0.5) * recip_scales - 0.5; - } - - $if MODE == "nearest": - const ivec2 ipos = clamp(ivec2(round(scaled_xy)), ivec2(0), max_in_xy); - VEC4_T out_tex = texelFetch(t_in, ivec3(ipos, pos.z), 0); - $elif MODE == "bilinear": - vec2 upper_xy = ceil(scaled_xy); - vec2 lower_xy = floor(scaled_xy); - - // Clamp coordinates to valid input range - upper_xy = clamp(upper_xy, ivec2(0), max_in_xy); - lower_xy = clamp(lower_xy, ivec2(0), max_in_xy); - - // Calculate interpolation weights - vec2 interp_weights = (scaled_xy - lower_xy); - - // Sample the four nearest texels - VEC4_T sample00 = texelFetch(t_in, ivec3(lower_xy.x, lower_xy.y, pos.z), 0); - VEC4_T sample10 = texelFetch(t_in, ivec3(upper_xy.x, lower_xy.y, pos.z), 0); - VEC4_T sample01 = texelFetch(t_in, ivec3(lower_xy.x, upper_xy.y, pos.z), 0); - VEC4_T sample11 = texelFetch(t_in, ivec3(upper_xy.x, upper_xy.y, pos.z), 0); - - // Perform bilinear interpolation - VEC4_T out_tex = mix( - mix(sample00, sample10, interp_weights.x), - mix(sample01, sample11, interp_weights.x), - interp_weights.y - ); - - imageStore(t_out, pos, out_tex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.yaml deleted file mode 100644 index 3bd1c282e13..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -upsample_2d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MODE: nearest - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: upsample_nearest2d - - NAME: upsample_bilinear2d - MODE: bilinear diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl deleted file mode 100644 index 30f283d6f01..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "out_buf", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "in_buf", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec4", "in_strides")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_strides")} - -layout(push_constant) uniform PushConstants { - int unbiased; -} pc; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int reduce_dim = 0; - -#define NWORKERS 4 -#define MAX_THREADS 16 - -shared T shared_sum[NWORKERS]; -shared T shared_sum_sq[NWORKERS]; -shared int shared_count[NWORKERS]; - -#include "indexing_utils.h" - -void main() { - const ivec4 out_idx = ivec4( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y, - gl_GlobalInvocationID.z % out_sizes.z, - gl_GlobalInvocationID.z / out_sizes.z); - - const uint tid = gl_LocalInvocationID[reduce_dim]; - - shared_sum[tid] = T(0); - shared_sum_sq[tid] = T(0); - shared_count[tid] = 0; - barrier(); - - const int R = in_sizes[reduce_dim]; - const uint N = gl_WorkGroupSize[reduce_dim]; - - // Each workgroup processes a contiguous chunk of the input tensor - // along the reduce_dim. Each thread processes a part of this chunk. - uint q = R / N; - uint rem = R % N; - - uint len = q + (tid < rem ? 1u : 0u); - uint base = tid * q + min(tid, rem); - - T sum = T(0); - T sum_sq = T(0); - int count = 0; - - ivec4 in_idx = out_idx; - for (uint off = 0u; off < len; ++off) { - uint i = base + off; - in_idx[reduce_dim] = int(i); - - // out_idx is a 4D index, so for tensors with reduce_dim == 2, - // we need to set the reduce_dim + 1 to 0 as gl_GlobalInvocationID.z - // is influenced by the tid - if (reduce_dim == 2) { - in_idx[reduce_dim + 1] -= int(tid); - } - - T v = in_buf[tidx_to_bufi(in_idx, in_strides)]; - - sum += v; - sum_sq += v * v; - count += 1; - } - - shared_sum[tid] = sum; - shared_sum_sq[tid] = sum_sq; - shared_count[tid] = count; - barrier(); - - if (tid == 0u) { - T tot_sum = T(0); - T tot_sum_sq = T(0); - int tot_count = 0; - - for (uint i = 0; i < N; ++i) { - tot_sum += shared_sum[i]; - tot_sum_sq += shared_sum_sq[i]; - tot_count += shared_count[i]; - } - - T var; - if (tot_count > 0) { - T mean = tot_sum / T(tot_count); - var = (tot_sum_sq / T(tot_count)) - (mean * mean); - if (pc.unbiased != 0 && tot_count > 1) { - var *= T(tot_count) / T(tot_count - 1); - } - } else{ - // NaN to match PyTorch behavior - var = T(0.0/0.0); - } - - out_buf[tidx_to_bufi(out_idx, out_strides)] = var; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml deleted file mode 100644 index 7cb783775c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -var_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl deleted file mode 100644 index faeac01fcd2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(push_constant) uniform PushConstants { - int unbiased; -} pc; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. 
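// [Editor's note, not part of the original shader] A minimal worked example of
// the indexing scheme described above, assuming a local work group size of
// {NWORKERS, 4, 1} = {4, 4, 1}: each of the 4 groups (tid.y = 0..3) has 4
// cooperating workers (tid.x = 0..3), and tid_to_smi() below maps thread
// (tid.x, tid.y) to shared slot tid.x + tid.y * 4, i.e. slots 0..15. This is
// why the shared arrays are sized by MAX_NTHREADS = 16 rather than by NWORKERS.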
-#define MAX_NTHREADS 16 - -shared VEC4_T shared_sum[MAX_NTHREADS]; -shared VEC4_T shared_sum_sq[MAX_NTHREADS]; -shared int shared_count[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) { - VEC4_T mean = sum / float(count); - VEC4_T variance = (sum_sq / float(count)) - (mean * mean); - - if ((pc.unbiased != 0) && (count > 1)) { - variance = variance * (float(count) / float(count - 1.0)); - } - - return variance; -} - -void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 1; - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - int group_i = tid.y * NWORKERS; - sum = shared_sum[group_i]; - sum_sq = shared_sum_sq[group_i]; - count = shared_count[group_i]; - - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - VEC4_T variance = calculate_variance(sum, sum_sq, count); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - variance[i] = 0; - } - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, variance); - } -} - -/* - * Compute reduction where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. - */ -void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); - // Only reduce up to the last "complete" texel. The last texel will need to be - // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... 
of - // the reduction row - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 4; - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const VEC4_T val = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; i++) { - sum.x += val[i]; - sum_sq.x += val[i] * val[i]; - count += 1; - } - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - sum = shared_sum[tid.y * NWORKERS]; - sum_sq = shared_sum_sq[tid.y * NWORKERS]; - count = shared_count[tid.y * NWORKERS]; - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Combine across the elements of the combined state - float total_sum = sum.x + sum.y + sum.z + sum.w; - float total_sum_sq = sum_sq.x + sum_sq.y + sum_sq.z + sum_sq.w; - int total_count = count; - - float mean = total_sum / float(total_count); - float variance = (total_sum_sq / float(total_count)) - (mean * mean); - - if ((pc.unbiased != 0) && (total_count > 1)) { - variance = variance * (float(total_count) / float(total_count - 1.0)); - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, VEC4_T(variance, 0, 0, 0)); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - reduce_nonpacked_dim(tid, scan_pos); - } else { - reduce_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml deleted file mode 100644 index 9cecbedca1a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -var_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl deleted file mode 100644 index 599879514e3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-
-layout(push_constant) uniform PRECISION restrict Block {
-  ivec4 out_sizes;
-  ivec4 in_sizes;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int in_packed_dim = C_DIM;
-
-layout(constant_id = 4) const int out_packed_dim = C_DIM;
-
-void main() {
-  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
-  ivec4 out_tensor_idx = to_tensor_idx(out_pos, out_sizes, out_packed_dim);
-
-  if (all(greaterThanEqual(out_tensor_idx, out_sizes))) {
-    return;
-  }
-
-  // Assume there is a virtual contiguous buffer in nchw format. From the output
-  // pos, we first calculate the index in the virtual buffer, and then calculate
-  // the input position from that index.
-  const ivec4 buf_indices = tidx_to_nchwi(out_tensor_idx, out_sizes, out_packed_dim);
-
-  VEC4_T value = VEC4_T(0);
-  // Need to look up the 4 values in the output texel separately.
-  for (int i = 0 ; i < 4; i++) {
-    if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) {
-      ivec4 user_coor = nchwi_to_tidx(buf_indices[i], in_sizes);
-      ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim);
-      VEC4_T intex = texelFetch(t_in, in_pos_elem.xyz, 0);
-      value[i] = intex[in_pos_elem.w];
-    }
-  }
-
-  imageStore(t_out, out_pos, value);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.yaml b/backends/vulkan/runtime/graph/ops/glsl/view.yaml
deleted file mode 100644
index 33364a25225..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/view.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-view:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-    STORAGE: texture3d
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-      - VALUE: int32
-  shader_variants:
-    - NAME: view
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl
deleted file mode 100644
index 2c02803a9b1..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl
+++ /dev/null
@@ -1,44 +0,0 @@
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define T ${buffer_scalar_type(DTYPE)}
-
-${define_required_extensions(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing.glslh"
-
-${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
-
-${layout_declare_ubo(B, "BufferMetadata", "outp")}
-${layout_declare_ubo(B, "BufferMetadata", "inp")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-/*
- * The insight behind the view operation is that the contiguous index of each
- * tensor element in the input and output tensors is the same.
- */
-void main() {
-  const uint outp_bufi = gl_GlobalInvocationID.x;
-  if (outp_bufi >= numel(outp)) {
-    return;
-  }
-
-  TensorIndex outp_tidx;
-  linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
-
-  // To map the output to the input, find the input element that has the same
-  // contiguous index as the output element.
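  // [Editor's note, illustrative example; not part of the original shader]
  // Assuming standard row-major (contiguous) ordering: viewing a 2x3 input as
  // a 3x2 output, the output element at index (row=1, col=0) has contiguous
  // index 1*2 + 0 = 2. The input element with contiguous index 2 sits at
  // (row=0, col=2) of the 2x3 layout, so that is the element copied into this
  // output position. The code below performs this round trip through the
  // contiguous index.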
- const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx); - - TensorIndex inp_tidx; - contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx); - - const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); - - t_outp[outp_bufi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml deleted file mode 100644 index ec92bf483c8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -view_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: view_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl deleted file mode 100644 index fe6304c0fa0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ /dev/null @@ -1,99 +0,0 @@ -// where.glsl - -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${buffer_scalar_type(DTYPE)} -#define COND_T ${buffer_scalar_type("bool")} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} -${define_required_extensions("bool")} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_condition", "bool", STORAGE)} -${layout_declare_tensor(B, "r", "t_self", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} - - -#include "indexing_utils.h" - -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "int", "out_numl")} - ${layout_declare_ubo(B, "ivec4", "out_strides")} - ${layout_declare_ubo(B, "ivec4", "cond_strides")} - ${layout_declare_ubo(B, "ivec4", "self_strides")} - ${layout_declare_ubo(B, "ivec4", "other_strides")} -$else: - ${layout_declare_ubo(B, "ivec3", "out_limits")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#ifdef USING_BUFFER - -void main() { - int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numl) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - - const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); - const int self_bufi = tidx_to_bufi(out_tidx, self_strides); - const int other_bufi = tidx_to_bufi(out_tidx, other_strides); - - COND_T cond = t_condition[cond_bufi] ; - T v_self = t_self[self_bufi]; - T v_other = t_other[other_bufi]; - - if (cond > 0) { - t_out[out_bufi] = v_self; - } else { - t_out[out_bufi] = v_other; - } -} - -#else // !USING_BUFFER - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 cond = load_texel(t_condition, pos); - VEC4_T selftex = 
load_texel(t_self, pos); - VEC4_T othertex = load_texel(t_other, pos); - - VEC4_T outtex; - - for (int idx = 0; idx < 4; ++idx) { - if (cond[idx] == 1) { - outtex[idx] = selftex[idx]; - } else { - outtex[idx] = othertex[idx]; - } - } - write_texel(t_out, pos, outtex); -} - #endif // !USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.yaml b/backends/vulkan/runtime/graph/ops/glsl/where.yaml deleted file mode 100644 index edbd843a336..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/where.yaml +++ /dev/null @@ -1,12 +0,0 @@ -where: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - STORAGE: - - VALUE: texture3d - - VALUE: buffer - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: where diff --git a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp b/backends/vulkan/runtime/graph/ops/impl/Arange.cpp deleted file mode 100644 index 3171fbeb488..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -#include - -namespace vkcompute { - -void resize_arange_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - - int start_val = 0; - int step_val = 1; - if (!graph->val_is_none(extra_args.at(0))) { - start_val = graph->extract_scalar(extra_args.at(0)); - } - const int end_val = graph->extract_scalar(extra_args.at(1)); - if (!graph->val_is_none(extra_args.at(2))) { - step_val = graph->extract_scalar(extra_args.at(2)); - } - - const std::vector out_sizes = { - utils::div_up(end_val - start_val, step_val)}; - - graph->virtual_resize(out, out_sizes); -} - -void check_arange_input( - ComputeGraph& graph, - const ValueRef start, - const ValueRef end, - const ValueRef step) { - if (!graph.val_is_none(start) && !graph.val_is_int(end)) { - VK_THROW("arange: start must be int!"); - } - if (!graph.val_is_none(end) && !graph.val_is_int(end)) { - VK_THROW("arange: end must be int!"); - } - if (!graph.val_is_none(step) && !graph.val_is_int(end)) { - VK_THROW("arange: step must be int!"); - } -} - -void add_arange_node( - ComputeGraph& graph, - const ValueRef start, - const ValueRef end, - const ValueRef step, - const ValueRef out) { - float start_val = 0.0f; - float step_val = 1.0f; - - if (graph.val_is_none(end)) { - VK_THROW("arange: end must be specified!"); - } - - if (!graph.val_is_none(start)) { - if (graph.val_is_int(start)) { - start_val = static_cast(graph.extract_scalar(start)); - } else { - start_val = graph.extract_scalar(start); - } - } - if (!graph.val_is_none(step)) { - if (graph.val_is_int(step)) { - step_val = static_cast(graph.extract_scalar(step)); - } else { - step_val = graph.extract_scalar(step); - } - } - - std::string kernel_name("arange"); - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}}, - // Shader params buffers - {graph.sizes_ubo(out), - graph.create_params_buffer(start_val), - graph.create_params_buffer(step_val)}, - // Push Constants - {}, - // 
Specialization Constants - {}, - // Resize Args - {start, end, step}, - // Resizing Logic - resize_arange_node)); -} - -void arange(ComputeGraph& graph, const std::vector& args) { - return add_arange_node(graph, args[0], args[1], args[2], args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.arange.start_step, arange); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp deleted file mode 100644 index 757afd06849..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include -#include - -#include - -namespace vkcompute { - -ValueRef check_and_prepack_arg( - ComputeGraph& graph, - ValueRef arg_ref, - const utils::StorageType stype, - int64_t num_channels, - const std::string& debug_name) { - VK_CHECK_COND( - graph.val_is_tref(arg_ref), - "native_batch_norm requires ", - debug_name, - " to be a constant tensorref"); - VK_CHECK_COND(graph.get_tref(arg_ref)->sizes[0] == num_channels); - - // batch_norm's param are broadcasted on the channel dimension. - // In this implementation, we pack the weights along the x dimension, and - // in the shader, we lookup using the along the x. - return prepack_standard(graph, arg_ref, stype, utils::kWidthPacked); -} - -void add_native_batch_norm_node( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef weight_ref, - ValueRef bias_ref, - ValueRef mean_ref, - ValueRef var_ref, - ValueRef eps_ref, - ValueRef out_tuple_ref) { - const std::vector in_sizes = graph.sizes_of(in_ref); - const std::vector out_sizes = graph.sizes_of(in_ref); - - VK_CHECK_COND(in_sizes.size() == 4, "BatchNorm only support 4d tensor"); - VK_CHECK_COND(out_sizes.size() == 4, "BatchNorm only support 4d tensor"); - - // Only the first element of the return value is propagated. The remaining 2 - // elements are zero-size dummy tensor. 
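  // [Editor's note, for reference; not part of the original source] The node
  // assembled below implements inference-mode batch norm, which for each
  // channel c computes
  //   out[n][c][h][w] = (in[n][c][h][w] - mean[c]) / sqrt(var[c] + eps) * weight[c] + bias[c]
  // using the prepacked per-channel weight/bias/mean/var tensors and the eps
  // value extracted from eps_ref.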
- const ValueRef out_ref = graph.get_value_list(out_tuple_ref)->at(0); - - const utils::StorageType stype = graph.storage_type_of(out_ref); - - const int64_t num_channels = dim_at(in_sizes); - - const ValueRef arg_weight = - check_and_prepack_arg(graph, weight_ref, stype, num_channels, "weight"); - const ValueRef arg_bias = - check_and_prepack_arg(graph, bias_ref, stype, num_channels, "bias"); - const ValueRef arg_mean = - check_and_prepack_arg(graph, mean_ref, stype, num_channels, "mean"); - const ValueRef arg_var = - check_and_prepack_arg(graph, var_ref, stype, num_channels, "var"); - const float epsilon = graph.extract_scalar(eps_ref); - - VK_CHECK_COND(!graph.val_is_tref(out_ref), "Output should not be tref"); - - const std::vector out_tensor_sizes = graph.sizes_of(out_ref); - VK_CHECK_COND( - dim_at(out_tensor_sizes) == num_channels, - "out channel must match in channel"); - - std::string kernel_name = "batchnorm"; - add_dtype_suffix(kernel_name, graph.dtype_of(out_ref)); - - const int32_t num_texel_per_batch = - utils::div_up_4((dim_at(in_sizes))); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out_ref, vkapi::kWrite}, - {{in_ref, arg_weight, arg_bias, arg_mean, arg_var}, vkapi::kRead}}, - {graph.logical_limits_ubo(out_ref), - graph.create_params_buffer(epsilon), - graph.create_params_buffer(num_texel_per_batch)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void native_batch_norm(ComputeGraph& graph, const std::vector& args) { - // args[5] is momentum. It is not used in the calculation. - return add_native_batch_norm_node( - graph, args[0], args[1], args[2], args[3], args[4], args[6], args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - aten._native_batch_norm_legit_no_training.default, native_batch_norm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp deleted file mode 100644 index 025b483eab7..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_binary_op_args( - ComputeGraph& graph, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(self) == graph.packed_dim_of(other)); - VK_CHECK_COND(graph.packed_dim_of(self) == graph.packed_dim_of(out)); - - const std::vector self_sizes = graph.sizes_of(self); - const std::vector other_sizes = graph.sizes_of(other); - const std::vector out_sizes = graph.sizes_of(out); - - std::vector broadcasted_sizes = - calculate_broadcasted_output_size(self_sizes, other_sizes); - VK_CHECK_COND(out_sizes == broadcasted_sizes); -} - -void resize_binary_op_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - // TODO(T183442143): Verify tensors are broadcastable. 
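  // [Editor's note, illustrative; not part of the original source] Assuming
  // calculate_broadcasted_output_size follows standard PyTorch/NumPy
  // broadcasting rules (align trailing dimensions; a dimension of size 1
  // stretches to match), e.g. self sizes {2, 1, 4} and other sizes {3, 1}
  // broadcast to an output size of {2, 3, 4}.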
- const ValueRef self = args.at(1).refs.at(0); - const ValueRef other = args.at(1).refs.at(1); - - const std::vector self_sizes = graph->sizes_of(self); - const std::vector other_sizes = graph->sizes_of(other); - const std::vector new_out_sizes = - calculate_broadcasted_output_size(self_sizes, other_sizes); - - graph->virtual_resize(out, new_out_sizes); -} - -void add_binary_op_texture_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef alpha, - const ValueRef out, - const std::string& op_name) { - ValueRef arg1 = prepack_standard_like(graph, in1, out, true); - ValueRef arg2 = prepack_standard_like(graph, in2, out, true); - - check_binary_op_args(graph, arg1, arg2, out); - - float alpha_val = 1.0f; - // String is checked since floor_div passes in an unused string argument in - // place of alpha - if (is_valid(alpha) && !graph.val_is_string(alpha)) { - alpha_val = graph.extract_scalar(alpha); - } - - const struct BinaryOpsParams { - const utils::ivec2 broadcast_params; - const float alpha_val; - } binary_ops_params{create_broadcast_params(graph, arg1, arg2), alpha_val}; - - std::string kernel_name("binary_"); - kernel_name.reserve(kShaderNameReserve); - kernel_name += op_name; - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(in1)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{arg1, arg2}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {{graph.sizes_pc_of(out), - graph.sizes_pc_of(arg1), - graph.sizes_pc_of(arg2), - PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(arg1), - graph.hashed_layout_of(arg2)}, - // Resize Args - {}, - // Resizing Logic - resize_binary_op_node)); -} - -void add_binary_op_buffer_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef alpha, - const ValueRef out, - const std::string& op_name) { - // check_binary_op_args(*t_in1, *t_in2, *t_out); - - float alpha_val = 1.0f; - // String is checked since floor_div passes in an unused string argument in - // place of alpha - if (is_valid(alpha) && !graph.val_is_string(alpha)) { - alpha_val = graph.extract_scalar(alpha); - } - - std::string kernel_name("binary_"); - kernel_name.reserve(kShaderNameReserve); - kernel_name += op_name; - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - add_dtype_suffix(kernel_name, graph.dtype_of(in1)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{in1, in2}, vkapi::kRead}}, - // Shader params buffers - {graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in1), - graph.buffer_meta_ubo(in2)}, - // Push Constants - {{ - PushConstantDataInfo(&alpha_val, sizeof(float)), - }}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in1), - graph.hashed_layout_of(in2)}, - // Resize Args - {}, - // Resizing Logic - resize_binary_op_node)); -} - -void add_binary_op_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef alpha, - const ValueRef out, - const std::string& op_name) { - if 
(graph.is_buffer_storage(out)) { - add_binary_op_buffer_node(graph, in1, in2, alpha, out, op_name); - } else { - add_binary_op_texture_node(graph, in1, in2, alpha, out, op_name); - } -} - -#define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_binary_op_node( \ - graph, args[0], args[1], args[2], args[3], #op_name); \ - } - -#define DEFINE_BINARY_OP_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_binary_op_node( \ - graph, args[0], args[1], kDummyValueRef, args[2], #op_name); \ - } - -DEFINE_BINARY_OP_WITH_ALPHA_FN(add); -DEFINE_BINARY_OP_WITH_ALPHA_FN(sub); - -// Floor div does not have an alpha, but a string argument (which is unused) is -// passed in at the same location as the alpha argument in other op. -DEFINE_BINARY_OP_WITH_ALPHA_FN(floor_divide); - -DEFINE_BINARY_OP_FN(mul); -DEFINE_BINARY_OP_FN(div); -DEFINE_BINARY_OP_FN(pow); -DEFINE_BINARY_OP_FN(minimum); -DEFINE_BINARY_OP_FN(eq); -DEFINE_BINARY_OP_FN(lt); -DEFINE_BINARY_OP_FN(le); -DEFINE_BINARY_OP_FN(gt); -DEFINE_BINARY_OP_FN(ge); - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.add.Tensor, add); - VK_REGISTER_OP(aten.sub.Tensor, sub); - VK_REGISTER_OP(aten.mul.Tensor, mul); - VK_REGISTER_OP(aten.div.Tensor, div); - VK_REGISTER_OP(aten.div.Tensor_mode, floor_divide); - VK_REGISTER_OP(aten.pow.Tensor_Tensor, pow); - VK_REGISTER_OP(aten.minimum.default, minimum); - VK_REGISTER_OP(aten.eq.Tensor, eq); - VK_REGISTER_OP(aten.lt.Tensor, lt); - VK_REGISTER_OP(aten.le.Tensor, le); - VK_REGISTER_OP(aten.gt.Tensor, gt); - VK_REGISTER_OP(aten.ge.Tensor, ge); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp deleted file mode 100644 index 0d0be08bb38..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp +++ /dev/null @@ -1,815 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include - -namespace vkcompute { - -void resize_choose_qparams_per_row( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - ValueRef input_scales = args.at(0).refs.at(0); - ValueRef input_zeros = args.at(0).refs.at(1); - ValueRef input = args.at(1).refs.at(0); - - std::vector new_sizes = graph->sizes_of(input_scales); - const size_t ndim = new_sizes.size(); - - const int64_t input_height = graph->size_at(-2, input); - new_sizes.at(ndim - 1) = input_height; - - graph->virtual_resize(input_scales, new_sizes); - graph->virtual_resize(input_zeros, new_sizes); -} - -utils::uvec3 choose_qparams_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - // For per-tensor quantization, we want a single workgroup that can handle - // all elements with proper reduction. The shader uses NWORKERS=64 threads. 
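  // [Editor's note; not part of the original source] Together with the
  // {64u, 1u, 1u} local size returned by choose_qparams_pick_local_wg_size
  // below, the {1u, 1u, 1u} global size used for buffer storage should result
  // in a single 64-thread work group being dispatched (assuming the dispatcher
  // rounds the global extent up to whole work groups), with each thread
  // striding over the tensor and the partial results combined by the shader's
  // shared-memory reduction.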
- const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - // For buffer storage, use a single workgroup in X dimension - // The shader will handle strided access across all elements - return {1u, 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_global_wg_size(args.at(0).refs.at(0)); - } -} - -utils::uvec3 choose_qparams_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - // For buffer storage, use 64 threads in X dimension to match NWORKERS - // This ensures the shared memory arrays are properly sized - return {64u, 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_local_wg_size(global_workgroup_size); - } -} - -utils::uvec3 choose_qparams_per_token_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - // For per-token quantization, we need one workgroup per token - // Calculate number of tokens (product of all dimensions except the last - // one) - const auto input_sizes = graph->sizes_of(input); - int64_t num_tokens = 1; - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - return {static_cast(num_tokens), 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_global_wg_size(args.at(0).refs.at(0)); - } -} - -utils::uvec3 choose_qparams_per_token_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - return {1u, 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_local_wg_size(global_workgroup_size); - } -} - -utils::uvec3 choose_qparams_block_wise_pick_global_wg_size( - ComputeGraph* g, - const vkapi::ShaderInfo&, - const std::vector& a, - const std::vector& r) { - const ValueRef input = a.at(2).refs.at(0); - const auto blkRef = r.at(0); - const auto inSz = g->sizes_of(input); - const auto blkList = g->get_int_list(blkRef); - - // Use same code as in add_choose_qparams_block_wise_node - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*blkList); - utils::ivec4 tensor_size_whcn = utils::make_whcn_ivec4(inSz); - - // Calculate numBlocks: ceil(tensorSize / blockSize) (both in WHCN order) - utils::ivec4 nBlk = { - (tensor_size_whcn[0] + block_size_vec[0] - 1) / block_size_vec[0], - (tensor_size_whcn[1] + block_size_vec[1] - 1) / block_size_vec[1], - (tensor_size_whcn[2] + block_size_vec[2] - 1) / block_size_vec[2], - (tensor_size_whcn[3] + block_size_vec[3] - 1) / block_size_vec[3]}; - - uint32_t nBlocks = nBlk[0] * nBlk[1] * nBlk[2] * nBlk[3]; - - // For texture storage, use more threads to better utilize GPU parallelism - // Each thread can process multiple blocks with stride - if (g->is_buffer_storage(input)) { - return {nBlocks, 1u, 1u}; - } else { - // For texture storage, use more workgroups to better utilize GPU - 
// Aim for ~64-256 threads per workgroup for good occupancy - uint32_t preferred_threads_per_wg = 64; - uint32_t num_workgroups = - (nBlocks + preferred_threads_per_wg - 1) / preferred_threads_per_wg; - num_workgroups = std::max(1u, std::min(num_workgroups, nBlocks)); - return {num_workgroups * preferred_threads_per_wg, 1u, 1u}; - } -} - -utils::uvec3 choose_qparams_block_wise_pick_local_wg_size( - ComputeGraph* g, - const vkapi::ShaderInfo&, - const utils::uvec3& global_wg_size, - const std::vector& a, - const std::vector&) { - const ValueRef input = a.at(2).refs.at(0); - - if (g->is_buffer_storage(input)) { - return {1u, 1u, 1u}; - } else { - // For texture storage, use 64 threads per workgroup for better occupancy - uint32_t local_size = std::min(64u, global_wg_size[0]); - return {local_size, 1u, 1u}; - } -} - -vkapi::ShaderInfo pick_choose_qparams_per_row_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - // number of output channels - const int64_t width = graph->size_at(-1, input); - const int64_t height = graph->size_at(-2, input); - - std::string kernel_name = "choose_qparams_per_row"; - if (width > 256 || height == 1) { - kernel_name += "_o1w64"; - } else { - kernel_name += "_o4w16"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(input)); - add_dtype_suffix(kernel_name, graph->dtype_of(input)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 pick_choose_qparams_per_row_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - const uint32_t height = graph->size_at(-2, input); - return {1u, height, 1u}; -} - -utils::uvec3 pick_choose_qparams_per_row_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)global_workgroup_size; - (void)args; - (void)resize_args; - - uint32_t outputs_per_wg = 1u; - uint32_t workers_per_output = 64u; - - if (shader.kernel_name.find("o4w16") != std::string::npos) { - outputs_per_wg = 4u; - workers_per_output = 16u; - } - - return {workers_per_output, outputs_per_wg, 1u}; -} - -void add_choose_qparams_tensor_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& eps, - const ValueRef& scale_out, - const ValueRef& zero_point_out) { - std::string kernel_name("choose_qparams_tensor"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale_out)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point_out)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(zero_point_out)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - 
graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - float eps_val = static_cast(graph.get_double(eps)); - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zero_point_out), - graph.strides_ubo(zero_point_out)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(scale_out), - graph.logical_limits_ubo(zero_point_out)}; - } - - push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - PushConstantDataInfo(&eps_val, sizeof(float)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - choose_qparams_pick_global_wg_size, - choose_qparams_pick_local_wg_size, - // Inputs and Outputs - {{scale_out, vkapi::kWrite}, - {zero_point_out, vkapi::kWrite}, - {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_choose_qparams_per_token_asymmetric_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale_out, - const ValueRef& zero_point_out) { - std::string kernel_name("choose_qparams_per_token_asymmetric"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale_out)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point_out)); - - // Calculate number of tokens (product of all dimensions except the last one) - int64_t num_tokens = 1; - const auto input_sizes = graph.sizes_of(input); - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - int num_tokens_val = static_cast(num_tokens); - int quant_min_val = -128; // Fixed for asymmetric quantization - int quant_max_val = 127; // Fixed for asymmetric quantization - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zero_point_out), - graph.strides_ubo(zero_point_out)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(scale_out), - graph.logical_limits_ubo(zero_point_out)}; - } - - push_constants = { - PushConstantDataInfo(&num_tokens_val, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - choose_qparams_per_token_pick_global_wg_size, - choose_qparams_per_token_pick_local_wg_size, - // Inputs and Outputs - {{scale_out, vkapi::kWrite}, - {zero_point_out, vkapi::kWrite}, - {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_choose_qparams_per_row_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& quant_min, - const ValueRef& quant_max, - 
const ValueRef& input_scales, - const ValueRef& input_zps) { - int32_t quant_min_val = -128; - int32_t quant_max_val = 127; - - // Int8 range by default - if (graph.val_is_none(quant_min)) { - quant_min_val = -128; - } else { - quant_min_val = graph.extract_scalar(quant_min); - } - - // Int8 range by default - if (graph.val_is_none(quant_min)) { - quant_max_val = 127; - } else { - quant_max_val = graph.extract_scalar(quant_max); - } - - vkapi::ParamsBindList param_ubos = { - graph.sizes_ubo(input), - }; - std::vector push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int32_t)), - PushConstantDataInfo(&quant_max_val, sizeof(int32_t)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_choose_qparams_per_row_shader, - pick_choose_qparams_per_row_global_wg_size, - pick_choose_qparams_per_row_local_wg_size, - // Inputs and Outputs - {{{input_scales, input_zps}, vkapi::kWrite}, {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_choose_qparams_per_row)); -} - -void add_choose_qparams_block_wise_node( - ComputeGraph& graph, - ValueRef input, - ValueRef block_size, - int mapping_type, // 0 / 1 / 2 - ValueRef quant_min, - ValueRef quant_max, - ValueRef eps, - ValueRef scale_out, - ValueRef zp_out) { - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - - // For shader compatibility, we still need to convert to WHCN order - // but the output shape calculation is now handled correctly in resize - // function - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*block_size_list); - utils::ivec4 tensor_size_whcn = utils::make_whcn_ivec4(input_sizes); - - // Calculate numBlocks: ceil(tensorSize / blockSize) (both in WHCN order) - utils::ivec4 num_blocks_vec = { - (tensor_size_whcn[0] + block_size_vec[0] - 1) / block_size_vec[0], - (tensor_size_whcn[1] + block_size_vec[1] - 1) / block_size_vec[1], - (tensor_size_whcn[2] + block_size_vec[2] - 1) / block_size_vec[2], - (tensor_size_whcn[3] + block_size_vec[3] - 1) / block_size_vec[3]}; - - // Calculate blockStride: pre-computed linear strides for the block grid - utils::ivec4 block_stride_vec = { - 1, - num_blocks_vec[0], - num_blocks_vec[0] * num_blocks_vec[1], - num_blocks_vec[0] * num_blocks_vec[1] * num_blocks_vec[2]}; - - // Handle optional quant_min and quant_max parameters - int qmin, qmax; - if (graph.val_is_none(quant_min) || graph.val_is_none(quant_max)) { - // Use default values based on target_dtype (similar to - // _get_and_check_qmin_qmax) For now, assume int8 range as default - this - // should match the Python implementation - qmin = -128; - qmax = 127; - } else { - qmin = static_cast(graph.get_int(quant_min)); - qmax = static_cast(graph.get_int(quant_max)); - } - - float eps_val; - if (graph.val_is_none(eps)) { - // Use default eps value (similar to Python implementation) - eps_val = 1.192092896e-07f; // torch.finfo(torch.float32).eps - } else { - eps_val = static_cast(graph.get_double(eps)); - } - - // Create push constants vector - std::vector push_constants = { - PushConstantDataInfo(&block_size_vec, sizeof(block_size_vec)), - PushConstantDataInfo(&num_blocks_vec, sizeof(num_blocks_vec)), - PushConstantDataInfo(&block_stride_vec, sizeof(block_stride_vec)), - PushConstantDataInfo(&mapping_type, sizeof(int)), - PushConstantDataInfo(&qmin, sizeof(int)), - PushConstantDataInfo(&qmax, 
sizeof(int)), - PushConstantDataInfo(&eps_val, sizeof(float))}; - - std::string kernel_name("choose_qparams_block_wise"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale_out)); - add_dtype_suffix(kernel_name, graph.dtype_of(zp_out)); - - vkapi::ParamsBindList param_ubos; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zp_out), - graph.strides_ubo(zp_out)}; - } else { - // For texture input, the shader uses buffer storage for outputs - // so we need buffer UBOs for the output tensors - param_ubos = { - graph.logical_limits_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zp_out), - graph.strides_ubo(zp_out)}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - choose_qparams_block_wise_pick_global_wg_size, - choose_qparams_block_wise_pick_local_wg_size, - // Inputs and Outputs - {{scale_out, vkapi::kWrite}, - {zp_out, vkapi::kWrite}, - {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {block_size}, - // Resizing Logic - nullptr)); -} - -void choose_qparams_tensor_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef eps = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef out_tuple_ref = args[arg_idx++]; - - ValueRef scale_out = kDummyValueRef; - ValueRef zero_point_out = kDummyValueRef; - - { - const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - scale_out = out_tuple->at(0); - zero_point_out = out_tuple->at(1); - } - - // Void the unused dtype parameter to match ATen signature - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale_out)); - VK_CHECK_COND(graph.val_is_tensor(zero_point_out)); - - // Verify input is a floating point type - VK_CHECK_COND(graph.dtype_of(input) == vkapi::kFloat); - - // Get scale and zero point output dtypes - vkapi::ScalarType scale_out_dtype = graph.dtype_of(scale_out); - vkapi::ScalarType zero_point_out_dtype = graph.dtype_of(zero_point_out); - - // Verify supported output types for scale (fp32 only for now) - VK_CHECK_COND(scale_out_dtype == vkapi::kFloat); - - // Verify supported output types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_out_dtype == vkapi::kInt || - zero_point_out_dtype == vkapi::kChar || - zero_point_out_dtype == vkapi::kFloat); - - // Check that texture storage is width packed - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); - } - - add_choose_qparams_tensor_node( - graph, input, quant_min, quant_max, eps, scale_out, zero_point_out); -} - -void choose_qparams_per_token_asymmetric_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef out_tuple_ref = args[arg_idx++]; - - ValueRef scale_out = kDummyValueRef; - ValueRef zero_point_out = kDummyValueRef; - - { - const ValueListPtr 
out_tuple = graph.get_value_list(out_tuple_ref); - scale_out = out_tuple->at(0); - zero_point_out = out_tuple->at(1); - } - - // Void the unused parameter to match ATen signature - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale_out)); - VK_CHECK_COND(graph.val_is_tensor(zero_point_out)); - - // Verify input is a floating point type - VK_CHECK_COND(graph.dtype_of(input) == vkapi::kFloat); - - // Get scale and zero point output dtypes - vkapi::ScalarType scale_out_dtype = graph.dtype_of(scale_out); - vkapi::ScalarType zero_point_out_dtype = graph.dtype_of(zero_point_out); - - // Verify supported output types for scale (fp32 only for now) - VK_CHECK_COND(scale_out_dtype == vkapi::kFloat); - - // Verify supported output types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_out_dtype == vkapi::kInt || - zero_point_out_dtype == vkapi::kChar || - zero_point_out_dtype == vkapi::kFloat); - - // Check that texture storage is width packed - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); - } - - add_choose_qparams_per_token_asymmetric_node( - graph, input, scale_out, zero_point_out); -} - -bool can_use_choose_qparams_per_row( - ComputeGraph& graph, - const ValueRef input, - const ValueRef block_size, - const ValueRef input_zero_point) { - if (!graph.is_vectorizable_contiguous_2d_matrix(input)) { - return false; - } - - std::vector input_sizes = graph.sizes_of(input); - const IntListPtr block_size_vals = graph.get_int_list(block_size); - const size_t ndim = block_size_vals->size(); - - // Check for per y - dim quantization - if (utils::val_at(-1, input_sizes) != utils::val_at(-1, *block_size_vals)) { - return false; - } - - for (int d = 0; d < ndim - 1; ++d) { - if (block_size_vals->at(d) != 1) { - return false; - } - } - return true; -} - -void choose_qparams_affine_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef mapping_type = args[arg_idx++]; - const ValueRef block_size = args[arg_idx++]; - const ValueRef target_dtype = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef eps = args[arg_idx++]; - const ValueRef scale_dtype = args[arg_idx++]; - const ValueRef zero_point_dtype = args[arg_idx++]; - const ValueRef out_tuple_ref = args[arg_idx++]; - - // Suppress unused variable warnings - (void)target_dtype; - (void)scale_dtype; - (void)zero_point_dtype; - - ValueRef scale_out = kDummyValueRef; - ValueRef zero_point_out = kDummyValueRef; - - { - const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - scale_out = out_tuple->at(0); - zero_point_out = out_tuple->at(1); - } - - // Use fast path if certain conditions are met - if (can_use_choose_qparams_per_row( - graph, input, block_size, zero_point_out)) { - return add_choose_qparams_per_row_node( - graph, input, quant_min, quant_max, scale_out, zero_point_out); - } - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale_out)); - VK_CHECK_COND(graph.val_is_tensor(zero_point_out)); - - // Verify input is a floating point type - VK_CHECK_COND(graph.dtype_of(input) == vkapi::kFloat); - - // Get scale and zero point dtypes from arguments - vkapi::ScalarType scale_out_dtype = graph.dtype_of(scale_out); - vkapi::ScalarType zero_point_out_dtype = graph.dtype_of(zero_point_out); - - // 
Verify supported output types for scale (fp32 only for now) - VK_CHECK_COND(scale_out_dtype == vkapi::kFloat); - - // Verify supported output types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_out_dtype == vkapi::kInt || - zero_point_out_dtype == vkapi::kChar || - zero_point_out_dtype == vkapi::kFloat); - - // Check that texture storage is width packed - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); - } - - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - VK_CHECK_COND(block_size_list->size() == input_sizes.size()); - - std::string mapping_type_str = graph.get_string(mapping_type); - int mapping_type_val = 0; // Default to ASYMMETRIC - - if (mapping_type_str == "ASYMMETRIC" || mapping_type_str.empty()) { - mapping_type_val = 0; // ASYMMETRIC - } else if (mapping_type_str == "SYMMETRIC") { - mapping_type_val = 1; - } else if (mapping_type_str == "SYMMETRIC_NO_CLIPPING_ERR") { - mapping_type_val = 2; - } else { - VK_THROW("Unsupported mapping_type: ", mapping_type_str); - } - - add_choose_qparams_block_wise_node( - graph, - input, - block_size, - mapping_type_val, - quant_min, - quant_max, - eps, - scale_out, - zero_point_out); -} - -void choose_qparams_per_row( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef input_scales = args[arg_idx++]; - const ValueRef input_zps = args[arg_idx++]; - - // ValueRef scale_out = kDummyValueRef; - // ValueRef zero_point_out = kDummyValueRef; - // - // { - // const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - // scale_out = out_tuple->at(0); - // zero_point_out = out_tuple->at(1); - // } - // - - add_choose_qparams_per_row_node( - graph, input, quant_min, quant_max, input_scales, input_zps); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - quantized_decomposed.choose_qparams.tensor, choose_qparams_tensor_impl); - VK_REGISTER_OP( - quantized_decomposed.choose_qparams_per_token_asymmetric.default, - choose_qparams_per_token_asymmetric_impl); - - // Register the per-channel quantization operator - VK_REGISTER_OP(etvk.choose_qparams_per_row.default, choose_qparams_per_row); - - // TorchAO affine choose_qparams operators - VK_REGISTER_OP( - torchao.choose_qparams_affine.default, choose_qparams_affine_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp deleted file mode 100644 index 0ae9d53a481..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -void resize_clone_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - // TODO: support for when dimensionality doesn't match, i.e. clone is used to - // implement squeeze. 
- if (graph->dim_of(out) == graph->dim_of(in)) { - graph->virtual_resize(out, graph->sizes_of(in)); - } -} - -void add_clone_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - std::string kernel_name = "clone"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter Buffers - {graph.logical_limits_ubo(out)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_clone_node)); -} - -utils::uvec3 clone_image_to_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef image = args.at(1).refs.at(0); - return graph->create_global_wg_size(image); -} - -void add_image_to_buffer_node( - ComputeGraph& graph, - const ValueRef image, - const ValueRef buffer) { - std::string kernel_name = "clone_image_to_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - clone_image_to_buffer_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{buffer, vkapi::kWrite}, {image, vkapi::kRead}}, - // Parameter Buffers - {}, - // Push Constants - {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)}, - // Specialization Constants - {graph.hashed_layout_of(image)}, - // Resize Args - {}, - // Resizing Logic - resize_clone_node)); -} - -void add_buffer_to_image_node( - ComputeGraph& graph, - const ValueRef buffer, - const ValueRef image) { - std::string kernel_name = "clone_buffer_to_image"; - add_dtype_suffix(kernel_name, graph.dtype_of(image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - default_pick_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{image, vkapi::kWrite}, {buffer, vkapi::kRead}}, - // Parameter Buffers - {}, - // Push Constants - {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)}, - // Specialization Constants - {graph.hashed_layout_of(image)}, - // Resize Args - {}, - // Resizing Logic - resize_clone_node)); -} - -void clone(ComputeGraph& graph, const std::vector& args) { - const ValueRef src = args[0]; - const ValueRef dst = args[2]; - - const utils::StorageType src_storage = graph.storage_type_of(src); - const utils::StorageType dst_storage = graph.storage_type_of(dst); - if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) { - if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) { - return add_clone_node(graph, src, dst); - } else { - return add_view_node(graph, src, kDummyValueRef, dst); - } - } - if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) { - return add_image_to_buffer_node(graph, src, dst); - } - if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) { - return add_buffer_to_image_node(graph, src, dst); - } - - std::vector extra_args = {}; - // Buffer to buffer copy - return add_view_copy_buffer_node( - graph, src, dst, extra_args, resize_clone_node); -} - -// Clone node is not the most efficient implementation for the aten.clone -// operation. 
A more efficient implementation can be achieved during vulkan -// export with the use of shared object. This clone node is introduced to enable -// a "copy" mechanism if there is no alternative (e.g. during direct -// ComputeGraph manipulation, we need to make a copy of a Tensor). - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.clone.default, clone); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.h b/backends/vulkan/runtime/graph/ops/impl/Clone.h deleted file mode 100644 index 8efaa259a24..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -void add_clone_node(ComputeGraph& graph, const ValueRef in, const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp deleted file mode 100644 index 6c701224f7f..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -utils::uvec3 default_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return graph->create_global_wg_size(out); -} - -utils::uvec3 default_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)resize_args; - return graph->create_local_wg_size(global_workgroup_size); -} - -utils::uvec3 pick_hw_square_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)args; - (void)resize_args; - // Some inactive invocations are okay; set 6 as the threshold to use the - // a square wg size. - if (global_workgroup_size[0u] >= 6 && global_workgroup_size[1u] >= 6) { - return {8u, 8u, 1u}; - } - // If width dim is sufficiently small, then bias towards height dim to reduce - // the number of inactive invocations. - if (global_workgroup_size[0u] < 6u) { - return {4u, 16u, 1u}; - } - return {16u, 4u, 1u}; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h deleted file mode 100644 index 1831ab2a845..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Common.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/** - * Creates a global workgroup size based on the first output tensor in the args. 
- * This is a utility function that extracts the output tensor from - * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it. - */ -utils::uvec3 default_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args); - -/** - * Creates a local workgroup size based on the first output tensor in the args. - * This is a utility function that extracts the output tensor from - * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it. - */ -utils::uvec3 default_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args); - -/** - * Constructs a local work group size with the shape {W, H, 1}. The function - * will try to set W == H == sqrt(num_invocations), where num_invocations is - * typically 64. This configuration is good for ops like matrix multiplication - * as it reduces the total volume of unique data that the entire work group - * will need to read from input tensors in order to produce the output data. - * To compute an output tile of {W, H, 1}, the work group will need to read - * H unique rows = H * K unique elements from the input tensor and W unique cols - * = W * K elements from the weight tensor, resulting in (W + H) * K unique - * elements in total. - */ -utils::uvec3 pick_hw_square_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp deleted file mode 100644 index 0a4acb6cef3..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include -#include -#include - -namespace vkcompute { - -std::vector get_concat_sizes( - ComputeGraph& graph, - ValueRef all_input_refs, - const int64_t concat_dim) { - ValueListPtr in_value_refs = graph.get_value_list(all_input_refs); - // Get the sizes of the first input tensor as a starting point - std::vector new_out_sizes = graph.sizes_of(in_value_refs->at(0)); - - // Sum up the sizes along the concatenation dimension - for (size_t i = 1; i < in_value_refs->size(); ++i) { - const std::vector in_sizes = graph.sizes_of(in_value_refs->at(i)); - new_out_sizes.at(concat_dim) += in_sizes.at(concat_dim); - } - - return new_out_sizes; -} - -void resize_concat_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef all_inputs = extra_args.at(0); - - int64_t concat_dim = graph->extract_scalar(extra_args.at(1)); - - // Normalize concat_dim if negative - const int64_t ndim = graph->dim_of(out); - if (concat_dim < 0) { - concat_dim += ndim; - } - - // Calculate the new sizes - std::vector new_out_sizes = - get_concat_sizes(*graph, all_inputs, concat_dim); - - // Resize the output tensor - graph->virtual_resize(out, new_out_sizes); -} - -utils::uvec3 concat_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& extra_args) { - (void)shader; - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const std::vector inputs_in_batch = args.at(1).refs; - - int64_t concat_dim = graph->extract_scalar(extra_args.at(1)); - - // Normalize concat_dim if negative - const int64_t ndim = graph->dim_of(out); - if (concat_dim < 0) { - concat_dim += ndim; - } - - // The concat shader concatenates N input tensors at a time to the output - // tensor. Since the shader may need to be invoked multiple times to finish - // concatenation when the number of input tensors is >N, the global workgroup - // is based on the volume of input data being concatenated in this batch, - // as opposed to the overall size of the output tensor. Conceptually, the - // global work group size represents which elements of the output tensor will - // be written to during this dispatch. - - uint32_t total_input_numel = 0; - int64_t concat_dim_numel = 0; - for (const ValueRef input : inputs_in_batch) { - total_input_numel += graph->numel_of(input); - concat_dim_numel += graph->size_at(concat_dim, input); - } - - if (graph->is_buffer_storage(out)) { - return {total_input_numel, 1, 1}; - } - - // The texture implementation is similar, except each invocation writes out 4 - // output elements along the packed dim (i.e. one texel). In this case, the - // global work group size represents the number of output texels that will be - // written to in this batch, rather than the number of output elements. Note - // that to update an element of the output, the entire texel that contains it - // will need to be loaded, updated, then written back. - - std::vector inp_volume_sizes = graph->sizes_of(out); - inp_volume_sizes.at(concat_dim) = concat_dim_numel; - - // Calculate what the image extents would be of a tensor with the input - // volume's sizes. This produces the number of texels that would need to be - // written to. 
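// Worked example (assumed numbers): with 10 input elements along the packed
// dim and a concat offset that is not 4-aligned, the dispatch covers roughly
// div_up_4(10) + 1 = 3 + 1 = 4 texels along that axis, since the first and
// last texels may be shared with data written by a previous batch.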
- const int32_t packed_dim = graph->packed_dim_of(out); - std::vector inp_volume_texel_sizes = - api::calculate_padded_sizes(inp_volume_sizes, packed_dim); - // If the concat_dim is the same as the packed dim, and the concat_offset for - // this input batch is not a multiple of 4, then the data from an input texel - // may be split up between two output texels. For example: - // I0 , I1 , I2 , I2 - // O0 , O1 , O2 , X | X , X , X , X - // Therefore, 1 texel is added to the packed dim to account for this. - inp_volume_texel_sizes.at(3 - packed_dim) = - utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1; - - const uint32_t inp_volume_texel_numel = - utils::multiply_integers(inp_volume_texel_sizes); - - return {inp_volume_texel_numel, 1, 1}; - - // The texture implementation is similar, expect each thread is responsible - // for writing out an entire output texel. Therefore, the overall global work - // group size will be the concatenation of the texture extents of the input - // tensors in this batch. - - // One complication is when the previous concatenation batch does not write - // up to a texel boundary. An example is if the previous concatenation batch - // only wrote 7 elements along the concatenation dim. The first input element - // would then have to be inserted at the last element of the final texel - // written by the previous batch. To account for this, initialize the - // workgroup size at the concatenation dim to 1 (need to read N total texels - // along the concat dim for input tensors + up to 1 texel from the output - // tensor). - - // The axis along which to concatenate the input texture extents - int64_t extent_concat_axis = nchw_dim_to_whcn_dim(concat_dim, ndim); - // For batch concatenation, the concat axis is the batch-concatenation axis - if (concat_dim == 4) { - extent_concat_axis = graph->concat_dim_of(out); - } - - utils::uvec3 global_workgroup_size = graph->create_global_wg_size(out); - global_workgroup_size[concat_dim] = 0; - for (const ValueRef input : inputs_in_batch) { - utils::uvec3 texture_extents = graph->logical_limits_of(input); - global_workgroup_size[extent_concat_axis] += texture_extents[concat_dim]; - } - - return global_workgroup_size; -} - -void add_concat_node( - ComputeGraph& graph, - const ValueRef tensors_ref, - const ValueRef dim_ref, - const ValueRef out) { - std::vector in_value_refs; - - { - const ValueListPtr tensors = graph.get_value_list(tensors_ref); - - for (const ValueRef in : *tensors) { - in_value_refs.push_back(in); - } - } - - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t ndim = graph.dim_of(in_value_refs.at(0)); - int64_t normalized_dim = dim; - if (normalized_dim < 0) { - normalized_dim += ndim; - } - - const int64_t dim_whcn = nchw_dim_to_whcn_dim(normalized_dim, ndim); - const ValueRef dim_whcn_ref = graph.get_or_add_value_for_int(dim_whcn); - - // Create a temporary tensor to hold the concat offset - TmpTensor concat_offset( - &graph, {1}, vkapi::kInt, utils::kBuffer, utils::kWidthPacked); - - // Add node to set concat_offset to 0 - { - std::string kernel_name = "set_zero"; - add_dtype_suffix(kernel_name, graph.dtype_of(concat_offset)); - - vkapi::ParamsBindList param_buffers = {graph.numel_ubo(concat_offset)}; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - {1, 1, 1}, - {1, 1, 1}, - // Inputs and Outputs - {{concat_offset, vkapi::kWrite}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - 
{}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } - - // Process inputs in batches of up to 3 tensors - const size_t batch_size = 3; - for (size_t batch_start = 0; batch_start < in_value_refs.size(); - batch_start += batch_size) { - const size_t batch_end = - std::min(batch_start + batch_size, in_value_refs.size()); - const size_t current_batch_size = batch_end - batch_start; - - std::vector batch_inputs; - for (size_t i = batch_start; i < batch_end; ++i) { - batch_inputs.push_back(in_value_refs.at(i)); - } - - // Add concat node for this batch - { - vkapi::ParamsBindList param_buffers = { - graph.get_or_create_int_param_buffer(dim_whcn_ref, 0)}; - - std::vector push_constants; - vkapi::SpecVarList spec_vars; - - if (graph.is_buffer_storage(out)) { - param_buffers.append(graph.sizes_ubo(out)); - param_buffers.append(graph.strides_ubo(out)); - - for (const ValueRef in_ref : batch_inputs) { - param_buffers.append(graph.sizes_ubo(in_ref)); - param_buffers.append(graph.strides_ubo(in_ref)); - } - - param_buffers.append(graph.numel_ubo(out)); - - spec_vars = {graph.hashed_layout_of(out)}; - } else { - push_constants = {graph.sizes_pc_of(out)}; - - spec_vars = {graph.hashed_layout_of(out)}; - - for (const ValueRef in_ref : batch_inputs) { - push_constants.push_back(graph.sizes_pc_of(in_ref)); - spec_vars.append(graph.hashed_layout_of(in_ref)); - } - } - - std::string kernel_name = "concat"; - if (current_batch_size == 1) { - kernel_name += "_1"; - } else if (current_batch_size == 2) { - kernel_name += "_2"; - } else if (current_batch_size == 3) { - kernel_name += "_3"; - } - if (graph.is_buffer_storage(out)) { - kernel_name += "_buffer"; - } else { - kernel_name += "_texture3d"; - } - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - DispatchNode::ResizeFunction resize_fn = nullptr; - if (batch_start == 0) { - resize_fn = resize_concat_node; - } - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - concat_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kReadWrite}, - {batch_inputs, vkapi::kRead}, - {concat_offset, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {tensors_ref, dim_ref}, - // Resizing Logic - resize_fn)); - } - - // Add node to update concat_offset (except for the last batch) - if (batch_end < in_value_refs.size()) { - vkapi::ParamsBindList param_buffers = { - graph.get_or_create_int_param_buffer(dim_whcn_ref, 0)}; - - for (const ValueRef in_ref : batch_inputs) { - param_buffers.append(graph.sizes_ubo(in_ref)); - } - - std::string kernel_name = "update_concat_offset"; - if (current_batch_size == 1) { - kernel_name += "_1"; - } else if (current_batch_size == 2) { - kernel_name += "_2"; - } else if (current_batch_size == 3) { - kernel_name += "_3"; - } - add_dtype_suffix(kernel_name, graph.dtype_of(concat_offset)); - - vkapi::SpecVarList spec_vars = {}; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - {1u, 1u, 1u}, - {1u, 1u, 1u}, - // Inputs and Outputs - {{concat_offset, vkapi::kWrite}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } - } -} - -void cat_tensor(ComputeGraph& graph, const std::vector& args) { - // Extract arguments - const ValueRef tensors_ref = args.at(0); - const 
ValueRef dim_ref = args.at(1); - const ValueRef out = args.at(2); - - // Add concat node - add_concat_node(graph, tensors_ref, dim_ref, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.cat.default, cat_tensor); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp deleted file mode 100644 index ded1defe973..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ /dev/null @@ -1,787 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include - -#include -#include - -#include - -namespace vkcompute { - -enum class Conv2dMethod : uint8_t { - Depthwise, - Pointwise, - SlidingWindow, - Transposed, -}; - -void resize_conv2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - size_t ndim = graph->dim_of(self); - std::vector new_out_sizes(ndim); - const bool transposed = graph->get_bool(extra_args.at(4)); - - std::vector self_sizes = graph->sizes_of(self); - // Batch, Channel - if (ndim == 4) { - new_out_sizes.at(ndim - 4) = self_sizes.at(ndim - 4); - } - - TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); - const auto& weight_sizes = weight_ref->sizes; - new_out_sizes.at(ndim - 3) = - transposed ? weight_sizes.at(ndim - 3) : weight_sizes.at(ndim - 4); - - // Height, Width - const auto& new_out_sizes_hw = calc_out_sizes_hw( - *graph, - self_sizes, - extra_args.at(0), - /*kernel_size_only = */ false, - {extra_args.at(1), extra_args.at(2), extra_args.at(3), extra_args.at(5)}, - transposed); - new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0); - new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1); - - graph->virtual_resize(out, new_out_sizes); -} - -void resize_conv1d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); - - const int64_t stride_size = graph->get_int_list(extra_args.at(1))->at(0); - const int64_t padding_size = graph->get_int_list(extra_args.at(2))->at(0); - const int64_t dilation_size = graph->get_int_list(extra_args.at(3))->at(0); - - const std::vector& weight_sizes = weight_ref->sizes; - - const std::vector in_sizes = graph->sizes_of(self); - const size_t ndim = in_sizes.size(); - std::vector new_out_sizes(ndim); - - const int64_t kernel_size = weight_sizes.at(2); - const int64_t in_length = in_sizes.at(2); - - new_out_sizes.at(0) = in_sizes.at(0); - new_out_sizes.at(1) = weight_sizes.at(0); - new_out_sizes.at(2) = calc_out_size( - in_length, kernel_size, stride_size, padding_size, dilation_size, false); - - graph->virtual_resize(out, new_out_sizes); -} - -ValueRef prepack_biases( - ComputeGraph& graph, - const ValueRef vref, - const ValueRef weight, - const bool transposed, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout) { - auto sizes = graph.sizes_of(weight); - const int64_t out_channels = transposed ? 
sizes.at(1) : sizes.at(0); - - ValueRef v = graph.add_tensor( - {out_channels}, graph.dtype_of(weight), storage_type, memory_layout); - - vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(graph, v); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(v), - graph.create_local_wg_size(v), - vref, - v, - {}, - // Specialization constants - {graph.hashed_layout_of(v)}, - {graph.sizes_pc_of(v)})); - - return v; -} - -vkapi::ShaderInfo get_conv2d_shader( - ComputeGraph& graph, - const ValueRef out, - const bool prepack_weights, - const Conv2dMethod method, - const ValueRef weight, - const bool clamp_out = false, - const bool stride_equals_dilation = false, - const bool stride_1_padding_0 = false) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - switch (method) { - case Conv2dMethod::Depthwise: - kernel_name = "conv2d_dw"; - if (!prepack_weights) { - if (!stride_equals_dilation) { - kernel_name += "_sned"; - } - const auto& weight_sizes = graph.get_tref(weight)->sizes; - if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { - kernel_name += "_output_tile_3x3"; - } - if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { - kernel_name += "_output_tile_5x5"; - } - } - break; - case Conv2dMethod::Pointwise: - if (prepack_weights) { - kernel_name = "conv2d"; - } else { - kernel_name = stride_1_padding_0 ? "conv2d_pw_s1p0" : "conv2d_pw"; - } - break; - case Conv2dMethod::SlidingWindow: - kernel_name = "conv2d"; - break; - case Conv2dMethod::Transposed: - kernel_name = "conv_transpose2d"; - break; - } - if (prepack_weights) { - kernel_name += "_prepack_weights"; - } else if (clamp_out) { - kernel_name += "_clamp"; - } - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -std::vector get_final_sizes( - const std::vector& original_sizes, - const Conv2dMethod method) { - int64_t batch_padded = utils::align_up_4(utils::val_at(-4, original_sizes)); - int64_t channels_padded = - utils::align_up_4(utils::val_at(-3, original_sizes)); - int64_t height = utils::val_at(-2, original_sizes); - int64_t width = utils::val_at(-1, original_sizes); - - switch (method) { - case Conv2dMethod::Depthwise: - return std::vector{4, batch_padded / 4, height * width}; - case Conv2dMethod::Pointwise: - case Conv2dMethod::SlidingWindow: - return std::vector{ - 4, batch_padded * height / 4, channels_padded * width}; - case Conv2dMethod::Transposed: - return std::vector{ - 4, channels_padded * height / 4, batch_padded * width}; - } -} - -ValueRef prepack_weights( - ComputeGraph& graph, - const ValueRef vref, - const Conv2dMethod method) { - const auto original_sizes = graph.sizes_of(vref); - const auto final_sizes = get_final_sizes(original_sizes, method); - - ValueRef v = graph.add_tensor( - final_sizes, - graph.dtype_of(vref), - utils::kTexture2D, - utils::kChannelsPacked); - - vkapi::ShaderInfo shader = - get_conv2d_shader(graph, v, /*prepack_weights = */ true, method, vref); - - const auto original_sizes_pc = - utils::make_ivec4(original_sizes, /*reverse = */ true); - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(v), - graph.create_local_wg_size(v), - vref, - v, - {}, - // Specialization constants - {graph.packed_dim_of(v)}, - {graph.sizes_pc_of(v), - PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))})); - - return v; -} - -void check_conv_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - 
VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -struct Conv2dParams final { - utils::ivec2 overlay_region; - int in_group_size; -}; - -struct OutputParams final { - float out_min; - float out_max; -}; - -Conv2dParams create_conv2d_params( - ComputeGraph& graph, - const ValueRef weight, - const Kernel2dParams& p, - const bool transposed) { - const auto& overlay_region = utils::make_ivec2({ - p.kernel_size[0] + (p.kernel_size[0] - 1) * (p.dilation[0] - 1), - p.kernel_size[1] + (p.kernel_size[1] - 1) * (p.dilation[1] - 1), - }); - const auto weight_sizes = graph.sizes_of(weight); - const int32_t in_group_size = utils::safe_downcast( - utils::align_up_4(transposed ? weight_sizes.at(0) : weight_sizes.at(1))); - return {overlay_region, in_group_size}; -} - -void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { - if (transposed) { - if (p.dilation[0] > 1 || p.dilation[1] > 1) { - VK_THROW( - "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); - } - } -} - -Conv2dMethod get_conv2d_method( - ComputeGraph& graph, - const ValueRef weight, - const int64_t groups, - const bool transposed) { - const auto weight_sizes = graph.sizes_of(weight); - if (!transposed && weight_sizes.at(0) == groups && weight_sizes.at(1) == 1) { - return Conv2dMethod::Depthwise; - } - if (transposed) { - return Conv2dMethod::Transposed; - } - if (weight_sizes.at(2) == 1 && weight_sizes.at(3) == 1) { - return Conv2dMethod::Pointwise; - } - return Conv2dMethod::SlidingWindow; -} - -utils::uvec2 get_conv2d_dw_dispatch_divisor( - const std::vector& weight_sizes) { - if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { - return {4u, 2u}; - } - if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { - return {4u, 2u}; - } - return {4u, 2u}; -} - -utils::uvec3 create_conv2d_global_wg_size( - ComputeGraph& graph, - const Conv2dMethod method, - const ValueRef out, - const ValueRef weight_data, - const bool stride_equals_dilation) { - if (method == Conv2dMethod::Pointwise) { - const utils::uvec3 image_extents = graph.logical_limits_of(out); - return { - utils::div_up(image_extents[0u], 1u), - utils::div_up(image_extents[1u], 4u), - image_extents[2u]}; - } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { - const utils::uvec3 image_extents = graph.create_global_wg_size(out); - const utils::uvec2 div = - get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes); - return { - utils::div_up(image_extents[0], div[0]), - utils::div_up(image_extents[1], div[1]), - image_extents[2]}; - } else { - return graph.create_global_wg_size(out); - } -} - -// Custom global workgroup size function for conv2d -utils::uvec3 conv2d_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef weight_data = resize_args.at(0); - - // Determine method from shader name - Conv2dMethod method; - if (shader.kernel_name.find("conv2d_dw") != std::string::npos) { - method = Conv2dMethod::Depthwise; - } else if ( - shader.kernel_name.find("conv2d_pw") != std::string::npos || - (shader.kernel_name.find("conv2d") != std::string::npos && - shader.kernel_name.find("conv_transpose2d") == std::string::npos)) { - // Check if it's pointwise by examining weight sizes - const auto& weight_sizes = graph->get_tref(weight_data)->sizes; - if (weight_sizes.at(2) == 1 && 
weight_sizes.at(3) == 1) { - method = Conv2dMethod::Pointwise; - } else { - method = Conv2dMethod::SlidingWindow; - } - } else if (shader.kernel_name.find("conv_transpose2d") != std::string::npos) { - method = Conv2dMethod::Transposed; - } else { - method = Conv2dMethod::SlidingWindow; - } - - // Determine stride_equals_dilation from shader name - bool stride_equals_dilation = - shader.kernel_name.find("_sned") == std::string::npos; - - utils::uvec3 wg_size = create_conv2d_global_wg_size( - *graph, method, out, weight_data, stride_equals_dilation); - - if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } - - return wg_size; -} - -// Custom local workgroup size function for conv2d -utils::uvec3 conv2d_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - - // Determine method from shader name - Conv2dMethod method; - if (shader.kernel_name.find("conv2d_dw") != std::string::npos) { - method = Conv2dMethod::Depthwise; - } else if ( - shader.kernel_name.find("conv2d_pw") != std::string::npos || - (shader.kernel_name.find("conv2d") != std::string::npos && - shader.kernel_name.find("conv_transpose2d") == std::string::npos)) { - method = Conv2dMethod::Pointwise; - } else { - method = Conv2dMethod::SlidingWindow; - } - - if (method == Conv2dMethod::Pointwise) { - uint32_t local_wg_size_y = 1; - if (global_workgroup_size[1] % 8 == 0) { - local_wg_size_y = 8; - } else if (global_workgroup_size[1] % 4 == 0) { - local_wg_size_y = 4; - } else if (global_workgroup_size[1] % 2 == 0) { - local_wg_size_y = 2; - } - return {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - return {64, 1, 1}; - } else { - return graph->create_local_wg_size(global_workgroup_size); - } -} - -// Custom global workgroup size function for conv1d -utils::uvec3 conv1d_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - return {// out length - graph->size_at(-1, out), - // out channels - static_cast(graph->size_at(-2, out)), - // out batches - utils::div_up_4(graph->size_at(-3, out))}; -} - -void add_conv2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef weight_data, - const ValueRef bias, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef transposed, - const ValueRef output_padding, - const ValueRef groups, - const ValueRef out_min, - const ValueRef out_max, - const ValueRef out, - const bool clamp_out) { - const bool transposed_val = graph.get_bool(transposed); - - float out_min_val = 0.0f; - float out_max_val = 0.0f; - if (out_min != kDummyValueRef) { - out_min_val = graph.extract_scalar(out_min); - } - if (out_max != kDummyValueRef) { - out_max_val = graph.extract_scalar(out_max); - } - - const int64_t groups_val = graph.get_int(groups); - - const Conv2dMethod method = - get_conv2d_method(graph, weight_data, groups_val, transposed_val); - - ValueRef arg_weight = prepack_weights(graph, weight_data, method); - ValueRef arg_bias = prepack_biases( - graph, - bias, - weight_data, - transposed_val, - /* storage_type = */ utils::kTexture2D, - /* memory_layout = */ utils::kWidthPacked); - - const std::vector in_sizes = 
graph.sizes_of(in); - if (in_sizes.at(0) > 1) { - VK_THROW("conv2d: input batch size > 1 is not supported yet!"); - } - - check_conv_args(graph, in, out); - - Kernel2dParams kernel_params = create_kernel2d_params( - graph, - weight_data, - /*kernel_size_only = */ false, - stride, - padding, - dilation); - Conv2dParams extra_params = - create_conv2d_params(graph, weight_data, kernel_params, transposed_val); - - const bool stride_equals_dilation = - (kernel_params.stride[0] == kernel_params.dilation[0] && - kernel_params.stride[1] == kernel_params.dilation[1]); - - const bool stride_1_padding_0 = - (kernel_params.stride[0] == 1 && kernel_params.stride[1] == 1 && - kernel_params.padding[0] == 0 && kernel_params.padding[1] == 0); - - OutputParams out_params = {out_min_val, out_max_val}; - - check_conv2d_params(kernel_params, transposed_val); - - vkapi::ShaderInfo shader = get_conv2d_shader( - graph, - out, - /*prepack_weights = */ false, - method, - weight_data, - clamp_out, - stride_equals_dilation, - stride_1_padding_0); - - utils::uvec3 wg_size = create_conv2d_global_wg_size( - graph, method, out, weight_data, stride_equals_dilation); - - utils::uvec3 local_wg_size; - if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } - - if (method == Conv2dMethod::Pointwise) { - uint32_t local_wg_size_y = 1; - if (wg_size[1] % 8 == 0) { - local_wg_size_y = 8; - } else if (wg_size[1] % 4 == 0) { - local_wg_size_y = 4; - } else if (wg_size[1] % 2 == 0) { - local_wg_size_y = 2; - } - local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - local_wg_size = {64, 1, 1}; - } else { - local_wg_size = graph.create_local_wg_size(wg_size); - } - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - if (method == Conv2dMethod::Pointwise) { - const utils::ivec4 kernel_param_stride_pad = { - kernel_params.stride[0], - kernel_params.stride[1], - kernel_params.padding[0], - kernel_params.padding[1], - }; - - struct Conv2dPWParams final { - int in_group_size; - int dummy_padding; - OutputParams out_params; - } param{extra_params.in_group_size, 0, out_params}; - - push_constants = { - graph.logical_limits_pc_of(out), - PushConstantDataInfo( - &kernel_param_stride_pad, sizeof(kernel_param_stride_pad)), - PushConstantDataInfo(¶m, sizeof(param)), - }; - } else if (method == Conv2dMethod::Depthwise) { - const utils::ivec4 kernel_param_size_stride = { - kernel_params.kernel_size[0], - kernel_params.kernel_size[1], - kernel_params.stride[0], - kernel_params.stride[1]}; - - const utils::ivec4 kernel_param_pad_dial = { - kernel_params.padding[0], - kernel_params.padding[1], - kernel_params.dilation[0], - kernel_params.dilation[1]}; - - push_constants = { - graph.logical_limits_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo( - &kernel_param_size_stride, sizeof(kernel_param_size_stride)), - PushConstantDataInfo( - &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)), - PushConstantDataInfo( - &extra_params, sizeof(extra_params), sizeof(utils::ivec4)), - PushConstantDataInfo(&out_params, sizeof(out_params)), - }; - } else { - param_buffers = { - graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(extra_params), - graph.create_params_buffer(out_params), - }; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - conv2d_global_wg_size, - conv2d_local_wg_size, 
- // Inputs and Outputs - {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {utils::safe_downcast(groups_val)}, - // Resize Args - {weight_data, stride, padding, dilation, transposed, output_padding}, - // Resizing Logic - resize_conv2d_node)); -} - -void add_conv1d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef weight, - const ValueRef bias, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef out_min, - const ValueRef out_max, - const ValueRef out, - const bool clamp_out) { - ValueRef arg_weight = prepack_standard( - graph, - weight, - graph.storage_type_of(out), - utils::kChannelsPacked, - /* passthrough = */ false, - utils::kOptimizedAxisMap); - ValueRef arg_bias = prepack_biases( - graph, - bias, - weight, - /*transposed = */ false, - /*storage_type = */ utils::kTexture3D, - /*memory_layout = */ utils::kWidthPacked); - - float out_min_val = 0.0f; - float out_max_val = 0.0f; - if (out_min != kDummyValueRef) { - out_min_val = graph.extract_scalar(out_min); - } - if (out_max != kDummyValueRef) { - out_max_val = graph.extract_scalar(out_max); - } - - const int64_t groups_val = graph.get_int(groups); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector weight_sizes = graph.sizes_of(arg_weight); - const std::vector out_sizes = graph.sizes_of(out); - - check_conv_args(graph, in, out); - - const int32_t in_channels = in_sizes.at(1); - const int32_t out_channels = weight_sizes.at(0); - const int32_t kernel_size = weight_sizes.at(2); - const int32_t stride_size = graph.get_int_list(stride)->at(0); - const int32_t padding_size = graph.get_int_list(padding)->at(0); - const int32_t dilation_size = graph.get_int_list(dilation)->at(0); - const int32_t in_group_size = static_cast(in_channels / groups_val); - const int32_t out_group_size = - static_cast(out_channels / groups_val); - - Kernel1dParams kernel_params = { - kernel_size, - stride_size, - padding_size, - dilation_size, - in_group_size, - out_group_size}; - - const OutputParams out_params = {out_min_val, out_max_val}; - - std::string kernel_name("conv1d"); - if (clamp_out) { - kernel_name += "_clamp"; - } - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - conv1d_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, - // Shader params buffers - { - graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(out_params), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - graph.hashed_layout_of(arg_weight), - graph.hashed_layout_of(arg_bias)}, - // Resize Args - {weight, stride, padding, dilation}, - // Resizing Logic - resize_conv1d_node)); -} - -void conv(ComputeGraph& graph, const std::vector& args) { - int64_t in_ndim = graph.dim_of(args[0]); - if (in_ndim == 4) { - if (args.size() == 10) { - // ordinary conv2d - return add_conv2d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7], - args[8], - /*out_min = */ kDummyValueRef, - /*out_max = */ kDummyValueRef, - args[9], - false); 
- } else { - // conv2d with clamp - return add_conv2d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7], - args[8], - args[9], - args[10], - args[11], - true); - } - } else { - if (args.size() == 10) { - // ordinary conv1d - return add_conv1d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[8], - /*out_min = */ kDummyValueRef, - /*out_max = */ kDummyValueRef, - args[9], - false); - } else { - // conv1d with clamp - return add_conv1d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[8], - args[9], - args[10], - args[11], - true); - } - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.convolution.default, conv); - VK_REGISTER_OP(conv_with_clamp.default, conv); - VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp deleted file mode 100644 index bd648dbae2d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec3; -using utils::ivec4; -using utils::uvec3; - -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl) { - std::string kernel_name = "copy_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - (calc_out_pos_using_src_chnl ? 1 - : calc_in_pos_using_dst_chnl ? 2 - : 0)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out) { - // Check the packed dimension is same for both tensors, also check if the - // packed dimension is Width or Height. Since the function does not support - // channel packing. 
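// Illustration (hypothetical layouts): copying between two width-packed or
// height-packed textures is handled by this function, whereas channels-packed
// tensors are expected to go through add_copy_channel_offset_node below.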
- VK_CHECK_COND( - graph.packed_dim_of(in) == graph.packed_dim_of(out) && - (graph.packed_dim_of(in) == WHCN::kWidthDim || - graph.packed_dim_of(in) == WHCN::kHeightDim)); - - std::string kernel_name = "copy_packed_dim_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - // A copy of range with the last element set to batch size of the input tensor - ivec4 final_range = { - range[0], range[1], range[2], dim_at(in_sizes, kBatch4D)}; - ivec3 global_wg_size = graph.logical_limits_of(out); - - const auto packed_dim = graph.packed_dim_of(in); - // The starting offset in a texel where this tensor will start copying from - const auto src_lane_offset = src_offset[packed_dim] & 0x3; - // The starting offset in a texel where this tensor will start copying to - const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; - - // The total packed texels this tensor will be copied from - // The first texel of tensor data in packed dimension will be copied from - // remaining lanes from current source Hence (4 - src_lane_offset) is added - // to tensor size in packed dimension - const auto src_packed_size = utils::div_up_4( - (4 - src_lane_offset) + utils::val_at(-packed_dim, out_sizes)); - - // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to - // remaining lanes from previous write Hence (4 - dst_lane_offset) is added - // to tensor size in packed dimension - const auto dst_packed_size = utils::div_up_4( - (4 - dst_lane_offset) + utils::val_at(-packed_dim, in_sizes)); - - // If the starting src offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_src_work = - src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; - // If the starting dst offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_dst_work = - dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; - - if (has_additional_src_work || has_additional_dst_work) { - global_wg_size[packed_dim]++; // Increase the global work group size in - // packed dimension - final_range[packed_dim]++; // Increase the range in packed dimension - } - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo( - &final_range, sizeof(final_range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out) { - // Likely need to prepad these numbers. 
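The lane-offset logic above (the & 0x3 masks, div_up_4, and the has_additional_*_work checks) decides whether the copy spills into one extra texel along the packed dimension. The following standalone sketch models that bookkeeping for one side of the copy; the values in main are invented.

```cpp
#include <cstdint>
#include <iostream>

// div_up_4: number of 4-wide texels needed to hold n elements.
inline int64_t div_up_4(int64_t n) { return (n + 3) / 4; }

// Models the lane-offset bookkeeping in add_copy_packed_dim_offset_node: a
// copy that starts at `offset` elements along the packed dimension begins
// partway through a texel, so the first texel only contributes
// (4 - lane_offset) usable lanes.
struct PackedCopyExtent {
  int64_t lane_offset;     // offset within the first texel (0..3)
  int64_t packed_texels;   // texels touched once the partial first texel counts
  bool needs_extra_slice;  // true if the copy spills past `range_texels`
};

PackedCopyExtent packed_copy_extent(int64_t offset, int64_t num_elements,
                                    int64_t range_texels) {
  PackedCopyExtent e;
  e.lane_offset = offset & 0x3;
  e.packed_texels = div_up_4((4 - e.lane_offset) + num_elements);
  e.needs_extra_slice =
      e.lane_offset != 0 && e.packed_texels > range_texels;
  return e;
}

int main() {
  // Copying 12 elements starting at element 3 of a width-packed tensor.
  PackedCopyExtent e = packed_copy_extent(/*offset=*/3, /*num_elements=*/12,
                                          /*range_texels=*/3);
  std::cout << e.lane_offset << " " << e.packed_texels << " "
            << e.needs_extra_slice << "\n";  // 3 4 1
  return 0;
}
```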
- const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); - - // NOTE: This function should be able to support 1d and 2d tensors when - // range=1, src_offset=dst_offset=1. - VK_CHECK_COND(graph.dim_of(in) >= 3, "Src dim should be at least 3"); - VK_CHECK_COND(graph.dim_of(out) >= 3, "Dst dim should be at least 3"); - - VK_CHECK_COND( - dim_at(in_sizes) >= src_channel_offset + channel_range, - "Src channel (", - src_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(in_sizes), - ")"); - - VK_CHECK_COND( - dim_at(out_sizes) >= dst_channel_offset + channel_range, - "Dst channel (", - dst_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(out_sizes), - ")"); - - VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); - VK_CHECK_COND( - src_channel_offset >= 0, "Src channel offset must be non-negative"); - VK_CHECK_COND( - dst_channel_offset >= 0, "Dst channel offset must be non-negative"); - - std::string kernel_name = "copy_channel_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int32_t out_channels = dim_at(out_sizes); - - // Copy one batch at a time. - for (int batch_idx = 0; batch_idx < dim_at(in_sizes); batch_idx++) { - // Mapping the tensor NCHW coordinates into texture XYZ coordinates - int32_t dst_first_z = dst_channel_offset / 4; - int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; - - // We copy the entire width and height dimension. For the channel dimension, - // we use the z-dimension of the global_size to specify the texture range. - // The shader combines the global invocation id and the dst_offset to get - // the actual coordinate. 
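The channel-offset copy above folds NCHW channels into 4-wide texels, so a channel slice maps to a contiguous z-range of the texture and each batch is shifted by div_up_4(out_channels) slices. A minimal model of that mapping, with invented example sizes:

```cpp
#include <cstdint>
#include <iostream>

inline int64_t div_up_4(int64_t n) { return (n + 3) / 4; }

// Models how add_copy_channel_offset_node maps an NCHW channel slice onto the
// z-extent of a channel-packed texture: channels are folded into texels of 4,
// and each batch occupies div_up_4(C) consecutive z-slices.
struct ZRange {
  int64_t first_z;   // first texture z-slice touched within the batch
  int64_t extent_z;  // number of z-slices the channel range spans
};

ZRange channel_range_to_z(int64_t channel_offset, int64_t channel_range) {
  const int64_t first_z = channel_offset / 4;
  const int64_t last_z = (channel_offset + channel_range - 1) / 4;
  return {first_z, last_z - first_z + 1};
}

int64_t batch_z_offset(int64_t batch_idx, int64_t out_channels,
                       int64_t first_z) {
  return first_z + batch_idx * div_up_4(out_channels);
}

int main() {
  // Copy channels [6, 6 + 5) of a tensor with 16 output channels, batch 1.
  ZRange z = channel_range_to_z(/*channel_offset=*/6, /*channel_range=*/5);
  std::cout << z.first_z << " " << z.extent_z << " "
            << batch_z_offset(/*batch_idx=*/1, /*out_channels=*/16, z.first_z)
            << "\n";  // 1 2 5
  return 0;
}
```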
- - const ivec3 dst_offset{ - 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - - const uvec3 global_size{ - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - const uvec3 local_size = graph.create_local_wg_size(global_size); - - const utils::ivec4 range_params = { - static_cast(global_size[0]), - static_cast(global_size[1]), - static_cast(global_size[2]), - channel_range}; - - const ivec4 offset_params = { - dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_size, - local_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&range_params, sizeof(range_params)), - PushConstantDataInfo(&offset_params, sizeof(offset_params)), - PushConstantDataInfo(&src_channel_offset, sizeof(src_channel_offset))}, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } -} - -void add_copy_offset_node( - ComputeGraph& graph, - ValueRef in, - ValueRef range_ref, - ValueRef src_offset_ref, - ValueRef dst_offset_ref, - ValueRef out) { - ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref)); - ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); - ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); - - ivec4 src_offset = {src[0], src[1], src[2], 0}; - ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out, false, false); -} - -void copy_offset(ComputeGraph& graph, const std::vector& args) { - add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); -} - -void copy_channel_offset( - ComputeGraph& graph, - const std::vector& args) { - ValueRef in = args[0]; - ValueRef channel_range_ref = args[1]; - ValueRef src_channel_offset_ref = args[2]; - ValueRef dst_channel_offset_ref = args[3]; - ValueRef out = args[4]; - - auto channel_range = graph.extract_scalar(channel_range_ref); - auto src_channel_offset = - graph.extract_scalar(src_channel_offset_ref); - auto dst_channel_offset = - graph.extract_scalar(dst_channel_offset_ref); - - add_copy_channel_offset_node( - graph, in, channel_range, src_channel_offset, dst_channel_offset, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.copy_offset, copy_offset); - VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h deleted file mode 100644 index 41956d482d9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// add_copy_offset_node resumes the vkCmdCopyImage command. 
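As a mental model of what the dispatches above implement, the channel-offset copy is equivalent to the plain NCHW slice assignment out[:, dst:dst+range, :, :] = in[:, src:src+range, :, :]. The CPU reference below captures only those semantics (dense float buffers, no texture packing); it is a sketch, not the delegate's code.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Plain NCHW reference for the channel-offset copy:
//   out[:, dst : dst + range, :, :] = in[:, src : src + range, :, :]
// Both tensors are dense float buffers with shape {N, C, H, W}.
void copy_channel_offset_reference(
    const std::vector<float>& in, std::vector<float>& out,
    int64_t N, int64_t C_in, int64_t C_out, int64_t H, int64_t W,
    int64_t channel_range, int64_t src_offset, int64_t dst_offset) {
  assert(src_offset + channel_range <= C_in);
  assert(dst_offset + channel_range <= C_out);
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t c = 0; c < channel_range; ++c) {
      for (int64_t h = 0; h < H; ++h) {
        for (int64_t w = 0; w < W; ++w) {
          const int64_t src_idx = ((n * C_in + src_offset + c) * H + h) * W + w;
          const int64_t dst_idx = ((n * C_out + dst_offset + c) * H + h) * W + w;
          out[dst_idx] = in[src_idx];
        }
      }
    }
  }
}

int main() {
  // in has shape {1, 3, 1, 2}; copy channels [1, 3) into channels [0, 2) of
  // out with shape {1, 2, 1, 2}.
  std::vector<float> in = {0, 1, 10, 11, 20, 21};
  std::vector<float> out(4, -1.f);
  copy_channel_offset_reference(in, out, 1, 3, 2, 1, 2,
                                /*channel_range=*/2, /*src=*/1, /*dst=*/0);
  // out == {10, 11, 20, 21}
  return 0;
}
```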
It copies the
-// texture extents specified by the range, src_offset, and dst_offset (all
-// in texture coordinates (x, y, z)) from the input image to the output image.
-// src_offset.w and dst_offset.w may contain channel size information.
-//
-// It is possible for the input and output to point to the same image
-// object, but when the source range and destination range overlap, the
-// behavior is undefined.
-//
-// The boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl
-// can be used to specify an indexing function in the shader.
-// If calc_out_pos_using_src_chnl is set to true, the channel and batch index
-// will be calculated based on the source channel size and used to determine
-// the destination texel position.
-//
-// If calc_in_pos_using_dst_chnl is set to true, the channel and batch index
-// will be calculated based on the destination channel size and used to
-// determine the source texel position.
-//
-// If both are true, calc_out_pos_using_src_chnl is picked. If both are false,
-// no index calculation happens.
-void add_copy_offset_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    const utils::ivec3& range,
-    const utils::ivec4& src_offset,
-    const utils::ivec4& dst_offset,
-    const ValueRef out,
-    bool calc_out_pos_using_src_chnl,
-    bool calc_in_pos_using_dst_chnl);
-
-// add_copy_packed_dim_offset_node behaves similarly to add_copy_offset_node,
-// except that it is used when copying along the packed dimension of a width-
-// or height-packed tensor.
-// src_offset.w and dst_offset.w may contain channel size information.
-//
-// It copies the texture extents specified by the range, src_offset, and
-// dst_offset (all in texture coordinates (x, y, z)) from the input image to
-// the output image.
-void add_copy_packed_dim_offset_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    const utils::ivec3& range,
-    const utils::ivec4& src_offset,
-    const utils::ivec4& dst_offset,
-    const ValueRef out);
-
-// add_copy_channel_offset_node behaves similarly to add_copy_offset_node,
-// except that it works on the channel dimension of the tensor (up to 4
-// dimensions in NCHW). The range and offset arguments are in tensor
-// coordinates. It assumes the underlying texture is channel-packed.
-//
-// This function is a specialized implementation for copying channel-packed
-// values. The complication is that when reading / writing the channel
-// dimension at indices that are not aligned to the packing, we need to be
-// careful about the boundaries.
-//
-// It achieves the following:
-// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] =
-//   in[:, src_channel_offset:src_channel_offset + channel_range, :, :]
-void add_copy_channel_offset_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    int32_t channel_range,
-    int32_t src_channel_offset,
-    int32_t dst_channel_offset,
-    const ValueRef out);
-
-} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp
deleted file mode 100644
index a217734653d..00000000000
--- a/backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp
+++ /dev/null
@@ -1,843 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void resize_dequantize_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -utils::uvec3 dequantize_per_channel_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. We need to ensure that we dispatch the correct - // number of workgroups in the Z dimension to cover all batch-channel - // combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. - const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -utils::uvec3 dequantize_block_wise_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. We need to ensure that we dispatch the correct - // number of workgroups in the Z dimension to cover all batch-channel - // combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. 
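The workaround comment above hinges on how the dispatch size is derived: workgroups per axis = div_up(global, local), so a local Z size greater than 1 collapses several batch-channel slices onto one workgroup id. A small model of that arithmetic, assuming a hypothetical global Z extent of 6:

```cpp
#include <cstdint>
#include <initializer_list>
#include <iostream>

inline uint32_t div_up(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Models the dispatch-count math the workaround above guards against: the
// number of workgroups launched per axis is div_up(global, local), so only
// div_up(global_z, local_z) distinct workgroup ids exist in Z. Forcing
// local_z = 1 makes the Z workgroup id enumerate every batch-channel slice
// exactly once.
int main() {
  const uint32_t global_z = 6;  // hypothetical number of batch-channel slices

  for (uint32_t local_z : {4u, 1u}) {
    const uint32_t groups_z = div_up(global_z, local_z);
    std::cout << "local_z=" << local_z << " -> " << groups_z
              << " workgroups in Z\n";
  }
  // local_z=4 -> 2 workgroups in Z (ids 0..1)
  // local_z=1 -> 6 workgroups in Z (ids 0..5), one per slice
  return 0;
}
```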
- const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -void add_dequantize_per_tensor_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_per_tensor"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void add_dequantize_per_token_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_per_token"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must 
be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - int num_tokens = static_cast(graph.sizes_of(scale)[0]); - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&num_tokens, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void add_dequantize_per_channel_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& axis, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_per_channel"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - int axis_val = static_cast(graph.get_int(axis)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - // Normalize axis and convert from NCHW to WHCN using utility functions - const auto input_sizes = graph.sizes_of(input); - const int64_t ndim = graph.dim_of(input); - - // Normalize axis to handle negative indices - axis_val = normalize(axis_val, ndim); - - // Convert from NCHW axis to WHCN axis for shader (vulkan representation) - int axis_whcn = nchw_dim_to_whcn_dim(axis_val, ndim); - - int num_channels; - if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) { - // For batch dimension 
dequantization in 4D tensors, pass the actual number - // of channels so the shader can correctly unfold the batch-channel folding - num_channels = static_cast(input_sizes[1]); // Channel dimension - } else { - num_channels = static_cast(input_sizes[axis_val]); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&axis_whcn, sizeof(int)), - PushConstantDataInfo(&num_channels, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - dequantize_per_channel_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void add_dequantize_block_wise_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& block_size, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_block_wise"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - - // Convert dimensions to WHCN order for shader - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*block_size_list); - utils::ivec4 tensor_size_whcn = utils::make_whcn_ivec4(input_sizes); - - // Calculate numBlocks: tensorSize / blockSize (both in WHCN order) - utils::ivec4 num_blocks_vec = { - tensor_size_whcn[0] / block_size_vec[0], - tensor_size_whcn[1] / block_size_vec[1], - tensor_size_whcn[2] / block_size_vec[2], - tensor_size_whcn[3] / block_size_vec[3]}; - - // Calculate blockStride: pre-computed linear strides for the block grid - utils::ivec4 block_stride_vec = { - 1, - 
num_blocks_vec[0], - num_blocks_vec[0] * num_blocks_vec[1], - num_blocks_vec[0] * num_blocks_vec[1] * num_blocks_vec[2]}; - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&block_size_vec, sizeof(block_size_vec)), - PushConstantDataInfo(&num_blocks_vec, sizeof(num_blocks_vec)), - PushConstantDataInfo(&block_stride_vec, sizeof(block_stride_vec)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - dequantize_block_wise_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void dequantize_per_tensor_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - dtype and output_dtype are inferred - (void)dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - add_dequantize_per_tensor_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void 
dequantize_per_token_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - dtype and output_dtype are inferred - (void)dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Calculate number of tokens (product of all dimensions except the last one) - int64_t num_tokens = 1; - const auto input_sizes = graph.sizes_of(input); - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_tokens - // This allows for both 1D tensors (size [num_tokens]) and reshaped tensors - // (size [num_tokens, 1]) - VK_CHECK_COND(scale_numel == num_tokens); - VK_CHECK_COND(zero_point_numel == num_tokens); - - add_dequantize_per_token_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void dequantize_per_channel_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef axis = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef 
output = args[arg_idx++]; - - // Suppress unused variable warnings - dtype and output_dtype are inferred - (void)dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Normalize axis - int axis_val = static_cast(graph.get_int(axis)); - const auto input_sizes = graph.sizes_of(input); - int ndim = graph.dim_of(input); - if (axis_val < 0) { - axis_val += ndim; - } - - // Verify axis is valid - VK_CHECK_COND(axis_val >= 0 && axis_val < ndim); - - // Get number of channels along the specified axis - int64_t num_channels = input_sizes[axis_val]; - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_channels - VK_CHECK_COND(scale_numel == num_channels); - VK_CHECK_COND(zero_point_numel == num_channels); - - add_dequantize_per_channel_node( - graph, input, scale, zero_point, axis, quant_min, quant_max, output); -} - -void dequantize_affine_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef block_size = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef input_dtype = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - (void)input_dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - 
graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Verify block_size is valid (each dimension must divide evenly into input - // size) - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - VK_CHECK_COND(block_size_list->size() == input_sizes.size()); - - for (size_t i = 0; i < input_sizes.size(); i++) { - if ((*block_size_list)[i] > 1) { - VK_CHECK_COND( - input_sizes[i] % (*block_size_list)[i] == 0, - "Input size at dimension ", - i, - " (", - input_sizes[i], - ") must be divisible by block_size at dimension ", - i, - " (", - (*block_size_list)[i], - ")"); - } - } - - add_dequantize_block_wise_node( - graph, - input, - block_size, - scale, - zero_point, - quant_min, - quant_max, - output); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - quantized_decomposed.dequantize_per_tensor.tensor, - dequantize_per_tensor_impl); - VK_REGISTER_OP( - quantized_decomposed.dequantize_per_token.default, - dequantize_per_token_impl); - VK_REGISTER_OP( - quantized_decomposed.dequantize_per_channel.default, - dequantize_per_channel_impl); - - // TorchAO affine dequantization operators - VK_REGISTER_OP(torchao.dequantize_affine.default, dequantize_affine_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp deleted file mode 100644 index 475e7796b09..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -#include - -#include - -namespace vkcompute { - -using utils::GPUMemoryLayout; -using utils::StorageType; - -void check_embedding_args( - ComputeGraph& graph, - const ValueRef weight, - const ValueRef in, - const ValueRef out) { - // The packing logic may not be trivial here. Input and output are Channel - // Packed, which is default for the Vulkan backend. However, weight vector is - // height-packed instead of channel-packed for space reason. 
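Stepping back to the block-wise dequantization set up above: each element's block coordinate is its WHCN coordinate divided by block_size, the block id is linearized with the {1, nb0, nb0*nb1, nb0*nb1*nb2} strides, and the value is recovered as (q - zero_point[block]) * scale[block]. The sketch below models that on the CPU; the tensor and block sizes in main are invented.

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone model of block-wise affine dequantization: the tensor is tiled
// into blocks of `block_size` along each (WHCN-ordered) dimension, each block
// has its own scale / zero point, and
//   value = (q - zero_point[block]) * scale[block].
using ivec4 = std::array<int64_t, 4>;

int64_t block_id(const ivec4& coord, const ivec4& block_size,
                 const ivec4& num_blocks) {
  // Pre-computed linear strides over the block grid, matching the
  // block_stride push constant: {1, nb0, nb0*nb1, nb0*nb1*nb2}.
  const ivec4 block_stride = {
      1, num_blocks[0], num_blocks[0] * num_blocks[1],
      num_blocks[0] * num_blocks[1] * num_blocks[2]};
  int64_t id = 0;
  for (int d = 0; d < 4; ++d) {
    id += (coord[d] / block_size[d]) * block_stride[d];
  }
  return id;
}

float dequantize_block_wise(int32_t q, const ivec4& coord,
                            const ivec4& block_size, const ivec4& num_blocks,
                            const std::vector<float>& scale,
                            const std::vector<int32_t>& zero_point) {
  const int64_t id = block_id(coord, block_size, num_blocks);
  return static_cast<float>(q - zero_point[id]) * scale[id];
}

int main() {
  // An 8x8x1x1 (WHCN) tensor quantized in 4x4 blocks -> a 2x2 block grid.
  const ivec4 block_size = {4, 4, 1, 1};
  const ivec4 num_blocks = {2, 2, 1, 1};
  const std::vector<float> scale = {0.1f, 0.2f, 0.3f, 0.4f};
  const std::vector<int32_t> zero_point = {0, 10, 0, 0};

  // Element at (w=5, h=6) falls in block (1, 1) -> linear block id 3.
  std::cout << dequantize_block_wise(7, {5, 6, 0, 0}, block_size, num_blocks,
                                     scale, zero_point)
            << "\n";  // (7 - 0) * 0.4 = 2.8
  return 0;
}
```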
- VK_CHECK_COND(graph.packed_dim_of(weight) == WHCN::kHeightDim); - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void add_embedding_node( - ComputeGraph& graph, - ValueRef weight, - ValueRef in, - ValueRef out) { - check_embedding_args(graph, weight, in, out); - - std::string kernel_name = "embedding"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {{in, weight}, vkapi::kRead}}, - { - graph.sizes_ubo(out), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - graph.hashed_layout_of(weight)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void embedding(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args[1]; - ValueRef out = args[5]; - - ValueRef weight = prepack_standard( - graph, - args[0], - StorageType::TEXTURE_2D, - GPUMemoryLayout::TENSOR_HEIGHT_PACKED); - - add_embedding_node(graph, weight, in, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.embedding.default, embedding); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Expand.cpp b/backends/vulkan/runtime/graph/ops/impl/Expand.cpp deleted file mode 100644 index 1623a26b2a1..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Expand.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
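The embedding op prepacked above is, semantically, a row gather from the weight table. A plain CPU reference of that lookup with invented example values, independent of the texture packing discussed in the comment:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics of aten.embedding.default: every index selects one row
// of the (num_embeddings x embedding_dim) weight table, so the output has
// shape indices.shape + {embedding_dim}.
std::vector<float> embedding_reference(
    const std::vector<float>& weight, int64_t embedding_dim,
    const std::vector<int64_t>& indices) {
  std::vector<float> out(indices.size() * embedding_dim);
  for (size_t i = 0; i < indices.size(); ++i) {
    for (int64_t d = 0; d < embedding_dim; ++d) {
      out[i * embedding_dim + d] = weight[indices[i] * embedding_dim + d];
    }
  }
  return out;
}

int main() {
  // 3 embeddings of dimension 2.
  const std::vector<float> weight = {0.f, 1.f, 10.f, 11.f, 20.f, 21.f};
  const std::vector<float> out = embedding_reference(weight, 2, {2, 0, 2});
  for (float v : out) std::cout << v << " ";  // 20 21 0 1 20 21
  std::cout << "\n";
  return 0;
}
```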
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void add_expand_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef size, - const ValueRef out) { - std::string kernel_name = "expand"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {size}, - // Resizing Logic - nullptr)); -} - -void expand(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef size = args.at(idx++); - const ValueRef implicit = args.at(idx++); - (void)implicit; - const ValueRef out = args.at(idx++); - - if (graph.is_buffer_storage(out)) { - return add_expand_buffer_node(graph, in, size, out); - } - - VK_THROW("Expand operator only supports buffer storage"); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.expand_copy.default, expand); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Flip.cpp b/backends/vulkan/runtime/graph/ops/impl/Flip.cpp deleted file mode 100644 index 52288734704..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Flip.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
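aten.expand_copy, which the buffer-only node above implements, resolves its target sizes with the usual broadcasting rules: -1 keeps the existing size, size-1 dims may be broadcast, other dims must match, and leading dims may be prepended. A sketch of that size resolution under those (standard aten) assumptions, not taken from the deleted shader:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Resolves the output sizes of an expand: -1 keeps the existing size, dims of
// size 1 may be broadcast, other dims must match, and extra leading
// dimensions may be prepended.
std::vector<int64_t> resolve_expand_sizes(
    const std::vector<int64_t>& in_sizes,
    const std::vector<int64_t>& requested) {
  if (requested.size() < in_sizes.size()) {
    throw std::invalid_argument("expand: too few target dimensions");
  }
  const size_t lead = requested.size() - in_sizes.size();
  std::vector<int64_t> out(requested.size());
  for (size_t i = 0; i < requested.size(); ++i) {
    const int64_t want = requested[i];
    if (i < lead) {
      if (want < 0) throw std::invalid_argument("expand: -1 on a new dim");
      out[i] = want;
      continue;
    }
    const int64_t have = in_sizes[i - lead];
    if (want == -1) {
      out[i] = have;
    } else if (have == 1 || want == have) {
      out[i] = want;
    } else {
      throw std::invalid_argument("expand: incompatible dimension");
    }
  }
  return out;
}

int main() {
  const auto out = resolve_expand_sizes({1, 3}, {4, 2, -1, 3});
  for (int64_t s : out) std::cout << s << " ";  // 4 2 1 3
  std::cout << "\n";
  return 0;
}
```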
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -// Custom global workgroup size function for flip -utils::uvec3 flip_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return graph->create_global_wg_size(out); -} - -void check_flip_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void resize_flip_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - graph->virtual_resize(out, graph->sizes_of(in)); -} - -utils::ivec4 create_whcn_bitmap( - const std::vector& list, - const int64_t ndim) { - std::vector bm(4, 0); - for (const auto e : list) { - auto x = (e % ndim + ndim) % ndim; // normalize - x = ndim - 1 - x; // reverse - bm.at(x) = 1; - } - return utils::make_ivec4(bm); -} - -void add_flip_node( - ComputeGraph& graph, - const ValueRef in, - const std::vector& dim_list, - const ValueRef out) { - check_flip_args(graph, in, out); - - const auto dim_bitmap = create_whcn_bitmap(dim_list, graph.dim_of(in)); - - std::string kernel_name("flip"); - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - flip_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - { - graph.logical_limits_ubo(out), - graph.sizes_ubo(out), - graph.create_params_buffer(dim_bitmap), - }, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_flip_node)); -} - -void flip(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args[0]; - auto dims = graph.get_int_list(args[1]); - ValueRef out = args[2]; - - add_flip_node(graph, in, *dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.flip.default, flip); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp deleted file mode 100644 index fe2676e91e0..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Full.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
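The create_whcn_bitmap helper above is small enough to restate standalone: normalize each possibly-negative NCHW dim, mirror it into WHCN order, and set one bit per flipped axis. The same logic as a self-contained function:

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone version of the create_whcn_bitmap helper: normalize each
// (possibly negative) NCHW dim, then mirror it into WHCN order so the shader
// can test one flag per axis.
std::array<int32_t, 4> create_whcn_bitmap(const std::vector<int64_t>& dims,
                                          int64_t ndim) {
  std::array<int32_t, 4> bm = {0, 0, 0, 0};
  for (int64_t d : dims) {
    int64_t x = (d % ndim + ndim) % ndim;  // normalize negative dims
    x = ndim - 1 - x;                      // reverse: NCHW index -> WHCN index
    bm.at(x) = 1;
  }
  return bm;
}

int main() {
  // Flipping dims {-1, 1} of a 4D NCHW tensor marks the W and C axes.
  const auto bm = create_whcn_bitmap({-1, 1}, 4);
  for (int32_t b : bm) std::cout << b << " ";  // 1 0 1 0
  std::cout << "\n";
  return 0;
}
```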
- */ - -#include - -#include -#include -#include - -#include - -namespace vkcompute { - -void resize_full_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - std::vector out_sizes; - if (graph->val_is_tensor(extra_args.at(0))) { - out_sizes = graph->sizes_of(extra_args.at(0)); - } else { - out_sizes = *graph->get_int_list(extra_args.at(0)); - } - - graph->virtual_resize(out, out_sizes); -} - -void add_full_node( - ComputeGraph& graph, - const ValueRef size_or_in, - const ValueRef fill_value, - const ValueRef out) { - float fill_value_val = graph.extract_scalar(fill_value); - - std::string kernel_name("full"); - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}}, - // Shader params buffers - {graph.sizes_ubo(out), graph.create_params_buffer(fill_value_val)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out)}, - // Resize Args - {size_or_in}, - // Resizing Logic - resize_full_node)); -} - -void full(ComputeGraph& graph, const std::vector& args) { - return add_full_node(graph, args[0], args[1], args[args.size() - 1]); -} - -void zeros(ComputeGraph& graph, const std::vector& args) { - return add_full_node( - graph, args[0], graph.add_scalar(0), args[args.size() - 1]); -} - -void ones(ComputeGraph& graph, const std::vector& args) { - return add_full_node( - graph, args[0], graph.add_scalar(1), args[args.size() - 1]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.full.default, full); - VK_REGISTER_OP(aten.full_like.default, full); - VK_REGISTER_OP(aten.zeros.default, zeros); - VK_REGISTER_OP(aten.zeros_like.default, zeros); - VK_REGISTER_OP(aten.ones.default, ones); - VK_REGISTER_OP(aten.ones_like.default, ones); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp deleted file mode 100644 index 5f39c16d405..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include - -namespace vkcompute { - -struct GridPriorsParam final { - int32_t stride; - float offset; -}; - -void resize_grid_priors_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = extra_args.at(0); - const std::vector in_sizes = graph->sizes_of(in); - const int64_t height = in_sizes.at(in_sizes.size() - 2); - const int64_t width = in_sizes.at(in_sizes.size() - 1); - const std::vector sizes = {height * width, 2}; - graph->virtual_resize(out, sizes); -} - -void add_grid_priors_node( - ComputeGraph& graph, - const ValueRef& in, - const ValueRef& stride_ref, - const ValueRef& offset_ref, - const ValueRef& out) { - const int32_t stride = graph.extract_scalar(stride_ref); - const float offset = graph.extract_scalar(offset_ref); - - std::string kernel_name = "grid_priors"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const GridPriorsParam param = {stride, offset}; - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - }, - // Shader params buffers - { - graph.sizes_ubo(in), - graph.sizes_ubo(out), - graph.create_params_buffer(param), - }, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {in}, - // Resizing Logic - resize_grid_priors_node)); -} - -void grid_priors(ComputeGraph& graph, const std::vector& args) { - return add_grid_priors_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.grid_priors.default, grid_priors); -} -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/GroupNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/GroupNorm.cpp deleted file mode 100644 index 368b95c9d3b..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/GroupNorm.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
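The grid_priors resize logic above fixes the output shape at {H * W, 2} with a per-op stride and offset. The conventional interpretation is one anchor center per input pixel at ((x + offset) * stride, (y + offset) * stride); the reference below assumes that formula, which is not confirmed by the deleted shader itself.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical CPU reference for grid_priors: one (x, y) anchor center per
// input pixel, flattened row-major into an {H * W, 2} buffer.
std::vector<float> grid_priors_reference(int64_t height, int64_t width,
                                         int32_t stride, float offset) {
  std::vector<float> out;
  out.reserve(height * width * 2);
  for (int64_t y = 0; y < height; ++y) {
    for (int64_t x = 0; x < width; ++x) {
      out.push_back((static_cast<float>(x) + offset) * stride);
      out.push_back((static_cast<float>(y) + offset) * stride);
    }
  }
  return out;
}

int main() {
  const auto pts = grid_priors_reference(/*height=*/2, /*width=*/2,
                                         /*stride=*/8, /*offset=*/0.5f);
  for (size_t i = 0; i < pts.size(); i += 2) {
    std::cout << "(" << pts[i] << ", " << pts[i + 1] << ") ";
  }
  std::cout << "\n";  // (4, 4) (12, 4) (4, 12) (12, 12)
  return 0;
}
```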
- */ - -#include - -#include -#include - -#include - -#include - -namespace vkcompute { - -utils::uvec3 group_norm_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)global_workgroup_size; - (void)args; - (void)resize_args; - - return {1, 64, 1}; -} - -void resize_group_norm_texture_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - // Extract tensor references from args - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const ValueRef mean = args.at(1).refs.at(3); - const ValueRef rstd = args.at(1).refs.at(4); - - // Extract group from resize args - const int64_t group_val = graph->extract_scalar(resize_args.at(0)); - - // Get input tensor sizes using ComputeGraph APIs - const std::vector in_sizes = graph->sizes_of(in); - - // Output tensor should have the same size as input - graph->virtual_resize(out, in_sizes); - - // Mean and rstd tensors should have size {num_batches, num_groups} - const int64_t N = in_sizes.at(0); // batch dimension - const std::vector mean_rstd_sizes = {N, group_val}; - - // Resize mean and rstd tensors - graph->virtual_resize(mean, mean_rstd_sizes); - graph->virtual_resize(rstd, mean_rstd_sizes); -} - -void add_native_group_norm_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef weight_data, - const ValueRef bias_data, - const ValueRef N, - const ValueRef C, - const ValueRef HxW, - const ValueRef group, - const ValueRef eps, - const ValueRef out, - const ValueRef mean, - const ValueRef rstd) { - (void)N; - (void)C; - (void)HxW; - - const ValueRef arg_weight = prepack_standard( - graph, - weight_data, - graph.storage_type_of(in), - utils::kWidthPacked, - false); - const ValueRef arg_bias = prepack_standard( - graph, bias_data, graph.storage_type_of(in), utils::kWidthPacked, false); - - const int64_t group_val = graph.extract_scalar(group); - const float epsilon = graph.extract_scalar(eps); - - const std::vector in_sizes = graph.sizes_of(in); - - std::string kernel_name("group_norm_reduce_texture"); - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const struct { - int32_t group; - float epsilon; - } params_uniform = {static_cast(group_val), epsilon}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - group_norm_local_wg_size, - // Inputs and Outputs - {{{mean, rstd}, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.strides_ubo(mean), - graph.numel_ubo(mean), - graph.logical_limits_ubo(in), - graph.sizes_ubo(in), - }, - // Push Constants - { - PushConstantDataInfo(¶ms_uniform, sizeof(params_uniform)), - }, - // Specialization Constants - { - graph.hashed_layout_of(mean), - }, - // Resize Args - {group}, - // Resizing Logic - nullptr)); - - // Compute element-wise normalization, now that mean and rstd have been - // computed. 
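The two dispatches above split group norm into a reduction pass (mean and rstd per (batch, group), stored as an {N, groups} buffer) and an element-wise pass that applies the per-channel weight and bias. A CPU reference of that math, using the biased variance as native_group_norm does; example shapes in main are invented.

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// CPU reference for the two-pass structure: pass 1 reduces each (batch, group)
// slice of an NCHW tensor to a mean and rstd = 1/sqrt(var + eps); pass 2
// normalizes element-wise and applies the per-channel weight and bias.
void group_norm_reference(
    const std::vector<float>& in, std::vector<float>& out,
    std::vector<float>& mean, std::vector<float>& rstd,
    const std::vector<float>& weight, const std::vector<float>& bias,
    int64_t N, int64_t C, int64_t H, int64_t W, int64_t groups, float eps) {
  const int64_t cpg = C / groups;          // channels per group
  const int64_t group_numel = cpg * H * W;

  for (int64_t n = 0; n < N; ++n) {
    for (int64_t g = 0; g < groups; ++g) {
      // Pass 1: reduce to mean / rstd for this (batch, group) pair.
      double sum = 0.0, sum_sq = 0.0;
      for (int64_t c = g * cpg; c < (g + 1) * cpg; ++c) {
        for (int64_t i = 0; i < H * W; ++i) {
          const float v = in[(n * C + c) * H * W + i];
          sum += v;
          sum_sq += static_cast<double>(v) * v;
        }
      }
      const double m = sum / group_numel;
      const double var = sum_sq / group_numel - m * m;  // biased variance
      mean[n * groups + g] = static_cast<float>(m);
      rstd[n * groups + g] = static_cast<float>(1.0 / std::sqrt(var + eps));

      // Pass 2: element-wise normalization with per-channel affine params.
      for (int64_t c = g * cpg; c < (g + 1) * cpg; ++c) {
        for (int64_t i = 0; i < H * W; ++i) {
          const int64_t idx = (n * C + c) * H * W + i;
          out[idx] = (in[idx] - mean[n * groups + g]) * rstd[n * groups + g] *
              weight[c] + bias[c];
        }
      }
    }
  }
}

int main() {
  const int64_t N = 1, C = 4, H = 1, W = 2, groups = 2;
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> out(in.size()), mean(N * groups), rstd(N * groups);
  std::vector<float> weight(C, 1.f), bias(C, 0.f);
  group_norm_reference(in, out, mean, rstd, weight, bias,
                       N, C, H, W, groups, 1e-5f);
  std::cout << mean[0] << " " << mean[1] << "\n";  // 2.5 6.5
  return 0;
}
```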
- std::string norm_kernel_name("group_norm_texture"); - norm_kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(norm_kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(norm_kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, - {{in, arg_weight, arg_bias, mean, rstd}, vkapi::kRead}}, - // Shader params buffers - { - graph.logical_limits_ubo(out), - graph.sizes_ubo(out), - graph.logical_limits_ubo(arg_weight), - graph.strides_ubo(mean), - }, - // Push Constants - { - PushConstantDataInfo(¶ms_uniform, sizeof(params_uniform)), - }, - // Specialization Constants - { - graph.hashed_layout_of(in), - }, - // Resize Args - {group}, - // Resizing Logic - resize_group_norm_texture_node)); -} - -void native_group_norm(ComputeGraph& graph, const std::vector& args) { - // Assign each element of the args vector to const ValueRef variables - const ValueRef in = args.at(0); - const ValueRef weight_data = args.at(1); - const ValueRef bias_data = args.at(2); - const ValueRef N = args.at(3); - const ValueRef C = args.at(4); - const ValueRef HxW = args.at(5); - const ValueRef group = args.at(6); - const ValueRef eps = args.at(7); - const ValueRef out_tuple_ref = args.at(8); - - ValueRef out = kDummyValueRef; - ValueRef mean = kDummyValueRef; - ValueRef rstd = kDummyValueRef; - - { - const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - out = out_tuple->at(0); - mean = out_tuple->at(1); - rstd = out_tuple->at(2); - } - - VK_CHECK_COND(graph.val_is_tref(weight_data)); - VK_CHECK_COND(graph.val_is_tref(bias_data)); - - // Check expected storage types and memory layouts for tensor variables - VK_CHECK_COND(graph.is_standard_channels_packed_texture_tensor(in)); - VK_CHECK_COND(graph.is_standard_channels_packed_texture_tensor(out)); - - VK_CHECK_COND(graph.is_contiguous_buffer_tensor(mean)); - VK_CHECK_COND(graph.is_contiguous_buffer_tensor(rstd)); - - return add_native_group_norm_node( - graph, - in, - weight_data, - bias_data, - N, - C, - HxW, - group, - eps, - out, - mean, - rstd); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.native_group_norm.default, native_group_norm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp deleted file mode 100644 index 576711a86f1..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_index_select_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef idx, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(idx) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void add_index_select_channel_node( - ComputeGraph& graph, - ValueRef in, - ValueRef idx, - ValueRef out) { - check_index_select_args(graph, in, idx, out); - - std::string kernel_name = "index_select_channel"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {{in, idx}, vkapi::kRead}}, - {graph.sizes_ubo(out), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -struct IndexSelectParams final { - int32_t gpu_dim; - int32_t stride; -}; - -IndexSelectParams create_index_select_params( - ComputeGraph& graph, - const int64_t dim_idx, - const ValueRef in) { - if (dim_idx == kWidth4D) { - return {0, 1}; - } else if (dim_idx == kHeight4D) { - return {1, 1}; - } else if (dim_idx == kBatch4D) { - const std::vector in_sizes = graph.sizes_of(in); - int64_t n_channels = dim_at(in_sizes, kChannel4D); - int64_t stride = utils::div_up_4(n_channels); - return {2, static_cast(stride)}; - } else { - VK_THROW("Unexpected dim_idx!"); - } -} - -void add_index_select_node( - ComputeGraph& graph, - ValueRef in, - const int64_t dim_idx, - ValueRef idx, - ValueRef out) { - check_index_select_args(graph, in, idx, out); - - IndexSelectParams params = create_index_select_params(graph, dim_idx, in); - - std::string kernel_name = "index_select"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {{in, idx}, vkapi::kRead}}, - {graph.sizes_ubo(out), graph.create_params_buffer(params)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -int64_t get_dim_idx(ComputeGraph& graph, ValueRef in, ValueRef dim_ref) { - int64_t dim = graph.extract_scalar(dim_ref); - const int64_t ndim = graph.dim_of(in); - dim = normalize(dim, ndim); - - // Convert to DimIndex - this replicates normalize_to_dim_index logic - return dim < 0 ? 
dim : dim - ndim; -} - -void index_select(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args[0]; - ValueRef dim_ref = args[1]; - ValueRef idx = args[2]; - ValueRef out = args[3]; - - const int64_t dim_idx = get_dim_idx(graph, in, dim_ref); - if (dim_idx == kChannel4D) { - add_index_select_channel_node(graph, in, idx, out); - } else { - add_index_select_node(graph, in, dim_idx, idx, out); - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.index_select.default, index_select); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp deleted file mode 100644 index 38d70271f4f..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -// Custom global workgroup size function for addmm_naive_texture -utils::uvec3 addmm_naive_texture_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return graph->logical_limits_of(out); -} - -// Custom global workgroup size function for addmm_naive_buffer -utils::uvec3 addmm_naive_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return { - graph->size_at(-1, out), - graph->size_at(-2, out), - graph->size_at(-3, out) * graph->size_at(-4, out)}; -} - -// Custom global workgroup size function for addmm_optimized -utils::uvec3 addmm_optimized_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - - std::vector mat1_sizes = graph->sizes_of(mat1); - int mat1_dims = mat1_sizes.size(); - - utils::uvec3 global_size = graph->logical_limits_of(out); - - if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(global_size, {4, 2, 1}); - } else { - global_size = utils::divup_vec(global_size, {4, 4, 1}); - } - return global_size; -} - -// Custom local workgroup size function for addmm_optimized -utils::uvec3 addmm_optimized_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)resize_args; - return adaptive_work_group_size(global_workgroup_size); -} - -void check_addmm_args( - ComputeGraph& graph, - const ValueRef self, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out) { - (void)alpha; - (void)beta; - - std::vector self_sizes = graph.sizes_of(self); - std::vector mat1_sizes = graph.sizes_of(mat1); - std::vector mat2_sizes = graph.sizes_of(mat2_data); - - VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); - VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - - 
VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - - VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); - - if (utils::val_at(-1, self_sizes) != 1) { - VK_CHECK_COND( - utils::val_at(-1, self_sizes) == utils::val_at(-1, mat2_sizes)); - } - if (utils::val_at(-2, self_sizes) != 1) { - VK_CHECK_COND( - utils::val_at(-2, self_sizes) == utils::val_at(-2, mat1_sizes)); - } -} - -void resize_addmm_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const bool mat2_is_transposed = graph->get_bool(extra_args.at(0)); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector mat2_sizes = graph->sizes_of(mat2); - - const int out_cols = utils::val_at(-2, mat1_sizes); - const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2_sizes) - : utils::val_at(-1, mat2_sizes); - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -struct Params final { - float alpha; - float beta; -}; - -void add_addmm_naive_texture_node( - ComputeGraph& graph, - const ValueRef self_data, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const Params& params, - const ValueRef mat2_is_transposed) { - utils::StorageType stype = graph.storage_type_of(out); - ValueRef self = prepack_standard( - graph, self_data, stype, utils::kWidthPacked, /*passthrough = */ true); - ValueRef mat2 = prepack_standard( - graph, mat2_data, stype, utils::kHeightPacked, /*passthrough = */ true); - - std::string kernel_name = - graph.get_bool(mat2_is_transposed) ? 
"linear_naive" : "addmm_naive"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - addmm_naive_texture_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2, self}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.logical_limits_ubo(out), - graph.sizes_ubo(mat1), - graph.sizes_ubo(mat2), - graph.sizes_ubo(self), - graph.create_params_buffer(params), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1), - graph.hashed_layout_of(mat2), - graph.hashed_layout_of(self)}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_addmm_node)); -} - -void add_addmm_naive_buffer_node( - ComputeGraph& graph, - const ValueRef self_data, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const Params& params, - const ValueRef mat2_is_transposed) { - (void)beta; - (void)alpha; - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - ValueRef self = prepack_standard( - graph, - self_data, - graph.storage_type_of(out), - utils::kWidthPacked, - /*passthrough = */ true); - - std::string kernel_name = "addmm_naive_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - utils::uvec3 global_size = { - graph.size_at(-1, out), - graph.size_at(-2, out), - graph.size_at(-3, out) * graph.size_at(-4, out)}; - - int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef && - graph.get_bool(mat2_is_transposed)) - ? 
1 - : 0; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - addmm_naive_buffer_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2, self}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.strides_ubo(out), - graph.sizes_ubo(mat1), - graph.strides_ubo(mat1), - graph.sizes_ubo(mat2), - graph.strides_ubo(mat2), - graph.numel_ubo(out), - graph.create_params_buffer(params), - }, - // Push Constants - {}, - // Specialization Constants - {mat2_is_transposed_val}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_addmm_node)); -} - -void add_addmm_optimized_node( - ComputeGraph& graph, - const ValueRef self_data, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const Params& params, - const ValueRef mat2_is_transposed) { - utils::StorageType stype = graph.storage_type_of(out); - ValueRef self = prepack_standard( - graph, self_data, stype, utils::kChannelsPacked, /*passthrough=*/true); - ValueRef mat2 = prepack_standard( - graph, mat2_data, stype, utils::kHeightPacked, /*passthrough=*/true); - - // Ensure mat1 is width packed - ValueRef mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked); - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - - const bool mat2_is_transposed_val = graph.get_bool(mat2_is_transposed); - - // Ensure mat2 is height packed - ValueRef mat2_packed = mat2; - const utils::GPUMemoryLayout mat2_layout = - mat2_is_transposed_val ? utils::kWidthPacked : utils::kHeightPacked; - if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { - mat2_packed = graph.add_tensor_like(mat2, mat2_layout); - viewFn(graph, {mat2, graph.add_none(), mat2_packed}); - } - - std::string kernel_name = graph.get_bool(mat2_is_transposed) - ? 
"linear_optimized" - : "addmm_optimized"; - - std::vector mat1_sizes = graph.sizes_of(mat1_W_packed); - int mat1_dims = mat1_sizes.size(); - if (mat1_dims == 3) { - kernel_name = "batch_" + kernel_name; - } - if (mat1_sizes.at(mat1_dims - 2) < 8) { - kernel_name += "_tile_row_2"; - } else { - kernel_name += "_tile_row_4"; - } - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - addmm_optimized_global_wg_size, - addmm_optimized_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, - {{mat1_W_packed, mat2_packed, self}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.sizes_ubo(mat1_W_packed), - graph.sizes_ubo(mat2_packed), - graph.sizes_ubo(self), - graph.create_params_buffer(params), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1_W_packed), - graph.hashed_layout_of(mat2_packed), - graph.hashed_layout_of(self)}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_addmm_node)); -} - -void add_addmm_node( - ComputeGraph& graph, - const ValueRef self, - const ValueRef mat1, - const ValueRef mat2, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const ValueRef mat2_is_transposed) { - float alpha_val = 1.0f; - float beta_val = 1.0f; - - if (alpha != kDummyValueRef) { - alpha_val = graph.extract_scalar(alpha); - } - if (beta != kDummyValueRef) { - beta_val = graph.extract_scalar(beta); - } - - Params params = {alpha_val, beta_val}; - if (graph.is_buffer_storage(out)) { - add_addmm_naive_buffer_node( - graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { - add_addmm_optimized_node( - graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { - add_addmm_naive_texture_node( - graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else { - VK_THROW("Input should be channel packed or width packed."); - } -} - -void addmm(ComputeGraph& graph, const std::vector& args) { - check_addmm_args(graph, args[0], args[1], args[2], args[3], args[4], args[5]); - ValueRef mat2_is_transposed = graph.add_scalar(false); - return add_addmm_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - mat2_is_transposed); -} - -void linear(ComputeGraph& graph, const std::vector& args) { - ValueRef input = args.at(0); - ValueRef weight_data = args.at(1); - ValueRef bias = args.at(2); - ValueRef out = args.at(3); - ValueRef weight = prepack_standard( - graph, - weight_data, - graph.storage_type_of(out), - utils::kWidthPacked, - /*passthrough = */ true); - ValueRef mat2_is_transposed = graph.add_scalar(true); - - if (graph.val_is_none(bias)) { - return add_matmul_node(graph, input, weight, out, mat2_is_transposed); - } else { - return add_addmm_node( - graph, - bias, - input, - weight, - kDummyValueRef, - kDummyValueRef, - out, - mat2_is_transposed); - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.addmm.default, addmm); - VK_REGISTER_OP(aten.linear.default, linear); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp deleted file mode 100644 index 47ecf5f18d2..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ /dev/null @@ -1,327 +0,0 @@ -/* - * 
Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_matmul_args( - const ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out) { - std::vector mat1_sizes = graph.sizes_of(mat1); - std::vector mat2_sizes = graph.sizes_of(mat2_data); - - VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); - VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - - VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - - VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); -} - -void resize_matmul_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - bool mat2_is_transposed = graph->get_bool(resize_args.at(0)); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector mat2_sizes = graph->sizes_of(mat2); - - const int out_cols = utils::val_at(-2, mat1_sizes); - const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2_sizes) - : utils::val_at(-1, mat2_sizes); - - const int64_t out_dim = graph->dim_of(out); - std::vector new_out_sizes(mat1_sizes); - new_out_sizes.at(out_dim - 1) = out_rows; - new_out_sizes.at(out_dim - 2) = out_cols; - - graph->virtual_resize(out, new_out_sizes); -} - -/** - * Custom global workgroup size function for naive buffer matmul operations. - */ -utils::uvec3 matmul_naive_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return { - graph->size_at(-1, out), - graph->size_at(-2, out), - graph->size_at(-3, out) * graph->size_at(-4, out)}; -} - -void add_matmul_naive_buffer_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - - std::string kernel_name = "matmul_naive_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef && - graph.get_bool(mat2_is_transposed)) - ? 
1 - : 0; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - matmul_naive_buffer_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.strides_ubo(out), - graph.sizes_ubo(mat1), - graph.strides_ubo(mat1), - graph.sizes_ubo(mat2), - graph.strides_ubo(mat2), - graph.numel_ubo(out), - }, - // Push Constants - {}, - // Specialization Constants - {mat2_is_transposed_val}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_matmul_node)); -} - -vkapi::ShaderInfo pick_matmul_naive_texture3d_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const bool is_transposed = graph->get_bool(resize_args.at(0)); - - std::string kernel_name = - is_transposed ? "matmul_transposed_naive" : "matmul_naive"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -void add_matmul_naive_texture3d_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_matmul_naive_texture3d_shader, - default_pick_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.logical_limits_ubo(out), - graph.sizes_ubo(mat1), - graph.sizes_ubo(mat2), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1), - graph.hashed_layout_of(mat2)}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_matmul_node)); -} - -vkapi::ShaderInfo pick_matmul_optimized_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1_W_packed = resize_args.at(1); - const bool mat2_is_transposed_val = graph->get_bool(resize_args.at(0)); - - std::string kernel_name = mat2_is_transposed_val - ? 
"matmul_transposed_optimized" - : "matmul_optimized"; - - std::vector mat1_sizes = graph->sizes_of(mat1_W_packed); - size_t mat1_dims = mat1_sizes.size(); - if (mat1_dims == 3) { - kernel_name = "batch_" + kernel_name; - } - if (mat1_sizes.at(mat1_dims - 2) < 8) { - kernel_name += "_tile_row_2"; - } else { - kernel_name += "_tile_row_4"; - } - - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 matmul_optimized_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1_W_packed = resize_args.at(1); - - const std::vector mat1_sizes = graph->sizes_of(mat1_W_packed); - const size_t mat1_dims = mat1_sizes.size(); - - utils::uvec3 global_size = graph->logical_limits_of(out); - if (mat1_sizes.at(mat1_dims - 2) < 8) { - // Use `logical_extents` instead of `image_extents` because the workgroup - // axes need to correspond to tensor dimensions. - global_size = utils::divup_vec(global_size, {4, 2, 1}); - } else { - global_size = utils::divup_vec(global_size, {4, 4, 1}); - } - - return global_size; -} - -void add_matmul_optimized_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - - // Ensure mat1 is width packed - TmpTensor mat1_tmp( - &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked); - ValueRef mat1_W_packed = mat1; - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - if (graph.packed_dim_of(mat1) != WHCN::kWidthDim) { - mat1_W_packed = mat1_tmp; - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - } - - const bool mat2_is_transposed_val = graph.get_bool(mat2_is_transposed); - - // Ensure mat2 to height packed - ValueRef mat2_packed = mat2; - const utils::GPUMemoryLayout mat2_layout = - mat2_is_transposed_val ? 
utils::kWidthPacked : utils::kHeightPacked; - TmpTensor mat2_tmp( - &graph, graph.sizes_of(mat2), graph.dtype_of(mat2), mat2_layout); - if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { - mat2_packed = mat2_tmp; - viewFn(graph, {mat2, graph.add_none(), mat2_packed}); - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_matmul_optimized_shader, - matmul_optimized_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1_W_packed, mat2_packed}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.sizes_ubo(mat1_W_packed), - graph.sizes_ubo(mat2_packed), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1_W_packed), - graph.hashed_layout_of(mat2_packed)}, - // Resize Args - {mat2_is_transposed, mat1_W_packed}, - // Resizing Logic - resize_matmul_node)); -} - -void add_matmul_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - if (graph.is_buffer_storage(out)) { - add_matmul_naive_buffer_node( - graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { - add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { - add_matmul_naive_texture3d_node( - graph, mat1, mat2_data, out, mat2_is_transposed); - } else { - VK_THROW("Input texture should be channel packed or width packed."); - } -} - -void matmul(ComputeGraph& graph, const std::vector& args) { - check_matmul_args(graph, args[0], args[1], args[2]); - const ValueRef mat2_is_transposed = graph.add_scalar(false); - return add_matmul_node(graph, args[0], args[1], args[2], mat2_is_transposed); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.mm.default, matmul); - VK_REGISTER_OP(aten.bmm.default, matmul); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.h b/backends/vulkan/runtime/graph/ops/impl/MatMul.h deleted file mode 100644 index 38f7907f1b6..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -void add_matmul_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp deleted file mode 100644 index 8e15b56b208..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -std::vector calc_out_mean_sizes( - const std::vector& self_sizes, - int64_t normalized_shape_dim) { - std::vector output_size = self_sizes; - int64_t self_dim = self_sizes.size(); - for (int64_t i = 0; i < normalized_shape_dim; ++i) { - output_size.at(self_dim - i - 1) = 1; - } - return output_size; -} - -void resize_native_layer_norm_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mean = args.at(0).refs.at(1); - const ValueRef rstd = args.at(0).refs.at(2); - const ValueRef in = args.at(1).refs.at(0); - const std::vector in_sizes = graph->sizes_of(in); - - const auto normalized_shape_dim = - graph->get_int_list(extra_args.at(0))->size(); - - const std::vector mean_size = - calc_out_mean_sizes(in_sizes, normalized_shape_dim); - - graph->virtual_resize(out, in_sizes); - graph->virtual_resize(mean, mean_size); - graph->virtual_resize(rstd, mean_size); -} - -void add_native_layer_norm_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef normalized_shape, - const ValueRef weight_data, - const ValueRef bias_data, - const ValueRef eps, - const ValueRef out) { - const auto normalized_shape_dim = - graph.get_int_list(normalized_shape)->size(); - if (normalized_shape_dim > 1) { - VK_THROW("native_layer_norm only supports normalized_shape with dim == 1"); - } - - if (graph.val_is_none(weight_data)) { - VK_THROW("native_layer_norm requires weight to be non-None"); - } - - if (graph.val_is_none(bias_data)) { - VK_THROW("native_layer_norm requires bias to be non-None"); - } - - ValueRef arg_weight = prepack_standard_like(graph, weight_data, in); - ValueRef arg_bias = prepack_standard_like(graph, bias_data, in); - - const auto out_val = graph.get_value_list(out); - const ValueRef out_tensor = out_val->at(0); - const ValueRef mean_tensor = out_val->at(1); - const ValueRef rstd_tensor = out_val->at(2); - - float epsilon = graph.extract_scalar(eps); - - VK_CHECK_COND(check_same_packed_dim(graph, in, out_tensor)); - - const std::vector in_sizes = graph.sizes_of(in); - - utils::uvec3 global_size = graph.logical_limits_of(out_tensor); - utils::uvec3 local_size; - - // The shader sets a shared memory scale factor > 1 when the dispatch is - // larger than the maximum WG size. Setting the WG size in the X axis to the - // maximum WG size gives the best thread utilization. - if (global_size[0] > 64) { - local_size = {64, 1, 1}; - } else { - // If the thread count in the X axis is smaller than or equal to the maximum - // WG size, we can let the function decide the best WG size.
- local_size = graph.create_local_wg_size(global_size); - } - - std::string kernel_name("native_layer_norm"); - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out_tensor)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{{out_tensor, mean_tensor, rstd_tensor}, vkapi::kWrite}, - {{in, arg_weight, arg_bias}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - { - graph.logical_limits_pc_of(out_tensor), - graph.sizes_pc_of(out_tensor), - PushConstantDataInfo(&epsilon, sizeof(epsilon)), - }, - // Specialization Constants - { - graph.hashed_layout_of(in), - graph.hashed_layout_of(out_tensor), - }, - // Resize Args - {normalized_shape}, - // Resizing Logic - resize_native_layer_norm_node)); -} - -void native_layer_norm(ComputeGraph& graph, const std::vector& args) { - return add_native_layer_norm_node( - graph, args[0], args[1], args[2], args[3], args[4], args[5]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.native_layer_norm.default, native_layer_norm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp deleted file mode 100644 index d225af05633..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -#include - -namespace vkcompute { - -struct PadParam final { - int32_t left; - int32_t top; - int32_t front; -}; - -PadParam creat_pad_param(const std::vector& pad) { - if (pad.size() == 2) { - return PadParam{static_cast(pad[0]), 0, 0}; - } else if (pad.size() == 4) { - return PadParam{ - static_cast(pad[0]), static_cast(pad[2]), 0}; - } else if (pad.size() == 6) { - return PadParam{ - static_cast(pad[0]), - static_cast(pad[2]), - static_cast(pad[4])}; - } else { - VK_THROW("invalid pad form"); - } -} - -void resize_constant_pad_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - const IntListPtr pad_vec = graph->get_int_list(extra_args.at(0)); - std::vector in_size = graph->sizes_of(self); - int dim = in_size.size() - 1; - for (int i = 0; i < pad_vec->size(); i += 2) { - in_size.at(dim) += pad_vec->at(i) + pad_vec->at(i + 1); - dim--; - } - - graph->virtual_resize(out, in_size); -} - -void add_constant_pad_nd_node( - ComputeGraph& graph, - const ValueRef& in, - const ValueRef& pad, - const ValueRef& fill_value, - const ValueRef& out) { - const float fill_value_val = graph.extract_scalar(fill_value); - const IntListPtr pad_vec = graph.get_int_list(pad); - - std::string kernel_name = ""; - const PadParam pad_param = creat_pad_param(*pad_vec); - - if (pad_vec->size() <= 4) { - kernel_name = "pad_height_width"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - } else { - kernel_name = "pad_channel"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - 
default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.sizes_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(pad_param), - graph.create_params_buffer(fill_value_val)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {pad}, - // Resizing Logic - resize_constant_pad_node)); -} - -void constant_pad_nd(ComputeGraph& graph, const std::vector& args) { - return add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.constant_pad_nd.default, constant_pad_nd); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp deleted file mode 100644 index 9ac4c963bc3..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec2; -using utils::ivec3; -using utils::ivec4; -using utils::uvec4; - -namespace { - -void check_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - (void)permute_dims; - VK_CHECK_COND(check_same_packed_dim(graph, in, out)); - - // This implementation does not require the input tensor to have the same - // dim size as the argument. The code will work as long as the input tensor's - // dim size is shorter than the permute dim array. In this case, the code - // assumes a size of 1 at the higher dimensions.
-} - -} // namespace - -void resize_permute_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args[0].refs[0]; - const ValueRef in = args[1].refs[0]; - - const std::vector in_sizes = graph->sizes_of(in); - const std::vector out_sizes = graph->sizes_of(out); - - const std::vector permute_dims = - graph->extract_int_or_symint_list(resize_args[0]); - - if (in_sizes.size() == out_sizes.size() && - in_sizes.size() == permute_dims.size()) { - std::vector new_out_sizes(out_sizes.size(), 1); - const int64_t out_ndim = std::max(in_sizes.size(), out_sizes.size()); - for (int i = 0; i < out_ndim; i++) { - const int64_t permute_dim = permute_dims.at(i); - new_out_sizes.at(i) = in_sizes.at(permute_dim); - } - graph->virtual_resize(out, new_out_sizes); - } - // Case where permute is being used to implement squeeze - else if ( - in_sizes.size() > out_sizes.size() && - in_sizes.size() == permute_dims.size()) { - std::vector new_out_sizes(out_sizes.size(), 1); - const size_t offset = in_sizes.size() - out_sizes.size(); - for (int i = 0; i < out_sizes.size(); i++) { - const int64_t permute_dim = permute_dims.at(i + offset); - new_out_sizes.at(i) = in_sizes.at(permute_dim); - } - graph->virtual_resize(out, new_out_sizes); - } - // Case where Permute is being used to implement unsqueeze - else if ( - in_sizes.size() < out_sizes.size() && - out_sizes.size() == permute_dims.size()) { - std::vector new_out_sizes(out_sizes.size(), 1); - const size_t offset = out_sizes.size() - in_sizes.size(); - for (int i = 0; i < out_sizes.size(); i++) { - int64_t permute_dim = permute_dims.at(i) - offset; - if (permute_dim >= 0) { - new_out_sizes.at(i) = in_sizes.at(permute_dim); - } - } - graph->virtual_resize(out, new_out_sizes); - } else { - VK_THROW("Invalid permute dims"); - } -} - -void add_permute_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. 
Reverse the order of the permute array from NCHW order to WHCN order - ivec4 whcn_permute_dims{0, 1, 2, 3}; - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - const int32_t permute_ndim = - utils::safe_downcast(permute_dims_ptr->size()); - - for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; - nchw_i--, whcn_i++) { - const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i); - const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; - - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - vkapi::SpecVarList spec_vars; - - const int32_t out_channels = dim_at(graph.sizes_of(out)); - const int32_t in_channels = dim_at(graph.sizes_of(in)); - - const int32_t packed_dim = graph.packed_dim_of(in); - ivec2 channel_info = {out_channels, in_channels}; - if (packed_dim == WHCN::kChannelsDim) { - channel_info[0] = utils::align_up_4(channel_info[0]); - channel_info[1] = utils::align_up_4(channel_info[1]); - } - - push_constants = { - graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))}; - - spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -struct WHCNPermuteDims { - int32_t whcn_permute_dims[api::kTensorDimLimit]; - - void initialize(const std::vector& permute_dims) { - const int32_t permute_ndim = permute_dims.size(); - for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { - const int32_t nchw_i = permute_ndim - 1 - whcn_i; - int64_t index_val = permute_dims.at(nchw_i); - if (index_val < 0) { - index_val += permute_ndim; - } - const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; - whcn_i++) { - whcn_permute_dims[whcn_i] = whcn_i; - } - } -}; - -void add_permute_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - WHCNPermuteDims whcn_permute_dims; - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. 
Extend the permute array to kTensorDimLimit - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - whcn_permute_dims.initialize(*permute_dims_ptr); - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - graph.create_params_buffer(whcn_permute_dims), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -void permute(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef permute_dims = args.at(idx++); - const ValueRef out = args.at(idx++); - - if (graph.is_buffer_storage(args[2])) { - return add_permute_buffer_node(graph, in, permute_dims, out); - } - return add_permute_node(graph, in, permute_dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.permute.default, permute); - VK_REGISTER_OP(aten.permute_copy.default, permute); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.h b/backends/vulkan/runtime/graph/ops/impl/Permute.h deleted file mode 100644 index 0f17a4a26b0..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_permute_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp deleted file mode 100644 index 250fcdd5490..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_pool2d_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void resize_pool2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - bool is_max_pool2d = extra_args.at(3) != kDummyValueRef; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - const std::vector self_sizes = graph->sizes_of(self); - size_t ndim = self_sizes.size(); - std::vector new_out_sizes(ndim); - - // Batch, Channel - if (ndim == 4) { - new_out_sizes.at(ndim - 4) = self_sizes.at(ndim - 4); - } - new_out_sizes.at(ndim - 3) = self_sizes.at(ndim - 3); - - // Height, Width - const auto& new_out_sizes_hw = calc_out_sizes_hw( - *graph, - self_sizes, - extra_args.at(0), - /*kernel_size_only = */ true, - {extra_args.at(1), extra_args.at(2), extra_args.at(3), extra_args.at(4)}); - new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0); - new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1); - - graph->virtual_resize(out, new_out_sizes); - - if (is_max_pool2d) { - const ValueRef indices = args.at(0).refs.at(1); - graph->virtual_resize(indices, new_out_sizes); - } -} - -// -// max_pool2d -// - -void add_max_pool2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef ceil_mode, - const ValueRef out) { - ValueRef out_tensor = out; - // Placeholder tensor to fill binding slot for indices tensor in case we are - // computing max_pool2d instead of max_pool2d_with_indices. 
- TmpTensor tmp_indices_tensor = - TmpTensor(&graph, {}, graph.dtype_of(in), graph.storage_type_of(in)); - ValueRef indices_tensor = tmp_indices_tensor.vref; - int32_t write_indices = 0; - if (graph.val_is_value_list(out)) { - const auto out_val = graph.get_value_list(out); - out_tensor = out_val->at(0); - indices_tensor = out_val->at(1); - write_indices = 1; - } - - check_pool2d_args(graph, in, out_tensor); - - std::string kernel_name("max_pool2d"); - add_dtype_suffix(kernel_name, graph.dtype_of(out_tensor)); - - Kernel2dParams kernel_params = create_kernel2d_params( - graph, - kernel_size, - /*kernel_size_only = */ true, - stride, - padding, - dilation); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{{out_tensor, indices_tensor}, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.logical_limits_ubo(out_tensor), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - }, - // Push Constants - {}, - // Specialization Constants - {write_indices}, - // Resize Args - {kernel_size, stride, padding, dilation, ceil_mode}, - // Resizing Logic - resize_pool2d_node)); -} - -void max_pool2d(ComputeGraph& graph, const std::vector& args) { - return add_max_pool2d_node( - graph, args[0], args[1], args[2], args[3], args[4], args[5], args[6]); -} - -// -// avg_pool2d -// - -struct DivisorParams final { - int32_t divisor_override; - bool count_include_pad; -}; - -DivisorParams create_divisor_params( - ComputeGraph& graph, - const ValueRef divisor_override, - const ValueRef count_include_pad) { - return { - graph.val_is_int(divisor_override) - ? static_cast(graph.get_int(divisor_override)) - : 0, - graph.get_bool(count_include_pad)}; -} - -void add_avg_pool2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef ceil_mode, - const ValueRef count_include_pad, - const ValueRef divisor_override, - const ValueRef out) { - check_pool2d_args(graph, in, out); - - std::string kernel_name("avg_pool2d"); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - Kernel2dParams kernel_params = - create_kernel2d_params(graph, kernel_size, stride, padding); - - DivisorParams divisor_params = - create_divisor_params(graph, divisor_override, count_include_pad); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(divisor_params)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {kernel_size, - stride, - padding, - /*dilation= */ kDummyValueRef, - ceil_mode}, - // Resizing Logic - resize_pool2d_node)); -} - -void avg_pool2d(ComputeGraph& graph, const std::vector& args) { - return add_avg_pool2d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.avg_pool2d.default, avg_pool2d); - VK_REGISTER_OP(aten.max_pool2d_with_indices.default, max_pool2d); - VK_REGISTER_OP(aten.max_pool2d.default, max_pool2d); -} - -} // namespace vkcompute diff --git 
a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp deleted file mode 100644 index 88f77261f4f..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp +++ /dev/null @@ -1,836 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void resize_quantize_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -utils::uvec3 quantize_per_channel_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. For per-channel quantization along the batch axis, - // we need to ensure that we dispatch the correct number of workgroups in the - // Z dimension to cover all batch-channel combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. - const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -utils::uvec3 quantize_block_wise_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. For per-channel quantization along the batch axis, - // we need to ensure that we dispatch the correct number of workgroups in the - // Z dimension to cover all batch-channel combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. 
- const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -void add_quantize_per_tensor_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_per_tensor"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void add_quantize_per_token_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_per_token"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an 
integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - int num_tokens = static_cast(graph.sizes_of(scale)[0]); - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&num_tokens, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&num_tokens, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void add_quantize_per_channel_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& axis, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_per_channel"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - int axis_val = static_cast(graph.get_int(axis)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - // Normalize axis and convert from NCHW to WHCN using utility functions - const auto input_sizes = graph.sizes_of(input); - const int64_t ndim = graph.dim_of(input); - - // Normalize axis to handle negative indices - axis_val = normalize(axis_val, ndim); - - // Convert from NCHW axis to WHCN axis for shader (vulkan representation) - 
int axis_whcn = nchw_dim_to_whcn_dim(axis_val, ndim); - - int num_channels; - if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) { - // For batch dimension quantization in 4D tensors, pass the actual number of - // channels so the shader can correctly unfold the batch-channel folding - num_channels = static_cast(input_sizes[1]); // Channel dimension - } else { - num_channels = static_cast(input_sizes[axis_val]); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&axis_whcn, sizeof(int)), - PushConstantDataInfo(&num_channels, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&axis_whcn, sizeof(int)), - PushConstantDataInfo(&num_channels, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - quantize_per_channel_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void add_quantize_block_wise_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& block_size, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_block_wise"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - - // Convert PyTorch dimensions to WHCN order for shader - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*block_size_list); - utils::ivec4 tensor_size_whcn = 
utils::make_whcn_ivec4(input_sizes); - - // Calculate numBlocks: tensorSize / blockSize (both in WHCN order) - utils::ivec4 num_blocks_vec = { - tensor_size_whcn[0] / block_size_vec[0], - tensor_size_whcn[1] / block_size_vec[1], - tensor_size_whcn[2] / block_size_vec[2], - tensor_size_whcn[3] / block_size_vec[3]}; - - // Calculate blockStride: pre-computed linear strides for the block grid - utils::ivec4 block_stride_vec = { - 1, - num_blocks_vec[0], - num_blocks_vec[0] * num_blocks_vec[1], - num_blocks_vec[0] * num_blocks_vec[1] * num_blocks_vec[2]}; - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&block_size_vec, sizeof(block_size_vec)), - PushConstantDataInfo(&num_blocks_vec, sizeof(num_blocks_vec)), - PushConstantDataInfo(&block_stride_vec, sizeof(block_stride_vec)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - quantize_block_wise_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void quantize_per_tensor_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warning - dtype is inferred from output - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - add_quantize_per_tensor_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void quantize_per_token_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = 
args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warning - dtype is inferred from output - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Calculate number of tokens (product of all dimensions except the last one) - int64_t num_tokens = 1; - const auto input_sizes = graph.sizes_of(input); - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_tokens - // This allows for both 1D tensors (size [num_tokens]) and reshaped tensors - // (size [num_tokens, 1]) - VK_CHECK_COND(scale_numel == num_tokens); - VK_CHECK_COND(zero_point_numel == num_tokens); - - add_quantize_per_token_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void quantize_per_channel_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef axis = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warning - dtype is inferred from output - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify 
input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Normalize axis - int axis_val = static_cast(graph.get_int(axis)); - const auto input_sizes = graph.sizes_of(input); - int64_t ndim = graph.dim_of(input); - if (axis_val < 0) { - axis_val += ndim; - } - - // Verify axis is valid - VK_CHECK_COND(axis_val >= 0 && axis_val < ndim); - - // Get number of channels along the specified axis - int64_t num_channels = input_sizes[axis_val]; - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_channels - VK_CHECK_COND(scale_numel == num_channels); - VK_CHECK_COND(zero_point_numel == num_channels); - - add_quantize_per_channel_node( - graph, input, scale, zero_point, axis, quant_min, quant_max, output); -} - -void quantize_affine_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef block_size = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported 
types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Verify block_size is valid (each dimension must divide evenly into input - // size) - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - VK_CHECK_COND(block_size_list->size() == input_sizes.size()); - - for (size_t i = 0; i < input_sizes.size(); i++) { - if ((*block_size_list)[i] > 1) { - VK_CHECK_COND( - input_sizes[i] % (*block_size_list)[i] == 0, - "Input size at dimension ", - i, - " (", - input_sizes[i], - ") must be divisible by block_size at dimension ", - i, - " (", - (*block_size_list)[i], - ")"); - } - } - - add_quantize_block_wise_node( - graph, - input, - block_size, - scale, - zero_point, - quant_min, - quant_max, - output); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - quantized_decomposed.quantize_per_tensor.tensor, - quantize_per_tensor_impl); - VK_REGISTER_OP( - quantized_decomposed.quantize_per_token.default, quantize_per_token_impl); - VK_REGISTER_OP( - quantized_decomposed.quantize_per_channel.default, - quantize_per_channel_impl); - - // TorchAO affine quantization operators - VK_REGISTER_OP(torchao.quantize_affine.default, quantize_affine_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp deleted file mode 100644 index 51f8138485e..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ /dev/null @@ -1,695 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -// -// Utility functions -// - -struct Conv2DParams { - utils::ivec2 kernel_size; - utils::ivec2 stride; - utils::ivec2 padding; - utils::ivec2 dilation; - int32_t groups; - int32_t out_channels_per_group; - int32_t in_channels_per_group; - int32_t logical_K_per_group; - int32_t K_per_group; - int32_t K4_per_group; - int32_t logical_K; - int32_t K; - int32_t K4; -}; - -Conv2DParams create_conv2d_params( - ComputeGraph& graph, - const ValueRef& conv_input, - const ValueRef& conv_output, - const ValueRef& kernel_size, - const ValueRef& stride, - const ValueRef& padding, - const ValueRef& dilation, - const ValueRef& groups) { - const auto kernel_size_list = graph.get_int_list(kernel_size); - const auto stride_list = graph.get_int_list(stride); - const auto padding_list = graph.get_int_list(padding); - const auto dilation_list = graph.get_int_list(dilation); - const int32_t groups_val = graph.get_int(groups); - - // Pre-compute input and output channels per group - - std::vector out_sizes = graph.sizes_of(conv_output); - const int32_t out_channels = utils::val_at(-3, out_sizes); - const int32_t out_channels_per_group = out_channels / groups_val; - - std::vector in_sizes = graph.sizes_of(conv_input); - const int32_t in_channels = utils::val_at(-3, in_sizes); - const int32_t in_channels_per_group = in_channels / groups_val; - - // Pre-compute the number of elements along the K dimension per group. This - // quantity is aligned to the next multiple of 4 to ensure data loads are - // aligned to texel boundaries. - - const int32_t logical_K_per_group = - kernel_size_list->at(0) * kernel_size_list->at(1) * in_channels_per_group; - const int32_t K_per_group = utils::align_up_4(logical_K_per_group); - const int32_t K4_per_group = K_per_group / 4; - - // Pre-compute the "theoretical" size of the K dim of the input im2col matrix, - // which represents the flattened convolution window used to compute an output - // element. This is used for bounds checking. 
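// Illustrative example (not in the original source): with a 3x3 kernel,
// groups = 2 and in_channels = 10, in_channels_per_group = 5, so
// logical_K_per_group = 3 * 3 * 5 = 45, K_per_group = align_up_4(45) = 48 and
// K4_per_group = 12. The padded K dim is then K = 48 * 2 = 96 (K4 = 24), while
// logical_K = 3 * 3 * 10 = 90 is retained for bounds checking.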
- - const int32_t logical_K = - kernel_size_list->at(0) * kernel_size_list->at(1) * in_channels; - - const int32_t K = K_per_group * groups_val; - // Used for texel stride calculations - const int32_t K4 = K / 4; - - return Conv2DParams{ - // Swap the order from HW to WH - utils::make_ivec2({kernel_size_list->at(1), kernel_size_list->at(0)}), - utils::make_ivec2({stride_list->at(1), stride_list->at(0)}), - utils::make_ivec2({padding_list->at(1), padding_list->at(0)}), - utils::make_ivec2({dilation_list->at(1), dilation_list->at(0)}), - groups_val, - out_channels_per_group, - in_channels_per_group, - logical_K_per_group, - K_per_group, - K4_per_group, - logical_K, - K, - K4, - }; -} - -std::vector calculate_input_im2col_sizes( - ComputeGraph* graph, - const ValueRef& input, - const ValueRef& output, - const ValueRef& kernel_size, - const ValueRef& groups) { - std::vector in_sizes = graph->sizes_of(input); - const int64_t in_channels = utils::val_at(-3, in_sizes); - - std::vector out_sizes = graph->sizes_of(output); - const int64_t batches = utils::val_at(-4, out_sizes); - const int64_t out_height = utils::val_at(-2, out_sizes); - const int64_t out_width = utils::val_at(-1, out_sizes); - - // Represents the number of channel groups - const int64_t groups_val = graph->extract_scalar(groups); - // No need to div_up because in_channels % groups_val = 0 - const int64_t in_channels_per_group = in_channels / groups_val; - - const auto kernel_size_list = graph->get_int_list(kernel_size); - - // Align to the next multiple of 4 to ensure that data loads align nicely with - // texel boundaries. We want to ensure that the first data element of each - // group is at the start of its texel. - const int64_t flattened_kernel_len = utils::align_up_4( - in_channels_per_group * kernel_size_list->at(0) * - kernel_size_list->at(1)); - - // K -> flattened convolution window (adjusted) - const int64_t K = flattened_kernel_len * groups_val; - // M -> number of elements in 2D output plane. This is aligned to the next - // multiple of 4 since the im2col shader operates on 4x4 blocks. 
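// Illustrative example (not in the original source): for an output of shape
// [1, 16, 5, 7] (NCHW), out_height * out_width * batches = 5 * 7 * 1 = 35, so
// M = align_up_4(35) = 36; the extra rows are padding that lets the im2col
// shader always write complete 4x4 blocks.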
- const int64_t M = utils::align_up_4(out_height * out_width * batches); - - return {M, K}; -} - -std::vector calculate_output_im2col_sizes( - ComputeGraph* graph, - const ValueRef& output) { - std::vector out_sizes = graph->sizes_of(output); - const int64_t batches = utils::val_at(-4, out_sizes); - const int64_t out_channels = utils::val_at(-3, out_sizes); - const int64_t out_height = utils::val_at(-2, out_sizes); - const int64_t out_width = utils::val_at(-1, out_sizes); - - // N -> output channels - const int64_t N = out_channels; - // M -> number of elements in 2D output plane - const int64_t M = out_height * out_width * batches; - - return {M, N}; -} - -// -// Shader dispatch utilities -// - -utils::uvec3 im2col_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef input_image = args.at(1).refs.at(0); - const ValueRef output_image = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef groups = resize_args.at(2); - - std::vector im2col_sizes = calculate_input_im2col_sizes( - graph, input_image, output_image, kernel_size, groups); - const uint32_t K = utils::safe_downcast(im2col_sizes[1]); - const uint32_t M = utils::safe_downcast(im2col_sizes[0]); - - // 1 output tile is 4x4 elements - const uint32_t K4 = utils::div_up(K, 4u); - const uint32_t M4 = utils::div_up(M, 4u); - - return {K4, M4, 1}; -} - -utils::uvec3 col2im_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef output = args.at(0).refs.at(0); - - std::vector im2col_sizes = - calculate_output_im2col_sizes(graph, output); - const uint32_t N = utils::safe_downcast(im2col_sizes[1]); - const uint32_t M = utils::safe_downcast(im2col_sizes[0]); - - // 1 output tile is 4x4 elements - const uint32_t N4 = utils::div_up(N, 4u); - const uint32_t M4 = utils::div_up(M, 4u); - - return {N4, M4, 1}; -} - -// -// Dispatch nodes -// - -void add_input_im2col_node( - ComputeGraph& graph, - const ValueRef input_image, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image, - const ValueRef input_im2col) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - std::string kernel_name = "im2col"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_image)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(input_im2col), - graph.sizes_ubo(input_image), - graph.sizes_ubo(output_image), - graph.create_params_buffer(conv_params)}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - im2col_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{input_im2col, vkapi::kWrite}, {input_image, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize args - {output_image, kernel_size, groups}, - // Resizing Logic - nullptr)); -} - -void add_quantize_and_pack_im2col_node( - ComputeGraph& graph, - const ValueRef input_image, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef kernel_size, - 
const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image, - const ValueRef input_int_im2col) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - float inv_scale = 1.0f / graph.extract_scalar(input_scale); - int32_t zp = graph.extract_scalar(input_zp); - - // Get shader for quantized conv2d linear tiled - std::string kernel_name = "quantize_and_pack_im2col"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_image)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(input_int_im2col), - graph.sizes_ubo(input_image), - graph.sizes_ubo(output_image), - graph.create_params_buffer(conv_params)}; - - std::vector push_constants = { - PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - im2col_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{input_int_im2col, vkapi::kWrite}, {input_image, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize args - {output_image, kernel_size, groups}, - // Resizing Logic - nullptr)); -} - -void add_conv2d_q8csw_linear_node( - ComputeGraph& graph, - const ValueRef input_im2col, - const ValueRef input_image, - const ValueRef packed_weight, - const ValueRef packed_weight_scales, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - // One limitation of the current implementation is that for grouped convs, - // the number of output_image channels per group must be a multiple of 4. One - // loaded 4x4 weight tile must all belong to the same group. 
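// Illustrative example (not in the original source): groups = 4 with 32 output
// channels gives 8 channels per group and passes the check below, whereas
// groups = 4 with 24 output channels gives 6 per group and would be rejected.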
- if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } - - std::string kernel_name = "conv2d_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), - graph.create_params_buffer(conv_params)}; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_im2col, packed_weight, packed_weight_scales, packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {apply_bias}, - // Resize args - {}, - // Resizing Logic - nullptr)); -} - -void add_conv2d_q8ta_q8csw_linear_node( - ComputeGraph& graph, - const ValueRef input_int_im2col, - const ValueRef input_image, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef weight_data, - const ValueRef packed_weight, - const ValueRef packed_weight_sums, - const ValueRef packed_weight_scales, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - // One limitation of the current implementation is that for grouped convs, - // the number of output channels per group must be a multiple of 4. One loaded - // 4x4 weight tile must all belong to the same group. 
- if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } - - float scale = graph.extract_scalar(input_scale); - int32_t zp = graph.extract_scalar(input_zp); - - std::string kernel_name = "conv2d_q8ta_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), - graph.create_params_buffer(conv_params)}; - - std::vector push_constants = { - PushConstantDataInfo(&scale, sizeof(scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_int_im2col, - packed_weight, - packed_weight_sums, - packed_weight_scales, - packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {apply_bias}, - // Resize args - {weight_data}, - // Resizing Logic - nullptr)); -} - -// -// High level operator impl -// - -void quantized_conv2d_impl( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const QuantizationConfig& weight_quant_config, - const ValueRef input_image, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef weight_data, - const ValueRef weight_sums_data, - const ValueRef weight_scales_data, - const ValueRef bias_data, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image) { - VK_CHECK_COND(weight_quant_config.granularity == kPerChannel); - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.is_symmetric); - - const ValueRef packed_weight = - prepack_quantized_linear_weight(graph, weight_quant_config, weight_data); - ValueRef packed_weight_scales = prepack_standard( - graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); - - // Create a dummy tensor to fill the binding slot of the bias tensor if it is - // not provided. This helps simplify dispatch logic and makes it so that - // fewer shader variants need to be generated. - TmpTensor dummy_bias( - &graph, - {}, - graph.dtype_of(output_image), - utils::kBuffer, - utils::kWidthPacked); - - ValueRef packed_bias = dummy_bias.vref; - if (!graph.val_is_none(bias_data)) { - packed_bias = - prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); - } - - std::vector input_im2col_sizes = calculate_input_im2col_sizes( - &graph, input_image, output_image, kernel_size, groups); - - // Use weight only quantized conv2d if at least one is true: - // 1. Device does not support int8 dot product - // 2. 
Input is not quantized - if (!graph.can_use_int8_dot_product() || - input_quant_config.granularity == kNoQuantization) { - TmpTensor input_im2col( - &graph, - input_im2col_sizes, - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - - add_input_im2col_node( - graph, - input_image, - kernel_size, - stride, - padding, - dilation, - groups, - output_image, - input_im2col); - - add_conv2d_q8csw_linear_node( - graph, - input_im2col, - input_image, - packed_weight, - packed_weight_scales, - bias_data, - packed_bias, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); - return; - } else { - // Otherwise, use activation + weight quantized conv2d - VK_CHECK_COND(input_quant_config.granularity == kPerTensor); - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(!weight_quant_config.is_dynamic); - - ValueRef packed_weight_sums = prepack_standard( - graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - - // Allocate quantized + packed im2col matrix for input - const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0)); - const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1)); - - TmpTensor input_int_im2col( - &graph, - {num_blocks_M, num_blocks_K * 4}, - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - add_quantize_and_pack_im2col_node( - graph, - input_image, - input_scale, - input_zp, - kernel_size, - stride, - padding, - dilation, - groups, - output_image, - input_int_im2col); - - add_conv2d_q8ta_q8csw_linear_node( - graph, - input_int_im2col, - input_image, - input_scale, - input_zp, - weight_data, - packed_weight, - packed_weight_sums, - packed_weight_scales, - bias_data, - packed_bias, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); - return; - }; -} - -void conv2d_q8ta_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef input_image = args.at(idx++); - const ValueRef input_scale = args.at(idx++); - const ValueRef input_zp = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_sums_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef kernel_size = args.at(idx++); - const ValueRef stride = args.at(idx++); - const ValueRef padding = args.at(idx++); - const ValueRef dilation = args.at(idx++); - const ValueRef groups = args.at(idx++); - const ValueRef output_image = args.at(idx++); - - const int64_t K = graph.size_at(-1, weight_data); - - QuantizationConfig input_quant_config(8, kPerTensor, {}, false); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_conv2d_impl( - graph, - input_quant_config, - weight_quant_config, - input_image, - input_scale, - input_zp, - weight_data, - weight_sums_data, - weight_scales_data, - bias_data, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); -} - -void conv2d_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef input_image = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef kernel_size = args.at(idx++); - const ValueRef stride = args.at(idx++); - const ValueRef padding = args.at(idx++); - const ValueRef dilation = args.at(idx++); - const ValueRef groups = args.at(idx++); - const ValueRef output_image = args.at(idx++); - - const int64_t K = graph.size_at(-1, weight_data); - - 
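// A nominal 32-bit / kNoQuantization input config signals that the activation
// stays in floating point; quantized_conv2d_impl checks this granularity (and
// int8 dot product support) to select the weight-only quantized path.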
QuantizationConfig input_quant_config(32, kNoQuantization, {}); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_conv2d_impl( - graph, - input_quant_config, - weight_quant_config, - input_image, - kDummyValueRef, // input scale - kDummyValueRef, // input zero point - weight_data, - kDummyValueRef, // weight sums - weight_scales_data, - bias_data, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw); - VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp deleted file mode 100644 index 4831c6f2f85..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ /dev/null @@ -1,728 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -// -// Shader dispatch utilities -// - -bool is_gemv(ComputeGraph* graph, const ValueRef& fp_input) { - return graph->size_at(-2, fp_input) == 1; -} - -void resize_linear_qw_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - ValueRef output = args.at(0).refs.at(0); - ValueRef fp_input = args.at(1).refs.at(0); - ValueRef weight_data = extra_args.at(1); - - std::vector mat1_sizes = graph->sizes_of(fp_input); - std::vector mat2_sizes = graph->sizes_of(weight_data); - - const int64_t out_cols = utils::val_at(-2, mat1_sizes); - const int64_t out_rows = utils::val_at(-2, mat2_sizes); - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(output, new_out_sizes); -} - -utils::uvec3 quantized_linear_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - - std::vector out_sizes = graph->sizes_of(out); - // height - const uint32_t M = utils::val_at(-2, out_sizes); - // width - const uint32_t N = utils::val_at(-1, out_sizes); - - const uint32_t M4 = utils::div_up(M, 4u); - const uint32_t N4 = utils::div_up(N, 4u); - - // For 4-bit weights, each output tile contains 8 columns and 4 rows - if (shader.kernel_name.find("q4") != std::string::npos) { - const uint32_t N8 = utils::div_up(N, 8u); - - const bool using_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - // TODO: explain - if (using_coop_algorithm) { - return {64, N8, M}; - } - return {N8, M4, 1}; - } - - // Otherwise, each output tile contains 4 columns and 4 rows - return {N4, M4, 1}; -} - -utils::uvec3 quantized_linear_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (use_coop_algorithm) { - return {64, 1, 1}; - } else { - return 
pick_hw_square_wg_size( - graph, shader, global_workgroup_size, args, resize_args); - } -} - -std::tuple get_quantized_input_num_blocks( - ComputeGraph& graph, - const ValueRef input) { - std::vector input_sizes = graph.sizes_of(input); - const int64_t ndim = graph.dim_of(input); - - const int64_t M = input_sizes.at(ndim - 2); - const int64_t K = input_sizes.at(ndim - 1); - - const int64_t num_blocks_M = utils::div_up(M, int64_t(4)); - const int64_t num_blocks_K = utils::div_up(K, int64_t(4)); - - return std::make_tuple(num_blocks_M, num_blocks_K); -} - -utils::uvec3 quant_pack_input_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef input = args.at(1).refs.at(0); - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(*graph, input); - - return { - utils::safe_downcast(num_blocks_K), - utils::safe_downcast(num_blocks_M), - 1u}; -} - -vkapi::ShaderInfo pick_linear_qw_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef output = args.at(0).refs.at(0); - const ValueRef fp_input = args.at(1).refs.at(0); - const ValueRef packed_int_weight = args.at(1).refs.at(1); - - const bool weight_is_4bit = resize_args.at(0) != kDummyValueRef; - const bool is_gemv_case = is_gemv(graph, fp_input); - - std::string kernel_name = "linear_"; - if (weight_is_4bit) { - kernel_name += "q4gsw"; - } else { - kernel_name += "q8csw"; - } - - if (weight_is_4bit && is_gemv_case) { - kernel_name += "_coop"; - } else { - kernel_name += "_tiled"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(output)); - add_storage_type_suffix( - kernel_name, graph->storage_type_of(packed_int_weight)); - add_dtype_suffix(kernel_name, graph->dtype_of(output)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -// -// Prepacking nodes -// - -ValueRef prepack_quantized_linear_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef qmat2_data) { - VK_CHECK_COND( - weight_quant_config.nbits == 8 || weight_quant_config.nbits == 4); - - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - int64_t qmat2_width = qmat2_orig_sizes.at(ndim - 1); - int64_t qmat2_height = qmat2_orig_sizes.at(ndim - 2); - - int64_t K; - int64_t N; - if (weight_quant_config.nbits == 4) { - // For 4-bit quantization, weight source data has shape [N, K/2]. Each byte - // contains 2 * 4-bit values. - K = qmat2_width * 2; - N = qmat2_height; - } else { - // For 8-bit quantization, the weight source data has shape [N, K] - K = qmat2_width; - N = qmat2_height; - } - - // Sanity check that assumptions are correct. Data loads along the innermost - // dimension must be well aligned along texel boundaries. - if (weight_quant_config.nbits == 4) { - VK_CHECK_COND(K % 8 == 0); - } else { - VK_CHECK_COND(K % 4 == 0); - } - - // The packing format packs the weight tensor into blocks of 4 columns (K) and - // 4 rows (N) - int64_t N_per_block = 4; - int64_t K_per_block = 4; - - // For 4 bit, quantization, the amount of information contained in one block - // can be doubled. Each block will contain data for 8 rows (N) instead of the - // usual 4. - if (weight_quant_config.nbits == 4) { - N_per_block = 8; - } - - // To figure out the size of the output tensor, determine the number of blocks - // along each dimension. 
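// Illustrative example (not in the original source): for 8-bit weights with
// K = 64 and N = 10, num_blocks_K = 16 and num_blocks_N = div_up(10, 4) = 3,
// so the packed tensor computed below is 16 rows by 3 * 4 = 12 int32 columns.
// With 4-bit weights each block instead covers 8 rows (N), roughly halving
// num_blocks_N for the same N.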
- const int64_t num_blocks_K = utils::div_up(K, K_per_block); - const int64_t num_blocks_N = utils::div_up(N, N_per_block); - - // The blocks are arranged in a transposed manner, such that the transposed - // weight block is indexed like packed_weights[k4][n4] - this is to allow for - // optimal memory coalescing when computing GEMM. - int64_t output_height = num_blocks_K; - // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit - // values) and each block is represented as a ivec4. Therefore the width dim - // of the packed tensor is multiplied by 4. - int64_t output_width = num_blocks_N * 4; - - // For 4 bit quantization, The blocks are arranged without the transposition, - // such that a weight block is accessed like packed_weights[n8][k4]. This is - // an optimization targeted for LLMs, which need to compute GEMV as well as - // GEMM. This memory layout provides better performance for the co-operative - // algorithm used to compute GEMV, at the cost of slightly reducing GEMM - // performance. - if (weight_quant_config.nbits == 4) { - output_height = num_blocks_N; - output_width = num_blocks_K * 4; - } - - // Store the original sizes of the weight data to pass to the shader - utils::ivec2 orig_sizes = { - utils::safe_downcast(K), utils::safe_downcast(N)}; - - std::vector qmat2_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kInt, storage_type, utils::kWidthPacked); - - utils::uvec3 global_wg_size; - if (weight_quant_config.nbits == 4) { - // For 4-bit quantization, each thread writes out two adjacent blocks - global_wg_size = { - utils::safe_downcast(utils::div_up(num_blocks_K, int64_t(2))), - utils::safe_downcast(num_blocks_N), - 1u}; - } else { - global_wg_size = { - utils::safe_downcast(num_blocks_N), - utils::safe_downcast(num_blocks_K), - 1u}; - } - - std::string kernel_name = weight_quant_config.nbits == 4 - ? "pack_q4_linear_weight" - : "pack_q8_linear_weight"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec2))})); - - return qmat2; -} - -// -// Dispatch nodes -// - -/* - * Shader dispatch for linear with quantized weight but fp activations. 
- */ -DynamicDispatchNode make_linear_qw_node( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef fp_input, - const ValueRef weight_data, - const ValueRef packed_weight, - const ValueRef packed_weight_scales, - const ValueRef packed_weight_zeros, - const ValueRef group_size, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef output) { - // Only certain quantization types supported at the moment - VK_CHECK_COND( - weight_quant_config.granularity == kPerChannel || - weight_quant_config.granularity == kPerGroup); - VK_CHECK_COND(weight_quant_config.is_symmetric); - VK_CHECK_COND( - weight_quant_config.nbits == 8 || weight_quant_config.nbits == 4); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output), graph.sizes_ubo(fp_input)}; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - int32_t K4_per_group = 0; - if (weight_quant_config.nbits == 4) { - int32_t group_size_val = graph.extract_scalar(group_size); - K4_per_group = utils::div_up(group_size_val, int32_t(4)); - } - - const ValueRef is_4bit_flag = - weight_quant_config.nbits == 4 ? group_size : kDummyValueRef; - - return DynamicDispatchNode( - graph, - pick_linear_qw_shader, - quantized_linear_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {{fp_input, packed_weight, packed_weight_scales, packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {apply_bias, K4_per_group}, - // Resize args - {is_4bit_flag, weight_data}, - // Resizing Logic - resize_linear_qw_node); -} - -DynamicDispatchNode make_quantize_and_pack_linear_input_node( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const ValueRef fp_input, - const ValueRef packed_input_scale, - const ValueRef packed_input_zp, - const ValueRef input_scale_data, - const ValueRef input_zp_data, - const ValueRef packed_int_input, - const ValueRef group_size) { - // Only certain quantization types supported at the moment - VK_CHECK_COND(input_quant_config.granularity == kPerTensor); - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - float inv_scale = 1.0f / graph.extract_scalar(input_scale_data); - int32_t zp = graph.extract_scalar(input_zp_data); - - std::string shader_name = "quantize_and_pack_linear_input_per_tensor"; - add_storage_type_suffix(shader_name, graph.storage_type_of(packed_int_input)); - add_storage_type_suffix(shader_name, graph.storage_type_of(fp_input)); - add_dtype_suffix(shader_name, graph.dtype_of(fp_input)); - - vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(fp_input)}; - - std::vector push_constants = { - PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - return DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(shader_name), - quant_pack_input_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{packed_int_input, vkapi::kWrite}, {fp_input, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize args - {}); -} - -DynamicDispatchNode make_linear_qa_qw_node( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const QuantizationConfig& weight_quant_config, - const ValueRef fp_input, - const ValueRef packed_int_input, - const ValueRef 
packed_input_scale, - const ValueRef packed_input_zp, - const ValueRef input_scale_data, - const ValueRef input_zp_data, - const ValueRef weight_data, - const ValueRef packed_weight, - const ValueRef packed_weight_sums, - const ValueRef packed_weight_scales, - const ValueRef group_size, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef output) { - VK_CHECK_COND(input_quant_config.granularity == kPerTensor); - VK_CHECK_COND(input_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.granularity == kPerChannel); - VK_CHECK_COND(weight_quant_config.is_symmetric); - VK_CHECK_COND(weight_quant_config.nbits == 8); - - float scale = graph.extract_scalar(input_scale_data); - int32_t zp = graph.extract_scalar(input_zp_data); - - // Get shader for quantized linear - std::string kernel_name = "linear_q8ta_q8csw_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_int_input)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output), graph.sizes_ubo(packed_int_input)}; - - std::vector push_constants = { - PushConstantDataInfo(&scale, sizeof(scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - // Add the compute node - return DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - quantized_linear_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {{packed_int_input, - packed_weight, - packed_weight_sums, - packed_weight_scales, - packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {apply_bias}, - // Resize args - {fp_input}, - // Resizing Logic - nullptr); -} - -// -// High level operator impl -// - -void quantized_linear_impl( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const QuantizationConfig& weight_quant_config, - const ValueRef fp_input, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef weight_data, - const ValueRef weight_sums_data, - const ValueRef weight_scales_data, - const ValueRef weight_zeros_data, - const ValueRef group_size, - const ValueRef bias_data, - const ValueRef output) { - std::vector input_sizes = graph.sizes_of(fp_input); - std::vector weight_sizes = graph.sizes_of(weight_data); - - const int64_t K = utils::val_at(-1, input_sizes); - // K (input channels) must be a multiple of 4 to ensure that reading a group - // of 4 input channels from the input tensor will be aligned on a texel - // boundary. - VK_CHECK_COND(K % 4 == 0); - - // Prepack weight data - - const ValueRef packed_weight = - prepack_quantized_linear_weight(graph, weight_quant_config, weight_data); - const ValueRef packed_weight_scales = prepack_standard( - graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); - // Weight affine quant not supported at the moment - const ValueRef packed_weight_zeros = kDummyValueRef; - - // Prepack bias data - - // Create a dummy tensor to fill the binding slot of the bias tensor if it is - // not provided. This helps simplify dispatch logic and makes it so that - // fewer shdaer variants need to be generated. 
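// The dummy tensor only fills the bias descriptor binding; whether the shader
// actually reads the bias is gated by the apply_bias specialization constant
// computed in make_linear_qw_node / make_linear_qa_qw_node.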
- TmpTensor dummy_bias( - &graph, {}, graph.dtype_of(output), utils::kBuffer, utils::kWidthPacked); - - ValueRef packed_bias = dummy_bias.vref; - if (graph.val_is_not_none(bias_data)) { - packed_bias = - prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); - } - - // Use weight only quantized linear if at least one is true: - // 1. Device does not support int8 dot product - // 2. Input is not quantized - if (!graph.can_use_int8_dot_product() || - input_quant_config.granularity == kNoQuantization) { - DynamicDispatchNode linear_qw_node(make_linear_qw_node( - graph, - weight_quant_config, - fp_input, - weight_data, - packed_weight, - packed_weight_scales, - packed_weight_zeros, - group_size, - bias_data, - packed_bias, - output)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode(linear_qw_node)); - return; - } else { - // Otherwise, use input and weight quantized linear computed with integer - // accumulation - - // Input scale/zero point only used for activation & weight quantized linear - ValueRef packed_input_scale = input_scale; - ValueRef packed_input_zp = input_zp; - if (graph.val_is_tref(input_scale)) { - VK_CHECK_COND(graph.val_is_tref(packed_input_zp)); - packed_input_scale = prepack_standard( - graph, input_scale, utils::kBuffer, utils::kWidthPacked); - packed_input_zp = prepack_standard( - graph, input_zp, utils::kBuffer, utils::kWidthPacked); - } - - // Pre-computed per quant group weight sums are needed for int accumulation, - // but not for weight only - const ValueRef packed_weight_sums = prepack_standard( - graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - - // Allocate temporary tensor to store quantized and packed input - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - const int64_t int_input_height = num_blocks_M; - const int64_t int_input_width = num_blocks_K * 4; - - TmpTensor packed_int_input( - &graph, - {int_input_height, int_input_width}, - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - DynamicDispatchNode quantize_and_pack_linear_node( - make_quantize_and_pack_linear_input_node( - graph, - input_quant_config, - fp_input, - packed_input_scale, - packed_input_zp, - input_scale, - input_zp, - packed_int_input, - group_size)); - - graph.execute_nodes().emplace_back( - new DynamicDispatchNode(quantize_and_pack_linear_node)); - - DynamicDispatchNode linear_qa_qw_node(make_linear_qa_qw_node( - graph, - input_quant_config, - weight_quant_config, - fp_input, - packed_int_input, - packed_input_scale, - packed_input_zp, - input_scale, - input_zp, - weight_data, - packed_weight, - packed_weight_sums, - packed_weight_scales, - group_size, - bias_data, - packed_bias, - output)); - - graph.execute_nodes().emplace_back( - new DynamicDispatchNode(linear_qa_qw_node)); - } -} - -void linear_q8ta_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef input_scale = args.at(idx++); - const ValueRef input_zp = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_sums_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef output = args.at(idx++); - - const int64_t K = graph.size_at(-1, fp_input); - - QuantizationConfig input_quant_config(8, kPerTensor, {}, false); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_linear_impl( 
- graph, - input_quant_config, - weight_quant_config, - fp_input, - input_scale, - input_zp, - weight_data, - weight_sums_data, - weight_scales_data, - kDummyValueRef, // weight_zeros_data - kDummyValueRef, // group_size - bias_data, - output); -} - -void linear_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef output = args.at(idx++); - - const int64_t K = graph.size_at(-1, fp_input); - - QuantizationConfig input_quant_config(32, kNoQuantization, {}); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_linear_impl( - graph, - input_quant_config, - weight_quant_config, - fp_input, - kDummyValueRef, // input scale - kDummyValueRef, // input zp - weight_data, - kDummyValueRef, // weight sums - weight_scales_data, - kDummyValueRef, // weight zeros - kDummyValueRef, // group size - bias_data, - output); -} - -void linear_q4gsw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef group_size = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef output = args.at(idx++); - - const int64_t group_size_val = graph.extract_scalar(group_size); - - QuantizationConfig input_quant_config(32, kNoQuantization, {}); - QuantizationConfig weight_quant_config(4, kPerGroup, {group_size_val}); - - quantized_linear_impl( - graph, - input_quant_config, - weight_quant_config, - fp_input, - kDummyValueRef, // input scale - kDummyValueRef, // input zp - weight_data, - kDummyValueRef, // weight sums - weight_scales_data, - kDummyValueRef, // weight zeros - group_size, // group size - bias_data, - output); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.linear_q8ta_q8csw.default, linear_q8ta_q8csw); - VK_REGISTER_OP(et_vk.linear_q8csw.default, linear_q8csw); - VK_REGISTER_OP(et_vk.linear_q4gsw.default, linear_q4gsw); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h deleted file mode 100644 index 7b62c98390d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include - -namespace vkcompute { - -utils::uvec3 quantized_linear_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args); - -ValueRef prepack_quantized_linear_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef qmat2_data); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp deleted file mode 100644 index 89c9e847724..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -// Custom global workgroup size function for linear_qcs8w -utils::uvec3 linear_qcs8w_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return {static_cast(graph->numel_of(out)), 1, 1}; -} - -// Custom local workgroup size function for linear_qcs8w -utils::uvec3 linear_qcs8w_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)global_workgroup_size; - (void)args; - (void)resize_args; - return {64, 1, 1}; -} - -// Custom global workgroup size function for linear_qcsnw_tiled -utils::uvec3 linear_qcsnw_tiled_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - - // Determine quantization bits from shader name - int quant_nbits = 8; - if (shader.kernel_name.find("qcs4w") != std::string::npos) { - quant_nbits = 4; - } - - std::vector mat1_sizes = graph->sizes_of(mat1); - const int64_t M = utils::val_at(-2, mat1_sizes); - uint32_t out_tile_nrows = 4; - if (M % 6 == 0) { - out_tile_nrows = 2; - } else if (M % 4 == 0) { - out_tile_nrows = 4; - } else if (M % 1 == 0) { - out_tile_nrows = 1; - } else { - out_tile_nrows = 4; - } - - // Number of output texels in the output tile - uint32_t out_tile_ntxcols = 1; - if (quant_nbits == 4) { - out_tile_ntxcols = 2; - } - - utils::uvec3 out_limits = graph->logical_limits_of(out); - uint32_t global_wg_x = utils::div_up(out_limits[0], out_tile_ntxcols); - return { - global_wg_x * (utils::div_up(out_limits[1], out_tile_nrows)), - 1, - out_limits[2]}; -} - -// Custom local workgroup size function for linear_qcsnw_tiled -utils::uvec3 linear_qcsnw_tiled_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)global_workgroup_size; - (void)args; - (void)resize_args; - - // Check if using cooperative algorithm from shader name - bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (use_coop_algorithm) { - return {8, 1, 8}; - } else { - return {64, 1, 1}; - } -} - -void check_linear_qcsnw_args( - const ComputeGraph& graph, - const int quant_nbits, - const ValueRef mat1, - const ValueRef qmat2_data, - const ValueRef scales, - const ValueRef out) { - std::vector mat1_sizes = graph.sizes_of(mat1); - std::vector qmat2_sizes = graph.sizes_of(qmat2_data); - std::vector scales_sizes = graph.sizes_of(scales); - - VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); - VK_CHECK_COND(qmat2_sizes.size() == 2); - VK_CHECK_COND(scales_sizes.size() == 1); - - VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - - if (quant_nbits == 4) { - VK_CHECK_COND( - utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes) * 2); - VK_CHECK_COND( - utils::val_at(-1, scales_sizes) == utils::val_at(-2, 
qmat2_sizes)); - } else { - VK_CHECK_COND( - utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); - VK_CHECK_COND( - utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); - } - - if (graph.is_buffer_storage(out)) { - VK_CHECK_COND(graph.is_contiguous(out)); - } -} - -void resize_linear_qcsnw_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef qmat2 = args.at(1).refs.at(1); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector qmat2_sizes = graph->sizes_of(qmat2); - - const int out_cols = utils::val_at(-2, mat1_sizes); - int out_rows = utils::val_at(-1, qmat2_sizes); - // Byte dtype suggests 4-bit quantization in which case the weight tensor is - // packed with 2 values per byte. - if (graph->dtype_of(qmat2) == vkapi::kByte) { - out_rows *= 2; - } - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -void add_linear_qcs8w_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef q_mat2_data, - const ValueRef scales_data, - const ValueRef out) { - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - ValueRef mat1_W_packed = mat1; - ValueRef out_W_packed = out; - // Create temporary tensors to store the width packed versions of mat1 and out - TmpTensor mat1_tmp( - &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked); - TmpTensor out_tmp( - &graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked); - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(mat1) != WHCN::kWidthDim) { - // Ensure mat1 is width packed - mat1_W_packed = mat1_tmp; - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - // Ensure out is packed correctly - out_W_packed = out_tmp; - } - ValueRef q_mat2 = prepack_standard_hw_transposed( - graph, q_mat2_data, graph.storage_type_of(out), utils::kWidthPacked); - ValueRef scales = prepack_standard( - graph, scales_data, graph.storage_type_of(out), utils::kWidthPacked); - - std::string kernel_name = "linear_qcs8w"; - kernel_name.reserve(kShaderNameReserve); - add_packed_dim_suffix(kernel_name, graph.packed_dim_of(mat1_W_packed)); - add_packed_dim_suffix(kernel_name, graph.packed_dim_of(q_mat2)); - add_dtype_suffix(kernel_name, graph.dtype_of(out_W_packed)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out_W_packed)); - - std::vector pcs; - if (graph.is_buffer_storage(out_W_packed)) { - pcs = { - graph.sizes_pc_of(out_W_packed), - graph.strides_pc_of(out_W_packed), - graph.sizes_pc_of(mat1_W_packed), - graph.strides_pc_of(mat1), - graph.strides_pc_of(q_mat2), - graph.strides_pc_of(scales), - graph.numel_pc_of(out_W_packed)}; - } else { - pcs = { - graph.logical_limits_pc_of(out_W_packed), - graph.sizes_pc_of(mat1_W_packed)}; - } - - const utils::uvec3 global_wg = { - static_cast(graph.numel_of(out_W_packed)), 1, 1}; - const utils::uvec3 local_wg{64, 1, 1}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - linear_qcs8w_global_wg_size, - linear_qcs8w_local_wg_size, - // Inputs and Outputs - {{out_W_packed, vkapi::MemoryAccessType::WRITE}, - 
{{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}}, - // Shader params buffers - {}, - // Push Constants - pcs, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_linear_qcsnw_node)); - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(out) != WHCN::kWidthDim) { - viewFn(graph, {out_W_packed, graph.add_none(), out}); - } -} - -void add_linear_qcsnw_tiled_node( - ComputeGraph& graph, - const bool use_coop_algorithm, - const int quant_nbits, - const ValueRef mat1, - const ValueRef q_mat2_data, - const ValueRef scales_data, - const ValueRef out) { - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - std::vector qmat2_orig_sizes = graph.sizes_of(q_mat2_data); - const int64_t ndim = graph.dim_of(q_mat2_data); - const int64_t K = qmat2_orig_sizes.at(ndim - 1); - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - - ValueRef q_mat2; - if (quant_nbits == 4) { - q_mat2 = - prepack_int4_linear_weight_transposed_interleaved(graph, q_mat2_data); - } else { - utils::StorageType q_mat2_storage = utils::kTexture2D; - if (N > max_extent * 4 || K > max_extent) { - q_mat2_storage = utils::kBuffer; - } - - q_mat2 = prepack_standard_hw_transposed( - graph, q_mat2_data, q_mat2_storage, utils::kWidthPacked); - } - - utils::StorageType scales_storage = utils::kTexture2D; - if (N > max_extent) { - scales_storage = utils::kBuffer; - } - ValueRef scales = - prepack_standard(graph, scales_data, scales_storage, utils::kWidthPacked); - - std::string kernel_name; - if (quant_nbits == 4) { - kernel_name = - use_coop_algorithm ? "linear_qcs4w_coop" : "linear_qcs4w_tiled"; - } else { - kernel_name = - use_coop_algorithm ? "linear_qcs8w_coop" : "linear_qcs8w_tiled"; - } - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(q_mat2)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(scales)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - std::vector mat1_sizes = graph.sizes_of(mat1); - const int64_t M = utils::val_at(-2, mat1_sizes); - uint32_t out_tile_nrows = 4; - if (M % 6 == 0) { - kernel_name += "_o4x2"; - out_tile_nrows = 2; - } else if (M % 4 == 0) { - kernel_name += "_o4x4"; - out_tile_nrows = 4; - } else if (M % 1 == 0) { - kernel_name += "_o4x1"; - out_tile_nrows = 1; - } else { - kernel_name += "_o4x4"; - out_tile_nrows = 4; - } - - // Number of output texels in the output tile - uint32_t out_tile_ntxcols = 1; - if (quant_nbits == 4) { - out_tile_ntxcols = 2; - } - - utils::uvec3 out_limits = graph.logical_limits_of(out); - uint32_t global_wg_x = utils::div_up(out_limits[0], out_tile_ntxcols); - utils::uvec3 global_wg_size = { - global_wg_x * (utils::div_up(out_limits[1], out_tile_nrows)), - 1, - out_limits[2]}; - - utils::uvec3 local_wg_size{64, 1, 1}; - if (use_coop_algorithm) { - local_wg_size = {8, 1, 8}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - linear_qcsnw_tiled_global_wg_size, - linear_qcsnw_tiled_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, q_mat2, scales}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {{graph.sizes_pc_of(out), graph.sizes_pc_of(mat1)}}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_linear_qcsnw_node)); -} - -bool can_use_tiled_impl( 
- ComputeGraph& graph, - const ValueRef mat1, - const ValueRef q_mat2_data, - const ValueRef scales_data, - const ValueRef out) { - (void)q_mat2_data; - (void)scales_data; - - // Check if mat1 is not a 3D tensor or that batches = 1 - // TODO(ssjia): Add support for batches in the tiled impl - if (graph.dim_of(mat1) == 3 && graph.size_at(0, mat1) != 1) { - return false; - } - // Check that K is a multiple of 4 - if (graph.size_at(-1, mat1) % 4 != 0) { - return false; - } - // Check that N is a multiple of 4 - if (graph.size_at(-1, out) % 4 != 0) { - return false; - } - // Check that the packed dim is the width dim - if (graph.packed_dim_of(mat1) != WHCN::kWidthDim && - graph.packed_dim_of(out) != WHCN::kWidthDim) { - return false; - } - // Check that no special axis mapping is used for the input - // TODO(ssjia): Add support for non-standard axis mapping in the tiled impl - if (!graph.has_standard_axis_map(mat1)) { - return false; - } - // Check that no special axis mapping is used for the output - // TODO(ssjia): Add support for non-standard axis mapping in the tiled impl - if (!graph.has_standard_axis_map(out)) { - return false; - } - - return true; -} - -bool can_use_coop_impl(ComputeGraph& graph, const ValueRef mat1) { - // Do not use coop algorithm for Adreno 702; manual experimentation shows that - // it performs worse than the tiled algorithm. - // TODO(ssjia): Determine a more robust heuristic to determine when the coop - // algorithm should be used, instead of depending on specific device identity. - if (graph.device_is_adreno() && graph.device_name_contains("702")) { - return false; - } - // Check that the computation is vector * matrix - return (graph.size_at(-2, mat1) == 1); -} - -void weight_int8pack_mm( - ComputeGraph& graph, - const std::vector& args) { - check_linear_qcsnw_args(graph, 8, args[0], args[1], args[2], args[3]); - if (can_use_tiled_impl(graph, args[0], args[1], args[2], args[3])) { - bool use_coop_algorithm = can_use_coop_impl(graph, args[0]); - return add_linear_qcsnw_tiled_node( - graph, use_coop_algorithm, 8, args[0], args[1], args[2], args[3]); - } - return add_linear_qcs8w_node(graph, args[0], args[1], args[2], args[3]); -} - -void linear_qcs4w(ComputeGraph& graph, const std::vector& args) { - check_linear_qcsnw_args(graph, 4, args[0], args[1], args[2], args[3]); - - VK_CHECK_COND(can_use_tiled_impl(graph, args[0], args[1], args[2], args[3])); - bool use_coop_algorithm = can_use_coop_impl(graph, args[0]); - return add_linear_qcsnw_tiled_node( - graph, use_coop_algorithm, 4, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._weight_int8pack_mm.default, weight_int8pack_mm); - VK_REGISTER_OP(et_vk.linear_qcs4w.default, linear_qcs4w); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp deleted file mode 100644 index 52cf75e28b5..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
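For reference, a minimal standalone sketch of the shape-based dispatch heuristic implemented by can_use_tiled_impl and can_use_coop_impl in the file above. It takes plain size vectors instead of a ComputeGraph, so the helper signatures are hypothetical, and the packed-dim, axis-map, and Adreno-702 checks are omitted.

```cpp
#include <cstdint>
#include <vector>

// Tiled shader requirements (layout checks omitted): a single batch, and both
// the reduction dim K and the output channel count N must be multiples of 4.
bool can_use_tiled(const std::vector<int64_t>& mat1_sizes, int64_t out_channels) {
  const bool single_batch = mat1_sizes.size() < 3 || mat1_sizes.front() == 1;
  const int64_t K = mat1_sizes.back();
  return single_batch && (K % 4 == 0) && (out_channels % 4 == 0);
}

// The cooperative shader targets GEMV, i.e. a single output row (M == 1).
bool can_use_coop(const std::vector<int64_t>& mat1_sizes) {
  const int64_t M = mat1_sizes.at(mat1_sizes.size() - 2);
  return M == 1;
}
```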
- */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -void check_linear_qga4w_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros, - const ValueRef out) { - VK_CHECK_COND(graph.val_is_tensor(mat1)); - VK_CHECK_COND(graph.val_is_tref(mat2_data)); - VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); - - VK_CHECK_COND(graph.dim_of(mat1) <= 3); - VK_CHECK_COND(graph.dim_of(mat2_data) == 2); - VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); - - VK_CHECK_COND(graph.size_at(-3, mat1) == 1); - const int K = graph.size_at(-1, mat1); - VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); - - const int group_size_val = graph.extract_scalar(group_size); - VK_CHECK_COND(K % group_size_val == 0); - // Due to the way weight packing works, group size needs to be a multiple of 8 - VK_CHECK_COND(group_size_val % 8 == 0); - - VK_CHECK_COND(graph.has_standard_axis_map(mat1)); - VK_CHECK_COND(graph.has_standard_axis_map(out)); -} - -void resize_linear_qga4w_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - ValueRef out = args.at(0).refs.at(0); - ValueRef mat1 = args.at(1).refs.at(0); - ValueRef mat2_data = extra_args.at(0); - - std::vector mat1_sizes = graph->sizes_of(mat1); - std::vector mat2_sizes = graph->sizes_of(mat2_data); - - const int64_t out_cols = utils::val_at(-2, mat1_sizes); - const int64_t out_rows = utils::val_at(-2, mat2_sizes); - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -/** - * Determines if the cooperative algorithm should be used based on input tensor - * dimensions. Apply the coop algorithm for gemv cases, i.e. mat1 is avector as - * as opposed to a matrix. 
- */ -bool should_use_coop_algorithm(ComputeGraph* graph, const ValueRef& mat1) { - return graph->size_at(-2, mat1) == 1; -} - -vkapi::ShaderInfo pick_linear_qga4w_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const bool use_coop_algorithm = should_use_coop_algorithm(graph, mat1); - - std::string kernel_name = "linear_qga4w"; - if (use_coop_algorithm) { - kernel_name += "_coop"; - } else { - kernel_name += "_tiled"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat1)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat2)); - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 linear_qga4w_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (!use_coop_algorithm) { - // Constructing the global workgroup size for the tiled algorithm - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - // Each shader thread computes a 4 high x 8 wide tile of the output matrix, - // which is equivalent to 4 x 2 texels. Since the output tensor must be - // width packed, div-up the "texel-width" of the output by 2 and the height - // of the output tensor by 4 to obtain the number of tiles that need to be - // computed. - global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2)); - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(4)); - return global_wg_size; - } - - uint32_t output_channels = graph->size_at(-1, out); - uint32_t batch_size = graph->size_at(-2, out); - - // Constructing the global workgroup size of the co-operative algorithm. The - // local work group size is 64, and each local work group co-operates to - // compute 8 output channels of the output. Therefore, a total of - // (output_channels / 8 x 64) threads should be launched, assuming a batch - // size of 1. 
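// For example (hypothetical numbers, not taken from any particular model):
// with output_channels = 4096 and batch_size = 1, the dispatch returned below
// is {64, div_up(4096, 8u), 1} = {64, 512, 1}, i.e. 512 workgroups of 64
// threads, each cooperatively producing 8 output channels.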
- return {64, utils::div_up(output_channels, 8u), batch_size}; -} - -utils::uvec3 linear_qga4w_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (use_coop_algorithm) { - return {64, 1, 1}; - } else { - return pick_hw_square_wg_size( - graph, shader, global_workgroup_size, args, resize_args); - } -} - -void add_linear_qga4w_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros_data, - const ValueRef out) { - check_linear_qga4w_args( - graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); - - const uint32_t group_size_val = graph.extract_scalar(group_size); - - ValueRef mat2 = - prepack_int4_linear_weight_transposed_block_4x8(graph, mat2_data); - - ValueRef scales_and_zeros = prepack_standard( - graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_linear_qga4w_shader, - linear_qga4w_global_wg_size, - linear_qga4w_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2, scales_and_zeros}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(mat1), - graph.sizes_pc_of(mat2)}, - // Specialization Constants - {SV(group_size_val)}, - // Resize Args - {mat2_data}, - // Resizing Logic - resize_linear_qga4w_node)); -} - -void linear_weight_int4( - ComputeGraph& graph, - const std::vector& args) { - return add_linear_qga4w_node( - graph, - args[0], // mat1 - args[1], // mat2 - args[2], // group_size - args[3], // scales_and_zeros - // There is an unused variable inner_k_tiles which is used to call - // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th - // argument is skipped. - args[5] // out - ); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear_QTA8A_QGA4W.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear_QTA8A_QGA4W.cpp deleted file mode 100644 index e3443ca34e6..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear_QTA8A_QGA4W.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
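Both 4-bit linear variants above check that the innermost weight dimension is K / 2 and that group_size is a multiple of 8, because two 4-bit values share one byte. The sketch below illustrates that packing; the nibble order and signedness are illustrative assumptions, since the actual layout is defined by the prepack_int4_* routines and their shaders, which are not shown here.

```cpp
#include <cassert>
#include <cstdint>

// Two 4-bit weights per byte: a [N, K] weight matrix is stored with an
// innermost dimension of K / 2 bytes.
uint8_t pack_int4_pair(int8_t lo, int8_t hi) {
  assert(lo >= -8 && lo <= 7 && hi >= -8 && hi <= 7);
  return static_cast<uint8_t>((lo & 0x0F) | ((hi & 0x0F) << 4));
}

int8_t unpack_int4_lo(uint8_t packed) {
  // Shift the low nibble into the sign position, then arithmetic-shift back.
  return static_cast<int8_t>(static_cast<int8_t>(packed << 4) >> 4);
}

int8_t unpack_int4_hi(uint8_t packed) {
  return static_cast<int8_t>(static_cast<int8_t>(packed) >> 4);
}
```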
- */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -void check_linear_qta8a_qga4w_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat1_scale, - const ValueRef mat1_zero_point, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef weight_scales, - const ValueRef weight_zeros, - const ValueRef out) { - VK_CHECK_COND(graph.val_is_tensor(mat1)); - VK_CHECK_COND(graph.val_is_tensor(mat1_scale)); - VK_CHECK_COND(graph.val_is_tensor(mat1_zero_point)); - VK_CHECK_COND(graph.val_is_tref(mat2_data)); - VK_CHECK_COND(graph.val_is_tref(weight_scales)); - VK_CHECK_COND(graph.val_is_tref(weight_zeros)); - - VK_CHECK_COND(graph.dim_of(mat1) <= 3); - VK_CHECK_COND(graph.dim_of(mat2_data) == 2); - VK_CHECK_COND(graph.dim_of(weight_scales) == 2); - VK_CHECK_COND(graph.dim_of(weight_zeros) == 2); - - VK_CHECK_COND(graph.size_at(-3, mat1) == 1); - const int K = graph.size_at(-1, mat1); - VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); - - const int group_size_val = graph.extract_scalar(group_size); - VK_CHECK_COND(K % group_size_val == 0); - // Due to the way weight packing works, group size needs to be a multiple of 8 - VK_CHECK_COND(group_size_val % 8 == 0); - - VK_CHECK_COND(graph.has_standard_axis_map(mat1)); - VK_CHECK_COND(graph.has_standard_axis_map(out)); - - // Check that scale and zero_point tensors are buffer storage with width - // packing - VK_CHECK_COND(graph.is_buffer_storage(mat1_scale)); - VK_CHECK_COND(graph.packed_dim_of(mat1_scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(mat1_zero_point)); - VK_CHECK_COND(graph.packed_dim_of(mat1_zero_point) == WHCN::kWidthDim); - - // Calculate number of tokens for input - int64_t input_num_tokens = 1; - const auto mat1_sizes = graph.sizes_of(mat1); - for (size_t i = 0; i < mat1_sizes.size() - 1; i++) { - input_num_tokens *= mat1_sizes[i]; - } - - // Verify scale and zero_point tensor sizes match number of tokens - const auto mat1_scale_sizes = graph.sizes_of(mat1_scale); - const auto mat1_zero_point_sizes = graph.sizes_of(mat1_zero_point); - - VK_CHECK_COND( - utils::val_at(-1, mat1_scale_sizes) == input_num_tokens); - VK_CHECK_COND( - utils::val_at(-1, mat1_zero_point_sizes) == input_num_tokens); - - // Verify weight scales and zeros have the same shape - const auto weight_scales_sizes = graph.sizes_of(weight_scales); - const auto weight_zeros_sizes = graph.sizes_of(weight_zeros); - VK_CHECK_COND(weight_scales_sizes == weight_zeros_sizes); -} - -void resize_linear_qta8a_qga4w_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector mat2_sizes = graph->sizes_of(mat2); - - const int64_t out_cols = utils::val_at(-2, mat1_sizes); - const int64_t out_rows = utils::val_at(-1, mat2_sizes) * 2; - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -/** - * Determines if the cooperative algorithm should be used based on input tensor - * dimensions. 
Apply the coop algorithm for vectors (GEMV cases), tiled for - * matrices (GEMM cases). - */ -bool should_use_coop_algorithm_qta8a_qga4w( - ComputeGraph* graph, - const ValueRef& mat1) { - const uint32_t M = graph->size_at(-2, mat1); - // Use coop algorithm for vectors (GEMV), tiled for larger matrices (GEMM) - return M == 1; -} - -vkapi::ShaderInfo pick_linear_qta8a_qga4w_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const bool use_coop_algorithm = - should_use_coop_algorithm_qta8a_qga4w(graph, mat1); - - std::string kernel_name = "linear_qta8a_qga4w"; - if (use_coop_algorithm) { - kernel_name += "_coop"; - } else { - kernel_name += "_tiled"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat1)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat2)); - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 linear_qta8a_qga4w_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - // C = 1, H = 2, W = 3 - // global_wg_size = {round_up(C / 2f), round_up(H / 3f), W} --> (2W, 1H, 0C) - // --> {1, 1, 3} global - - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2)); - if (!use_coop_algorithm) { // GEMM - TILED - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(3)); - } - - return global_wg_size; -} - -utils::uvec3 linear_qta8a_qga4w_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - utils::uvec3 local_wg_size; - if (use_coop_algorithm) { // GEMV - COOP - local_wg_size = {8, 1, 8}; - } else { // GEMM - TILED - local_wg_size = graph->create_local_wg_size(global_workgroup_size); - } - - return local_wg_size; -} - -void add_linear_qta8a_qga4w_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat1_scale, - const ValueRef mat1_zero_point, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef weight_scales_data, - const ValueRef weight_zeros_data, - const ValueRef out) { - check_linear_qta8a_qga4w_args( - graph, - mat1, - mat1_scale, - mat1_zero_point, - mat2_data, - group_size, - weight_scales_data, - weight_zeros_data, - out); - const uint32_t group_size_val = graph.extract_scalar(group_size); - - ValueRef mat2 = - prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data); - ValueRef weight_scales = prepack_standard( - graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); - ValueRef weight_zeros = prepack_standard( - graph, weight_zeros_data, utils::kBuffer, utils::kWidthPacked); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_linear_qta8a_qga4w_shader, - linear_qta8a_qga4w_global_wg_size, - linear_qta8a_qga4w_local_wg_size, - // Inputs and Outputs - 
{{out, vkapi::kWrite}, - {{mat1, mat2, weight_scales, weight_zeros, mat1_scale, mat1_zero_point}, - vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(mat1), - graph.sizes_pc_of(mat2)}, - // Specialization Constants - {SV(group_size_val)}, - // Resize Args - {}, - // Resizing Logic - resize_linear_qta8a_qga4w_node)); -} - -void linear_qta8a_qga4w( - ComputeGraph& graph, - const std::vector& args) { - return add_linear_qta8a_qga4w_node( - graph, - args[0], // quantized input (char tensor) - args[1], // input_scale (float buffer tensor) - args[2], // input_zero_point (int buffer tensor) - args[3], // quantized weights (4-bit packed, byte) - args[4], // group_size (int) - args[5], // weight_scales (float tensor) - args[6], // weight_zeros (int tensor) - args[7] // float output tensor - ); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.linear_qta8a_qga4w.default, linear_qta8a_qga4w); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp deleted file mode 100644 index 6ad1d7f371d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -using namespace utils; - -void resize_reduce_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const int32_t reduce_dim_nchw = - graph->extract_scalar(resize_args.at(0)); - - std::vector new_sizes = graph->sizes_of(in); - new_sizes.at(normalize(reduce_dim_nchw, new_sizes.size())) = 1; - graph->virtual_resize(out, new_sizes); -} - -void resize_reduce2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - // Extract the dimensions to reduce over - const std::vector dims_list = - graph->extract_int_or_symint_list(resize_args.at(0)); - int32_t reduce_dim1_nchw = dims_list[0]; - int32_t reduce_dim2_nchw = dims_list[1]; - - std::vector new_sizes = graph->sizes_of(in); - new_sizes.at(normalize(reduce_dim1_nchw, new_sizes.size())) = 1; - new_sizes.at(normalize(reduce_dim2_nchw, new_sizes.size())) = 1; - graph->virtual_resize(out, new_sizes); -} - -utils::uvec3 reduce_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef out = args.at(0).refs.at(0); - const int32_t reduce_dim_whcn = - graph->extract_scalar(resize_args.at(1)); - - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - global_wg_size[reduce_dim_whcn] = 1; - return global_wg_size; -} - -utils::uvec3 reduce_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)global_workgroup_size; - - const int32_t reduce_dim_whcn = - graph->extract_scalar(resize_args.at(1)); - const int64_t group_dim_whcn = - graph->extract_scalar(resize_args.at(2)); - - // This should 
match the value of MAX_NTHREADS in the reduce shader. - constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim_whcn] = nworkers_per_group; - local_wg_size[group_dim_whcn] = ngroups; - - return local_wg_size; -} - -void add_reduce_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef out, - const std::string& op_name) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan reduction only supports texture storage"); - - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim = graph.extract_scalar(dim_ref); - reduce_dim = normalize(reduce_dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = op_name; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // Calculate group_dim for specialization constants - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - int32_t group_dim; - utils::uvec3 limits = graph.logical_limits_of(out); - if (limits[other_dim_1] > limits[other_dim_2]) { - group_dim = other_dim_1; - } else { - group_dim = other_dim_2; - } - - const ValueRef reduce_dim_whcn_ref = - graph.get_or_add_value_for_int(reduce_dim); - const ValueRef group_dim_whcn_ref = graph.get_or_add_value_for_int(group_dim); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - reduce_global_wg_size, - reduce_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim, group_dim}, - // Resize Args - {dim_ref, reduce_dim_whcn_ref, group_dim_whcn_ref}, - // Resizing Logic - resize_reduce_node)); -} - -void add_reduce2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dims_ref, - const ValueRef out, - const std::string& op_name) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan reduction only supports texture storage"); - - const int64_t ndim = graph.dim_of(in); - - // Extract the two dimensions to reduce over - const std::vector dims_list = - graph.extract_int_or_symint_list(dims_ref); - VK_CHECK_COND( - dims_list.size() == 2, "reduce2d requires exactly 2 dimensions"); - - int32_t reduce_dim1 = normalize(dims_list[0], ndim); - int32_t reduce_dim2 = normalize(dims_list[1], ndim); - - // Convert to WHCN format - reduce_dim1 = nchw_dim_to_whcn_dim(reduce_dim1, ndim); - reduce_dim2 = nchw_dim_to_whcn_dim(reduce_dim2, ndim); - - // Check that none of the reduction dims are packed - VK_CHECK_COND(graph.packed_dim_of(in) != reduce_dim1); - VK_CHECK_COND(graph.packed_dim_of(in) != reduce_dim2); - VK_CHECK_COND(graph.packed_dim_of(out) != reduce_dim1); - VK_CHECK_COND(graph.packed_dim_of(out) != reduce_dim2); - - // Check that the concat dim is not one of the reduction dims - if (graph.dim_of(in) == 4 && graph.size_at(0, in) 
> 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim1); - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim2); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim1); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim2); - } - - std::string kernel_name = op_name + "2d"; // Add "2d" suffix - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // Calculate group_dim for specialization constants (use remaining dimension) - int32_t group_dim = 0; - for (int i = 0; i < 3; i++) { - if (i != reduce_dim1 && i != reduce_dim2) { - group_dim = i; - break; - } - } - - const ValueRef reduce_dim1_whcn_ref = - graph.get_or_add_value_for_int(reduce_dim1); - const ValueRef reduce_dim2_whcn_ref = - graph.get_or_add_value_for_int(reduce_dim2); - const ValueRef group_dim_whcn_ref = graph.get_or_add_value_for_int(group_dim); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - reduce_global_wg_size, - reduce_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim1, reduce_dim2, group_dim}, - // Resize Args - {dims_ref, - reduce_dim1_whcn_ref, - reduce_dim2_whcn_ref, - group_dim_whcn_ref}, - // Resizing Logic - resize_reduce2d_node)); -} - -#define DEFINE_REDUCE_FN(op_name, out_arg_idx) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - const std::vector dims_list = \ - graph.extract_int_or_symint_list(args[1]); \ - if (dims_list.size() == 1) { \ - const int64_t dim_val = dims_list.at(0); \ - const ValueRef dim_ref = graph.get_or_add_value_for_int(dim_val); \ - return add_reduce_node( \ - graph, args[0], dim_ref, args[out_arg_idx], #op_name); \ - } \ - if (dims_list.size() == 2) { \ - return add_reduce2d_node( \ - graph, args[0], args[1], args[out_arg_idx], #op_name); \ - } \ - VK_CHECK_COND(false, "Only 1 or 2 dimensions supported"); \ - } - -DEFINE_REDUCE_FN(sum, 4) -DEFINE_REDUCE_FN(mean, 4) -DEFINE_REDUCE_FN(amax, 3) -DEFINE_REDUCE_FN(amin, 3) - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.sum.dim_IntList, sum); - VK_REGISTER_OP(aten.mean.dim, mean); - VK_REGISTER_OP(aten.amax.default, amax); - VK_REGISTER_OP(aten.amin.default, amin); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp deleted file mode 100644 index 72c1637a2c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
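The reduce implementation above converts PyTorch-style NCHW dim indices into the WHCN indexing used by the Vulkan backend before picking workgroup sizes. A small sketch of that conversion, where the helper names are stand-ins for normalize and nchw_dim_to_whcn_dim:

```cpp
#include <cstdint>

// Wrap a possibly negative dim into [0, ndim).
int64_t normalize_dim(int64_t dim, int64_t ndim) {
  return (dim % ndim + ndim) % ndim;
}

// NCHW dim index -> WHCN dim index: the innermost NCHW dim (width) becomes
// dim 0 in WHCN order, so the mapping is a simple reversal.
int64_t nchw_to_whcn(int64_t dim, int64_t ndim) {
  return ndim - 1 - dim;
}

// Example: reducing dim = -1 of a 4-D tensor normalizes to 3 and maps to
// WHCN dim 0 (the width axis), which is then set to 1 in the global
// workgroup size so each invocation reduces along that axis.
```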
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -namespace vkcompute { - -namespace { - -void check_args( - ComputeGraph& graph, - const ValueRef in, - const std::vector& repeats, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == graph.packed_dim_of(out)); - - VK_CHECK_COND(graph.storage_type_of(in) == graph.storage_type_of(out)); - if (graph.storage_type_of(in) == utils::kTexture2D) { - VK_CHECK_COND(graph.dim_of(in) <= 2); - } - - const int64_t in_dim = graph.dim_of(in); - VK_CHECK_COND( - in_dim <= repeats.size(), - "Input tensor dim size must be not greater than the repeat argument's size"); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's width doesn't match input's width * repeat count"); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's height doesn't match input's height * repeat count"); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's channel doesn't match input's channel * repeat count"); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's batch doesn't match input's batch * repeat count"); -} - -} // namespace - -void add_repeat_node( - ComputeGraph& graph, - ValueRef in, - ValueRef repeats_ref, - ValueRef out) { - const std::vector repeats = *(graph.get_int_list(repeats_ref)); - - check_args(graph, in, repeats, out); - - const std::vector in_sizes = graph.sizes_of(in); - const utils::ivec4 src_dims{ - dim_at(in_sizes), - dim_at(in_sizes), - dim_at(in_sizes), - dim_at(in_sizes)}; - const utils::ivec4 dst_repeats{ - dim_at(repeats), - dim_at(repeats), - dim_at(repeats), - dim_at(repeats)}; - - std::string kernel_name = "repeat"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // A copy of range with the last element set to batch size of the input tensor - const utils::ivec3 wg_size = graph.logical_limits_of(out); - - const auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&wg_size, sizeof(wg_size), sizeof(utils::ivec4)), - PushConstantDataInfo( - &src_dims, sizeof(src_dims), sizeof(utils::ivec4)), - PushConstantDataInfo( - &dst_repeats, sizeof(dst_repeats), sizeof(utils::ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void repeat(ComputeGraph& graph, const std::vector& args) { - add_repeat_node(graph, args[0], args[1], args[2]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.repeat.default, repeat); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp deleted file mode 100644 index 221d0d23f51..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
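The shape rule enforced by check_args in Repeat.cpp above follows aten.repeat semantics: each output dimension equals the corresponding input dimension multiplied by its repeat count, with dimensions aligned from the innermost side. A small sketch of that rule:

```cpp
#include <cstdint>
#include <vector>

// Output sizes for a repeat op: align input sizes and repeat counts from the
// innermost dimension, then multiply element-wise.
std::vector<int64_t> repeat_out_sizes(
    std::vector<int64_t> in_sizes,
    const std::vector<int64_t>& repeats) {
  // Left-pad the input sizes with 1s so both vectors have the same rank.
  while (in_sizes.size() < repeats.size()) {
    in_sizes.insert(in_sizes.begin(), 1);
  }
  std::vector<int64_t> out_sizes(in_sizes.size());
  for (size_t i = 0; i < in_sizes.size(); ++i) {
    out_sizes[i] = in_sizes[i] * repeats[i];
  }
  return out_sizes;
}

// e.g. in_sizes = {2, 3}, repeats = {2, 1, 4} -> out_sizes = {2, 2, 12}.
```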
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -void resize_repeat_interleave_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const int64_t nrepeats = graph->extract_scalar(extra_args.at(0)); - int64_t repeat_dim = graph->extract_scalar(extra_args.at(1)); - - std::vector new_sizes = graph->sizes_of(in); - repeat_dim = normalize(repeat_dim, new_sizes.size()); - new_sizes.at(repeat_dim) *= nrepeats; - - graph->virtual_resize(out, new_sizes); -} - -void add_repeat_interleave_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef num_repeats, - const ValueRef dim, - const ValueRef out) { - const int32_t nrepeats = graph.extract_scalar(num_repeats); - const int32_t repeat_dim = - graph.extract_whcn_dim(dim, graph.dim_of(in)); - - VK_CHECK_COND(repeat_dim != graph.packed_dim_of(out)); - VK_CHECK_COND(repeat_dim != graph.packed_dim_of(in)); - - std::string kernel_name = "repeat_interleave"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Parameter buffers - {graph.logical_limits_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - nrepeats, - repeat_dim}, - // Resize Args - {num_repeats, dim}, - // Resizing Logic - resize_repeat_interleave_node)); -} - -void repeat_interleave(ComputeGraph& graph, const std::vector& args) { - int args_i = 0; - const ValueRef in = args[args_i++]; - const ValueRef num_repeats = args[args_i++]; - const ValueRef dim = args[args_i++]; - const ValueRef output_size = args[args_i++]; - const ValueRef out = args[args_i++]; - - // Output size is not used in the kernel - (void)output_size; - - add_repeat_interleave_node(graph, in, num_repeats, dim, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.repeat_interleave.self_int, repeat_interleave); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h deleted file mode 100644 index f29a817e86e..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -void add_repeat_interleave_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef num_repeats, - const ValueRef dim, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp b/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp deleted file mode 100644 index fcc8fe4b265..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -namespace vkcompute { - -void resize_rotary_embedding_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef xq_out = args.at(0).refs.at(0); - const ValueRef xk_out = args.at(0).refs.at(1); - - const ValueRef xq = args.at(1).refs.at(0); - const ValueRef xk = args.at(1).refs.at(1); - - const std::vector xq_sizes = graph->sizes_of(xq); - const std::vector xk_sizes = graph->sizes_of(xk); - - graph->virtual_resize(xq_out, xq_sizes); - graph->virtual_resize(xk_out, xk_sizes); -} - -utils::uvec3 rotary_embedding_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef xq_out = args.at(0).refs.at(0); - - utils::uvec3 global_wg_size = graph->logical_limits_of(xq_out); - global_wg_size[0] /= 2; - - return global_wg_size; -} - -void add_rotary_embedding_node( - ComputeGraph& graph, - const ValueRef xq, - const ValueRef xk, - const ValueRef freqs_cos, - const ValueRef freqs_sin, - const ValueRef xq_out, - const ValueRef xk_out) { - VK_CHECK_COND(graph.size_at(-1, xq) == graph.size_at(-1, xk)); - VK_CHECK_COND(graph.size_at(-3, xq) == graph.size_at(-3, xk)); - VK_CHECK_COND( - graph.size_at(-1, xq) == graph.size_at(-1, freqs_cos) * 2); - VK_CHECK_COND(graph.sizes_of(freqs_cos) == graph.sizes_of(freqs_sin)); - - VK_CHECK_COND(graph.packed_dim_of(xq) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(xk) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(freqs_cos) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(freqs_sin) == WHCN::kWidthDim); - VK_CHECK_COND(graph.has_standard_axis_map(xq)); - VK_CHECK_COND(graph.has_standard_axis_map(xk)); - VK_CHECK_COND(graph.has_standard_axis_map(freqs_cos)); - VK_CHECK_COND(graph.has_standard_axis_map(freqs_sin)); - - std::string kernel_name = "rotary_embedding"; - add_dtype_suffix(kernel_name, graph.dtype_of(xq_out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - rotary_embedding_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{{xq_out, xk_out}, vkapi::kWrite}, - {{xq, xk, freqs_cos, freqs_sin}, vkapi::kRead}}, - // Parameter buffers - {graph.logical_limits_ubo(xq_out), graph.logical_limits_ubo(xk_out)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_rotary_embedding_node)); -} - -void apply_rotary_emb(ComputeGraph& graph, const std::vector& args) { - const ValueListPtr out_tuple = graph.get_value_list(args[4]); - const ValueRef xq_out = out_tuple->at(0); - const ValueRef xk_out = out_tuple->at(1); - - add_rotary_embedding_node( - graph, args[0], args[1], args[2], args[3], xq_out, xk_out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.apply_rotary_emb.default, apply_rotary_emb); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp deleted file mode 100644 index 2cc7455cd4a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
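In the rotary embedding op above, freqs_cos and freqs_sin have half the last-dim size of xq, and the shader's global workgroup width is divided by 2, because each invocation rotates one pair of elements with one (cos, sin) pair. A scalar reference of that rotation, assuming the interleaved pairing convention used by the Llama reference implementation (the exact pairing is defined by the shader, which is not shown here):

```cpp
// Rotate one (even, odd) element pair by the angle encoded in (cos_f, sin_f).
void rotate_pair(float& x0, float& x1, float cos_f, float sin_f) {
  const float r0 = x0 * cos_f - x1 * sin_f;
  const float r1 = x0 * sin_f + x1 * cos_f;
  x0 = r0;
  x1 = r1;
}
```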
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -namespace vkcompute { - -void resize_sdpa_out( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - - int arg_idx = 0; - const ValueRef q_projected = extra_args[arg_idx++]; - const ValueRef out = extra_args[arg_idx++]; - graph->virtual_resize(out, graph->sizes_of(q_projected)); -} - -void resize_flash_attention_out( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - // Find the output tensor in the args - it's the first tensor in the first - // ArgGroup - const ValueRef out = args.at(0).refs.at(0); - const ValueRef q_projected = args.at(1).refs.at(0); - graph->virtual_resize(out, graph->sizes_of(q_projected)); -} - -utils::uvec3 flash_attention_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - - const ValueRef q_projected = resize_args.at(0); - const ValueRef block_size_r = resize_args.at(1); - - // Get tensor dimensions - PyTorch format is [B, N, H, D] - // But Vulkan uses negative indexing: -4=B, -3=N, -2=H, -1=D - const int32_t B = graph->size_at(-4, q_projected); // batch - const int32_t N = graph->size_at(-3, q_projected); // sequence length - const int32_t H = graph->size_at(-2, q_projected); // num heads - const int32_t Br = - static_cast(graph->extract_scalar(block_size_r)); - - // Calculate number of row blocks - const int32_t Tr = (N + Br - 1) / Br; - - return {static_cast(B * H * Tr), 1, 1}; -} - -void flash_attention_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef q_projected = args[arg_idx++]; - const ValueRef k_cache = args[arg_idx++]; - const ValueRef v_cache = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef attn_mask = args[arg_idx++]; - const ValueRef dropout_p = args[arg_idx++]; - const ValueRef is_causal = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - - const ValueRef out = args[arg_idx++]; - - // Extract input_pos value for causal masking - const int32_t input_pos_val = graph.read_symint(input_pos_symint); - - const ValueRef k_cache_tensor = k_cache; - const ValueRef v_cache_tensor = v_cache; - - // Validation checks - re-enable with correct indexing - VK_CHECK_COND(graph.size_at(-4, q_projected) == 1); // batch size = 1 - VK_CHECK_COND(graph.size_at(-4, k_cache_tensor) == 1); - VK_CHECK_COND(graph.size_at(-4, v_cache_tensor) == 1); - VK_CHECK_COND( - graph.sizes_of(k_cache_tensor) == graph.sizes_of(v_cache_tensor)); - VK_CHECK_COND( - graph.size_at(-1, q_projected) == - graph.size_at(-1, k_cache_tensor)); // head_dim must match - VK_CHECK_COND( - graph.val_is_none(dropout_p) || - graph.extract_scalar(dropout_p) == 0); - VK_CHECK_COND(graph.val_is_none(scale)); - VK_CHECK_COND( - graph.val_is_none(is_causal) || graph.extract_scalar(is_causal)); - VK_CHECK_COND(graph.val_is_none(attn_mask)); - - if (graph.is_buffer_storage(q_projected)) { - VK_CHECK_COND(graph.is_buffer_storage(k_cache_tensor)); - VK_CHECK_COND(graph.is_buffer_storage(v_cache_tensor)); - VK_CHECK_COND(graph.is_buffer_storage(out)); - } - - // Calculate scale factor - const int32_t head_dim_size = graph.size_at(-1, q_projected); - const 
float scale_val = 1.0f / std::sqrt(static_cast(head_dim_size)); - - // Get number of heads for multi-query attention support - const int32_t num_heads = graph.size_at(-2, q_projected); - const int32_t num_kv_heads = graph.size_at(-2, k_cache_tensor); - - const int32_t block_size_r = 32; // Row block size - const int32_t block_size_c = 32; // Column block size - - // l and m have shape [B, H, N] - std::vector lm_sizes = { - graph.size_at(-4, q_projected), // B (batch) - graph.size_at(-2, q_projected), // H (num heads) - graph.size_at(-3, q_projected) // N (sequence length) - }; - - // t_l stores row-wise normalization sums for softmax computation - // t_m stores row-wise maximum values for numerical stability in softmax - TmpTensor t_l(&graph, lm_sizes, vkapi::kFloat, graph.storage_type_of(out)); - TmpTensor t_m(&graph, lm_sizes, vkapi::kFloat, graph.storage_type_of(out)); - - // Choose kernel name based on storage type - std::string kernel_name = "flash_attention"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_ubos = { - graph.sizes_ubo(q_projected), // Q_sizes - graph.sizes_ubo(k_cache_tensor), // K_sizes - graph.sizes_ubo(v_cache_tensor), // V_sizes - graph.sizes_ubo(out), // O_sizes - graph.sizes_ubo(t_l), // l_sizes - graph.sizes_ubo(t_m), // m_sizes - graph.create_params_buffer(scale_val), // scale - graph.create_params_buffer(block_size_r), // block_size_r - graph.create_params_buffer(block_size_c), // block_size_c - graph.create_params_buffer(input_pos_val), // input_pos - graph.create_params_buffer(num_heads), // num_heads - graph.create_params_buffer(num_kv_heads) // num_kv_heads - }; - - // Create block size references for dispatch calculation - const ValueRef block_size_r_ref = - graph.add_scalar(static_cast(block_size_r)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - flash_attention_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {{out, t_l, t_m}, vkapi::kReadWrite}, - {{q_projected, k_cache_tensor, v_cache_tensor}, vkapi::kRead}, - }, - // Shader param buffers - param_ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {q_projected, block_size_r_ref}, - // Resizing Logic - resize_flash_attention_out)); -} - -utils::uvec3 kv_cache_update_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef cache = args.at(0).refs.at(0); - const ValueRef projected = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(cache)) { - return graph->create_global_wg_size(projected); - } else { - return graph->logical_limits_of(projected); - } -} - -void add_kv_cache_update_node( - ComputeGraph& graph, - const ValueRef input_pos_symint, - const ValueRef projected, - const ValueRef cache) { - std::string kernel_name("kv_cache_update"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(projected)); - add_dtype_suffix(kernel_name, graph.dtype_of(projected)); - - vkapi::ParamsBindList param_ubos; - - if (graph.is_buffer_storage(cache)) { - param_ubos = { - graph.numel_ubo(projected), - graph.strides_ubo(cache), - graph.get_or_create_int_param_buffer(input_pos_symint)}; - } else { - param_ubos = { - graph.logical_limits_ubo(projected), - graph.get_or_create_int_param_buffer(input_pos_symint)}; - } - - 
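A concrete check of the flash-attention dispatch computed by flash_attention_global_wg_size earlier in this file; the shapes below are hypothetical and chosen only to make the arithmetic easy to follow.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical prefill step: batch B = 1, heads H = 32, sequence length
  // N = 128, row block size Br = 32 (the block_size_r constant above).
  const int32_t B = 1, H = 32, N = 128, Br = 32;
  // Number of row blocks, rounded up, exactly as in
  // flash_attention_global_wg_size: Tr = (N + Br - 1) / Br.
  const int32_t Tr = (N + Br - 1) / Br; // 4
  // One invocation per (batch, head, row block): {128, 1, 1}.
  std::cout << "global_wg = {" << B * H * Tr << ", 1, 1}\n";
  return 0;
}
```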
graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - kv_cache_update_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{cache, vkapi::kWrite}, {projected, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -utils::uvec3 attn_weight_scale_and_mask_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef attn_weight = args.at(0).refs.at(0); - - if (graph->is_buffer_storage(attn_weight)) { - return { - graph->size_at(-1, attn_weight), - graph->size_at(-2, attn_weight), - graph->size_at(-3, attn_weight), - }; - } else { - return graph->logical_limits_of(attn_weight); - } -} - -void add_attn_weight_scale_and_mask_node( - ComputeGraph& graph, - const ValueRef input_pos_symint, - const ValueRef q_projected, - const ValueRef attn_weight) { - std::string kernel_name("sdpa_attn_weight_scale_and_mask"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(attn_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(attn_weight)); - - const int32_t head_dim_size = graph.size_at(-1, q_projected); - const float scale_val = 1.0f / std::sqrt(static_cast(head_dim_size)); - - vkapi::ParamsBindList param_ubos; - - if (graph.is_buffer_storage(attn_weight)) { - param_ubos = { - graph.sizes_ubo(attn_weight), - graph.strides_ubo(attn_weight), - graph.create_params_buffer(scale_val)}; - } else { - param_ubos = { - graph.logical_limits_ubo(attn_weight), - graph.get_or_create_int_param_buffer(input_pos_symint), - graph.create_params_buffer(scale_val)}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - attn_weight_scale_and_mask_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{attn_weight, vkapi::kReadWrite}}, - // Shader param buffers - param_ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -std::vector get_cache_slice_sizes( - ComputeGraph& graph, - ValueRef cache, - ValueRef input_pos_symint, - ValueRef q_projected) { - std::vector slice_sizes = graph.sizes_of(cache); - - // Cache slicing will always be in the channels dim - const int32_t input_pos_val = graph.read_symint(input_pos_symint); - const int64_t q_seq_len = graph.size_at(1, q_projected); - slice_sizes.at(1) = input_pos_val + q_seq_len; - return slice_sizes; -} - -void resize_cache_slice_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - std::vector slice_sizes = get_cache_slice_sizes( - *graph, extra_args[0], extra_args[1], extra_args[2]); - - graph->virtual_resize(extra_args[3], slice_sizes); -} - -void add_cache_slice_view_node( - ComputeGraph& graph, - ValueRef cache, - ValueRef input_pos_symint, - ValueRef q_projected, - ValueRef cache_sliced, - const int64_t max_seq_len) { - std::vector slice_sizes = - get_cache_slice_sizes(graph, cache, input_pos_symint, q_projected); - // Initialize the slice to the maximum possible size to start - slice_sizes.at(1) = max_seq_len; - - graph.virtual_resize(cache_sliced, slice_sizes); - - graph.execute_nodes().emplace_back(new ExecuteNode( - resize_cache_slice_view_node, - {cache, input_pos_symint, q_projected, cache_sliced})); -} - -void 
update_cache_impl(ComputeGraph& graph, const std::vector& args) { - int arg_idx = 0; - const ValueRef value = args[arg_idx++]; - const ValueRef cache = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef out = args[arg_idx++]; - - // Unused variables - (void)out; - - VK_CHECK_COND(graph.size_at(-4, value) == 1); - VK_CHECK_COND(graph.size_at(-4, cache) == 1); - VK_CHECK_COND( - graph.size_at(-1, value) == graph.size_at(-1, cache)); - VK_CHECK_COND( - graph.size_at(-2, value) == graph.size_at(-2, cache)); - - add_kv_cache_update_node(graph, input_pos_symint, value, cache); -} - -void sdpa_impl(ComputeGraph& graph, const std::vector& args) { - int arg_idx = 0; - const ValueRef q_projected = args[arg_idx++]; - const ValueRef k_cache = args[arg_idx++]; - const ValueRef v_cache = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef attn_mask = args[arg_idx++]; - const ValueRef dropout_p = args[arg_idx++]; - const ValueRef is_causal = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - - // Output tensors - const ValueRef out = args[arg_idx++]; - - // Batches must be 1 - VK_CHECK_COND(graph.size_at(-4, q_projected) == 1); - VK_CHECK_COND(graph.size_at(-4, k_cache) == 1); - VK_CHECK_COND(graph.size_at(-4, v_cache) == 1); - // k and v projected must have the same shape - VK_CHECK_COND(graph.sizes_of(k_cache) == graph.sizes_of(v_cache)); - // head dim must match between tensors - VK_CHECK_COND( - graph.size_at(-1, q_projected) == - graph.size_at(-1, k_cache)); - // All tensors must have the packed dim be the width (head) dimension - VK_CHECK_COND(graph.packed_dim_of(q_projected) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(k_cache) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(v_cache) == WHCN::kWidthDim); - // Some variables are not supported yet - VK_CHECK_COND( - graph.val_is_none(dropout_p) || - graph.extract_scalar(dropout_p) == 0); - VK_CHECK_COND(graph.val_is_none(scale)); - // is_causal is assumed to be true in the current implementation. 
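The decomposed SDPA path scales the attention logits by 1.0f / sqrt(head_dim) and applies a causal mask offset by input_pos (see add_attn_weight_scale_and_mask_node above). The scalar reference below is a sketch of that step under the assumption that the shader follows the standard causal convention; it is not the shader's actual code.

```cpp
#include <cmath>
#include <limits>
#include <vector>

// Scale each attention logit by 1/sqrt(head_dim) and mask out future key
// positions. Query row r corresponds to absolute position input_pos + r, so
// keys at later positions are set to -inf before the softmax.
void scale_and_mask(
    std::vector<std::vector<float>>& attn_weight, // [seq_len][context_len]
    int head_dim,
    size_t input_pos) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  for (size_t r = 0; r < attn_weight.size(); ++r) {
    for (size_t c = 0; c < attn_weight[r].size(); ++c) {
      attn_weight[r][c] = (c <= r + input_pos)
          ? attn_weight[r][c] * scale
          : -std::numeric_limits<float>::infinity();
    }
  }
}
```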
- VK_CHECK_COND( - graph.val_is_none(is_causal) || graph.extract_scalar(is_causal)); - VK_CHECK_COND(graph.val_is_none(attn_mask)); - - const int32_t max_seq_len = graph.size_at(1, k_cache); - - // Slice caches from 0 to input_pos + sequence_len - const ValueRef k_cache_sliced = graph.add_tensor_view(k_cache); - const ValueRef v_cache_sliced = graph.add_tensor_view(v_cache); - add_cache_slice_view_node( - graph, - k_cache, - input_pos_symint, - q_projected, - k_cache_sliced, - max_seq_len); - add_cache_slice_view_node( - graph, - v_cache, - input_pos_symint, - q_projected, - v_cache_sliced, - max_seq_len); - - // Scalar values for various dims - const ValueRef channels = graph.add_scalar(1); - const ValueRef height = graph.add_scalar(2); - const ValueRef width = graph.add_scalar(3); - - // Repeat interleave - const int64_t num_heads = graph.size_at(2, q_projected); - const int64_t num_kv_heads = graph.size_at(2, k_cache); - - const ValueRef num_repeats = - graph.add_scalar(num_heads / num_kv_heads); - - std::vector cache_slice_repeated_sizes(graph.sizes_of(q_projected)); - cache_slice_repeated_sizes.at(1) = max_seq_len; - - TmpTensor k_cache_sliced_repeated( - &graph, cache_slice_repeated_sizes, graph.dtype_of(k_cache_sliced)); - TmpTensor v_cache_sliced_repeated( - &graph, cache_slice_repeated_sizes, graph.dtype_of(v_cache_sliced)); - - add_repeat_interleave_node( - graph, k_cache_sliced, num_repeats, height, k_cache_sliced_repeated); - add_repeat_interleave_node( - graph, v_cache_sliced, num_repeats, height, v_cache_sliced_repeated); - - // Transpose sequence and head dims - const ValueRef q_transposed = graph.add_tensor_view(q_projected); - const ValueRef k_transposed = graph.add_tensor_view(k_cache_sliced_repeated); - const ValueRef v_transposed = graph.add_tensor_view(v_cache_sliced_repeated); - - add_transpose_view_node(graph, q_projected, channels, height, q_transposed); - add_transpose_view_node( - graph, k_cache_sliced_repeated, channels, height, k_transposed); - add_transpose_view_node( - graph, v_cache_sliced_repeated, channels, height, v_transposed); - - // Transpose K again to prepare for matmul - const ValueRef k_transposed_2 = graph.add_tensor_view(k_transposed); - add_transpose_view_node(graph, k_transposed, height, width, k_transposed_2); - - // Initialize attn_weight to the maximum possible size - std::vector attn_weight_full_sizes = graph.sizes_of(q_transposed); - attn_weight_full_sizes.at(2) = max_seq_len; - attn_weight_full_sizes.at(3) = max_seq_len; - TmpTensor attn_weight( - &graph, attn_weight_full_sizes, graph.dtype_of(q_transposed)); - - // Resize attn_weight to the correct dim - std::vector attn_weight_sizes = attn_weight_full_sizes; - attn_weight_sizes.at(2) = graph.size_at(2, q_transposed); - attn_weight_sizes.at(3) = graph.size_at(2, k_transposed); - graph.virtual_resize(attn_weight, attn_weight_sizes); - - // Calculate attention weight, which is a matmul of Q and K - const ValueRef mat2_is_transposed = graph.add_scalar(false); - add_matmul_node( - graph, q_transposed, k_transposed_2, attn_weight, mat2_is_transposed); - - // Apply scale and mask to the attention weight - add_attn_weight_scale_and_mask_node( - graph, input_pos_symint, q_projected, attn_weight); - - TmpTensor attn_weight_softmax( - &graph, attn_weight_full_sizes, graph.dtype_of(q_transposed)); - graph.virtual_resize(attn_weight_softmax, attn_weight_sizes); - add_softmax_node(graph, attn_weight, width, attn_weight_softmax, false); - - // Calculate final output - const ValueRef out_transposed = 
graph.add_tensor_view(out); - add_transpose_view_node(graph, out, channels, height, out_transposed); - add_matmul_node( - graph, - attn_weight_softmax, - v_transposed, - out_transposed, - mat2_is_transposed); - - graph.execute_nodes().emplace_back( - new ExecuteNode(resize_sdpa_out, {q_projected, out})); -} - -void sdpa_with_kv_cache_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef q_projected = args[arg_idx++]; - const ValueRef k_projected = args[arg_idx++]; - const ValueRef v_projected = args[arg_idx++]; - const ValueRef k_cache_data = args[arg_idx++]; - const ValueRef v_cache_data = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef sequence_len = args[arg_idx++]; - const ValueRef attn_mask = args[arg_idx++]; - const ValueRef dropout_p = args[arg_idx++]; - const ValueRef is_causal = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - - // Output tensors - const ValueRef out = args[arg_idx++]; - - (void)sequence_len; - - const ValueRef k_cache = - prepack_standard_like(graph, k_cache_data, q_projected); - const ValueRef v_cache = - prepack_standard_like(graph, v_cache_data, q_projected); - - update_cache_impl(graph, {k_projected, k_cache, input_pos_symint, -1}); - update_cache_impl(graph, {v_projected, v_cache, input_pos_symint, -1}); - - sdpa_impl( - graph, - {q_projected, - k_cache, - v_cache, - input_pos_symint, - attn_mask, - dropout_p, - is_causal, - scale, - out}); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(sdpa_with_kv_cache.default, sdpa_with_kv_cache_impl); - VK_REGISTER_OP(update_cache.default, update_cache_impl); - VK_REGISTER_OP(llama.custom_sdpa.default, sdpa_impl); - VK_REGISTER_OP(llama.flash_attention.default, flash_attention_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/ScalarTensor.cpp b/backends/vulkan/runtime/graph/ops/impl/ScalarTensor.cpp deleted file mode 100644 index 82fc5c977d3..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/ScalarTensor.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -namespace vkcompute { - -void scalar_tensor(ComputeGraph& graph, const std::vector& args) { - // Extract the scalar value from the first argument - ValueRef scalar_in = args[0]; - float scalar_value = graph.extract_scalar(scalar_in); - - // Get the output tensor reference - ValueRef out = args[args.size() - 1]; - - std::string kernel_name("scalar_tensor"); - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(scalar_in)); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), - // Inputs and Outputs - {{out, vkapi::kWrite}}, - // Shader params buffers - {graph.create_params_buffer(scalar_value)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.scalar_tensor.default, scalar_tensor); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp deleted file mode 100644 index 69d49e8283b..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -namespace vkcompute { - -void resize_select_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - ValueRef out = args.at(0).refs.at(0); - ValueRef in = args.at(1).refs.at(0); - int64_t dim = graph->extract_scalar(extra_args.at(0)); - - int64_t in_ndim = graph->dim_of(in); - - if (dim < 0) { - dim += in_ndim; - } - - std::vector new_out_sizes; - for (int64_t i = 0; i < in_ndim; ++i) { - if (i != dim) { - new_out_sizes.push_back(graph->size_at(i, in)); - } - } - - graph->virtual_resize(out, new_out_sizes); -} - -void check_select_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_ref, - const ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - int64_t index = graph.extract_optional_scalar(index_ref, 0); - int64_t in_ndim = graph.dim_of(in); - - if (dim < 0) { - dim += in_ndim; - } - - VK_CHECK_COND( - dim >= 0 && dim < in_ndim, - "Dimension out of range (expected to be in range of [", - -in_ndim, - ", ", - in_ndim - 1, - "], but got ", - dim, - ")"); - - const int64_t in_size_at_dim = graph.size_at(dim, in); - - if (index < 0) { - index += in_size_at_dim; - } - - VK_CHECK_COND( - index >= 0 && index < in_size_at_dim, - "select(): index ", - index, - " out of range for tensor of size ", - in_size_at_dim, - " at dimension ", - dim); - - // Check that output tensor has correct dimensions - int64_t out_dim = graph.dim_of(out); - VK_CHECK_COND( - out_dim == in_ndim - 1, - "Output tensor dimension mismatch (expected ", - in_size_at_dim - 1, - ", but got ", - out_dim, - ")"); - - // Check that output tensor has correct sizes - int64_t out_idx = 0; - for (int64_t i = 0; i < in_size_at_dim; ++i) { - if (i != dim) { - VK_CHECK_COND( - graph.size_at(out_idx, out) == graph.size_at(i, in), - "Output size mismatch at dimension ", - out_idx, - " 
(expected ", - graph.size_at(i, in), - ", but got ", - graph.size_at(out_idx, out), - ")"); - out_idx++; - } - } -} - -/** - * Adds a select operation node to the compute graph. - * - * The select operator extracts a slice from a tensor along a specified - * dimension at a given index. It effectively reduces the dimensionality of the - * input tensor by one, by selecting a single slice at the specified index along - * the given dimension. For example, if input is a 3D tensor with shape [2,3,4] - * and we select dimension 1, index 2, the output will be a 2D tensor with shape - * [2,4]. - */ -void add_select_copy_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_ref, - const ValueRef out) { - check_select_args(graph, in, dim_ref, index_ref, out); - - add_transfer_copy_node( - graph, - TransferType::SELECT, - in, - dim_ref, - index_ref, - kDummyValueRef, - kDummyValueRef, - out, - {dim_ref, index_ref}, - resize_select_node); -} - -void select_int(ComputeGraph& graph, const std::vector& args) { - return add_select_copy_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.select.int, select_int); - VK_REGISTER_OP(aten.select_copy.int, select_int); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp deleted file mode 100644 index 67d714d10aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -inline int64_t normalize_idx( - const int64_t index, - const int64_t max, - const int64_t default_value) { - // INT64_MAX is passed when value is unspecified - if (index == INT64_MAX) { - return default_value; - } - if (index == default_value) { - return index; - } - return normalize(index, max); -} - -void resize_slice_copy_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - ValueRef out_ref = args.at(0).refs.at(0); - ValueRef in_ref = args.at(1).refs.at(0); - - int64_t dim = graph->extract_scalar(extra_args.at(0)); - std::optional opt_start = - graph->extract_optional_scalar(extra_args.at(1)); - std::optional opt_end = - graph->extract_optional_scalar(extra_args.at(2)); - int64_t step = graph->extract_scalar(extra_args.at(3)); - - // Normalize dim - if (dim < 0) { - dim += graph->dim_of(in_ref); - } - - const std::vector in_sizes = graph->sizes_of(in_ref); - int64_t dim_size = in_sizes.at(dim); - - int64_t start = opt_start.value_or(0); - int64_t end = opt_end.value_or(dim_size); - - // Normalize start and end indices - start = normalize_idx(start, dim_size, 0); - end = normalize_idx(end, dim_size, dim_size); - - // Calculate output size - std::vector new_out_sizes = in_sizes; - new_out_sizes.at(dim) = (end - start + step - 1) / step; // Ceiling division - - graph->virtual_resize(out_ref, new_out_sizes); -} - -/** - * Adds a slice_copy operation node to the compute graph. - * - * The slice operator extracts a portion of a tensor along a specified - * dimension. It creates a new tensor that contains a subset of the input - * tensor's data, defined by start, end, and step parameters along the given - * dimension. 
- * - * For example, if input is a tensor with shape [4,5,6] and we slice along - * dimension 1 with start=1, end=4, step=2, the output will have shape [4,2,6], - * containing elements from the input at positions 1 and 3 along dimension 1. - */ -void add_slice_copy_node( - ComputeGraph& graph, - ValueRef in, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef step_ref, - ValueRef out) { - add_transfer_copy_node( - graph, - TransferType::SLICE, - in, - dim_ref, - opt_start_ref, - opt_end_ref, - step_ref, - out, - {dim_ref, opt_start_ref, opt_end_ref, step_ref}, - resize_slice_copy_node); -} - -std::vector get_slice_sizes( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref) { - const int64_t dim = graph.extract_scalar(dim_ref); - std::optional opt_start = - graph.extract_optional_scalar(opt_start_ref); - std::optional opt_end = - graph.extract_optional_scalar(opt_end_ref); - - int64_t dim_size = graph.size_at(dim, in_ref); - int64_t start = opt_start.value_or(0); - int64_t end = opt_end.value_or(dim_size); - - start = normalize_idx(start, dim_size, 0); - end = normalize_idx(end, dim_size, dim_size); - - std::vector new_out_sizes = graph.sizes_of(in_ref); - new_out_sizes.at(dim) = end - start; - - return new_out_sizes; -} - -void resize_slice_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - ValueRef out_ref = extra_args.at(0); - - std::vector new_out_sizes = get_slice_sizes( - *graph, - extra_args.at(1), // input - extra_args.at(2), // dim - extra_args.at(3), // optional start - extra_args.at(4)); // optional end - - graph->virtual_resize(out_ref, new_out_sizes); -} - -void check_slice_view_args( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef opt_step_ref, - ValueRef out_ref) { - VK_CHECK_COND( - graph.val_is_view_of(out_ref, in_ref), - "output must be a view of the input"); - - const int64_t dim = graph.extract_scalar(dim_ref); - const int64_t dim_size = graph.size_at(dim, in_ref); - - int64_t start = - graph.extract_optional_scalar(opt_start_ref).value_or(0); - int64_t end = graph.extract_optional_scalar(opt_end_ref).value_or(0); - int64_t step = - graph.extract_optional_scalar(opt_step_ref).value_or(1); - - start = normalize_idx(start, dim_size, 0); - end = normalize_idx(end, dim_size, dim_size); - - // The start idx must be 0; this is to ensure that the start of the slice view - // does not have any offset with respect to the base buffer storage. If the - // offset is nonzero, then it will potentially change upon a resize; however - // the buffer offset of the view tensor will have been "locked in" when the - // descriptor for its buffer storage is bound to a compute shader. Therefore - // there is no way to update the offset of the view once it has been bound. - VK_CHECK_COND(start == 0, "start must be 0 for slice view"); - VK_CHECK_COND(step == 1, "step must be 1 for slice view"); - - VK_CHECK_COND( - end < dim_size, "end must be less than dim size for slice view"); - - // We must also check that all earlier dims in the dim order have a size of 1. - // This ensures that the slice view encompasses a contiguous memory region of - // the source tensor's memory buffer. 
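The dim-order condition enforced just below can be restated as a standalone predicate; the helper name and the example values here are hypothetical, but the logic mirrors the loop that follows.

```cpp
#include <cstdint>
#include <vector>

// A zero-offset slice view along `dim` only covers a contiguous region of
// the source buffer when every dimension that precedes `dim` in the dim
// order has size 1. Hypothetical helper mirroring the check below.
bool slice_view_is_contiguous(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order,
    int64_t dim) {
  for (size_t i = 0; i < dim_order.size(); ++i) {
    if (dim_order[i] == dim) {
      return true; // reached the sliced dim; all outer dims were size 1
    }
    if (sizes[dim_order[i]] != 1) {
      return false; // an outer dim has extent > 1, slice would be strided
    }
  }
  return false; // dim not present in the dim order
}

// e.g. sizes {1, 1, 8, 32} with dim_order {0, 1, 2, 3}: slicing dim 2 is
// allowed (dims 0 and 1 have size 1), but slicing dim 3 is rejected because
// dim 2 has size 8 and precedes it in the dim order.
```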
- std::vector in_sizes = graph.sizes_of(in_ref); - std::vector in_dim_order = graph.dim_order_of(in_ref); - for (int i = 0; i < in_dim_order.size(); ++i) { - if (in_dim_order[i] == dim) { - break; - } - VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1); - } -} - -void add_slice_view_node( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef opt_step_ref, - ValueRef out_ref) { - check_slice_view_args( - graph, - in_ref, - dim_ref, - opt_start_ref, - opt_end_ref, - opt_step_ref, - out_ref); - - std::vector new_out_sizes = - get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); - - graph.virtual_resize(out_ref, new_out_sizes); - - graph.execute_nodes().emplace_back(new ExecuteNode( - resize_slice_view_node, - {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); -} - -void slice_copy(ComputeGraph& graph, const std::vector& args) { - return add_slice_copy_node( - graph, - args.at(0), - args.at(1), // dim - args.at(2), // optional start - args.at(3), // optional end - args.at(4), // step - args.at(5)); -} - -void slice(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args.at(0); - ValueRef out = args.at(5); - - // Special case if out is a view of in - if (graph.val_is_view_of(out, in)) { - add_slice_view_node( - graph, - in, - args.at(1), // dim - args.at(2), // optional start - args.at(3), // optional end - args.at(4), // step - out); - return; - } - - add_slice_copy_node( - graph, - in, - args.at(1), // dim - args.at(2), // optional start - args.at(3), // optional end - args.at(4), // step - out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.slice_copy.Tensor, slice_copy); - VK_REGISTER_OP(aten.slice.Tensor, slice); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.h b/backends/vulkan/runtime/graph/ops/impl/Slice.h deleted file mode 100644 index 220066ff1bb..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_slice_view_node( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef opt_step_ref, - ValueRef out_ref); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp deleted file mode 100644 index 5e645e29e3d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -using namespace utils; - -utils::uvec3 pick_softmax_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef out = args.at(0).refs.at(0); - const int32_t reduce_dim_xyz = - graph->extract_scalar(resize_args.at(1)); - - utils::uvec3 global_size = graph->logical_limits_of(out); - global_size[reduce_dim_xyz] = 1; - return global_size; -} - -utils::uvec3 pick_softmax_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)global_workgroup_size; - (void)args; - - const int64_t group_dim_xyz = - graph->extract_scalar(resize_args.at(2)); - - const int32_t reduce_dim_xyz = - graph->extract_scalar(resize_args.at(1)); - - // These values are hardcoded in add_softmax_node - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim_xyz] = nworkers_per_group; - local_wg_size[group_dim_xyz] = ngroups; - - return local_wg_size; -} - -void resize_softmax_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -void add_softmax_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef out, - bool log_softmax) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan softmax only supports texture storage"); - - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim_nchw = graph.extract_scalar(dim_ref); - reduce_dim_nchw = normalize(reduce_dim_nchw, ndim); - const int32_t reduce_dim_xyz = nchw_dim_to_whcn_dim(reduce_dim_nchw, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND( - graph.concat_dim_of(in) != reduce_dim_xyz, - "Softmax shader currently does not support concat dim == reduce dim"); - VK_CHECK_COND( - graph.concat_dim_of(out) != reduce_dim_xyz, - "Softmax shader currently does not support concat dim == reduce dim"); - } - - vkapi::ShaderInfo shader_descriptor; - std::string kernel_name = "softmax"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - if (log_softmax) { - kernel_name = "log_" + kernel_name; - } - - // This should match the value of MAX_NTHREADS in the softmax shader. 
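Putting the pieces together, the dispatch sizing works out as in the sketch below: the reduction axis is collapsed in the global size by pick_softmax_global_wg_size above, while the local size assigns the per-group workers along the reduction axis and spreads groups along whichever remaining axis is larger, as selected later in this function. Helper name and example values are hypothetical; the constants match those declared next.

```cpp
#include <array>
#include <cstdint>

struct SoftmaxWgSizes {
  std::array<uint32_t, 3> global;
  std::array<uint32_t, 3> local;
};

// Hypothetical standalone restatement of the workgroup sizing logic.
SoftmaxWgSizes softmax_wg_sizes(
    std::array<uint32_t, 3> limits, int reduce_dim_xyz) {
  constexpr uint32_t nworkers_per_group = 4; // workers cooperating per reduction
  constexpr uint32_t ngroups = 4;            // reductions handled per workgroup

  std::array<uint32_t, 3> global = limits;
  global[reduce_dim_xyz] = 1; // one workgroup column per reduced row

  const int other1 = (reduce_dim_xyz + 1) % 3;
  const int other2 = (reduce_dim_xyz + 2) % 3;
  const int group_dim = limits[other1] > limits[other2] ? other1 : other2;

  std::array<uint32_t, 3> local = {1u, 1u, 1u};
  local[reduce_dim_xyz] = nworkers_per_group;
  local[group_dim] = ngroups;
  return {global, local};
}

// e.g. limits {64, 32, 8} with reduction along x (width) gives
// global {1, 32, 8} and local {4, 4, 1}, i.e. 16 threads per workgroup,
// which stays within the MAX_NTHREADS budget noted above.
```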
- constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - // Determine the group dimension - const int other_dim_1 = (reduce_dim_xyz + 1) % 3; - const int other_dim_2 = (reduce_dim_xyz + 2) % 3; - int32_t group_dim; - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { - group_dim = other_dim_1; - } else { - group_dim = other_dim_2; - } - - const ValueRef reduce_dim_xyz_ref = - graph.get_or_add_value_for_int(reduce_dim_xyz); - const ValueRef group_dim_xyz_ref = graph.get_or_add_value_for_int(group_dim); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - pick_softmax_global_wg_size, - pick_softmax_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(out), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim_xyz, group_dim}, - // Resize Args - {dim_ref, reduce_dim_xyz_ref, group_dim_xyz_ref}, - // Resizing Logic - resize_softmax_node)); -} - -void softmax(ComputeGraph& graph, const std::vector& args) { - // args[1] bool half_to_float is unused - return add_softmax_node( - graph, args[0], args[1], args[3], /* log_softmax = */ false); -} - -void log_softmax(ComputeGraph& graph, const std::vector& args) { - // args[1] bool half_to_float is unused - return add_softmax_node( - graph, args[0], args[1], args[3], /* log_softmax = */ true); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._softmax.default, softmax); - VK_REGISTER_OP(aten._log_softmax.default, log_softmax); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.h b/backends/vulkan/runtime/graph/ops/impl/Softmax.h deleted file mode 100644 index 58fcfb93404..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_softmax_node( - ComputeGraph& graph, - ValueRef in, - ValueRef dim, - ValueRef out, - bool log_softmax); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp deleted file mode 100644 index f87af08ee69..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - const std::vector& split_sizes, - int64_t dim, - ValueRef out_list_ref) { - const ValueListPtr out_list = graph.get_value_list(out_list_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? 
static_cast(dim) - : static_cast(dim - input_ndim); - - VK_CHECK_COND(out_list->size() == split_sizes.size()); - - for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { - const int64_t split_size = split_sizes.at(split_idx); - const ValueRef out_ref = out_list->at(split_idx); - - VK_CHECK_COND(dim_at(graph.sizes_of(out_ref), dim_index) == split_size); - } - - const auto packed_dim = graph.packed_dim_of(in); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); - - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_splitting_channel = (dim_index == kChannel4D); - - // if splitting channels - if (is_splitting_channel) { - // set source offset w as channel size of the input tensor - src_offset[3] = dim_at(graph.sizes_of(in), kChannel4D); - } - - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. - const auto out_channel_size = dim_at(graph.sizes_of(out_ref), kChannel4D); - const utils::ivec3 range = graph.logical_limits_of(out_ref); - - if (dim_index == packed_dim_index) { - // if splitting channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_splitting_channel) { - add_copy_channel_offset_node( - graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); - src_offset[dim_xyz_index] += out_channel_size; - } else { - // dst_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set destination offset w as channel size of the output tensor if - // splitting channel - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset[dim_xyz_index] += - dim_at(graph.sizes_of(out_ref), packed_dim_index); - } - } else { - // set destination offset w as channel size of the output tensor if - // splitting channels - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); - src_offset[dim_xyz_index] += - is_splitting_channel ? out_channel_size : range[dim_xyz_index]; - } - } -} - -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_sizes_ref, - ValueRef dim_ref, - ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); - - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} - -void split_with_sizes_copy_default( - ComputeGraph& graph, - const std::vector& args) { - add_split_with_sizes_default_node(graph, args[0], args[1], args[2], args[3]); -} - -void add_split_tensor_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_size_ref, - ValueRef dim_ref, - ValueRef out) { - const int64_t split_size = graph.extract_scalar(split_size_ref); - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? 
static_cast(dim) - : static_cast(dim - input_ndim); - const int64_t size = dim_at(graph.sizes_of(in), dim_index); - const std::vector split_sizes(size / split_size, split_size); - - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} - -void split_tensor(ComputeGraph& graph, const std::vector& args) { - add_split_tensor_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - aten.split_with_sizes_copy.default, split_with_sizes_copy_default); - VK_REGISTER_OP(aten.split.Tensor, split_tensor); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp deleted file mode 100644 index 13801b45cc7..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void add_squeeze_copy_dims_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dims_ref, - const ValueRef out) { - const int64_t in_dim = graph.dim_of(in); - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(in); - - const std::vector dims = graph.extract_int_or_symint_list(dims_ref); - std::vector squeeze_dims; - // Filter out edge cases that we don't need squeeze: - // 1. The size of squeeze dim is larger than 1. - // 2. Squeeze outter most dim - // For these cases, just pass input to output via clone. - for (int i = 0; i < dims.size(); ++i) { - if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) { - squeeze_dims.push_back(dims.at(i)); - } - } - if (squeeze_dims.size() == 0) { - add_clone_node(graph, in, out); - } else { - std::vector permute_dims(in_dim); - for (int i = 0; i < in_dim; ++i) { - permute_dims.at(i) = i; - } - for (auto& elem : squeeze_dims) { - auto it = std::find(permute_dims.begin(), permute_dims.end(), elem); - VK_CHECK_COND( - it != permute_dims.end(), "Squeeze dim not found in permute_dims"); - std::rotate(permute_dims.begin(), it, it + 1); - } - - const ValueRef permute_dims_ref = - graph.add_scalar_list(std::vector(permute_dims)); - add_permute_node(graph, in, permute_dims_ref, out); - } -} - -void resize_squeeze_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const ValueRef dims_ref = extra_args.at(0); - - const IntListPtr dims = graph->get_int_list(dims_ref); - - std::vector out_sizes = graph->sizes_of(in); - - // Remove the dimensions specified in dims if their size is 1 - for (int64_t dim : *dims) { - if (dim >= 0 && dim < static_cast(out_sizes.size()) && - out_sizes[dim] == 1) { - out_sizes.erase(out_sizes.begin() + dim); - // After erasing, all subsequent dims shift left by one - // So we need to decrement all subsequent dims in dims - for (auto& d : *dims) { - if (d > dim) { - --d; - } - } - } - } - - graph->virtual_resize(out, out_sizes); -} - -void squeeze_copy_dims(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef dims = args.at(idx++); - const ValueRef out = args.at(idx++); - - std::vector resize_args = {dims}; - - if (graph.is_buffer_storage(in)) 
{ - return add_view_copy_buffer_node( - graph, in, out, resize_args, resize_squeeze_node); - } - return add_squeeze_copy_dims_node(graph, in, dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp deleted file mode 100644 index 6cd5115563a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include -#include - -#include -#include - -namespace vkcompute { - -void add_staging_to_tensor_node( - ComputeGraph& graph, - const ValueRef in_staging, - const ValueRef out_tensor) { - VK_CHECK_COND(graph.val_is_staging(in_staging)); - - vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( - graph, out_tensor, graph.int8_buffers_enabled()); - - vkapi::ParamsBindList param_buffers = {}; - if (graph.is_buffer_storage(out_tensor)) { - param_buffers.append(graph.buffer_meta_ubo(out_tensor)); - } - - std::vector pcs; - if (graph.is_texture_storage(out_tensor)) { - pcs = {graph.sizes_pc_of(out_tensor)}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - default_pick_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}}, - // Parameter Buffers - param_buffers, - // Push Constants - pcs, - // Specialization Constants - {graph.hashed_layout_of(out_tensor)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -const std::string kBitw8PrefixStr = "bitw8_image_to_nchw_nobitw8buffer"; - -bool is_bitw8_shader(const vkapi::ShaderInfo& shader) { - const auto size = kBitw8PrefixStr.size(); - const std::string& shader_prefix_str = shader.kernel_name.substr(0, size); - return shader_prefix_str == kBitw8PrefixStr; -} - -vkapi::ShaderInfo get_tensor_to_staging_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef in_tensor = args.at(1).refs.at(0); - return get_tensor_to_nchw_shader( - *graph, in_tensor, graph->int8_buffers_enabled()); -} - -utils::uvec3 tensor_to_staging_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef in_tensor = args.at(1).refs.at(0); - const ValueRef out_staging = args.at(0).refs.at(0); - - utils::uvec3 global_wg_size = graph->create_global_wg_size(in_tensor); - - // Normally, the image_to_nchw shader is structured so that each thread reads - // one texel from the input texture and writes each component of the texel - // into the corresponding location in the output buffer. However, this shader - // is structured slightly differently in that each thread writes out a - // complete 32 bit integer (containing 4 packed 8-bit integers) into the - // output buffer. Therefore, the global work group size for this shader will - // be the number of elements in the output buffer divided by 4, as opposed to - // the extents of the input texture. 
- if (is_bitw8_shader(shader)) { - const uint32_t buffer_len = utils::safe_downcast( - graph->get_staging(out_staging)->numel() / 4); - global_wg_size = {buffer_len, 1, 1}; - } - - return global_wg_size; -} - -void add_tensor_to_staging_node( - ComputeGraph& graph, - const ValueRef in_tensor, - const ValueRef out_staging) { - VK_CHECK_COND(graph.val_is_staging(out_staging)); - - vkapi::ShaderInfo shader = - get_tensor_to_nchw_shader(graph, in_tensor, graph.int8_buffers_enabled()); - - vkapi::ParamsBindList param_buffers = {}; - if (graph.is_buffer_storage(in_tensor)) { - param_buffers.append(graph.buffer_meta_ubo(in_tensor)); - } - - std::vector pcs; - if (graph.is_texture_storage(in_tensor)) { - pcs = {graph.sizes_pc_of(in_tensor)}; - } - - if (is_bitw8_shader(shader)) { - pcs.push_back(graph.numel_pc_of(in_tensor)); - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - tensor_to_staging_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}}, - // Parameter Buffers - param_buffers, - // Push Constants - pcs, - // Specialization Constants - {graph.hashed_layout_of(in_tensor)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_prepack_standard_node( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef tensor, - const bool transpose_hw = false) { - vkapi::ShaderInfo shader = - get_nchw_to_tensor_shader(graph, tensor, graph.int8_buffers_enabled()); - - vkapi::ParamsBindList param_buffers = {}; - if (graph.is_buffer_storage(tensor)) { - param_buffers.append(graph.buffer_meta_ubo(tensor)); - } - - std::vector pcs; - if (graph.is_buffer_storage(tensor)) { - pcs = { - graph.sizes_pc_of(tensor), - graph.strides_pc_of(tensor), - graph.numel_pc_of(tensor)}; - } else { - pcs = {graph.sizes_pc_of(tensor)}; - } - - int transpose_hw_spec = transpose_hw ? 
1 : 0; - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(tensor), - graph.create_local_wg_size(tensor), - // Input and Outputs - tensor_data, - tensor, - // Parameter Buffers - param_buffers, - // Specialization Constants - {graph.hashed_layout_of(tensor), transpose_hw_spec}, - pcs)); -} - -ValueRef prepack_standard( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough, - const utils::AxisMapLayout axis_map_layout) { - if (passthrough && graph.val_is_tensor(tensor_data)) { - return tensor_data; - } - VK_CHECK_COND(graph.val_is_tref(tensor_data)); - ValueRef tensor = - graph.add_tensor_like(tensor_data, storage_type, layout, axis_map_layout); - add_prepack_standard_node(graph, tensor_data, tensor); - return tensor; -} - -ValueRef prepack_standard_hw_transposed( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough, - const utils::AxisMapLayout axis_map_layout) { - (void)passthrough; - - VK_CHECK_COND(graph.val_is_tref(tensor_data)); - std::vector new_out_sizes = graph.sizes_of(tensor_data); - const int w_dim = new_out_sizes.size() - 1; - const int h_dim = new_out_sizes.size() - 2; - const int64_t tmp = new_out_sizes.at(w_dim); - new_out_sizes.at(w_dim) = new_out_sizes.at(h_dim); - new_out_sizes.at(h_dim) = tmp; - ValueRef tensor = graph.add_tensor( - new_out_sizes, - graph.dtype_of(tensor_data), - storage_type, - layout, - -1, - axis_map_layout); - add_prepack_standard_node(graph, tensor_data, tensor, true); - return tensor; -} - -ValueRef prepack_standard_like( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef to_copy, - const bool passthrough) { - VK_CHECK_COND(graph.val_is_tensor(to_copy)); - return prepack_standard( - graph, - tensor_data, - graph.storage_type_of(to_copy), - graph.estimate_memory_layout_of(to_copy), - passthrough); -} - -void add_prepack_direct_copy_buffer_node( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef tensor) { - std::string kernel_name = "buffer_to_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(tensor_data)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList ubos; - ubos.append({graph.numel_ubo(tensor)}); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(tensor), - graph.create_local_wg_size(tensor), - // Input and Outputs - tensor_data, - tensor, - // Parameter Buffers - ubos, - // Specialization Constants - {})); -} - -ValueRef prepack_direct_copy_buffer( - ComputeGraph& graph, - const ValueRef tensor_data) { - VK_CHECK_COND(graph.val_is_tref(tensor_data)); - ValueRef tensor = - graph.add_tensor_like(tensor_data, utils::kBuffer, utils::kWidthPacked); - add_prepack_direct_copy_buffer_node(graph, tensor_data, tensor); - return tensor; -} - -ValueRef prepack_int4_linear_weight_transposed_interleaved( - ComputeGraph& graph, - const ValueRef qmat2_data) { - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - const int64_t K = qmat2_orig_sizes.at(ndim - 1) * 2; - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - const int64_t N_div2 = N / int64_t(2); - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (N_div2 > 
max_extent * 4 || K > max_extent) { - storage_type = utils::kBuffer; - } - - std::vector qmat2_sizes{K, N_div2}; - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kByte, storage_type, utils::kWidthPacked); - - utils::uvec3 global_wg_size; - global_wg_size = graph.logical_limits_of(qmat2); - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(2)); - - std::string kernel_name = - graph.context()->adapter_ptr()->has_full_int8_buffers_support() - ? "pack_int4_linear_weight_transposed_interleaved" - : "pack_int4_linear_weight_transposed_interleaved_nobitw8buffer"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2)})); - - return qmat2; -} - -ValueRef prepack_int4_linear_weight_transposed_block_4x8( - ComputeGraph& graph, - const ValueRef qmat2_data) { - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - const int64_t K_div2 = qmat2_orig_sizes.at(ndim - 1); // Input is [N, K/2] - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - // Logical K dimension. Each value in the tensor is a uint8 that contains 2 - // packed 4-bit values. - const int64_t K = K_div2 * 2; - - // This packing format partitions the weight tensor into 4 wide x 8 high - // blocks. To figure out the size of the output tensor, determine the number - // of blocks along the width and height dims. - const int64_t num_blocks_K = utils::div_up(K, int64_t(4)); - const int64_t num_blocks_N = utils::div_up(N, int64_t(8)); - // Each transposed block is 8 wide x 4 high. In terms of 8-bit values, the - // block is 4 wide x 4 high. To maximize memory loading efficiency, the packed - // weight tensor will use a base data type of uint32_t; in terms of uint32_t, - // each block is 1 wide x 4 high. However, each block is also flattened as it - // is stored, so that the whole block can be loaded at once. As a result, the - // stored block will be 4 wide x 1 high. 
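The output sizes computed next follow directly from that block layout. As a standalone check of the arithmetic (helper name and example values are hypothetical):

```cpp
#include <cstdint>
#include <utility>

// Input weights are [N, K/2] uint8 (two 4-bit values per byte); the packed
// output is a uint32 tensor of size {num_blocks_N, num_blocks_K * 4}.
std::pair<int64_t, int64_t> packed_int4_block_4x8_sizes(int64_t N, int64_t K) {
  const int64_t num_blocks_K = (K + 3) / 4; // blocks are 4 wide along K
  const int64_t num_blocks_N = (N + 7) / 8; // blocks are 8 high along N
  const int64_t output_width = num_blocks_K * 4; // each block flattens to 4 uint32s
  const int64_t output_height = num_blocks_N;    // one output row per block row
  return {output_height, output_width};
}

// e.g. N = 4096, K = 4096 gives a packed size of {512, 4096}; since each
// uint32 holds eight 4-bit values, total storage is unchanged at N * K / 2
// bytes.
```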
- const int64_t output_width = num_blocks_K * 4; - const int64_t output_height = num_blocks_N; - - // Store the original sizes of the tensor to pass to the shader - utils::ivec2 orig_sizes{ - utils::safe_downcast(K), utils::safe_downcast(N)}; - - std::vector qmat2_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kUInt, storage_type, utils::kWidthPacked); - - // Global workgroup size: each thread writes out two adjacent blocks - utils::uvec3 global_wg_size{ - utils::div_up(utils::safe_downcast(num_blocks_K), uint32_t(2)), - utils::safe_downcast(num_blocks_N), - 1u}; - - std::string kernel_name = "pack_int4_linear_weight_transposed_block_4x8"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec2))})); - - return qmat2; -} - -void prepack_op(ComputeGraph& graph, const std::vector& args) { - return add_prepack_standard_node(graph, args[0], args[1]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.prepack.default, prepack_op); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h deleted file mode 100644 index 0b1568ca139..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// -// Staging Buffer <-> Tensor -// - -void add_staging_to_tensor_node( - ComputeGraph& graph, - const ValueRef in_staging, - const ValueRef out_tensor); - -void add_tensor_to_staging_node( - ComputeGraph& graph, - const ValueRef in_tensor, - const ValueRef out_staging); - -// -// Standard Prepack -// - -/* - * Given that `v` is a `TensorRef`, create a new `Tensor` value with the - * specified `storage_type` and `memory_layout`, and add a a prepacking node to - * transfer the `TensorRef` data to the new `Tensor` object via a staging to - * tensor shader. The created `Tensor` value is then returned. - * - * If `passthrough` is `true`, then `v` may be a `Tensor` as well. If `v` is a - * `Tensor`, then it is returned as-is. If `passthrough` is `false` (default), - * then an exception will be thrown. - */ - -ValueRef prepack_standard( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough = false, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - -/* - * Same as prepack_standard, but transpose the height and width dimensions of - * the tensor while packing. 
- */ -ValueRef prepack_standard_hw_transposed( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough = false, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - -/* - * Equivalent to `prepack_standard()` function, except the `storage_type` and - * `memory_layout` are set to match `to_copy`, which must be a `Tensor`. - */ -ValueRef prepack_standard_like( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef to_copy, - const bool passthrough = false); - -// -// Direct buffer copy prepack -// - -/* - * Given that `v` is a `TensorRef`, create a new `Tensor` value with buffer - * storage and `kWidthPacked` memory layout, and add a prepacking node to - * transfer the `TensorRef` data to the new `Tensor` object via a direct buffer - * to buffer copy shader. - */ -ValueRef prepack_direct_copy_buffer( - ComputeGraph& graph, - const ValueRef tensor_data); - -// -// Op specific prepack functions -// - -ValueRef prepack_int4_linear_weight_transposed_interleaved( - ComputeGraph& graph, - const ValueRef qmat2_data); - -ValueRef prepack_int4_linear_weight_transposed_block_4x8( - ComputeGraph& graph, - const ValueRef qmat2_data); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp b/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp deleted file mode 100644 index f07522d2578..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -namespace vkcompute { - -// -// sym_size -// - -void sym_size_impl(ComputeGraph* graph, const std::vector& args) { - const ValueRef in_tensor = args.at(0); - const ValueRef dim = args.at(1); - const ValueRef out_symint = args.at(2); - - const int64_t dim_val = graph->extract_scalar(dim); - const int64_t size_at_dim = graph->size_at(dim_val, in_tensor); - - graph->set_symint(out_symint, static_cast(size_at_dim)); -} - -void resize_sym_size_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)args; // Unused parameter - sym_size_impl(graph, resize_args); -} - -/* - * This operator takes a tensor and an integer dimension as inputs, and produces - * a symint as output. The symint's value is the size of the tensor at the - * specified dimension. - */ -void sym_size_int(ComputeGraph& graph, const std::vector& args) { - sym_size_impl(&graph, args); - - graph.execute_nodes().emplace_back( - new ExecuteNode(resize_sym_size_node, args)); -} - -// -// binary operators -// - -void sym_add_impl(ComputeGraph* graph, const std::vector& args) { - const ValueRef a = args.at(0); - const ValueRef b = args.at(1); - const ValueRef out = args.at(2); - - const int32_t a_val = graph->read_symint(a); - const int32_t b_val = graph->read_symint(b); - const int32_t result = a_val + b_val; - - graph->set_symint(out, result); -} - -void resize_sym_add_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)args; // Unused parameter - sym_add_impl(graph, resize_args); -} - -/* - * This operator takes two symints as inputs and produces a symint as output. - * The output symint's value is the sum of the two input symints. 
- */ -void sym_add(ComputeGraph& graph, const std::vector& args) { - sym_add_impl(&graph, args); - - graph.execute_nodes().emplace_back( - new ExecuteNode(resize_sym_add_node, args)); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(sym_size.int, sym_size_int); - VK_REGISTER_OP(add, sym_add); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp deleted file mode 100644 index 687b3923354..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -namespace vkcompute { - -using namespace utils; - -void resize_tan_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - const std::vector self_sizes = graph->sizes_of(self); - graph->virtual_resize(out, self_sizes); -} - -void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) { - std::string kernel_name = "tan"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - vkapi::ParamsBindList ubos({}); - ubos.append({graph.logical_limits_ubo(out)}); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_tan_node)); -} - -void tan(ComputeGraph& graph, const std::vector& args) { - return add_tan_node(graph, args[0], args[1]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.tan.default, tan); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp b/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp deleted file mode 100644 index b7e0218823a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void resize_to_copy_op_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - graph->virtual_resize(out, graph->sizes_of(self)); -} - -void add_to_copy_node(ComputeGraph& graph, ValueRef in, ValueRef out) { - static std::set supported_types = { - vkapi::ScalarType::Float, vkapi::ScalarType::Half}; - - VK_CHECK_COND( - supported_types.find(graph.dtype_of(in)) != supported_types.end() && - supported_types.find(graph.dtype_of(out)) != supported_types.end(), - "Unsupported dtype for to_copy, only Float and Half are currently supported, recieved ", - vkapi::to_string(graph.dtype_of(in)), - " <-> ", - vkapi::to_string(graph.dtype_of(out))); - - graph.execute_nodes().emplace_back(new BlitNode(graph, in, out)); -} - -void to_copy(ComputeGraph& graph, const std::vector& args) { - return add_to_copy_node(graph, args[0], args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._to_copy.default, to_copy); -} -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp deleted file mode 100644 index 60127ecf9bd..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -/** - * Adds a transfer copy operation node to the compute graph. - * This function handles both SELECT and SLICE operations based on the - * transfer_type parameter. - */ -void add_transfer_copy_node( - ComputeGraph& graph, - TransferType transfer_type, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_or_start_ref, - const ValueRef end_ref, - const ValueRef step_ref, - const ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn) { - int64_t ndim = graph.dim_of(in); - int64_t dim = graph.extract_scalar(dim_ref); - - if (dim < 0) { - dim += ndim; - } - - int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, ndim); - - struct TransferParams { - int32_t dim; - int32_t index_or_start_ref; - int32_t step_ref; - } transfer_params{static_cast(dim_whcn), 0, 0}; - - const bool param_is_scalar = graph.is_scalar_or_none(index_or_start_ref) && - (transfer_type == TransferType::SELECT || - graph.is_scalar_or_none(step_ref)); - - vkapi::ParamsBindList param_buffers; - if (!param_is_scalar) { - if (transfer_type == TransferType::SELECT) { - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0)}; - } else { // TransferType::SLICE - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0), - graph.get_or_create_int_param_buffer(step_ref, 1)}; - } - } else { - transfer_params.index_or_start_ref = - graph.extract_scalar_or(index_or_start_ref, 0); - if (transfer_type != TransferType::SELECT) { - transfer_params.step_ref = graph.extract_scalar_or(step_ref, 1); - } - } - - std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(out) ? 
5 : 3); - - if (graph.is_buffer_storage(out)) { - push_constants.emplace_back(graph.sizes_pc_of(in)); - push_constants.emplace_back(graph.strides_pc_of(out)); - push_constants.emplace_back(graph.strides_pc_of(in)); - push_constants.emplace_back(graph.numel_pc_of(out)); - } else { - push_constants.emplace_back(graph.sizes_pc_of(out)); - push_constants.emplace_back(graph.sizes_pc_of(in)); - } - - if (param_is_scalar) { - push_constants.emplace_back(&transfer_params, sizeof(transfer_params)); - } else { - push_constants.emplace_back( - &transfer_params.dim, sizeof(transfer_params.dim)); - } - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; - - // Determine the shader directly - std::string kernel_name; - if (transfer_type == TransferType::SELECT) { - kernel_name = "select"; - } else { // TransferType::SLICE - kernel_name = "slice"; - } - if (!param_is_scalar) { - kernel_name += "_ubo"; - } - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // Create and add the dispatch node - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - resize_args, - // Resizing Logic - resize_fn)); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.h b/backends/vulkan/runtime/graph/ops/impl/Transfer.h deleted file mode 100644 index 09aae144994..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include - -namespace vkcompute { - -enum class TransferType { SELECT, SLICE }; - -/** - * Adds a transfer copy operation node to the compute graph, which implements - * operators for which each element of the output tensor maps to a unique - * element of the input tensor. - * - * This function currently handles the following operations: - * - select - * - slice - */ -void add_transfer_copy_node( - ComputeGraph& graph, - TransferType transfer_type, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_or_start_ref, - const ValueRef end_ref, - const ValueRef step_ref, - const ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn = nullptr); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp deleted file mode 100644 index b797536d817..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include - -#include - -#include - -namespace vkcompute { - -void resize_transpose_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - const ValueRef out = extra_args.at(0); - const ValueRef in = extra_args.at(1); - - const int64_t dim0 = graph->extract_scalar(extra_args.at(2)); - const int64_t dim1 = graph->extract_scalar(extra_args.at(3)); - - std::vector new_sizes = graph->sizes_of(in); - // Transpose the resized input sizes - std::iter_swap(new_sizes.begin() + dim0, new_sizes.begin() + dim1); - graph->virtual_resize(out, new_sizes); -} - -void check_transpose_view_args( - ComputeGraph& graph, - ValueRef in_ref, - const int64_t dim0, - const int64_t dim1, - ValueRef out_ref) { - VK_CHECK_COND( - graph.val_is_view_of(out_ref, in_ref), - "output tensor must be a view of the input tensor"); - - const int64_t in_ndim = graph.dim_of(in_ref); - VK_CHECK_COND( - dim0 >= 0 && dim0 < in_ndim, "dim0 is not in the range of [0, in_ndim)"); - VK_CHECK_COND( - dim1 >= 0 && dim1 < in_ndim, "dim1 is not in the range of [0, in_ndim)"); -} - -void add_transpose_view_node( - ComputeGraph& graph, - ValueRef input_ref, - ValueRef dim0_ref, - ValueRef dim1_ref, - ValueRef out_ref) { - const int64_t dim0 = graph.extract_scalar(dim0_ref); - const int64_t dim1 = graph.extract_scalar(dim1_ref); - - check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref); - graph.virtual_clone(out_ref, input_ref); - graph.virtual_transpose(out_ref, dim0, dim1); - - graph.execute_nodes().emplace_back(new ExecuteNode( - resize_transpose_view_node, {out_ref, input_ref, dim0_ref, dim1_ref})); -} - -void transpose(ComputeGraph& graph, const std::vector& args) { - const ValueRef out = args[3]; - return add_transpose_view_node( - graph, - args[0], // input - args[1], // dim0 - args[2], // dim1 - out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.transpose.int, transpose); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.h b/backends/vulkan/runtime/graph/ops/impl/Transpose.h deleted file mode 100644 index a4fc4029222..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transpose.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_transpose_view_node( - ComputeGraph& graph, - ValueRef input_ref, - ValueRef dim0_ref, - ValueRef dim1_ref, - ValueRef out_ref); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp deleted file mode 100644 index 9830a8e8784..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -constexpr float kDummyFloat = -1.0f; -const std::string kClampShaderName = "clamp"; - -void resize_unary_op_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - const std::vector self_sizes = graph->sizes_of(self); - graph->virtual_resize(out, self_sizes); -} - -void add_unary_op_node( - ComputeGraph& graph, - const ValueRef in, - const float min, - const float max, - const ValueRef out, - const std::string& op_name) { - std::string kernel_name(op_name); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - const utils::vec2 min_max = {min, max}; - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - { - graph.is_buffer_storage(out) ? graph.numel_pc_of(out) - : graph.logical_limits_pc_of(out), - PushConstantDataInfo(&min_max, sizeof(min_max)), - }, - // pcs, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_unary_op_node)); -} - -float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { - if (!graph.val_is_none(val)) { - return graph.extract_scalar(val); - } - return max ? std::numeric_limits::infinity() - : -std::numeric_limits::infinity(); -} - -#define DEFINE_ACTIVATION_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, args[0], kDummyFloat, kDummyFloat, args[1], #op_name); \ - } - -#define DEFINE_CLAMP_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - get_val_or_inf(graph, args[1], /*max = */ false), \ - get_val_or_inf(graph, args[2], /*max = */ true), \ - args[3], \ - kClampShaderName); \ - } - -#define DEFINE_RELU_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - 0, \ - std::numeric_limits::infinity(), \ - args[1], \ - kClampShaderName); \ - } - -#define DEFINE_RELU6_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node(graph, args[0], 0, 6, args[1], kClampShaderName); \ - } - -#define DEFINE_HARDSHRINK_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - get_val_or_inf(graph, args[1], /*max = */ false), \ - -get_val_or_inf(graph, args[1], /*max = */ true), \ - args[2], \ - "hardshrink"); \ - } - -#define DEFINE_LEAKY_RELU_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - get_val_or_inf(graph, args[1], /*neg slope*/ false), \ - kDummyFloat, \ - args[2], \ - "leaky_relu"); \ - } - -void gelu(ComputeGraph& graph, const std::vector& args) { - // args[1] is the `approximate` string - // https://fburl.com/code/9omngmyo - // currently only `approximate = "tanh"` is supported - return add_unary_op_node( - graph, args[0], kDummyFloat, kDummyFloat, args[2], "gelu"); -} - -DEFINE_ACTIVATION_FN(abs); -DEFINE_ACTIVATION_FN(cos); 
-DEFINE_ACTIVATION_FN(exp); -DEFINE_ACTIVATION_FN(neg); -DEFINE_ACTIVATION_FN(sigmoid); -DEFINE_ACTIVATION_FN(sin); -DEFINE_ACTIVATION_FN(sqrt); -DEFINE_ACTIVATION_FN(rsqrt); -DEFINE_ACTIVATION_FN(tanh); -DEFINE_CLAMP_FN(clamp); -DEFINE_CLAMP_FN(hardtanh); -DEFINE_RELU_FN(relu); -DEFINE_RELU6_FN(relu6); -DEFINE_HARDSHRINK_FN(hardshrink); -DEFINE_ACTIVATION_FN(hardswish); -DEFINE_ACTIVATION_FN(hardsigmoid); -DEFINE_LEAKY_RELU_FN(leaky_relu); -DEFINE_ACTIVATION_FN(round); - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.abs.default, abs); - VK_REGISTER_OP(aten.clamp.default, clamp); - VK_REGISTER_OP(aten.cos.default, cos); - VK_REGISTER_OP(aten.exp.default, exp); - VK_REGISTER_OP(aten.gelu.default, gelu); - VK_REGISTER_OP(aten.hardtanh.default, hardtanh); - VK_REGISTER_OP(aten.neg.default, neg); - VK_REGISTER_OP(aten.relu.default, relu); - VK_REGISTER_OP(aten.relu6.default, relu6); - VK_REGISTER_OP(aten.sigmoid.default, sigmoid); - VK_REGISTER_OP(aten.sin.default, sin); - VK_REGISTER_OP(aten.sqrt.default, sqrt); - VK_REGISTER_OP(aten.rsqrt.default, rsqrt); - VK_REGISTER_OP(aten.tanh.default, tanh); - VK_REGISTER_OP(aten.hardshrink.default, hardshrink); - VK_REGISTER_OP(aten.hardswish.default, hardswish); - VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid); - VK_REGISTER_OP(aten.leaky_relu.default, leaky_relu); - VK_REGISTER_OP(aten.round.default, round); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp deleted file mode 100644 index 0a98f6d8f43..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void add_unsqueeze_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef out) { - const int64_t in_dim = graph.dim_of(in); - const int64_t out_dim = graph.dim_of(out); - - VK_CHECK_COND( - in_dim < 4, "Cannot unsqueeze a tensor with more than 3 dimensions"); - - int64_t dim = graph.extract_scalar(dim_ref); - if (dim < 0) { - dim += out_dim; - } - - std::vector permute_dims(out_dim); - for (int i = 1; i <= dim; i++) { - permute_dims[i - 1] = i; - } - permute_dims[dim] = 0; - - for (int i = dim + 1; i < out_dim; i++) { - permute_dims[i] = i; - } - - const ValueRef permute_dims_ref = - graph.add_scalar_list(std::vector(permute_dims)); - add_permute_node(graph, in, permute_dims_ref, out); -} - -void resize_unsqueeze_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const ValueRef dims_ref = extra_args.at(0); - - const IntListPtr dims = graph->get_int_list(dims_ref); - - std::vector out_sizes = graph->sizes_of(in); - - // Insert singleton dimensions at the specified positions - for (auto dim : *dims) { - int64_t d = dim; - if (d < 0) { - d += static_cast(out_sizes.size()) + 1; - } - out_sizes.insert(out_sizes.begin() + d, 1); - } - - graph->virtual_resize(out, out_sizes); -} - -void unsqueeze(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef dims = args.at(idx++); - const ValueRef out = args.at(idx++); - - std::vector resize_args = {dims}; - if (graph.is_buffer_storage(in)) { - return add_view_copy_buffer_node( - graph, in, out, resize_args, resize_unsqueeze_node); - } - return add_unsqueeze_node(graph, in, dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp deleted file mode 100644 index 6662ae367c5..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include - -#include - -namespace vkcompute { - -enum class UpsampleMode : int { NEAREST, BILINEAR }; - -void resize_upsample_nearest2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - std::vector out_sizes = graph->sizes_of(self); // NCHW - - const ValueRef output_sizes = extra_args.at(0); // HW - const ValueRef scale_factors = extra_args.at(1); // HW - if (!graph->val_is_none(output_sizes)) { - IntListPtr output_size_ref = graph->get_int_list(output_sizes); - out_sizes.at(2) = output_size_ref->at(0); - out_sizes.at(3) = output_size_ref->at(1); - } else { - DoubleListPtr scales = graph->get_double_list(scale_factors); - out_sizes.at(2) *= scales->at(0); - out_sizes.at(3) *= scales->at(1); - } - - graph->virtual_resize(out, out_sizes); -} - -void add_upsample_nearest2d_node( - ComputeGraph& graph, - const UpsampleMode mode, - const ValueRef in, - const ValueRef output_sizes, - const ValueRef align_corners, - const ValueRef scale_factors, - const ValueRef out) { - if (graph.val_is_none(output_sizes) && graph.val_is_none(scale_factors)) { - VK_THROW( - "Invalid input, must provide either output_sizes or scale_factors"); - } - if (!graph.val_is_none(output_sizes) && !graph.val_is_none(scale_factors)) { - VK_THROW( - "Invalid input, must provide ONLY one of output_sizes or scale_factors"); - } - - int align_corners_val = 0; - if (is_valid(align_corners) && graph.get_bool(align_corners)) { - align_corners_val = 1; - } - - utils::uvec3 in_limits = graph.logical_limits_of(in); - utils::uvec3 out_limits = graph.logical_limits_of(out); - - uint32_t out_width = out_limits[0u]; - uint32_t out_height = out_limits[1u]; - - float scale_factor_x = float(in_limits[0u]) / float(out_width); - float scale_factor_y = float(in_limits[1u]) / float(out_height); - - float recip_scale_factor_x = 1.0f / scale_factor_x; - float recip_scale_factor_y = 1.0f / scale_factor_y; - - if (!graph.val_is_none(output_sizes)) { - IntListPtr output_size_ref = graph.get_int_list(output_sizes); - out_width = output_size_ref->at(1); - out_height = output_size_ref->at(0); - - VK_CHECK_COND(out_width == out_limits[0u]); - VK_CHECK_COND(out_height == out_limits[1u]); - - } else { - DoubleListPtr scales = graph.get_double_list(scale_factors); - scale_factor_x = scales->at(1); - scale_factor_y = scales->at(0); - - VK_CHECK_COND(in_limits[0u] * scale_factor_x == out_width); - VK_CHECK_COND(in_limits[1u] * scale_factor_y == out_height); - } - - if (align_corners_val == 1) { - recip_scale_factor_x = float(in_limits[0u] - 1) / float(out_width - 1); - recip_scale_factor_y = float(in_limits[1u] - 1) / float(out_height - 1); - } else { - recip_scale_factor_x = float(in_limits[0u]) / float(out_width); - recip_scale_factor_y = float(in_limits[1u]) / float(out_height); - } - - utils::vec2 recip_scales = {recip_scale_factor_x, recip_scale_factor_y}; - - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - switch (mode) { - case UpsampleMode::NEAREST: - kernel_name = "upsample_nearest2d"; - break; - case UpsampleMode::BILINEAR: - kernel_name = "upsample_bilinear2d"; - break; - } - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, 
vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Shader params buffers - {graph.logical_limits_ubo(out), - graph.logical_limits_ubo(in), - graph.create_params_buffer(recip_scales)}, - // Push Constants - {}, - // Specialization Constants - {align_corners_val}, - // Resize Args - {output_sizes, scale_factors}, - // Resizing Logic - resize_upsample_nearest2d_node)); -} - -void upsample_nearest2d( - ComputeGraph& graph, - const std::vector& args) { - return add_upsample_nearest2d_node( - graph, - UpsampleMode::NEAREST, - args[0], - args[1], - kDummyValueRef, - args[2], - args[3]); -} - -void upsample_bilinear2d( - ComputeGraph& graph, - const std::vector& args) { - return add_upsample_nearest2d_node( - graph, - UpsampleMode::BILINEAR, - args[0], - args[1], - args[2], - args[3], - args[4]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.upsample_nearest2d.vec, upsample_nearest2d); - VK_REGISTER_OP(aten.upsample_bilinear2d.vec, upsample_bilinear2d); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Var.cpp b/backends/vulkan/runtime/graph/ops/impl/Var.cpp deleted file mode 100644 index d8fd367f18a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Var.cpp +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -#include - -#include -#include -#include - -namespace vkcompute { - -using namespace utils; - -// Custom global workgroup size function for var_buffer -utils::uvec3 var_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return { - graph->size_at(-1, out), - graph->size_at(-2, out), - graph->size_at(-3, out) * graph->size_at(-4, out)}; -} - -// Custom local workgroup size function for var_buffer -utils::uvec3 var_buffer_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)global_workgroup_size; - const ValueRef in = args.at(1).refs.at(0); - const int dim = resize_args.at(0); - - const int64_t ndim = graph->dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - const uint32_t nworkers_per_group = 4; - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - return local_wg_size; -} - -// Custom global workgroup size function for var_texture -utils::uvec3 var_texture_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const int dim = resize_args.at(0); - - const int64_t ndim = graph->dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - global_wg_size[reduce_dim] = 1; - return global_wg_size; -} - -// Custom local workgroup size function for var_texture -utils::uvec3 var_texture_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, 
- const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef in = args.at(1).refs.at(0); - const int dim = resize_args.at(0); - - const int64_t ndim = graph->dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - if (global_workgroup_size[other_dim_1] > global_workgroup_size[other_dim_2]) { - local_wg_size[other_dim_1] = ngroups; - } else { - local_wg_size[other_dim_2] = ngroups; - } - return local_wg_size; -} - -void resize_var_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const int dim = extra_args.at(0); - - std::vector new_sizes = graph->sizes_of(in); - if (!new_sizes.empty()) { - new_sizes.at(normalize(dim, new_sizes.size())) = 1; - } - - graph->virtual_resize(out, new_sizes); -} - -void add_var_buffer_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1 - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const uint32_t nworkers_per_group = 4; - - utils::uvec3 global_wg_size = { - graph.size_at(-1, out), - graph.size_at(-2, out), - graph.size_at(-3, out) * graph.size_at(-4, out)}; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - var_buffer_global_wg_size, - var_buffer_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(in), - graph.strides_ubo(in), - graph.sizes_ubo(out), - graph.strides_ubo(out), - }, - // Push Constants - push_constants, - // Specialization Constants - {reduce_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_texture_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim = dim; - reduce_dim = normalize(reduce_dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. 
- if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // This should match the value of MAX_NTHREADS in the softmax shader. - constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - global_wg_size[reduce_dim] = 1; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - int32_t group_dim; - if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { - local_wg_size[other_dim_1] = ngroups; - group_dim = other_dim_1; - } else { - local_wg_size[other_dim_2] = ngroups; - group_dim = other_dim_2; - } - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - var_texture_global_wg_size, - var_texture_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - push_constants, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim, group_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - bool is_buffer = graph.is_buffer_storage(in) || graph.is_buffer_storage(out); - - if (is_buffer) { - add_var_buffer_node(graph, in, dim, unbiased, out); - } else { - add_var_texture_node(graph, in, dim, unbiased, out); - } -} - -void var(ComputeGraph& graph, const std::vector& args) { - const IntListPtr dims_list = graph.get_int_list(args[1]); - VK_CHECK_COND(dims_list->size() == 1); - bool unbiased = true; - if (args.size() > 2) { - unbiased = graph.get_bool(args[2]); - } - return add_var_node( - graph, args[0], static_cast(dims_list->at(0)), unbiased, args[4]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.var.dim, var); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp deleted file mode 100644 index 8701a6246b0..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -std::vector compute_out_sizes( - std::vector orig_sizes, - std::vector& view_sizes) { - std::vector out_sizes(view_sizes.begin(), view_sizes.end()); - int64_t numel = 1; - int64_t transferred_numel = 1; - - for (int i = 0; i < orig_sizes.size(); i++) { - numel *= orig_sizes.at(i); - } - for (int i = 0; i < view_sizes.size(); i++) { - if (view_sizes.at(i) > 0) { - transferred_numel *= view_sizes.at(i); - } - } - for (int i = 0; i < out_sizes.size(); i++) { - if (out_sizes.at(i) == -1) { - out_sizes.at(i) = numel / transferred_numel; - } - } - return out_sizes; -} - -void resize_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - if (extra_args.at(0) == kDummyValueRef || - graph->val_is_none(extra_args.at(0))) { - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); - } else { - std::vector view_sizes = - graph->extract_int_or_symint_list(extra_args.at(0)); - const std::vector in_sizes = graph->sizes_of(in); - const std::vector out_sizes = - compute_out_sizes(in_sizes, view_sizes); - graph->virtual_resize(out, out_sizes); - } -} - -void add_view_node( - ComputeGraph& graph, - ValueRef in, - ValueRef sizes, - ValueRef out) { - std::string kernel_name = "view"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Parameter Buffers - {}, - // Push Constants - {{graph.sizes_pc_of(out), graph.sizes_pc_of(in)}}, - // Specialization Constants - {graph.packed_dim_of(in), graph.packed_dim_of(out)}, - // Resize Args - {sizes}, - // Resizing Logic - resize_view_node)); -} - -void add_view_copy_buffer_node( - ComputeGraph& graph, - ValueRef in, - ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn) { - std::string kernel_name = "view_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter Buffers - {graph.buffer_meta_ubo(out), graph.buffer_meta_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - resize_args, - // Resizing Logic - resize_fn)); -} - -void view(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef sizes = args.at(idx++); - const ValueRef out = args.at(idx++); - - std::vector resize_args = {sizes}; - - if (graph.is_buffer_storage(out)) { - return add_view_copy_buffer_node( - graph, in, out, resize_args, resize_view_node); - } - return add_view_node(graph, in, sizes, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.view_copy.default, view); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/View.h b/backends/vulkan/runtime/graph/ops/impl/View.h deleted file mode 100644 index 7a7a8d57742..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/View.h +++ /dev/null @@ 
-1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -/* - * Dispatches the view_copy compute shader. This can be used to implement ops - * that preserve the "contiguous" indexes of elements between the input and - * output such as view_copy, squeeze_copy, unsqueeze_copy, etc. - */ -void add_view_copy_buffer_node( - ComputeGraph& graph, - ValueRef in, - ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn); - -void add_view_node( - ComputeGraph& graph, - ValueRef in, - ValueRef sizes, - ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp deleted file mode 100644 index c1c482d9967..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// Where.cpp - -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -namespace vkcompute { - -void resize_where_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -void add_where_texture_node( - ComputeGraph& graph, - const ValueRef cond, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - std::string kernel_name = "where"; - - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, - // Parameter buffers - {graph.logical_limits_ubo(self)}, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out)}, - // Resize Arguments - {}, - // Resizing Logic - resize_where_node)); -} - -void add_where_buffer_node( - ComputeGraph& graph, - const ValueRef cond, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - std::string kernel_name = "where"; - - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList ubos = { - graph.numel_ubo(out), - graph.strides_ubo(out), - graph.strides_ubo(cond), - graph.strides_ubo(self), - graph.strides_ubo(other)}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, - // Parameter buffers - ubos, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out)}, - // Resize Arguments - {}, - // Resizing Logic - resize_where_node)); -} - -void where(ComputeGraph& graph, const std::vector& args) { - int args_i = 0; - const ValueRef cond = args[args_i++]; - const ValueRef self 
= args[args_i++]; - const ValueRef other = args[args_i++]; - const ValueRef out = args[args_i++]; - if (graph.is_buffer_storage(out)) { - add_where_buffer_node(graph, cond, self, other, out); - } else { - add_where_texture_node(graph, cond, self, other, out); - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.where.self, where); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h deleted file mode 100644 index 5ed07dece38..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -/* - * Maps a semantic dimension name to an integer that corresponds to its - * innermost ordering in a 4D tensor in NCHW format. In a way, it is the - * "negative index" associated with a dim. For instance: in a NCHW tensor, Width - * is the innermost dimension, so it corresponds to 1, height is the next - * innermost, so it corresponds to 2, and so on. - */ -enum DimIndex : int32_t { - DIM_LAST = -1, - DIM_2ND_LAST = -2, - DIM_3RD_LAST = -3, - DIM_4TH_LAST = -4, -}; - -constexpr DimIndex kWidth4D = DimIndex::DIM_LAST; -constexpr DimIndex kHeight4D = DimIndex::DIM_2ND_LAST; -constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; -constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; - -/* - * Semantic dimension names for a 1D tensor - */ -struct Dim1D { - static constexpr uint32_t Length = 1u; -}; - -/* - * Semantic dimension names for a 2D Convolution kernel. - */ -struct DimConv2DKernel { - static constexpr uint32_t Width = 1u; - static constexpr uint32_t Height = 2u; - static constexpr uint32_t InChannels = 3u; - static constexpr uint32_t OutChannels = 4u; -}; - -/* - * The same as the above, except for a 2D Transposed Convolution kernel. - */ -struct DimTConv2DKernel { - static constexpr uint32_t Width = 1u; - static constexpr uint32_t Height = 2u; - static constexpr uint32_t OutChannels = 3u; - static constexpr uint32_t InChannels = 4u; -}; - -/* - * The functions below safely return the size of the dimension at the N-th - * innermost index. If the dimensionality of the size array is not sufficient - * then 1 will be returned. The structs above are intended to be used with - * these functions. - */ - -inline int32_t dim_at(const std::vector& sizes, DimIndex dim_index) { - const uint32_t dims = sizes.size(); - // Recall that dim_index is a negative index. - return dims < -dim_index - ? 
1 - : utils::safe_downcast(sizes[dims + dim_index]); -} - -template -int32_t dim_at(const std::vector& sizes) { - return dim_at(sizes, DI); -} - -inline std::ostream& operator<<(std::ostream& os, DimIndex dim_index) { - switch (dim_index) { - case kWidth4D: - os << "kWidth4D"; - break; - case kHeight4D: - os << "kHeight4D"; - break; - case kChannel4D: - os << "kChannel4D"; - break; - case kBatch4D: - os << "kBatch4D"; - break; - default: - os << "kDim4DUnknown"; - break; - } - return os; -} -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp deleted file mode 100644 index 2fb0f60b249..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -utils::ivec2 make_ivec2_from_list(ComputeGraph& graph, ValueRef vref) { - return utils::make_ivec2(*graph.get_int_list(vref), /*reverse = */ true); -} - -utils::ivec2 make_ivec2_kernel_size( - ComputeGraph& graph, - const ValueRef weight, - const bool kernel_size_only) { - if (kernel_size_only) { - return make_ivec2_from_list(graph, weight); - } else { - const auto weight_sizes = graph.get_tref(weight)->sizes; - return utils::make_ivec2({weight_sizes.at(3), weight_sizes.at(2)}); - } -} - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef weight, - const bool kernel_size_only, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation) { - return { - make_ivec2_kernel_size(graph, weight, kernel_size_only), - make_ivec2_from_list(graph, stride), - make_ivec2_from_list(graph, padding), - make_ivec2_from_list(graph, dilation), - }; -} - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding) { - return { - make_ivec2_kernel_size(graph, kernel_size, /*kernel_size_only = */ true), - make_ivec2_from_list(graph, stride), - make_ivec2_from_list(graph, padding), - {}, - }; -} - -int64_t calc_out_size( - const int64_t in_size, - const int64_t kernel_size, - const int64_t stride, - const int64_t padding, - const int64_t dilation, - const bool ceil_mode) { - int64_t c = ceil_mode ? 
stride - 1 : 0; - int64_t out_size = - (in_size + 2 * padding - dilation * (kernel_size - 1) - 1 + c) / stride + - 1; - if (ceil_mode && (out_size - 1) * stride >= in_size + padding) { - --out_size; - } - VK_CHECK_COND(out_size >= 1); - return out_size; -} - -std::vector calc_out_sizes_hw( - const std::vector& in_sizes, - const utils::ivec2& kernel_size, - const utils::ivec2& stride, - const utils::ivec2& padding, - const utils::ivec2& dilation, - const bool ceil_mode) { - const int64_t ndim = in_sizes.size(); - std::vector out_sizes(2); - - // Height - out_sizes.at(0) = calc_out_size( - in_sizes.at(ndim - 2), - kernel_size[1], - stride[1], - padding[1], - dilation[1], - ceil_mode); - // Width - out_sizes.at(1) = calc_out_size( - in_sizes.at(ndim - 1), - kernel_size[0], - stride[0], - padding[0], - dilation[0], - ceil_mode); - - return out_sizes; -} - -int64_t calc_transpose_out_size( - const int64_t in_size, - const int64_t kernel, - const int64_t stride, - const int64_t padding, - const int64_t dilation, - const int64_t output_padding) { - int64_t out_size = (in_size - 1) * stride - 2 * padding + - dilation * (kernel - 1) + output_padding + 1; - VK_CHECK_COND(out_size >= 1); - return out_size; -} - -std::vector calc_transpose_out_sizes_hw( - const std::vector& in_sizes, - const utils::ivec2& kernel_size, - const utils::ivec2& stride, - const utils::ivec2& padding, - const utils::ivec2& dilation, - const utils::ivec2& output_padding) { - const int64_t ndim = in_sizes.size(); - std::vector out_sizes(2); - - // Height - out_sizes.at(0) = calc_transpose_out_size( - in_sizes.at(ndim - 2), - kernel_size[1], - stride[1], - padding[1], - dilation[1], - output_padding[1]); - // Width - out_sizes.at(1) = calc_transpose_out_size( - in_sizes.at(ndim - 1), - kernel_size[0], - stride[0], - padding[0], - dilation[0], - output_padding[0]); - - return out_sizes; -} - -std::vector calc_out_sizes_hw( - ComputeGraph& graph, - const std::vector& in_sizes, - const ValueRef weight, - const bool kernel_size_only, - const std::vector& args, - const bool transposed) { - const auto kernel_size = - make_ivec2_kernel_size(graph, weight, kernel_size_only); - const auto stride = make_ivec2_from_list(graph, args[0]); - const auto padding = make_ivec2_from_list(graph, args[1]); - const auto dilation = args[2] == kDummyValueRef - ? utils::ivec2{1, 1} - : make_ivec2_from_list(graph, args[2]); - - if (transposed) { - const auto output_padding = make_ivec2_from_list(graph, args[3]); - return calc_transpose_out_sizes_hw( - in_sizes, kernel_size, stride, padding, dilation, output_padding); - } else { - const bool ceil_mode = - graph.val_is_bool(args[3]) ? graph.get_bool(args[3]) : false; - - return calc_out_sizes_hw( - in_sizes, kernel_size, stride, padding, dilation, ceil_mode); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h deleted file mode 100644 index 1e8b5b0f7a4..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -struct Kernel1dParams final { - int kernel_size; - int stride; - int padding; - int dilation; - int in_group_size; - int out_group_size; -}; - -struct Kernel2dParams final { - utils::ivec2 kernel_size; - utils::ivec2 stride; - utils::ivec2 padding; - utils::ivec2 dilation; -}; - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef weight, - const bool kernel_size_only, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation); - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding); - -int64_t calc_out_size( - const int64_t in_size, - const int64_t kernel_size, - const int64_t stride, - const int64_t padding, - const int64_t dilation, - const bool ceil_mode); - -std::vector calc_out_sizes_hw( - ComputeGraph& graph, - const std::vector& in_sizes, - const ValueRef weight, - const bool kernel_size_only, - const std::vector& args, - const bool transposed = false); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp deleted file mode 100644 index 4cf678a9dcb..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -void pack4(const uint8_t* w_ptr, uint8_t* b_ptr, uint32_t N, uint32_t K) { - for (int32_t n = 0; n < N; n++) { - for (int32_t k2 = 0; k2 < K / 2; k2++) { - uint8_t src_val0 = w_ptr[n * K + k2 * 2]; - uint8_t src_val1 = w_ptr[n * K + k2 * 2 + 1]; - b_ptr[n * (K / 2) + k2] = (uint8_t(src_val1) << 4) | uint8_t(src_val0); - } - } -} - -std::vector int4mm_pack_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr) { - const int32_t N = utils::val_at(-1, W_sizes); - const int32_t K = utils::val_at(-2, W_sizes); - - const auto numel = K * N; - std::vector w_ptr_T(numel); - std::vector b_ptr(utils::div_up(numel, 2)); - - // Transpose the weights - for (int32_t k = 0; k < K; k++) { - for (int32_t n = 0; n < N; n++) { - w_ptr_T[n * K + k] = w_ptr[k * N + n]; - } - } - - // Pack two int4s into each int8 - pack4(w_ptr_T.data(), b_ptr.data(), N, K); - - return b_ptr; -} - -std::vector int4mm_dequantize_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr, - const uint32_t group_size, - const float* scales_and_zeros) { - const int64_t N = utils::val_at(-1, W_sizes); - const int64_t K = utils::val_at(-2, W_sizes); - - std::vector w_ptr_deq(K * N); - const int k_groups = K / group_size; - const int zeros_stride = k_groups * N; - - for (int k = 0; k < K; k++) { - for (int n = 0; n < N; n++) { - const int kb = k / group_size; - const int scale_idx = k_groups * n + kb; - const float scale = scales_and_zeros[scale_idx]; - const float zero = - scales_and_zeros[scale_idx + zeros_stride] - scale * 8.0; - w_ptr_deq[k * N + n] = w_ptr[k * N + n] * scale + zero; - } - } - - return w_ptr_deq; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h deleted file mode 100644 index 4c4cf26d504..00000000000 --- 
a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -std::vector int4mm_pack_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr); - -std::vector int4mm_dequantize_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr, - const uint32_t group_size, - const float* scales_and_zeros); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h b/backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h deleted file mode 100644 index 4bc8c7c3bfc..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -enum class QuantizationGranularity { - PerChannel, - PerTensor, - PerGroup, - NoQuantization, -}; - -static constexpr QuantizationGranularity kPerChannel = - QuantizationGranularity::PerChannel; -static constexpr QuantizationGranularity kPerTensor = - QuantizationGranularity::PerTensor; -static constexpr QuantizationGranularity kPerGroup = - QuantizationGranularity::PerGroup; -static constexpr QuantizationGranularity kNoQuantization = - QuantizationGranularity::NoQuantization; - -struct QuantizationConfig { - int nbits; - QuantizationGranularity granularity; - std::vector granularity_sizes; - bool is_symmetric; - bool is_dynamic; - - QuantizationConfig() - : nbits(8), - granularity(kPerTensor), - granularity_sizes(), - is_symmetric(true), - is_dynamic(false) {} - - QuantizationConfig( - int nbits_, - QuantizationGranularity granularity_, - const std::vector& granularity_sizes_, - bool is_symmetric_ = true, - bool is_dynamic_ = false) - : nbits(nbits_), - granularity(granularity_), - granularity_sizes(granularity_sizes_), - is_symmetric(is_symmetric_), - is_dynamic(is_dynamic_) {} -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h deleted file mode 100644 index 270bdd1cd6b..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -template -T extract_scalar(const Value& value) { - if (value.isInt()) { - return static_cast(value.toInt()); - } - if (value.isDouble()) { - return static_cast(value.toDouble()); - } - if (value.isBool()) { - return static_cast(value.toBool()); - } - VK_THROW("Cannot extract scalar from Value with type ", value.type()); -} - -// Helper function to get default quant_min and quant_max based on dtype -// This matches the logic in _get_and_check_qmin_qmax from quant_primitives.py -inline std::pair get_dtype_bounds(vkapi::ScalarType dtype) { - switch (dtype) { - case vkapi::kByte: // uint8 - return {0, 255}; - case vkapi::kChar: // int8 - return {-128, 127}; - case vkapi::kShort: // int16 - return {-(1 << 15), (1 << 15) - 1}; - case vkapi::kInt: // int32 - return {-(1LL << 31), (1LL << 31) - 1}; - default: - // For unsupported types, throw an error instead of assuming int8 - VK_THROW("Unsupported dtype for quantization bounds: ", dtype); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp deleted file mode 100644 index a52572289a4..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -// -// Tensor output size calculation functions -// - -std::vector calculate_broadcasted_output_size( - const std::vector& sizes1, - const std::vector& sizes2) { - std::vector out_sizes(std::max(sizes1.size(), sizes2.size())); - - // Match the sizes in reverse because sizes are in NCHW order - for (int i = -1; i >= -out_sizes.size(); --i) { - out_sizes.at(out_sizes.size() + i) = - std::max(utils::val_at(i, sizes1), utils::val_at(i, sizes2)); - } - - return out_sizes; -} - -// -// Tensor property checking functions -// - -bool check_same_packed_dim( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - return graph.packed_dim_of(in) == graph.packed_dim_of(out); -} - -// -// Broadcast flag functions -// - -bool is_packed_dim_broadcasted( - ComputeGraph& graph, - const ValueRef sndr, - const ValueRef rcvr) { - // We assume that the tensors are broadcastable. If values aren't equal at - // some index, then the value of rcvr is 1 and hence should be broadcasted. 
- const std::vector sndr_sizes = graph.sizes_of(sndr); - const std::vector rcvr_sizes = graph.sizes_of(rcvr); - - switch (graph.packed_dim_of(sndr)) { - case WHCN::kChannelsDim: - return utils::val_at(-3, sndr_sizes) > utils::val_at(-3, rcvr_sizes); - case WHCN::kHeightDim: - return utils::val_at(-2, sndr_sizes) > utils::val_at(-2, rcvr_sizes); - case WHCN::kWidthDim: - return utils::val_at(-1, sndr_sizes) > utils::val_at(-1, rcvr_sizes); - default: - VK_THROW("Invalid packed dim"); - } -} - -utils::ivec2 create_broadcast_params( - ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2) { - return utils::make_ivec2( - {is_packed_dim_broadcasted(graph, t2, t1), - is_packed_dim_broadcasted(graph, t1, t2)}); -} - -// -// Work group size calculation functions -// - -utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group) { - utils::uvec3 local_group_size = {4, 4, 4}; - if (global_work_group[2u] == 1) { - if (global_work_group[1u] < 8) { - local_group_size[0u] = 16; - local_group_size[1u] = 4; - local_group_size[2u] = 1; - } else { - local_group_size[0u] = 8; - local_group_size[1u] = 8; - local_group_size[2u] = 1; - } - } - return local_group_size; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h deleted file mode 100644 index b62bf661995..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -// -// Tensor output size calculation functions -// - -std::vector calculate_broadcasted_output_size( - const std::vector& sizes1, - const std::vector& sizes2); - -// -// Tensor property checking functions -// - -bool check_same_packed_dim( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out); - -// -// Broadcast flag functions -// - -bool is_packed_dim_broadcasted( - ComputeGraph& graph, - const ValueRef sndr, - const ValueRef rcvr); - -utils::ivec2 create_broadcast_params( - ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2); - -// -// Work group size calculation functions -// - -utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group); - -// -// Tensor dim utilities -// - -template < - typename T, - typename std::enable_if< - std::is_integral::value && std::is_signed::value, - int>::type = 0> -T normalize(const T& nchw_dim, const int64_t ndim) { - return (nchw_dim % ndim + ndim) % ndim; -} - -template < - typename T, - typename std::enable_if< - std::is_integral::value && std::is_signed::value, - int>::type = 0> -T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { - return ndim - 1 - nchw_dim; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp deleted file mode 100644 index e829f355fe2..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -uint32_t bind_values_to_descriptor_set( - ComputeGraph* graph, - const std::vector& args, - vkapi::PipelineBarrier& pipeline_barrier, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx) { - uint32_t idx = base_idx; - for (auto& arg : args) { - for (auto& ref : arg.refs) { - graph->bind_value_to_descriptor_set( - ref, pipeline_barrier, arg.access, descriptor_set, idx++); - } - } - return idx; -} - -uint32_t bind_params_to_descriptor_set( - const vkapi::ParamsBindList& params, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx) { - uint32_t idx = base_idx; - for (auto& param : params.bind_infos) { - descriptor_set.bind(idx++, param); - } - return idx; -} - -void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx) { - descriptor_set.bind(idx, staging.buffer()); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h deleted file mode 100644 index 307bec154f3..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -// -// For objects in the graph -// - -uint32_t bind_values_to_descriptor_set( - ComputeGraph* graph, - const std::vector& args, - vkapi::PipelineBarrier& pipeline_barrier, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx); - -// -// For objects NOT in the graph -// - -uint32_t bind_params_to_descriptor_set( - const vkapi::ParamsBindList& params, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx); - -void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp deleted file mode 100644 index 231e6d0c7f6..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -void add_storage_type_suffix( - std::string& kernel_name, - const utils::StorageType storage_type) { - switch (storage_type) { - case utils::kBuffer: - kernel_name += "_buffer"; - break; - case utils::kTexture3D: - kernel_name += "_texture3d"; - break; - case utils::kTexture2D: - kernel_name += "_texture2d"; - break; - } -} - -void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) { - switch (dtype) { - case vkapi::kDouble: - kernel_name += "_double"; - break; - case vkapi::kFloat: - kernel_name += "_float"; - break; - case vkapi::kHalf: - kernel_name += "_half"; - break; - case vkapi::kChar: - case vkapi::kQInt8: - kernel_name += "_int8"; - break; - case vkapi::kByte: - case vkapi::kBool: - case vkapi::kQUInt8: - kernel_name += "_uint8"; - break; - case vkapi::kShort: - kernel_name += "_int16"; - break; - case vkapi::kUInt16: - kernel_name += "_uint16"; - break; - case vkapi::kInt: - kernel_name += "_int32"; - break; - case vkapi::kUInt: - kernel_name += "_uint32"; - break; - case vkapi::kLong: - kernel_name += "_int64"; - break; - case vkapi::kUInt64: - kernel_name += "_uint64"; - break; - default: - break; - } -} - -void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim) { - switch (packed_dim) { - case WHCN::kWidthDim: - kernel_name += "_W_packed"; - break; - case WHCN::kHeightDim: - kernel_name += "_H_packed"; - break; - case WHCN::kChannelsDim: - kernel_name += "_C_packed"; - break; - default: - VK_THROW("Invalid packed dim!"); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h deleted file mode 100644 index 4a2fddb5cf2..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -constexpr size_t kShaderNameReserve = 64u; - -void add_storage_type_suffix( - std::string& kernel_name, - const utils::StorageType storage_type); - -void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype); - -void add_ndim_suffix(std::string& kernel_name, const size_t ndim); - -void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp deleted file mode 100644 index c90bfa402bb..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// @lint-ignore-every CLANGTIDY facebook-security-vulnerable-memcpy - -#include -#include - -#include - -namespace vkcompute { - -bool is_bitw8(vkapi::ScalarType dtype) { - return dtype == vkapi::kByte || dtype == vkapi::kChar || - dtype == vkapi::kQInt8 || dtype == vkapi::kQUInt8; -} - -vkapi::ShaderInfo get_nchw_to_tensor_shader( - ComputeGraph& graph, - const ValueRef dst, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - const vkapi::ScalarType dst_dtype = graph.dtype_of(dst); - const utils::StorageType dst_storage_type = graph.storage_type_of(dst); - - if (is_bitw8(dst_dtype) && dst_storage_type != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, dst_storage_type); - add_dtype_suffix(kernel_name, dst_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (dst_storage_type == utils::kBuffer) { - kernel_name = "nchw_to_buffer"; - add_dtype_suffix(kernel_name, dst_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "nchw_to_image"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, dst_storage_type); - add_dtype_suffix(kernel_name, dst_dtype); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -vkapi::ShaderInfo get_tensor_to_nchw_shader( - ComputeGraph& graph, - const ValueRef src, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - const vkapi::ScalarType src_dtype = graph.dtype_of(src); - const utils::StorageType src_storage_type = graph.storage_type_of(src); - - if (is_bitw8(src_dtype) && src_storage_type != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, src_storage_type); - add_dtype_suffix(kernel_name, src_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (src_storage_type == utils::kBuffer) { - kernel_name = "buffer_to_nchw"; - add_dtype_suffix(kernel_name, src_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "image_to_nchw"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, src_storage_type); - add_dtype_suffix(kernel_name, src_dtype); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h deleted file mode 100644 index 71c92b833b7..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
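For reference, a concrete reading of the branching above: with the default arguments, a float tensor stored as a 3D texture resolves to the kernel named `nchw_to_image_texture3d_float`, while a half-precision tensor with buffer storage resolves to `nchw_to_buffer_half` (the buffer path takes no storage suffix). A brief usage sketch, with `graph` and `dst` assumed to be the caller's `ComputeGraph` and staged `ValueRef`:

```
// Illustrative sketch, not part of the deleted sources.
vkcompute::vkapi::ShaderInfo shader =
    vkcompute::get_nchw_to_tensor_shader(graph, dst);
// Resolves to e.g. "nchw_to_image_texture3d_float" for a float texture3d
// destination, or "nchw_to_buffer_half" for a half buffer destination.
```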
- */ - -#pragma once - -#include - -namespace vkcompute { - -vkapi::ShaderInfo get_nchw_to_tensor_shader( - ComputeGraph& graph, - const ValueRef dst, - bool int8_buffer_enabled = true, - bool push_constant_variant = true); -vkapi::ShaderInfo get_tensor_to_nchw_shader( - ComputeGraph& graph, - const ValueRef src, - bool int8_buffer_enabled = true, - bool push_constant_variant = true); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/MacroUtils.h b/backends/vulkan/runtime/utils/MacroUtils.h deleted file mode 100644 index a182f9046b7..00000000000 --- a/backends/vulkan/runtime/utils/MacroUtils.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// Suppress an unused variable. Copied from [[maybe_unused]] -#if defined(_MSC_VER) && !defined(__clang__) -#define VK_UNUSED __pragma(warning(suppress : 4100 4101)) -#else -#define VK_UNUSED __attribute__((__unused__)) -#endif //_MSC_VER diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h deleted file mode 100644 index 20addf88c53..00000000000 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -// Convenience constexpr to attach semantic names to WHCN dimension index -namespace WHCN { - -constexpr int32_t kWidthDim = 0; -constexpr int32_t kHeightDim = 1; -constexpr int32_t kChannelsDim = 2; - -} // namespace WHCN - -namespace utils { - -// -// GPU Storage Options -// - -/** - * The enum below is used to describe what type of GPU memory will be used to - * store a particular tensor's data. - * - * BUFFER means that a SSBO (Shader Storage Buffer Object) will be used. - * TEXTURE_3D means that a 3-dimensional image texture will be used. - * TEXTURE_2D means that a 2-dimensional image texture will be used. - * - * UNKNOWN is not expected to be used. - */ -enum class StorageType : uint8_t { - BUFFER, - TEXTURE_3D, - TEXTURE_2D, -}; - -static constexpr StorageType kBuffer = StorageType::BUFFER; -static constexpr StorageType kTexture3D = StorageType::TEXTURE_3D; -static constexpr StorageType kTexture2D = StorageType::TEXTURE_2D; - -/* - * A tensor's memory layout is defined in one of two ways: - * - * 1. If it's a buffer backed tensor, the memory layout is defined by its - * `dim_order`, and by extension its `strides`. - * 2. If it's a texture backed tensor, the memory layout is defined by the - * combination of its `axis_map` and its `packed_dim`. - * - * Providing explicit memory layout metadata upon tensor construction is not - * very convenient from an API perspective, so the `GPUMemoryLayout` serves as - * an abstraction that is used to determine how to initialize a tensor's layout - * metadata based on the developer's intent. A `GPUMemoryLayout` is provided to - * the constructor of `vTensor`, which will use it to determine how to set its - * `dim_order` if it's a buffer backed tensor, or how to set its `axis_map` and - * `packed_dim` if it's a texture backed tensor. 
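As a concrete illustration of the abstraction described above (an editorial aside, not part of the original header comment): requesting a channels-packed layout means the channels dim becomes the packed dim for texture storage, or the last dim in the dim order for buffer storage.

```
// Illustrative sketch, not part of the deleted sources. The explicit template
// argument to to_packed_dim (defined later in this header) is assumed to be
// the integral return type.
using namespace vkcompute;
const utils::GPUMemoryLayout layout = utils::kChannelsPacked;
const int32_t packed_dim = utils::to_packed_dim<int32_t>(layout);
// packed_dim == WHCN::kChannelsDim == 2; for texture storage the axis map
// defaults to {0, 1, 2, 2}, per the enum documentation below.
```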
- * - * Note that GPUMemoryLayout is not stored as a tensor property, as it does not - * have any meaning after the vTensor is constructed. After construction, - * methods such as `virtual_transpose()` may be used to modify the tensor's - * layout metadata that cannot be represented by any `GPUMemoryLayout` entry. - * Nonetheless, a "best guess" of the closest memory layout can be produced via - * the `estimate_memory_layout()` API of `vTensor`. - * - * Currently, only 3 memory layouts are provided, but more will be added in the - * future that will enable different functionality such as minimizing texture - * memory footprint. - */ -enum class GPUMemoryLayout : uint8_t { - /* - * The below memory layouts will produce a `vTensor` with the following - * properties: - * - * 1. For buffer backed tensors, the `dim_order` will be the same as a - * contiguous dim order, but with the specified dim last in the dim order. - * 2. For texture backed tensors, the packed dim will be the specified dim. - * The axis map will be `{0, 1, 2, 2}`. - */ - TENSOR_WIDTH_PACKED = 0u, - TENSOR_HEIGHT_PACKED = 1u, - TENSOR_CHANNELS_PACKED = 2u, -}; - -static constexpr GPUMemoryLayout kWidthPacked = - GPUMemoryLayout::TENSOR_WIDTH_PACKED; - -static constexpr GPUMemoryLayout kHeightPacked = - GPUMemoryLayout::TENSOR_HEIGHT_PACKED; - -static constexpr GPUMemoryLayout kChannelsPacked = - GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - -template -T to_packed_dim(const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - return 0; - case kHeightPacked: - return 1; - case kChannelsPacked: - return 2; - }; - // Should be unreachable - return 0; -} - -inline std::ostream& operator<<( - std::ostream& os, - const StorageType storage_type) { - switch (storage_type) { - case kBuffer: - os << "BUFFER"; - break; - case kTexture3D: - os << "TEXTURE_3D"; - break; - case kTexture2D: - os << "TEXTURE_2D"; - break; - } - return os; -} - -inline std::ostream& operator<<( - std::ostream& os, - const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - os << "TENSOR_WIDTH_PACKED"; - break; - case kHeightPacked: - os << "TENSOR_HEIGHT_PACKED"; - break; - case kChannelsPacked: - os << "TENSOR_CHANNELS_PACKED"; - break; - } - return os; -} - -enum class AxisMapLayout : uint8_t { - DEFAULT = 0u, - OPTIMIZED = 1u, -}; - -constexpr AxisMapLayout kDefaultAxisMap = AxisMapLayout::DEFAULT; - -constexpr AxisMapLayout kOptimizedAxisMap = AxisMapLayout::OPTIMIZED; - -} // namespace utils -} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/StringUtils.h b/backends/vulkan/runtime/utils/StringUtils.h deleted file mode 100644 index 986b58c3303..00000000000 --- a/backends/vulkan/runtime/utils/StringUtils.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -// @lint-ignore-every CLANGTIDY facebook-hte-LocalUncheckedArrayBounds - -#include -#include -#include - -namespace vkcompute { -namespace utils { - -namespace detail { - -struct CompileTimeEmptyString { - operator const std::string&() const { - static const std::string empty_string_literal; - return empty_string_literal; - } - operator const char*() const { - return ""; - } -}; - -template -struct CanonicalizeStrTypes { - using type = const T&; -}; - -template -struct CanonicalizeStrTypes { - using type = const char*; -}; - -inline std::ostream& _str(std::ostream& ss) { - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t) { - ss << t; - return ss; -} - -template <> -inline std::ostream& _str( - std::ostream& ss, - const CompileTimeEmptyString&) { - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { - return _str(_str(ss, t), args...); -} - -template -struct _str_wrapper final { - static std::string call(const Args&... args) { - std::ostringstream ss; - _str(ss, args...); - return ss.str(); - } -}; - -template <> -struct _str_wrapper<> final { - static CompileTimeEmptyString call() { - return CompileTimeEmptyString(); - } -}; - -} // namespace detail - -template -inline std::string concat_str(const Args&... args) { - return detail::_str_wrapper< - typename detail::CanonicalizeStrTypes::type...>::call(args...); -} - -} // namespace utils -} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h deleted file mode 100644 index d84eb54d2b9..00000000000 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
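The template machinery above exists so that `concat_str` can stream-format heterogeneous arguments into a single `std::string` without manual conversions; for example (with `ndim` as an assumed placeholder variable):

```
// Illustrative sketch, not part of the deleted sources.
const std::string msg =
    vkcompute::utils::concat_str("expected 4 dims, got ", ndim);
```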
- */ - -#pragma once - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { -namespace utils { - -// -// Hashing -// - -/** - * hash_combine is taken from c10/util/hash.h, which in turn is based on - * implementation from Boost - */ -inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); -} - -// -// Alignment -// - -template -inline constexpr Type align_down(const Type& number, const Type& multiple) { - return (number / multiple) * multiple; -} - -template -inline constexpr Type align_up(const Type& number, const Type& multiple) { - return align_down(number + multiple - 1, multiple); -} - -template -inline constexpr Type align_up_4(const Type& numerator) { - return (numerator + 3) & -4; -} - -template -inline constexpr Type div_up(const Type& numerator, const Type& denominator) { - return (numerator + denominator - 1) / denominator; -} - -template -inline constexpr Type div_up_4(const Type& numerator) { - return (numerator + 3) / 4; -} - -// -// Casting Utilities -// - -namespace detail { - -/* - * x cannot be less than 0 if x is unsigned - */ -template -static inline constexpr bool is_negative( - const T& /*x*/, - std::true_type /*is_unsigned*/) { - return false; -} - -/* - * check if x is less than 0 if x is signed - */ -template -static inline constexpr bool is_negative( - const T& x, - std::false_type /*is_unsigned*/) { - return x < T(0); -} - -/* - * Returns true if x < 0 - */ -template -inline constexpr bool is_negative(const T& x) { - return is_negative(x, std::is_unsigned()); -} - -/* - * Returns true if x < lowest(Limit); standard comparison - */ -template -static inline constexpr bool less_than_lowest( - const T& x, - std::false_type /*limit_is_unsigned*/, - std::false_type /*x_is_unsigned*/) { - return x < std::numeric_limits::lowest(); -} - -/* - * Limit can contained negative values, but x cannot; return false - */ -template -static inline constexpr bool less_than_lowest( - const T& /*x*/, - std::false_type /*limit_is_unsigned*/, - std::true_type /*x_is_unsigned*/) { - return false; -} - -/* - * Limit cannot contained negative values, but x can; check if x is negative - */ -template -static inline constexpr bool less_than_lowest( - const T& x, - std::true_type /*limit_is_unsigned*/, - std::false_type /*x_is_unsigned*/) { - return x < T(0); -} - -/* - * Both x and Limit cannot be negative; return false - */ -template -static inline constexpr bool less_than_lowest( - const T& /*x*/, - std::true_type /*limit_is_unsigned*/, - std::true_type /*x_is_unsigned*/) { - return false; -} - -/* - * Returns true if x is less than the lowest value of type T - */ -template -inline constexpr bool less_than_lowest(const T& x) { - return less_than_lowest( - x, std::is_unsigned(), std::is_unsigned()); -} - -// Suppress sign compare warning when compiling with GCC -// as later does not account for short-circuit rule before -// raising the warning, see https://godbolt.org/z/Tr3Msnz99 -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#endif - -/* - * Returns true if x is greater than the greatest value of the type Limit - */ -template -inline constexpr bool greater_than_max(const T& x) { - constexpr bool can_overflow = - std::numeric_limits::digits > std::numeric_limits::digits; - return can_overflow && x > std::numeric_limits::max(); -} - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - -template -std::enable_if_t< - std::is_integral::value && 
!std::is_same::value, - bool> -overflows(From f) { - using limit = std::numeric_limits; - // Casting from signed to unsigned; allow for negative numbers to wrap using - // two's complement arithmetic. - if (!limit::is_signed && std::numeric_limits::is_signed) { - return greater_than_max(f) || - (is_negative(f) && -static_cast(f) > limit::max()); - } - // standard case, check if f is outside the range of type To - else { - return less_than_lowest(f) || greater_than_max(f); - } -} - -template -std::enable_if_t::value, bool> overflows(From f) { - using limit = std::numeric_limits; - if (limit::has_infinity && std::isinf(static_cast(f))) { - return false; - } - return f < limit::lowest() || f > limit::max(); -} - -template -inline constexpr To safe_downcast(const From& v) { - VK_CHECK_COND(!overflows(v), "Cast failed: out of range!"); - return static_cast(v); -} - -template -inline constexpr bool is_signed_to_unsigned() { - return std::is_signed::value && std::is_unsigned::value; -} - -} // namespace detail - -template < - typename To, - typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From& v) { - VK_CHECK_COND(v >= From{}, "Cast failed: negative signed to unsigned!"); - return detail::safe_downcast(v); -} - -template < - typename To, - typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From& v) { - return detail::safe_downcast(v); -} - -// -// Vector Types -// - -namespace detail { - -template -struct vec final { - // NOLINTNEXTLINE - Type data[N]; - - vec() = default; - - // Standard constructor with initializer list - vec(std::initializer_list values) { - VK_CHECK_COND(values.size() == N); - std::copy(values.begin(), values.end(), data); - } - - // Conversion constructor from an _integral_ vec type. Note that this is only - // defined if `OtherType` is an integral type to disallow implicit narrowing. - template < - typename OtherType, - typename std::enable_if< - !std::is_same::value && - std::is_integral::value, - int>::type = 0> - /* implicit */ vec(const vec& other) { - for (int i = 0; i < N; ++i) { - data[i] = safe_downcast(other[i]); - } - } - - template < - typename IndexType, - typename = std::enable_if_t::value>> - const Type& operator[](const IndexType& i) const { - VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); - return data[i]; - } - - template < - typename IndexType, - typename = std::enable_if_t::value>> - Type& operator[](const IndexType& i) { - VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); - return data[i]; - } - - bool operator==(const vec& other) const { - for (uint32_t i = 0; i < N; ++i) { - if (data[i] != other.data[i]) { - return false; - } - } - return true; - } - - bool operator!=(const vec& other) const { - return !(*this == other); - } -}; - -} // namespace detail - -template -using ivec = detail::vec; -using ivec2 = ivec<2u>; -using ivec3 = ivec<3u>; -using ivec4 = ivec<4u>; - -template -using uvec = detail::vec; -using uvec2 = uvec<2u>; -using uvec3 = uvec<3u>; -using uvec4 = uvec<4u>; - -template -using vec = detail::vec; -using vec2 = vec<2u>; -using vec3 = vec<3u>; -using vec4 = vec<4u>; - -// uvec3 is the type representing tensor extents. Useful for debugging. 
-inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const uvec4& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; - return os; -} - -template -inline detail::vec divup_vec( - const detail::vec& a, - const detail::vec& b) { - detail::vec result; - for (uint32_t i = 0; i < N; ++i) { - result[i] = utils::div_up(a[i], b[i]); - } - return result; -} - -// -// std::vector Handling -// - -/* - * Utility function to perform indexing on an std::vector. Negative indexing - * is allowed. For instance, passing an index of -1 will retrieve the last - * element. If the requested index is out of bounds, then 1u will be returned. - */ -template -inline T val_at(const int64_t index, const std::vector& sizes) { - const int64_t ndim = static_cast(sizes.size()); - if (index >= 0) { - return index >= ndim ? 1 : sizes[index]; - } else { - return ndim + index < 0 ? 1 : sizes[ndim + index]; - } -} - -inline ivec2 make_ivec2( - const std::vector& ints, - bool reverse = false) { - VK_CHECK_COND(ints.size() == 2); - if (reverse) { - return {safe_downcast(ints[1]), safe_downcast(ints[0])}; - } else { - return {safe_downcast(ints[0]), safe_downcast(ints[1])}; - } -} - -inline ivec3 make_ivec3( - const std::vector& ints, - bool reverse = false) { - VK_CHECK_COND(ints.size() == 3); - if (reverse) { - return { - safe_downcast(ints[2]), - safe_downcast(ints[1]), - safe_downcast(ints[0]), - }; - } else { - return { - safe_downcast(ints[0]), - safe_downcast(ints[1]), - safe_downcast(ints[2]), - }; - } -} - -inline ivec4 make_ivec4( - const std::vector& ints, - bool reverse = false) { - VK_CHECK_COND(ints.size() == 4); - if (reverse) { - return { - safe_downcast(ints[3]), - safe_downcast(ints[2]), - safe_downcast(ints[1]), - safe_downcast(ints[0]), - }; - } else { - return { - safe_downcast(ints[0]), - safe_downcast(ints[1]), - safe_downcast(ints[2]), - safe_downcast(ints[3]), - }; - } -} - -inline ivec4 make_ivec4_prepadded1(const std::vector& ints) { - VK_CHECK_COND(ints.size() <= 4); - - ivec4 result = {1, 1, 1, 1}; - size_t base = 4 - ints.size(); - for (size_t i = 0; i < ints.size(); ++i) { - result[i + base] = safe_downcast(ints[i]); - } - - return result; -} - -inline ivec3 make_ivec3(uvec3 ints) { - return { - safe_downcast(ints[0u]), - safe_downcast(ints[1u]), - safe_downcast(ints[2u])}; -} - -inline uvec3 make_uvec3(ivec3 ints) { - return { - safe_downcast(ints[0u]), - safe_downcast(ints[1u]), - safe_downcast(ints[2u])}; -} - -/* - * Given an vector of up to 4 uint64_t representing the sizes of a tensor, - * constructs a uvec4 containing those elements in reverse order. - */ -inline uvec4 make_whcn_uvec4(const std::vector& arr) { - uint32_t w = safe_downcast(val_at(-1, arr)); - uint32_t h = safe_downcast(val_at(-2, arr)); - uint32_t c = safe_downcast(val_at(-3, arr)); - uint32_t n = safe_downcast(val_at(-4, arr)); - - return {w, h, c, n}; -} - -/* - * Given an vector of up to 4 int64_t representing the sizes of a tensor, - * constructs an ivec4 containing those elements in reverse order. 
- */ -inline ivec4 make_whcn_ivec4(const std::vector& arr) { - int32_t w = val_at(-1, arr); - int32_t h = val_at(-2, arr); - int32_t c = val_at(-3, arr); - int32_t n = val_at(-4, arr); - - return {w, h, c, n}; -} - -/* - * Wrapper around std::accumulate that accumulates values of a container of - * integral types into int64_t. Taken from `multiply_integers` in - * - */ -template < - typename C, - std::enable_if_t::value, int> = 0> -inline int64_t multiply_integers(const C& container) { - return std::accumulate( - container.begin(), - container.end(), - static_cast(1), - std::multiplies<>()); -} - -/* - * Product of integer elements referred to by iterators; accumulates into the - * int64_t datatype. Taken from `multiply_integers` in - */ -template < - typename Iter, - std::enable_if_t< - std::is_integral< - typename std::iterator_traits::value_type>::value, - int> = 0> -inline int64_t multiply_integers(Iter begin, Iter end) { - // std::accumulate infers return type from `init` type, so if the `init` type - // is not large enough to hold the result, computation can overflow. We use - // `int64_t` here to avoid this. - return std::accumulate( - begin, end, static_cast(1), std::multiplies<>()); -} - -class WorkgroupSize final { - uint32_t val; - - public: - explicit WorkgroupSize() : val(0) {} - explicit WorkgroupSize(const uint32_t x, const uint32_t y, const uint32_t z) { - // shift numbers by multiple of 11 bits, since each local workgroup axis can - // be 1024 at most and which is 0x400. only z axis can't store 1024, because - // it would overflow uint32_t storage. - if (z == 1024) { - throw std::runtime_error( - "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); - } - val = x | (y << 11) | (z << 22); - } - - explicit WorkgroupSize(const uvec3& vec) { - // shift numbers by multiple of 11 bits, since each local workgroup axis can - // be 1024 at most and which is 0x400. only z axis can't store 1024, because - // it would overflow uint32_t storage. - if (vec[2u] == 1024) { - throw std::runtime_error( - "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); - } - val = vec[0u] | (vec[1u] << 11) | (vec[2u] << 22); - } - - explicit inline operator uvec3() const { - return { - val & 0x7ffu, - (val >> 11) & 0x7ffu, - (val >> 22), - }; - } - - explicit inline operator uint32_t() const { - return val; - } - - inline constexpr uint32_t operator[](const int idx) const { - return (val >> (11 * idx)) & 0x7ffu; - } - - // Equality operator - bool operator==(const WorkgroupSize& other) const { - return val == other.val; - } - - // Inequality operator (optional, for completeness) - bool operator!=(const WorkgroupSize& other) const { - return !(*this == other); - } -}; - -} // namespace utils -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp deleted file mode 100644 index 0e87dde1922..00000000000 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
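For clarity, the bit layout used by `WorkgroupSize` above: each of the three local workgroup dimensions is stored in 11 bits of a single `uint32_t` (x in bits 0-10, y in bits 11-21, z in bits 22-31). A z value of 1024 sets bit 10 of its field, which after the 22-bit shift becomes bit 32 and no longer fits in a `uint32_t`, hence the explicit rejection. A small sketch:

```
// Illustrative sketch, not part of the deleted sources.
using vkcompute::utils::WorkgroupSize;

const WorkgroupSize wgs(64u, 4u, 1u);
const uint32_t packed = static_cast<uint32_t>(wgs); // 64 | (4 << 11) | (1 << 22)
const uint32_t x = wgs[0]; // 64
const uint32_t y = wgs[1]; // 4
const uint32_t z = wgs[2]; // 1
```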
- */ - -// @lint-ignore-every CLANGTIDY clang-diagnostic-missing-field-initializers - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -namespace { - -void find_compute_queues( - const PhysicalDevice& physical_device, - const uint32_t num_queues_to_create, - std::vector& queue_create_infos, - std::vector>& queues_to_get) { - queue_create_infos.reserve(num_queues_to_create); - queues_to_get.reserve(num_queues_to_create); - - uint32_t remaining_queues = num_queues_to_create; - for (uint32_t family_i = 0; family_i < physical_device.queue_families.size(); - ++family_i) { - const VkQueueFamilyProperties& queue_properties = - physical_device.queue_families.at(family_i); - // Check if this family has compute capability - if (queue_properties.queueFlags & VK_QUEUE_COMPUTE_BIT) { - const uint32_t queues_to_init = - std::min(remaining_queues, queue_properties.queueCount); - - const std::vector queue_priorities(queues_to_init, 1.0f); - queue_create_infos.push_back({ - VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - family_i, // queueFamilyIndex - queues_to_init, // queueCount - queue_priorities.data(), // pQueuePriorities - }); - - for (size_t queue_i = 0; queue_i < queues_to_init; ++queue_i) { - // Use this to get the queue handle once device is created - queues_to_get.emplace_back(family_i, queue_i); - } - remaining_queues -= queues_to_init; - } - if (remaining_queues == 0) { - break; - } - } -} - -void populate_queue_info( - const PhysicalDevice& physical_device, - VkDevice logical_device, - const std::vector>& queues_to_get, - std::vector& queues, - std::vector& queue_usage) { - queues.reserve(queues_to_get.size()); - queue_usage.reserve(queues_to_get.size()); - - // Obtain handles for the created queues and initialize queue usage heuristic - - for (const std::pair& queue_idx : queues_to_get) { - VkQueue queue_handle = VK_NULL_HANDLE; - VkQueueFlags flags = - physical_device.queue_families.at(queue_idx.first).queueFlags; - vkGetDeviceQueue( - logical_device, queue_idx.first, queue_idx.second, &queue_handle); - queues.push_back({queue_idx.first, queue_idx.second, flags, queue_handle}); - // Initial usage value - queue_usage.push_back(0); - } -} - -VkDevice create_logical_device( - const PhysicalDevice& physical_device, - const uint32_t num_queues_to_create, - std::vector& queues, - std::vector& queue_usage) { - // Find compute queues up to the requested number of queues - - std::vector queue_create_infos; - std::vector> queues_to_get; - find_compute_queues( - physical_device, num_queues_to_create, queue_create_infos, queues_to_get); - - // Create the VkDevice - std::vector requested_device_extensions{ -#ifdef VK_KHR_portability_subset - VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, -#endif /* VK_KHR_portability_subset */ -#ifdef VK_ANDROID_external_memory_android_hardware_buffer - VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME, -#endif /* VK_ANDROID_external_memory_android_hardware_buffer */ -#ifdef VK_KHR_16bit_storage - VK_KHR_16BIT_STORAGE_EXTENSION_NAME, -#endif /* VK_KHR_16bit_storage */ -#ifdef VK_KHR_8bit_storage - VK_KHR_8BIT_STORAGE_EXTENSION_NAME, -#endif /* VK_KHR_8bit_storage */ -#ifdef VK_KHR_shader_float16_int8 - VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, -#endif /* VK_KHR_shader_float16_int8 */ -#ifdef VK_KHR_shader_integer_dot_product - VK_KHR_SHADER_INTEGER_DOT_PRODUCT_EXTENSION_NAME, -#endif /* VK_KHR_shader_integer_dot_product */ -#if defined(VK_KHR_pipeline_executable_properties) && 
defined(VULKAN_DEBUG) - VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME, -#endif /* VK_KHR_pipeline_executable_properties */ - }; - - std::vector enabled_device_extensions; - find_requested_device_extensions( - physical_device.handle, - enabled_device_extensions, - requested_device_extensions); - - VkDeviceCreateInfo device_create_info{ - VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - static_cast(queue_create_infos.size()), // queueCreateInfoCount - queue_create_infos.data(), // pQueueCreateInfos - 0u, // enabledLayerCount - nullptr, // ppEnabledLayerNames - static_cast( - enabled_device_extensions.size()), // enabledExtensionCount - enabled_device_extensions.data(), // ppEnabledExtensionNames - nullptr, // pEnabledFeatures - }; - - void* extension_list_top = nullptr; - -#ifdef VK_KHR_16bit_storage - VkPhysicalDevice16BitStorageFeatures shader_16bit_storage{ - physical_device.shader_16bit_storage}; - - shader_16bit_storage.pNext = extension_list_top; - extension_list_top = &shader_16bit_storage; -#endif /* VK_KHR_16bit_storage */ - -#ifdef VK_KHR_8bit_storage - VkPhysicalDevice8BitStorageFeatures shader_8bit_storage{ - physical_device.shader_8bit_storage}; - - shader_8bit_storage.pNext = extension_list_top; - extension_list_top = &shader_8bit_storage; -#endif /* VK_KHR_8bit_storage */ - -#ifdef VK_KHR_shader_float16_int8 - VkPhysicalDeviceShaderFloat16Int8Features shader_float16_int8_types{ - physical_device.shader_float16_int8_types}; - - shader_float16_int8_types.pNext = extension_list_top; - extension_list_top = &shader_float16_int8_types; -#endif /* VK_KHR_shader_float16_int8 */ - -#ifdef VK_KHR_shader_integer_dot_product - VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR - shader_int_dot_product_features{ - physical_device.shader_int_dot_product_features}; - shader_int_dot_product_features.pNext = extension_list_top; - extension_list_top = &shader_int_dot_product_features; -#endif /* VK_KHR_shader_integer_dot_product */ - - device_create_info.pNext = extension_list_top; - - VkDevice handle = nullptr; - VK_CHECK(vkCreateDevice( - physical_device.handle, &device_create_info, nullptr, &handle)); - -#ifdef USE_VULKAN_VOLK - volkLoadDevice(handle); -#endif /* USE_VULKAN_VOLK */ - - populate_queue_info( - physical_device, handle, queues_to_get, queues, queue_usage); - - return handle; -} - -bool test_linear_tiling_3d_image_support(VkDevice device) { - // Test creating a 3D image with linear tiling to see if it is supported. - // According to the Vulkan spec, linear tiling may not be supported for 3D - // images. 
- VkExtent3D image_extents{1u, 1u, 1u}; - const VkImageCreateInfo image_create_info{ - VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_IMAGE_TYPE_3D, // imageType - VK_FORMAT_R32G32B32A32_SFLOAT, // format - image_extents, // extents - 1u, // mipLevels - 1u, // arrayLayers - VK_SAMPLE_COUNT_1_BIT, // samples - VK_IMAGE_TILING_LINEAR, // tiling - VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - VK_IMAGE_LAYOUT_UNDEFINED, // initialLayout - }; - VkImage image = VK_NULL_HANDLE; - VkResult res = vkCreateImage(device, &image_create_info, nullptr, &image); - - if (res == VK_SUCCESS) { - vkDestroyImage(device, image, nullptr); - } - - return res == VK_SUCCESS; -} - -} // namespace - -// -// Adapter -// - -Adapter::Adapter( - VkInstance instance, - PhysicalDevice physical_device, - const uint32_t num_queues, - const std::string& cache_data_path) - : queue_usage_mutex_{}, - physical_device_(std::move(physical_device)), - queues_{}, - queue_usage_{}, - queue_mutexes_{}, - instance_(instance), - device_(create_logical_device( - physical_device_, - num_queues, - queues_, - queue_usage_)), - shader_layout_cache_(device_.handle), - shader_cache_(device_.handle), - pipeline_layout_cache_(device_.handle), - compute_pipeline_cache_(device_.handle, cache_data_path), - sampler_cache_(device_.handle), - vma_(instance_, physical_device_.handle, device_.handle), - linear_tiling_3d_enabled_{ - test_linear_tiling_3d_image_support(device_.handle)}, - owns_device_{true} {} - -Adapter::Adapter( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice logical_device, - const uint32_t num_queues, - const std::string& cache_data_path) - : queue_usage_mutex_{}, - physical_device_(physical_device), - queues_{}, - queue_usage_{}, - queue_mutexes_{}, - instance_(instance), - device_(logical_device), - shader_layout_cache_(device_.handle), - shader_cache_(device_.handle), - pipeline_layout_cache_(device_.handle), - compute_pipeline_cache_(device_.handle, cache_data_path), - sampler_cache_(device_.handle), - vma_(instance_, physical_device_.handle, device_.handle), - linear_tiling_3d_enabled_{ - test_linear_tiling_3d_image_support(device_.handle)}, - owns_device_{false} { - std::vector queue_create_infos; - std::vector> queues_to_get; - find_compute_queues( - physical_device_, num_queues, queue_create_infos, queues_to_get); - populate_queue_info( - physical_device_, device_.handle, queues_to_get, queues_, queue_usage_); -} - -Adapter::~Adapter() { - if (!owns_device_) { - device_.handle = VK_NULL_HANDLE; - } -} - -Adapter::Queue Adapter::request_queue() { - // Lock the mutex as multiple threads can request a queue at the same time - std::lock_guard lock(queue_usage_mutex_); - - uint32_t min_usage = UINT32_MAX; - uint32_t min_used_i = 0; - for (size_t i = 0; i < queues_.size(); ++i) { - if (queue_usage_[i] < min_usage) { - min_used_i = i; - min_usage = queue_usage_[i]; - } - } - queue_usage_[min_used_i] += 1; - - return queues_[min_used_i]; -} - -void Adapter::return_queue(Adapter::Queue& compute_queue) { - for (size_t i = 0; i < queues_.size(); ++i) { - if ((queues_[i].family_index == compute_queue.family_index) && - (queues_[i].queue_index == compute_queue.queue_index)) { - std::lock_guard lock(queue_usage_mutex_); - queue_usage_[i] -= 1; - break; - } - } -} - -void Adapter::submit_cmd( - const Adapter::Queue& device_queue, - VkCommandBuffer cmd, - 
VkFence fence, - VkSemaphore wait_semaphore, - VkSemaphore signal_semaphore) { - const VkPipelineStageFlags flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - const bool set_wait_semaphore = wait_semaphore != VK_NULL_HANDLE; - const bool set_signal_semaphore = signal_semaphore != VK_NULL_HANDLE; - const VkSubmitInfo submit_info{ - VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType - nullptr, // pNext - set_wait_semaphore ? 1u : 0u, // waitSemaphoreCount - set_wait_semaphore ? &wait_semaphore : nullptr, // pWaitSemaphores - &flags, // pWaitDstStageMask - 1u, // commandBufferCount - &cmd, // pCommandBuffers - set_signal_semaphore ? 1u : 0u, // signalSemaphoreCount - set_signal_semaphore ? &signal_semaphore : nullptr, // pSignalSemaphores - }; - - std::lock_guard queue_lock( - queue_mutexes_[device_queue.queue_index % NUM_QUEUE_MUTEXES]); - - VK_CHECK(vkQueueSubmit(device_queue.handle, 1u, &submit_info, fence)); -} - -std::string Adapter::stringize() const { - std::stringstream ss; - - VkPhysicalDeviceProperties properties = physical_device_.properties; - uint32_t v_major = VK_VERSION_MAJOR(properties.apiVersion); - uint32_t v_minor = VK_VERSION_MINOR(properties.apiVersion); - std::string device_type = get_device_type_str(properties.deviceType); - VkPhysicalDeviceLimits limits = properties.limits; - - ss << "{" << std::endl; - ss << " Physical Device Info {" << std::endl; - ss << " apiVersion: " << v_major << "." << v_minor << std::endl; - ss << " driverversion: " << properties.driverVersion << std::endl; - ss << " deviceType: " << device_type << std::endl; - ss << " deviceName: " << properties.deviceName << std::endl; - -#define PRINT_BOOL(value, name) \ - ss << " " << std::left << std::setw(36) << #name << value << std::endl; - -#define PRINT_PROP(struct, name) \ - ss << " " << std::left << std::setw(36) << #name << struct.name \ - << std::endl; - -#define PRINT_PROP_VEC3(struct, name) \ - ss << " " << std::left << std::setw(36) << #name << struct.name[0] \ - << "," << struct.name[1] << "," << struct.name[2] << std::endl; - - ss << " Physical Device Limits {" << std::endl; - PRINT_PROP(limits, maxImageDimension1D); - PRINT_PROP(limits, maxImageDimension2D); - PRINT_PROP(limits, maxImageDimension3D); - PRINT_PROP(limits, maxStorageBufferRange); - PRINT_PROP(limits, maxTexelBufferElements); - PRINT_PROP(limits, maxPushConstantsSize); - PRINT_PROP(limits, maxMemoryAllocationCount); - PRINT_PROP(limits, maxSamplerAllocationCount); - PRINT_PROP(limits, maxComputeSharedMemorySize); - PRINT_PROP_VEC3(limits, maxComputeWorkGroupCount); - PRINT_PROP(limits, maxComputeWorkGroupInvocations); - PRINT_PROP_VEC3(limits, maxComputeWorkGroupSize); - ss << " }" << std::endl; - -#ifdef VK_KHR_16bit_storage - ss << " 16bit Storage Features {" << std::endl; - PRINT_PROP(physical_device_.shader_16bit_storage, storageBuffer16BitAccess); - PRINT_PROP( - physical_device_.shader_16bit_storage, - uniformAndStorageBuffer16BitAccess); - PRINT_PROP(physical_device_.shader_16bit_storage, storagePushConstant16); - PRINT_PROP(physical_device_.shader_16bit_storage, storageInputOutput16); - ss << " }" << std::endl; -#endif /* VK_KHR_16bit_storage */ - -#ifdef VK_KHR_8bit_storage - ss << " 8bit Storage Features {" << std::endl; - PRINT_PROP(physical_device_.shader_8bit_storage, storageBuffer8BitAccess); - PRINT_PROP( - physical_device_.shader_8bit_storage, uniformAndStorageBuffer8BitAccess); - PRINT_PROP(physical_device_.shader_8bit_storage, storagePushConstant8); - ss << " }" << std::endl; -#endif /* VK_KHR_8bit_storage */ - - ss << " 
Shader 16bit and 8bit Features {" << std::endl; - PRINT_BOOL(physical_device_.supports_int16_shader_types, shaderInt16) -#ifdef VK_KHR_shader_float16_int8 - PRINT_PROP(physical_device_.shader_float16_int8_types, shaderFloat16); - PRINT_PROP(physical_device_.shader_float16_int8_types, shaderInt8); -#endif /* VK_KHR_shader_float16_int8 */ - ss << " }" << std::endl; - -#ifdef VK_KHR_shader_integer_dot_product - ss << " Shader Integer Dot Product Features {" << std::endl; - PRINT_PROP( - physical_device_.shader_int_dot_product_features, - shaderIntegerDotProduct); - ss << " }" << std::endl; - - ss << " Shader Integer Dot Product Properties {" << std::endl; - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct8BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct8BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct8BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct4x8BitPackedUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct4x8BitPackedSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct4x8BitPackedMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct16BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct16BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct16BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct32BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct32BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct32BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct64BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct64BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct64BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating8BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating8BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating16BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating16BitSignedAccelerated); - PRINT_PROP( - 
physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating32BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating32BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating64BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating64BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated); - ss << " }" << std::endl; -#endif /* VK_KHR_shader_integer_dot_product */ - - const VkPhysicalDeviceMemoryProperties& mem_props = - physical_device_.memory_properties; - - ss << " }" << std::endl; - ss << " Memory Info {" << std::endl; - ss << " Memory Types [" << std::endl; - for (size_t i = 0; i < mem_props.memoryTypeCount; ++i) { - ss << " " << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] " - << get_memory_properties_str(mem_props.memoryTypes[i].propertyFlags) - << std::endl; - } - ss << " ]" << std::endl; - ss << " Memory Heaps [" << std::endl; - for (size_t i = 0; i < mem_props.memoryHeapCount; ++i) { - ss << " " << mem_props.memoryHeaps[i].size << std::endl; - } - ss << " ]" << std::endl; - ss << " }" << std::endl; - - ss << " Queue Families {" << std::endl; - for (const VkQueueFamilyProperties& queue_family_props : - physical_device_.queue_families) { - ss << " (" << queue_family_props.queueCount << " Queues) " - << get_queue_family_properties_str(queue_family_props.queueFlags) - << std::endl; - } - ss << " }" << std::endl; - ss << " VkDevice: " << device_.handle << std::endl; - ss << " Compute Queues [" << std::endl; - for (const Adapter::Queue& compute_queue : queues_) { - ss << " Family " << compute_queue.family_index << ", Queue " - << compute_queue.queue_index << ": " << compute_queue.handle - << std::endl; - ; - } - ss << " ]" << std::endl; - ss << "}"; - -#undef PRINT_PROP -#undef PRINT_PROP_VEC3 - - return ss.str(); -} - -std::ostream& operator<<(std::ostream& os, const Adapter& adapter) { - os << adapter.stringize() << std::endl; - return os; -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h deleted file mode 100644 index 6a68b487348..00000000000 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// A Vulkan Adapter represents a logical device and all its properties. It -// manages all relevant properties of the underlying physical device, a -// handle to the logical device, and a number of compute queues available to -// the device. 
It is primarily responsible for managing the VkDevice handle -// which points to the logical device object on the GPU. -// -// This class is primarily used by the Runtime class, which holds one Adapter -// instance for each physical device visible to the VkInstance. Upon -// construction, this class will populate the physical device properties, but -// will not create the logical device until specifically requested via the -// init_device() function. -// -// init_device() will create the logical device and obtain the VkDevice handle -// for it. It will also create a number of compute queues up to the amount -// requested when the Adapter instance was constructed. -// -// Contexts (which represent one thread of execution) will request a compute -// queue from an Adapter. The Adapter will then select a compute queue to -// assign to the Context, attempting to balance load between all available -// queues. This will allow different Contexts (which typically execute on -// separate threads) to run concurrently. -// - -#define NUM_QUEUE_MUTEXES 4 - -class Adapter final { - public: - explicit Adapter( - VkInstance instance, - PhysicalDevice physical_device, - const uint32_t num_queues, - const std::string& cache_data_path); - - explicit Adapter( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice logical_device, - const uint32_t num_queues, - const std::string& cache_data_path); - - Adapter(const Adapter&) = delete; - Adapter& operator=(const Adapter&) = delete; - - Adapter(Adapter&&) = delete; - Adapter& operator=(Adapter&&) = delete; - - ~Adapter(); - - struct Queue { - uint32_t family_index; - uint32_t queue_index; - VkQueueFlags capabilities; - VkQueue handle; - }; - - private: - // Use a mutex to manage queue usage info since - // it can be accessed from multiple threads - std::mutex queue_usage_mutex_; - // Physical Device Info - PhysicalDevice physical_device_; - // Queue Management - std::vector queues_; - std::vector queue_usage_; - std::array queue_mutexes_; - // Handles - VkInstance instance_; - DeviceHandle device_; - // Device-level resource caches - ShaderLayoutCache shader_layout_cache_; - ShaderCache shader_cache_; - PipelineLayoutCache pipeline_layout_cache_; - ComputePipelineCache compute_pipeline_cache_; - // Memory Management - SamplerCache sampler_cache_; - Allocator vma_; - // Miscellaneous - bool linear_tiling_3d_enabled_; - bool owns_device_; - - public: - // Physical Device metadata - - inline VkPhysicalDevice physical_handle() const { - return physical_device_.handle; - } - - inline VkDevice device_handle() const { - return device_.handle; - } - - inline bool has_unified_memory() const { - return physical_device_.has_unified_memory; - } - - inline uint32_t num_compute_queues() const { - return physical_device_.num_compute_queues; - } - - inline bool timestamp_compute_and_graphics() const { - return physical_device_.has_timestamps; - } - - inline float timestamp_period() const { - return physical_device_.timestamp_period; - } - - // Device Identity - inline const std::string& device_name() const { - return physical_device_.device_name; - } - - inline vkapi::DeviceType device_type() const { - return physical_device_.device_type; - } - - // Queue Management - - Queue request_queue(); - void return_queue(Queue&); - - // Caches - - inline ShaderLayoutCache& shader_layout_cache() { - return shader_layout_cache_; - } - - inline ShaderCache& shader_cache() { - return shader_cache_; - } - - inline PipelineLayoutCache& pipeline_layout_cache() { - return 
pipeline_layout_cache_; - } - - inline ComputePipelineCache& compute_pipeline_cache() { - return compute_pipeline_cache_; - } - - // Memory Allocation - - inline SamplerCache& sampler_cache() { - return sampler_cache_; - } - - inline Allocator& vma() { - return vma_; - } - - inline bool linear_tiling_3d_enabled() const { - return linear_tiling_3d_enabled_; - } - - // Physical Device Features - - inline bool supports_16bit_storage_buffers() { -#ifdef VK_KHR_16bit_storage - return physical_device_.shader_16bit_storage.storageBuffer16BitAccess == - VK_TRUE; -#else - return false; -#endif /* VK_KHR_16bit_storage */ - } - - inline bool supports_8bit_storage_buffers() { -#ifdef VK_KHR_8bit_storage - return physical_device_.shader_8bit_storage.storageBuffer8BitAccess == - VK_TRUE; -#else - return false; -#endif /* VK_KHR_8bit_storage */ - } - - inline bool supports_float16_shader_types() { -#ifdef VK_KHR_shader_float16_int8 - return physical_device_.shader_float16_int8_types.shaderFloat16 == VK_TRUE; -#else - return false; -#endif /* VK_KHR_shader_float16_int8 */ - } - - inline bool supports_int8_shader_types() { -#ifdef VK_KHR_shader_float16_int8 - return physical_device_.shader_float16_int8_types.shaderInt8 == VK_TRUE; -#else - return false; -#endif /* VK_KHR_shader_float16_int8 */ - } - - inline bool supports_int8_dot_product() { -#ifdef VK_KHR_shader_integer_dot_product - return physical_device_.shader_int_dot_product_features - .shaderIntegerDotProduct == VK_TRUE; -#else - return false; -#endif /* VK_KHR_shader_integer_dot_product */ - } - - inline bool supports_int16_shader_types() { - return physical_device_.supports_int16_shader_types; - } - - inline bool has_full_float16_buffers_support() { - return supports_16bit_storage_buffers() && supports_float16_shader_types(); - } - - inline bool has_full_int8_buffers_support() { - return supports_8bit_storage_buffers() && supports_int8_shader_types(); - } - - inline size_t min_ubo_alignment() const { - return physical_device_.min_ubo_alignment; - } - - inline uint32_t max_texture2d_dim() const { - return physical_device_.properties.limits.maxImageDimension2D; - } - - inline uint32_t max_texture3d_dim() const { - return physical_device_.properties.limits.maxImageDimension3D; - } - - inline uint32_t max_buffer_numel() const { - return physical_device_.properties.limits.maxStorageBufferRange; - } - - // Command Buffer Submission - - void submit_cmd( - const Queue&, - VkCommandBuffer, - VkFence fence = VK_NULL_HANDLE, - VkSemaphore wait_semaphore = VK_NULL_HANDLE, - VkSemaphore signal_semaphore = VK_NULL_HANDLE); - - std::string stringize() const; - friend std::ostream& operator<<(std::ostream&, const Adapter&); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp deleted file mode 100644 index 84e1f68dc68..00000000000 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
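A brief sketch of how a caller (e.g. a Context) uses the queue management API declared above; `adapter`, `cmd_handle`, and `fence_handle` are assumed to be in scope.

```
// Illustrative sketch, not part of the deleted sources. request_queue() picks
// the least-used compute queue so concurrent Contexts spread across queues.
vkcompute::vkapi::Adapter::Queue queue = adapter.request_queue();
adapter.submit_cmd(queue, cmd_handle, fence_handle);
// ... wait on the fence for completion ...
adapter.return_queue(queue); // decrement the usage count for this queue
```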
- */ - -#include -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// CommandBuffer -// - -CommandBuffer::CommandBuffer( - VkCommandBuffer handle, - const VkCommandBufferUsageFlags flags) - : handle_(handle), - flags_(flags), - state_(CommandBuffer::State::NEW), - bound_{} {} - -CommandBuffer::CommandBuffer(CommandBuffer&& other) noexcept - : handle_(other.handle_), - flags_(other.flags_), - state_(other.state_), - bound_(other.bound_) { - other.handle_ = VK_NULL_HANDLE; - other.bound_.reset(); -} - -CommandBuffer& CommandBuffer::operator=(CommandBuffer&& other) noexcept { - handle_ = other.handle_; - flags_ = other.flags_; - state_ = other.state_; - bound_ = other.bound_; - - other.handle_ = VK_NULL_HANDLE; - other.bound_.reset(); - other.state_ = CommandBuffer::State::INVALID; - - return *this; -} - -void CommandBuffer::begin() { - VK_CHECK_COND( - state_ == CommandBuffer::State::NEW, - "Vulkan CommandBuffer: called begin() on a command buffer whose state " - "is not NEW."); - - const VkCommandBufferBeginInfo begin_info{ - VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, - nullptr, - flags_, - nullptr, - }; - - VK_CHECK(vkBeginCommandBuffer(handle_, &begin_info)); - state_ = CommandBuffer::State::RECORDING; -} - -void CommandBuffer::end() { - VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING || - state_ == CommandBuffer::State::SUBMITTED, - "Vulkan CommandBuffer: called end() on a command buffer whose state " - "is not RECORDING or SUBMITTED."); - - if (state_ == CommandBuffer::State::RECORDING) { - VK_CHECK(vkEndCommandBuffer(handle_)); - } - state_ = CommandBuffer::State::READY; -} - -void CommandBuffer::bind_pipeline( - VkPipeline pipeline, - VkPipelineLayout pipeline_layout, - const utils::WorkgroupSize local_workgroup_size) { - VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called bind_pipeline() on a command buffer whose state " - "is not RECORDING."); - - if (pipeline != bound_.pipeline) { - vkCmdBindPipeline(handle_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); - - bound_.pipeline = pipeline; - } - - bound_.pipeline_layout = pipeline_layout; - bound_.local_workgroup_size = local_workgroup_size; - - state_ = CommandBuffer::State::PIPELINE_BOUND; -} - -void CommandBuffer::bind_descriptors(VkDescriptorSet descriptors) { - VK_CHECK_COND( - state_ == CommandBuffer::State::PIPELINE_BOUND, - "Vulkan CommandBuffer: called bind_descriptors() on a command buffer whose state " - "is not PIPELINE_BOUND."); - - if (descriptors != bound_.descriptors) { - vkCmdBindDescriptorSets( - handle_, // commandBuffer - VK_PIPELINE_BIND_POINT_COMPUTE, // pipelineBindPoint - bound_.pipeline_layout, // layout - 0u, // firstSet - 1u, // descriptorSetCount - &descriptors, // pDescriptorSets - 0u, // dynamicOffsetCount - nullptr); // pDynamicOffsets - } - - bound_.descriptors = descriptors; - - state_ = CommandBuffer::State::DESCRIPTORS_BOUND; -} - -void CommandBuffer::set_push_constants( - VkPipelineLayout pipeline_layout, - const void* push_constants_data, - uint32_t push_constants_size) { - if (push_constants_data != nullptr && push_constants_size > 0) { - vkCmdPushConstants( - handle_, - pipeline_layout, - VK_SHADER_STAGE_COMPUTE_BIT, - 0, - push_constants_size, - push_constants_data); - } -} - -void CommandBuffer::insert_barrier(PipelineBarrier& pipeline_barrier) { - VK_CHECK_COND( - state_ == CommandBuffer::State::DESCRIPTORS_BOUND || - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called insert_barrier() on a command 
buffer whose state " - "is not DESCRIPTORS_BOUND or RECORDING."); - - if (pipeline_barrier) { - if (!pipeline_barrier.buffer_barrier_handles.empty()) { - pipeline_barrier.buffer_barrier_handles.clear(); - } - for (const BufferMemoryBarrier& memory_barrier : pipeline_barrier.buffers) { - pipeline_barrier.buffer_barrier_handles.push_back(memory_barrier.handle); - } - - if (!pipeline_barrier.image_barrier_handles.empty()) { - pipeline_barrier.image_barrier_handles.clear(); - } - for (const ImageMemoryBarrier& memory_barrier : pipeline_barrier.images) { - pipeline_barrier.image_barrier_handles.push_back(memory_barrier.handle); - } - vkCmdPipelineBarrier( - handle_, // commandBuffer - pipeline_barrier.stage.src, // srcStageMask - pipeline_barrier.stage.dst, // dstStageMask - 0u, // dependencyFlags - 0u, // memoryBarrierCount - nullptr, // pMemoryBarriers - pipeline_barrier.buffers.size(), // bufferMemoryBarrierCount - !pipeline_barrier.buffers.empty() - ? pipeline_barrier.buffer_barrier_handles.data() - : nullptr, // pMemoryBarriers - pipeline_barrier.images.size(), // imageMemoryBarrierCount - !pipeline_barrier.images.empty() - ? pipeline_barrier.image_barrier_handles.data() - : nullptr); // pImageMemoryBarriers - } - - state_ = CommandBuffer::State::BARRIERS_INSERTED; -} - -void CommandBuffer::dispatch(const utils::uvec3& global_workgroup_size) { - VK_CHECK_COND( - state_ == CommandBuffer::State::BARRIERS_INSERTED, - "Vulkan CommandBuffer: called dispatch() on a command buffer whose state " - "is not BARRIERS_INSERTED."); - - vkCmdDispatch( - handle_, - utils::div_up(global_workgroup_size[0u], bound_.local_workgroup_size[0u]), - utils::div_up(global_workgroup_size[1u], bound_.local_workgroup_size[1u]), - utils::div_up( - global_workgroup_size[2u], bound_.local_workgroup_size[2u])); - - state_ = CommandBuffer::State::RECORDING; -} - -void CommandBuffer::blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst) { - VK_CHECK_COND( - state_ == CommandBuffer::State::BARRIERS_INSERTED, - "Vulkan CommandBuffer: called blit() on a command buffer whose state " - "is not BARRIERS_INSERTED."); - - auto src_extents = src.extents(); - auto dst_extents = dst.extents(); - - VkImageBlit blit{}; - blit.srcOffsets[0] = {0, 0, 0}, - blit.srcOffsets[1] = - {static_cast(src_extents.width), - static_cast(src_extents.height), - static_cast(src_extents.depth)}, - blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - blit.srcSubresource.mipLevel = 0, blit.srcSubresource.baseArrayLayer = 0, - blit.srcSubresource.layerCount = 1, blit.dstOffsets[0] = {0, 0, 0}, - blit.dstOffsets[1] = - {static_cast(dst_extents.width), - static_cast(dst_extents.height), - static_cast(dst_extents.depth)}, - blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - blit.dstSubresource.mipLevel = 0, blit.dstSubresource.baseArrayLayer = 0, - blit.dstSubresource.layerCount = 1, - - vkCmdBlitImage( - handle_, - src.handle(), - src.layout(), - dst.handle(), - dst.layout(), - 1, - &blit, - VK_FILTER_NEAREST); - - state_ = CommandBuffer::State::RECORDING; -} - -void CommandBuffer::write_timestamp(VkQueryPool querypool, const uint32_t idx) - const { - VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called write_timestamp() on a command buffer whose state " - "is not RECORDING."); - - vkCmdWriteTimestamp( - handle_, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, querypool, idx); -} - -void CommandBuffer::reset_querypool( - VkQueryPool querypool, - const uint32_t first_idx, - const uint32_t count) const { - 
VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called reset_querypool() on a command buffer whose state " - "is not RECORDING."); - - vkCmdResetQueryPool(handle_, querypool, first_idx, count); -} - -VkCommandBuffer CommandBuffer::get_submit_handle(const bool final_use) { - VK_CHECK_COND( - state_ == CommandBuffer::State::READY, - "Vulkan CommandBuffer: called begin() on a command buffer whose state " - "is not READY."); - - VkCommandBuffer handle = handle_; - - if (!is_reusable() || final_use) { - invalidate(); - } - state_ = CommandBuffer::State::SUBMITTED; - - return handle; -} - -// -// CommandPool -// - -CommandPool::CommandPool( - VkDevice device, - const uint32_t queue_family_idx, - const CommandPoolConfig& config) - : device_(device), - queue_family_idx_(queue_family_idx), - pool_(VK_NULL_HANDLE), - config_(config), - mutex_{}, - buffers_{}, - in_use_(0u) { - const VkCommandPoolCreateInfo create_info{ - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, - nullptr, - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, - queue_family_idx_, - }; - - VK_CHECK(vkCreateCommandPool(device_, &create_info, nullptr, &pool_)); - - // Pre-allocate some command buffers - allocate_new_batch(config_.cmd_pool_initial_size); -} - -CommandPool::~CommandPool() { - if (pool_ == VK_NULL_HANDLE) { - return; - } - vkDestroyCommandPool(device_, pool_, nullptr); -} - -CommandBuffer CommandPool::get_new_cmd(bool reusable) { - std::lock_guard lock(mutex_); - - // No-ops if there are command buffers available - allocate_new_batch(config_.cmd_pool_batch_size); - - VkCommandBuffer handle = buffers_[in_use_]; - - VkCommandBufferUsageFlags cmd_flags = 0u; - if (!reusable) { - cmd_flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - } - - in_use_++; - return CommandBuffer(handle, cmd_flags); -} - -void CommandPool::flush() { - std::lock_guard lock(mutex_); - VK_CHECK(vkResetCommandPool(device_, pool_, 0u)); - in_use_ = 0u; -} - -void CommandPool::allocate_new_batch(const uint32_t count) { - // No-ops if there are still command buffers available - if (in_use_ < buffers_.size()) { - return; - } - - buffers_.resize(buffers_.size() + count); - - const VkCommandBufferAllocateInfo allocate_info{ - VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // sType - nullptr, // pNext - pool_, // commandPool - VK_COMMAND_BUFFER_LEVEL_PRIMARY, // level - count, // commandBufferCount - }; - - VK_CHECK(vkAllocateCommandBuffers( - device_, &allocate_info, buffers_.data() + in_use_)); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h deleted file mode 100644 index ff1e5934a5c..00000000000 --- a/backends/vulkan/runtime/vk_api/Command.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
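The CommandBuffer and CommandPool implementation above enforces a strict state machine (NEW → RECORDING → PIPELINE_BOUND → DESCRIPTORS_BOUND → BARRIERS_INSERTED → back to RECORDING, then READY and SUBMITTED). A minimal sketch of driving one compute dispatch through that sequence follows; the pool, pipeline, layout, descriptor set, workgroup sizes, and include path are assumptions for illustration only, not part of the deleted sources.

```
// Illustrative sketch: one compute dispatch recorded through the CommandBuffer
// state machine. All handles are assumed to have been created elsewhere; the
// include path mirrors the file layout shown in this diff.
#include <executorch/backends/vulkan/runtime/vk_api/Command.h>

using namespace vkcompute;

void record_one_dispatch(
    vkapi::CommandPool& pool,
    VkPipeline pipeline,
    VkPipelineLayout pipeline_layout,
    VkDescriptorSet descriptor_set) {
  vkapi::CommandBuffer cmd = pool.get_new_cmd(); // one-time-submit by default
  cmd.begin();                                   // NEW -> RECORDING

  cmd.bind_pipeline(
      pipeline, pipeline_layout, utils::WorkgroupSize{8u, 8u, 1u});
  cmd.bind_descriptors(descriptor_set);

  // Even with nothing to synchronize, an (empty) barrier advances the state
  // machine to BARRIERS_INSERTED, which dispatch() requires.
  vkapi::PipelineBarrier barrier{};
  cmd.insert_barrier(barrier);

  cmd.dispatch(utils::uvec3{64u, 64u, 1u}); // divided by the local workgroup size
  cmd.end();                                // RECORDING -> READY

  // READY -> SUBMITTED; the returned VkCommandBuffer can be queue-submitted.
  VkCommandBuffer handle = cmd.get_submit_handle();
  (void)handle;
}
```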
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -class CommandBuffer final { - public: - explicit CommandBuffer(VkCommandBuffer, const VkCommandBufferUsageFlags); - - CommandBuffer(const CommandBuffer&) = delete; - CommandBuffer& operator=(const CommandBuffer&) = delete; - - CommandBuffer(CommandBuffer&&) noexcept; - CommandBuffer& operator=(CommandBuffer&&) noexcept; - - ~CommandBuffer() = default; - - // The lifecycle of a command buffer is as follows: - enum State { - INVALID, // Used to indicate the command buffer is moved from - NEW, // Set during constructor - RECORDING, // Set during call to begin() and dispatch() - PIPELINE_BOUND, // Set during call to bind_pipeline() - DESCRIPTORS_BOUND, // Set during call to bind_descriptors() - BARRIERS_INSERTED, // Set during call to insert_barrier() - READY, // Set during call to end() - SUBMITTED, // Set during call to get_submit_handle() - }; - - struct Bound { - VkPipeline pipeline; - VkPipelineLayout pipeline_layout; - utils::WorkgroupSize local_workgroup_size; - VkDescriptorSet descriptors; - - explicit Bound() - : pipeline{VK_NULL_HANDLE}, - pipeline_layout{VK_NULL_HANDLE}, - local_workgroup_size{0u, 0u, 0u}, - descriptors{VK_NULL_HANDLE} {} - - inline void reset() { - pipeline = VK_NULL_HANDLE; - pipeline_layout = VK_NULL_HANDLE; - local_workgroup_size = utils::WorkgroupSize{0u, 0u, 0u}; - descriptors = VK_NULL_HANDLE; - } - }; - - private: - VkCommandBuffer handle_; - VkCommandBufferUsageFlags flags_; - State state_; - Bound bound_; - - public: - inline bool is_reusable() { - return !(flags_ & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); - } - - inline void invalidate() { - handle_ = VK_NULL_HANDLE; - bound_.reset(); - } - - void begin(); - void end(); - - void bind_pipeline(VkPipeline, VkPipelineLayout, const utils::WorkgroupSize); - void bind_descriptors(VkDescriptorSet); - void set_push_constants(VkPipelineLayout, const void*, uint32_t); - - void insert_barrier(PipelineBarrier& pipeline_barrier); - void dispatch(const utils::uvec3&); - void blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst); - - void write_timestamp(VkQueryPool, const uint32_t) const; - void reset_querypool(VkQueryPool, const uint32_t, const uint32_t) const; - - VkCommandBuffer get_submit_handle(const bool final_use = false); - - inline operator bool() const { - return handle_ != VK_NULL_HANDLE; - } -}; - -struct CommandPoolConfig final { - uint32_t cmd_pool_initial_size; - uint32_t cmd_pool_batch_size; -}; - -class CommandPool final { - public: - explicit CommandPool(VkDevice, const uint32_t, const CommandPoolConfig&); - - CommandPool(const CommandPool&) = delete; - CommandPool& operator=(const CommandPool&) = delete; - - CommandPool(CommandPool&&) = delete; - CommandPool& operator=(CommandPool&&) = delete; - - ~CommandPool(); - - private: - VkDevice device_; - uint32_t queue_family_idx_; - VkCommandPool pool_; - CommandPoolConfig config_; - // New Buffers - std::mutex mutex_; - std::vector buffers_; - size_t in_use_; - - public: - CommandBuffer get_new_cmd(bool reusable = false); - - void flush(); - - private: - void allocate_new_batch(const uint32_t); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp deleted file mode 100644 index 9e8394ffa9c..00000000000 --- 
a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -// -// BufferBindInfo -// - -BufferBindInfo::BufferBindInfo() - : handle(VK_NULL_HANDLE), offset(0u), range(0u) {} - -BufferBindInfo::BufferBindInfo( - const VulkanBuffer& buffer_p, - const uint32_t offset_p) - : handle(buffer_p.handle()), - offset(buffer_p.mem_offset() + offset_p), - range(buffer_p.mem_range() - offset_p) {} - -BufferBindInfo::BufferBindInfo( - const VulkanBuffer& buffer_p, - const size_t offset_p, - const size_t range_p) - : handle(buffer_p.handle()), - offset(buffer_p.mem_offset() + offset_p), - range(range_p) { - VK_CHECK_COND(range_p <= (buffer_p.mem_range() - offset_p)); -} - -// -// ParamsBindList -// - -ParamsBindList::ParamsBindList( - std::initializer_list init_list) { - bind_infos.resize(init_list.size()); - std::copy(init_list.begin(), init_list.end(), bind_infos.begin()); -} - -void ParamsBindList::append(const BufferBindInfo& bind_info) { - bind_infos.emplace_back(bind_info); -} - -void ParamsBindList::append(const ParamsBindList& other) { - bind_infos.insert( - bind_infos.end(), other.bind_infos.begin(), other.bind_infos.end()); -} - -// -// DescriptorSet -// - -DescriptorSet::DescriptorSet( - VkDevice device, - VkDescriptorSet handle, - ShaderLayout::Signature shader_layout_signature) - : device_(device), - handle_(handle), - shader_layout_signature_(std::move(shader_layout_signature)), - bindings_{} {} - -DescriptorSet::DescriptorSet(DescriptorSet&& other) noexcept - : device_(other.device_), - handle_(other.handle_), - shader_layout_signature_(std::move(other.shader_layout_signature_)), - bindings_(std::move(other.bindings_)) { - other.handle_ = VK_NULL_HANDLE; -} - -DescriptorSet& DescriptorSet::operator=(DescriptorSet&& other) noexcept { - device_ = other.device_; - handle_ = other.handle_; - shader_layout_signature_ = std::move(other.shader_layout_signature_); - bindings_ = std::move(other.bindings_); - - other.handle_ = VK_NULL_HANDLE; - - return *this; -} - -DescriptorSet& DescriptorSet::bind( - const uint32_t idx, - const VulkanBuffer& buffer) { - VK_CHECK_COND( - buffer.has_memory(), - "Buffer must be bound to memory for it to be usable"); - - DescriptorSet::ResourceBinding binder{}; - binder.binding_idx = idx; // binding_idx - binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type - binder.is_image = false; // is_image - binder.resource_info.buffer_info.buffer = buffer.handle(); // buffer - binder.resource_info.buffer_info.offset = buffer.mem_offset(); // offset - binder.resource_info.buffer_info.range = buffer.mem_range(); // range - add_binding(binder); - - return *this; -} - -DescriptorSet& DescriptorSet::bind( - const uint32_t idx, - const BufferBindInfo& bind_info) { - DescriptorSet::ResourceBinding binder{}; - binder.binding_idx = idx; // binding_idx - binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type - binder.is_image = false; // is_image - binder.resource_info.buffer_info.buffer = bind_info.handle; // buffer - binder.resource_info.buffer_info.offset = bind_info.offset; // offset - binder.resource_info.buffer_info.range = bind_info.range; // range - add_binding(binder); - - return *this; -} - 
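DescriptorSet::bind() records one resource per binding slot and can be chained; the slot index must line up with the shader layout signature so the correct VkDescriptorType is written. A brief sketch using the buffer overload above together with the image overload and get_bind_handle() shown next; the pool, layout, signature, resources, and recording CommandBuffer `cmd` are assumed to already exist.

```
// Illustrative sketch: fill a two-binding descriptor set whose signature is
// {STORAGE_IMAGE, UNIFORM_BUFFER}. `descriptor_pool`, `set_layout`,
// `signature`, `out_image` and `params_ubo` are assumed to be valid and
// already bound to memory.
vkapi::DescriptorSet desc_set =
    descriptor_pool.get_descriptor_set(set_layout, signature);

desc_set.bind(0u, out_image)    // storage image; layout forced to GENERAL
        .bind(1u, params_ubo);  // uniform buffer; uses its full offset/range

// Flushes the recorded bindings with vkUpdateDescriptorSets and returns the
// raw handle expected by CommandBuffer::bind_descriptors().
cmd.bind_descriptors(desc_set.get_bind_handle());
```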
-DescriptorSet& DescriptorSet::bind( - const uint32_t idx, - const VulkanImage& image) { - // If the image does not have an allocator attached, then it is externally - // allocated; assume it is already bound to memory. Otherwise, it must be - // bound to a VmaAllocation to be used. - VK_CHECK_COND( - image.vma_allocator() == VK_NULL_HANDLE || image.has_memory(), - "Image must be bound to memory for it to be usable"); - - VkImageLayout binding_layout = image.layout(); - if (shader_layout_signature_[idx] == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) { - binding_layout = VK_IMAGE_LAYOUT_GENERAL; - } - - DescriptorSet::ResourceBinding binder{}; - binder.binding_idx = idx; // binding_idx - binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type - binder.is_image = true; // is_image - binder.resource_info.image_info.sampler = image.sampler(); // buffer - binder.resource_info.image_info.imageView = image.image_view(); // imageView - binder.resource_info.image_info.imageLayout = binding_layout; // imageLayout - add_binding(binder); - - return *this; -} - -VkDescriptorSet DescriptorSet::get_bind_handle() const { - std::vector write_descriptor_sets; - write_descriptor_sets.reserve(bindings_.size()); - - for (const ResourceBinding& binding : bindings_) { - VkWriteDescriptorSet write{ - VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // sType - nullptr, // pNext - handle_, // dstSet - binding.binding_idx, // dstBinding - 0u, // dstArrayElement - 1u, // descriptorCount - binding.descriptor_type, // descriptorType - nullptr, // pImageInfo - nullptr, // pBufferInfo - nullptr, // pTexelBufferView - }; - - if (binding.is_image) { - write.pImageInfo = &binding.resource_info.image_info; - } else { - write.pBufferInfo = &binding.resource_info.buffer_info; - } - - write_descriptor_sets.emplace_back(write); - } - - vkUpdateDescriptorSets( - device_, - write_descriptor_sets.size(), - write_descriptor_sets.data(), - 0u, - nullptr); - - return handle_; -} - -void DescriptorSet::add_binding(const ResourceBinding& binding) { - const auto bindings_itr = std::find_if( - bindings_.begin(), - bindings_.end(), - [binding_idx = binding.binding_idx](const ResourceBinding& other) { - return other.binding_idx == binding_idx; - }); - - if (bindings_.end() == bindings_itr) { - bindings_.emplace_back(binding); - } else { - *bindings_itr = binding; - } -} - -// -// DescriptorSetPile -// - -DescriptorSetPile::DescriptorSetPile( - const uint32_t pile_size, - VkDescriptorSetLayout descriptor_set_layout, - VkDevice device, - VkDescriptorPool descriptor_pool) - : pile_size_{pile_size}, - set_layout_{descriptor_set_layout}, - device_{device}, - pool_{descriptor_pool}, - descriptors_{}, - in_use_(0u) { - descriptors_.resize(pile_size_); - allocate_new_batch(); -} - -VkDescriptorSet DescriptorSetPile::get_descriptor_set() { - // No-ops if there are descriptor sets available - allocate_new_batch(); - - VkDescriptorSet handle = descriptors_[in_use_]; - descriptors_[in_use_] = VK_NULL_HANDLE; - - in_use_++; - return handle; -} - -void DescriptorSetPile::allocate_new_batch() { - // No-ops if there are still descriptor sets available - if (in_use_ < descriptors_.size() && - descriptors_[in_use_] != VK_NULL_HANDLE) { - return; - } - - std::vector layouts(descriptors_.size()); - fill(layouts.begin(), layouts.end(), set_layout_); - - const VkDescriptorSetAllocateInfo allocate_info{ - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, // sType - nullptr, // pNext - pool_, // descriptorPool - utils::safe_downcast(layouts.size()), // 
descriptorSetCount - layouts.data(), // pSetLayouts - }; - - VK_CHECK( - vkAllocateDescriptorSets(device_, &allocate_info, descriptors_.data())); - - in_use_ = 0u; -} - -// -// DescriptorPool -// - -DescriptorPool::DescriptorPool( - VkDevice device, - const DescriptorPoolConfig& config) - : device_(device), - pool_(VK_NULL_HANDLE), - config_(config), - mutex_{}, - piles_{} { - if (config.descriptor_pool_max_sets > 0) { - init(config); - } -} - -DescriptorPool::~DescriptorPool() { - if (pool_ == VK_NULL_HANDLE) { - return; - } - vkDestroyDescriptorPool(device_, pool_, nullptr); -} - -void DescriptorPool::init(const DescriptorPoolConfig& config) { - VK_CHECK_COND( - pool_ == VK_NULL_HANDLE, - "Trying to init a DescriptorPool that has already been created!"); - - config_ = config; - - std::vector type_sizes{ - { - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - config_.descriptor_uniform_buffer_count, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - config_.descriptor_storage_buffer_count, - }, - { - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - config_.descriptor_combined_sampler_count, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - config_.descriptor_storage_buffer_count, - }, - }; - - const VkDescriptorPoolCreateInfo create_info{ - VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - config_.descriptor_pool_max_sets, // maxSets - static_cast(type_sizes.size()), // poolSizeCounts - type_sizes.data(), // pPoolSizes - }; - - VK_CHECK(vkCreateDescriptorPool(device_, &create_info, nullptr, &pool_)); -} - -DescriptorSet DescriptorPool::get_descriptor_set( - VkDescriptorSetLayout set_layout, - const ShaderLayout::Signature& signature) { - VK_CHECK_COND( - pool_ != VK_NULL_HANDLE, "DescriptorPool has not yet been initialized!"); - - auto it = piles_.find(set_layout); - if (piles_.cend() == it) { - it = piles_ - .insert({ - set_layout, - DescriptorSetPile( - config_.descriptor_pile_sizes, set_layout, device_, pool_), - }) - .first; - } - - VkDescriptorSet handle = it->second.get_descriptor_set(); - - return DescriptorSet(device_, handle, signature); -} - -void DescriptorPool::flush() { - if (pool_ != VK_NULL_HANDLE) { - VK_CHECK(vkResetDescriptorPool(device_, pool_, 0u)); - piles_.clear(); - } -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h deleted file mode 100644 index 15ea5e23e33..00000000000 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include - -#include - -namespace vkcompute { -namespace vkapi { - -/* - * Stores the binding information of a Vulkan Buffer so that the buffer can be - * bound at a later time. This struct should only be used if the buffer to be - * bound is guaranteed to be active at the time of binding. 
- */ -struct BufferBindInfo final { - VkBuffer handle; - VkDeviceSize offset; - VkDeviceSize range; - - BufferBindInfo(); - BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); - BufferBindInfo( - const VulkanBuffer& buffer_p, - const size_t offset_p, - const size_t range_p); -}; - -struct ParamsBindList final { - std::vector bind_infos; - - ParamsBindList() = default; - ParamsBindList(std::initializer_list init_list); - - void append(const BufferBindInfo& bind_info); - void append(const ParamsBindList& other); -}; - -class DescriptorSet final { - public: - explicit DescriptorSet(VkDevice, VkDescriptorSet, ShaderLayout::Signature); - - DescriptorSet(const DescriptorSet&) = delete; - DescriptorSet& operator=(const DescriptorSet&) = delete; - - DescriptorSet(DescriptorSet&&) noexcept; - DescriptorSet& operator=(DescriptorSet&&) noexcept; - - ~DescriptorSet() = default; - - struct ResourceBinding final { - uint32_t binding_idx; - VkDescriptorType descriptor_type; - bool is_image; - - union { - VkDescriptorBufferInfo buffer_info; - VkDescriptorImageInfo image_info; - } resource_info; - }; - - private: - VkDevice device_; - VkDescriptorSet handle_; - ShaderLayout::Signature shader_layout_signature_; - std::vector bindings_; - - public: - DescriptorSet& bind(const uint32_t, const BufferBindInfo&); - DescriptorSet& bind(const uint32_t, const VulkanBuffer&); - DescriptorSet& bind(const uint32_t, const VulkanImage&); - - VkDescriptorSet get_bind_handle() const; - - private: - void add_binding(const ResourceBinding& resource); -}; - -class DescriptorSetPile final { - public: - DescriptorSetPile( - const uint32_t, - VkDescriptorSetLayout, - VkDevice, - VkDescriptorPool); - - DescriptorSetPile(const DescriptorSetPile&) = delete; - DescriptorSetPile& operator=(const DescriptorSetPile&) = delete; - - DescriptorSetPile(DescriptorSetPile&&) = default; - DescriptorSetPile& operator=(DescriptorSetPile&&) = default; - - ~DescriptorSetPile() = default; - - private: - uint32_t pile_size_; - VkDescriptorSetLayout set_layout_; - VkDevice device_; - VkDescriptorPool pool_; - std::vector descriptors_; - size_t in_use_; - - public: - VkDescriptorSet get_descriptor_set(); - - private: - void allocate_new_batch(); -}; - -struct DescriptorPoolConfig final { - // Overall Pool capacity - uint32_t descriptor_pool_max_sets; - // DescriptorCounts by type - uint32_t descriptor_uniform_buffer_count; - uint32_t descriptor_storage_buffer_count; - uint32_t descriptor_combined_sampler_count; - uint32_t descriptor_storage_image_count; - // Pile size for pre-allocating descriptor sets - uint32_t descriptor_pile_sizes; -}; - -class DescriptorPool final { - public: - explicit DescriptorPool(VkDevice, const DescriptorPoolConfig&); - - DescriptorPool(const DescriptorPool&) = delete; - DescriptorPool& operator=(const DescriptorPool&) = delete; - - DescriptorPool(DescriptorPool&&) = delete; - DescriptorPool& operator=(DescriptorPool&&) = delete; - - ~DescriptorPool(); - - private: - VkDevice device_; - VkDescriptorPool pool_; - DescriptorPoolConfig config_; - // New Descriptors - std::mutex mutex_; - std::unordered_map piles_; - - public: - operator bool() const { - return (pool_ != VK_NULL_HANDLE); - } - - void init(const DescriptorPoolConfig& config); - - DescriptorSet get_descriptor_set( - VkDescriptorSetLayout handle, - const ShaderLayout::Signature& signature); - - void flush(); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Device.cpp 
b/backends/vulkan/runtime/vk_api/Device.cpp deleted file mode 100644 index a21130f1231..00000000000 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// @lint-ignore-every CLANGTIDY clang-diagnostic-missing-field-initializers - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { -namespace vkapi { - -PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) - : handle(physical_device_handle), - properties{}, - memory_properties{}, -#ifdef VK_KHR_16bit_storage - shader_16bit_storage{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES}, -#endif /* VK_KHR_16bit_storage */ -#ifdef VK_KHR_8bit_storage - shader_8bit_storage{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES}, -#endif /* VK_KHR_8bit_storage */ -#ifdef VK_KHR_shader_float16_int8 - shader_float16_int8_types{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR}, -#endif /* VK_KHR_shader_float16_int8 */ -#ifdef VK_KHR_shader_integer_dot_product - shader_int_dot_product_features{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR}, - shader_int_dot_product_properties{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR}, -#endif - queue_families{}, - num_compute_queues(0), - supports_int16_shader_types(false), - has_unified_memory(false), - has_timestamps(false), - timestamp_period(0), - min_ubo_alignment(0), - device_name{}, - device_type{DeviceType::UNKNOWN} { - // Extract physical device properties - vkGetPhysicalDeviceProperties(handle, &properties); - - // Extract fields of interest - has_timestamps = properties.limits.timestampComputeAndGraphics; - timestamp_period = properties.limits.timestampPeriod; - min_ubo_alignment = properties.limits.minUniformBufferOffsetAlignment; - - vkGetPhysicalDeviceMemoryProperties(handle, &memory_properties); - - VkPhysicalDeviceFeatures2 features2{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2}; - - // Create linked list to query availability of extensions - - void* extension_list_top = nullptr; - -#ifdef VK_KHR_16bit_storage - shader_16bit_storage.pNext = extension_list_top; - extension_list_top = &shader_16bit_storage; -#endif /* VK_KHR_16bit_storage */ - -#ifdef VK_KHR_8bit_storage - shader_8bit_storage.pNext = extension_list_top; - extension_list_top = &shader_8bit_storage; -#endif /* VK_KHR_8bit_storage */ - -#ifdef VK_KHR_shader_float16_int8 - shader_float16_int8_types.pNext = extension_list_top; - extension_list_top = &shader_float16_int8_types; -#endif /* VK_KHR_shader_float16_int8 */ - -#ifdef VK_KHR_shader_integer_dot_product - shader_int_dot_product_features.pNext = extension_list_top; - extension_list_top = &shader_int_dot_product_features; - shader_int_dot_product_properties.pNext = extension_list_top; - extension_list_top = &shader_int_dot_product_properties; -#endif /* VK_KHR_shader_integer_dot_product */ - - features2.pNext = extension_list_top; - - vkGetPhysicalDeviceFeatures2(handle, &features2); - - if (features2.features.shaderInt16 == VK_TRUE) { - supports_int16_shader_types = true; - } - - // Check if there are any memory types have both the HOST_VISIBLE and the - // DEVICE_LOCAL property flags - const VkMemoryPropertyFlags unified_memory_flags = - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT & 
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - for (size_t i = 0; i < memory_properties.memoryTypeCount; ++i) { - if (memory_properties.memoryTypes[i].propertyFlags | unified_memory_flags) { - has_unified_memory = true; - break; - } - } - - uint32_t queue_family_count = 0; - vkGetPhysicalDeviceQueueFamilyProperties( - handle, &queue_family_count, nullptr); - - queue_families.resize(queue_family_count); - vkGetPhysicalDeviceQueueFamilyProperties( - handle, &queue_family_count, queue_families.data()); - - // Find the total number of compute queues - for (const VkQueueFamilyProperties& p : queue_families) { - // Check if this family has compute capability - if (p.queueFlags & VK_QUEUE_COMPUTE_BIT) { - num_compute_queues += p.queueCount; - } - } - - // Obtain device identity metadata - device_name = std::string(properties.deviceName); - std::transform( - device_name.begin(), - device_name.end(), - device_name.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (device_name.find("adreno") != std::string::npos) { - device_type = DeviceType::ADRENO; - } else if (device_name.find("swiftshader") != std::string::npos) { - device_type = DeviceType::SWIFTSHADER; - } else if (device_name.find("nvidia") != std::string::npos) { - device_type = DeviceType::NVIDIA; - } else if (device_name.find("mali") != std::string::npos) { - device_type = DeviceType::MALI; - } -} - -// -// DeviceHandle -// - -DeviceHandle::DeviceHandle(VkDevice device) : handle(device) {} - -DeviceHandle::~DeviceHandle() { - if (handle == VK_NULL_HANDLE) { - return; - } - vkDestroyDevice(handle, nullptr); -} - -// -// Utils -// - -void find_requested_device_extensions( - VkPhysicalDevice physical_device, - std::vector& enabled_extensions, - const std::vector& requested_extensions) { - uint32_t device_extension_properties_count = 0; - VK_CHECK(vkEnumerateDeviceExtensionProperties( - physical_device, nullptr, &device_extension_properties_count, nullptr)); - std::vector device_extension_properties( - device_extension_properties_count); - VK_CHECK(vkEnumerateDeviceExtensionProperties( - physical_device, - nullptr, - &device_extension_properties_count, - device_extension_properties.data())); - - std::vector enabled_device_extensions; - - for (const auto& requested_extension : requested_extensions) { - for (const auto& extension : device_extension_properties) { - if (strcmp(requested_extension, extension.extensionName) == 0) { - enabled_extensions.push_back(requested_extension); - break; - } - } - } -} - -std::string get_device_type_str(const VkPhysicalDeviceType type) { - switch (type) { - case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU: - return "INTEGRATED_GPU"; - case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU: - return "DISCRETE_GPU"; - case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU: - return "VIRTUAL_GPU"; - case VK_PHYSICAL_DEVICE_TYPE_CPU: - return "CPU"; - default: - return "UNKNOWN"; - } -} - -std::string get_memory_properties_str(const VkMemoryPropertyFlags flags) { - std::bitset<10> values(flags); - std::stringstream ss("|"); - if (values[0]) { - ss << " DEVICE_LOCAL |"; - } - if (values[1]) { - ss << " HOST_VISIBLE |"; - } - if (values[2]) { - ss << " HOST_COHERENT |"; - } - if (values[3]) { - ss << " HOST_CACHED |"; - } - if (values[4]) { - ss << " LAZILY_ALLOCATED |"; - } - - return ss.str(); -} - -std::string get_queue_family_properties_str(const VkQueueFlags flags) { - std::bitset<10> values(flags); - std::stringstream ss("|"); - if (values[0]) { - ss << " GRAPHICS |"; - } - if (values[1]) { - ss << " COMPUTE |"; - } - if (values[2]) 
{ - ss << " TRANSFER |"; - } - - return ss.str(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h deleted file mode 100644 index f5b7154d260..00000000000 --- a/backends/vulkan/runtime/vk_api/Device.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -enum class DeviceType : uint32_t { - UNKNOWN, - NVIDIA, - MALI, - ADRENO, - SWIFTSHADER, -}; - -struct PhysicalDevice final { - // Handle - VkPhysicalDevice handle; - - // Properties obtained from Vulkan - VkPhysicalDeviceProperties properties; - VkPhysicalDeviceMemoryProperties memory_properties; - - // Additional features available from extensions -#ifdef VK_KHR_16bit_storage - VkPhysicalDevice16BitStorageFeatures shader_16bit_storage; -#endif /* VK_KHR_16bit_storage */ -#ifdef VK_KHR_8bit_storage - VkPhysicalDevice8BitStorageFeatures shader_8bit_storage; -#endif /* VK_KHR_8bit_storage */ -#ifdef VK_KHR_shader_float16_int8 - VkPhysicalDeviceShaderFloat16Int8Features shader_float16_int8_types; -#endif /* VK_KHR_shader_float16_int8 */ -#ifdef VK_KHR_shader_integer_dot_product - VkPhysicalDeviceShaderIntegerDotProductFeatures - shader_int_dot_product_features; - VkPhysicalDeviceShaderIntegerDotProductProperties - shader_int_dot_product_properties; -#endif /* VK_KHR_shader_integer_dot_product */ - - // Available GPU queues - std::vector queue_families; - - // Metadata - uint32_t num_compute_queues; - bool supports_int16_shader_types; - bool has_unified_memory; - bool has_timestamps; - float timestamp_period; - size_t min_ubo_alignment; - - // Device identity - std::string device_name; - DeviceType device_type; - - explicit PhysicalDevice(VkPhysicalDevice); -}; - -struct DeviceHandle final { - VkDevice handle; - - explicit DeviceHandle(VkDevice); - ~DeviceHandle(); -}; - -void find_requested_device_extensions( - VkPhysicalDevice physical_device, - std::vector& enabled_extensions, - const std::vector& requested_extensions); - -std::string get_device_type_str(const VkPhysicalDeviceType type); - -std::string get_memory_properties_str(const VkMemoryPropertyFlags flags); - -std::string get_queue_family_properties_str(const VkQueueFlags flags); - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp deleted file mode 100644 index c07349fa7ca..00000000000 --- a/backends/vulkan/runtime/vk_api/Exception.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
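PhysicalDevice gathers the properties, queue families, and extension-dependent features needed to pick and configure a device. A rough sketch of how it might be used during device selection follows; the VkInstance, the reconstructed parameter types of find_requested_device_extensions, and the specific extension queried are assumptions.

```
// Illustrative sketch: wrap the first enumerated GPU and probe it. `instance`
// is assumed to be a valid VkInstance with the loader already initialized.
uint32_t device_count = 0u;
VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr));
VK_CHECK_COND(device_count > 0u, "No Vulkan devices found");

std::vector<VkPhysicalDevice> device_handles(device_count);
VK_CHECK(vkEnumeratePhysicalDevices(
    instance, &device_count, device_handles.data()));

vkapi::PhysicalDevice gpu(device_handles.at(0));
if (gpu.num_compute_queues == 0u) {
  VK_THROW("Selected device has no compute-capable queue families");
}

// Request 16-bit storage only if the driver actually advertises it.
std::vector<const char*> enabled_extensions;
const std::vector<const char*> requested_extensions{
    VK_KHR_16BIT_STORAGE_EXTENSION_NAME};
vkapi::find_requested_device_extensions(
    gpu.handle, enabled_extensions, requested_extensions);
```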
- */ - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -#define VK_RESULT_CASE(code) \ - case code: \ - out << #code; \ - break; - -std::ostream& operator<<(std::ostream& out, const VkResult result) { - switch (result) { - VK_RESULT_CASE(VK_SUCCESS) - VK_RESULT_CASE(VK_NOT_READY) - VK_RESULT_CASE(VK_TIMEOUT) - VK_RESULT_CASE(VK_EVENT_SET) - VK_RESULT_CASE(VK_EVENT_RESET) - VK_RESULT_CASE(VK_INCOMPLETE) - VK_RESULT_CASE(VK_ERROR_OUT_OF_HOST_MEMORY) - VK_RESULT_CASE(VK_ERROR_OUT_OF_DEVICE_MEMORY) - VK_RESULT_CASE(VK_ERROR_INITIALIZATION_FAILED) - VK_RESULT_CASE(VK_ERROR_DEVICE_LOST) - VK_RESULT_CASE(VK_ERROR_MEMORY_MAP_FAILED) - VK_RESULT_CASE(VK_ERROR_LAYER_NOT_PRESENT) - VK_RESULT_CASE(VK_ERROR_EXTENSION_NOT_PRESENT) - VK_RESULT_CASE(VK_ERROR_FEATURE_NOT_PRESENT) - VK_RESULT_CASE(VK_ERROR_INCOMPATIBLE_DRIVER) - VK_RESULT_CASE(VK_ERROR_TOO_MANY_OBJECTS) - VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED) - VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL) - default: - out << "VK_ERROR_UNKNOWN (VkResult " << result << ")"; - break; - } - return out; -} - -#undef VK_RESULT_CASE - -// -// SourceLocation -// - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; - return out; -} - -// -// Exception -// - -Error::Error(SourceLocation source_location, std::string msg) - : msg_(std::move(msg)), source_location_{source_location} { - std::ostringstream oss; - oss << "Exception raised from " << source_location_ << ": "; - oss << msg_; - what_ = oss.str(); -} - -Error::Error(SourceLocation source_location, const char* cond, std::string msg) - : msg_(std::move(msg)), source_location_{source_location} { - std::ostringstream oss; - oss << "Exception raised from " << source_location_ << ": "; - oss << "(" << cond << ") is false! "; - oss << msg_; - what_ = oss.str(); -} - -// -// ShaderNotSupportedError -// - -std::ostream& operator<<(std::ostream& out, const VulkanExtension result) { - switch (result) { - case VulkanExtension::SHADER_INT16: - out << "shaderInt16"; - break; - case VulkanExtension::INT16_STORAGE: - out << "VK_KHR_16bit_storage"; - break; - case VulkanExtension::INT8_STORAGE: - out << "VK_KHR_8bit_storage"; - break; - case VulkanExtension::INTEGER_DOT_PRODUCT: - out << "VK_KHR_shader_integer_dot_product"; - break; - } - return out; -} - -ShaderNotSupportedError::ShaderNotSupportedError( - std::string shader_name, - VulkanExtension extension) - : shader_name_(std::move(shader_name)), extension_{extension} { - std::ostringstream oss; - oss << "Shader " << shader_name_ << " "; - oss << "not compatible with device. "; - oss << "Missing support for extension or physical device feature: "; - oss << extension_; - what_ = oss.str(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Exception.h b/backends/vulkan/runtime/vk_api/Exception.h deleted file mode 100644 index a883a68fefc..00000000000 --- a/backends/vulkan/runtime/vk_api/Exception.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include -#include -#include - -#define VK_CHECK(function) \ - do { \ - const VkResult result = (function); \ - if (VK_SUCCESS != result) { \ - throw ::vkcompute::vkapi::Error( \ - {__func__, __FILE__, static_cast(__LINE__)}, \ - ::vkcompute::utils::concat_str(#function, " returned ", result)); \ - } \ - } while (false) - -#define VK_CHECK_COND(cond, ...) \ - do { \ - if (!(cond)) { \ - throw ::vkcompute::vkapi::Error( \ - {__func__, __FILE__, static_cast(__LINE__)}, \ - #cond, \ - ::vkcompute::utils::concat_str(__VA_ARGS__)); \ - } \ - } while (false) - -#define VK_THROW(...) \ - do { \ - throw ::vkcompute::vkapi::Error( \ - {__func__, __FILE__, static_cast(__LINE__)}, \ - ::vkcompute::utils::concat_str(__VA_ARGS__)); \ - } while (false) - -namespace vkcompute { -namespace vkapi { - -std::ostream& operator<<(std::ostream& out, const VkResult loc); - -struct SourceLocation { - const char* function; - const char* file; - uint32_t line; -}; - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); - -class Error : public std::exception { - public: - Error(SourceLocation source_location, std::string msg); - Error(SourceLocation source_location, const char* cond, std::string msg); - - private: - std::string msg_; - SourceLocation source_location_; - std::string what_; - - public: - const std::string& msg() const { - return msg_; - } - - const char* what() const noexcept override { - return what_.c_str(); - } -}; - -enum class VulkanExtension : uint8_t { - SHADER_INT16, - INT16_STORAGE, - INT8_STORAGE, - INTEGER_DOT_PRODUCT, -}; - -class ShaderNotSupportedError : public std::exception { - public: - ShaderNotSupportedError(std::string shader_name, VulkanExtension extension); - - private: - std::string shader_name_; - VulkanExtension extension_; - std::string what_; - - public: - const char* what() const noexcept override { - return what_.c_str(); - } -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Fence.cpp b/backends/vulkan/runtime/vk_api/Fence.cpp deleted file mode 100644 index d359990e634..00000000000 --- a/backends/vulkan/runtime/vk_api/Fence.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
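The VK_CHECK, VK_CHECK_COND and VK_THROW macros above are the error-handling backbone of the runtime: every raw Vulkan call that returns a VkResult goes through VK_CHECK, and failed checks surface as vkapi::Error carrying the throwing source location. A small sketch of the pattern, assuming a valid VkDevice `device`:

```
// Illustrative sketch of the checking pattern; `device` is assumed valid.
#include <iostream>

VkFence fence = VK_NULL_HANDLE;
const VkFenceCreateInfo fence_create_info{
    VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, nullptr, 0u};

try {
  // Throws vkapi::Error if the call returns anything other than VK_SUCCESS.
  VK_CHECK(vkCreateFence(device, &fence_create_info, nullptr, &fence));
  // Throws with the stringified condition plus the custom message.
  VK_CHECK_COND(fence != VK_NULL_HANDLE, "fence creation produced a null handle");
} catch (const vkcompute::vkapi::Error& e) {
  // what() reads: "Exception raised from <func> at <file>:<line>: <message>"
  std::cerr << e.what() << std::endl;
}
```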
- */ - -#include - -namespace vkcompute { -namespace vkapi { - -VulkanFence::VulkanFence() - : device_(VK_NULL_HANDLE), handle_(VK_NULL_HANDLE), waiting_(false) {} - -VulkanFence::VulkanFence(VkDevice device) - : device_(device), handle_(VK_NULL_HANDLE), waiting_(VK_NULL_HANDLE) { - const VkFenceCreateInfo fence_create_info{ - VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - }; - - VK_CHECK(vkCreateFence(device_, &fence_create_info, nullptr, &handle_)); -} - -VulkanFence::VulkanFence(VulkanFence&& other) noexcept - : device_(other.device_), handle_(other.handle_), waiting_(other.waiting_) { - other.handle_ = VK_NULL_HANDLE; - other.waiting_ = false; -} - -VulkanFence& VulkanFence::operator=(VulkanFence&& other) noexcept { - device_ = other.device_; - handle_ = other.handle_; - waiting_ = other.waiting_; - - other.device_ = VK_NULL_HANDLE; - other.handle_ = VK_NULL_HANDLE; - other.waiting_ = false; - - return *this; -} - -VulkanFence::~VulkanFence() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyFence(device_, handle_, nullptr); -} - -void VulkanFence::wait() { - // if get_submit_handle() has not been called, then this will no-op - if (waiting_) { - VkResult fence_status = VK_NOT_READY; - // Run the wait in a loop to keep the CPU hot. A single call to - // vkWaitForFences with no timeout may cause the calling thread to be - // scheduled out. - do { - // The timeout (last) arg is in units of ns - fence_status = vkWaitForFences(device_, 1u, &handle_, VK_TRUE, 100000); - - VK_CHECK_COND( - fence_status != VK_ERROR_DEVICE_LOST, - "Vulkan Fence: Device lost while waiting for fence!"); - } while (fence_status != VK_SUCCESS); - - VK_CHECK(vkResetFences(device_, 1u, &handle_)); - - waiting_ = false; - } -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Fence.h b/backends/vulkan/runtime/vk_api/Fence.h deleted file mode 100644 index 52fa24de55b..00000000000 --- a/backends/vulkan/runtime/vk_api/Fence.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -class VulkanFence final { - public: - // TODO: This is required for the lazy allocation pattern in api::vTensor. - // It will be disabled pending future refactors. - explicit VulkanFence(); - - explicit VulkanFence(VkDevice); - - VulkanFence(const VulkanFence&) = delete; - VulkanFence& operator=(const VulkanFence&) = delete; - - VulkanFence(VulkanFence&&) noexcept; - VulkanFence& operator=(VulkanFence&&) noexcept; - - ~VulkanFence(); - - private: - VkDevice device_; - VkFence handle_; - bool waiting_; - - public: - // Used to get the handle for a queue submission. - VkFence get_submit_handle() { - if (handle_ != VK_NULL_HANDLE) { - // Indicate we are now waiting for this fence to be signaled - waiting_ = true; - } - return handle_; - } - - VkFence handle() { - return handle_; - } - - // Trigger a synchronous wait for the fence to be signaled - void wait(); - - bool waiting() const { - return waiting_; - } - - operator bool() const { - return (handle_ != VK_NULL_HANDLE); - } -}; - -// A pool to track created Fences and reuse ones that are available. 
-// Only intended to be modified by one thread at a time. -struct FencePool final { - VkDevice device_; - - std::stack pool_; - - explicit FencePool(VkDevice device) : device_(device), pool_{} {} - - // Returns an rvalue reference to a fence, so that it can be moved - inline VulkanFence get_fence() { - if (pool_.empty()) { - VulkanFence new_fence = VulkanFence(device_); - return new_fence; - } - - VulkanFence top_fence = std::move(pool_.top()); - pool_.pop(); - - return top_fence; - } - - // Marks the fence as available - inline void return_fence(VulkanFence& fence) { - pool_.push(std::move(fence)); - } -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp deleted file mode 100644 index 994b46b8c76..00000000000 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ /dev/null @@ -1,589 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// Utility Functions -// - -VkAccessFlags vk_access( - const PipelineStageFlags stage, - const MemoryAccessFlags access) { - VkAccessFlags vk_access = 0u; - - if (access & MemoryAccessType::READ) { - if (stage & PipelineStage::COMPUTE) { - vk_access |= VK_ACCESS_SHADER_READ_BIT; - } - - if (stage & PipelineStage::HOST) { - vk_access |= VK_ACCESS_HOST_READ_BIT; - } - - if (stage & PipelineStage::TRANSFER) { - vk_access |= VK_ACCESS_TRANSFER_READ_BIT; - } - } - - if (access & MemoryAccessType::WRITE) { - if (stage & PipelineStage::COMPUTE) { - vk_access |= VK_ACCESS_SHADER_WRITE_BIT; - } - - if (stage & PipelineStage::HOST) { - vk_access |= VK_ACCESS_HOST_WRITE_BIT; - } - - if (stage & PipelineStage::TRANSFER) { - vk_access |= VK_ACCESS_TRANSFER_WRITE_BIT; - } - } - - return vk_access; -} - -VkPipelineStageFlags vk_stage(const PipelineStageFlags stage) { - VkPipelineStageFlags vk_stage = 0u; - - if (stage & PipelineStage::COMPUTE) { - vk_stage |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - } - - if (stage & PipelineStage::HOST) { - vk_stage |= VK_PIPELINE_STAGE_HOST_BIT; - } - - if (stage & PipelineStage::TRANSFER) { - vk_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; - } - - return vk_stage; -} - -VkImageLayout vk_layout( - const PipelineStageFlags stage, - const MemoryAccessFlags access) { - switch (stage) { - case PipelineStage::COMPUTE: - switch (access) { - case MemoryAccessType::READ: - return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - default: - return VK_IMAGE_LAYOUT_GENERAL; - } - break; - case PipelineStage::TRANSFER: - switch (access) { - case MemoryAccessType::READ: - return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - case MemoryAccessType::WRITE: - return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - default: - VK_THROW("Invalid memory access type for transfer stage!"); - } - break; - default: - VK_THROW("Cannot determine appropriate image layout"); - } - - return VK_IMAGE_LAYOUT_UNDEFINED; -} - -// -// SpecVar -// - -SpecVar::SpecVar() : type(SpecVar::Type::INT) { - value.as_int32 = 0; -} - -SpecVar::SpecVar(const float val) : type(SpecVar::Type::FLOAT) { - value.as_float = val; -} - -SpecVar::SpecVar(const int32_t val) : type(SpecVar::Type::INT) { - value.as_int32 = val; -} - -SpecVar::SpecVar(const uint32_t val) : type(SpecVar::Type::UINT) { - value.as_uint32 = val; -} - -SpecVar::SpecVar(const bool val) : 
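VulkanFence and FencePool together cover CPU/GPU synchronization for submissions: get_submit_handle() marks the fence as pending, wait() spins in short vkWaitForFences slices until it signals and then resets it, and the pool recycles fences across submissions. A brief sketch, assuming the device, queue and VkSubmitInfo are prepared elsewhere:

```
// Illustrative sketch: guard one queue submission with a pooled fence.
// `device`, `queue` and `submit_info` are assumed to be set up elsewhere.
vkapi::FencePool fence_pool(device);

vkapi::VulkanFence fence = fence_pool.get_fence();
VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.get_submit_handle()));

fence.wait();                   // blocks (in 100 us slices) until signaled, then resets
fence_pool.return_fence(fence); // hand the fence back for reuse
```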
type(SpecVar::Type::BOOL) { - value.as_bool = val; -} - -uint32_t SpecVar::val_size() const { - switch (type) { - case SpecVar::Type::FLOAT: - return sizeof(float); - case SpecVar::Type::INT: - return sizeof(int32_t); - case SpecVar::Type::UINT: - return sizeof(uint32_t); - case SpecVar::Type::BOOL: - return sizeof(bool); - } - return 4; -} - -uint32_t SpecVar::val_offset() const { - return utils::safe_downcast(offsetof(SpecVar, value)); -} - -bool operator==(const SpecVar& lhs, const SpecVar& rhs) { - if (lhs.type != rhs.type) { - return false; - } - switch (lhs.type) { - case SpecVar::Type::FLOAT: - return lhs.value.as_float == rhs.value.as_float; - case SpecVar::Type::INT: - return lhs.value.as_int32 == rhs.value.as_int32; - case SpecVar::Type::UINT: - return lhs.value.as_uint32 == rhs.value.as_uint32; - case SpecVar::Type::BOOL: - return lhs.value.as_bool == rhs.value.as_bool; - } - return false; -} - -bool operator!=(const SpecVar& lhs, const SpecVar& rhs) { - return !(lhs == rhs); -} - -SpecVarList::SpecVarList() {} - -SpecVarList::SpecVarList(std::initializer_list init_list) { - vars.resize(init_list.size()); - std::copy(init_list.begin(), init_list.end(), vars.begin()); -} - -void SpecVarList::append(const SpecVarList& other) { - vars.insert(vars.end(), other.vars.begin(), other.vars.end()); -} - -void SpecVarList::reserve(const size_t size) { - vars.reserve(size); -} - -void SpecVarList::append(const SpecVar& other) { - vars.push_back(other); -} - -std::vector SpecVarList::generate_map_entries() - const { - std::vector map_entries; - map_entries.resize(vars.size()); - uint32_t cur_offset = 0u; - for (uint32_t i = 0; i < vars.size(); ++i) { - map_entries.at(i) = { - i, cur_offset + vars.at(i).val_offset(), vars.at(i).val_size()}; - cur_offset += sizeof(SpecVar); - } - return map_entries; -} - -bool operator==(const SpecVarList& lhs, const SpecVarList& rhs) { - if (lhs.size() != rhs.size()) { - return false; - } - for (uint32_t i = 0; i < lhs.size(); ++i) { - if (lhs.vars.at(i) != rhs.vars.at(i)) { - return false; - } - } - return true; -} - -// -// PipelineLayout -// - -PipelineLayout::PipelineLayout( - VkDevice device, - VkDescriptorSetLayout descriptor_layout, - const uint32_t push_constants_size) - : device_(device), handle_{VK_NULL_HANDLE} { - VkPushConstantRange pc_range{ - VK_SHADER_STAGE_COMPUTE_BIT, // stageFlags - 0u, // offset - push_constants_size, // size - }; - uint32_t num_push_constants = 0u; - VkPushConstantRange* pc_ranges_ptr = nullptr; - if (push_constants_size > 0u) { - num_push_constants = 1u; - pc_ranges_ptr = &pc_range; - } - - const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - 1u, // setLayoutCount - &descriptor_layout, // pSetLayouts - num_push_constants, // pushConstantRangeCount - pc_ranges_ptr, // pPushConstantRanges - }; - - VK_CHECK(vkCreatePipelineLayout( - device_, &pipeline_layout_create_info, nullptr, &handle_)); -} - -PipelineLayout::PipelineLayout(PipelineLayout&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -PipelineLayout::~PipelineLayout() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyPipelineLayout(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkPipelineLayout tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; 
- - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// ComputePipeline -// - -ComputePipeline::ComputePipeline(VkDevice device, VkPipeline handle) - : device_{device}, handle_{handle} {} - -ComputePipeline::ComputePipeline( - VkDevice device, - const ComputePipeline::Descriptor& descriptor, - VkPipelineCache pipeline_cache) - : device_(device), handle_{VK_NULL_HANDLE} { - map_entries_ = descriptor.specialization_constants.generate_map_entries(); - - const VkSpecializationInfo specialization_info{ - descriptor.specialization_constants.size(), // mapEntryCount - map_entries_.data(), // pMapEntries - descriptor.specialization_constants.data_nbytes(), // dataSize - descriptor.specialization_constants.data(), // pData - }; - - const VkPipelineShaderStageCreateInfo shader_stage_create_info{ - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_SHADER_STAGE_COMPUTE_BIT, // stage - descriptor.shader_module, // module - "main", // pName - &specialization_info, // pSpecializationInfo - }; - - VkPipelineCreateFlags flags = 0u; -#if defined(VULKAN_DEBUG) && defined(VK_KHR_pipeline_executable_properties) - flags = VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR | - VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR | flags; -#endif /* VULKAN_DEBUG && VK_KHR_pipeline_executable_properties */ - - const VkComputePipelineCreateInfo compute_pipeline_create_info{ - VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType - nullptr, // pNext - flags, // flags - shader_stage_create_info, // stage - descriptor.pipeline_layout, // layout - VK_NULL_HANDLE, // basePipelineHandle - 0u, // basePipelineIndex - }; - - VK_CHECK(vkCreateComputePipelines( - device_, - pipeline_cache, - 1u, - &compute_pipeline_create_info, - nullptr, - &handle_)); -} - -ComputePipeline::ComputePipeline(ComputePipeline&& other) noexcept - : device_(other.device_), - handle_(other.handle_), - map_entries_(std::move(other.map_entries_)) { - other.handle_ = VK_NULL_HANDLE; -} - -ComputePipeline::~ComputePipeline() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyPipeline(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkPipeline tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -bool operator==( - const ComputePipeline::Descriptor& _1, - const ComputePipeline::Descriptor& _2) { - return ( - _1.pipeline_layout == _2.pipeline_layout && - _1.shader_module == _2.shader_module && - _1.specialization_constants == _2.specialization_constants); -} - -// -// PipelineLayoutCache -// - -PipelineLayoutCache::PipelineLayoutCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -PipelineLayoutCache::PipelineLayoutCache(PipelineLayoutCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -PipelineLayoutCache::~PipelineLayoutCache() { - purge(); -} - -VkPipelineLayout PipelineLayoutCache::retrieve( - const VkDescriptorSetLayout layout, - const uint32_t push_constants_size) { - PipelineLayoutCache::Key key{layout, push_constants_size}; - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_ - .insert( - {key, - PipelineLayoutCache::Value( - device_, layout, push_constants_size)}) - .first; - } 
- - return it->second.handle(); -} - -void PipelineLayoutCache::purge() { - std::lock_guard lock(cache_mutex_); - cache_.clear(); -} - -// -// ComputePipelineCache -// - -ComputePipelineCache::ComputePipelineCache( - VkDevice device, - const std::string& cache_data_path) - : cache_mutex_{}, - device_(device), - pipeline_cache_{VK_NULL_HANDLE}, - cache_{}, - cache_data_path_(cache_data_path) { - VkPipelineCacheCreateInfo pipeline_cache_create_info{}; - - auto buffer = load_cache(); - - pipeline_cache_create_info = { - VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - buffer.size(), // initialDataSize - buffer.data(), // pInitialData - }; - - VK_CHECK(vkCreatePipelineCache( - device, &pipeline_cache_create_info, nullptr, &pipeline_cache_)); -} - -ComputePipelineCache::ComputePipelineCache( - ComputePipelineCache&& other) noexcept - : cache_mutex_{}, - device_(other.device_), - pipeline_cache_(other.pipeline_cache_), - cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); - - other.pipeline_cache_ = VK_NULL_HANDLE; -} - -ComputePipelineCache::~ComputePipelineCache() { - purge(); - - if (pipeline_cache_ == VK_NULL_HANDLE) { - return; - } - - vkDestroyPipelineCache(device_, pipeline_cache_, nullptr); - pipeline_cache_ = VK_NULL_HANDLE; -} - -bool ComputePipelineCache::contains(const ComputePipelineCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - return it != cache_.cend(); -} - -void ComputePipelineCache::create_pipelines( - const std::unordered_set& descriptors) { - std::lock_guard lock(cache_mutex_); - - const auto num_pipelines = descriptors.size(); - std::vector pipelines(num_pipelines); - - std::vector> map_entries; - map_entries.reserve(num_pipelines); - - std::vector specialization_infos; - specialization_infos.reserve(num_pipelines); - - std::vector shader_stage_create_infos; - shader_stage_create_infos.reserve(num_pipelines); - - std::vector create_infos; - create_infos.reserve(num_pipelines); - - for (auto& key : descriptors) { - map_entries.push_back(key.specialization_constants.generate_map_entries()); - - specialization_infos.push_back(VkSpecializationInfo{ - key.specialization_constants.size(), // mapEntryCount - map_entries.back().data(), // pMapEntries - key.specialization_constants.data_nbytes(), // dataSize - key.specialization_constants.data(), // pData - }); - - shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{ - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_SHADER_STAGE_COMPUTE_BIT, // stage - key.shader_module, // module - "main", // pName - &specialization_infos.back(), // pSpecializationInfo - }); - - create_infos.push_back(VkComputePipelineCreateInfo{ - VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - shader_stage_create_infos.back(), // stage - key.pipeline_layout, // layout - VK_NULL_HANDLE, // basePipelineHandle - 0u, // basePipelineIndex - }); - } - - VK_CHECK(vkCreateComputePipelines( - device_, - pipeline_cache_, - create_infos.size(), - create_infos.data(), - nullptr, - pipelines.data())); - - uint32_t i = 0; - for (auto& key : descriptors) { - auto it = cache_.find(key); - if (it != cache_.cend()) { - continue; - } - cache_.insert({key, ComputePipelineCache::Value(device_, pipelines[i])}); - ++i; - } -} - -VkPipeline ComputePipelineCache::retrieve( - const ComputePipelineCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it 
= cache_.find(key); - if (it == cache_.cend()) { - it = cache_ - .insert( - {key, - ComputePipelineCache::Value(device_, key, pipeline_cache_)}) - .first; - } - return it->second.handle(); -} - -void ComputePipelineCache::purge() { - cache_.clear(); -} - -std::vector ComputePipelineCache::load_cache() { - // No optimization if path is unspecified - if (cache_data_path_.empty()) { - return {}; - } - - // Return if file doesn't exist; this is expected on first model-load - std::ifstream file(cache_data_path_, std::ios::binary | std::ios::ate); - if (file.fail()) { - return {}; - } - - auto size = file.tellg(); - file.seekg(0, std::ios::beg); - - std::vector buffer(size); - file.read(buffer.data(), size); - - return buffer; -} - -void ComputePipelineCache::save_cache() { - // No optimization if path is unspecified - if (cache_data_path_.empty()) { - return; - } - - // Return if file exists; the cache is already saved - std::ifstream ifile(cache_data_path_); - if (ifile.good()) { - return; - } - - size_t size{}; - vkGetPipelineCacheData(device_, pipeline_cache_, &size, nullptr); - - std::vector buffer(size); - vkGetPipelineCacheData(device_, pipeline_cache_, &size, buffer.data()); - - std::ofstream file(cache_data_path_, std::ios::binary); - file.write(buffer.data(), buffer.size()); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h deleted file mode 100644 index 67dfaebe75b..00000000000 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
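ComputePipelineCache layers an in-memory pipeline map on top of a VkPipelineCache that can be serialized to disk, so the expensive SPIR-V-to-binary compilation is only paid on the first run. A sketch of the round trip follows; the device, pipeline layout, shader module, specialization constants and on-disk path are assumptions for illustration.

```
// Illustrative sketch: retrieve (building on a cache miss) a compute pipeline
// and persist the driver cache. The path is only an example location.
vkapi::ComputePipelineCache pipeline_cache(
    device, "/data/local/tmp/executorch_vk_pipeline_cache.bin");

const vkapi::ComputePipelineCache::Key key{
    pipeline_layout,            // VkPipelineLayout
    shader_module,              // VkShaderModule
    {SV(8u), SV(8u), SV(1u)},   // specialization constants (SpecVarList)
};

VkPipeline pipeline = pipeline_cache.retrieve(key);
(void)pipeline;

// No-op if the file already exists or the path is empty; otherwise writes the
// vkGetPipelineCacheData blob so later loads skip shader recompilation.
pipeline_cache.save_cache();
```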
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include - -#include -#include -#include - -#define SV(x) ::vkcompute::vkapi::SpecVar(x) - -namespace vkcompute { -namespace vkapi { - -struct SpecVar final { - enum class Type : uint8_t { - FLOAT, - INT, - UINT, - BOOL, - }; - - union Value { - int32_t as_int32; - uint32_t as_uint32; - float as_float; - bool as_bool; - }; - - Value value; - Type type; - - SpecVar(); - SpecVar(const float val); - SpecVar(const int32_t val); - SpecVar(const uint32_t val); - SpecVar(const bool val); - - uint32_t val_size() const; - uint32_t val_offset() const; -}; - -bool operator==(const SpecVar& lhs, const SpecVar& rhs); - -bool operator!=(const SpecVar& lhs, const SpecVar& rhs); - -class SpecVarList final { - std::vector vars; - - public: - SpecVarList(); - SpecVarList(std::initializer_list init_list); - - inline const SpecVar& at(const size_t index) const { - return vars.at(index); - } - - inline const SpecVar* data() const { - return vars.data(); - } - - inline uint32_t size() const { - return utils::safe_downcast(vars.size()); - } - - inline uint32_t data_nbytes() const { - return vars.size() * sizeof(SpecVar); - } - - void append(const SpecVarList& other); - - void reserve(const size_t size); - - void append(const SpecVar& other); - - std::vector generate_map_entries() const; - - friend bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); -}; - -bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); - -struct PipelineBarrier final { - struct Stages final { - VkPipelineStageFlags src; - VkPipelineStageFlags dst; - } stage; - - std::vector buffers; - std::vector images; - std::vector buffer_barrier_handles; - std::vector image_barrier_handles; - - inline operator bool() const { - return (0u != stage.src) || (0u != stage.dst) || !buffers.empty() || - !images.empty(); - } -}; - -using PipelineStageFlags = uint8_t; - -enum PipelineStage : PipelineStageFlags { - NO_STAGE = 0u << 0u, - COMPUTE = 1u << 0u, - HOST = 1u << 1u, - TRANSFER = 1u << 2u, -}; - -VkAccessFlags vk_access(const PipelineStageFlags, const MemoryAccessFlags); -VkPipelineStageFlags vk_stage(const PipelineStageFlags); -VkImageLayout vk_layout(const PipelineStageFlags, const MemoryAccessFlags); - -class PipelineLayout final { - public: - explicit PipelineLayout(VkDevice, VkDescriptorSetLayout, const uint32_t); - - PipelineLayout(const PipelineLayout&) = delete; - PipelineLayout& operator=(const PipelineLayout&) = delete; - - PipelineLayout(PipelineLayout&&) noexcept; - PipelineLayout& operator=(PipelineLayout&&) = delete; - - ~PipelineLayout(); - - private: - VkDevice device_; - VkPipelineLayout handle_; - - public: - VkPipelineLayout handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. 
- friend void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept; -}; - -class ComputePipeline final { - public: - struct Descriptor final { - VkPipelineLayout pipeline_layout; - VkShaderModule shader_module; - SpecVarList specialization_constants; - }; - - explicit ComputePipeline(VkDevice device, VkPipeline handle); - - explicit ComputePipeline( - VkDevice device, - const Descriptor& descriptor, - VkPipelineCache pipeline_cache); - - ComputePipeline(const ComputePipeline&) = delete; - ComputePipeline& operator=(const ComputePipeline&) = delete; - - ComputePipeline(ComputePipeline&&) noexcept; - ComputePipeline& operator=(ComputePipeline&&) = delete; - - ~ComputePipeline(); - - private: - VkDevice device_; - VkPipeline handle_; - std::vector map_entries_; - - public: - inline VkPipeline handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. - friend void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept; - - friend bool operator==( - const ComputePipeline::Descriptor& _1, - const ComputePipeline::Descriptor& _2); -}; - -class PipelineLayoutCache final { - public: - explicit PipelineLayoutCache(VkDevice device); - - PipelineLayoutCache(const PipelineLayoutCache&) = delete; - PipelineLayoutCache& operator=(const PipelineLayoutCache&) = delete; - - PipelineLayoutCache(PipelineLayoutCache&&) noexcept; - PipelineLayoutCache& operator=(PipelineLayoutCache&&) = delete; - - ~PipelineLayoutCache(); - using Key = std::pair; - using Value = PipelineLayout; - - struct Hasher { - inline size_t operator()( - std::pair key) const { - size_t seed = 0; - seed = utils::hash_combine( - seed, std::hash()(key.first)); - seed = utils::hash_combine(seed, std::hash()(key.second)); - return seed; - } - }; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkPipelineLayout retrieve(const VkDescriptorSetLayout, const uint32_t); - void purge(); -}; - -class ComputePipelineCache final { - public: - explicit ComputePipelineCache( - VkDevice device, - const std::string& cache_data_path); - - ComputePipelineCache(const ComputePipelineCache&) = delete; - ComputePipelineCache& operator=(const ComputePipelineCache&) = delete; - - ComputePipelineCache(ComputePipelineCache&&) noexcept; - ComputePipelineCache& operator=(ComputePipelineCache&&) = delete; - - ~ComputePipelineCache(); - - using Key = ComputePipeline::Descriptor; - using Value = ComputePipeline; - - struct Hasher { - inline size_t operator()( - const ComputePipeline::Descriptor& descriptor) const { - size_t seed = 0; - seed = utils::hash_combine( - seed, std::hash()(descriptor.pipeline_layout)); - seed = utils::hash_combine( - seed, std::hash()(descriptor.shader_module)); - - const SpecVarList& spec_vars = descriptor.specialization_constants; - seed = utils::hash_combine(seed, std::hash()(spec_vars.size())); - - for (int i = 0; i < spec_vars.size(); ++i) { - const SpecVar& spec_var = spec_vars.at(i); - size_t new_seed = 0; - switch (spec_var.type) { - case SpecVar::Type::FLOAT: - new_seed = std::hash()(spec_var.value.as_float); - break; - case SpecVar::Type::INT: - new_seed = std::hash()(spec_var.value.as_int32); - break; - case SpecVar::Type::UINT: - new_seed = std::hash()(spec_var.value.as_uint32); - break; - case SpecVar::Type::BOOL: - new_seed 
= std::hash()(spec_var.value.as_bool); - break; - } - seed = utils::hash_combine(seed, new_seed); - } - - return seed; - } - }; - - void save_cache(); - - private: - std::vector load_cache(); - - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - VkPipelineCache pipeline_cache_; - std::unordered_map cache_; - const std::string cache_data_path_; - - public: - bool contains(const Key&); - void create_pipelines(const std::unordered_set&); - VkPipeline retrieve(const Key&); - void purge(); -}; - -// -// Impl -// - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/QueryPool.cpp b/backends/vulkan/runtime/vk_api/QueryPool.cpp deleted file mode 100644 index e8b3ca55206..00000000000 --- a/backends/vulkan/runtime/vk_api/QueryPool.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { -namespace vkapi { - -namespace { - -// On Mali gpus timestamp_period seems to return 0. -// For some reason when 52.08 is used op runtimes seem to make more sense -// TODO: Figure out what is special about 52.08 -constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f); - -} // namespace - -#define EARLY_RETURN_IF_UNINITIALIZED() \ - if (querypool_ == VK_NULL_HANDLE) { \ - return; \ - } - -QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p) - : config_(config), - ns_per_tick_(1u), - device_(VK_NULL_HANDLE), - querypool_(VK_NULL_HANDLE), - num_queries_(0u), - shader_durations_(0), - mutex_{} { - initialize(adapter_p); -} - -QueryPool::~QueryPool() { - EARLY_RETURN_IF_UNINITIALIZED(); - vkDestroyQueryPool(device_, querypool_, nullptr); -} - -void QueryPool::initialize(const Adapter* adapter_p) { - // No-op if adapter_p is nullptr or querypool is already created - if (!adapter_p || querypool_ != VK_NULL_HANDLE) { - return; - } - - device_ = adapter_p->device_handle(); - - ns_per_tick_ = std::lround(adapter_p->timestamp_period()); - ns_per_tick_ = (ns_per_tick_ == 0) ? 
kDefaultNsPerTick : ns_per_tick_; - - shader_durations_.reserve(config_.initial_reserve_size); - - const VkQueryPoolCreateInfo info{ - VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_QUERY_TYPE_TIMESTAMP, // queryType - config_.max_query_count, // queryCount - 0u, // pipelineStatistics - }; - - VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_)); -} - -size_t QueryPool::write_timestamp(const CommandBuffer& cmd) { - VK_CHECK_COND( - num_queries_ < config_.max_query_count, - "Vulkan QueryPool: Exceeded the maximum number of queries " - "allowed by the queryPool (", - config_.max_query_count, - ")!"); - - cmd.write_timestamp(querypool_, num_queries_++); - return num_queries_ - 1; -} - -void QueryPool::reset_querypool(const CommandBuffer& cmd) { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - cmd.reset_querypool(querypool_, 0u, config_.max_query_count); - reset_state(); -} - -void QueryPool::reset_state() { - num_queries_ = 0u; - shader_durations_.clear(); -} - -void QueryPool::shader_profile_begin( - const CommandBuffer& cmd, - const uint32_t dispatch_id, - const std::string& kernel_name, - const VkExtent3D global_workgroup_size, - const VkExtent3D local_workgroup_size) { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - uint32_t query_idx = write_timestamp(cmd); - - ShaderDuration log_entry{ - utils::safe_downcast(shader_durations_.size()), - // Execution Properties - dispatch_id, - kernel_name, - global_workgroup_size, - local_workgroup_size, - // Query indexes - query_idx, // start query idx - UINT32_MAX, // end query idx - // Timings - 0u, // start time - 0u, // end time - 0u, // duration - }; - - shader_durations_.emplace_back(log_entry); -} - -void QueryPool::shader_profile_end(const CommandBuffer& cmd) { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - size_t query_idx = write_timestamp(cmd); - shader_durations_.back().end_query_idx = query_idx; -} - -void QueryPool::extract_results() { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT; - - std::vector query_data; - query_data.resize(num_queries_); - - VK_CHECK(vkGetQueryPoolResults( - device_, - querypool_, - 0u, // firstQuery - num_queries_, // queryCount - sizeof(uint64_t) * num_queries_, // dataSize - query_data.data(), // pData - sizeof(uint64_t), // stride - flags)); // flags - - for (ShaderDuration& entry : shader_durations_) { - entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_; - entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_; - entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns; - } -} - -std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) { - os << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; - return os; -} - -std::string stringize(const VkExtent3D& extents) { - std::stringstream ss; - ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; - return ss.str(); -} - -std::vector QueryPool::get_shader_timestamp_data() { - if (querypool_ == VK_NULL_HANDLE) { - return {}; - } - std::lock_guard lock(mutex_); - std::vector shader_result; - for (ShaderDuration& entry : shader_durations_) { - shader_result.push_back(ShaderResult{ - /* .kernel_name = */ entry.kernel_name, - /* .dispatch_id = */ entry.dispatch_id, - /* .start_time_ns = */ entry.start_time_ns, - /* .end_time_ns = */ 
entry.end_time_ns, - /* .metadata = */ - ShaderMetadata{ - /* .global_workgroup_size = */ - {entry.global_workgroup_size.width, - entry.global_workgroup_size.height, - entry.global_workgroup_size.depth}, - /* .local_workgroup_size = */ - {entry.local_workgroup_size.width, - entry.local_workgroup_size.height, - entry.local_workgroup_size.depth}, - }}); - } - return shader_result; -} - -std::string QueryPool::generate_string_report() { - std::lock_guard lock(mutex_); - - std::stringstream ss; - - int kernel_name_w = 120; - int global_size_w = 25; - int local_size_w = 25; - int duration_w = 25; - - ss << std::left; - ss << std::setw(kernel_name_w) << "Kernel Name"; - ss << std::setw(global_size_w) << "Global Workgroup Size"; - ss << std::setw(local_size_w) << "Local Workgroup Size"; - ss << std::right << std::setw(duration_w) << "Duration (ns)"; - ss << std::endl; - - ss << std::left; - ss << std::setw(kernel_name_w) << "==========="; - ss << std::setw(global_size_w) << "====================="; - ss << std::setw(local_size_w) << "===================="; - ss << std::right << std::setw(duration_w) << "============="; - ss << std::endl; - - for (ShaderDuration& entry : shader_durations_) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - - ss << std::left; - ss << std::setw(kernel_name_w) << entry.kernel_name; - ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size); - ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size); - ss << std::right << std::setw(duration_w) << exec_duration_ns.count(); - ss << std::endl; - } - - return ss.str(); -} - -std::string QueryPool::generate_tsv_string_report() { - std::lock_guard lock(mutex_); - - std::stringstream ss; - - ss << "Kernel Name\t"; - ss << "Global Workgroup Size\t"; - ss << "Local Workgroup Size\t"; - ss << "Duration (ns)\t"; - ss << std::endl; - - ss << "===========\t"; - ss << "=====================\t"; - ss << "====================\t"; - ss << "=============\t"; - ss << std::endl; - - for (ShaderDuration& entry : shader_durations_) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - - ss << entry.kernel_name << "\t"; - ss << stringize(entry.global_workgroup_size) << "\t"; - ss << stringize(entry.local_workgroup_size) << "\t"; - ss << exec_duration_ns.count() << "\t"; - ss << std::endl; - } - - return ss.str(); -} - -void QueryPool::print_results(const bool tsv_format) { - EARLY_RETURN_IF_UNINITIALIZED(); - if (tsv_format) { - std::cout << generate_tsv_string_report() << std::endl; - } else { - std::cout << generate_string_report() << std::endl; - } -} - -unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) { - for (ShaderDuration& entry : shader_durations_) { - if (entry.kernel_name == kernel_name) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - return exec_duration_ns.count(); - } - } - return 0; -} - -unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) { - uint64_t total_ns = 0; - uint32_t count = 0; - for (ShaderDuration& entry : shader_durations_) { - if (entry.kernel_name == kernel_name) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - total_ns += exec_duration_ns.count(); - count++; - } - } - if (count == 0) { - return 0; - } - return total_ns / count; -} -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/QueryPool.h b/backends/vulkan/runtime/vk_api/QueryPool.h deleted file mode 100644 index 94bd99584eb..00000000000 --- 
a/backends/vulkan/runtime/vk_api/QueryPool.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include -#include - -#include -#include - -#ifndef VULKAN_QUERY_POOL_SIZE -#define VULKAN_QUERY_POOL_SIZE 4096u -#endif - -namespace vkcompute { -namespace vkapi { - -struct ShaderMetadata final { - const uint32_t global_workgroup_size[3]; - const uint32_t local_workgroup_size[3]; -}; - -struct ShaderResult final { - const std::string kernel_name; - const uint32_t dispatch_id; - const uint64_t start_time_ns; - const uint64_t end_time_ns; - const ShaderMetadata metadata; -}; - -struct QueryPoolConfig final { - uint32_t max_query_count = VULKAN_QUERY_POOL_SIZE; - uint32_t initial_reserve_size = 256u; -}; - -struct ShaderDuration final { - uint32_t idx; - - // Execution Properties - uint32_t dispatch_id; - std::string kernel_name; - VkExtent3D global_workgroup_size; - VkExtent3D local_workgroup_size; - - // Query indexes - uint32_t start_query_idx; - uint32_t end_query_idx; - - // Timings - uint64_t start_time_ns; - uint64_t end_time_ns; - uint64_t execution_duration_ns; -}; - -class QueryPool final { - // Configuration - QueryPoolConfig config_; - uint64_t ns_per_tick_; - - // Vulkan handles - VkDevice device_; - VkQueryPool querypool_; - - // Internal State - uint32_t num_queries_; - std::vector shader_durations_; - - std::mutex mutex_; - - public: - explicit QueryPool(const QueryPoolConfig&, const Adapter* adapter_p); - - QueryPool(const QueryPool&) = delete; - QueryPool& operator=(const QueryPool&) = delete; - - QueryPool(QueryPool&&) = delete; - QueryPool& operator=(QueryPool&&) = delete; - - ~QueryPool(); - - void initialize(const Adapter* adapter_p); - - private: - size_t write_timestamp(const CommandBuffer&); - - public: - void reset_querypool(const CommandBuffer&); - - void reset_state(); - - void shader_profile_begin( - const CommandBuffer&, - const uint32_t, - const std::string&, - const VkExtent3D, - const VkExtent3D); - - void shader_profile_end(const CommandBuffer&); - - void extract_results(); - - std::vector get_shader_timestamp_data(); - void print_results(const bool tsv_format = false); - unsigned long get_total_shader_ns(std::string kernel_name); - unsigned long get_mean_shader_ns(std::string kernel_name); - - operator bool() const { - return querypool_ != VK_NULL_HANDLE; - } - - private: - std::string generate_string_report(); - std::string generate_tsv_string_report(); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Runtime.cpp b/backends/vulkan/runtime/vk_api/Runtime.cpp deleted file mode 100644 index c3376e2ccbf..00000000000 --- a/backends/vulkan/runtime/vk_api/Runtime.cpp +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#ifdef USE_VOLK_HEADER_ONLY -// For volk.h, define this before including volk.h in exactly one CPP file. 
-#define VOLK_IMPLEMENTATION -#include -#endif /* USE_VOLK_HEADER_ONLY */ - -namespace vkcompute { -namespace vkapi { - -#define PRINT_CASE(name) \ - case MemoryAccessType::name: \ - out << #name; \ - break; - -std::ostream& operator<<(std::ostream& out, const MemoryAccessType& tag) { - switch (tag) { - PRINT_CASE(NONE) - PRINT_CASE(READ) - PRINT_CASE(WRITE) - } - return out; -} - -#undef PRINT_CASE - -namespace { - -void find_requested_layers_and_extensions( - std::vector& enabled_layers, - std::vector& enabled_extensions, - const std::vector& requested_layers, - const std::vector& requested_extensions) { - // Get supported instance layers - uint32_t layer_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties(&layer_count, nullptr)); - - std::vector layer_properties(layer_count); - VK_CHECK(vkEnumerateInstanceLayerProperties( - &layer_count, layer_properties.data())); - - // Search for requested layers - for (const auto& requested_layer : requested_layers) { - for (const auto& layer : layer_properties) { - if (strcmp(requested_layer, layer.layerName) == 0) { - enabled_layers.push_back(requested_layer); - break; - } - } - } - - // Get supported instance extensions - uint32_t extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &extension_count, nullptr)); - - std::vector extension_properties(extension_count); - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &extension_count, extension_properties.data())); - - // Search for requested extensions - for (const auto& requested_extension : requested_extensions) { - for (const auto& extension : extension_properties) { - if (strcmp(requested_extension, extension.extensionName) == 0) { - enabled_extensions.push_back(requested_extension); - break; - } - } - } -} - -VkInstance create_instance(const RuntimeConfig& config) { - const VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, // sType - nullptr, // pNext - "PyTorch Vulkan Backend", // pApplicationName - 0, // applicationVersion - nullptr, // pEngineName - 0, // engineVersion - VK_API_VERSION_1_1, // apiVersion - }; - - std::vector enabled_layers; - std::vector enabled_extensions; - - std::vector requested_layers; - std::vector requested_extensions; - - if (config.enable_validation_messages) { - requested_layers.emplace_back("VK_LAYER_KHRONOS_validation"); -#ifdef VK_EXT_debug_report - requested_extensions.emplace_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); -#endif /* VK_EXT_debug_report */ - } - - VkInstanceCreateFlags instance_flags = 0; -#ifdef __APPLE__ - instance_flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; - requested_extensions.emplace_back( - VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); -#endif - - find_requested_layers_and_extensions( - enabled_layers, - enabled_extensions, - requested_layers, - requested_extensions); - - const void* instance_create_next = nullptr; - // VkConfig on Mac platforms does not expose debugPrintf settings for whatever - // reason so it has to be enabled manually. 
-#if defined(__APPLE__) && defined(VULKAN_DEBUG) - std::vector enabled_validation_features{ - VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT, - }; - VkValidationFeaturesEXT validation_features = { - VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT, // sType - nullptr, // pNext - static_cast( - enabled_validation_features.size()), // enabledValidationFeatureCount - enabled_validation_features.data(), // pEnabledValidationFeatures - 0, - nullptr, // pDisabledValidationFeatures - }; - instance_create_next = &validation_features; -#endif /* __APPLE__ && VULKAN_DEBUG */ - - const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, // sType - instance_create_next, // pNext - instance_flags, // flags - &application_info, // pApplicationInfo - static_cast(enabled_layers.size()), // enabledLayerCount - enabled_layers.data(), // ppEnabledLayerNames - static_cast(enabled_extensions.size()), // enabledExtensionCount - enabled_extensions.data(), // ppEnabledExtensionNames - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - VK_CHECK_COND(instance, "Invalid Vulkan instance!"); - -#ifdef USE_VULKAN_VOLK - volkLoadInstance(instance); -#endif /* USE_VULKAN_VOLK */ - - return instance; -} - -std::vector create_physical_devices( - VkInstance instance) { - if (instance == VK_NULL_HANDLE) { - return std::vector(); - } - - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); - - std::vector device_mappings; - device_mappings.reserve(device_count); - for (VkPhysicalDevice physical_device : devices) { - device_mappings.emplace_back(PhysicalDevice(physical_device), -1); - } - - return device_mappings; -} - -VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( - const VkDebugReportFlagsEXT flags, - const VkDebugReportObjectTypeEXT /* object_type */, - const uint64_t /* object */, - const size_t /* location */, - const int32_t message_code, - const char* const layer_prefix, - const char* const message, - void* const /* user_data */) { - (void)flags; - - std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; - const std::string log = stream.str(); - - std::cout << log; - - return VK_FALSE; -} - -VkDebugReportCallbackEXT create_debug_report_callback( - VkInstance instance, - const RuntimeConfig config) { - if (instance == VK_NULL_HANDLE || !config.enable_validation_messages) { - return VkDebugReportCallbackEXT{}; - } - - const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, // sType - nullptr, // pNext - VK_DEBUG_REPORT_INFORMATION_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, // flags - debug_report_callback_fn, // pfnCallback - nullptr, // pUserData - }; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - - VK_CHECK_COND( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - - VkDebugReportCallbackEXT debug_report_callback{}; - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance, - &debugReportCallbackCreateInfo, - nullptr, - &debug_report_callback)); - - VK_CHECK_COND(debug_report_callback, 
"Invalid Vulkan debug report callback!"); - - return debug_report_callback; -} - -// -// Adapter selection methods -// - -uint32_t select_first(const std::vector& devices) { - if (devices.empty()) { - return devices.size() + 1; // return out of range to signal invalidity - } - - // Select the first adapter that has compute capability - for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i].first.num_compute_queues > 0) { - return i; - } - } - - return devices.size() + 1; -} - -// -// Global runtime initialization -// - -std::unique_ptr init_global_vulkan_runtime( - const std::string& cache_data_path) { - // Load Vulkan drivers -#if defined(USE_VULKAN_VOLK) - if (VK_SUCCESS != volkInitialize()) { - return std::unique_ptr(nullptr); - } -#elif defined(USE_VULKAN_WRAPPER) - if (!InitVulkan()) { - return std::unique_ptr(nullptr); - } -#endif /* USE_VULKAN_VOLK, USE_VULKAN_WRAPPER */ - - const bool enable_validation_messages = -#if defined(VULKAN_DEBUG) - true; -#else - false; -#endif /* VULKAN_DEBUG */ - const bool init_default_device = true; - const uint32_t num_requested_queues = 1; // TODO: raise this value - - const RuntimeConfig default_config{ - enable_validation_messages, - init_default_device, - AdapterSelector::First, - num_requested_queues, - cache_data_path, - }; - - try { - return std::make_unique(default_config); - } catch (...) { - } - - return std::unique_ptr(nullptr); -} - -} // namespace - -Runtime::Runtime(const RuntimeConfig config) - : config_(config), - instance_(create_instance(config_)), - device_mappings_(create_physical_devices(instance_)), - adapters_{}, - default_adapter_i_(UINT32_MAX), - debug_report_callback_(create_debug_report_callback(instance_, config_)) { - // List of adapters will never exceed the number of physical devices - adapters_.reserve(device_mappings_.size()); - - if (config.init_default_device) { - try { - switch (config.default_selector) { - case AdapterSelector::First: - default_adapter_i_ = create_adapter(select_first); - } - } catch (...) { - } - } -} - -Runtime::~Runtime() { - if (instance_ == VK_NULL_HANDLE) { - return; - } - - // Clear adapters list to trigger device destruction before destroying - // VkInstance - adapters_.clear(); - - // Instance must be destroyed last as its used to destroy the debug report - // callback. - if (debug_report_callback_) { - const auto vkDestroyDebugReportCallbackEXT = - (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - - if (vkDestroyDebugReportCallbackEXT) { - vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback_, nullptr); - } - - debug_report_callback_ = {}; - } - - vkDestroyInstance(instance_, nullptr); - instance_ = VK_NULL_HANDLE; -} - -uint32_t Runtime::create_adapter(const Selector& selector) { - VK_CHECK_COND( - !device_mappings_.empty(), - "Pytorch Vulkan Runtime: Could not initialize adapter because no " - "devices were found by the Vulkan instance."); - - uint32_t physical_device_i = selector(device_mappings_); - VK_CHECK_COND( - physical_device_i < device_mappings_.size(), - "Pytorch Vulkan Runtime: no suitable device adapter was selected! 
" - "Device could not be initialized"); - - Runtime::DeviceMapping& device_mapping = device_mappings_[physical_device_i]; - // If an Adapter has already been created, return that - int32_t adapter_i = device_mapping.second; - if (adapter_i >= 0) { - return adapter_i; - } - // Otherwise, create an adapter for the selected physical device - adapter_i = utils::safe_downcast(adapters_.size()); - adapters_.emplace_back(new Adapter( - instance_, - device_mapping.first, - config_.num_requested_queues, - config_.cache_data_path)); - device_mapping.second = adapter_i; - - return adapter_i; -} - -std::string& set_and_get_pipeline_cache_data_path( - const std::string& file_path) { - // The global cache data path is declared as a static local variable for the - // same reasons as the global runtime below. -#if defined(ETVK_DEFAULT_CACHE_PATH) - static std::string global_cache_data_path = ETVK_DEFAULT_CACHE_PATH; -#else - static std::string global_cache_data_path; -#endif /* ETVK_DEFAULT_CACHE_PATH */ - - if (file_path.size() > 0) { - global_cache_data_path = file_path; - } - return global_cache_data_path; -} - -Runtime* runtime() { - // The global vulkan runtime is declared as a static local variable within a - // non-static function to ensure it has external linkage. If it were a global - // static variable there would be one copy per translation unit that includes - // Runtime.h as it would have internal linkage. - static const std::unique_ptr p_runtime = - init_global_vulkan_runtime(set_and_get_pipeline_cache_data_path("")); - - VK_CHECK_COND( - p_runtime, - "Pytorch Vulkan Runtime: The global runtime could not be retrieved " - "because it failed to initialize."); - - return p_runtime.get(); -} - -std::unique_ptr init_external_adapter( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice logical_device, - const uint32_t num_queues, - const std::string& cache_data_path) { - if (instance == VK_NULL_HANDLE || physical_device == VK_NULL_HANDLE || - logical_device == VK_NULL_HANDLE) { - return std::unique_ptr(nullptr); - } - - return std::make_unique( - instance, physical_device, logical_device, num_queues, cache_data_path); -} - -Adapter* set_and_get_external_adapter( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice logical_device) { - static const std::unique_ptr p_external_adapter = - init_external_adapter( - instance, - physical_device, - logical_device, - 1, - set_and_get_pipeline_cache_data_path("")); - - return p_external_adapter.get(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Runtime.h b/backends/vulkan/runtime/vk_api/Runtime.h deleted file mode 100644 index 3706d6c73d0..00000000000 --- a/backends/vulkan/runtime/vk_api/Runtime.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -// -// A Vulkan Runtime initializes a Vulkan instance and decouples the concept of -// Vulkan instance initialization from initialization of, and subsequent -// interactions with, Vulkan [physical and logical] devices as a precursor to -// multi-GPU support. 
The Vulkan Runtime can be queried for available Adapters -// (i.e. physical devices) in the system which in turn can be used for creation -// of a Vulkan Context (i.e. logical devices). All Vulkan tensors in PyTorch -// are associated with a Context to make tensor <-> device affinity explicit. -// - -enum AdapterSelector { - First, -}; - -struct RuntimeConfig final { - bool enable_validation_messages; - bool init_default_device; - AdapterSelector default_selector; - uint32_t num_requested_queues; - std::string cache_data_path; -}; - -class Runtime final { - public: - explicit Runtime(const RuntimeConfig); - - // Do not allow copying. There should be only one global instance of this - // class. - Runtime(const Runtime&) = delete; - Runtime& operator=(const Runtime&) = delete; - - Runtime(Runtime&&) = delete; - Runtime& operator=(Runtime&&) = delete; - - ~Runtime(); - - using DeviceMapping = std::pair; - using AdapterPtr = std::unique_ptr; - - private: - RuntimeConfig config_; - - VkInstance instance_; - - std::vector device_mappings_; - std::vector adapters_; - uint32_t default_adapter_i_; - - VkDebugReportCallbackEXT debug_report_callback_; - - public: - inline VkInstance instance() const { - return instance_; - } - - inline Adapter* get_adapter_p() { - VK_CHECK_COND( - default_adapter_i_ >= 0 && default_adapter_i_ < adapters_.size(), - "Pytorch Vulkan Runtime: Default device adapter is not set correctly!"); - return adapters_[default_adapter_i_].get(); - } - - inline Adapter* get_adapter_p(uint32_t i) { - VK_CHECK_COND( - i >= 0 && i < adapters_.size(), - "Pytorch Vulkan Runtime: Adapter at index ", - i, - " is not available!"); - return adapters_[i].get(); - } - - inline uint32_t default_adapter_i() const { - return default_adapter_i_; - } - - using Selector = - std::function&)>; - uint32_t create_adapter(const Selector&); -}; - -std::string& set_and_get_pipeline_cache_data_path(const std::string& file_path); - -// The global runtime is retrieved using this function, where it is declared as -// a static local variable. -Runtime* runtime(); - -// Used to share instance + devices between client code and ETVK -Adapter* set_and_get_external_adapter( - const VkInstance instance = VK_NULL_HANDLE, - const VkPhysicalDevice physical_device = VK_NULL_HANDLE, - const VkDevice logical_device = VK_NULL_HANDLE); - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Shader.cpp b/backends/vulkan/runtime/vk_api/Shader.cpp deleted file mode 100644 index 4356f92efe7..00000000000 --- a/backends/vulkan/runtime/vk_api/Shader.cpp +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// ShaderInfo -// - -ShaderInfo::ShaderInfo() - : src_code{ - nullptr, - 0u, - } {} - -ShaderInfo::ShaderInfo( - std::string name, - const uint32_t* const spirv_bin, - const uint32_t size, - std::vector layout, - const utils::uvec3 tile_size, - const bool requires_shader_int16_ext, - const bool requires_16bit_storage_ext, - const bool requires_8bit_storage_ext, - const bool requires_integer_dot_product_ext) - : src_code{ - spirv_bin, - size, - }, - kernel_name{std::move(name)}, - kernel_layout{std::move(layout)}, - out_tile_size(tile_size), - requires_shader_int16(requires_shader_int16_ext), - requires_16bit_storage(requires_16bit_storage_ext), - requires_8bit_storage(requires_8bit_storage_ext), - requires_integer_dot_product(requires_integer_dot_product_ext) { -} - -bool operator==(const ShaderInfo& _1, const ShaderInfo& _2) { - return ( - _1.src_code.bin == _2.src_code.bin && - _1.src_code.size == _2.src_code.size); -} - -// -// ShaderLayout -// - -ShaderLayout::ShaderLayout( - VkDevice device, - const ShaderLayout::Signature& signature) - : device_(device), handle_{VK_NULL_HANDLE} { - std::vector bindings; - bindings.reserve(signature.size()); - - uint32_t binding_num = 0u; - for (const VkDescriptorType type : signature) { - bindings.emplace_back(VkDescriptorSetLayoutBinding{ - binding_num++, // binding - type, // descriptorType - 1u, // descriptorCount - VK_SHADER_STAGE_COMPUTE_BIT, // stageFlags - nullptr, // pImmutableSamplers - }); - } - - const VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info{ - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - static_cast(bindings.size()), // bindingCount - bindings.data(), // pBindings - }; - - VK_CHECK(vkCreateDescriptorSetLayout( - device_, &descriptor_set_layout_create_info, nullptr, &handle_)); -} - -ShaderLayout::ShaderLayout(ShaderLayout&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -ShaderLayout::~ShaderLayout() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyDescriptorSetLayout(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(ShaderLayout& lhs, ShaderLayout& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkDescriptorSetLayout tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// ShaderModule -// - -ShaderModule::ShaderModule(VkDevice device, const ShaderInfo& source) - : device_(device), handle_{VK_NULL_HANDLE} { - const uint32_t* code = source.src_code.bin; - uint32_t size = source.src_code.size; - - const VkShaderModuleCreateInfo shader_module_create_info{ - VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - size, // codeSize - code, // pCode - }; - - VK_CHECK(vkCreateShaderModule( - device_, &shader_module_create_info, nullptr, &handle_)); -} - -ShaderModule::ShaderModule(ShaderModule&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -ShaderModule::~ShaderModule() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyShaderModule(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(ShaderModule& lhs, ShaderModule& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkShaderModule tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = 
rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// ShaderLayoutCache -// - -ShaderLayoutCache::ShaderLayoutCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -ShaderLayoutCache::ShaderLayoutCache(ShaderLayoutCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -ShaderLayoutCache::~ShaderLayoutCache() { - purge(); -} - -VkDescriptorSetLayout ShaderLayoutCache::retrieve( - const ShaderLayoutCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_.insert({key, ShaderLayoutCache::Value(device_, key)}).first; - } - - return it->second.handle(); -} - -void ShaderLayoutCache::purge() { - std::lock_guard lock(cache_mutex_); - cache_.clear(); -} - -// -// ShaderCache -// - -ShaderCache::ShaderCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -ShaderCache::ShaderCache(ShaderCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -ShaderCache::~ShaderCache() { - purge(); -} - -VkShaderModule ShaderCache::retrieve(const ShaderCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_.insert({key, ShaderCache::Value(device_, key)}).first; - } - - return it->second.handle(); -} - -void ShaderCache::purge() { - cache_.clear(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h deleted file mode 100644 index 21332381406..00000000000 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -class ShaderLayout final { - public: - using Signature = std::vector; - - explicit ShaderLayout(VkDevice, const Signature&); - - ShaderLayout(const ShaderLayout&) = delete; - ShaderLayout& operator=(const ShaderLayout&) = delete; - - ShaderLayout(ShaderLayout&&) noexcept; - ShaderLayout& operator=(ShaderLayout&&) = delete; - - ~ShaderLayout(); - - private: - VkDevice device_; - VkDescriptorSetLayout handle_; - - public: - VkDescriptorSetLayout handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. 
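`ShaderLayoutCache::retrieve()` and `ShaderCache::retrieve()` above share the same shape: take the mutex, look the key up, construct the value in place on a miss, and return the cached object. A generic sketch of that pattern, with the value construction factored into a caller-supplied factory:

```
#include <mutex>
#include <unordered_map>

template <typename Key, typename Value, typename Hasher = std::hash<Key>>
class HandleCache {
 public:
  // Returns the cached value for `key`, constructing it on first use.
  // The mutex makes concurrent retrieve() calls safe.
  template <typename Factory>
  Value& retrieve(const Key& key, Factory&& make_value) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = cache_.find(key);
    if (it == cache_.cend()) {
      it = cache_.emplace(key, make_value(key)).first;
    }
    return it->second;
  }

  void purge() {
    std::lock_guard<std::mutex> lock(mutex_);
    cache_.clear();
  }

 private:
  std::mutex mutex_;
  std::unordered_map<Key, Value, Hasher> cache_;
};
```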
- friend void swap(ShaderLayout& lhs, ShaderLayout& rhs) noexcept; -}; - -struct ShaderInfo final { - struct { - const uint32_t* bin = nullptr; - uint32_t size = 0u; - } src_code; - - std::string kernel_name{""}; - ShaderLayout::Signature kernel_layout{}; - - // Shader Metadata - utils::WorkgroupSize out_tile_size{1u, 1u, 1u}; - bool requires_shader_int16 = false; - bool requires_16bit_storage = false; - bool requires_8bit_storage = false; - bool requires_integer_dot_product = false; - - explicit ShaderInfo(); - - explicit ShaderInfo( - std::string, - const uint32_t*, - const uint32_t, - std::vector, - const utils::uvec3 tile_size, - const bool requires_shader_int16_ext, - const bool requires_16bit_storage_ext, - const bool requires_8bit_storage_ext, - const bool requires_integer_dot_product_ext); - - operator bool() const { - return src_code.bin != nullptr; - }; -}; - -bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); - -class ShaderModule final { - public: - explicit ShaderModule(VkDevice device, const ShaderInfo& source); - - ShaderModule(const ShaderModule&) = delete; - ShaderModule& operator=(const ShaderModule&) = delete; - - ShaderModule(ShaderModule&&) noexcept; - ShaderModule& operator=(ShaderModule&&) = delete; - - ~ShaderModule(); - - private: - VkDevice device_; - VkShaderModule handle_; - - public: - inline VkShaderModule handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. - friend void swap(ShaderModule& lhs, ShaderModule& rhs) noexcept; -}; - -class ShaderLayoutCache final { - public: - explicit ShaderLayoutCache(VkDevice device); - - ShaderLayoutCache(const ShaderLayoutCache&) = delete; - ShaderLayoutCache& operator=(const ShaderLayoutCache&) = delete; - - ShaderLayoutCache(ShaderLayoutCache&&) noexcept; - ShaderLayoutCache& operator=(ShaderLayoutCache&&) = delete; - - ~ShaderLayoutCache(); - - using Key = ShaderLayout::Signature; - using Value = ShaderLayout; - - struct Hasher { - inline size_t operator()(const ShaderLayout::Signature& signature) const { - size_t hashed = 0u; - - for (const VkDescriptorType type : signature) { - hashed = - utils::hash_combine(hashed, std::hash()(type)); - } - - return hashed; - } - }; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkDescriptorSetLayout retrieve(const Key&); - void purge(); -}; - -class ShaderCache final { - public: - explicit ShaderCache(VkDevice device); - - ShaderCache(const ShaderCache&) = delete; - ShaderCache& operator=(const ShaderCache&) = delete; - - ShaderCache(ShaderCache&&) noexcept; - ShaderCache& operator=(ShaderCache&&) = delete; - - ~ShaderCache(); - - using Key = ShaderInfo; - using Value = ShaderModule; - - struct Hasher { - inline size_t operator()(const ShaderInfo& source) const { - size_t seed = 0; - seed = utils::hash_combine( - seed, std::hash()(source.src_code.bin)); - seed = utils::hash_combine( - seed, std::hash()(source.src_code.size)); - - return seed; - } - }; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkShaderModule retrieve(const Key&); - void purge(); -}; - -} // namespace vkapi -} // namespace 
vkcompute - -inline bool operator==( - const VkDescriptorSetLayoutBinding& _1, - const VkDescriptorSetLayoutBinding& _2) { - return ( - _1.binding == _2.binding && _1.descriptorType == _2.descriptorType && - _1.descriptorCount == _2.descriptorCount && - _1.stageFlags == _2.stageFlags && - _1.pImmutableSamplers == _2.pImmutableSamplers); -} diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h deleted file mode 100644 index b3309aa6c69..00000000000 --- a/backends/vulkan/runtime/vk_api/Types.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY bugprone-branch-clone - -#include - -#include - -#include -#include - -// X11 headers via volk define Bool, so we need to undef it -#if defined(__linux__) -#undef Bool -#endif - -#ifdef USE_VULKAN_FP16_INFERENCE -#define VK_FORMAT_FLOAT4 VK_FORMAT_R16G16B16A16_SFLOAT -#else -#define VK_FORMAT_FLOAT4 VK_FORMAT_R32G32B32A32_SFLOAT -#endif /* USE_VULKAN_FP16_INFERENCE */ - -#define VK_FORALL_SCALAR_TYPES(_) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Byte) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Bool) \ - _(int8_t, VK_FORMAT_R8G8B8A8_SINT, Char) \ - _(uint16_t, VK_FORMAT_R16G16B16A16_SFLOAT, Half) \ - _(uint16_t, VK_FORMAT_R16G16B16A16_UINT, UInt16) \ - _(int16_t, VK_FORMAT_R16G16B16A16_SINT, Short) \ - _(uint32_t, VK_FORMAT_R32G32B32A32_UINT, UInt) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int) \ - _(uint64_t, VK_FORMAT_R64G64B64A64_UINT, UInt64) \ - _(int64_t, VK_FORMAT_R64G64B64A64_SINT, Long) \ - _(float, VK_FORMAT_FLOAT4, Float) \ - _(double, VK_FORMAT_R64G64B64A64_SFLOAT, Double) \ - _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) - -namespace vkcompute { -namespace vkapi { - -// -// Scalar Types -// - -enum class ScalarType : int8_t { -#define DEFINE_ENUM_VAL_(ctype, vkformat, name) name, - VK_FORALL_SCALAR_TYPES(DEFINE_ENUM_VAL_) -#undef DEFINE_ENUM_VAL_ - Undefined, - NumOptions -}; - -#define DEFINE_CONSTANT(ctype, vkformat, name) \ - constexpr ScalarType k##name = ScalarType::name; - -VK_FORALL_SCALAR_TYPES(DEFINE_CONSTANT) -#undef DEFINE_CONSTANT - -/* - * Given a `ScalarType`, return the corresponding `VkFormat` that should be used - * for image texture storage. The `ScalarType` to `VkFormat` mapping is dictated - * by the `VK_FORALL_SCALAR_TYPE` macro in `api/Types.h` - */ -inline VkFormat to_vkformat(const ScalarType t) { -#define CASE_VK_FORMAT(ctype, vkformat, name) \ - case ScalarType::name: \ - return vkformat; - - switch (t) { - VK_FORALL_SCALAR_TYPES(CASE_VK_FORMAT) - default: - VK_THROW("Unknown ScalarType: ", t); - } -#undef CASE_VK_FORMAT -} - -/* - * Given a `VkFormat`, return the `ScalarType` that best represents the data - * type of invidivual elements in an image texture of the `VkFormat`. Note that - * this mapping is different from the `to_vkformat()` function, since different - * `ScalarType`s may use the same `VkFormat`. 
- */ -inline ScalarType element_scalartype(const VkFormat vkformat) { - switch (vkformat) { - case VK_FORMAT_R64G64B64A64_SFLOAT: - return kDouble; - case VK_FORMAT_R32G32B32A32_SFLOAT: - return kFloat; - case VK_FORMAT_R16G16B16A16_SFLOAT: - return kHalf; - case VK_FORMAT_R8G8B8A8_SINT: - return kChar; - case VK_FORMAT_R8G8B8A8_UINT: - case VK_FORMAT_R8G8B8A8_UNORM: - return kByte; - case VK_FORMAT_R16G16B16A16_SINT: - return kShort; - case VK_FORMAT_R16G16B16A16_UINT: - return kUInt16; - case VK_FORMAT_R32G32B32A32_SINT: - return kInt; - case VK_FORMAT_R32G32B32A32_UINT: - return kUInt; - case VK_FORMAT_R64G64B64A64_SINT: - return kLong; - case VK_FORMAT_R64G64B64A64_UINT: - return kUInt64; - default: - VK_THROW("No corresponding scalar type for unknown VkFormat: ", vkformat); - } -} - -/* - * Given a ScalarType, return `sizeof(ctype)` where ctype is the C type - * corresponding to the ScalarType. The C type to ScalarType mapping is dictated - * by the VK_FORALL_SCALAR_TYPE macro in api/Types.h - */ -inline size_t element_size(const ScalarType t) { -#define CASE_ELEMENTSIZE_CASE(ctype, vkformat, name) \ - case ScalarType::name: \ - return sizeof(ctype); - - switch (t) { - VK_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) - default: - VK_THROW("Unknown ScalarType: ", t); - } -#undef CASE_ELEMENTSIZE_CASE -} - -inline const char* to_string(const ScalarType t) { -#define CASE_TO_STRING(ctype, vkformat, name) \ - case ScalarType::name: \ - return #name; - - switch (t) { - VK_FORALL_SCALAR_TYPES(CASE_TO_STRING) - default: - return "UNKNOWN_SCALAR_TYPE"; - } -#undef CASE_TO_STRING -} - -inline std::ostream& operator<<(std::ostream& os, const ScalarType dtype) { - return os << to_string(dtype); -} - -// -// Map ScalarTypes to C++ types -// - -template -struct ScalarTypeToCType; - -#define SPECIALIZE_ScalarTypeToCType(ctype, vkformat, scalar_type) \ - template <> \ - struct ScalarTypeToCType<::vkcompute::vkapi::ScalarType::scalar_type> { \ - using type = ctype; \ - }; - -VK_FORALL_SCALAR_TYPES(SPECIALIZE_ScalarTypeToCType) - -#undef SPECIALIZE_ScalarTypeToCPPType - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/VkUtils.h b/backends/vulkan/runtime/vk_api/VkUtils.h deleted file mode 100644 index b765d417d41..00000000000 --- a/backends/vulkan/runtime/vk_api/VkUtils.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { -namespace vkapi { - -inline VkExtent3D create_extent3d(const utils::uvec3& extents) { - return VkExtent3D{extents[0u], extents[1u], extents[2u]}; -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp deleted file mode 100644 index fc2de39c811..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
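As a quick check of the macro-generated helpers above (`to_vkformat`, `element_scalartype`, `element_size`), assuming the Types.h shown here is reachable as `executorch/backends/vulkan/runtime/vk_api/Types.h` and `USE_VULKAN_FP16_INFERENCE` is not defined:

```
#include <cassert>
#include <cstdint>

#include <executorch/backends/vulkan/runtime/vk_api/Types.h>

void scalar_type_examples() {
  using namespace vkcompute::vkapi;

  // ScalarType -> texture format, as dictated by VK_FORALL_SCALAR_TYPES
  assert(to_vkformat(kFloat) == VK_FORMAT_R32G32B32A32_SFLOAT);
  assert(to_vkformat(kHalf) == VK_FORMAT_R16G16B16A16_SFLOAT);

  // Texture format -> the scalar type of a single texel channel
  assert(element_scalartype(VK_FORMAT_R32G32B32A32_SINT) == kInt);

  // Size of the C type backing each ScalarType
  assert(element_size(kInt) == sizeof(int32_t));   // 4
  assert(element_size(kDouble) == sizeof(double)); // 8
}
```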
- */ - -#include - -#define PRINT_FIELD(struct, field) #field << ": " << struct.field << std::endl - -std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats) { - VmaDetailedStatistics total_stats = stats.total; - out << "VmaTotalStatistics: " << std::endl; - out << " " << PRINT_FIELD(total_stats.statistics, blockCount); - out << " " << PRINT_FIELD(total_stats.statistics, allocationCount); - out << " " << PRINT_FIELD(total_stats.statistics, blockBytes); - out << " " << PRINT_FIELD(total_stats.statistics, allocationBytes); - return out; -} - -#undef PRINT_FIELD - -namespace vkcompute { -namespace vkapi { - -Allocation::Allocation() - : allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), is_copy_(false) {} - -Allocation::Allocation( - VmaAllocator vma_allocator, - const VkMemoryRequirements& mem_props, - const VmaAllocationCreateInfo& create_info) - : allocator(vma_allocator), allocation(VK_NULL_HANDLE), is_copy_(false) { - VK_CHECK(vmaAllocateMemory( - allocator, &mem_props, &create_info, &allocation, nullptr)); -} - -Allocation::Allocation(const Allocation& other) noexcept - : allocator(other.allocator), - allocation(other.allocation), - is_copy_(true) {} - -Allocation::Allocation(Allocation&& other) noexcept - : allocator(other.allocator), - allocation(other.allocation), - is_copy_(other.is_copy_) { - other.allocation = VK_NULL_HANDLE; -} - -Allocation& Allocation::operator=(Allocation&& other) noexcept { - VmaAllocation tmp_allocation = allocation; - - allocator = other.allocator; - allocation = other.allocation; - is_copy_ = other.is_copy_; - - other.allocation = tmp_allocation; - - return *this; -} - -Allocation::~Allocation() { - // Do not destroy the VmaAllocation if this class instance is a copy of some - // other class instance, since this means that this class instance does not - // have ownership of the underlying resource. - if (allocation != VK_NULL_HANDLE && !is_copy_) { - vmaFreeMemory(allocator, allocation); - } -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h deleted file mode 100644 index e56605e14b2..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats); - -namespace vkcompute { -namespace vkapi { - -struct Allocation final { - explicit Allocation(); - - explicit Allocation( - const VmaAllocator, - const VkMemoryRequirements&, - const VmaAllocationCreateInfo&); - - protected: - /* - * The Copy constructor allows for creation of a class instance that are - * "aliases" of another class instance. The resulting class instance will not - * have ownership of the underlying VmaAllocation. - * - * This behaviour is analogous to creating a copy of a pointer, thus it is - * unsafe, as the original class instance may be destroyed before the copy. 
- * These constructors are therefore marked protected so that they may be used - * only in situations where the lifetime of the original class instance is - * guaranteed to exceed, or at least be the same as, the lifetime of the - * copied class instance. - */ - Allocation(const Allocation&) noexcept; - - public: - // To discourage creating copies, the assignment operator is still deleted. - Allocation& operator=(const Allocation&) = delete; - - Allocation(Allocation&&) noexcept; - Allocation& operator=(Allocation&&) noexcept; - - ~Allocation(); - - // The allocator object this was allocated from - VmaAllocator allocator; - // Handles to the allocated memory - VmaAllocation allocation; - - private: - // Indicates whether this class instance is a copy of another class instance, - // in which case it does not have ownership of the underlying VmaAllocation - bool is_copy_; - - public: - operator bool() const { - return (allocation != VK_NULL_HANDLE); - } - - inline bool is_copy() const { - return is_copy_; - } - - friend class VulkanBuffer; - friend class VulkanImage; -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp deleted file mode 100644 index 7976d0ddee5..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace vkapi { - -Allocator::Allocator( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device) - : instance_{}, - physical_device_(physical_device), - device_(device), - allocator_{VK_NULL_HANDLE} { - VmaVulkanFunctions vk_functions{}; - vk_functions.vkGetInstanceProcAddr = vkGetInstanceProcAddr; - vk_functions.vkGetDeviceProcAddr = vkGetDeviceProcAddr; - - const VmaAllocatorCreateInfo allocator_create_info{ - 0u, // flags - physical_device_, // physicalDevice - device_, // device - 0u, // preferredLargeHeapBlockSize - nullptr, // pAllocationCallbacks - nullptr, // pDeviceMemoryCallbacks - nullptr, // pHeapSizeLimit - &vk_functions, // pVulkanFunctions - instance, // instance - VK_API_VERSION_1_0, // vulkanApiVersion - nullptr, // pTypeExternalMemoryHandleTypes - }; - - VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator_)); -} - -Allocator::Allocator(Allocator&& other) noexcept - : instance_(other.instance_), - physical_device_(other.physical_device_), - device_(other.device_), - allocator_(other.allocator_) { - other.allocator_ = VK_NULL_HANDLE; - other.device_ = VK_NULL_HANDLE; - other.physical_device_ = VK_NULL_HANDLE; - other.instance_ = VK_NULL_HANDLE; -} - -Allocator::~Allocator() { - if (allocator_ == VK_NULL_HANDLE) { - return; - } - vmaDestroyAllocator(allocator_); -} - -VmaAllocationCreateInfo Allocator::gpuonly_resource_create_info() { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - return alloc_create_info; -} - -Allocation Allocator::create_allocation( - const VkMemoryRequirements& memory_requirements, - const VmaAllocationCreateInfo& create_info) { - VmaAllocationCreateInfo alloc_create_info = create_info; - // Protect against using VMA_MEMORY_USAGE_AUTO_* flags when allocating 
memory - // directly, since those usage flags require that VkBufferCreateInfo and/or - // VkImageCreateInfo also be available. - switch (create_info.usage) { - // The logic for the below usage options are too complex, therefore prevent - // those from being used with direct memory allocation. - case VMA_MEMORY_USAGE_AUTO: - case VMA_MEMORY_USAGE_AUTO_PREFER_HOST: - VK_THROW( - "Only the VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE usage flag is compatible with create_allocation()"); - break; - // Most of the time, VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE will simply set the - // DEVICE_LOCAL_BIT as a preferred memory flag. Therefore the below is a - // decent approximation for VMA behaviour. - case VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE: - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_UNKNOWN; - break; - default: - break; - } - - return Allocation(allocator_, memory_requirements, alloc_create_info); -} - -VulkanImage Allocator::create_image( - const VkDevice device, - const VkExtent3D& extents, - const VkFormat image_format, - const VkImageType image_type, - const VkImageTiling image_tiling, - const VkImageViewType image_view_type, - const VulkanImage::SamplerProperties& sampler_props, - VkSampler sampler, - const bool allow_transfer, - const bool allocate_memory) { - VkImageUsageFlags usage = - VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT; - if (allow_transfer) { - usage |= - (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); - } - - VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); - - const VulkanImage::ImageProperties image_props{ - image_type, - image_format, - extents, - image_tiling, - usage, - }; - - const VulkanImage::ViewProperties view_props{ - image_view_type, - image_format, - }; - - const VkImageLayout initial_layout = VK_IMAGE_LAYOUT_UNDEFINED; - - return VulkanImage( - device, - allocator_, - alloc_create_info, - image_props, - view_props, - sampler_props, - sampler, - initial_layout, - allocate_memory); -} - -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - - // Staging buffers are accessed by both the CPU and GPU, so set the - // appropriate flags to indicate that the host device will be accessing - // the data from this buffer. 
- alloc_create_info.flags |= - VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | - VMA_ALLOCATION_CREATE_MAPPED_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - -VulkanBuffer Allocator::create_storage_buffer( - const VkDeviceSize size, - const bool allocate_memory) { - const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - - VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); - return VulkanBuffer( - allocator_, size, alloc_create_info, buffer_usage, allocate_memory); -} - -VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | - VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO; - - VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h deleted file mode 100644 index 8f76ca932b7..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include -#include -#include - -namespace vkcompute { -namespace vkapi { - -constexpr VmaAllocationCreateFlags DEFAULT_ALLOCATION_STRATEGY = - VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT; - -class Allocator final { - public: - explicit Allocator( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device); - - Allocator(const Allocator&) = delete; - Allocator& operator=(const Allocator&) = delete; - - Allocator(Allocator&&) noexcept; - Allocator& operator=(Allocator&&) = delete; - - ~Allocator(); - - private: - VkInstance instance_; - VkPhysicalDevice physical_device_; - VkDevice device_; - VmaAllocator allocator_; - - public: - VmaAllocationCreateInfo gpuonly_resource_create_info(); - - Allocation create_allocation( - const VkMemoryRequirements& memory_requirements, - const VmaAllocationCreateInfo& create_info); - - VulkanImage create_image( - const VkDevice, - const VkExtent3D&, - const VkFormat, - const VkImageType, - const VkImageTiling, - const VkImageViewType, - const VulkanImage::SamplerProperties&, - VkSampler, - const bool allow_transfer = false, - const bool allocate_memory = true); - - VulkanBuffer create_staging_buffer(const VkDeviceSize); - - VulkanBuffer create_storage_buffer( - const VkDeviceSize, - const bool allocate_memory = true); - - /* - * Create a uniform buffer with a specified size - */ - VulkanBuffer create_uniform_buffer(const VkDeviceSize); - - /* - * Create a uniform buffer containing the data in an arbitrary struct - */ - template - VulkanBuffer create_params_buffer(const Block& block); - - VmaTotalStatistics get_memory_statistics() const { - VmaTotalStatistics stats = {}; - vmaCalculateStatistics(allocator_, &stats); - return stats; - } -}; - -// -// Impl -// - -template -inline VulkanBuffer Allocator::create_params_buffer(const Block& block) { - VulkanBuffer uniform_buffer = create_uniform_buffer(sizeof(Block)); - - // Fill the uniform buffer with data in block - { - MemoryMap mapping(uniform_buffer, MemoryAccessType::WRITE); - Block* data_ptr = mapping.template data(); - - *data_ptr = block; - } - - return uniform_buffer; -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp deleted file mode 100644 index f10e40abdbb..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace vkapi { - -// -// VulkanBuffer -// - -VulkanBuffer::VulkanBuffer() - : buffer_properties_{}, - allocator_(VK_NULL_HANDLE), - memory_{}, - owns_memory_(false), - memory_bundled_(false), - is_copy_(false), - handle_(VK_NULL_HANDLE) {} - -VulkanBuffer::VulkanBuffer( - VmaAllocator vma_allocator, - const VkDeviceSize size, - const VmaAllocationCreateInfo& allocation_create_info, - const VkBufferUsageFlags usage, - const bool allocate_memory) - : buffer_properties_({size, 0u, size}), - allocator_(vma_allocator), - memory_{}, - owns_memory_(allocate_memory), - memory_bundled_(allocate_memory), - is_copy_(false), - handle_(VK_NULL_HANDLE) { - // If the buffer size is 0, allocate a buffer with a size of 1 byte. 
This is - // to ensure that there will be some resource that can be bound to a shader. - if (size == 0) { - buffer_properties_.size = 1u; - buffer_properties_.mem_range = 1u; - } - - const VkBufferCreateInfo buffer_create_info{ - VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - buffer_properties_.size, // size - usage, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - }; - - if (allocate_memory) { - VK_CHECK(vmaCreateBuffer( - allocator_, - &buffer_create_info, - &allocation_create_info, - &handle_, - &(memory_.allocation), - nullptr)); - } else { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - VK_CHECK(vkCreateBuffer( - allocator_info.device, &buffer_create_info, nullptr, &handle_)); - } -} - -VulkanBuffer::VulkanBuffer( - const VulkanBuffer& other, - const VkDeviceSize offset, - const VkDeviceSize range) noexcept - : buffer_properties_(other.buffer_properties_), - allocator_(other.allocator_), - memory_(other.memory_), - owns_memory_(false), - memory_bundled_(false), - is_copy_(true), - handle_(other.handle_) { - // TODO: set the offset and range appropriately - buffer_properties_.mem_offset = other.buffer_properties_.mem_offset + offset; - if (range != VK_WHOLE_SIZE) { - buffer_properties_.mem_range = range; - } -} - -VulkanBuffer::VulkanBuffer(VulkanBuffer&& other) noexcept - : buffer_properties_(other.buffer_properties_), - allocator_(other.allocator_), - memory_(std::move(other.memory_)), - owns_memory_(other.owns_memory_), - memory_bundled_(other.memory_bundled_), - is_copy_(other.is_copy_), - handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -VulkanBuffer& VulkanBuffer::operator=(VulkanBuffer&& other) noexcept { - VkBuffer tmp_buffer = handle_; - bool tmp_owns_memory = owns_memory_; - bool tmp_memory_bundled = memory_bundled_; - - buffer_properties_ = other.buffer_properties_; - allocator_ = other.allocator_; - memory_ = std::move(other.memory_); - owns_memory_ = other.owns_memory_; - memory_bundled_ = other.memory_bundled_; - is_copy_ = other.is_copy_; - handle_ = other.handle_; - - other.handle_ = tmp_buffer; - other.owns_memory_ = tmp_owns_memory; - other.memory_bundled_ = tmp_memory_bundled; - - return *this; -} - -VulkanBuffer::~VulkanBuffer() { - // Do not destroy the VkBuffer if this class instance is a copy of another - // class instance, since this means that this class instance does not have - // ownership of the underlying resource. 
- if (handle_ != VK_NULL_HANDLE && !is_copy_) { - if (owns_memory_) { - if (memory_bundled_) { - vmaDestroyBuffer(allocator_, handle_, memory_.allocation); - // Prevent the underlying memory allocation from being freed; it was - // freed by vmaDestroyImage - memory_.allocation = VK_NULL_HANDLE; - } else { - vkDestroyBuffer(this->device(), handle_, nullptr); - // Allow underlying memory allocation to be freed by the destructor of - // Allocation class - } - } else { - vkDestroyBuffer(this->device(), handle_, nullptr); - // Prevent the underlying memory allocation from being freed since this - // object doesn't own it - memory_.allocation = VK_NULL_HANDLE; - } - } -} - -VmaAllocationInfo VulkanBuffer::allocation_info() const { - VmaAllocationInfo info; - vmaGetAllocationInfo(allocator_, memory_.allocation, &info); - return info; -} - -void VulkanBuffer::bind_allocation_impl(const Allocation& memory) { - VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); - if (!is_copy_) { - VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); - } -} - -void VulkanBuffer::bind_allocation(const Allocation& memory) { - bind_allocation_impl(memory); - memory_.allocation = memory.allocation; -} - -void VulkanBuffer::acquire_allocation(Allocation&& memory) { - bind_allocation_impl(memory); - memory_ = std::move(memory); - owns_memory_ = true; -} - -VkMemoryRequirements VulkanBuffer::get_memory_requirements() const { - VkMemoryRequirements memory_requirements; - vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements); - return memory_requirements; -} - -// -// MemoryMap -// - -MemoryMap::MemoryMap(const VulkanBuffer& buffer, const uint8_t access) - : access_(access), - allocator_(buffer.vma_allocator()), - allocation_(buffer.allocation()), - data_(nullptr), - data_len_{buffer.mem_size()} { - if (allocation_) { - VK_CHECK(vmaMapMemory(allocator_, allocation_, &data_)); - } -} - -MemoryMap::MemoryMap(MemoryMap&& other) noexcept - : access_(other.access_), - allocator_(other.allocator_), - allocation_(other.allocation_), - data_(other.data_), - data_len_{other.data_len_} { - other.allocation_ = VK_NULL_HANDLE; - other.data_ = nullptr; -} - -MemoryMap::~MemoryMap() { - if (!data_) { - return; - } - - if (allocation_) { - if (access_ & MemoryAccessType::WRITE) { - // Call will be ignored by implementation if the memory type this - // allocation belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is - // the behavior we want. Don't check the result here as the destructor - // cannot throw. - vmaFlushAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE); - } - - vmaUnmapMemory(allocator_, allocation_); - } -} - -void MemoryMap::invalidate() { - if (access_ & MemoryAccessType::READ && allocation_) { - // Call will be ignored by implementation if the memory type this allocation - // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior - // we want. 
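The `bind_allocation`/`acquire_allocation` pair above lets a buffer be created first and receive separately allocated memory later. A minimal sketch of that flow, assuming a `vkapi::Allocator` named `allocator` and a byte count `nbytes` (names are illustrative, not taken from the deleted sources):

```cpp
// Create the VkBuffer without any backing memory.
vkapi::VulkanBuffer buffer =
    allocator.create_storage_buffer(nbytes, /*allocate_memory=*/false);

// Size, alignment, and memory-type requirements come from the buffer itself.
const VkMemoryRequirements reqs = buffer.get_memory_requirements();

// Allocate device-local memory separately, then hand ownership to the buffer;
// acquire_allocation() binds the memory via vmaBindBufferMemory internally.
vkapi::Allocation memory = allocator.create_allocation(
    reqs, allocator.gpuonly_resource_create_info());
buffer.acquire_allocation(std::move(memory));
```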
- VK_CHECK( - vmaInvalidateAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE)); - } -} - -// -// BufferMemoryBarrier -// - -BufferMemoryBarrier::BufferMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VulkanBuffer& buffer) - : handle{ - VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, // sType - nullptr, // pNext - src_access_flags, // srcAccessMask - dst_access_flags, // dstAccessMask - VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex - VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex - buffer.handle_, // buffer - buffer.buffer_properties_.mem_offset, // offset - buffer.buffer_properties_.mem_range, // size - } {} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h deleted file mode 100644 index 582b537465d..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -namespace vkcompute { - -// Forward declare vTensor classes such that they can be set as friend classes -namespace api { -class vTensorStorage; -} // namespace api - -namespace vkapi { - -using MemoryAccessFlags = uint8_t; - -enum MemoryAccessType : MemoryAccessFlags { - NONE = 0u << 0u, - READ = 1u << 0u, - WRITE = 1u << 1u, -}; - -static constexpr MemoryAccessFlags kReadWrite = - MemoryAccessType::WRITE | MemoryAccessType::READ; - -static constexpr MemoryAccessFlags kRead = MemoryAccessType::READ; - -static constexpr MemoryAccessFlags kWrite = MemoryAccessType::WRITE; - -class VulkanBuffer final { - public: - struct BufferProperties final { - VkDeviceSize size; - VkDeviceSize mem_offset; - VkDeviceSize mem_range; - }; - - explicit VulkanBuffer(); - - explicit VulkanBuffer( - const VmaAllocator, - const VkDeviceSize, - const VmaAllocationCreateInfo&, - const VkBufferUsageFlags, - const bool allocate_memory = true); - - protected: - /* - * The Copy constructor and allows for creation of a class instance that are - * "aliases" of another class instance. The resulting class instance will not - * have ownership of the underlying VkBuffer. - * - * This behaviour is analogous to creating a copy of a pointer, thus it is - * unsafe, as the original class instance may be destroyed before the copy. - * These constructors are therefore marked protected so that they may be used - * only in situations where the lifetime of the original class instance is - * guaranteed to exceed, or at least be the same as, the lifetime of the - * copied class instance. - */ - VulkanBuffer( - const VulkanBuffer& other, - const VkDeviceSize offset = 0u, - const VkDeviceSize range = VK_WHOLE_SIZE) noexcept; - - public: - // To discourage creating copies, the assignment operator is still deleted. 
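The `BufferMemoryBarrier` defined just above only fills in a `VkBufferMemoryBarrier` struct; recording it is left to the caller. A hedged sketch of how such a barrier might be recorded between two compute dispatches (the command buffer and the pipeline stage choices are assumptions, not taken from the deleted code):

```cpp
// Guard a shader write to `buffer` before a subsequent shader read.
vkapi::BufferMemoryBarrier barrier(
    VK_ACCESS_SHADER_WRITE_BIT, // srcAccessMask
    VK_ACCESS_SHADER_READ_BIT,  // dstAccessMask
    buffer);

vkCmdPipelineBarrier(
    cmd_buffer,
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
    0u,                                   // dependencyFlags
    0u, nullptr,                          // memory barriers
    1u, &barrier.handle,                  // buffer memory barriers
    0u, nullptr);                         // image memory barriers
```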
- VulkanBuffer& operator=(const VulkanBuffer& other) = delete; - - VulkanBuffer(VulkanBuffer&&) noexcept; - VulkanBuffer& operator=(VulkanBuffer&&) noexcept; - - ~VulkanBuffer(); - - struct Package final { - VkBuffer handle; - VkDeviceSize buffer_offset; - VkDeviceSize buffer_range; - }; - - friend struct BufferMemoryBarrier; - - private: - BufferProperties buffer_properties_; - VmaAllocator allocator_; - Allocation memory_; - // Indicates whether the underlying memory is owned by this resource - bool owns_memory_; - // Indicates whether the allocation for the buffer was created with the buffer - // via vmaCreateBuffer; if this is false, the memory is owned but was bound - // separately via vmaBindBufferMemory - bool memory_bundled_; - // Indicates whether this VulkanBuffer was copied from another VulkanBuffer, - // thus it does not have ownership of the underlying VKBuffer - bool is_copy_; - VkBuffer handle_; - - public: - inline VkDevice device() const { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - return allocator_info.device; - } - - inline VmaAllocator vma_allocator() const { - return allocator_; - } - - inline VmaAllocation allocation() const { - return memory_.allocation; - } - - VmaAllocationInfo allocation_info() const; - - inline VkBuffer handle() const { - return handle_; - } - - inline VkDeviceSize mem_offset() const { - return buffer_properties_.mem_offset; - } - - inline VkDeviceSize mem_range() const { - return buffer_properties_.mem_range; - } - - inline VkDeviceSize mem_size() const { - return buffer_properties_.size; - } - - inline size_t mem_size_as_size_t() const { - return utils::safe_downcast(mem_size()); - } - - inline bool has_memory() const { - return (memory_.allocation != VK_NULL_HANDLE); - } - - inline bool owns_memory() const { - return owns_memory_; - } - - inline bool is_copy() const { - return is_copy_; - } - - operator bool() const { - return (handle_ != VK_NULL_HANDLE); - } - - inline bool is_copy_of(const VulkanBuffer& other) const { - return (handle_ == other.handle_) && is_copy_; - } - - private: - void bind_allocation_impl(const Allocation& memory); - - public: - /* - * Given a memory allocation, bind it to the underlying VkImage. The lifetime - * of the memory allocation is assumed to be managed externally. - */ - void bind_allocation(const Allocation& memory); - - /* - * Given a rvalue memory allocation, bind it to the underlying VkImage and - * also acquire ownership of the memory allocation. 
- */ - void acquire_allocation(Allocation&& memory); - - VkMemoryRequirements get_memory_requirements() const; - - friend class api::vTensorStorage; -}; - -class MemoryMap final { - public: - explicit MemoryMap( - const VulkanBuffer& buffer, - const MemoryAccessFlags access); - - MemoryMap(const MemoryMap&) = delete; - MemoryMap& operator=(const MemoryMap&) = delete; - - MemoryMap(MemoryMap&&) noexcept; - MemoryMap& operator=(MemoryMap&&) = delete; - - ~MemoryMap(); - - private: - uint8_t access_; - VmaAllocator allocator_; - VmaAllocation allocation_; - void* data_; - VkDeviceSize data_len_; - - public: - template - T* data(const uint32_t offset = 0) { - return reinterpret_cast(static_cast(data_) + offset); - } - - inline size_t nbytes() { - return utils::safe_downcast(data_len_); - } - - void invalidate(); -}; - -struct BufferMemoryBarrier final { - VkBufferMemoryBarrier handle; - - BufferMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VulkanBuffer& buffer); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp deleted file mode 100644 index cadeb779c83..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Image.cpp +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace vkapi { - -// -// ImageSampler -// - -bool operator==( - const ImageSampler::Properties& _1, - const ImageSampler::Properties& _2) { - return ( - _1.filter == _2.filter && _1.mipmap_mode == _2.mipmap_mode && - _1.address_mode == _2.address_mode && _1.border_color == _2.border_color); -} - -ImageSampler::ImageSampler( - VkDevice device, - const ImageSampler::Properties& props) - : device_(device), handle_(VK_NULL_HANDLE) { - const VkSamplerCreateInfo sampler_create_info{ - VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - props.filter, // magFilter - props.filter, // minFilter - props.mipmap_mode, // mipmapMode - props.address_mode, // addressModeU - props.address_mode, // addressModeV - props.address_mode, // addressModeW - 0.0f, // mipLodBias - VK_FALSE, // anisotropyEnable - 1.0f, // maxAnisotropy, - VK_FALSE, // compareEnable - VK_COMPARE_OP_NEVER, // compareOp - 0.0f, // minLod - VK_LOD_CLAMP_NONE, // maxLod - props.border_color, // borderColor - VK_FALSE, // unnormalizedCoordinates - }; - - VK_CHECK(vkCreateSampler(device_, &sampler_create_info, nullptr, &handle_)); -} - -ImageSampler::ImageSampler(ImageSampler&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -ImageSampler::~ImageSampler() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroySampler(device_, handle_, nullptr); -} - -size_t ImageSampler::Hasher::operator()( - const ImageSampler::Properties& props) const { - size_t seed = 0; - seed = utils::hash_combine(seed, std::hash()(props.filter)); - seed = utils::hash_combine( - seed, std::hash()(props.mipmap_mode)); - seed = utils::hash_combine( - seed, std::hash()(props.address_mode)); - seed = - utils::hash_combine(seed, std::hash()(props.border_color)); - return seed; -} - -void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkSampler tmp_handle = 
lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// VulkanImage -// - -VulkanImage::VulkanImage() - : device_{VK_NULL_HANDLE}, - image_properties_{}, - view_properties_{}, - sampler_properties_{}, - allocator_(VK_NULL_HANDLE), - memory_{}, - owns_memory_(false), - memory_bundled_(false), - owns_view_(false), - is_copy_(false), - handles_{ - VK_NULL_HANDLE, - VK_NULL_HANDLE, - VK_NULL_HANDLE, - }, - layout_{} {} - -VulkanImage::VulkanImage( - VkDevice device, - VmaAllocator vma_allocator, - const VmaAllocationCreateInfo& allocation_create_info, - const ImageProperties& image_props, - const ViewProperties& view_props, - const SamplerProperties& sampler_props, - VkSampler sampler, - const VkImageLayout layout, - const bool allocate_memory) - : device_{device}, - image_properties_(image_props), - view_properties_(view_props), - sampler_properties_(sampler_props), - allocator_(vma_allocator), - memory_{}, - owns_memory_{allocate_memory}, - memory_bundled_(allocate_memory), - owns_view_(false), - is_copy_(false), - handles_{ - VK_NULL_HANDLE, - VK_NULL_HANDLE, - sampler, - }, - layout_(layout) { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - - // If any dims are zero, then allocate a 1x1x1 image texture. This is to - // ensure that there will be some resource that can be bound to a shader. - if (image_props.image_extents.width == 0 || - image_props.image_extents.height == 0 || - image_props.image_extents.depth == 0) { - image_properties_.image_extents.width = 1u; - image_properties_.image_extents.height = 1u; - image_properties_.image_extents.depth = 1u; - } - - const VkImageCreateInfo image_create_info{ - VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - image_properties_.image_type, // imageType - image_properties_.image_format, // format - image_properties_.image_extents, // extents - 1u, // mipLevels - 1u, // arrayLayers - VK_SAMPLE_COUNT_1_BIT, // samples - image_properties_.image_tiling, // tiling - image_properties_.image_usage, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - layout_, // initialLayout - }; - - if (allocate_memory) { - VK_CHECK(vmaCreateImage( - allocator_, - &image_create_info, - &allocation_create_info, - &(handles_.image), - &(memory_.allocation), - nullptr)); - // Only create the image view if the image has been bound to memory - owns_view_ = true; - create_image_view(); - } else { - VK_CHECK(vkCreateImage( - allocator_info.device, &image_create_info, nullptr, &(handles_.image))); - } -} - -VulkanImage::VulkanImage( - VkDevice device, - const ImageProperties& image_props, - VkImage image, - VkImageView image_view, - VkSampler sampler, - const VkImageLayout layout) - : device_{device}, - image_properties_{image_props}, - view_properties_{}, - sampler_properties_{}, - allocator_(VK_NULL_HANDLE), - memory_{}, - owns_memory_(false), - memory_bundled_(false), - is_copy_(false), - handles_{ - image, - image_view, - sampler, - }, - layout_{layout} {} - -VulkanImage::VulkanImage(const VulkanImage& other) noexcept - : device_(other.device_), - image_properties_(other.image_properties_), - view_properties_(other.view_properties_), - sampler_properties_(other.sampler_properties_), - allocator_(other.allocator_), - memory_(other.memory_), - owns_memory_{false}, - owns_view_{false}, - is_copy_(true), - handles_(other.handles_), - 
layout_(other.layout_) {} - -VulkanImage::VulkanImage(VulkanImage&& other) noexcept - : device_(other.device_), - image_properties_(other.image_properties_), - view_properties_(other.view_properties_), - sampler_properties_(other.sampler_properties_), - allocator_(other.allocator_), - memory_(std::move(other.memory_)), - owns_memory_(other.owns_memory_), - memory_bundled_(other.memory_bundled_), - owns_view_(other.owns_view_), - is_copy_(other.is_copy_), - handles_(other.handles_), - layout_(other.layout_) { - other.handles_.image = VK_NULL_HANDLE; - other.handles_.image_view = VK_NULL_HANDLE; - other.handles_.sampler = VK_NULL_HANDLE; - other.owns_memory_ = false; - other.memory_bundled_ = false; -} - -VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { - VkImage tmp_image = handles_.image; - VkImageView tmp_image_view = handles_.image_view; - bool tmp_owns_memory = owns_memory_; - bool tmp_memory_bundled = memory_bundled_; - - device_ = other.device_; - image_properties_ = other.image_properties_; - view_properties_ = other.view_properties_; - sampler_properties_ = other.sampler_properties_; - allocator_ = other.allocator_; - memory_ = std::move(other.memory_); - owns_memory_ = other.owns_memory_; - memory_bundled_ = other.memory_bundled_; - is_copy_ = other.is_copy_; - handles_ = other.handles_; - layout_ = other.layout_; - - other.handles_.image = tmp_image; - other.handles_.image_view = tmp_image_view; - other.owns_memory_ = tmp_owns_memory; - other.memory_bundled_ = tmp_memory_bundled; - - return *this; -} - -VulkanImage::~VulkanImage() { - if (owns_view_ && handles_.image_view != VK_NULL_HANDLE) { - vkDestroyImageView(this->device(), handles_.image_view, nullptr); - } - - // Do not destroy any resources if this class instance is a copy of another - // class instance, since this means that this class instance does not have - // ownership of the underlying resource. 
- if (is_copy_) { - return; - } - - if (handles_.image != VK_NULL_HANDLE) { - if (owns_memory_) { - if (memory_bundled_) { - vmaDestroyImage(allocator_, handles_.image, memory_.allocation); - // Prevent the underlying memory allocation from being freed; it was - // freed by vmaDestroyImage - memory_.allocation = VK_NULL_HANDLE; - } else { - vkDestroyImage(this->device(), handles_.image, nullptr); - // Allow underlying memory allocation to be freed by the destructor of - // Allocation class - } - } else { - vkDestroyImage(this->device(), handles_.image, nullptr); - // Prevent the underlying memory allocation from being freed since this - // object doesn't own it - memory_.allocation = VK_NULL_HANDLE; - } - } -} - -void VulkanImage::create_image_view() { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - - const VkComponentMapping component_mapping{ - VK_COMPONENT_SWIZZLE_IDENTITY, // r - VK_COMPONENT_SWIZZLE_IDENTITY, // g - VK_COMPONENT_SWIZZLE_IDENTITY, // b - VK_COMPONENT_SWIZZLE_IDENTITY, // a - }; - - const VkImageSubresourceRange subresource_range{ - VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask - 0u, // baseMipLevel - VK_REMAINING_MIP_LEVELS, // levelCount - 0u, // baseArrayLayer - VK_REMAINING_ARRAY_LAYERS, // layerCount - }; - - const VkImageViewCreateInfo image_view_create_info{ - VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - handles_.image, // image - view_properties_.view_type, // viewType - view_properties_.view_format, // format - component_mapping, // components - subresource_range, // subresourceRange - }; - - VK_CHECK(vkCreateImageView( - allocator_info.device, - &(image_view_create_info), - nullptr, - &(handles_.image_view))); -} - -void VulkanImage::bind_allocation_impl(const Allocation& memory) { - VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); - // To prevent multiple instances of binding the same VkImage to a memory - // block, do not actually bind memory if this VulkanImage is a copy. Assume - // that the original VulkanImage is responsible for binding the image. 
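As with `VulkanBuffer`, a `VulkanImage` can be created without backing memory and have an allocation attached afterwards; the image view is only created once memory is bound. A sketch under the same assumptions as the buffer example earlier (an `Allocator` instance named `allocator`, and an `image` created with `allocate_memory = false`):

```cpp
// The VkImage exists, but has no memory and therefore no image view yet.
const VkMemoryRequirements reqs = image.get_memory_requirements();

vkapi::Allocation memory = allocator.create_allocation(
    reqs, allocator.gpuonly_resource_create_info());

// acquire_allocation() binds the memory (vmaBindImageMemory) and then
// creates the VkImageView, making the image usable by shaders.
image.acquire_allocation(std::move(memory));
```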
- if (!is_copy_) { - VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); - } - - // Only create the image view if the image has been bound to memory - owns_view_ = true; - create_image_view(); -} - -void VulkanImage::bind_allocation(const Allocation& memory) { - bind_allocation_impl(memory); - memory_.allocation = memory.allocation; -} - -void VulkanImage::acquire_allocation(Allocation&& memory) { - bind_allocation_impl(memory); - memory_ = std::move(memory); - owns_memory_ = true; -} - -VkMemoryRequirements VulkanImage::get_memory_requirements() const { - VkMemoryRequirements memory_requirements; - vkGetImageMemoryRequirements( - this->device(), handles_.image, &memory_requirements); - return memory_requirements; -} - -// -// ImageMemoryBarrier -// - -ImageMemoryBarrier::ImageMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VkImageLayout src_layout_flags, - const VkImageLayout dst_layout_flags, - const VulkanImage& image) - : handle{ - VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, // sType - nullptr, // pNext - src_access_flags, // srcAccessMask - dst_access_flags, // dstAccessMask - src_layout_flags, // oldLayout - dst_layout_flags, // newLayout - VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex - VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex - image.handles_.image, // image - { - // subresourceRange - VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask - 0u, // baseMipLevel - VK_REMAINING_MIP_LEVELS, // levelCount - 0u, // baseArrayLayer - VK_REMAINING_ARRAY_LAYERS, // layerCount - }, - } {} - -// -// SamplerCache -// - -SamplerCache::SamplerCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -SamplerCache::SamplerCache(SamplerCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -SamplerCache::~SamplerCache() { - purge(); -} - -VkSampler SamplerCache::retrieve(const SamplerCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_.insert({key, SamplerCache::Value(device_, key)}).first; - } - - return it->second.handle(); -} - -void SamplerCache::purge() { - std::lock_guard lock(cache_mutex_); - cache_.clear(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Image.h b/backends/vulkan/runtime/vk_api/memory/Image.h deleted file mode 100644 index db632c34378..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Image.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
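The `ImageMemoryBarrier` above pairs access masks with an image layout transition, but recording the barrier and updating the tracked layout are the caller's responsibility. A hedged sketch (the command buffer, stages, and layouts here are illustrative assumptions):

```cpp
// Transition `image` from its current layout to GENERAL before a compute
// shader writes to it.
vkapi::ImageMemoryBarrier barrier(
    0u,                          // srcAccessMask (no prior access)
    VK_ACCESS_SHADER_WRITE_BIT,  // dstAccessMask
    image.layout(),              // oldLayout
    VK_IMAGE_LAYOUT_GENERAL,     // newLayout
    image);

vkCmdPipelineBarrier(
    cmd_buffer,
    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
    0u,
    0u, nullptr,   // memory barriers
    0u, nullptr,   // buffer memory barriers
    1u, &barrier.handle);

// Keep the wrapper's view of the layout in sync with the transition.
image.set_layout(VK_IMAGE_LAYOUT_GENERAL);
```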
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -#include -#include - -namespace vkcompute { - -// Forward declare vTensor classes such that they can be set as friend classes -namespace api { -class vTensorStorage; -} // namespace api - -namespace vkapi { - -class ImageSampler final { - public: - struct Properties final { - VkFilter filter; - VkSamplerMipmapMode mipmap_mode; - VkSamplerAddressMode address_mode; - VkBorderColor border_color; - }; - - explicit ImageSampler(VkDevice, const Properties&); - - ImageSampler(const ImageSampler&) = delete; - ImageSampler& operator=(const ImageSampler&) = delete; - - ImageSampler(ImageSampler&&) noexcept; - ImageSampler& operator=(ImageSampler&&) = delete; - - ~ImageSampler(); - - private: - VkDevice device_; - VkSampler handle_; - - public: - VkSampler handle() const { - return handle_; - } - - struct Hasher { - size_t operator()(const Properties&) const; - }; - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. - friend void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept; -}; - -class VulkanImage final { - public: - struct ImageProperties final { - VkImageType image_type; - VkFormat image_format; - VkExtent3D image_extents; - VkImageTiling image_tiling; - VkImageUsageFlags image_usage; - }; - - struct ViewProperties final { - VkImageViewType view_type; - VkFormat view_format; - }; - - using SamplerProperties = ImageSampler::Properties; - - struct Handles final { - VkImage image; - VkImageView image_view; - VkSampler sampler; - }; - - explicit VulkanImage(); - - explicit VulkanImage( - VkDevice, - const VmaAllocator, - const VmaAllocationCreateInfo&, - const ImageProperties&, - const ViewProperties&, - const SamplerProperties&, - VkSampler, - const VkImageLayout, - const bool allocate_memory = true); - - explicit VulkanImage( - VkDevice, - const ImageProperties&, - VkImage, - VkImageView, - VkSampler, - const VkImageLayout); - - protected: - /* - * The Copy constructor allows for creation of a class instance that are - * "aliases" of another class instance. The resulting class instance will not - * have ownership of the underlying VkImage. - * - * This behaviour is analogous to creating a copy of a pointer, thus it is - * unsafe, as the original class instance may be destroyed before the copy. - * These constructors are therefore marked protected so that they may be used - * only in situations where the lifetime of the original class instance is - * guaranteed to exceed, or at least be the same as, the lifetime of the - * copied class instance. - */ - VulkanImage(const VulkanImage& other) noexcept; - - public: - // To discourage creating copies, the assignment operator is still deleted. 
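`ImageSampler::Properties` together with its `Hasher` is what lets samplers be deduplicated by the `SamplerCache` implemented in `Image.cpp` above. A small usage sketch, assuming a valid `VkDevice` handle; the filter and address-mode choices below are arbitrary examples, not defaults from the deleted code:

```cpp
vkapi::SamplerCache sampler_cache(device);

const vkapi::ImageSampler::Properties props{
    VK_FILTER_NEAREST,                        // filter
    VK_SAMPLER_MIPMAP_MODE_NEAREST,           // mipmap_mode
    VK_SAMPLER_ADDRESS_MODE_REPEAT,           // address_mode
    VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,  // border_color
};

// Creates the VkSampler on first use; later calls with equal Properties
// return the cached handle.
VkSampler sampler = sampler_cache.retrieve(props);
```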
- VulkanImage& operator=(const VulkanImage&) = delete; - - VulkanImage(VulkanImage&&) noexcept; - VulkanImage& operator=(VulkanImage&&) noexcept; - - ~VulkanImage(); - - struct Package final { - VkImage handle; - VkImageLayout image_layout; - VkImageView image_view; - VkSampler image_sampler; - }; - - friend struct ImageMemoryBarrier; - - private: - VkDevice device_; - ImageProperties image_properties_; - ViewProperties view_properties_; - SamplerProperties sampler_properties_; - // The allocator object this was allocated from - VmaAllocator allocator_; - // Handles to the allocated memory - Allocation memory_; - // Indicates whether the underlying memory is owned by this resource - bool owns_memory_; - // Indicates whether the allocation for the image was created with the image - // via vmaCreateImage; if this is false, the memory is owned but was bound - // separately via vmaBindImageMemory - bool memory_bundled_; - // In some cases, a VulkanImage may be a copy of another VulkanImage but still - // own a unique view of the VkImage. - bool owns_view_; - // Indicates whether this VulkanImage was copied from another VulkanImage, - // thus it does not have ownership of the underlying VKBuffer - bool is_copy_; - Handles handles_; - // Layout - VkImageLayout layout_; - - public: - void create_image_view(); - - inline VkDevice device() const { - return device_; - } - - inline VmaAllocator vma_allocator() const { - return allocator_; - } - - inline VmaAllocation allocation() const { - return memory_.allocation; - } - - inline VkImageType type() const { - return image_properties_.image_type; - } - - inline VkFormat format() const { - return image_properties_.image_format; - } - - inline VkExtent3D extents() const { - return image_properties_.image_extents; - } - - inline VkImage handle() const { - return handles_.image; - } - - inline VkImageView image_view() const { - return handles_.image_view; - } - - inline VkSampler sampler() const { - return handles_.sampler; - } - - Package package() const { - return { - handles_.image, - layout_, - handles_.image_view, - handles_.sampler, - }; - } - - inline VkImageLayout layout() const { - return layout_; - } - - inline void set_layout(const VkImageLayout layout) { - layout_ = layout; - } - - inline bool has_memory() const { - return (memory_.allocation != VK_NULL_HANDLE); - } - - inline bool owns_memory() const { - return owns_memory_; - } - - inline bool is_copy() const { - return is_copy_; - } - - inline operator bool() const { - return (handles_.image != VK_NULL_HANDLE); - } - - inline bool is_copy_of(const VulkanImage& other) const { - return (handles_.image == other.handles_.image) && is_copy_; - } - - private: - void bind_allocation_impl(const Allocation& memory); - - public: - /* - * Given a memory allocation, bind it to the underlying VkImage. The lifetime - * of the memory allocation is assumed to be managed externally. - */ - void bind_allocation(const Allocation& memory); - - /* - * Given a rvalue memory allocation, bind it to the underlying VkImage and - * also acquire ownership of the memory allocation. 
- */ - void acquire_allocation(Allocation&& memory); - - VkMemoryRequirements get_memory_requirements() const; - - friend class api::vTensorStorage; -}; - -struct ImageMemoryBarrier final { - VkImageMemoryBarrier handle; - - ImageMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VkImageLayout src_layout_flags, - const VkImageLayout dst_layout_flags, - const VulkanImage& image); -}; - -class SamplerCache final { - public: - explicit SamplerCache(VkDevice device); - - SamplerCache(const SamplerCache&) = delete; - SamplerCache& operator=(const SamplerCache&) = delete; - - SamplerCache(SamplerCache&&) noexcept; - SamplerCache& operator=(SamplerCache&&) = delete; - - ~SamplerCache(); - - using Key = ImageSampler::Properties; - using Value = ImageSampler; - using Hasher = ImageSampler::Hasher; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkSampler retrieve(const Key&); - void purge(); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/vma_api.cpp b/backends/vulkan/runtime/vk_api/memory/vma_api.cpp deleted file mode 100644 index c5a1b588f19..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/vma_api.cpp +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#define VMA_IMPLEMENTATION -#include diff --git a/backends/vulkan/runtime/vk_api/memory/vma_api.h b/backends/vulkan/runtime/vk_api/memory/vma_api.h deleted file mode 100644 index 16205a3b619..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/vma_api.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// -// Do NOT include vk_mem_alloc.h directly. -// Always include this file (vma_api.h) instead. -// - -#define VMA_VULKAN_VERSION 1000000 - -#ifdef USE_VULKAN_WRAPPER -#define VMA_STATIC_VULKAN_FUNCTIONS 0 -#else -#define VMA_DYNAMIC_VULKAN_FUNCTIONS 0 -#endif /* USE_VULKAN_WRAPPER */ - -#define VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE (4ull * 1024 * 1024) -#define VMA_SMALL_HEAP_MAX_SIZE (256ull * 1024 * 1024) - -#define VMA_STATS_STRING_ENABLED 0 - -#ifdef VULKAN_DEBUG -#define VMA_DEBUG_ALIGNMENT 4096 -#define VMA_DEBUG_ALWAYS_DEDICATED_MEMORY 0 -#define VMA_DEBUG_DETECT_CORRUPTION 1 -#define VMA_DEBUG_GLOBAL_MUTEX 1 -#define VMA_DEBUG_INITIALIZE_ALLOCATIONS 1 -#define VMA_DEBUG_MARGIN 64 -#define VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY 256 -#define VMA_RECORDING_ENABLED 1 - -#define VMA_DEBUG_LOG(format, ...) -/* -#define VMA_DEBUG_LOG(format, ...) 
do { \ - printf(format, __VA_ARGS__); \ - printf("\n"); \ -} while(false) -*/ -#endif /* VULKAN_DEBUG */ - -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wnullability-completeness" -#pragma clang diagnostic ignored "-Wunused-variable" -#endif /* __clang__ */ - -#include - -#ifdef __clang__ -#pragma clang diagnostic pop -#endif /* __clang__ */ diff --git a/backends/vulkan/runtime/vk_api/vk_api.h b/backends/vulkan/runtime/vk_api/vk_api.h deleted file mode 100644 index e3fbf057f8b..00000000000 --- a/backends/vulkan/runtime/vk_api/vk_api.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#ifdef USE_VULKAN_WRAPPER -#ifdef USE_VULKAN_VOLK -#ifdef VK_ANDROID_external_memory_android_hardware_buffer -#include -#include -#include -#endif /* VK_ANDROID_external_memory_android_hardware_buffer */ - -#include -#else -#include -#endif /* USE_VULKAN_VOLK */ -#else -#include -#endif /* USE_VULKAN_WRAPPER */ diff --git a/backends/vulkan/serialization b/backends/vulkan/serialization new file mode 120000 index 00000000000..9d6671ad5f4 --- /dev/null +++ b/backends/vulkan/serialization @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/serialization \ No newline at end of file diff --git a/backends/vulkan/serialization/TARGETS b/backends/vulkan/serialization/TARGETS deleted file mode 100644 index 41893d29274..00000000000 --- a/backends/vulkan/serialization/TARGETS +++ /dev/null @@ -1,4 +0,0 @@ -load(":targets.bzl", "define_common_targets") -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs deleted file mode 100644 index b6670b6f53d..00000000000 --- a/backends/vulkan/serialization/schema.fbs +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. - -namespace vkgraph; - -// Update after any BC breaking changes. -file_identifier "VK00"; - -table OperatorCall { - node_id:uint; - name:string; - args:[int]; -} - -enum VkDataType : byte { - BOOL = 0, - UINT8 = 1, - INT8 = 2, - INT32 = 3, - FLOAT16 = 4, - FLOAT32 = 5, - FLOAT64 = 6, - INT64 = 7, -} - -// Describes what kind of GPU resource should be used to represent a tensor. The -// int values assigned to each entry must match the corresponding entry in -// api::StorageType. -enum VkStorageType : ubyte { - BUFFER = 0, - TEXTURE_3D = 1, - TEXTURE_2D = 2, - DEFAULT_STORAGE = 255, -} - -// Describes how memory should be laid out in GPU memory. See the GPUMemoryLayout -// enum class in PyTorch Vulkan for more details. The int values assigned to each -// entry must match the corresponding entry in utils::GPUMemoryLayout. -enum VkMemoryLayout : ubyte { - TENSOR_WIDTH_PACKED = 0, - TENSOR_HEIGHT_PACKED = 1, - TENSOR_CHANNELS_PACKED = 2, - DEFAULT_LAYOUT = 255, -} - -table VkTensor { - // Type of the tensor elements. - datatype:VkDataType; - // Shape dimensions. - dims:[uint]; - // Index to the program's constant data. Negative indicates tensor is non-constant. - constant_id:int; - // Index to the shared memory object. Negative indicates the tensor doesn't share memory. 
- mem_obj_id:int; - // Storage type that should be used to represent this tensor - storage_type:VkStorageType = DEFAULT_STORAGE; - // Memory layout that should be used to represent this tensor - memory_layout:VkMemoryLayout = DEFAULT_LAYOUT; -} - -table Null {} - -table Int { - int_val:long; -} - -table Bool { - bool_val:bool; -} - -table Double { - double_val:double; -} - -table String { - string_val:string; -} - -table IntList { - items:[long]; -} - -table DoubleList { - items:[double]; -} - -table BoolList { - items:[bool]; -} - -table ValueList { - items:[int]; -} - -table SymInt { - value:int; -} - -union GraphTypes { - Null, - Int, - Double, - Bool, - VkTensor, - IntList, - DoubleList, - BoolList, - ValueList, - String, - SymInt, -} - -table VkValue { - value:GraphTypes; -} - -// Abstraction to represent a region of bytes in a raw data buffer. Useful for referencing raw data -// serialized outside of the flatbuffer. -table VkBytes { - offset:ulong; - length:ulong; - named_key:string; -} - -table VkGraph { - // Schema version. - version:string; - - // Objects - chain:[OperatorCall]; - values:[VkValue]; - - // Indices - input_ids:[uint]; - output_ids:[uint]; - - // Raw Objects (e.g. weight tensors and custom shaders) - constants:[VkBytes]; - shaders:[VkBytes]; - - // Graph configuration - // As per flatbuffer BC/FC policy, new fields can be freely added to this - // section. It is recommended to provide default values, since older blobs - // without the field will be deserialized with the default value. - - // Sets an override for the storage type and memory layout that will be used - // to represent a VkTensor if the VkTensor is not serialized with a particular - // storage type or memory layout setting - storage_type_override:VkStorageType = DEFAULT_STORAGE; - memory_layout_override:VkMemoryLayout = DEFAULT_LAYOUT; -} - -root_type VkGraph; diff --git a/backends/vulkan/serialization/targets.bzl b/backends/vulkan/serialization/targets.bzl deleted file mode 100644 index 15ec61e70b0..00000000000 --- a/backends/vulkan/serialization/targets.bzl +++ /dev/null @@ -1,60 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(is_fbcode = False): - runtime.genrule( - name = "gen_vk_delegate_schema", - srcs = ["schema.fbs"], - # We're only generating a single file, so it seems like we could use - # `out`, but `flatc` takes a directory as a parameter, not a single - # file. Use `outs` so that `${OUT}` is expanded as the containing - # directory instead of the file itself. 
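On the C++ side, the schema above is consumed through the header generated by `flatc` (see the genrule below). As a rough sketch of what reading a serialized graph could look like with the standard FlatBuffers C++ API (the accessor names follow the usual `flatc --cpp --scoped-enums` output and the include path is illustrative; the runtime's actual usage may differ):

```cpp
#include <cstdio>

#include "schema_generated.h" // path is an assumption for this sketch

void dump_graph(const void* flatbuffer_data) {
  // Root accessor and identifier check are generated for root_type VkGraph.
  if (!vkgraph::VkGraphBufferHasIdentifier(flatbuffer_data)) {
    return; // not a "VK00" buffer
  }
  const vkgraph::VkGraph* graph = vkgraph::GetVkGraph(flatbuffer_data);

  // Each OperatorCall names an operator and the value ids it consumes.
  for (const vkgraph::OperatorCall* call : *graph->chain()) {
    std::printf("%s takes %u args\n", call->name()->c_str(), call->args()->size());
  }
}
```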
- outs = { - "schema_generated.h": ["schema_generated.h"], - }, - cmd = " ".join([ - "$(exe {})".format(runtime.external_dep_location("flatc")), - "--cpp", - "--cpp-std c++11", - "--scoped-enums", - "-o ${OUT}", - "${SRCS}", - ]), - default_outs = ["."], - ) - - runtime.cxx_library( - name = "vk_delegate_schema", - srcs = [], - visibility = [ - "//executorch/backends/vulkan/...", - ], - exported_headers = { - "schema_generated.h": ":gen_vk_delegate_schema[schema_generated.h]", - }, - exported_external_deps = [ - "flatbuffers-api", - ], - ) - - if is_fbcode: - runtime.python_library( - name = "lib", - srcs = [ - "vulkan_graph_builder.py", - "vulkan_graph_schema.py", - "vulkan_graph_serialize.py", - ], - resources = [ - "schema.fbs", - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/exir:graph_module", - "//executorch/exir/_serialize:_bindings", - "//executorch/exir/_serialize:lib", - ], - ) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py deleted file mode 100644 index 78ac51c8808..00000000000 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import ctypes -import hashlib -import logging -import operator -from types import NoneType -from typing import cast, List, Optional, Union - -import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) -from executorch.backends.vulkan.utils import ( - is_constant, - is_get_attr_node, - is_mutable_buffer_node, - is_param_node, - is_symint_node, - TensorRepr, -) -from executorch.exir._serialize._named_data_store import NamedDataStore -from executorch.exir.backend.utils import DelegateMappingBuilder - -from executorch.exir.tensor import TensorSpec -from torch._export.utils import get_buffer, get_param, is_buffer, is_param -from torch.export import ExportedProgram -from torch.fx import Node - -_ScalarType = Union[bool, int, float] -_Argument = Union[ - Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str -] - -logger: logging.Logger = logging.getLogger("") -logger.setLevel(logging.INFO) - - -class VkGraphBuilder: - def __init__( - self, - program: ExportedProgram, - delegate_mapping_builder: DelegateMappingBuilder, - downcast_64_bit: bool = True, - ) -> None: - self.program = program - self.delegate_mapping_builder = delegate_mapping_builder - self.downcast_64_bit = downcast_64_bit - self.chain = [] - self.values = [] - self.input_ids = [] - self.output_ids = [] - self.const_tensors = [] - self.named_data_store = NamedDataStore() - - # Mapping from Node to VkValue id - self.node_to_value_ids = {} - # Mapping from const scalar value to created VkValue id - self.const_scalar_to_value_ids = {} - - # For logging - self.seen_ops = set() - - @staticmethod - def get_vk_datatype(torch_dtype: torch.dtype) -> vk_graph_schema.VkDataType: - if torch_dtype == torch.bool: - return vk_graph_schema.VkDataType.BOOL - elif torch_dtype == torch.uint8: - return vk_graph_schema.VkDataType.UINT8 - elif torch_dtype == torch.int8: - return vk_graph_schema.VkDataType.INT8 - elif torch_dtype == 
torch.int32: - return vk_graph_schema.VkDataType.INT32 - elif torch_dtype == torch.int64: - return vk_graph_schema.VkDataType.INT64 - elif torch_dtype == torch.float16: - return vk_graph_schema.VkDataType.FLOAT16 - elif torch_dtype == torch.float32: - return vk_graph_schema.VkDataType.FLOAT32 - elif torch_dtype == torch.float64: - return vk_graph_schema.VkDataType.FLOAT64 - else: - raise AssertionError(f"Invalid dtype for vulkan_preprocess ({torch_dtype})") - - def get_constant(self, node: Node) -> Optional[torch.Tensor]: - """ - Returns the constant associated with the given node in the exported program. - Returns None if the node is not a constant within the exported program - """ - if is_constant(self.program, node): - constant_name = ( - self.program.graph_signature.inputs_to_lifted_tensor_constants[ - node.name - ] - ) - if constant_name in self.program.constants: - return self.program.constants[constant_name] - else: - return None - - return None - - def get_param_tensor(self, node: Node) -> torch.Tensor: - tensor = None - if node is None: - raise RuntimeError("node is None") - elif is_param(self.program, node): - tensor = get_param(self.program, node) - elif is_buffer(self.program, node): - tensor = get_buffer(self.program, node) - elif is_constant(self.program, node): - tensor = self.get_constant(node) - elif is_get_attr_node(node): - # This is a hack to support both lifted and unlifted graph - try: - tensor = getattr(node.graph.owning_module, node.target) - except AttributeError: - tensor = getattr(self.program.graph_module, node.target) - else: - raise RuntimeError(f"unsupported param type, {node.op}.") - - assert tensor is not None - return tensor - - def maybe_add_constant_tensor(self, node: Node) -> int: - constant_id = -1 - if is_param_node(self.program, node): - tensor = self.get_param_tensor(node) - - # Serialize tensor data to bytes - tensor = tensor.contiguous() - size = tensor.untyped_storage().nbytes() - - if size > 0: - array_type = ctypes.c_char * size - array = ctypes.cast( - tensor.untyped_storage().data_ptr(), - ctypes.POINTER(array_type), - ).contents - - # Generate SHA256 hash as the named key - tensor_bytes = bytes(array) - sha256_hash = hashlib.sha256(tensor_bytes) - named_key = sha256_hash.hexdigest() - - # Add to named data store with 16-byte alignment (matching XNNPACK) - self.named_data_store.add_named_data( - named_key, tensor_bytes, alignment=16 - ) - - # Create VkBytes entry with named_key and set offset to indicate named data usage - constant_id = len(self.const_tensors) - self.const_tensors.append((named_key, size)) - else: - # Handle empty tensors - constant_id = len(self.const_tensors) - self.const_tensors.append(None) - - return constant_id - - def create_node_value(self, node: Node) -> int: - # If the node has been marked as a scalar tensor, create a SymInt instead of a tensor - if is_symint_node(node) or node.meta.get("etvk_is_scalar_tensor", False): - new_id = self.create_symint_value() - self.node_to_value_ids[node] = new_id - return new_id - - spec = node.meta.get("spec") - if isinstance(spec, TensorSpec): - constant_id = self.maybe_add_constant_tensor(node) - new_id = self.create_tensor_value(spec, constant_id) - self.node_to_value_ids[node] = new_id - return new_id - elif isinstance(spec, list) or isinstance(spec, tuple): - # pyre-ignore[6]: pyre having hard time to infer Node type inside - # the container. 
- new_id = self.create_value_list_value(spec) - self.node_to_value_ids[node] = new_id - return new_id - else: - raise RuntimeError( - f"Cannot create value for node {node} with spec of type {type(spec)}" - ) - - def create_null_value(self) -> int: - new_id = len(self.values) - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Null())) - return new_id - - def get_or_create_scalar_value(self, scalar: _ScalarType) -> int: - scalar_key = scalar - # Since Python considers 1 and True to be "equivalent" (as well as 0 and False) - # to distinguish entries in the dictionary, if scalar is bool then convert it - # to a string representation to use as a key for the dictionary - if isinstance(scalar, bool): - scalar_key = str(scalar) - - if scalar_key in self.const_scalar_to_value_ids: - return self.const_scalar_to_value_ids[scalar_key] - - new_id = len(self.values) - if isinstance(scalar, bool): - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Bool(scalar))) - elif isinstance(scalar, int): - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Int(scalar))) - elif isinstance(scalar, float): - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Double(scalar))) - - self.const_scalar_to_value_ids[scalar_key] = new_id - return new_id - - def create_symint_value(self) -> int: - new_id = len(self.values) - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.SymInt(0))) - return new_id - - def create_tensor_value(self, spec: TensorSpec, constant_id: int = -1) -> int: - # Negative id indicates that this tensor will have its own dedicated memory. - mem_obj_id = -1 - if spec.mem_obj_id is not None: - mem_obj_id = spec.mem_obj_id - - storage_type = VkStorageType.DEFAULT_STORAGE - memory_layout = VkMemoryLayout.DEFAULT_LAYOUT - if hasattr(spec, "etvk_node_repr"): - # pyre-ignore[16] - assert isinstance(spec.etvk_node_repr, TensorRepr) - storage_type = spec.etvk_node_repr.storage_type - memory_layout = spec.etvk_node_repr.memory_layout - - # Apply downcast logic before getting VK datatype - effective_dtype = spec.dtype - if self.downcast_64_bit and spec.dtype == torch.float64: - effective_dtype = torch.float32 - elif self.downcast_64_bit and spec.dtype == torch.int64: - effective_dtype = torch.int32 - - datatype = self.get_vk_datatype(effective_dtype) - - new_id = len(self.values) - self.values.append( - vk_graph_schema.VkValue( - value=vk_graph_schema.VkTensor( - datatype=datatype, - dims=spec.shape, - constant_id=constant_id, - mem_obj_id=mem_obj_id, - storage_type=storage_type, - memory_layout=memory_layout, - ) - ) - ) - return new_id - - def create_scalar_list_value(self, arg: List[_ScalarType]) -> int: - new_id = len(self.values) - - if len(arg) == 0: - self.values.append( - vk_graph_schema.VkValue(vk_graph_schema.IntList(items=[])) - ) - - all_bool = True - all_int = True - all_float = True - all_int_or_symint = True - - for val in arg: - if not isinstance(val, bool): - all_bool = False - if not isinstance(val, int): - all_int = False - if not (isinstance(val, Node) and is_symint_node(val)): - all_int_or_symint = False - if not isinstance(val, float): - all_float = False - - if all_bool: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.BoolList(items=[cast(bool, e) for e in arg]) - ) - ) - if all_int: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.IntList(items=[cast(int, e) for e in arg]) - ) - ) - elif all_float: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.DoubleList(items=[cast(float, e) for e in arg]) 
- ) - ) - elif all_int_or_symint: - return self.create_value_list_value(arg) - else: - raise NotImplementedError(f"Cannot add value for list {arg}") - - return new_id - - def create_value_list_value(self, arg: tuple | list) -> int: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.ValueList( - items=[self.get_or_create_value_for(e) for e in arg] - ) - ) - ) - return len(self.values) - 1 - - def create_string_value(self, string: str) -> int: - new_id = len(self.values) - self.values.append( - vk_graph_schema.VkValue(vk_graph_schema.String(string_val=string)) - ) - return new_id - - def get_or_create_value_for(self, arg: _Argument): - if isinstance(arg, Node): - # If the Node has already been processed, return the existing id. - if arg in self.node_to_value_ids: - return self.node_to_value_ids[arg] - return self.create_node_value(arg) - elif ( - isinstance(arg, NoneType) - or isinstance(arg, torch.device) - or isinstance(arg, torch.dtype) - or isinstance(arg, torch.layout) - or isinstance(arg, torch.memory_format) - ): - return self.create_null_value() - elif isinstance(arg, _ScalarType): - return self.get_or_create_scalar_value(arg) - elif isinstance(arg, TensorSpec): - return self.create_tensor_value(arg) - elif isinstance(arg, list) and ( - len(arg) == 0 or any(isinstance(val, _ScalarType) for val in arg) - ): - # pyre-ignore[6] - return self.create_scalar_list_value(arg) - elif isinstance(arg, list) and isinstance(arg[0], Node): - return self.create_value_list_value(arg) - elif isinstance(arg, torch.fx.immutable_collections.immutable_list): - return self.create_value_list_value(arg) - elif isinstance(arg, str): - return self.create_string_value(arg) - else: - raise RuntimeError(f"Cannot create value for arg of type {type(arg)}") - - def process_placeholder_node(self, node: Node) -> None: - # ignores any tensors that don't get used in any ops - if len(node.users) == 0: - return None - ids = self.create_node_value(node) - if not is_param_node(self.program, node): - if isinstance(ids, int): - self.input_ids.append(ids) - else: - self.input_ids += ids - - def process_getitem_node(self, node: Node) -> None: - # Find ValueList id from the collection node. - collection_node = node.all_input_nodes[0] - list_id = self.node_to_value_ids[collection_node] - - # Extract the target Value id from ValueList. - valuelist_id = node.args[1] - value_id = self.values[list_id].value.items[valuelist_id] - - # Map Node to Value id. - self.node_to_value_ids[node] = value_id - - def process_call_function_node(self, node) -> None: - operator_call_args = [] - - self.seen_ops.add(node.target) - - if hasattr(node.target, "_schema"): - for i, schema_arg in enumerate(node.target._schema.arguments): - if not schema_arg.kwarg_only and i < len(node.args): - function_arg = node.args[i] - elif schema_arg.name in node.kwargs: - function_arg = node.kwargs[schema_arg.name] - else: - function_arg = schema_arg.default_value - - # Create a Value for each function argument. If the argument has been - # previously encountered, then use the existing Value id. 
- operator_call_args.append(self.get_or_create_value_for(function_arg)) - else: - for _, arg_node in enumerate(node.args): - operator_call_args.append(self.get_or_create_value_for(arg_node)) - - # Add output node - operator_call_args.append(self.create_node_value(node)) - operator_node_id = ( - 0 - if not self.delegate_mapping_builder - else self.delegate_mapping_builder.insert_delegate_mapping_entry(node) - ) - self.chain.append( - vk_graph_schema.OperatorCall( - node_id=operator_node_id, # pyre-ignore[6]: this is going to be an int - name=node.target.__name__, - args=operator_call_args, - ), - ) - - def process_getattr_node(self, node: Node) -> None: - self.create_node_value(node) - - def process_output_node(self, node: Node) -> None: - for out_node in node.all_input_nodes: - if out_node not in self.node_to_value_ids: - raise AssertionError( - "Cannot find input to output node in node_to_value_ids. This means " - "the output node is being serialized before its corresponding " - "internal node which is not allowed." - ) - # Mutable buffers outputs are not included as an output to the - # delegate call. Skip marking them as an output. - if is_mutable_buffer_node(out_node, self.program): - continue - - self.output_ids.append(self.node_to_value_ids[out_node]) - - def process_node(self, node: Node, call_node_debug_hdl: int) -> None: - if node.op == "placeholder": - self.process_placeholder_node(node) - elif node.op == "call_function": - if node.target == operator.getitem: - self.process_getitem_node(node) - else: - node.meta["debug_handle"] = call_node_debug_hdl - self.process_call_function_node(node) - elif node.op == "get_attr": - self.process_getattr_node(node) - elif node.op == "output": - self.process_output_node(node) - else: - raise AssertionError(f"Unsupported node op: {node.op}") - - def build_graph(self) -> vk_graph_schema.VkGraph: - call_node_debug_hdl = 0 - for node in self.program.graph_module.graph.nodes: - self.process_node(node, call_node_debug_hdl) - call_node_debug_hdl += 1 - - logger.info("Operators included in this Vulkan partition: ") - for op in self.seen_ops: - logger.info(f" {op.__name__}") - - return vk_graph_schema.VkGraph( - version="0", - chain=self.chain, - values=self.values, - input_ids=self.input_ids, - output_ids=self.output_ids, - constants=[], - shaders=[], - ) diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py deleted file mode 100644 index aa7641bd927..00000000000 --- a/backends/vulkan/serialization/vulkan_graph_schema.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -""" -Please refer to fbcode/caffe2/executorch/backends/vulkan/serialization/schema/schema.fbs for the schema definitions -""" - -from dataclasses import dataclass -from enum import IntEnum -from typing import List, Union - - -@dataclass -class OperatorCall: - node_id: int - name: str - args: List[int] - - -class VkDataType(IntEnum): - BOOL = 0 - UINT8 = 1 - INT8 = 2 - INT32 = 3 - FLOAT16 = 4 - FLOAT32 = 5 - FLOAT64 = 6 - INT64 = 7 - - -class VkStorageType(IntEnum): - BUFFER = 0 - TEXTURE_3D = 1 - TEXTURE_2D = 2 - DEFAULT_STORAGE = 255 - - def __str__(self) -> str: - return self.name - - -class VkMemoryLayout(IntEnum): - TENSOR_WIDTH_PACKED = 0 - TENSOR_HEIGHT_PACKED = 1 - TENSOR_CHANNELS_PACKED = 2 - DEFAULT_LAYOUT = 255 - - def __str__(self) -> str: - return self.name - - -@dataclass -class VkTensor: - datatype: VkDataType - dims: List[int] - constant_id: int - mem_obj_id: int - storage_type: VkStorageType = VkStorageType.DEFAULT_STORAGE - memory_layout: VkMemoryLayout = VkMemoryLayout.DEFAULT_LAYOUT - - -@dataclass -class Null: - pass - - -@dataclass -class Int: - int_val: int - - -@dataclass -class Bool: - bool_val: bool - - -@dataclass -class Double: - double_val: float - - -@dataclass -class IntList: - items: List[int] - - -@dataclass -class DoubleList: - items: List[float] - - -@dataclass -class BoolList: - items: List[bool] - - -@dataclass -class ValueList: - items: List[int] - - -@dataclass -class String: - string_val: str - - -@dataclass -class SymInt: - value: int - - -GraphTypes = Union[ - Null, - Int, - Double, - Bool, - VkTensor, - IntList, - BoolList, - DoubleList, - ValueList, - String, - SymInt, -] - - -@dataclass -class VkValue: - value: "GraphTypes" - - -@dataclass -class VkBytes: - offset: int - length: int - named_key: str = "" - - -@dataclass -class VkGraph: - version: str - - chain: List[OperatorCall] - values: List[VkValue] - - input_ids: List[int] - output_ids: List[int] - - constants: List[VkBytes] - shaders: List[VkBytes] - - storage_type_override: VkStorageType = VkStorageType.DEFAULT_STORAGE - memory_layout_override: VkMemoryLayout = VkMemoryLayout.DEFAULT_LAYOUT diff --git a/backends/vulkan/serialization/vulkan_graph_serialize.py b/backends/vulkan/serialization/vulkan_graph_serialize.py deleted file mode 100644 index 96f944560a8..00000000000 --- a/backends/vulkan/serialization/vulkan_graph_serialize.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. -# -# pyre-strict -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
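The dataclasses above mirror the types declared in schema.fbs, and the serialization module that follows converts a populated VkGraph into a flatbuffer payload. As a rough, illustrative sketch only (it assumes the executorch Python package is importable and that flatc is on the PATH, since the converter shells out to the flatbuffer compiler), a trivial graph could be built and serialized like this:

```
# Minimal sketch, not part of the deleted sources: build a trivial VkGraph from the
# schema dataclasses above and serialize it with the helpers defined below.
from executorch.backends.vulkan.serialization.vulkan_graph_schema import (
    Int,
    VkGraph,
    VkValue,
)
from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
    serialize_vulkan_graph,
)

graph = VkGraph(
    version="0",
    chain=[],                               # no operator calls
    values=[VkValue(value=Int(int_val=3))], # a single scalar value
    input_ids=[],
    output_ids=[],
    constants=[],
    shaders=[],
)

# Result is header + flatbuffer payload + (empty) constant data, each padded to 16 bytes.
blob = serialize_vulkan_graph(graph, const_tensors=[], custom_shaders=[])
print(len(blob))
```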
- -import ctypes -import importlib.resources as _resources -import json -import os -import tempfile -from dataclasses import dataclass -from typing import ClassVar, List - -import executorch.backends.vulkan.serialization as serialization_package - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkBytes, - VkGraph, -) -from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass -from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile - - -def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: - vk_graph_json = json.dumps(vk_graph, cls=_DataclassEncoder) - - with tempfile.TemporaryDirectory() as d: - schema_path = os.path.join(d, "schema.fbs") - with open(schema_path, "wb") as schema_file: - schema_file.write( - _resources.read_binary(serialization_package, "schema.fbs") - ) - json_path = os.path.join(d, "schema.json") - with open(json_path, "wb") as json_file: - json_file.write(vk_graph_json.encode("ascii")) - _flatc_compile(d, schema_path, json_path) - output_path = os.path.join(d, "schema.bin") - with open(output_path, "rb") as output_file: - return output_file.read() - - -def flatbuffer_to_vk_graph(flatbuffers: bytes) -> VkGraph: - # Following similar (de)serialization logic on other backends: - # https://github.com/pytorch/executorch/blob/main/backends/qualcomm/serialization/qc_schema_serialize.py#L33 - with tempfile.TemporaryDirectory() as d: - schema_path = os.path.join(d, "schema.fbs") - with open(schema_path, "wb") as schema_file: - schema_file.write( - _resources.read_binary(serialization_package, "schema.fbs") - ) - - bin_path = os.path.join(d, "schema.bin") - with open(bin_path, "wb") as bin_file: - bin_file.write(flatbuffers) - - _flatc_decompile(d, schema_path, bin_path, ["--raw-binary"]) - - json_path = os.path.join(d, "schema.json") - with open(json_path, "rb") as output_file: - return _json_to_dataclass(json.load(output_file), VkGraph) - - -def extract_vk_flatbuffer(data: bytes) -> bytes: - h: VulkanDelegateHeader = VulkanDelegateHeader.from_bytes( - data[: VulkanDelegateHeader.EXPECTED_LENGTH] - ) - start = h.flatbuffer_offset - end = h.flatbuffer_offset + h.flatbuffer_size - return data[start:end] - - -@dataclass -class VulkanDelegateHeader: - # Defines the byte region that each component of the header corresponds to - MAGIC_IX: ClassVar[slice] = slice(4, 8) - HEADER_SIZE_IX: ClassVar[slice] = slice(8, 10) - FLATBUFFER_OFFSET_IX: ClassVar[slice] = slice(10, 14) - FLATBUFFER_SIZE_IX: ClassVar[slice] = slice(14, 18) - BYTES_OFFSET_IX: ClassVar[slice] = slice(18, 22) - BYTES_SIZE_IX: ClassVar[slice] = slice(22, 30) - - # magic bytes that should be at the beginning of the header - EXPECTED_MAGIC: ClassVar[bytes] = b"VH00" - # The length of the header in bytes - EXPECTED_LENGTH: ClassVar[int] = 30 - - # Instance attributes, @dataclass will turn these into constructor args - flatbuffer_offset: int - flatbuffer_size: int - bytes_offset: int - bytes_size: int - - @staticmethod - def from_bytes(data: bytes) -> "VulkanDelegateHeader": - if len(data) > VulkanDelegateHeader.EXPECTED_LENGTH: - raise ValueError( - f"Expected header to be {VulkanDelegateHeader.EXPECTED_LENGTH} bytes, " - f"but got {len(data)} bytes." - ) - - magic_b: bytes = data[VulkanDelegateHeader.MAGIC_IX] - - if magic_b != VulkanDelegateHeader.EXPECTED_MAGIC: - raise ValueError( - f"Expected magic bytes to be {VulkanDelegateHeader.EXPECTED_MAGIC}, " - f"but got {magic_b}." 
- ) - - length: int = int.from_bytes( - data[VulkanDelegateHeader.HEADER_SIZE_IX], byteorder="little" - ) - - if length != VulkanDelegateHeader.EXPECTED_LENGTH: - raise ValueError( - f"Expected header to be {VulkanDelegateHeader.EXPECTED_LENGTH} bytes, " - f"but got {length} bytes." - ) - - flatbuffer_offset_b: bytes = data[VulkanDelegateHeader.FLATBUFFER_OFFSET_IX] - flatbuffer_size_b: bytes = data[VulkanDelegateHeader.FLATBUFFER_SIZE_IX] - bytes_offset_b: bytes = data[VulkanDelegateHeader.BYTES_OFFSET_IX] - bytes_size_b: bytes = data[VulkanDelegateHeader.BYTES_SIZE_IX] - - return VulkanDelegateHeader( - flatbuffer_offset=int.from_bytes(flatbuffer_offset_b, byteorder="little"), - flatbuffer_size=int.from_bytes(flatbuffer_size_b, byteorder="little"), - bytes_offset=int.from_bytes(bytes_offset_b, byteorder="little"), - bytes_size=int.from_bytes(bytes_size_b, byteorder="little"), - ) - - def is_valid(self) -> bool: - if self.flatbuffer_size <= 0: - return False - - expected_offset = self.flatbuffer_offset + self.flatbuffer_size - if self.bytes_offset < expected_offset: - return False - - if self.bytes_size < 0: - return False - - return True - - def to_bytes(self) -> bytes: - if not self.is_valid(): - raise ValueError("VulkanDelegateHeader instance contains invalid values") - - data: bytes = ( - # 4 bytes of padding for magic bytes, this is so that the header magic - # bytes is in the same position as the flatbuffer header magic bytes - b"\x00\x00\x00\x00" - + self.EXPECTED_MAGIC - + self.EXPECTED_LENGTH.to_bytes(2, byteorder="little") - + self.flatbuffer_offset.to_bytes(4, byteorder="little") - + self.flatbuffer_size.to_bytes(4, byteorder="little") - + self.bytes_offset.to_bytes(4, byteorder="little") - + self.bytes_size.to_bytes(8, byteorder="little") - ) - - assert len(data) == VulkanDelegateHeader.EXPECTED_LENGTH - - return data - - -def padding_required(data_len: int, alignment: int = 16) -> int: - remainder: int = data_len % alignment - if remainder != 0: - return alignment - remainder - return 0 - - -def aligned_size(data_len: int, alignment: int = 16) -> int: - return data_len + padding_required(data_len, alignment) - - -def pad_to(data: bytes, size: int) -> bytes: - if size > len(data): - data += b"\x00" * (size - len(data)) - return data - - -def serialize_constant_tensors( - vk_graph: VkGraph, - const_tensors: List[torch.Tensor], - raw_bytes: bytearray, -) -> None: - # Make sure that the graph does not have any registered constants prior to calling - # this function. 
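For reference, here is a small sketch (illustrative, not part of the deleted sources) of how the 30-byte delegate header defined above behaves when round-tripped. The offset values are hypothetical but chosen to satisfy is_valid(), with the header padded to a 16-byte boundary in the same way serialize_vulkan_graph does:

```
# Round-trip the 30-byte Vulkan delegate header described above.
from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
    VulkanDelegateHeader,
)

header = VulkanDelegateHeader(
    flatbuffer_offset=32,          # header padded to 16 bytes -> payload starts at 32
    flatbuffer_size=1024,
    bytes_offset=32 + 1024,        # constant data follows the flatbuffer payload
    bytes_size=4096,
)
data = header.to_bytes()

assert len(data) == VulkanDelegateHeader.EXPECTED_LENGTH  # 30 bytes
assert data[4:8] == VulkanDelegateHeader.EXPECTED_MAGIC   # b"VH00" after 4 padding bytes
assert VulkanDelegateHeader.from_bytes(data) == header    # fields survive the round trip
```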
- assert len(vk_graph.constants) == 0 - - current_offset = len(raw_bytes) - for tensor in const_tensors: - # The tensor data is stored in the named data map - if isinstance(tensor, tuple): - named_key, size = tensor - vk_graph.constants.append( - VkBytes( - offset=18446744073709551615, # UINT64_MAX to indicate named data - length=size, - named_key=named_key, - ) - ) - elif tensor is None or ( - isinstance(tensor, torch.Tensor) and tensor.numel() == 0 - ): - vk_graph.constants.append(VkBytes(current_offset, 0)) - elif isinstance(tensor, torch.Tensor): - array_type = ctypes.c_char * tensor.untyped_storage().nbytes() - array = ctypes.cast( - tensor.untyped_storage().data_ptr(), - ctypes.POINTER(array_type), - ).contents - - tensor_bytes = bytes(array) - # Pad the tensor bytes to the next 16 byte boundary - raw_bytes += tensor_bytes - raw_bytes += b"\x00" * padding_required(len(tensor_bytes)) - - vk_graph.constants.append(VkBytes(current_offset, len(tensor_bytes))) - current_offset += aligned_size(len(tensor_bytes)) - else: - raise ValueError(f"Unsupported constant tensor type: {type(tensor)}") - - -def serialize_custom_shaders( - vk_graph: VkGraph, - custom_shaders: List[str], - raw_bytes: bytearray, -) -> bytes: - # Make sure that the graph deos not have any registered shaders prior to calling - # this function. - assert len(vk_graph.shaders) == 0 - - if len(custom_shaders) == 0: - return b"" - - else: - raise NotImplementedError("Serializing Custom shaders are not yet supported") - - -def serialize_vulkan_graph( - vk_graph: VkGraph, const_tensors: List[torch.Tensor], custom_shaders: List[str] -) -> bytes: - raw_bytes = bytearray() - serialize_constant_tensors(vk_graph, const_tensors, raw_bytes) - serialize_custom_shaders(vk_graph, custom_shaders, raw_bytes) - raw_bytes = bytes(raw_bytes) - - flatbuffer_payload = convert_to_flatbuffer(vk_graph) - - header_len = aligned_size(VulkanDelegateHeader.EXPECTED_LENGTH) - flatbuffer_payload_len = aligned_size(len(flatbuffer_payload)) - raw_bytes_len = aligned_size(len(raw_bytes)) - - header: bytes = VulkanDelegateHeader( - flatbuffer_offset=header_len, - flatbuffer_size=len(flatbuffer_payload), - bytes_offset=header_len + flatbuffer_payload_len, - bytes_size=len(raw_bytes), - ).to_bytes() - - return b"".join( - [ - pad_to(header, header_len), - pad_to(flatbuffer_payload, flatbuffer_payload_len), - pad_to(raw_bytes, raw_bytes_len), - ] - ) diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl deleted file mode 100644 index 775341d420d..00000000000 --- a/backends/vulkan/targets.bzl +++ /dev/null @@ -1,394 +0,0 @@ -load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") -load("@fbcode_macros//build_defs:native_rules.bzl", "buck_genrule") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "CXX", "FBCODE", "APPLE") - - -def get_vulkan_compiler_flags(): - return select({ - "DEFAULT": [ - "-Wno-global-constructors", - "-Wno-missing-prototypes", - ], - "ovr_config//os:windows": [], - }) - -def get_vulkan_preprocessor_flags(no_volk, is_fbcode): - VK_API_PREPROCESSOR_FLAGS = [] - - default_flags = [] - android_flags = [] - - if not no_volk: - for flags in [default_flags, android_flags]: - flags.append("-DUSE_VULKAN_WRAPPER") - flags.append("-DUSE_VULKAN_VOLK") - flags.append("-DUSE_VOLK_HEADER_ONLY") - android_flags.append("-DVK_ANDROID_external_memory_android_hardware_buffer") - - if not is_fbcode: - link_moltenvk = no_volk and 
read_config("etvk", "link_moltenvk", "1") == "1" - mac_flags = default_flags - if link_moltenvk: - mac_flags = [] - - VK_API_PREPROCESSOR_FLAGS += select({ - "DEFAULT": default_flags, - "ovr_config//os:android": android_flags, - "ovr_config//os:macos": mac_flags, - }) + select({ - "//third-party/cuda:windows-cuda-11": [ - "-DVK_USE_PLATFORM_WIN32_KHR", - ], - "DEFAULT": [], - "ovr_config//os:android": [ - "-DVK_USE_PLATFORM_ANDROID_KHR", - ], - "ovr_config//os:linux": [ - "-DVK_USE_PLATFORM_XLIB_KHR", - ], - "ovr_config//os:macos": [ - "-DVK_USE_PLATFORM_MACOS_MVK", - ], - "ovr_config//os:windows": [ - "-DVK_USE_PLATFORM_WIN32_KHR", - ], - }) - - etvk_default_cache_path = read_config("etvk", "default_cache_path", "") - if etvk_default_cache_path != "": - VK_API_PREPROCESSOR_FLAGS += ["-DETVK_DEFAULT_CACHE_PATH={}".format(etvk_default_cache_path)] - - debug_mode = read_config("etvk", "debug", "0") == "1" - if debug_mode: - VK_API_PREPROCESSOR_FLAGS += ["-DVULKAN_DEBUG"] - - return VK_API_PREPROCESSOR_FLAGS - -def get_labels(no_volk): - if no_volk: - return ci.labels(ci.linux(ci.mode("fbsource//arvr/mode/android/mac/dbg"))) - else: - return [] - -def get_platforms(): - return [ANDROID, APPLE, CXX] - -def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = False): - gen_vulkan_spv_target = "//xplat/executorch/backends/vulkan:gen_vulkan_spv_bin" - glslc_path = "//xplat/caffe2/fb/vulkan/dotslash:glslc" - - if is_fbcode: - gen_vulkan_spv_target = "//executorch/backends/vulkan:gen_vulkan_spv_bin" - glslc_path = "//caffe2/fb/vulkan/tools:glslc" - - glsl_paths = [] - - # TODO(ssjia): remove the need for subpath once subdir_glob is enabled in OSS - for target, subpath in spv_filegroups.items(): - glsl_paths.append("$(location {})/{}".format(target, subpath)) - - genrule_cmd = ( - "$(exe {}) ".format(gen_vulkan_spv_target) + - "--glsl-paths {} ".format(" ".join(glsl_paths)) + - "--output-path $OUT " + - "--glslc-path=$(exe {}) ".format(glslc_path) + - "--tmp-dir-path=shader_cache " + - ("-f " if read_config("etvk", "force_shader_rebuild", "0") == "1" else " ") + - select({ - "DEFAULT": "", - "ovr_config//os:android": "--optimize", - "ovr_config//os:linux": "--replace-u16vecn", - "ovr_config//os:windows": "--optimize --spv_debug", - }) - ) - - genrule_name = "gen_{}_cpp".format(name) - buck_genrule( - name = genrule_name, - outs = { - "{}.cpp".format(name): ["spv.cpp"], - }, - cmd = genrule_cmd, - default_outs = ["."], - labels = ["uses_dotslash"], - ) - - suffix = "_no_volk" if no_volk else "" - runtime.cxx_library( - name = name, - srcs = [ - ":{}[{}.cpp]".format(genrule_name, name), - ], - compiler_flags = get_vulkan_compiler_flags(), - labels = get_labels(no_volk), - platforms = get_platforms(), - define_static_target = True, - # Static initialization is used to register shaders to the global shader registry, - # therefore link_whole must be True to make sure unused symbols are not discarded. - # @lint-ignore BUCKLINT: Avoid `link_whole=True` - link_whole = True, - # Define a soname that can be used for dynamic loading in Java, Python, etc. 
- soname = "lib{}.$(ext)".format(name), - exported_deps = [ - "//executorch/backends/vulkan:vulkan_compute_api{}".format(suffix), - ], - ) - -def define_common_targets(is_fbcode = False): - runtime.python_library( - name = "gen_vulkan_spv_lib", - srcs = [ - "runtime/gen_vulkan_spv.py", - ], - base_module = "", - external_deps = ["torchgen"], - ) - - runtime.python_binary( - name = "gen_vulkan_spv_bin", - main_module = "runtime.gen_vulkan_spv", - visibility = [ - "//executorch/backends/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":gen_vulkan_spv_lib", - ], - ) - - runtime.filegroup( - name = "vulkan_graph_runtime_shaders", - srcs = native.glob([ - "runtime/graph/ops/glsl/*", - ]), - ) - - for no_volk in [True, False]: - # No volk builds only available on xplat to build for Android - if no_volk and is_fbcode: - continue - - suffix = "_no_volk" if no_volk else "" - - VK_API_DEPS = [ - "fbsource//third-party/VulkanMemoryAllocator/3.0.1:VulkanMemoryAllocator_xplat", - ] - - default_deps = [] - android_deps = ["fbsource//third-party/toolchains:android"] - - if no_volk: - for deps in [default_deps, android_deps]: - deps.append("fbsource//third-party/toolchains:vulkan") - deps.append("fbsource//third-party/khronos:vulkan-headers") - else: - for deps in [default_deps, android_deps]: - deps.append("fbsource//third-party/volk:volk-header") - - if is_fbcode: - VK_API_DEPS += [ - "fbsource//third-party/swiftshader:swiftshader_vk_headers", - "fbsource//third-party/swiftshader/lib/linux-x64:libvk_swiftshader_fbcode", - "fbsource//third-party/swiftshader/lib/linux-x64:libvk_swiftshader_so", - ] - else: - link_moltenvk = no_volk and read_config("etvk", "link_moltenvk", "1") == "1" - mac_deps = default_deps - if link_moltenvk: - mac_deps = [ - "//third-party/khronos:moltenVK_static" - ] - - VK_API_DEPS += select({ - "DEFAULT": default_deps, - "ovr_config//os:android": android_deps, - "ovr_config//os:macos": mac_deps, - }) + select({ - "DEFAULT": [], - "ovr_config//os:linux": [ - "//arvr/third-party/libX11:libX11", - ] - }) - - runtime.cxx_library( - name = "vulkan_compute_api{}".format(suffix), - compiler_flags = get_vulkan_compiler_flags(), - srcs = native.glob([ - "runtime/api/**/*.cpp", - "runtime/utils/**/*.cpp", - "runtime/vk_api/**/*.cpp", - ]), - exported_headers = native.glob([ - "runtime/api/**/*.h", - "runtime/utils/**/*.h", - "runtime/vk_api/**/*.h", - ]), - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = [ - "//executorch/backends/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - fbobjc_frameworks = select({ - "DEFAULT": [], - "ovr_config//os:macos": [ - "$SDKROOT/System/Library/Frameworks/CoreGraphics.framework", - "$SDKROOT/System/Library/Frameworks/Foundation.framework", - "$SDKROOT/System/Library/Frameworks/AppKit.framework", - "$SDKROOT/System/Library/Frameworks/Metal.framework", - "$SDKROOT/System/Library/Frameworks/QuartzCore.framework", - ], - }), - exported_preprocessor_flags = get_vulkan_preprocessor_flags(no_volk, is_fbcode), - exported_deps = VK_API_DEPS, - ) - - runtime.cxx_library( - name = "vulkan_graph_runtime{}".format(suffix), - srcs = native.glob([ - "runtime/graph/**/*.cpp", - ]), - compiler_flags = get_vulkan_compiler_flags(), - exported_headers = native.glob([ - "runtime/graph/**/*.h", - ]), - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = [ - "//executorch/backends/...", - "//executorch/extension/pybindings/...", - "//executorch/test/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - 
":vulkan_graph_runtime_shaderlib{}".format(suffix), - "//executorch/runtime/backend:interface", - ], - define_static_target = True, - # Static initialization is used to register operators to the global operator registry, - # therefore link_whole must be True to make sure unused symbols are not discarded. - # @lint-ignore BUCKLINT: Avoid `link_whole=True` - link_whole = True, - # Define an soname that can be used for dynamic loading in Java, Python, etc. - soname = "libvulkan_graph_runtime.$(ext)", - ) - - vulkan_spv_shader_lib( - name = "vulkan_graph_runtime_shaderlib{}".format(suffix), - spv_filegroups = { - ":vulkan_graph_runtime_shaders": "runtime/graph/ops/glsl", - }, - is_fbcode = is_fbcode, - no_volk = no_volk, - ) - - runtime.cxx_library( - name = "vulkan_backend_lib{}".format(suffix), - srcs = native.glob([ - "runtime/*.cpp", - ]), - compiler_flags = get_vulkan_compiler_flags(), - headers = native.glob([ - "runtime/*.h", - ]), - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = [ - "//executorch/backends/...", - "//executorch/extension/pybindings/...", - "//executorch/test/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":vulkan_graph_runtime{}".format(suffix), - "//executorch/backends/vulkan/serialization:vk_delegate_schema", - "//executorch/runtime/core:event_tracer", - "//executorch/runtime/core/exec_aten/util:tensor_util", - "//executorch/runtime/core:named_data_map", - ], - define_static_target = True, - # VulkanBackend.cpp needs to compile with executor as whole - # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) - link_whole = True, - ) - - ## - ## AOT targets - ## - if is_fbcode: - runtime.python_library( - name = "utils_lib", - srcs = [ - "utils.py", - ], - visibility = [ - "//executorch/backends/vulkan/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:tensor", - "//executorch/exir/backend/canonical_partitioners:config_partitioner_lib", - "//executorch/backends/vulkan/serialization:lib", - ] - ) - - runtime.python_library( - name = "custom_ops_lib", - srcs = [ - "custom_ops_lib.py" - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan/patterns:vulkan_patterns", - ] - ) - - runtime.python_library( - name = "op_registry", - srcs = [ - "op_registry.py", - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":custom_ops_lib", - ":utils_lib", - "//caffe2:torch", - "//executorch/exir/dialects:lib", - "//executorch/backends/vulkan/serialization:lib", - ] - ) - - runtime.python_library( - name = "vulkan_preprocess", - srcs = [ - "vulkan_preprocess.py", - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/backends/transforms:addmm_mm_to_linear", - "//executorch/backends/transforms:fuse_batch_norm_with_conv", - "//executorch/backends/transforms:fuse_conv_with_clamp", - "//executorch/backends/transforms:fuse_view_copy", - "//executorch/backends/transforms:remove_clone_ops", - "//executorch/backends/transforms:view_copy_to_squeeze_unsqueeze", - "//executorch/backends/vulkan/_passes:vulkan_passes", - "//executorch/backends/vulkan/serialization:lib", - "//executorch/backends/transforms:remove_getitem_op", - "//executorch/backends/xnnpack/_passes:xnnpack_passes", - "//executorch/exir/backend:backend_details", - ], - ) diff --git a/backends/vulkan/targets.bzl 
b/backends/vulkan/targets.bzl new file mode 120000 index 00000000000..f44d6f73587 --- /dev/null +++ b/backends/vulkan/targets.bzl @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/targets.bzl \ No newline at end of file diff --git a/backends/vulkan/test b/backends/vulkan/test new file mode 120000 index 00000000000..4de6140b88e --- /dev/null +++ b/backends/vulkan/test @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/test \ No newline at end of file diff --git a/backends/vulkan/test/CMakeLists.txt b/backends/vulkan/test/CMakeLists.txt deleted file mode 100644 index e3bce1d8baf..00000000000 --- a/backends/vulkan/test/CMakeLists.txt +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -cmake_minimum_required(VERSION 3.19) -project(executorch) - -if(ANDROID) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endif() - -find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) -find_package(GTest CONFIG REQUIRED) - -# Only build tests if Vulkan was compiled -if(TARGET vulkan_backend) - if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) - endif() - - if(NOT PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) - endif() - - # Include this file to access executorch_target_link_options_shared_lib This - # is required to provide access to executorch_target_link_options_shared_lib - # which allows libraries to be linked with the --whole-archive flag. This is - # required for libraries that perform dynamic registration via static - # initialization. - include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - - include(../cmake/ShaderLibrary.cmake) - - # Third party include paths - - set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../third-party) - - set(GTEST_INCLUDE_PATH - ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include - ) - set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) - set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) - set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - - set(COMMON_INCLUDES ${EXECUTORCH_ROOT}/.. 
${VULKAN_HEADERS_PATH} ${VOLK_PATH} - ${VMA_PATH} ${GTEST_INCLUDE_PATH} ${PYTORCH_PATH} - ) - - # Test Utility files - - set(TEST_UTILS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/utils) - file(GLOB TEST_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils/*.cpp) - - # Test shaders - - set(TEST_SHADERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/glsl) - gen_vulkan_shader_lib_cpp(${TEST_SHADERS_PATH}) - vulkan_shader_lib(test_shaderlib ${generated_spv_cpp}) - - # API Test binary - - set(COMPUTE_API_TEST_CPP - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_compute_api_test.cpp - ) - - executorch_target_link_options_shared_lib(vulkan_backend) - - add_executable( - vulkan_compute_api_test ${COMPUTE_API_TEST_CPP} ${TEST_UTILS_CPP} - ) - target_include_directories(vulkan_compute_api_test PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - vulkan_compute_api_test PRIVATE GTest::gtest_main vulkan_backend - executorch_core test_shaderlib - ) - target_compile_options(vulkan_compute_api_test PRIVATE ${VULKAN_CXX_FLAGS}) - - set_property(TARGET vulkan_compute_api_test PROPERTY CXX_STANDARD 17) - -endif() diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS deleted file mode 100644 index 53fad86f90c..00000000000 --- a/backends/vulkan/test/TARGETS +++ /dev/null @@ -1,91 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -python_unittest( - name = "test_vulkan_delegate", - srcs = [ - "test_vulkan_delegate.py", - ], - preload_deps = [ - "fbsource//third-party/swiftshader/lib/linux-x64:libvk_swiftshader_fbcode", - "//executorch/backends/vulkan:vulkan_backend_lib", - "//executorch/kernels/portable:custom_ops_generated_lib", - ], - deps = [ - ":test_utils", - "//caffe2:torch", - "//executorch/backends/transforms:convert_dtype_pass", - "//executorch/backends/vulkan:vulkan_preprocess", - "//executorch/backends/vulkan/partitioner:vulkan_partitioner", - "//executorch/exir:lib", - "//executorch/extension/pybindings:portable_lib", # @manual - "//executorch/extension/pytree:pylib", - "//executorch/kernels/portable:custom_ops_generated_lib", - ], -) - -python_unittest( - name = "test_vulkan_passes", - srcs = [ - "test_vulkan_passes.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan/_passes:vulkan_passes", - "//executorch/backends/vulkan/quantizer:vulkan_quantizer", - "//executorch/backends/vulkan:vulkan_preprocess", - "//pytorch/ao:torchao", # @manual - ] -) - -python_unittest( - name = "test_vulkan_delegate_header", - srcs = [ - "test_vulkan_delegate_header.py", - ], - deps = [ - "//executorch/backends/vulkan:vulkan_preprocess", - ], -) - -python_unittest( - name = "test_serialization", - srcs = [ - "test_serialization.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:vulkan_preprocess", - ], -) - -runtime.python_library( - name = "tester", - srcs = ["tester.py"], - deps = [ - "//executorch/backends/vulkan/partitioner:vulkan_partitioner", - "//executorch/backends/vulkan:vulkan_preprocess", - ] -) - -runtime.python_library( - name = "test_utils", - srcs = [ - "utils.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:vulkan_preprocess", - "//executorch/backends/vulkan/partitioner:vulkan_partitioner", - "//executorch/backends/xnnpack:xnnpack_preprocess", - "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer", - "//executorch/backends/xnnpack/partition:xnnpack_partitioner", - "//executorch/devtools:lib", - 
"//executorch/devtools/bundled_program/serialize:lib", - "//executorch/exir:lib", - "//executorch/extension/pybindings:portable_lib", # @manual - "//executorch/extension/pytree:pylib", - ], -) diff --git a/backends/vulkan/test/compute_api_tests.bzl b/backends/vulkan/test/compute_api_tests.bzl deleted file mode 100644 index db7bfe3c6ab..00000000000 --- a/backends/vulkan/test/compute_api_tests.bzl +++ /dev/null @@ -1,73 +0,0 @@ -load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") -load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") -load("@fbsource//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "MACOSX", "CXX") -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "get_labels", - "get_platforms", - "vulkan_spv_shader_lib", -) - -def define_compute_api_test_targets(): - for no_volk in [True, False]: - suffix = "_no_volk" if no_volk else "" - - vulkan_spv_shader_lib( - name = "test_shader_lib{}".format(suffix), - spv_filegroups = { - ":test_shaders": "glsl", - }, - no_volk = no_volk, - ) - - fb_xplat_cxx_binary( - name = "vulkan_compute_api_test_bin{}".format(suffix), - srcs = [ - "utils/test_utils.cpp", - "vulkan_compute_api_test.cpp", - ], - headers = [ - "utils/test_utils.h", - ], - apple_sdks = MACOSX, - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = ["PUBLIC"], - deps = [ - ":test_shader_lib{}".format(suffix), - "//third-party/googletest:gtest_main", - "//xplat/executorch/backends/vulkan:vulkan_graph_runtime{}".format(suffix), - "//xplat/executorch/runtime/core/exec_aten:lib", - ], - ) - - # no_volk variant does not work under the flagfile used for instrumentation tests, - # but it is also not necessary to test it as an instrumentation test. Therefore do - # not generate a no_volk variant for the instrumentation test. - fb_xplat_cxx_test( - name = "vulkan_compute_api_test{}".format(suffix), - srcs = [ - "utils/test_utils.cpp", - "vulkan_compute_api_test.cpp", - ], - headers = [ - "utils/test_utils.h", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "test_shader_lib", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - # Since this is an Android instrumentation test, only generate for ANDROID - platforms = [ANDROID], - use_instrumentation_test = True, - visibility = ["PUBLIC"], - deps = [ - ":test_shader_lib{}".format(suffix), - "//third-party/googletest:gtest_main", - "//xplat/executorch/backends/vulkan:vulkan_graph_runtime{}".format(suffix), - "//xplat/executorch/runtime/core/exec_aten:lib", - ], - ) diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt deleted file mode 100644 index fe58055f649..00000000000 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -cmake_minimum_required(VERSION 3.19) -project(prototyping_shaders) - -if(ANDROID) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endif() - -find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) - -# Compile settings - -set(VULKAN_CXX_FLAGS "-fexceptions") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_WRAPPER") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_VOLK") - -message(STATUS "VULKAN_CXX_FLAGS: ${VULKAN_CXX_FLAGS}") - -# Only build if Vulkan was compiled -if(TARGET vulkan_backend) - if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) - endif() - - if(NOT PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) - endif() - - # Include this file to access executorch_target_link_options_shared_lib - include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - include(${EXECUTORCH_ROOT}/backends/vulkan/cmake/ShaderLibrary.cmake) - - # Third party include paths - set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party) - set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) - set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) - set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - - set(COMMON_INCLUDES ${EXECUTORCH_ROOT}/.. ${VULKAN_HEADERS_PATH} ${VOLK_PATH} - ${VMA_PATH} - ) - - # Prototyping utility files - set(PROTOTYPING_UTILS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}) - set(PROTOTYPING_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp) - - # Prototyping shaders - message(STATUS "shader stuff") - set(PROTOTYPING_SHADERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/glsl) - gen_vulkan_shader_lib_cpp(${PROTOTYPING_SHADERS_PATH}) - vulkan_shader_lib(prototyping_shaderlib ${generated_spv_cpp}) - target_compile_options(prototyping_shaderlib PRIVATE ${VULKAN_CXX_FLAGS}) - message(STATUS "done shader stuff") - - # Operator implementations library - file(GLOB OPERATOR_IMPL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) - add_library(operator_implementations STATIC ${OPERATOR_IMPL_SOURCES}) - target_include_directories( - operator_implementations PRIVATE ${COMMON_INCLUDES} - ) - target_link_libraries( - operator_implementations PRIVATE vulkan_backend executorch_core - prototyping_shaderlib - ) - target_compile_options(operator_implementations PRIVATE ${VULKAN_CXX_FLAGS}) - set_property(TARGET operator_implementations PROPERTY CXX_STANDARD 17) - - executorch_target_link_options_shared_lib(vulkan_backend) - executorch_target_link_options_shared_lib(operator_implementations) - - # Function to create operator prototype binaries - function(add_operator_prototype OPERATOR_NAME) - set(TARGET_NAME ${OPERATOR_NAME}) - set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${OPERATOR_NAME}.cpp) - - add_executable(${TARGET_NAME} ${SOURCE_FILE} ${PROTOTYPING_UTILS_CPP}) - target_include_directories(${TARGET_NAME} PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - ${TARGET_NAME} PRIVATE vulkan_backend executorch_core - prototyping_shaderlib operator_implementations - ) - target_compile_options(${TARGET_NAME} PRIVATE ${VULKAN_CXX_FLAGS}) - set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 17) - endfunction() - - # Define operator prototypes - add_operator_prototype(add) - add_operator_prototype(q8csw_linear) - add_operator_prototype(quantized_q4gaw_linear) - add_operator_prototype(quantized_int4_linear) - add_operator_prototype(q8csw_conv2d) -endif() diff --git a/backends/vulkan/test/custom_ops/TARGETS b/backends/vulkan/test/custom_ops/TARGETS deleted file mode 100644 index e84397dc20e..00000000000 --- 
a/backends/vulkan/test/custom_ops/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/test/custom_ops/add.cpp b/backends/vulkan/test/custom_ops/add.cpp deleted file mode 100644 index bc20246a7d1..00000000000 --- a/backends/vulkan/test/custom_ops/add.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include "utils.h" - -using namespace executorch::vulkan::prototyping; - -// Generate test cases for add operation -std::vector generate_add_test_cases() { - std::vector test_cases; - - // Set the data generation type as a local variable - DataGenType data_gen_type = DataGenType::ONES; - - // Define different input size configurations - std::vector> size_configs = { - {1, 64, 64}, // Small square - {1, 128, 128}, // Medium square - {1, 256, 256}, // Large square - {1, 512, 512}, // Very large square - {1, 1, 1024}, // Wide tensor - {1, 1024, 1}, // Tall tensor - {32, 32, 32}, // 3D cube - {16, 128, 64}, // 3D rectangular - }; - - // Storage types to test - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - // Data types to test - std::vector data_types = {vkapi::kFloat, vkapi::kHalf}; - - // Generate test cases for each combination - for (const auto& sizes : size_configs) { - for (const auto& storage_type : storage_types) { - for (const auto& data_type : data_types) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string size_str = ""; - for (size_t i = 0; i < sizes.size(); ++i) { - size_str += std::to_string(sizes[i]); - if (i < sizes.size() - 1) - size_str += "x"; - } - - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (data_type == vkapi::kFloat) ? 
"Float" : "Half"; - - // Add data generation type to the name for clarity - std::string test_name = - "Add_" + size_str + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - test_case.set_operator_name("etvk.add_prototype"); - - // Add two input tensors with the same size, type, storage, and data - // generation method - ValueSpec input_a( - sizes, data_type, storage_type, utils::kWidthPacked, data_gen_type); - ValueSpec input_b( - sizes, data_type, storage_type, utils::kWidthPacked, data_gen_type); - - // Add output tensor with the same size, type, and storage as inputs - // (output uses ZEROS by default) - ValueSpec output( - sizes, - data_type, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - test_case.add_input_spec(input_a); - test_case.add_input_spec(input_b); - test_case.add_output_spec(output); - - test_cases.push_back(test_case); - } - } - } - - return test_cases; -} - -// Custom FLOP calculator for add operation -// Add operation performs 1 FLOP (addition) per element -int64_t add_flop_calculator(const TestCase& test_case) { - // Calculate total elements from the first input tensor - int64_t total_elements = 1; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (int64_t size : sizes) { - total_elements *= size; - } - } - - // Add operation: 1 FLOP per element (one addition) - return total_elements; -} - -// Reference implementation for add operator -void add_reference_compute(TestCase& test_case) { - const ValueSpec& input_a = test_case.inputs().at(0); - const ValueSpec& input_b = test_case.inputs().at(1); - - ValueSpec& output = test_case.outputs().at(0); - - if (input_a.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Calculate number of elements - int64_t num_elements = input_a.numel(); - - auto& input_a_data = input_a.get_float_data(); - auto& input_b_data = input_b.get_float_data(); - - auto& ref_data = output.get_ref_float_data(); - ref_data.resize(num_elements); - for (int64_t i = 0; i < num_elements; ++i) { - ref_data[i] = input_a_data[i] + input_b_data[i]; - } -} - -int main(int argc, char* argv[]) { - set_print_output(false); // Disable output tensor printing - set_print_latencies(false); // Enable latency timing printing - set_use_gpu_timestamps(true); // Enable GPU timestamps - - print_performance_header(); - std::cout << "Add Operation Prototyping Framework" << std::endl; - print_separator(); - - // Initialize Vulkan context - try { - api::context()->initialize_querypool(); - } catch (const std::exception& e) { - std::cerr << "Failed to initialize Vulkan context: " << e.what() - << std::endl; - return 1; - } - - // Execute test cases using the new framework with custom FLOP calculator and - // reference compute - auto results = execute_test_cases( - generate_add_test_cases, - add_flop_calculator, - "Add", - 3, - 10, - add_reference_compute); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/build_and_run.sh b/backends/vulkan/test/custom_ops/build_and_run.sh deleted file mode 100755 index 2b9ce576e0e..00000000000 --- a/backends/vulkan/test/custom_ops/build_and_run.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/zsh - -set -eux - -# Check that we're in the executorch directory -current_dir=$(pwd) -if [[ ! 
"$current_dir" =~ executorch$ ]]; then - echo "Error: This script must be run from a directory ending in 'executorch'" - echo "Current directory: $current_dir" - exit 1 -fi - -# Function to configure and build main project -configure_and_build_main() { - local android_args="" - if [[ "$ANDROID_MODE" == "true" ]]; then - cmake . \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR - else - cmake . \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR - fi - - cmake --build $CMAKE_OUT_DIR -j16 --target install - # -DCMAKE_CXX_FLAGS="-DVULKAN_DEBUG" \ -} - -# Function to build main project only -build_main() { - cmake --build $CMAKE_OUT_DIR -j16 --target install -} - -# Function to configure and build tests -configure_and_build_tests() { - # Check if glslc is installed - if ! command -v glslc >/dev/null 2>&1; then - echo "Error: glslc is not installed or not found in PATH." - exit 1 - fi - - local android_args="" - if [[ "$ANDROID_MODE" == "true" ]]; then - cmake backends/vulkan/test/custom_ops/ \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops - else - cmake backends/vulkan/test/custom_ops/ \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DCMAKE_BUILD_TYPE=Debug \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops - fi - - cmake --build $CMAKE_OUT_DIR/backends/vulkan/test/custom_ops -j16 --target all - -} - -build_tests() { - cmake --build $CMAKE_OUT_DIR/backends/vulkan/test/custom_ops -j16 --target all -} - -# Function to rebuild both main and tests -rebuild_both() { - build_main - build_tests -} - -# Function to clean and rebuild everything -clean_and_rebuild() { - rm -rf $CMAKE_OUT_DIR - configure_and_build_main - configure_and_build_tests -} - -# Function to execute binary if specified -execute_binary() { - local binary_name="$1" - if [[ -n "$binary_name" ]]; then - local binary_path="$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops/$binary_name" - echo "Executing binary: $binary_path" - - if [[ "$ANDROID_MODE" == "true" ]]; then - if [[ -f "$binary_path" ]]; then - echo "Pushing binary to Android device..." - adb push "$binary_path" /data/local/tmp/ - echo "Executing binary on Android device..." - adb shell "cd /data/local/tmp && ./$binary_name" - else - echo "Error: Binary '$binary_path' not found" - exit 1 - fi - else - if [[ -f "$binary_path" && -x "$binary_path" ]]; then - "$binary_path" - else - echo "Error: Binary '$binary_path' not found or not executable" - exit 1 - fi - fi - fi -} - -# Parse command line arguments -BINARY_TO_EXECUTE="" -ANDROID_MODE=false -CMAKE_OUT_DIR="cmake-out" - -# Check for --android flag and adjust arguments accordingly -if [[ "$1" == "--android" ]]; then - ANDROID_MODE=true - CMAKE_OUT_DIR="cmake-android-out" - shift # Remove --android from arguments - echo "Android mode enabled. Using $CMAKE_OUT_DIR as build directory." -fi - -case "${1:-}" in - --rebuild|-r) - echo "Rebuilding both main project and tests..." 
- BINARY_TO_EXECUTE="${2:-}" - rebuild_both - execute_binary "$BINARY_TO_EXECUTE" - ;; - --rebuild1|-r1) - echo "Rebuilding main project only..." - BINARY_TO_EXECUTE="${2:-}" - build_main - execute_binary "$BINARY_TO_EXECUTE" - ;; - --rebuild2|-r2) - echo "Rebuilding tests only..." - BINARY_TO_EXECUTE="${2:-}" - build_tests - execute_binary "$BINARY_TO_EXECUTE" - ;; - --clean|-c) - echo "WARNING: This will delete the entire $CMAKE_OUT_DIR directory and rebuild everything." - echo -n "Are you sure you want to continue? (y/N): " - read -r response - if [[ "$response" =~ ^[Yy]$ ]]; then - echo "Cleaning and rebuilding everything..." - BINARY_TO_EXECUTE="${2:-}" - clean_and_rebuild - execute_binary "$BINARY_TO_EXECUTE" - else - echo "Clean operation cancelled." - exit 0 - fi - ;; - "") - echo "Running full configure and build..." - configure_and_build_main - configure_and_build_tests - ;; - *) - # If first argument doesn't match any build option, treat it as binary name - # and use default build behavior - echo "Running full configure and build..." - BINARY_TO_EXECUTE="$1" - configure_and_build_main - configure_and_build_tests - execute_binary "$BINARY_TO_EXECUTE" - ;; -esac diff --git a/backends/vulkan/test/custom_ops/choose_qparams_per_row.cpp b/backends/vulkan/test/custom_ops/choose_qparams_per_row.cpp deleted file mode 100644 index aa2b21feab8..00000000000 --- a/backends/vulkan/test/custom_ops/choose_qparams_per_row.cpp +++ /dev/null @@ -1,363 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 2050; -static constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; - -// ChooseQParams configuration struct -struct ChooseQParamsConfig { - int64_t num_channels; // Height dimension (number of channels) - int64_t channel_size; // Width dimension (size per channel) - int32_t quant_min = -128; - int32_t quant_max = 127; - std::string test_case_name = "placeholder"; - std::string op_name = "choose_qparams_per_row"; -}; - -// Utility function to create a test case from a ChooseQParamsConfig -TestCase create_test_case_from_config( - const ChooseQParamsConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "etvk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Input tensor (float) - [num_channels, channel_size] - std::vector input_size = {config.num_channels, config.channel_size}; - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - // Quantization parameters - ValueSpec quant_min(config.quant_min); - ValueSpec quant_max(config.quant_max); - - // Output scale tensor (float) - [num_channels] - ValueSpec scale_out( - {config.num_channels}, - vkapi::kFloat, - utils::kBuffer, // Always buffer as per requirement - utils::kWidthPacked, - DataGenType::ZEROS); - - // Output zero_point tensor (int8) - [num_channels] - ValueSpec zero_point_out( - {config.num_channels}, - vkapi::kChar, // int8 for quantized zero point - utils::kBuffer, // Always buffer as per requirement - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quant_min); - test_case.add_input_spec(quant_max); - test_case.add_output_spec(scale_out); - test_case.add_output_spec(zero_point_out); - - return test_case; -} - -// CPU reference implementation matching the behavior from op_choose_qparams.cpp -void calculate_scale_and_zero_point_reference( - float min_val, - float max_val, - int32_t qmin, - int32_t qmax, - float& scale, - int32_t& zero_point) { - // Extend the [min, max] interval to ensure that it contains 0 - min_val = std::min(min_val, 0.0f); - max_val = std::max(max_val, 0.0f); - - // Use double precision for intermediate computation but use single precision - // in final number to reflect the actual number used during quantization. - double scale_double = - (static_cast(max_val) - min_val) / (qmax - qmin); - - // If scale is 0 or too small so its reciprocal is infinity, we arbitrary - // adjust the scale to 0.1 . We want to avoid scale's reciprocal being - // infinity because some of fbgemm code pre-computes scale's reciprocal to do - // multiplication instead of division in the time critical part of code. - if (static_cast(scale_double) == 0.0f || - std::isinf(1.0f / static_cast(scale_double))) { - scale_double = 0.1; - } - - // Cut off small scale - if (scale_double < SMALL_SCALE_THRESHOLD) { - float org_scale = static_cast(scale_double); - scale_double = SMALL_SCALE_THRESHOLD; - // Adjust the min and max based on the new scale - if (min_val == 0.0f) { - max_val = SMALL_SCALE_THRESHOLD * (qmax - qmin); - } else if (max_val == 0.0f) { - min_val = -SMALL_SCALE_THRESHOLD * (qmax - qmin); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Zero-point computation. - // First the initial floating-point computation. The zero-point can be - // determined from solving an affine equation for any known pair - // (real value, corresponding quantized value). - // We know two such pairs: (rmin, qmin) and (rmax, qmax). - // The arithmetic error on the zero point computed from either pair - // will be roughly machine_epsilon * (sum of absolute values of terms) - // so we want to use the variant that adds the smaller terms. 
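As a standalone illustration of the derivation described in the comment above (a separate sketch, not part of the C++ test file), plugging one concrete range into the two equivalent zero-point formulas from the affine relation real = scale * (q - zero_point) gives:

```
# Worked example of the zero-point derivation, using hypothetical values.
qmin, qmax = -128, 127
rmin, rmax = -1.0, 2.0                  # range already extended to contain 0

scale = (rmax - rmin) / (qmax - qmin)   # 3 / 255 ~= 0.011765
zp_from_min = qmin - rmin / scale       # -128 - (-85) = -43
zp_from_max = qmax - rmax / scale       # 127 - 170   = -43

# Both pairs give the same answer here; the reference implementation picks the one
# with the smaller rounding error and nudges the result into [qmin, qmax].
assert round(zp_from_min) == round(zp_from_max) == -43
# With zero_point = -43, the quantized value q = -43 dequantizes exactly to 0.
```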
- double zero_point_from_min = qmin - min_val / scale_double; - double zero_point_from_max = qmax - max_val / scale_double; - double zero_point_from_min_error = - std::abs(qmin) - std::abs(min_val / scale_double); - double zero_point_from_max_error = - std::abs(qmax) - std::abs(max_val / scale_double); - double initial_zero_point = - zero_point_from_min_error < zero_point_from_max_error - ? zero_point_from_min - : zero_point_from_max; - - // Now we need to nudge the zero point to be an integer - // (our zero points are integer, and this is motivated by the requirement - // to be able to represent the real value "0" exactly as a quantized value, - // which is required in multiple places, for example in Im2col with zero - // padding). - int32_t nudged_zero_point = 0; - if (initial_zero_point < qmin) { - nudged_zero_point = qmin; - } else if (initial_zero_point > qmax) { - nudged_zero_point = qmax; - } else { - nudged_zero_point = - static_cast(nearbyint(static_cast(initial_zero_point))); - } - - scale = static_cast(scale_double); - zero_point = nudged_zero_point; -} - -// Generate easy test cases for choose_qparams_per_channel operation (for -// debugging) -std::vector generate_choose_qparams_per_channel_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int num_channels = 4; - int channel_size = 8; - - ChooseQParamsConfig config = { - num_channels, // num_channels - channel_size, // channel_size - -128, // quant_min - 127, // quant_max - "simple", // test_case_name - }; - - // Test with both storage types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for choose_qparams_per_channel operation -std::vector generate_choose_qparams_per_channel_test_cases() { - std::vector test_cases; - - std::vector configs = { - {4, 16}, - {8, 32}, - {16, 64}, - {32, 128}, - {64, 256}, - {128, 512}, - {1, 512}, - // Performance cases - {256, 1024}, - {512, 2048}, - {1, 2048}, - {1, 8096}, - }; - - // Test with different storage types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - for (auto config : configs) { - std::string prefix = (config.num_channels < kRefDimSizeLimit && - config.channel_size < kRefDimSizeLimit) - ? 
"correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + - std::to_string(config.num_channels) + "_" + - std::to_string(config.channel_size); - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for choose_qparams_per_channel -void choose_qparams_per_channel_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& quant_min_spec = test_case.inputs()[idx++]; - const ValueSpec& quant_max_spec = test_case.inputs()[idx++]; - const ValueSpec& eps_spec = test_case.inputs()[idx++]; - const ValueSpec& dtype_spec = test_case.inputs()[idx++]; - (void)eps_spec; // Unused in reference implementation - (void)dtype_spec; // Unused in reference implementation - - // Extract output specifications - ValueSpec& scale_out_spec = test_case.outputs()[0]; - ValueSpec& zero_point_out_spec = test_case.outputs()[1]; - - // Get tensor dimensions - auto input_sizes = - input_spec.get_tensor_sizes(); // [num_channels, channel_size] - int64_t num_channels = input_sizes[0]; - int64_t channel_size = input_sizes[1]; - - // Skip for large tensors since computation time will be extremely slow - if (num_channels > kRefDimSizeLimit || channel_size > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (num_channels, channel_size) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - int32_t quant_min = quant_min_spec.get_int_value(); - int32_t quant_max = quant_max_spec.get_int_value(); - - // Prepare output data - auto& scale_ref_data = scale_out_spec.get_ref_float_data(); - auto& zero_point_ref_data = zero_point_out_spec.get_ref_int8_data(); - scale_ref_data.resize(num_channels); - zero_point_ref_data.resize(num_channels); - - // Process each channel - for (int64_t channel = 0; channel < num_channels; ++channel) { - // Find min and max for this channel - float min_val = std::numeric_limits::max(); - float max_val = std::numeric_limits::lowest(); - - for (int64_t i = 0; i < channel_size; ++i) { - int64_t input_idx = channel * channel_size + i; - float val = input_data[input_idx]; - min_val = std::min(min_val, val); - max_val = std::max(max_val, val); - } - - // Calculate scale and zero point for this channel - float scale; - int32_t zero_point; - calculate_scale_and_zero_point_reference( - min_val, max_val, quant_min, quant_max, scale, zero_point); - - // Store results (cast zero_point to int8) - scale_ref_data[channel] = scale; - zero_point_ref_data[channel] = static_cast(zero_point); - } -} - -void reference_impl(TestCase& test_case) { - choose_qparams_per_channel_reference_impl(test_case); -} - -int64_t choose_qparams_per_channel_flop_calculator(const TestCase& test_case) { - // Get input dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - int64_t num_channels = input_sizes[0]; - int64_t channel_size = input_sizes[1]; - - // Calculate FLOPs for choose_qparams_per_channel operation - // Each channel requires: - // - Min/max finding: approximately 2 * channel_size comparisons - // - Scale calculation: ~5 operations (division, min/max operations) - // - Zero point 
calculation: ~10 operations (multiple arithmetic operations) - int64_t ops_per_channel = 2 * channel_size + 15; // Simplified estimate - - int64_t flop = num_channels * ops_per_channel; - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Choose QParams Per Channel Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - auto results = execute_test_cases( - generate_choose_qparams_per_channel_test_cases, - choose_qparams_per_channel_flop_calculator, - "ChooseQParamsPerChannel", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/glsl/add.yaml b/backends/vulkan/test/custom_ops/glsl/add.yaml deleted file mode 100644 index dd479cafd31..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/add.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -add_buffer: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: add_buffer - -add_texture: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: add_texture3d diff --git a/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl b/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl deleted file mode 100644 index 8a0ddc4dba7..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_other", DTYPE, "buffer")} - -layout(push_constant) uniform restrict Block { - int out_numel; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numel) { - return; - } - - // Simple addition without broadcasting - t_out[out_bufi] = t_in[out_bufi] + t_other[out_bufi]; -} \ No newline at end of file diff --git a/backends/vulkan/test/custom_ops/glsl/add_texture.glsl b/backends/vulkan/test/custom_ops/glsl/add_texture.glsl deleted file mode 100644 index f64c8e25d71..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/add_texture.glsl +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
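Stepping back to the choose_qparams_per_channel reference above: the per-channel min/max values feed a standard asymmetric quantization parameter search, where a candidate zero point is derived from each end of the range and the winner is nudged to an integer inside [qmin, qmax]. Below is a self-contained sketch of that calculation; the scale formula and the zero-range fallback are assumptions filled in from the usual recipe, while the candidate-selection and nudging logic mirror the code above.

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of asymmetric quantization parameter selection. Only the zero-point
// candidate selection and nudging follow the reference above; the scale
// formula and degenerate-range fallback are assumed.
void choose_qparams(
    float min_val,
    float max_val,
    int32_t qmin,
    int32_t qmax,
    float& scale,
    int32_t& zero_point) {
  // The range must contain zero so that the real value 0 is exactly
  // representable after quantization.
  min_val = std::min(min_val, 0.0f);
  max_val = std::max(max_val, 0.0f);

  double scale_d = (double(max_val) - double(min_val)) / (qmax - qmin);
  if (scale_d == 0.0) {
    scale_d = 0.1;  // assumed fallback for a constant all-zero range
  }

  // Candidate zero points from either end of the range; keep the one with
  // the smaller error, as in the reference implementation.
  double zp_from_min = qmin - min_val / scale_d;
  double zp_from_max = qmax - max_val / scale_d;
  double zp_from_min_err = std::abs(double(qmin)) - std::abs(min_val / scale_d);
  double zp_from_max_err = std::abs(double(qmax)) - std::abs(max_val / scale_d);
  double initial_zp =
      zp_from_min_err < zp_from_max_err ? zp_from_min : zp_from_max;

  // Nudge the zero point to an integer inside [qmin, qmax].
  int32_t nudged_zp;
  if (initial_zp < qmin) {
    nudged_zp = qmin;
  } else if (initial_zp > qmax) {
    nudged_zp = qmax;
  } else {
    nudged_zp = int32_t(std::nearbyint(initial_zp));
  }

  scale = float(scale_d);
  zero_point = nudged_zp;
}

int main() {
  float scale;
  int32_t zp;
  choose_qparams(-1.0f, 3.0f, -128, 127, scale, zp);
  std::printf("scale = %f, zero_point = %d\n", scale, zp);
  return 0;
}
```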
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -${define_active_storage_type("texture3d")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_other", DTYPE, "texture3d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Simple addition without broadcasting - same position for all tensors - VEC4_T in_texel = texelFetch(t_in, pos, 0); - VEC4_T other_texel = texelFetch(t_other, pos, 0); - - imageStore(t_out, pos, in_texel + other_texel); -} diff --git a/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl b/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl deleted file mode 100644 index f821fa3586f..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type("texture3d")} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", "float", "texture3d")} -${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // Placeholder: just copy input to output - vec4 in_texel = vec4(1.0f); - imageStore(t_out, lpos, in_texel); -} diff --git a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl deleted file mode 100644 index c1d90fadf7e..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
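The add_buffer and add_texture3d shaders above compute the same elementwise addition without broadcasting: the buffer variant walks a flat index guarded by out_numel, while the texture variant reads and writes whole texels at a shared position. As a rough CPU analogue (an illustrative sketch, not taken from the deleted sources):

```
#include <cstddef>
#include <stdexcept>
#include <vector>

// CPU analogue of the add shaders: out[i] = in[i] + other[i], with the size
// check standing in for the out_numel / out_limits guards.
std::vector<float> add_reference(
    const std::vector<float>& in,
    const std::vector<float>& other) {
  if (in.size() != other.size()) {
    throw std::invalid_argument("add_reference: size mismatch (no broadcasting)");
  }
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = in[i] + other[i];
  }
  return out;
}
```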
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type("texture3d")} - -#extension GL_EXT_debug_printf : enable - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", "int", "texture3d")} -${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // Pack four 8-bit values equal to 1 into a single uint - int packed = (1 << 0) | (1 << 8) | (1 << 16) | (1 << 24); - - debugPrintfEXT( - "t_out[%i, %i] = %i\\n", - lpos.x, lpos.y, - packed); - - - // Placeholder: just copy input to output - ivec4 in_texel = ivec4(packed); - imageStore(t_out, lpos, in_texel); -} diff --git a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl deleted file mode 100644 index be6717efdaa..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type("texture2d")} - -#extension GL_EXT_debug_printf : enable - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", "int", "texture3d")} -${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // Pack four 8-bit values equal to 1 into a single uint - int packed = (1 << 0) | (1 << 8) | (1 << 16) | (1 << 24); - - debugPrintfEXT( - "t_out[%i, %i] = %i\\n", - lpos.x, lpos.y, - packed); - - - // Placeholder: just copy input to output - ivec4 in_texel = ivec4(packed); - imageStore(t_out, lpos, in_texel); -} diff --git a/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp b/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp deleted file mode 100644 index dc35153baf0..00000000000 --- a/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
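The two canvas shaders above build a 32-bit word from four 8-bit values of 1 with the pattern (v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24). A small standalone sketch of that packing and the matching unpack, with illustrative values:

```
#include <cstdint>
#include <cstdio>

// Pack four 8-bit values into one 32-bit word, least significant byte first,
// matching the shift-and-or pattern used by the canvas shaders above.
uint32_t pack4xint8(int8_t v0, int8_t v1, int8_t v2, int8_t v3) {
  return (uint32_t(uint8_t(v0)) << 0) | (uint32_t(uint8_t(v1)) << 8) |
      (uint32_t(uint8_t(v2)) << 16) | (uint32_t(uint8_t(v3)) << 24);
}

// Recover lane i (0..3) as a signed 8-bit value.
int8_t unpack_lane(uint32_t packed, int lane) {
  return int8_t((packed >> (8 * lane)) & 0xFF);
}

int main() {
  uint32_t packed = pack4xint8(1, 1, 1, 1);
  std::printf("packed = 0x%08X\n", packed);  // 0x01010101, as in the shaders
  for (int lane = 0; lane < 4; ++lane) {
    std::printf("lane %d = %d\n", lane, int(unpack_lane(packed, lane)));
  }
  return 0;
}
```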
- */ - -#include - -#include -#include - -namespace vkcompute { - -// Shader selection function for add operations -vkapi::ShaderInfo pick_add_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in1 = args.at(1).refs.at(0); - - // Build shader name following the binary_op pattern - std::string kernel_name = "add"; - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_dtype_suffix(kernel_name, graph->dtype_of(in1)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -// Global workgroup size function for add operations -utils::uvec3 add_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - return default_pick_global_wg_size(graph, shader, args, resize_args); -} - -// Local workgroup size function for add operations -utils::uvec3 add_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - return default_pick_local_wg_size( - graph, shader, global_workgroup_size, args, resize_args); -} - -void add_prototype(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef input_a = args.at(idx++); - const ValueRef input_b = args.at(idx++); - const ValueRef output = args.at(idx++); - - // Prepare parameter buffers (empty for add operation) - vkapi::ParamsBindList param_buffers; - - // Prepare push constants based on storage type - std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(output) ? 1 : 1); - - if (graph.is_buffer_storage(output)) { - // Buffer storage: pass numel as push constant - push_constants.emplace_back(graph.numel_pc_of(output)); - } else { - // Texture storage: pass sizes as push constant - push_constants.emplace_back(graph.sizes_pc_of(output)); - } - - // Prepare specialization constants - vkapi::SpecVarList spec_vars; - if (graph.is_buffer_storage(output)) { - spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input_a), - graph.hashed_layout_of(input_b)}; - } else { - spec_vars = {graph.hashed_layout_of(output)}; - } - - // Add the compute node - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_add_shader, - add_global_wg_size, - add_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, {{input_a, input_b}, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize args - {}, - // Resizing Logic - nullptr)); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.add_prototype, add_prototype); -} - -} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp deleted file mode 100644 index 805b67c30a2..00000000000 --- a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
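pick_add_shader in AddPrototype.cpp above resolves the compute kernel purely by string composition: the base name "add" gets a storage suffix and a dtype suffix appended, which must line up with the variant names generated from add.yaml (add_buffer and add_texture3d, each produced for half and float). The sketch below illustrates that naming convention; the exact suffix strings are assumptions for illustration rather than the helpers' actual output.

```
#include <iostream>
#include <string>

enum class Storage { Buffer, Texture3D };
enum class Dtype { Float, Half };

// Assumed suffixes, mirroring the add.yaml variant names and the
// add_storage_type_suffix / add_dtype_suffix helpers used above.
std::string pick_add_shader_name(Storage storage, Dtype dtype) {
  std::string name = "add";
  name += (storage == Storage::Buffer) ? "_buffer" : "_texture3d";
  name += (dtype == Dtype::Float) ? "_float" : "_half";
  return name;
}

int main() {
  // Expected under the assumptions above: add_texture3d_half
  std::cout << pick_add_shader_name(Storage::Texture3D, Dtype::Half) << "\n";
  return 0;
}
```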
- -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 300; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - int64_t group_size; // Number of input channels per quantization group - bool has_bias = false; - std::string test_case_name = "placeholder"; - std::string op_name = "linear_q4gsw"; -}; - -// Helper function to unpack 4-bit values from uint8 -std::pair unpack_4bit(uint8_t packed) { - // Extract lower 4 bits and upper 4 bits - int8_t lower = packed & 0x0F; - int8_t upper = (packed >> 4) & 0x0F; - - // Subtract 8 from unpacked 4-bit values - lower -= 8; - upper -= 8; - - return std::make_pair(lower, upper); -} - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk." + config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - // For 4-bit weights, packed size is [N, K/2] since 2 weights per byte - std::vector weight_size = {config.N, config.K / 2}; - // Quantized weight tensor (uint8, packed 4-bit) - [N, K/2] - ValueSpec quantized_weight( - weight_size, - vkapi::kByte, // uint8 for packed 4-bit quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT4); - quantized_weight.set_constant(true); - quantized_weight.set_int4(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-group) - // For group symmetric quantization: [K/group_size, N] - // Each group of input features has scales for all output features - std::vector weight_scales_size = { - config.K / config.group_size, config.N}; - ValueSpec weight_scales( - weight_scales_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - // Group size parameter - ValueSpec group_size_spec(static_cast(config.group_size)); - - // Bias (optional, float/half) - [N] - ValueSpec bias( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - bias.set_constant(true); - if (!config.has_bias) { - bias.set_none(true); - } - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case for linear_q4gsw - 
test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(group_size_spec); - test_case.add_input_spec(bias); - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 4; - int K = 32; - int N = 16; - int group_size = 8; - - LinearConfig config = { - M, // Batch size - K, // Input features - N, // Output features - group_size, // Group size - true, // has_bias - "simple", // test_case_name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - // Gemv test cases - {1, 128, 64, 32}, - {1, 256, 128, 64}, - // Gemm - {4, 64, 32, 16}, - {4, 128, 64, 32}, - {4, 256, 128, 64}, - {32, 64, 32, 16}, - {32, 128, 64, 32}, - {32, 256, 128, 64}, - // No bias tests - {32, 128, 64, 32, false}, - {32, 256, 128, 64, false}, - // Performance test cases - {1, 2048, 2048, 128}, - {128, 2048, 2048, 128}, - {256, 2048, 2048, 128}, - {1024, 2048, 2048, 128}, - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - for (auto config : configs) { - std::string prefix = - (config.M < kRefDimSizeLimit && config.K < kRefDimSizeLimit && - config.N < kRefDimSizeLimit) - ? 
"correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + std::to_string(config.M) + - "_" + std::to_string(config.K) + "_" + std::to_string(config.N) + "_g" + - std::to_string(config.group_size); - if (!config.has_bias) { - generated_test_case_name += "_no_bias"; - } - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for 4-bit group symmetric weight quantized linear -void linear_q4gsw_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& group_size_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [in_features, out_features/2] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - int64_t group_size = group_size_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - auto& weight_data = weight_spec.get_uint8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value - int64_t input_idx = b * in_features + in_f; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize (4-bit group symmetric quantization) - int64_t group_idx = in_f / group_size; - int64_t scales_idx = group_idx * out_features + out_f; - - // Get packed weight value - weight matrix is [N, K/2] - int64_t weight_idx = (out_f) * (in_features / 2) + (in_f / 2); - uint8_t packed_weight = weight_data[weight_idx]; - - // Unpack 4-bit weight - auto unpacked = unpack_4bit(packed_weight); - int8_t weight_4bit = (in_f % 2 == 0) ? 
unpacked.first : unpacked.second; - - // Dequantize weight using group symmetric quantization (no zero point) - float weight_scale = weight_scales_data[scales_idx]; - float dequant_weight = static_cast(weight_4bit) * weight_scale; - - sum += input_val * dequant_weight; - } - - // Add bias and store result - if (!bias_spec.is_none()) { - sum += bias_data[out_f]; - } - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -void reference_impl(TestCase& test_case) { - linear_q4gsw_reference_impl(test_case); -} - -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Unpack 4-bit weight: 1 op per weight element used - // - Dequantize weight: 1 op per weight element used - // - Add bias: 1 op per output element - int64_t quantization_ops = ops_per_output * 2 + 1; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout - << "4-bit Group Symmetric Weight Quantized Linear Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - // Execute easy test cases using the new framework with custom FLOP calculator - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinearQ4GSW", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp b/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp deleted file mode 100644 index d566e5b2646..00000000000 --- a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp +++ /dev/null @@ -1,785 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
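Two details carry the q4gsw_linear reference above: each packed weight byte holds two 4-bit values stored with a +8 offset (the low nibble is used for even in_f, the high nibble for odd in_f), and dequantization is symmetric, so a weight is simply the decoded value times the scale of its (in_f / group_size, out_f) entry. A self-contained sketch of that arithmetic with made-up values:

```
#include <cstdint>
#include <cstdio>
#include <utility>

// Split a packed byte into two signed 4-bit weights. Values are stored with a
// +8 offset, so the decoded range is [-8, 7]; low nibble first.
std::pair<int8_t, int8_t> unpack_4bit(uint8_t packed) {
  int8_t lower = int8_t(packed & 0x0F) - 8;
  int8_t upper = int8_t((packed >> 4) & 0x0F) - 8;
  return {lower, upper};
}

int main() {
  // 0xA3: low nibble 0x3 -> 3 - 8 = -5, high nibble 0xA -> 10 - 8 = 2.
  auto [w0, w1] = unpack_4bit(0xA3);

  // Group-symmetric dequantization: one scale per (K/group_size, N) entry and
  // no zero point. The scale here is just an illustrative constant.
  float scale = 0.05f;
  std::printf("w0 = %d -> %f\n", int(w0), w0 * scale);
  std::printf("w1 = %d -> %f\n", int(w1), w1 * scale);
  return 0;
}
```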
- -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 100; - -// Component structs for better readability -struct KernelSize { - int32_t h; - int32_t w; - - KernelSize(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Stride { - int32_t h; - int32_t w; - - Stride(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Padding { - int32_t h; - int32_t w; - - Padding(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Dilation { - int32_t h; - int32_t w; - - Dilation(int32_t height = 1, int32_t width = 1) : h(height), w(width) {} -}; - -struct OutInChannels { - int32_t out; - int32_t in; - - OutInChannels(int32_t out_channels, int32_t in_channels) - : out(out_channels), in(in_channels) {} -}; - -struct InputSize2D { - int32_t h; - int32_t w; - - InputSize2D(int32_t height, int32_t width) : h(height), w(width) {} -}; - -// Conv2d configuration struct -struct Conv2dConfig { - OutInChannels channels; - InputSize2D input_size; - KernelSize kernel; - Stride stride; - Padding padding; - Dilation dilation; - int32_t groups; // Number of groups for grouped convolution - std::string test_case_name = "placeholder"; - std::string op_name = "conv2d_q8ta_q8csw"; - - // Calculate output dimensions - int64_t get_output_height() const { - return (input_size.h + 2 * padding.h - dilation.h * (kernel.h - 1) - 1) / - stride.h + - 1; - } - - int64_t get_output_width() const { - return (input_size.w + 2 * padding.w - dilation.w * (kernel.w - 1) - 1) / - stride.w + - 1; - } -}; - -// Utility function to create a test case from a Conv2dConfig -TestCase create_test_case_from_config( - const Conv2dConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Calculate output dimensions - int64_t H_out = config.get_output_height(); - int64_t W_out = config.get_output_width(); - - // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1) - std::vector input_size = { - 1, config.channels.in, config.input_size.h, config.input_size.w}; - - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kChannelsPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - float input_scale_val = 0.07f; - ValueSpec input_scale(input_scale_val); - - int32_t input_zero_point_val = -3; - ValueSpec input_zero_point(input_zero_point_val); - - // Quantized weight tensor (int8) - [C_out, C_in_per_group * K_h * K_w] - // Memory layout: height, width, then channels - in_c is innermost (stride 1) - // in the second dimension - const int64_t in_channels_per_group = config.channels.in / config.groups; - const int64_t in_features = utils::align_up_4( - in_channels_per_group * config.kernel.h * config.kernel.w); - std::vector weight_size = {config.channels.out, in_features}; - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT8); - quantized_weight.set_constant(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-channel) - ValueSpec weight_scales( - {config.channels.out}, // Per output channel - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - ValueSpec weight_sums( - {config.channels.out}, // Per output channel - vkapi::kInt, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_sums.set_constant(true); - - // Compute weight_sums data based on quantized weights - compute_weight_sums( - weight_sums, quantized_weight, config.channels.out, in_features); - - // Bias (optional, float/half) - [C_out] - ValueSpec bias( - {config.channels.out}, // Per output channel - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - bias.set_constant(true); - - // Stride and padding parameters - ValueSpec stride({config.stride.h, config.stride.w}); - ValueSpec padding({config.padding.h, config.padding.w}); - - // Dilation and groups parameters - ValueSpec dilation({config.dilation.h, config.dilation.w}); - ValueSpec groups(config.groups); - - // Kernel size parameters - ValueSpec kernel_size({config.kernel.h, config.kernel.w}); - - // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1) - ValueSpec output( - {1, config.channels.out, H_out, W_out}, - input_dtype, - storage_type, - utils::kChannelsPacked, - DataGenType::ZEROS); - - // Add all specs to test case - if (config.op_name.find("q8ta") != std::string::npos) { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(input_scale); - test_case.add_input_spec(input_zero_point); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_sums); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_input_spec(kernel_size); - test_case.add_input_spec(stride); - test_case.add_input_spec(padding); - test_case.add_input_spec(dilation); - test_case.add_input_spec(groups); - } else { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - 
test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_input_spec(kernel_size); - test_case.add_input_spec(stride); - test_case.add_input_spec(padding); - test_case.add_input_spec(dilation); - test_case.add_input_spec(groups); - } - - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized conv2d operation (for debugging) -std::vector generate_quantized_conv2d_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - Conv2dConfig config = { - OutInChannels(8, 3), // channels (out, in) - InputSize2D(8, 8), // input_size (h, w) - KernelSize(3, 3), // kernel - Stride(1, 1), // stride - Padding(0, 0), // padding - Dilation(1, 1), // dilation - 1, // groups - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = {utils::kTexture3D}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized conv2d operation -std::vector generate_quantized_conv2d_test_cases() { - std::vector test_cases; - - std::vector configs = { - {OutInChannels(32, 3), - InputSize2D(64, 64), - KernelSize(3, 3), - Stride(2, 2), - Padding(1, 1), - Dilation(1, 1), - 1}, - {OutInChannels(32, 16), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}, - {OutInChannels(64, 32), - InputSize2D(16, 16), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}, - // One output channel case - {OutInChannels(1, 32), - InputSize2D(55, 55), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}, - - // Stride 2 convolutions - {OutInChannels(32, 3), - InputSize2D(64, 64), - KernelSize(3, 3), - Stride(2, 2), - Padding(1, 1), - Dilation(1, 1), - 1}, - {OutInChannels(64, 32), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(2, 2), - Padding(1, 1), - Dilation(1, 1), - 1}, - // Different kernel sizes - {OutInChannels(32, 16), - InputSize2D(28, 28), - KernelSize(5, 5), - Stride(1, 1), - Padding(2, 2), - Dilation(1, 1), - 1}, - {OutInChannels(64, 32), - InputSize2D(14, 14), - KernelSize(7, 7), - Stride(1, 1), - Padding(3, 3), - Dilation(1, 1), - 1}, - - // Dilated convolutions - {OutInChannels(32, 16), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(1, 1), - Padding(2, 2), - Dilation(2, 2), - 1}, - {OutInChannels(64, 32), - InputSize2D(16, 16), - KernelSize(3, 3), - Stride(1, 1), - Padding(3, 3), - Dilation(3, 3), - 1}, - - // Grouped convolutions - {OutInChannels(32, 32), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 4}, - {OutInChannels(64, 64), - InputSize2D(16, 16), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 8}, - // Performance test cases - {OutInChannels(256, 128), - InputSize2D(128, 128), - KernelSize(1, 1), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 8}, - {OutInChannels(128, 64), - InputSize2D(128, 128), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}}; - - // Test with different storage types and data types - std::vector storage_types = {utils::kTexture3D}; - - // Generate test cases for each combination - for (auto& config : configs) { - for (const auto& storage_type : 
storage_types) { - // Generate test case name programmatically - bool is_performance = config.channels.out > kRefDimSizeLimit || - config.channels.in > kRefDimSizeLimit || - config.input_size.h > kRefDimSizeLimit || - config.input_size.w > kRefDimSizeLimit; - std::string prefix = is_performance ? "performance_" : "correctness_"; - std::string suffix = std::to_string(config.channels.out) + "/" + - std::to_string(config.channels.in) + "_" + - std::to_string(config.input_size.h) + "/" + - std::to_string(config.input_size.w) + "_" + - std::to_string(config.kernel.h) + "/" + - std::to_string(config.kernel.w); - - config.test_case_name = prefix + suffix; - // The default operator tested is activation + weight quantized conv2d; - // however, only test this if the int8 dot product extension is supported - if (vkcompute::api::context() - ->adapter_ptr() - ->supports_int8_dot_product()) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - - Conv2dConfig wo_quant_config = config; - wo_quant_config.op_name = "conv2d_q8csw"; - test_cases.push_back(create_test_case_from_config( - wo_quant_config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for weight only quantized conv2d (fp accumulation) -void conv2d_q8csw_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; - const ValueSpec& stride_spec = test_case.inputs()[idx++]; - const ValueSpec& padding_spec = test_case.inputs()[idx++]; - const ValueSpec& dilation_spec = test_case.inputs()[idx++]; - const ValueSpec& groups_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w] - auto output_sizes = - output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] - - int64_t N = input_sizes[0]; - int64_t C_in = input_sizes[1]; - int64_t H_in = input_sizes[2]; - int64_t W_in = input_sizes[3]; - int64_t C_out = output_sizes[1]; - int64_t H_out = output_sizes[2]; - int64_t W_out = output_sizes[3]; - - // Get kernel dimensions from kernel_size ValueSpec - auto kernel_size_data = kernel_size_spec.get_int32_data(); - int64_t K_h = kernel_size_data[0]; - int64_t K_w = kernel_size_data[1]; - - // Get stride, padding, dilation, and groups - auto stride_data = stride_spec.get_int32_data(); - auto padding_data = padding_spec.get_int32_data(); - auto dilation_data = dilation_spec.get_int32_data(); - int64_t stride_h = stride_data[0]; - int64_t stride_w = stride_data[1]; - int64_t pad_h = padding_data[0]; - int64_t pad_w = padding_data[1]; - int64_t dilation_h = dilation_data[0]; - int64_t dilation_w = dilation_data[1]; - int64_t groups = groups_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || - H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || - C_out > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions exceed the allowed limit for reference 
implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate channels per group for grouped convolution - int64_t C_in_per_group = C_in / groups; - int64_t C_out_per_group = C_out / groups; - - // Calculate number of output elements - int64_t num_output_elements = N * C_out * H_out * W_out; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w); - - // Perform weight-only quantized conv2d operation (fp accumulation) - for (int64_t n = 0; n < N; ++n) { - for (int64_t out_c = 0; out_c < C_out; ++out_c) { - for (int64_t out_h = 0; out_h < H_out; ++out_h) { - for (int64_t out_w = 0; out_w < W_out; ++out_w) { - float sum = 0.0f; - - // Determine which group this output channel belongs to - int64_t group_idx = out_c / C_out_per_group; - int64_t in_c_start = group_idx * C_in_per_group; - int64_t in_c_end = (group_idx + 1) * C_in_per_group; - - // Convolution operation with dilation support and grouped convolution - for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) { - for (int64_t kh = 0; kh < K_h; ++kh) { - for (int64_t kw = 0; kw < K_w; ++kw) { - // Calculate input position with dilation - int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; - int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; - - // Check bounds (zero padding) - if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { - // Get input value (keep as float) - int64_t input_idx = n * (C_in * H_in * W_in) + - in_c * (H_in * W_in) + in_h * W_in + in_w; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize - // Weight layout: [C_out, C_in_per_group * K_h * K_w] - int64_t weight_idx = out_c * in_features + - (kh * (K_w * C_in_per_group) + kw * C_in_per_group + - (in_c % C_in_per_group)); - float dequant_weight = - (static_cast(weight_data[weight_idx])) * - weight_scales_data[out_c]; - - sum += input_val * dequant_weight; - } - } - } - } - - // Add bias and store result - sum += bias_data[out_c]; - int64_t output_idx = n * (C_out * H_out * W_out) + - out_c * (H_out * W_out) + out_h * W_out + out_w; - ref_data[output_idx] = sum; - } - } - } - } -} - -// Reference implementation for activation and weight quantized conv2d (int -// accumulation) -void conv2d_q8ta_q8csw_reference_impl(TestCase& test_case) { - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; - const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; - (void)weight_sums_spec; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; - const ValueSpec& stride_spec = test_case.inputs()[idx++]; - const ValueSpec& padding_spec = test_case.inputs()[idx++]; - const ValueSpec& dilation_spec = test_case.inputs()[idx++]; - const ValueSpec& groups_spec = test_case.inputs()[idx++]; - - // Extract output specification 
(mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w] - auto output_sizes = - output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] - - int64_t N = input_sizes[0]; - int64_t C_in = input_sizes[1]; - int64_t H_in = input_sizes[2]; - int64_t W_in = input_sizes[3]; - int64_t C_out = output_sizes[1]; - int64_t H_out = output_sizes[2]; - int64_t W_out = output_sizes[3]; - - // Get kernel dimensions from kernel_size ValueSpec - auto kernel_size_data = kernel_size_spec.get_int32_data(); - int64_t K_h = kernel_size_data[0]; - int64_t K_w = kernel_size_data[1]; - - // Get stride, padding, dilation, and groups - auto stride_data = stride_spec.get_int32_data(); - auto padding_data = padding_spec.get_int32_data(); - auto dilation_data = dilation_spec.get_int32_data(); - int64_t stride_h = stride_data[0]; - int64_t stride_w = stride_data[1]; - int64_t pad_h = padding_data[0]; - int64_t pad_w = padding_data[1]; - int64_t dilation_h = dilation_data[0]; - int64_t dilation_w = dilation_data[1]; - int64_t groups = groups_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || - H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || - C_out > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - const float input_scale = input_scale_spec.get_float_value(); - const int32_t input_zero_point = input_zeros_spec.get_int_value(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate channels per group for grouped convolution - int64_t C_in_per_group = C_in / groups; - int64_t C_out_per_group = C_out / groups; - - // Calculate number of output elements - int64_t num_output_elements = N * C_out * H_out * W_out; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w); - - // Perform activation and weight quantized conv2d operation (int accumulation) - for (int64_t n = 0; n < N; ++n) { - for (int64_t out_c = 0; out_c < C_out; ++out_c) { - for (int64_t out_h = 0; out_h < H_out; ++out_h) { - for (int64_t out_w = 0; out_w < W_out; ++out_w) { - int32_t int_sum = 0; - int32_t weight_sum = 0; // Track weight sum on the fly - - // Determine which group this output channel belongs to - int64_t group_idx = out_c / C_out_per_group; - int64_t in_c_start = group_idx * C_in_per_group; - int64_t in_c_end = (group_idx + 1) * C_in_per_group; - - // Convolution operation with integer accumulation - for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) { - for (int64_t kh = 0; kh < K_h; ++kh) { - for (int64_t kw = 0; kw < K_w; ++kw) { - // Calculate input position with dilation - int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; - int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; - - // Check bounds (zero padding) - if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { - // Get input value and 
quantize to int8 - int64_t input_idx = n * (C_in * H_in * W_in) + - in_c * (H_in * W_in) + in_h * W_in + in_w; - - float quant_input_f = - std::round(input_data[input_idx] / input_scale) + - input_zero_point; - quant_input_f = - std::min(std::max(quant_input_f, -128.0f), 127.0f); - int8_t quantized_input = static_cast(quant_input_f); - - // Get quantized weight (already int8) - // Weight layout: [C_out, C_in_per_group * K_h * K_w] - int64_t weight_idx = out_c * in_features + - (kh * (K_w * C_in_per_group) + kw * C_in_per_group + - (in_c % C_in_per_group)); - int8_t quantized_weight = weight_data[weight_idx]; - - // Integer multiplication and accumulation - int_sum += static_cast(quantized_input) * - static_cast(quantized_weight); - - // Track weight sum for this output channel on the fly - weight_sum += static_cast(quantized_weight); - } else { - // For zero padding, we still need to account for the weight - // in weight_sum when input is effectively 0 (but quantized 0 - // is input_zero_point) - int64_t weight_idx = out_c * in_features + - (kh * (K_w * C_in_per_group) + kw * C_in_per_group + - (in_c % C_in_per_group)); - int8_t quantized_weight = weight_data[weight_idx]; - - // Add contribution from zero-padded input (quantized zero = - // input_zero_point) - int_sum += static_cast(input_zero_point) * - static_cast(quantized_weight); - - // Track weight sum for this output channel on the fly - weight_sum += static_cast(quantized_weight); - } - } - } - } - - // Convert accumulated integer result to float and apply scales - // Final result = (int_sum - zero_point_correction) * input_scale * - // weight_scale + bias zero_point_correction = input_zero_point * - // sum_of_weights_for_this_output_channel - int32_t zero_point_correction = input_zero_point * weight_sum; - int32_t accum_adjusted = int_sum - zero_point_correction; - float float_result = - accum_adjusted * input_scale * weight_scales_data[out_c]; - - // Add bias and store result - float_result += bias_data[out_c]; - int64_t output_idx = n * (C_out * H_out * W_out) + - out_c * (H_out * W_out) + out_h * W_out + out_w; - ref_data[output_idx] = float_result; - } - } - } - } -} - -void reference_impl(TestCase& test_case) { - if (test_case.operator_name().find("q8ta") != std::string::npos) { - conv2d_q8ta_q8csw_reference_impl(test_case); - } else { - conv2d_q8csw_reference_impl(test_case); - } -} - -// Custom FLOP calculator for quantized conv2d operation -int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { - int kernel_idx = 4; - if (test_case.operator_name().find("q8ta") != std::string::npos) { - kernel_idx = 7; - } - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data(); - - int64_t N = input_sizes[0]; - int64_t C_in = input_sizes[1]; - int64_t C_out = output_sizes[1]; - int64_t K_h = kernel_sizes[0]; - int64_t K_w = kernel_sizes[1]; - int64_t H_out = output_sizes[2]; - int64_t W_out = output_sizes[3]; - - // Calculate FLOPs for quantized conv2d operation - // Each output element requires: - // - C_in * K_h * K_w multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = N * C_out * H_out * W_out; - int64_t ops_per_output = C_in * K_h * K_w; - - int64_t flop = output_elements * (ops_per_output); - - return flop; -} - -int main(int argc, char* argv[]) { - 
set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Quantized Conv2d Operation Prototyping Framework" << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - // Execute test cases using the new framework with custom FLOP calculator - auto results = execute_test_cases( - generate_quantized_conv2d_test_cases, - quantized_conv2d_flop_calculator, - "QuantizedConv2d", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/q8csw_linear.cpp b/backends/vulkan/test/custom_ops/q8csw_linear.cpp deleted file mode 100644 index 23973426fcc..00000000000 --- a/backends/vulkan/test/custom_ops/q8csw_linear.cpp +++ /dev/null @@ -1,479 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 300; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - bool has_bias = true; - std::string test_case_name = "placeholder"; - std::string op_name = "linear_q8ta_q8csw"; -}; - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - std::vector weight_size = {config.N, config.K}; - - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - float input_scale_val = 0.008f; - ValueSpec input_scale(input_scale_val); - - int32_t input_zero_point_val = -2; - ValueSpec input_zero_point(input_zero_point_val); - - // Quantized weight tensor (int8) - [K, N] - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT8); - quantized_weight.set_constant(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-channel) - ValueSpec weight_scales( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - ValueSpec weight_sums( - {config.N}, // Per output features - vkapi::kInt, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_sums.set_constant(true); - - // Compute weight_sums data based on quantized weights - int64_t in_features = config.K; - int64_t out_features = config.N; - compute_weight_sums(weight_sums, quantized_weight, out_features, in_features); - - // Bias (optional, float/half) - [N] - ValueSpec bias( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - bias.set_constant(true); - if (!config.has_bias) { - bias.set_none(true); - } - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - if (config.op_name.find("q8ta") != std::string::npos) { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(input_scale); - test_case.add_input_spec(input_zero_point); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_sums); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_output_spec(output); - } else { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_output_spec(output); - } - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 4; - int K = 4; - int N = 4; - - LinearConfig config = { - M, // Batch size - K, // Input features - N, // Output features - true, // has_bias - "simple", // test_case_name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear 
operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - {4, 64, 32}, - {4, 128, 64}, - {4, 256, 128}, - {32, 64, 32}, - {32, 128, 64}, - {32, 256, 128}, - // No bias tests - {32, 128, 64, false}, - {32, 256, 128, false}, - {256, 2048, 2048}, - {512, 2048, 2048}, - {1024, 2048, 2048}, - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - for (auto config : configs) { - std::string prefix = - (config.M < kRefDimSizeLimit && config.K < kRefDimSizeLimit && - config.N < kRefDimSizeLimit) - ? "correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + std::to_string(config.M) + - "_" + std::to_string(config.K) + "_" + std::to_string(config.N); - if (!config.has_bias) { - generated_test_case_name += "_no_bias"; - } - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - if (vkcompute::api::context() - ->adapter_ptr() - ->supports_int8_dot_product()) { - // Test both activation+weight quantized and weight only quantized - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - - LinearConfig wo_quant_config = config; - wo_quant_config.op_name = "linear_q8csw"; - test_cases.push_back(create_test_case_from_config( - wo_quant_config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for weight only quantized linear -void linear_q8csw_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [out_features, in_features] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = weight_sizes[0]; - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for 
(int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value and dequantize - int64_t input_idx = b * in_features + in_f; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize - int64_t weight_idx = out_f * in_features + in_f; - float dequant_weight = (static_cast(weight_data[weight_idx])) * - weight_scales_data[out_f]; - - sum += input_val * dequant_weight; - } - - // Add bias and store result - if (!bias_spec.is_none()) { - sum += bias_data[out_f]; - } - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -void linear_q8ta_q8csw_reference_impl(TestCase& test_case) { - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; - const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; - (void)weight_sums_spec; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [out_features, in_features] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = weight_sizes[0]; - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - const float input_scale = input_scale_spec.get_float_value(); - const int32_t input_zero_point = input_zeros_spec.get_int_value(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) with - // integer accumulation - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - int32_t int_sum = 0; - int32_t weight_sum = 0; // Track weight sum on the fly - - // Matrix multiplication with integer accumulation: - // int_sum = sum(quantized_input[b][in_f] * quantized_weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value and quantize to int8 - int64_t input_idx = b * in_features + in_f; - - float quant_input_f = - std::round(input_data[input_idx] / input_scale) + input_zero_point; - quant_input_f = std::min(std::max(quant_input_f, -128.0f), 127.0f); - int8_t quantized_input = static_cast(quant_input_f); - - // Get quantized weight (already int8) - int64_t weight_idx = 
out_f * in_features + in_f; - int8_t quantized_weight = weight_data[weight_idx]; - - // Integer multiplication and accumulation - int_sum += static_cast(quantized_input) * - static_cast(quantized_weight); - - // Track weight sum for this output channel on the fly - weight_sum += static_cast(quantized_weight); - } - - // Convert accumulated integer result to float and apply scales - // Final result = (int_sum - zero_point_correction) * input_scale * - // weight_scale + bias zero_point_correction = input_zero_point * - // sum_of_weights_for_this_output_channel - int32_t zero_point_correction = input_zero_point * weight_sum; - int32_t accum_adjusted = int_sum - zero_point_correction; - - float float_result = - accum_adjusted * input_scale * weight_scales_data[out_f]; - - // Add bias and store result - if (!bias_spec.is_none()) { - float_result += bias_data[out_f]; - } - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = float_result; - } - } -} - -void reference_impl(TestCase& test_case) { - if (test_case.operator_name().find("q8ta") != std::string::npos) { - linear_q8ta_q8csw_reference_impl(test_case); - } else { - linear_q8csw_reference_impl(test_case); - } -} - -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - int input_idx = 0; - int weight_idx = 1; - if (test_case.operator_name().find("q8ta") != std::string::npos) { - input_idx = 0; - weight_idx = 3; - } - - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[input_idx].get_tensor_sizes(); - const auto& weight_sizes = test_case.inputs()[weight_idx].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = weight_sizes[0]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Dequantize input: 1 op per input element used - // - Dequantize weight: 1 op per weight element used - // - Add bias: 1 op per output element - int64_t quantization_ops = ops_per_output + 1; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Quantized Linear Operation Prototyping Framework" << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinear", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/quantized_int4_linear.cpp b/backends/vulkan/test/custom_ops/quantized_int4_linear.cpp deleted file mode 100644 index c125ce2d09c..00000000000 --- a/backends/vulkan/test/custom_ops/quantized_int4_linear.cpp +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
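Both reference paths above compute the same dequantized matmul: the weight-only path (`linear_q8csw`) scales each int8 weight by its per-output-channel scale and accumulates in float, while the activation+weight path (`q8ta_q8csw`) accumulates int8 products in int32 and removes the input zero point afterwards through the per-channel weight sum. A minimal, self-contained sketch of both for a single output element follows; the variable names are illustrative and not taken from the deleted file.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Weight-only channel-wise symmetric (q8csw): dequantize weights, accumulate in float.
float q8csw_out_element(
    const std::vector<float>& x,     // [K] float activations
    const std::vector<int8_t>& w_q,  // [K] int8 weights for one output channel
    float w_scale,                   // per-output-channel weight scale
    float bias) {
  float sum = 0.0f;
  for (size_t k = 0; k < x.size(); ++k) {
    sum += x[k] * static_cast<float>(w_q[k]) * w_scale;
  }
  return sum + bias;
}

// Activation + weight quantized (q8ta_q8csw): integer accumulation, then
//   result = (int_sum - z_x * sum_k w_q[k]) * x_scale * w_scale + bias
// which equals sum_k (q_x[k] - z_x) * x_scale * w_q[k] * w_scale + bias.
float q8ta_q8csw_out_element(
    const std::vector<float>& x,
    float x_scale,
    int32_t x_zero_point,
    const std::vector<int8_t>& w_q,
    float w_scale,
    float bias) {
  int32_t int_sum = 0;
  int32_t weight_sum = 0;
  for (size_t k = 0; k < x.size(); ++k) {
    // Quantize the activation to the int8 range, as the reference does.
    float q = std::round(x[k] / x_scale) + static_cast<float>(x_zero_point);
    q = std::min(std::max(q, -128.0f), 127.0f);
    int_sum += static_cast<int32_t>(q) * static_cast<int32_t>(w_q[k]);
    weight_sum += static_cast<int32_t>(w_q[k]);
  }
  const int32_t corrected = int_sum - x_zero_point * weight_sum;
  return static_cast<float>(corrected) * x_scale * w_scale + bias;
}
```

With the simplified FLOP model in the calculator above (roughly `M * N * (2K + 1)` once the per-element quantization overhead is folded in), the `{256, 2048, 2048}` performance case works out to about 2.15 GFLOP per invocation.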
- -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - int64_t group_size; // Number of input channels per quantization group - std::string name_suffix; - std::string shader_variant_name = "default"; -}; - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = "QuantizedLinearInt4_" + config.name_suffix + "_" + - storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk.linear_weight_int4.default"; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - std::vector weight_size = { - config.N, config.K / 2}; // Packed 4-bit weights - - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ONES); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - // Quantized weight tensor (int8, packed 4-bit) - [N, K/2] - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for packed 4-bit quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::ONES); - quantized_weight.set_constant(true); - quantized_weight.set_int4(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Group size parameter - ValueSpec group_size_spec(static_cast(config.group_size)); - - // Weight quantization scales and zeros (float/half, per-group) - - // [K/group_size, N, 2] - std::vector scales_and_zeros_size = { - config.K / config.group_size, config.N, 2}; - ValueSpec scales_and_zeros( - scales_and_zeros_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ONES); - scales_and_zeros.set_constant(true); - - if (debugging()) { - print_valuespec_data(scales_and_zeros, "scales_and_zeros"); - } - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(group_size_spec); - test_case.add_input_spec(scales_and_zeros); - // Add dummy value for inner_k_tiles (unused but required by operator - // signature) - ValueSpec dummy_inner_k_tiles(static_cast(8)); - test_case.add_input_spec(dummy_inner_k_tiles); - - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 8; - int K = 16; - int N = 16; - int group_size = 8; - - LinearConfig config = { - M, // Batch size - 
K, // Input features - N, // Output features - group_size, // Group size - "simple", // descriptive name - "default" // shader variant name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - {8, 64, 32, 8, "correctness_8_64_32_g8"}, - {8, 128, 64, 16, "correctness_8_128_64_g16"}, - {8, 256, 128, 32, "correctness_8_256_128_g32"}, - {32, 64, 32, 8, "correctness_32_64_32_g8"}, - {32, 128, 64, 16, "correctness_32_128_64_g16"}, - {32, 256, 128, 32, "correctness_32_256_128_g32"}, - {1, 256, 128, 32, "correctness_32_256_128_g32"}, - // Performance test cases - {1, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - {128, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - {248, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - {1024, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - // {16384, 576, 128, 32, "performance_16384_576_128_g32"} - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - // Generate test cases for each combination - for (const auto& config : configs) { - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Helper function to unpack 4-bit values from int8 -std::pair unpack_4bit(int8_t packed) { - // Extract lower 4 bits and upper 4 bits - int8_t lower = packed & 0x0F; - int8_t upper = (packed >> 4) & 0x0F; - - // Sign extend from 4-bit to 8-bit - if (lower & 0x08) - lower |= 0xF0; - if (upper & 0x08) - upper |= 0xF0; - - return std::make_pair(lower, upper); -} - -// Reference implementation for quantized linear operation -void quantized_linear_reference_impl(TestCase& test_case) { - static constexpr int64_t kRefDimSizeLimit = 300; - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& group_size_spec = test_case.inputs()[idx++]; - const ValueSpec& scales_and_zeros_spec = test_case.inputs()[idx++]; - // Skip dummy inner_k_tiles - idx++; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [out_features, in_features/2] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - int64_t group_size = group_size_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, 
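The 4-bit paths above store two signed 4-bit weights per byte and recover them in `unpack_4bit` with manual sign extension. A small round-trip sketch of that packing scheme follows; the `pack_4bit` helper is hypothetical and added only to illustrate the format.

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Pack two signed 4-bit values (each in [-8, 7]) into one byte:
// first value in the low nibble, second value in the high nibble.
inline int8_t pack_4bit(int8_t lo, int8_t hi) {
  return static_cast<int8_t>((lo & 0x0F) | ((hi & 0x0F) << 4));
}

// Mirror of unpack_4bit from the tests above: extract nibbles, then sign-extend.
inline std::pair<int8_t, int8_t> unpack_4bit_sketch(int8_t packed) {
  int8_t lo = packed & 0x0F;
  int8_t hi = (packed >> 4) & 0x0F;
  if (lo & 0x08) lo |= 0xF0;
  if (hi & 0x08) hi |= 0xF0;
  return {lo, hi};
}

int main() {
  for (int a = -8; a <= 7; ++a) {
    for (int b = -8; b <= 7; ++b) {
      auto [lo, hi] = unpack_4bit_sketch(pack_4bit(a, b));
      assert(lo == a && hi == b);  // the round trip preserves both nibbles
    }
  }
  return 0;
}
```

As a worked example of the resulting shapes, the simple debug configuration above (M=8, K=16, N=16, group_size=8) yields a packed weight tensor of size [16, 8] (two 4-bit values per byte along K) and a scales_and_zeros tensor of size [2, 16, 2].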
in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - auto& weight_data = weight_spec.get_int8_data(); - auto& scales_and_zeros_data = scales_and_zeros_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - bool should_print = b == 0 && out_f == 0; - should_print = false; - - if (should_print) { - std::cout << "Weights seen: "; - } - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value - int64_t input_idx = b * in_features + in_f; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize (4-bit group affine quantization) - int64_t group_idx = in_f / group_size; - int64_t scales_and_zeros_idx = group_idx * out_features * 2 + out_f * 2; - - // Get packed weight value - int64_t weight_idx = out_f * (in_features / 2) + (in_f / 2); - int8_t packed_weight = weight_data[weight_idx]; - - // Unpack 4-bit weight - auto unpacked = unpack_4bit(packed_weight); - int8_t weight_4bit = (in_f % 2 == 0) ? unpacked.first : unpacked.second; - - // Dequantize weight using group affine quantization - float weight_scale = scales_and_zeros_data[scales_and_zeros_idx]; - float weight_zero = scales_and_zeros_data[scales_and_zeros_idx + 1]; - float dequant_weight = - (static_cast(weight_4bit) - 8.0f) * weight_scale + - weight_zero; - - if (should_print) { - std::cout << int(weight_4bit) << ", "; - } - - sum += input_val * dequant_weight; - } - - if (should_print) { - std::cout << std::endl; - } - - // Store result - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -// Custom FLOP calculator for quantized linear operation -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - if (test_case.num_inputs() < 4 || test_case.num_outputs() < 1) { - return 0; - } - - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Dequantize weight: 2 ops per weight element used (unpack + dequantize) - int64_t quantization_ops = ops_per_output * 2; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Quantized 4-bit Int4 Linear 
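In the `linear_weight_int4` reference above, `scales_and_zeros` is a flattened [K/group_size, N, 2] tensor holding (scale, zero) per quantization group and output channel, and each unpacked 4-bit weight is offset by 8 before scaling. Just that per-element dequantization, isolated as an illustrative helper that mirrors the arithmetic of the reference implementation:

```cpp
#include <cstdint>

// Dequantize one unpacked 4-bit weight, as the reference above does:
//   w_dq = (w4 - 8) * scale + zero
// scales_and_zeros is the flattened [K/group_size, N, 2] buffer,
// indexed by quantization group and output channel.
inline float dequant_weight_int4(
    int8_t w4,
    const float* scales_and_zeros,
    int64_t group_idx,
    int64_t out_f,
    int64_t out_features) {
  const int64_t base = group_idx * out_features * 2 + out_f * 2;
  const float scale = scales_and_zeros[base];
  const float zero = scales_and_zeros[base + 1];
  return (static_cast<float>(w4) - 8.0f) * scale + zero;
}
```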
Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = quantized_linear_reference_impl; - - // Execute easy test cases using the new framework with custom FLOP - // calculator - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinearInt4", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/quantized_q4gaw_linear.cpp b/backends/vulkan/test/custom_ops/quantized_q4gaw_linear.cpp deleted file mode 100644 index 084d718b502..00000000000 --- a/backends/vulkan/test/custom_ops/quantized_q4gaw_linear.cpp +++ /dev/null @@ -1,433 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - int64_t group_size; // Number of input channels per quantization group - std::string name_suffix; - std::string shader_variant_name = "default"; -}; - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? 
"Float" : "Half"; - - std::string test_name = "QuantizedLinear4GAW_" + config.name_suffix + "_" + - storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk.linear_q8ta_q4gaw."; - operator_name += config.shader_variant_name; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - std::vector weight_size = { - config.K, config.N / 2}; // Packed 4-bit weights - - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - float input_scale_val = 1.0f; - ValueSpec input_scale(input_scale_val); - - int32_t input_zero_point_val = 0; - ValueSpec input_zero_point(input_zero_point_val); - - // Group size parameter - ValueSpec group_size_spec(static_cast(config.group_size)); - - // Quantized weight tensor (int8, packed 4-bit) - [K, N/2] - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for packed 4-bit quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT4); - quantized_weight.set_constant(true); - quantized_weight.set_int4(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-group) - [N, K/group_size] - std::vector weight_scales_size = { - config.N, config.K / config.group_size}; - ValueSpec weight_scales( - weight_scales_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - if (debugging()) { - print_valuespec_data(weight_scales, "weight_scales"); - } - - // Weight zeros (int32, per-group) - [N, K/group_size] - ValueSpec weight_zeros( - weight_scales_size, - vkapi::kInt, // int32 for zeros - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_zeros.set_constant(true); - - ValueSpec weight_sums( - {config.N}, // Per output features - vkapi::kFloat, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_sums.set_constant(true); - - // Compute weight_sums data based on quantized weights - int64_t in_features = config.K; - int64_t out_features = config.N; - - ValueSpec orig_OC(static_cast(config.N)); - - // Bias (optional, float/half) - [N] - ValueSpec bias( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - bias.set_constant(true); - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(input_scale); - test_case.add_input_spec(input_zero_point); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_sums); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(weight_zeros); - test_case.add_input_spec(orig_OC); - test_case.add_input_spec(group_size_spec); - test_case.add_input_spec(bias); - - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 4; - int K = 32; - int N = 32; - int 
group_size = 8; - - LinearConfig config = { - M, // Batch size - K, // Input features - N, // Output features - group_size, // Group size - "simple", // descriptive name - "noint8" // shader variant name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - {8, 64, 32, 8, "correctness_1_64_32_g8"}, - {8, 128, 64, 16, "correctness_1_128_64_g16"}, - {8, 256, 128, 32, "correctness_1_256_128_g32"}, - {32, 64, 32, 8, "correctness_32_64_32_g8"}, - {32, 128, 64, 16, "correctness_32_128_64_g16"}, - {32, 256, 128, 32, "correctness_32_256_128_g32"}, - {1, 256, 128, 32, "todo"}, - // Performance test cases - {1, 2048, 2048, 128, "todo"}, - {128, 2048, 2048, 128, "performance_128_2048_2048_g64"}, - {248, 2048, 2048, 128, "performance_128_2048_2048_g64"}, - {1024, 2048, 2048, 128, "performance_128_2048_2048_g64"}, - // {16384, 576, 128, 32, "performance_16384_576_128_g32"} - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - // Generate test cases for each combination - for (const auto& config : configs) { - for (const auto& storage_type : storage_types) { - // Test both with and without shader int8 dot product - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - - // LinearConfig no_int_config = config; - // no_int_config.name_suffix = config.name_suffix + "_noint8"; - // no_int_config.shader_variant_name = "noint8"; - - // test_cases.push_back(create_test_case_from_config( - // no_int_config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Helper function to unpack 4-bit values from int8 -std::pair unpack_4bit(int8_t packed) { - // Extract lower 4 bits and upper 4 bits - int8_t lower = packed & 0x0F; - int8_t upper = (packed >> 4) & 0x0F; - - // Sign extend from 4-bit to 8-bit - if (lower & 0x08) - lower |= 0xF0; - if (upper & 0x08) - upper |= 0xF0; - - return std::make_pair(lower, upper); -} - -// Reference implementation for quantized linear operation -void quantized_linear_reference_impl(TestCase& test_case) { - static constexpr int64_t kRefDimSizeLimit = 300; - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; - const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; - (void)weight_sums_spec; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& orig_OC = test_case.inputs()[idx++]; - (void)orig_OC; - const ValueSpec& group_size_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor 
dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [in_features, out_features/2] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - int64_t group_size = group_size_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - const float input_scale = input_scale_spec.get_float_value(); - const int32_t input_zero_point = input_zeros_spec.get_int_value(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& weight_zeros_data = weight_zeros_spec.get_int32_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - bool should_print = b == 0 && out_f == 0; - should_print = false; - - if (should_print) { - std::cout << "Weights seen: "; - } - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value and dequantize - int64_t input_idx = b * in_features + in_f; - - float quant_input = - std::round(input_data[input_idx] / input_scale) + input_zero_point; - quant_input = std::min(std::max(quant_input, -128.0f), 127.0f); - float dequant_input = (quant_input - input_zero_point) * input_scale; - - // Get weight value and dequantize (4-bit group affine quantization) - int64_t group_idx = in_f / group_size; - int64_t scales_idx = group_idx * out_features + out_f; - - // Get packed weight value - int64_t weight_idx = in_f * (out_features / 2) + (out_f / 2); - int8_t packed_weight = weight_data[weight_idx]; - - // Unpack 4-bit weight - auto unpacked = unpack_4bit(packed_weight); - int8_t weight_4bit = - (out_f % 2 == 0) ? 
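Before the 4-bit weight math, the q4gaw reference above pushes the float activation through a quantize→dequantize round trip so that its float accumulation matches what an int8-quantized input would produce. That round trip, isolated as a sketch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Fake-quantize a float activation to int8 and back, as in the reference:
//   q  = clamp(round(x / scale) + zero_point, -128, 127)
//   x' = (q - zero_point) * scale
inline float fake_quantize_int8(float x, float scale, int32_t zero_point) {
  float q = std::round(x / scale) + static_cast<float>(zero_point);
  q = std::min(std::max(q, -128.0f), 127.0f);
  return (q - static_cast<float>(zero_point)) * scale;
}
```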
unpacked.first : unpacked.second; - - // Dequantize weight using group affine quantization - float weight_scale = weight_scales_data[scales_idx]; - int32_t weight_zero = weight_zeros_data[scales_idx]; - float dequant_weight = - (static_cast(weight_4bit) - weight_zero) * weight_scale; - - if (should_print) { - std::cout << int(weight_4bit) << ", "; - } - - sum += dequant_input * dequant_weight; - } - - if (should_print) { - std::cout << std::endl; - } - - // Add bias and store result - sum += bias_data[out_f]; - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -// Custom FLOP calculator for quantized linear operation -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - if (test_case.num_inputs() < 6 || test_case.num_outputs() < 1) { - return 0; - } - - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Dequantize input: 1 op per input element used - // - Dequantize weight: 2 ops per weight element used (unpack + dequantize) - // - Add bias: 1 op per output element - int64_t quantization_ops = ops_per_output * 2 + 1; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout - << "Quantized 4-bit Group Affine Weights Linear Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = quantized_linear_reference_impl; - - // Execute easy test cases using the new framework with custom FLOP - // calculator - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinear4GAW", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl deleted file mode 100644 index 3162857c2d3..00000000000 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ /dev/null @@ -1,99 +0,0 @@ -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "get_platforms", - "vulkan_spv_shader_lib", -) - -def define_custom_op_test_binary(custom_op_name, extra_deps = [], src_file = None): - deps_list = [ - ":prototyping_utils", - ":operator_implementations", - ":custom_ops_shaderlib", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ] + extra_deps - - src_file_str = src_file if src_file else "{}.cpp".format(custom_op_name) - - runtime.cxx_binary( - name = custom_op_name, - srcs = [ - src_file_str, - ], - platforms = get_platforms(), - define_static_target = False, - deps = deps_list, - ) - -def define_common_targets(is_fbcode = False): - if 
is_fbcode: - return - - # Shader library from GLSL files - runtime.filegroup( - name = "custom_ops_shaders", - srcs = native.glob([ - "glsl/*.glsl", - "glsl/*.yaml", - ]), - visibility = [ - "//executorch/backends/vulkan/test/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - vulkan_spv_shader_lib( - name = "custom_ops_shaderlib", - spv_filegroups = { - ":custom_ops_shaders": "glsl", - }, - is_fbcode = is_fbcode, - ) - - # Prototyping utilities library - runtime.cxx_library( - name = "prototyping_utils", - srcs = [ - "utils.cpp", - ], - headers = [ - "utils.h", - ], - exported_headers = [ - "utils.h", - ], - platforms = get_platforms(), - deps = [ - "//executorch/backends/vulkan:vulkan_graph_runtime", - ], - visibility = [ - "//executorch/backends/vulkan/test/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - # Operator implementations library - runtime.cxx_library( - name = "operator_implementations", - srcs = native.glob([ - "impl/*.cpp", - ]), - platforms = get_platforms(), - deps = [ - "//executorch/backends/vulkan:vulkan_graph_runtime", - ":custom_ops_shaderlib", - ], - visibility = [ - "//executorch/backends/vulkan/test/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - link_whole = True, - ) - - define_custom_op_test_binary("add") - define_custom_op_test_binary("q8csw_linear") - define_custom_op_test_binary("q8csw_conv2d") - define_custom_op_test_binary("choose_qparams_per_row") - define_custom_op_test_binary("q4gsw_linear") diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp deleted file mode 100644 index 37e0060b3f2..00000000000 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ /dev/null @@ -1,1717 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "utils.h" -#include -#include -#include -#include - -#include - -namespace executorch { -namespace vulkan { -namespace prototyping { - -int get_seed() { - static int seed = 42; - return seed++; -} - -// Forward declarations for data generation utilities -void generate_random_float_data( - std::vector& data, - float min_val = -1.0f, - float max_val = 1.0f); -void generate_random_int_data( - std::vector& data, - int min_val = -10, - int max_val = 10); -void generate_randint_float_data( - std::vector& data, - int min_val = -10, - int max_val = 10); -void generate_randint_half_data( - std::vector& data, - int min_val = -10, - int max_val = 10); -void generate_random_int8_data( - std::vector& data, - int8_t min_val = -10, - int8_t max_val = 10); -void generate_random_uint8_data( - std::vector& data, - uint8_t min_val = 0, - uint8_t max_val = 255); -void generate_random_2xint4_data(std::vector& data); -void generate_random_2xint4_data(std::vector& data); -void generate_random_int4_data( - std::vector& data, - int8_t min_val = -8, - int8_t max_val = 7); -void generate_ones_data(std::vector& data); -void generate_zeros_data(std::vector& data); - -// Output and latency printing utilities -namespace { -static int print_output_enabled = 0; -static int print_latencies_enabled = 0; -static int gpu_timestamps_enabled = 0; -static int debugging_enabled = 0; -} // namespace - -bool print_output() { - return print_output_enabled > 0; -} - -void set_print_output(bool print_output) { - print_output_enabled = print_output ? 
1 : 0; -} - -bool print_latencies() { - return print_latencies_enabled > 0; -} - -void set_print_latencies(bool print_latencies) { - print_latencies_enabled = print_latencies ? 1 : 0; -} - -bool use_gpu_timestamps() { - return gpu_timestamps_enabled > 0; -} - -void set_use_gpu_timestamps(bool use_timestamps) { - gpu_timestamps_enabled = use_timestamps ? 1 : 0; -} - -bool debugging() { - return debugging_enabled > 0; -} - -void set_debugging(bool enable_debugging) { - debugging_enabled = enable_debugging ? 1 : 0; -} - -// ValueSpec implementation -void ValueSpec::generate_tensor_data() { - if (spec_type != SpecType::Tensor) { - return; - } - - int64_t num_elements = numel(); - - switch (dtype) { - case vkapi::kFloat: { - float_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_float_data(float_data); - } else if (data_gen_type == DataGenType::RANDOM_SCALES) { - generate_random_float_data(float_data, 0.005, 0.015); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_float_data(float_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_randint_float_data(float_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_randint_float_data(float_data, -8, 7); - } else if (data_gen_type == DataGenType::ONES) { - generate_ones_data(float_data); - } else if (data_gen_type == DataGenType::ZEROS) { - generate_zeros_data(float_data); - } else { - generate_zeros_data(float_data); - } - break; - } - case vkapi::kHalf: { - half_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - // Generate random float data first, then convert to half - std::vector temp_data(num_elements); - generate_random_float_data(temp_data); - for (size_t i = 0; i < temp_data.size(); ++i) { - // Simple conversion to uint16_t representation of half - half_data[i] = static_cast(temp_data[i] * 32767.0f); - } - } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_half_data(half_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_randint_half_data(half_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_randint_half_data(half_data, -8, 7); - } else if (data_gen_type == DataGenType::ONES) { - std::fill( - half_data.begin(), - half_data.end(), - static_cast(32767)); // 1.0 in half - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill( - half_data.begin(), - half_data.end(), - static_cast(0)); // 0.0 in half - } else { - std::fill( - half_data.begin(), - half_data.end(), - static_cast(0)); // 0.0 in half - } - break; - } - case vkapi::kInt: { - int32_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_int_data(int32_data); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_random_int_data( - int32_data); // For int type, RANDINT is same as RANDOM - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_int_data(int32_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_int_data(int32_data, -8, 7); - } else if (data_gen_type == DataGenType::ONES) { - std::fill(int32_data.begin(), int32_data.end(), 1); - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill(int32_data.begin(), int32_data.end(), 0); - } else { - std::fill(int32_data.begin(), int32_data.end(), 0); - } - break; - } - case vkapi::kChar: { - int8_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_int8_data(int8_data); - } 
else if (data_gen_type == DataGenType::RANDINT) { - generate_random_int8_data(int8_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_int8_data(int8_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_2xint4_data(int8_data); - } else if (data_gen_type == DataGenType::ONES) { - std::fill(int8_data.begin(), int8_data.end(), 1); - } else if (data_gen_type == DataGenType::ONES_INT4) { - int8_t packed_data = (1 << 4) | 1; - std::fill(int8_data.begin(), int8_data.end(), packed_data); - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill(int8_data.begin(), int8_data.end(), 0); - } else { - std::fill(int8_data.begin(), int8_data.end(), 0); - } - break; - } - case vkapi::kByte: { - uint8_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_uint8_data(uint8_data); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_random_uint8_data(uint8_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_uint8_data(uint8_data, 0, 255); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_2xint4_data(uint8_data); - } else if (data_gen_type == DataGenType::ONES) { - std::fill(uint8_data.begin(), uint8_data.end(), 1); - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill(uint8_data.begin(), uint8_data.end(), 0); - } else { - std::fill(uint8_data.begin(), uint8_data.end(), 0); - } - break; - } - default: - // Default to float - float_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_float_data(float_data); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_float_data(float_data); - } else if (data_gen_type == DataGenType::ONES) { - generate_ones_data(float_data); - } else if (data_gen_type == DataGenType::ZEROS) { - generate_zeros_data(float_data); - } else { - generate_zeros_data(float_data); - } - break; - } -} - -int64_t ValueSpec::numel() const { - if (spec_type == SpecType::Int || spec_type == SpecType::Float || - spec_type == SpecType::Bool) { - return 1; - } else if (spec_type == SpecType::IntList) { - return sizes.empty() ? 0 : sizes[0]; - } else { // Tensor - int64_t total = 1; - for (int64_t size : sizes) { - total *= size; - } - return total; - } -} - -size_t ValueSpec::nbytes() const { - size_t element_size = 0; - switch (dtype) { - case vkapi::kFloat: - element_size = sizeof(float); - break; - case vkapi::kHalf: - element_size = sizeof(uint16_t); - break; - case vkapi::kInt: - element_size = sizeof(int32_t); - break; - case vkapi::kChar: - element_size = sizeof(int8_t); - break; - case vkapi::kByte: - element_size = sizeof(uint8_t); - break; - default: - element_size = sizeof(float); // Default fallback - break; - } - return numel() * element_size; -} - -std::string ValueSpec::to_string() const { - std::string result = "ValueSpec("; - - switch (spec_type) { - case SpecType::Tensor: - result += "type=Tensor, sizes=["; - break; - case SpecType::IntList: - result += "type=IntList, count="; - result += std::to_string(sizes.empty() ? 0 : sizes[0]); - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; - result += ")"; - return result; - case SpecType::Int: - result += "type=Int, value="; - result += std::to_string(get_int_value()); - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? 
"FIXED" : "RANDOM"; - result += ")"; - return result; - case SpecType::Float: - result += "type=Float, value="; - result += std::to_string(get_float_value()); - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; - result += ")"; - return result; - case SpecType::Bool: - result += "type=Bool, value="; - result += get_bool_value() ? "true" : "false"; - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; - result += ")"; - return result; - } - - for (size_t i = 0; i < sizes.size(); ++i) { - result += std::to_string(sizes[i]); - if (i < sizes.size() - 1) - result += ", "; - } - result += "]"; - - if (spec_type == SpecType::Tensor) { - result += ", dtype="; - switch (dtype) { - case vkapi::kFloat: - result += "float"; - break; - case vkapi::kHalf: - result += "half"; - break; - case vkapi::kInt: - result += "int32"; - break; - case vkapi::kChar: - result += "int8"; - break; - case vkapi::kByte: - result += "uint8"; - break; - default: - result += "unknown"; - break; - } - - result += ", memory_layout="; - switch (memory_layout) { - case utils::kWidthPacked: - result += "WidthPacked"; - break; - case utils::kHeightPacked: - result += "HeightPacked"; - break; - case utils::kChannelsPacked: - result += "ChannelsPacked"; - break; - default: - result += "unknown"; - break; - } - - result += ", storage_type="; - switch (storage_type) { - case utils::kTexture3D: - result += "Texture3D"; - break; - case utils::kBuffer: - result += "Buffer"; - break; - default: - result += "unknown"; - break; - } - } - - result += ", data_gen="; - switch (data_gen_type) { - case DataGenType::FIXED: - result += "FIXED"; - break; - case DataGenType::RANDOM: - result += "RANDOM"; - break; - case DataGenType::RANDINT: - result += "RANDINT"; - break; - case DataGenType::RANDINT8: - result += "RANDINT8"; - break; - case DataGenType::RANDINT4: - result += "RANDINT4"; - break; - case DataGenType::ONES: - result += "ONES"; - break; - case DataGenType::ZEROS: - result += "ZEROS"; - break; - default: - result += "unknown"; - break; - } - result += ")"; - return result; -} - -// Additional ValueSpec methods -void ValueSpec::resize_data(size_t new_size) { - switch (dtype) { - case vkapi::kFloat: - float_data.resize(new_size); - break; - case vkapi::kHalf: - half_data.resize(new_size); - break; - case vkapi::kInt: - int32_data.resize(new_size); - break; - case vkapi::kChar: - int8_data.resize(new_size); - break; - case vkapi::kByte: - uint8_data.resize(new_size); - break; - default: - float_data.resize(new_size); - break; - } -} - -void* ValueSpec::get_mutable_data_ptr() { - switch (dtype) { - case vkapi::kFloat: - return float_data.data(); - case vkapi::kHalf: - return half_data.data(); - case vkapi::kInt: - return int32_data.data(); - case vkapi::kChar: - return int8_data.data(); - case vkapi::kByte: - return uint8_data.data(); - default: - return float_data.data(); - } -} - -float ValueSpec::get_element(size_t index) const { - if (index >= static_cast(numel())) { - return 0.0f; - } - - switch (dtype) { - case vkapi::kFloat: - return index < float_data.size() ? float_data[index] : 0.0f; - case vkapi::kHalf: - return index < half_data.size() ? (half_data[index] / 32767.0f) : 0.0f; - case vkapi::kInt: - return index < int32_data.size() ? static_cast(int32_data[index]) - : 0.0f; - case vkapi::kChar: - return index < int8_data.size() ? static_cast(int8_data[index]) - : 0.0f; - case vkapi::kByte: - return index < uint8_data.size() ? 
static_cast(uint8_data[index]) - : 0.0f; - default: - return 0.0f; - } -} - -const void* ValueSpec::get_data_ptr() const { - switch (dtype) { - case vkapi::kFloat: - return float_data.data(); - case vkapi::kHalf: - return half_data.data(); - case vkapi::kInt: - return int32_data.data(); - case vkapi::kChar: - return int8_data.data(); - case vkapi::kByte: - return uint8_data.data(); - default: - throw std::runtime_error("Unsupported data type for get_data_ptr"); - } -} - -void generate_random_float_data( - std::vector& data, - float min_val, - float max_val) { - std::mt19937 gen(get_seed()); - std::uniform_real_distribution dis(min_val, max_val); - for (auto& val : data) { - val = dis(gen); - } -} - -void generate_random_int_data( - std::vector& data, - int min_val, - int max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = dis(gen); - } -} - -void generate_randint_float_data( - std::vector& data, - int min_val, - int max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_randint_half_data( - std::vector& data, - int min_val, - int max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(std::abs(dis(gen)) % 65536); - } -} - -void generate_ones_data(std::vector& data) { - std::fill(data.begin(), data.end(), 1.0f); -} - -void generate_random_int8_data( - std::vector& data, - int8_t min_val, - int8_t max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_random_uint8_data( - std::vector& data, - uint8_t min_val, - uint8_t max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_random_int4_data( - std::vector& data, - int8_t min_val, - int8_t max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_random_2xint4_data(std::vector& data) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(-8, 7); // Signed 4-bit range - for (auto& val : data) { - // Generate two separate 4-bit values - int8_t lower_4bits = static_cast(dis(gen)) & 0x0F; - int8_t upper_4bits = static_cast(dis(gen)) & 0x0F; - // Pack them into a single 8-bit value - val = (upper_4bits << 4) | lower_4bits; - } -} - -void generate_random_2xint4_data(std::vector& data) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(0, 15); // Unsigned 4-bit range - for (auto& val : data) { - // Generate two separate 4-bit values - uint8_t lower_4bits = static_cast(dis(gen)) & 0x0F; - uint8_t upper_4bits = static_cast(dis(gen)) & 0x0F; - // Pack them into a single 8-bit value - val = (upper_4bits << 4) | lower_4bits; - } -} - -void generate_zeros_data(std::vector& data) { - std::fill(data.begin(), data.end(), 0.0f); -} - -// Correctness checking against reference data -bool ValueSpec::validate_against_reference( - float abs_tolerance, - float rel_tolerance) const { - // Only validate float tensors as specified in requirements - if (dtype != vkapi::kFloat || !is_tensor()) { - return true; // Skip validation for non-float or non-tensor types - } - - const auto& 
computed_data = get_float_data(); - const auto& reference_data = get_ref_float_data(); - - // Skip validation if no reference data is available - if (reference_data.empty()) { - return true; - } - - // Check if sizes match - if (computed_data.size() != reference_data.size()) { - if (debugging()) { - std::cout << "Size mismatch: computed=" << computed_data.size() - << ", reference=" << reference_data.size() << std::endl; - } - return false; - } - - // Element-wise comparison with both absolute and relative tolerance - for (size_t i = 0; i < computed_data.size(); ++i) { - float diff = std::abs(computed_data[i] - reference_data[i]); - float abs_ref = std::abs(reference_data[i]); - - // Check if either absolute or relative tolerance condition is satisfied - bool abs_tolerance_ok = diff <= abs_tolerance; - bool rel_tolerance_ok = diff <= rel_tolerance * abs_ref; - - if (!abs_tolerance_ok && !rel_tolerance_ok) { - std::cout << "Mismatch at element " << i - << ": computed=" << computed_data[i] - << ", reference=" << reference_data[i] << ", diff=" << diff - << ", abs_tolerance=" << abs_tolerance - << ", rel_tolerance=" << rel_tolerance - << ", rel_threshold=" << (rel_tolerance * abs_ref) << std::endl; - return false; - } - } - - if (debugging()) { - std::cout << "Correctness validation PASSED" << std::endl; - } - return true; -} - -// Helper function to collect GPU timing from querypool -float collect_gpu_timing_us(ComputeGraph& graph) { - graph.context()->querypool().extract_results(); - const auto results = graph.context()->querypool().get_shader_timestamp_data(); - if (!results.empty()) { - // Sum durations of all shaders that don't contain nchw_to or to_nchw - float total_duration_us = 0.0f; - for (const auto& shader_result : results) { - if (shader_result.kernel_name.find("nchw_to") == std::string::npos && - shader_result.kernel_name.find("to_nchw") == std::string::npos) { - // Calculate duration from start and end times, convert from ns to μs - uint64_t duration_ns = - shader_result.end_time_ns - shader_result.start_time_ns; - total_duration_us += static_cast(duration_ns) / 1000.0f; - } - } - return total_duration_us; - } - return 0.0f; -} - -// BenchmarkResult implementation -void BenchmarkResult::add_iter_timing(float time_us) { - iter_timings.push_back(time_us); -} - -float BenchmarkResult::get_avg_time_us() const { - if (iter_timings.empty()) { - return 0.0f; - } - - float sum = 0.0f; - for (float timing : iter_timings) { - sum += timing; - } - return sum / iter_timings.size(); -} - -float BenchmarkResult::get_min_time_us() const { - if (iter_timings.empty()) { - return 0.0f; - } - - return *std::min_element(iter_timings.begin(), iter_timings.end()); -} - -float BenchmarkResult::get_max_time_us() const { - if (iter_timings.empty()) { - return 0.0f; - } - - return *std::max_element(iter_timings.begin(), iter_timings.end()); -} - -float BenchmarkResult::get_std_dev_us() const { - if (iter_timings.size() <= 1) { - return 0.0f; - } - - float mean = get_avg_time_us(); - float sum_sq_diff = 0.0f; - - for (float timing : iter_timings) { - float diff = timing - mean; - sum_sq_diff += diff * diff; - } - - return std::sqrt(sum_sq_diff / (iter_timings.size() - 1)); -} - -void BenchmarkResult::print_summary( - int case_number, - const std::string& size_info, - float total_gflops) const { - static constexpr int OPERATOR_NAME_WIDTH = 50; - static constexpr int KERNEL_NAME_WIDTH = 70; - static constexpr int SIZE_INFO_WIDTH = 20; - static constexpr int TIMING_WIDTH = 20; - static constexpr int 
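The validation loop above accepts an element if it satisfies either the absolute or the relative tolerance against the reference value. That predicate, pulled out as a standalone sketch:

```cpp
#include <cmath>

// An element passes if it meets EITHER tolerance, mirroring the check in
// ValueSpec::validate_against_reference above.
inline bool within_tolerance(
    float computed, float reference, float abs_tol, float rel_tol) {
  const float diff = std::fabs(computed - reference);
  return diff <= abs_tol || diff <= rel_tol * std::fabs(reference);
}
```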
GFLOPS_WIDTH = 20; - static constexpr int CORRECTNESS_WIDTH = 10; - - std::string correctness_str; - switch (correctness_status_) { - case CorrectnessStatus::SKIPPED: - correctness_str = "SKIPPED"; - break; - case CorrectnessStatus::PASSED: - correctness_str = "PASSED"; - break; - case CorrectnessStatus::FAILED: - correctness_str = "FAILED"; - break; - } - - std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) - << get_operator_name() << " " << std::left - << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() << std::right - << " " << std::setw(SIZE_INFO_WIDTH) << size_info - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) - << std::fixed << std::setprecision(3) << total_gflops << " GFLOP/s " - << std::setw(CORRECTNESS_WIDTH) << correctness_str << std::endl; -} - -// TestResult implementation -void TestResult::add_result(const BenchmarkResult& result) { - results_.push_back(result); -} - -void TestResult::add_result(BenchmarkResult&& result) { - results_.push_back(std::move(result)); -} - -void TestResult::print_summary() const { - static constexpr int CASE_WIDTH = 80; - static constexpr int KERNEL_NAME_WIDTH = 20; - static constexpr int TIMING_WIDTH = 12; - static constexpr int PASS_WIDTH = 8; - - if (results_.empty()) { - std::cout << "No results to display" << std::endl; - return; - } - - std::cout << "\n=== " << operation_name_ - << " Performance Summary ===" << std::endl; - print_separator(); - - std::cout << std::left << std::setw(CASE_WIDTH) << "Case" << std::left - << std::setw(KERNEL_NAME_WIDTH) << "Kernel Name" << std::left - << std::setw(TIMING_WIDTH) << "Avg (μs)" << std::left - << std::setw(TIMING_WIDTH) << "Min (μs)" << std::left - << std::setw(TIMING_WIDTH) << "Max (μs)" << std::left - << std::setw(TIMING_WIDTH) << "Std Dev" << std::left - << std::setw(PASS_WIDTH) << "Pass" << std::endl; - print_separator(); - - for (size_t i = 0; i < results_.size(); ++i) { - const auto& result = results_[i]; - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - std::cout << std::left << std::setw(CASE_WIDTH) << i + 1 << std::left - << std::setw(KERNEL_NAME_WIDTH) - << result.get_kernel_name().substr(0, KERNEL_NAME_WIDTH - 1) - << std::left << std::setw(TIMING_WIDTH) << std::fixed - << std::setprecision(3) << result.get_avg_time_us() << std::left - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << result.get_min_time_us() << std::left - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << result.get_max_time_us() << std::left - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << result.get_std_dev_us() << std::left << std::setw(PASS_WIDTH) - << (vulkan_execute_succeeded ? "✓" : "✗") << std::endl; - } - - print_separator(); - std::cout << "Total cases: " << results_.size() - << ", Passed: " << get_passed_count() - << ", Failed: " << get_failed_count() << std::endl; - std::cout << "Overall GFLOP/s: " << std::fixed << std::setprecision(3) - << gflops_ << std::endl; - std::cout << "Overall correctness: " - << (correctness_passed_ ? 
"PASSED" : "FAILED") << std::endl; -} - -void TestResult::print_detailed_results() const { - if (results_.empty()) { - std::cout << "No results to display" << std::endl; - return; - } - - std::cout << "\n=== " << operation_name_ - << " Detailed Results ===" << std::endl; - - for (size_t i = 0; i < results_.size(); ++i) { - const auto& result = results_[i]; - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - std::cout << "\nCase " << i + 1 << ": " << result.get_kernel_name() - << std::endl; - std::cout << " Iterations: " << result.get_num_iterations() << std::endl; - std::cout << " Average: " << std::fixed << std::setprecision(3) - << result.get_avg_time_us() << " μs" << std::endl; - std::cout << " Min: " << std::fixed << std::setprecision(3) - << result.get_min_time_us() << " μs" << std::endl; - std::cout << " Max: " << std::fixed << std::setprecision(3) - << result.get_max_time_us() << " μs" << std::endl; - std::cout << " Std Dev: " << std::fixed << std::setprecision(3) - << result.get_std_dev_us() << " μs" << std::endl; - std::cout << " Correctness: " - << (vulkan_execute_succeeded ? "PASSED" : "FAILED") << std::endl; - - if (result.get_num_iterations() > 0) { - std::cout << " Individual timings (μs): "; - const auto& timings = result.get_iter_timings(); - for (size_t j = 0; j < std::min(size_t(10), timings.size()); ++j) { - std::cout << std::fixed << std::setprecision(1) << timings[j]; - if (j < std::min(size_t(10), timings.size()) - 1) - std::cout << ", "; - } - if (timings.size() > 10) { - std::cout << " ... (" << (timings.size() - 10) << " more)"; - } - std::cout << std::endl; - } - } - - std::cout << "\nOverall Results:" << std::endl; - std::cout << " Total GFLOP/s: " << std::fixed << std::setprecision(3) - << gflops_ << std::endl; - std::cout << " Overall correctness: " - << (correctness_passed_ ? 
"PASSED" : "FAILED") << std::endl; -} - -void TestResult::print_statistics() const { - if (results_.empty()) { - std::cout << "No results to display statistics for" << std::endl; - return; - } - - std::cout << "\n=== " << operation_name_ << " Statistics ===" << std::endl; - std::cout << "Total test cases: " << results_.size() << std::endl; - std::cout << "Passed: " << get_passed_count() << std::endl; - std::cout << "Failed: " << get_failed_count() << std::endl; - std::cout << "Success rate: " << std::fixed << std::setprecision(1) - << (100.0f * get_passed_count() / results_.size()) << "%" - << std::endl; - - if (get_passed_count() > 0) { - std::cout << "Total average time: " << std::fixed << std::setprecision(3) - << get_total_avg_time_us() << " μs" << std::endl; - std::cout << "Total GFLOP/s: " << std::fixed << std::setprecision(3) - << get_total_gflops() << std::endl; - - const auto* fastest = get_fastest_result(); - const auto* slowest = get_slowest_result(); - const auto* highest_gflops = get_highest_gflops_result(); - - if (fastest) { - std::cout << "Fastest case: " << fastest->get_kernel_name() << " (" - << std::fixed << std::setprecision(3) - << fastest->get_avg_time_us() << " μs)" << std::endl; - } - - if (slowest) { - std::cout << "Slowest case: " << slowest->get_kernel_name() << " (" - << std::fixed << std::setprecision(3) - << slowest->get_avg_time_us() << " μs)" << std::endl; - } - - if (highest_gflops) { - std::cout << "Best performing case: " << highest_gflops->get_kernel_name() - << " (" << std::fixed << std::setprecision(3) - << highest_gflops->get_avg_time_us() << " μs)" << std::endl; - } - } -} - -void TestResult::print_brief_summary() const { - print_separator(); - std::cout << "Summary Statistics:" << std::endl; - - if (get_passed_count() > 0) { - std::cout << "Average execution time: " << std::fixed - << std::setprecision(3) << get_total_avg_time_us() << " μs" - << std::endl; - std::cout << "Total throughput: " << std::fixed << std::setprecision(3) - << get_gflops() << " GFLOP/s" << std::endl; - std::cout << "Successful test cases: " << get_passed_count() << "/" - << size() << std::endl; - std::cout << "Overall correctness: " - << (get_correctness_passed() ? "PASSED" : "FAILED") << std::endl; - } else { - std::cout << "No successful test cases to report" << std::endl; - } -} - -float TestResult::get_total_avg_time_us() const { - if (results_.empty()) { - return 0.0f; - } - - float sum = 0.0f; - size_t count = 0; - - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - sum += result.get_avg_time_us(); - count++; - } - } - - return count > 0 ? 
sum / count : 0.0f; -} - -float TestResult::get_total_gflops() const { - return gflops_; -} - -size_t TestResult::get_passed_count() const { - size_t count = 0; - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - count++; - } - } - return count; -} - -size_t TestResult::get_failed_count() const { - return results_.size() - get_passed_count(); -} - -const BenchmarkResult* TestResult::get_fastest_result() const { - const BenchmarkResult* fastest = nullptr; - - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - if (!fastest || result.get_avg_time_us() < fastest->get_avg_time_us()) { - fastest = &result; - } - } - } - - return fastest; -} - -const BenchmarkResult* TestResult::get_slowest_result() const { - const BenchmarkResult* slowest = nullptr; - - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - if (!slowest || result.get_avg_time_us() > slowest->get_avg_time_us()) { - slowest = &result; - } - } - } - - return slowest; -} - -const BenchmarkResult* TestResult::get_highest_gflops_result() const { - // Since GFLOPS is now a TestResult-level metric rather than per-case, - // this method returns the fastest result as a proxy for highest performance - return get_fastest_result(); -} - -// Default FLOP calculation function (assumes 1 FLOP per element) -int64_t default_flop_calculator(const TestCase& test_case) { - // Calculate total elements from the first input tensor - int64_t total_elements = 1; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (int64_t size : sizes) { - total_elements *= size; - } - } - - // Assume 1 FLOP per element for basic operations - return total_elements; -} - -ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name) { - GraphConfig config; - config.enable_querypool = true; - ComputeGraph graph(config); - - std::vector input_values; - - // Process input ValueSpecs - for (size_t i = 0; i < test_case.num_inputs(); ++i) { - const ValueSpec& input_spec = test_case.inputs()[i]; - - if (input_spec.is_none()) { - input_values.push_back(graph.add_none()); - } else if (input_spec.is_float()) { - ValueRef input_value = - graph.add_scalar(static_cast(input_spec.get_float_value())); - input_values.push_back(input_value); - } else if (input_spec.is_int()) { - ValueRef input_value = - graph.add_scalar(static_cast(input_spec.get_int_value())); - input_values.push_back(input_value); - } else if (input_spec.is_bool()) { - ValueRef input_value = graph.add_scalar(input_spec.get_bool_value()); - input_values.push_back(input_value); - } else if (input_spec.is_int_list()) { - // Convert int32_t list to int64_t list for ComputeGraph - const auto& int32_list = input_spec.get_int_list(); - std::vector int64_list; - int64_list.reserve(int32_list.size()); - for (int32_t val : int32_list) { - int64_list.push_back(static_cast(val)); - } - ValueRef input_value = graph.add_scalar_list(std::move(int64_list)); - input_values.push_back(input_value); - } else if (input_spec.is_constant()) { - ValueRef input_value = graph.add_tensorref( - input_spec.get_tensor_sizes(), - input_spec.dtype, - 
input_spec.get_data_ptr()); - input_values.push_back(input_value); - } else { - IOValueRef input_io = graph.add_input_tensor( - input_spec.get_tensor_sizes(), - input_spec.dtype, - input_spec.storage_type, - input_spec.memory_layout); - input_values.push_back(input_io.value); - } - } - - std::vector output_values; - - // Process output ValueSpecs - for (size_t i = 0; i < test_case.num_outputs(); ++i) { - const ValueSpec& output_spec = test_case.outputs()[i]; - - if (!output_spec.is_tensor()) { - throw std::runtime_error("All output specifications must be tensors"); - } - - // Create output tensor - ValueRef output_value = graph.add_tensor( - output_spec.get_tensor_sizes(), - output_spec.dtype, - output_spec.storage_type, - output_spec.memory_layout); - - output_values.push_back(output_value); - } - - // Get the operator function and call it - auto opFn = VK_GET_OP_FN(op_name); - - // Create arguments vector for the operator function - std::vector op_args = input_values; - op_args.insert(op_args.end(), output_values.begin(), output_values.end()); - - opFn(graph, op_args); - - for (size_t i = 0; i < output_values.size(); ++i) { - graph.set_output_value(output_values[i]); - } - return graph; -} - -// Test execution utilities -BenchmarkResult -execute_test_case(TestCase& test_case, int warmup_runs, int benchmark_runs) { - BenchmarkResult result( - test_case.name().empty() ? "unnamed_test_case" : test_case.name()); - - // Initialize querypool if using GPU timestamps - if (use_gpu_timestamps()) { - api::context()->initialize_querypool(); - } - - // Create the compute graph for this test case using setup_compute_graph - ComputeGraph graph = - setup_compute_graph(test_case, test_case.operator_name()); - - // Prepare the graph - graph.prepare(); - graph.prepack(); - - // Copy input data into the graph's staging buffers - for (size_t i = 0; i < test_case.num_inputs(); ++i) { - const ValueSpec& input_spec = test_case.inputs()[i]; - if (input_spec.is_tensor() && i < graph.inputs().size()) { - // Skip copying data for constant tensors - if (input_spec.is_constant()) { - continue; - } - - const auto& input_ref = graph.inputs()[i]; - - // Get the appropriate data based on dtype - const void* data_ptr = nullptr; - size_t data_numel = input_spec.numel(); - - switch (input_spec.dtype) { - case vkapi::kFloat: - data_ptr = input_spec.get_float_data().data(); - break; - case vkapi::kHalf: - data_ptr = input_spec.get_half_data().data(); - break; - case vkapi::kInt: - data_ptr = input_spec.get_int32_data().data(); - break; - case vkapi::kChar: - data_ptr = input_spec.get_int8_data().data(); - break; - case vkapi::kByte: - data_ptr = input_spec.get_uint8_data().data(); - break; - default: - throw std::runtime_error("Unsupported data type for input tensor"); - } - - // Copy data into staging buffer - graph.copy_into_staging(input_ref.staging, data_ptr, data_numel); - } - } - - // Warmup runs - for (int run = 0; run < warmup_runs; ++run) { - graph.execute(); - } - - // Benchmark runs - collect individual iteration timings - float total_cpu_time_us = 0.0f; - float total_gpu_time_us = 0.0f; - - for (int run = 0; run < benchmark_runs; ++run) { - // Measure CPU time for each execute() call - auto cpu_start = std::chrono::high_resolution_clock::now(); - graph.execute(); - auto cpu_end = std::chrono::high_resolution_clock::now(); - - auto cpu_duration = std::chrono::duration_cast( - cpu_end - cpu_start); - float cpu_time_us = static_cast(cpu_duration.count()); - total_cpu_time_us += cpu_time_us; - - // Collect GPU timing 
using helper function - float gpu_time_us = collect_gpu_timing_us(graph); - total_gpu_time_us += gpu_time_us; - - // Add the appropriate timing based on the flag - float iter_time_us = use_gpu_timestamps() ? gpu_time_us : cpu_time_us; - result.add_iter_timing(iter_time_us); - } - - // Calculate averages for display - float avg_cpu_time_us = total_cpu_time_us / benchmark_runs; - float avg_gpu_time_us = total_gpu_time_us / benchmark_runs; - - // Print both timings if latency printing is enabled - if (print_latencies()) { - if (use_gpu_timestamps()) { - graph.context()->querypool().print_results(); - } - std::cout << " CPU timing: " << std::fixed << std::setprecision(3) - << avg_cpu_time_us << " μs" << std::endl; - std::cout << " GPU timing: " << std::fixed << std::setprecision(3) - << avg_gpu_time_us << " μs" << std::endl; - std::cout << " Using " << (use_gpu_timestamps() ? "GPU" : "CPU") - << " timing for result" << std::endl; - } - - // Copy output data from the graph's staging buffers - for (size_t i = 0; i < test_case.num_outputs(); ++i) { - ValueSpec& output_spec = test_case.outputs()[i]; - - if (output_spec.is_tensor() && i < graph.outputs().size()) { - const auto& output_ref = graph.outputs()[i]; - - // Ensure output data vector is properly sized - size_t data_numel = output_spec.numel(); - output_spec.resize_data(data_numel); - - // Get mutable data pointer for the output - void* data_ptr = output_spec.get_mutable_data_ptr(); - - if (data_ptr != nullptr) { - // Copy data from staging buffer to output spec - graph.copy_from_staging(output_ref.staging, data_ptr, data_numel); - } - - // Print output tensor data if output printing is enabled - if (print_output()) { - std::string output_name = "Output[" + std::to_string(i) + "]"; - print_valuespec_data(output_spec, output_name); - } - } - } - - return result; -} - -TestResult execute_test_cases( - std::function()> test_case_generator, - FlopCalculatorFunc flop_calculator, - const std::string& operation_name, - int warmup_runs, - int benchmark_runs, - ReferenceComputeFunc reference_compute_func) { - TestResult results(operation_name); - - // Generate all test cases - std::vector test_cases = test_case_generator(); - - std::cout << "Executing " << test_cases.size() << " test cases for " - << operation_name << std::endl; - print_separator(); - - bool any_correctness_failed = false; - float total_gflops = 0.0f; - - for (size_t i = 0; i < test_cases.size(); ++i) { - TestCase& test_case = test_cases[i]; - - // Compute reference data if reference function is provided - bool skipped_reference_fn = true; - if (reference_compute_func) { - try { - reference_compute_func(test_case); - skipped_reference_fn = false; - } catch (const std::invalid_argument& e) { - if (debugging()) { - std::cout << "Compute reference skipped: " << e.what() << std::endl; - } - } - } - - // Execute single test case - BenchmarkResult result; - bool shader_not_supported = false; - try { - result = execute_test_case(test_case, warmup_runs, benchmark_runs); - result.set_operator_name(test_case.operator_name()); - } catch (const vkcompute::vkapi::ShaderNotSupportedError& e) { - result = BenchmarkResult( - test_case.name().empty() ? 
"unnamed_test_case" : test_case.name(), - test_case.operator_name()); - shader_not_supported = true; - } - - // Determine if this test case passed (has valid timing data) - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - - if (shader_not_supported) { - result.set_correctness_status(CorrectnessStatus::SKIPPED); - } else if (!vulkan_execute_succeeded) { - result.set_correctness_status(CorrectnessStatus::FAILED); - } else if (skipped_reference_fn) { - result.set_correctness_status(CorrectnessStatus::SKIPPED); - } else { - // Reference function provided and succeeded - validate outputs - bool correctness_passed = true; - - for (size_t output_idx = 0; output_idx < test_case.num_outputs(); - ++output_idx) { - const ValueSpec& output_spec = test_case.outputs()[output_idx]; - - if (!output_spec.validate_against_reference( - test_case.get_abs_tolerance(), test_case.get_rel_tolerance())) { - correctness_passed = false; - std::cout << " Correctness validation FAILED for test " - << result.get_kernel_name() << std::endl; - print_valuespec_data(output_spec, "vulkan output"); - print_valuespec_data(output_spec, "ref output", true); - - throw std::runtime_error("Correctness validation failed"); - } - } - - if (correctness_passed) { - result.set_correctness_status(CorrectnessStatus::PASSED); - } else { - any_correctness_failed = true; - result.set_correctness_status(CorrectnessStatus::FAILED); - } - } - - // Calculate GFLOPS for this test case using the provided FLOP calculator - float case_gflops = 0.0f; - if (vulkan_execute_succeeded) { - // Use the provided FLOP calculator to get total FLOPs for this test case - int64_t total_flops = flop_calculator(test_case); - float flops = static_cast(total_flops); - float avg_time_us = result.get_avg_time_us(); - if (avg_time_us > 0.0f && total_flops > 0) { - case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f); - } - - total_gflops += case_gflops; - } else { - case_gflops = -1.0f; // Indicate failure - } - - // Calculate tensor info for display - std::string size_info = "["; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (size_t j = 0; j < sizes.size(); ++j) { - size_info += std::to_string(sizes[j]); - if (j < sizes.size() - 1) - size_info += "x"; - } - } - size_info += "]"; - - // Print progress using the BenchmarkResult member function - result.print_summary(i + 1, size_info, case_gflops); - - // Add result to collection - results.add_result(std::move(result)); - } - - // Set the overall results on the TestResult - results.set_correctness_passed(!any_correctness_failed); - results.set_gflops(total_gflops); - - print_separator(); - std::cout << "Completed " << results.size() << " test cases" << std::endl; - - return results; -} - -// Convenience overload that uses the default FLOP calculator -TestResult execute_test_cases( - std::function()> test_case_generator, - const std::string& operation_name, - int warmup_runs, - int benchmark_runs, - ReferenceComputeFunc reference_compute_func) { - return execute_test_cases( - test_case_generator, - default_flop_calculator, - operation_name, - warmup_runs, - benchmark_runs, - reference_compute_func); -} - -// Utility functions for printing -void print_performance_header() { - std::cout << "\n=== Compute Shader Performance Benchmark ===" << std::endl; -} - -void print_separator() { - std::cout << std::string(70, '-') << std::endl; -} - -// ValueSpec 
data printing utilities -void print_valuespec_data( - const ValueSpec& spec, - const std::string& name, - const bool print_ref_data, - size_t max_elements, - int precision) { - std::cout << "\n" << name << " Data:" << std::endl; - std::cout << " Type: " << spec.to_string() << std::endl; - - if (!spec.is_tensor()) { - if (spec.is_int()) { - std::cout << " Value: " << spec.get_int_value() << std::endl; - } else if (spec.is_int_list()) { - const auto& int_list = spec.get_int_list(); - std::cout << " Values: ["; - size_t print_count = std::min(max_elements, int_list.size()); - for (size_t i = 0; i < print_count; ++i) { - std::cout << int_list[i]; - if (i < print_count - 1) - std::cout << ", "; - } - if (int_list.size() > max_elements) { - std::cout << ", ... (" << (int_list.size() - max_elements) << " more)"; - } - std::cout << "]" << std::endl; - } - return; - } - - // Print tensor data - size_t total_elements = spec.numel(); - size_t print_count = std::min(max_elements, total_elements); - - std::cout << " Total elements: " << total_elements << std::endl; - std::cout << " Data (first " << print_count << " elements): ["; - - std::cout << std::fixed << std::setprecision(precision); - - switch (spec.dtype) { - case vkapi::kFloat: { - auto data = spec.get_float_data().data(); - if (print_ref_data) { - data = spec.get_ref_float_data().data(); - } - for (size_t i = 0; i < print_count; ++i) { - std::cout << data[i]; - if (i < print_count - 1) - std::cout << ", "; - } - break; - } - case vkapi::kHalf: { - const auto& data = spec.get_half_data(); - for (size_t i = 0; i < print_count; ++i) { - // Convert uint16_t back to float for display - float value = data[i] / 32767.0f; - std::cout << value; - if (i < print_count - 1) - std::cout << ", "; - } - break; - } - case vkapi::kInt: { - const auto& data = spec.get_int32_data(); - for (size_t i = 0; i < print_count; ++i) { - std::cout << data[i]; - if (i < print_count - 1) - std::cout << ", "; - } - break; - } - case vkapi::kChar: { - const auto& data = spec.get_int8_data(); - if (spec.is_int4()) { - // Print each 4-bit value individually - size_t element_count = 0; - for (size_t i = 0; i < data.size() && element_count < print_count; - ++i) { - // Extract lower 4 bits (signed) - int8_t lower_4bits = data[i] & 0x0F; - if (lower_4bits > 7) - lower_4bits -= 16; // Convert to signed - std::cout << static_cast(lower_4bits); - element_count++; - - if (element_count < print_count) { - std::cout << ", "; - // Extract upper 4 bits (signed) - int8_t upper_4bits = (data[i] >> 4) & 0x0F; - if (upper_4bits > 7) - upper_4bits -= 16; // Convert to signed - std::cout << static_cast(upper_4bits); - element_count++; - - if (element_count < print_count) - std::cout << ", "; - } - } - } else { - for (size_t i = 0; i < print_count; ++i) { - std::cout << static_cast(data[i]); - if (i < print_count - 1) - std::cout << ", "; - } - } - break; - } - case vkapi::kByte: { - const auto& data = spec.get_uint8_data(); - if (spec.is_int4()) { - // Print each 4-bit value individually - size_t element_count = 0; - for (size_t i = 0; i < data.size() && element_count < print_count; - ++i) { - // Extract lower 4 bits - uint8_t lower_4bits = data[i] & 0x0F; - std::cout << static_cast(lower_4bits); - element_count++; - - if (element_count < print_count) { - std::cout << ", "; - // Extract upper 4 bits - uint8_t upper_4bits = (data[i] >> 4) & 0x0F; - std::cout << static_cast(upper_4bits); - element_count++; - - if (element_count < print_count) - std::cout << ", "; - } - } - } else { - for (size_t 
i = 0; i < print_count; ++i) { - std::cout << static_cast(data[i]); - if (i < print_count - 1) - std::cout << ", "; - } - } - break; - } - default: - std::cout << "unsupported data type"; - break; - } - - if (total_elements > max_elements) { - std::cout << ", ... (" << (total_elements - max_elements) << " more)"; - } - std::cout << "]" << std::endl; - - // Print some statistics for tensor data - if (total_elements > 0) { - float min_val = 0.0f, max_val = 0.0f, sum = 0.0f; - bool first = true; - - for (size_t i = 0; i < total_elements; ++i) { - float val = spec.get_element(i); - if (first) { - min_val = max_val = val; - first = false; - } else { - min_val = std::min(min_val, val); - max_val = std::max(max_val, val); - } - sum += val; - } - - float mean = sum / total_elements; - std::cout << " Statistics: min=" << std::setprecision(precision) << min_val - << ", max=" << max_val << ", mean=" << mean << ", sum=" << sum - << std::endl; - } -} - -ValueRef quantized_weights_canvas( - ComputeGraph& graph, - const ValueRef weight_ref) { - const auto original_sizes = graph.sizes_of(weight_ref); - - // Get the 2 highest values of original_sizes - std::vector sorted_sizes = original_sizes; - std::sort(sorted_sizes.begin(), sorted_sizes.end(), std::greater()); - int64_t largest1 = sorted_sizes.size() > 0 ? sorted_sizes[0] : 0; - int64_t largest2 = sorted_sizes.size() > 1 ? sorted_sizes[1] : 0; - - std::vector final_sizes = {1, largest1, largest1}; - - // Debug logging if debugging flag is set - if (debugging()) { - std::cout << "Debug: Creating quantized weights canvas tensor" << std::endl; - std::cout << "Debug: Original sizes: ["; - for (size_t i = 0; i < original_sizes.size(); ++i) { - std::cout << original_sizes[i]; - if (i < original_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - std::cout << "Debug: Canvas sizes: ["; - for (size_t i = 0; i < final_sizes.size(); ++i) { - std::cout << final_sizes[i]; - if (i < final_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - } - - ValueRef packed_weight = graph.add_tensor( - final_sizes, vkapi::kInt, utils::kTexture3D, utils::kWidthPacked); - - utils::uvec3 global_wg_size{ - utils::div_up(utils::safe_downcast(largest1), uint32_t(4)), - utils::safe_downcast(largest2), - utils::safe_downcast(std::min(largest1, int64_t(2048)))}; - - std::string kernel_name = "packed_int32_canvas"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(packed_weight), - graph.create_local_wg_size(packed_weight), - weight_ref, - packed_weight, - // UBOs - {graph.logical_limits_ubo(packed_weight)}, - // Specialization constants - {}, - // Push Constants - {})); - - return packed_weight; -} - -ValueRef float_tensor_canvas(ComputeGraph& graph, const ValueRef weight_ref) { - const auto original_sizes = graph.sizes_of(weight_ref); - - // Get the 2 highest values of original_sizes - std::vector sorted_sizes = original_sizes; - std::sort(sorted_sizes.begin(), sorted_sizes.end(), std::greater()); - int64_t largest1 = sorted_sizes.size() > 0 ? sorted_sizes[0] : 0; - int64_t largest2 = sorted_sizes.size() > 1 ? 
sorted_sizes[1] : 0; - - std::vector final_sizes = {1, largest1, largest1}; - - // Debug logging if debugging flag is set - if (debugging()) { - std::cout << "Debug: Creating float tensor canvas" << std::endl; - std::cout << "Debug: Original sizes: ["; - for (size_t i = 0; i < original_sizes.size(); ++i) { - std::cout << original_sizes[i]; - if (i < original_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - std::cout << "Debug: Canvas sizes: ["; - for (size_t i = 0; i < final_sizes.size(); ++i) { - std::cout << final_sizes[i]; - if (i < final_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - } - - ValueRef packed_weight = graph.add_tensor( - final_sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked); - - utils::uvec3 global_wg_size{ - utils::div_up(utils::safe_downcast(largest1), uint32_t(4)), - utils::safe_downcast(largest2), - utils::safe_downcast(std::min(largest1, int64_t(2048)))}; - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR("float_canvas"), - graph.create_global_wg_size(packed_weight), - graph.create_local_wg_size(packed_weight), - weight_ref, - packed_weight, - // UBOs - {graph.logical_limits_ubo(packed_weight)}, - // Specialization constants - {}, - // Push Constants - {})); - - return packed_weight; -} - -// Compute weight sums for quantized operations (linear and convolution) -void compute_weight_sums( - ValueSpec& weight_sums, - const ValueSpec& quantized_weight, - int64_t out_features, - int64_t elements_per_output_feature) { - auto& weight_sums_data = weight_sums.get_int32_data(); - auto& quantized_weight_data = quantized_weight.get_int8_data(); - - weight_sums_data.resize(out_features); - - // For each output feature, compute the sum of quantized weights - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - int32_t sum = 0; - for (int64_t elem = 0; elem < elements_per_output_feature; ++elem) { - // Weight indexing depends on the layout: - // For linear: [out_features, in_features] -> out_f * - // elements_per_output_feature + elem For conv2d: [C_out, C_in * K_h * - // K_w] -> out_f * elements_per_output_feature + elem - int64_t weight_idx = out_f * elements_per_output_feature + elem; - sum += static_cast(quantized_weight_data[weight_idx]); - } - weight_sums_data[out_f] = sum; - } -} - -} // namespace prototyping -} // namespace vulkan -} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h deleted file mode 100644 index 2440e225ef2..00000000000 --- a/backends/vulkan/test/custom_ops/utils.h +++ /dev/null @@ -1,661 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
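As a sanity check on the compute_weight_sums helper above, here is a minimal standalone sketch with made-up sizes and values (a hypothetical 2x3 int8 linear weight, not data from this change); each output feature simply accumulates the signed sum of its quantized weights:

```
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical [out_features, in_features] = [2, 3] int8 weight matrix.
  const int64_t out_features = 2;
  const int64_t elements_per_output_feature = 3;
  const std::vector<int8_t> quantized_weight = {
      1, -2, 3, // output feature 0 -> 1 - 2 + 3 = 2
      4, 5, -6  // output feature 1 -> 4 + 5 - 6 = 3
  };

  // Same accumulation as compute_weight_sums: one int32 sum per output feature.
  std::vector<int32_t> weight_sums(out_features, 0);
  for (int64_t out_f = 0; out_f < out_features; ++out_f) {
    int32_t sum = 0;
    for (int64_t elem = 0; elem < elements_per_output_feature; ++elem) {
      sum += static_cast<int32_t>(
          quantized_weight[out_f * elements_per_output_feature + elem]);
    }
    weight_sums[out_f] = sum;
  }

  std::cout << weight_sums[0] << ", " << weight_sums[1] << std::endl; // prints 2, 3
  return 0;
}
```

These per-feature sums are the quantity that integer-accumulation kernels typically fold into the zero-point correction term, which is why the helper is shared by the quantized linear and convolution test cases.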
- -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace executorch { -namespace vulkan { -namespace prototyping { - -using namespace vkcompute; - -// -// Global configuration options -// - -bool print_output(); -void set_print_output(bool print_output); - -bool print_latencies(); -void set_print_latencies(bool print_latencies); - -bool use_gpu_timestamps(); -void set_use_gpu_timestamps(bool use_timestamps); - -bool debugging(); -void set_debugging(bool enable_debugging); - -// -// ValueSpec class -// - -enum class SpecType { Tensor, IntList, Int, Float, Bool }; - -// Data generation types -enum class DataGenType { - FIXED, - RANDOM, - RANDOM_SCALES, - RANDINT, - RANDINT8, - RANDINT4, - ONES, - ONES_INT4, - ZEROS -}; - -// Value specification struct -struct ValueSpec { - std::vector sizes; - vkapi::ScalarType dtype; - utils::GPUMemoryLayout memory_layout; - utils::StorageType storage_type; - SpecType spec_type; - DataGenType data_gen_type; - bool is_constant_tensor; - bool is_none_flag; - bool is_int4_tensor; - - std::vector float_data; - std::vector int32_data; - std::vector half_data; // Using uint16_t as substitute for half - std::vector int8_data; // For kChar (signed 8-bit) - std::vector uint8_data; // For kByte (unsigned 8-bit) - - std::vector ref_float_data; - std::vector ref_int32_data; - std::vector ref_half_data; - std::vector ref_int8_data; - std::vector ref_uint8_data; - - ValueSpec( - const std::vector& sizes, - vkapi::ScalarType dtype, - utils::StorageType storage_type = utils::kTexture3D, - utils::GPUMemoryLayout memory_layout = utils::kWidthPacked) - : sizes(sizes), - dtype(dtype), - memory_layout(memory_layout), - storage_type(storage_type), - spec_type(SpecType::Tensor), - data_gen_type(DataGenType::ZEROS), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - generate_tensor_data(); - } - - // Constructor for tensor with custom data generation type - ValueSpec( - const std::vector& sizes, - vkapi::ScalarType dtype, - utils::StorageType storage_type, - utils::GPUMemoryLayout memory_layout, - DataGenType data_gen_type) - : sizes(sizes), - dtype(dtype), - memory_layout(memory_layout), - storage_type(storage_type), - spec_type(SpecType::Tensor), - data_gen_type(data_gen_type), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - generate_tensor_data(); - } - - // Constructor for single int - ValueSpec(int32_t value) - : sizes({1}), - dtype(vkapi::kInt), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Int), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - int32_data.push_back(value); - } - - // Constructor for single float - ValueSpec(float value) - : sizes({1}), - dtype(vkapi::kFloat), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Float), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - float_data.push_back(value); - } - - // Constructor for single bool - ValueSpec(bool value) - : sizes({1}), - dtype(vkapi::kInt), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Bool), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - int32_data.push_back(value ? 
1 : 0); - } - - // Constructor for int list - ValueSpec(const std::vector& values) - : sizes({static_cast(values.size())}), - dtype(vkapi::kInt), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::IntList), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false), - int32_data(values) {} - - // Default constructor - ValueSpec() - : dtype(vkapi::kFloat), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Tensor), - data_gen_type(DataGenType::ZEROS), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) {} - - int64_t numel() const; - size_t nbytes() const; - std::string to_string() const; - - bool is_tensor() const { - return spec_type == SpecType::Tensor; - } - bool is_int_list() const { - return spec_type == SpecType::IntList; - } - bool is_int() const { - return spec_type == SpecType::Int; - } - bool is_float() const { - return spec_type == SpecType::Float; - } - bool is_bool() const { - return spec_type == SpecType::Bool; - } - - int32_t get_int_value() const { - return int32_data.empty() ? 0 : int32_data[0]; - } - float get_float_value() const { - return float_data.empty() ? 0.0f : float_data[0]; - } - bool get_bool_value() const { - return int32_data.empty() ? false : (int32_data[0] != 0); - } - const std::vector& get_int_list() const { - return int32_data; - } - const std::vector& get_tensor_sizes() const { - return sizes; - } - - const std::vector& get_float_data() const { - return float_data; - } - const std::vector& get_int32_data() const { - return int32_data; - } - const std::vector& get_half_data() const { - return half_data; - } - const std::vector& get_int8_data() const { - return int8_data; - } - const std::vector& get_uint8_data() const { - return uint8_data; - } - - std::vector& get_float_data() { - return float_data; - } - std::vector& get_int32_data() { - return int32_data; - } - std::vector& get_half_data() { - return half_data; - } - std::vector& get_int8_data() { - return int8_data; - } - std::vector& get_uint8_data() { - return uint8_data; - } - - const std::vector& get_ref_float_data() const { - return ref_float_data; - } - const std::vector& get_ref_int32_data() const { - return ref_int32_data; - } - const std::vector& get_ref_half_data() const { - return ref_half_data; - } - const std::vector& get_ref_int8_data() const { - return ref_int8_data; - } - const std::vector& get_ref_uint8_data() const { - return ref_uint8_data; - } - - std::vector& get_ref_float_data() { - return ref_float_data; - } - std::vector& get_ref_int32_data() { - return ref_int32_data; - } - std::vector& get_ref_half_data() { - return ref_half_data; - } - std::vector& get_ref_int8_data() { - return ref_int8_data; - } - std::vector& get_ref_uint8_data() { - return ref_uint8_data; - } - - void resize_data(size_t new_size); - void* get_mutable_data_ptr(); - float get_element(size_t index) const; - - // Set/get constant flag - bool is_constant() const { - return is_constant_tensor; - } - void set_constant(bool is_constant) { - is_constant_tensor = is_constant; - } - - // Set/get none flag - bool is_none() const { - return is_none_flag; - } - - void set_none(bool is_none) { - is_none_flag = is_none; - } - - // Set/get int4 flag - bool is_int4() const { - return is_int4_tensor; - } - void set_int4(bool is_int4) { - is_int4_tensor = is_int4; - } - - const void* get_data_ptr() const; - - // Correctness checking against reference data - // 
Returns true if computed data matches reference data within tolerance - // Only validates float tensors as specified in requirements - bool validate_against_reference( - float abs_tolerance = 2e-3f, - float rel_tolerance = 1e-3f) const; - - private: - void generate_tensor_data(); -}; - -// -// TestCase -// - -class TestCase { - public: - TestCase() : abs_tolerance_(2e-3f), rel_tolerance_(1e-3f) {} - TestCase(const std::string& name) - : name_(name), abs_tolerance_(2e-3f), rel_tolerance_(1e-3f) {} - - void set_name(const std::string& name) { - name_ = name; - } - const std::string& name() const { - return name_; - } - - void set_operator_name(const std::string& op_name) { - operator_name_ = op_name; - } - const std::string& operator_name() const { - return operator_name_; - } - - // Tolerance settings - void set_abs_tolerance(float abs_tolerance) { - abs_tolerance_ = abs_tolerance; - } - float get_abs_tolerance() const { - return abs_tolerance_; - } - - void set_rel_tolerance(float rel_tolerance) { - rel_tolerance_ = rel_tolerance; - } - float get_rel_tolerance() const { - return rel_tolerance_; - } - - void add_input_spec(const ValueSpec& spec) { - inputs_.push_back(spec); - } - - const std::vector& inputs() const { - return inputs_; - } - - std::vector& inputs() { - return inputs_; - } - - size_t num_inputs() const { - return inputs_.size(); - } - - void add_output_spec(const ValueSpec& spec) { - outputs_.push_back(spec); - } - - const std::vector& outputs() const { - return outputs_; - } - - std::vector& outputs() { - return outputs_; - } - - size_t num_outputs() const { - return outputs_.size(); - } - - bool empty() const { - return inputs_.empty() && outputs_.empty(); - } - void clear() { - inputs_.clear(); - outputs_.clear(); - name_.clear(); - operator_name_.clear(); - abs_tolerance_ = 2e-3f; - rel_tolerance_ = 1e-3f; - } - - private: - std::string name_; - std::string operator_name_; - std::vector inputs_; - std::vector outputs_; - float abs_tolerance_; - float rel_tolerance_; -}; - -// -// BenchmarkResult -// - -enum class CorrectnessStatus { - SKIPPED, // No reference function provided - PASSED, // Reference function provided and validation passed - FAILED // Reference function provided but validation failed -}; - -class BenchmarkResult { - public: - BenchmarkResult() : correctness_status_(CorrectnessStatus::SKIPPED) {} - - BenchmarkResult(const std::string& name) - : kernel_name(name), correctness_status_(CorrectnessStatus::SKIPPED) {} - - BenchmarkResult( - const std::string& kernel_name, - const std::string& operator_name) - : kernel_name(kernel_name), - operator_name(operator_name), - correctness_status_(CorrectnessStatus::SKIPPED) {} - - // Add timing for a single iteration - void add_iter_timing(float time_us); - - // Getters - const std::string& get_kernel_name() const { - return kernel_name; - } - const std::string& get_operator_name() const { - return operator_name; - } - float get_avg_time_us() const; - size_t get_num_iterations() const { - return iter_timings.size(); - } - const std::vector& get_iter_timings() const { - return iter_timings; - } - CorrectnessStatus get_correctness_status() const { - return correctness_status_; - } - - // Setters - void set_kernel_name(const std::string& name) { - kernel_name = name; - } - void set_operator_name(const std::string& name) { - operator_name = name; - } - void set_correctness_status(CorrectnessStatus status) { - correctness_status_ = status; - } - - // Statistics - float get_min_time_us() const; - float get_max_time_us() 
const; - float get_std_dev_us() const; - - // Clear all timings - void clear_timings() { - iter_timings.clear(); - } - - // Print progress for this benchmark result - void print_summary( - int case_number, - const std::string& size_info, - float total_gflops) const; - - private: - std::string kernel_name; - std::string operator_name; - std::vector - iter_timings; // Individual iteration timings in microseconds - CorrectnessStatus correctness_status_; -}; - -// Test result collection and processing -class TestResult { - public: - TestResult() : gflops_(0.0f), correctness_passed_(true) {} - TestResult(const std::string& operation_name) - : operation_name_(operation_name), - gflops_(0.0f), - correctness_passed_(true) {} - - // Add a benchmark result - void add_result(const BenchmarkResult& result); - void add_result(BenchmarkResult&& result); - - // Getters - const std::string& get_operation_name() const { - return operation_name_; - } - float get_gflops() const { - return gflops_; - } - bool get_correctness_passed() const { - return correctness_passed_; - } - size_t size() const { - return results_.size(); - } - bool empty() const { - return results_.empty(); - } - - // Setters - void set_gflops(float gflops_val) { - gflops_ = gflops_val; - } - void set_correctness_passed(bool passed) { - correctness_passed_ = passed; - } - - // Access results - const BenchmarkResult& operator[](size_t index) const { - return results_[index]; - } - BenchmarkResult& operator[](size_t index) { - return results_[index]; - } - const std::vector& get_results() const { - return results_; - } - - // Iterator support - std::vector::iterator begin() { - return results_.begin(); - } - std::vector::iterator end() { - return results_.end(); - } - std::vector::const_iterator begin() const { - return results_.begin(); - } - std::vector::const_iterator end() const { - return results_.end(); - } - - // Processing and analysis - void print_summary() const; - void print_detailed_results() const; - void print_statistics() const; - void print_brief_summary() const; - - // Get aggregate statistics - float get_total_avg_time_us() const; - float get_total_gflops() const; - size_t get_passed_count() const; - size_t get_failed_count() const; - - // Find best/worst performing results - const BenchmarkResult* get_fastest_result() const; - const BenchmarkResult* get_slowest_result() const; - const BenchmarkResult* get_highest_gflops_result() const; - - // Clear all results - void clear() { - results_.clear(); - } - - // Set operation name - void set_operation_name(const std::string& name) { - operation_name_ = name; - } - - private: - std::string operation_name_; - std::vector results_; - float gflops_; - bool correctness_passed_; -}; - -// -// Test case execution -// - -using FlopCalculatorFunc = std::function; - -// Default FLOP calculation function (assumes 1 FLOP per element) -int64_t default_flop_calculator(const TestCase& test_case); - -using ReferenceComputeFunc = std::function; - -BenchmarkResult execute_test_case( - TestCase& test_case, - int warmup_runs = 3, - int benchmark_runs = 10); - -TestResult execute_test_cases( - std::function()> test_case_generator, - FlopCalculatorFunc flop_calculator, - const std::string& operation_name = "Operation", - int warmup_runs = 3, - int benchmark_runs = 10, - ReferenceComputeFunc reference_compute_func = nullptr); - -TestResult execute_test_cases( - std::function()> test_case_generator, - const std::string& operation_name = "Operation", - int warmup_runs = 3, - int benchmark_runs = 10, - 
ReferenceComputeFunc reference_compute_func = nullptr); - -// -// Print utilities -// - -void print_performance_header(); -void print_separator(); - -void print_valuespec_data( - const ValueSpec& spec, - const std::string& name = "ValueSpec", - const bool print_ref_data = false, - size_t max_elements = 20, - int precision = 6); - -ValueRef quantized_weights_canvas( - ComputeGraph& graph, - const ValueRef weight_ref); - -ValueRef float_tensor_canvas(ComputeGraph& graph, const ValueRef weight_ref); - -// Compute weight sums for quantized operations (linear and convolution) -void compute_weight_sums( - ValueSpec& weight_sums, - const ValueSpec& quantized_weight, - int64_t out_features, - int64_t elements_per_output_feature); - -// Setup compute graph based on TestCase and operation name -ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name); - -} // namespace prototyping -} // namespace vulkan -} // namespace executorch diff --git a/backends/vulkan/test/glsl/all_shaders.yaml b/backends/vulkan/test/glsl/all_shaders.yaml deleted file mode 100644 index 4ef934eb105..00000000000 --- a/backends/vulkan/test/glsl/all_shaders.yaml +++ /dev/null @@ -1,68 +0,0 @@ -binary_op_nobroadcast__test: - parameter_names_with_default_values: - DTYPE: float - OPERATOR: X + Y - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: binary_add_nobroadcast__test - OPERATOR: X + Y - - NAME: binary_sub_nobroadcast__test - OPERATOR: X - Y - - NAME: binary_mul_nobroadcast__test - OPERATOR: X * Y - - NAME: binary_div_nobroadcast__test - OPERATOR: X / Y - - NAME: binary_pow_nobroadcast__test - OPERATOR: pow(X, Y) - -fill_texture__test: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: fill_texture__test - -idx_fill_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - - VALUE: int8 - shader_variants: - - NAME: idx_fill_buffer - -idx_fill_texture: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - shader_variants: - - NAME: idx_fill_texture - -scalar_add_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - - VALUE: int8 - shader_variants: - - NAME: scalar_add_buffer diff --git a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl deleted file mode 100644 index 7f72ac58972..00000000000 --- a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
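Since utils.h above only declares the prototyping API, a hedged usage sketch may help readers; the include path, the operator name "etvk.add_prototype", the 64x64 shape, and the main() driver below are illustrative assumptions rather than code from this change:

```
// Hypothetical driver built on the benchmarking utilities declared above.
#include <cstdint>
#include <vector>

#include "utils.h" // path is an assumption for this sketch

using namespace executorch::vulkan::prototyping;

std::vector<TestCase> make_add_cases() {
  std::vector<TestCase> cases;

  std::vector<int64_t> sizes = {64, 64}; // placeholder shape
  TestCase tc("add_64x64_float");
  tc.set_operator_name("etvk.add_prototype"); // placeholder operator name
  tc.add_input_spec(ValueSpec(sizes, vkapi::kFloat));
  tc.add_input_spec(ValueSpec(sizes, vkapi::kFloat));
  tc.add_output_spec(ValueSpec(sizes, vkapi::kFloat));

  cases.push_back(tc);
  return cases;
}

int main() {
  print_performance_header();
  // Convenience overload: default FLOP calculator, 3 warmup runs, 10 timed
  // runs, and no reference function, so correctness is reported as SKIPPED.
  TestResult result = execute_test_cases(make_add_cases, "Add");
  result.print_statistics();
  return result.get_correctness_passed() ? 0 : 1;
}
```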
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define op(X, Y) ${OPERATOR} - -layout(std430) buffer; - -// clang-format off -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D image_out; -// clang-format on -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_extents.data.xyz))) { - return; - } - - vec4 in_texel = texelFetch(image_in, pos, 0); - vec4 other_texel = texelFetch(image_other, pos, 0); - - imageStore(image_out, pos, op(in_texel, other_texel)); -} diff --git a/backends/vulkan/test/glsl/dynamic_dispatch_test.glsl b/backends/vulkan/test/glsl/dynamic_dispatch_test.glsl deleted file mode 100644 index 341da3eeacd..00000000000 --- a/backends/vulkan/test/glsl/dynamic_dispatch_test.glsl +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", "float", "texture3d")} -${layout_declare_tensor(1, "r", "t_in1", "float", "texture3d")} -${layout_declare_tensor(2, "r", "t_in2", "float", "texture3d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in1_sizes; - ivec4 in2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_sizes.xyz))) { - return; - } - - - vec4 out_texel = vec4(0.0); - for (int row = 0; row < in1_sizes.y; ++row) { - ivec3 in_pos = ivec3(pos.x, row, pos.z); - vec4 in1_texel = texelFetch(t_in1, in_pos, 0); - vec4 in2_texel = texelFetch(t_in2, in_pos, 0); - - out_texel += in1_texel * in2_texel; - } - - imageStore(t_out, pos, out_texel + ${OFFSET}); -} diff --git a/backends/vulkan/test/glsl/dynamic_dispatch_test.yaml b/backends/vulkan/test/glsl/dynamic_dispatch_test.yaml deleted file mode 100644 index 0f0f5f51685..00000000000 --- a/backends/vulkan/test/glsl/dynamic_dispatch_test.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dynamic_dispatch_test: - parameter_names_with_default_values: - OFFSET: 2.25 - shader_variants: - - NAME: dynamic_dispatch_test_var1 - - NAME: dynamic_dispatch_test_var2 - OFFSET: 5.5 diff --git a/backends/vulkan/test/glsl/fill_buffer.glsl b/backends/vulkan/test/glsl/fill_buffer.glsl deleted file mode 100644 index 090d9e70d6c..00000000000 --- a/backends/vulkan/test/glsl/fill_buffer.glsl +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
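For readers skimming the GLSL, a rough scalar restatement of what dynamic_dispatch_test.glsl above computes per output position may help; vec4 texel packing is ignored here, and OFFSET stands for the per-variant codegen parameter (2.25 or 5.5 per the YAML):

```
#include <vector>

// Scalar sketch only: for each output column x, accumulate the elementwise
// product of the two inputs down the row dimension, then add the variant's
// OFFSET value. Inputs are assumed to be row-major [rows][width] arrays here.
float dynamic_dispatch_reference(
    const std::vector<float>& in1,
    const std::vector<float>& in2,
    int rows,
    int width,
    int x,
    float offset) {
  float acc = 0.0f;
  for (int row = 0; row < rows; ++row) {
    acc += in1[row * width + x] * in2[row * width + x];
  }
  return acc + offset;
}
```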
- */ - -#version 450 core - -$PRECISION = "highp" -$DTYPE = "float" - -#define PRECISION ${PRECISION} - -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer { - VEC4_T data[]; -} -buffer_in; - -layout(set = 0, binding = 1) uniform PRECISION restrict Params { - int len; -} -params; - - - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const float scale = 1; -layout(constant_id = 4) const float offset = 0; - -void main() { - const int i = ivec3(gl_GlobalInvocationID).x; - - const int base = 4 * i; - if (base < params.len) { - buffer_in.data[i] = scale * (VEC4_T(base) + VEC4_T(0, 1, 2, 3)) + offset; - } -} diff --git a/backends/vulkan/test/glsl/fill_texture__test.glsl b/backends/vulkan/test/glsl/fill_texture__test.glsl deleted file mode 100644 index 76c630de55e..00000000000 --- a/backends/vulkan/test/glsl/fill_texture__test.glsl +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} uOutput; -layout(set = 0, binding = 1) uniform PRECISION restrict Block { - ivec3 size; - int fill; - vec4 vals; -} params; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, params.size))) { - return; - } - - imageStore(uOutput, pos, params.vals); -} diff --git a/backends/vulkan/test/glsl/idx_fill_buffer.glsl b/backends/vulkan/test/glsl/idx_fill_buffer.glsl deleted file mode 100644 index d32c52c205e..00000000000 --- a/backends/vulkan/test/glsl/idx_fill_buffer.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_buffer(0, "w", "out_buf", DTYPE, PRECISION, True)} -${layout_declare_ubo(1, "int", "numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int t_id = ivec3(gl_GlobalInvocationID).x; - if (t_id >= numel) { - return; - } - - out_buf[t_id] = T(t_id); -} diff --git a/backends/vulkan/test/glsl/idx_fill_texture.glsl b/backends/vulkan/test/glsl/idx_fill_texture.glsl deleted file mode 100644 index 8914d2b8925..00000000000 --- a/backends/vulkan/test/glsl/idx_fill_texture.glsl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
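fill_buffer.glsl above just writes an index ramp; as an illustration, a CPU-side sketch of the expected contents follows, assuming the shader's default specialization constants (scale = 1, offset = 0) and ignoring the up-to-3 elements of texel padding that may be written past len:

```
#include <cstdint>
#include <vector>

// Element j of the filled buffer holds scale * j + offset, matching the
// vec4-at-a-time writes in fill_buffer.glsl (base = 4 * i, plus {0, 1, 2, 3}).
std::vector<float> fill_buffer_reference(
    int32_t len,
    float scale = 1.0f,
    float offset = 0.0f) {
  std::vector<float> data(len);
  for (int32_t j = 0; j < len; ++j) {
    data[j] = scale * static_cast<float>(j) + offset;
  }
  return data;
}
```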
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "image_out", DTYPE, "texture3d")} -${layout_declare_ubo(1, "ivec4", "sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; -layout(constant_id = 4) const int offset = 10; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); - VEC4_T texel = VEC4_T(buf_indices) + offset; - imageStore(image_out, pos, texel); -} diff --git a/backends/vulkan/test/glsl/indexing_utils.h b/backends/vulkan/test/glsl/indexing_utils.h deleted file mode 100644 index 8563daaa5fb..00000000000 --- a/backends/vulkan/test/glsl/indexing_utils.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// Width Dim Index, assuming (W, H, C, N) order -#define W_DIM 0 -// Height, assuming (W, H, C, N) order -#define H_DIM 1 -// Channels, assuming (W, H, C, N) order -#define C_DIM 2 - -/* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. - */ -#define BATCH_AXIS 2 - -// -// Basic Indexing Utility Macros and Functions -// - -/* - * Aligns input to the next multiple of 4 - */ -#define alignup4(x) ((x + 3) & -4) - -// -// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion -// - -/* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Output: A ivec4 containing the buffer indices corresponding to each texel - * element. - */ -ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { - ivec4 strides = - ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - - int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + - idx.w * strides.w; - - return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; -} - -// -// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion -// - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim - * is packed along a texel - * Output: Whether the texel position is outside the bounds of the image texture - * given the size and packed dimension of the tensor. 
- */ -bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 max_pos = sizes.xyz; - max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS]; - max_pos[packed_dim] /= 4; - return (any(greaterThanEqual(pos, max_pos))); -} - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, - * which dim is packed along a texel - * Returns: the (w, h, c, n) tensor index cooresponding to the first element of - * the texel at the specified position - */ -ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // Packed dim contains 4 elements per texel - pos[packed_dim] *= 4; - // Construct the initial tensor index via swizzling -#if BATCH_AXIS == 2 - ivec4 tensor_idx = pos.xyzz; -#endif -#if BATCH_AXIS == 1 - ivec4 tensor_idx = pos.xyzy; -#endif -#if BATCH_AXIS == 0 - ivec4 tensor_idx = pos.xyzx; -#endif - // Adjust the axis that the batch dim runs along - tensor_idx[3] /= sizes[BATCH_AXIS]; - tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS]; - - return tensor_idx; -} diff --git a/backends/vulkan/test/glsl/reference_matmul.glsl b/backends/vulkan/test/glsl/reference_matmul.glsl deleted file mode 100644 index 4d4e0ae8734..00000000000 --- a/backends/vulkan/test/glsl/reference_matmul.glsl +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION highp - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", "float", "buffer")} -${layout_declare_tensor(1, "r", "t_mat1", "float", "buffer")} -${layout_declare_tensor(2, "r", "t_mat2", "float", "buffer")} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "out_strides")} -${layout_declare_ubo(5, "ivec4", "mat1_sizes")} -${layout_declare_ubo(6, "ivec4", "mat1_strides")} -${layout_declare_ubo(7, "ivec4", "mat2_sizes")} -${layout_declare_ubo(8, "ivec4", "mat2_strides")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "reference_matmul_common_buffer.glslh" - -void main() { - const ivec2 out_idx = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y); - if (any(greaterThanEqual(out_idx, out_sizes.xy))) { - return; - } - - // Initial idx for mat1 is (0, out_idx.y) - int mat1_id = out_idx.y * mat1_strides.y; - // Initial idx for mat2 is (out_idx.x, 0) - int mat2_id = out_idx.x * mat2_strides.x; - - float sum = 0.0; - for (int i = 0; i < mat1_sizes.x; ++i) { - sum += perform_dot_product(out_idx.y, out_idx.x, i); - } - - const int out_id = out_idx.x * out_strides.x + out_idx.y * out_strides.y; - t_out[out_id] = sum; -} diff --git a/backends/vulkan/test/glsl/reference_matmul_common.glslh b/backends/vulkan/test/glsl/reference_matmul_common.glslh deleted file mode 100644 index 2f22b588b75..00000000000 --- a/backends/vulkan/test/glsl/reference_matmul_common.glslh +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef MATMUL_COMMON_${STORAGE}_H -#define MATMUL_COMMON_${STORAGE}_H - -$if STORAGE == "buffer": - float perform_dot_product( - const uint out_row, - const uint out_col, - const uint k) { - const uint mat1_bufi = out_row * mat1_strides.y + k * mat1_strides.x; - const uint 
mat2_bufi = k * mat2_strides.y + out_col * mat2_strides.x; - - return t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - } -$else: - vec4 perform_dot_product( - const uint out_row, - const uint out_col, - const uint k) { - vec4 mat1_tex = texelFetch(t_mat1, ivec3(k, out_row, 0), 0); - vec4 mat2_tex = texelFetch(t_mat2, ivec3(out_col, k, 0), 0); - - return dot(mat1_tex, mat2_tex); - } - -#endif diff --git a/backends/vulkan/test/glsl/reference_matmul_common.yaml b/backends/vulkan/test/glsl/reference_matmul_common.yaml deleted file mode 100644 index d19bbabf0d1..00000000000 --- a/backends/vulkan/test/glsl/reference_matmul_common.yaml +++ /dev/null @@ -1,9 +0,0 @@ -reference_matmul_common: - parameter_names_with_default_values: - STORAGE: buffer - generate_variant_forall: - STORAGE: - - VALUE: buffer - - VALUE: texture3d - shader_variants: - - NAME: reference_matmul_common diff --git a/backends/vulkan/test/glsl/scalar_add_buffer.glsl b/backends/vulkan/test/glsl/scalar_add_buffer.glsl deleted file mode 100644 index cd3a85a1655..00000000000 --- a/backends/vulkan/test/glsl/scalar_add_buffer.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_required_extensions(DTYPE)} - -#define T ${buffer_scalar_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "rw", "buffer_in", DTYPE, "buffer")} -${layout_declare_ubo(1, "int", "numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const float scalar = 2.0; - -void main() { - const int t_id = ivec3(gl_GlobalInvocationID).x; - if (t_id >= numel) { - return; - } - - buffer_in[t_id] = buffer_in[t_id] + T(scalar); -} diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl deleted file mode 100644 index 992907d0c25..00000000000 --- a/backends/vulkan/test/glsl/scalar_add_texture.glsl +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} -${layout_declare_ubo(1, "ivec3", "extents")} -${layout_declare_ubo(2, "int", "scalar")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, extents))) { - return; - } - - vec4 in_tex = imageLoad(t_in, pos); - imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); -} diff --git a/backends/vulkan/test/glsl/test_shader.glsl b/backends/vulkan/test/glsl/test_shader.glsl deleted file mode 100644 index 4804528346d..00000000000 --- a/backends/vulkan/test/glsl/test_shader.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
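The buffer variant of reference_matmul above is a plain row-by-column dot product. As a cross-check, here is a contiguous row-major C++ equivalent; the shader reads its strides from UBOs, so contiguous layout is an assumption of this sketch:

```
#include <cstddef>
#include <vector>

// out[row][col] = sum_k mat1[row][k] * mat2[k][col], mirroring
// perform_dot_product() accumulated over mat1_sizes.x in the shader.
std::vector<float> reference_matmul_cpu(
    const std::vector<float>& mat1, // M x K, row-major
    const std::vector<float>& mat2, // K x N, row-major
    size_t M,
    size_t K,
    size_t N) {
  std::vector<float> out(M * N, 0.0f);
  for (size_t row = 0; row < M; ++row) {
    for (size_t col = 0; col < N; ++col) {
      float sum = 0.0f;
      for (size_t k = 0; k < K; ++k) {
        sum += mat1[row * K + k] * mat2[k * N + col];
      }
      out[row * N + col] = sum;
    }
  }
  return out;
}
```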
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION restrict Block { - ivec4 size; -} uBlock; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (all(lessThan(pos, uBlock.size.xyz))) { - const vec4 intex = texelFetch(uInput, pos, 0); - imageStore( - uOutput, - pos, - intex + 5); - } -} diff --git a/backends/vulkan/test/op_tests/CMakeLists.txt b/backends/vulkan/test/op_tests/CMakeLists.txt deleted file mode 100644 index 07a13c3f260..00000000000 --- a/backends/vulkan/test/op_tests/CMakeLists.txt +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -cmake_minimum_required(VERSION 3.19) -project(executorch) - -if(ANDROID) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endif() - -find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) -find_package(GTest CONFIG REQUIRED) - -if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) -endif() - -# Include this file to access executorch_target_link_options_shared_lib This is -# required to provide access to executorch_target_link_options_shared_lib which -# allows libraries to be linked with the --whole-archive flag. This is required -# for libraries that perform dynamic registration via static initialization. -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - -get_torch_base_path(TORCH_BASE_PATH) -if(NOT TORCH_INSTALL_PREFIX) - set(TORCH_INSTALL_PREFIX ${TORCH_BASE_PATH}) -endif() - -# libtorch is needed for Vulkan correctness tests -find_library(LIB_TORCH torch HINTS ${TORCH_INSTALL_PREFIX}/lib) -find_library(LIB_TORCH_CPU torch_cpu HINTS ${TORCH_INSTALL_PREFIX}/lib) -find_library(LIB_C10 c10 HINTS ${TORCH_INSTALL_PREFIX}/lib) - -# Third party include paths - -set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party) - -set(GTEST_INCLUDE_PATH - ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include -) -set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) -set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) -set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - -set(COMMON_INCLUDES - ${EXECUTORCH_ROOT}/.. 
- ${VULKAN_HEADERS_PATH} - ${VOLK_PATH} - ${VMA_PATH} - ${GTEST_INCLUDE_PATH} - ${TORCH_BASE_PATH}/include - ${TORCH_BASE_PATH}/include/torch/csrc/api/include -) - -executorch_target_link_options_shared_lib(vulkan_backend) - -function(vulkan_op_test test_name test_src) - set(extra_deps ${ARGN}) - - add_executable(${test_name} ${test_src}) - target_include_directories(${test_name} PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - ${test_name} - PRIVATE GTest::gtest_main - vulkan_backend - executorch_core - ${LIB_TORCH} - ${LIB_TORCH_CPU} - ${LIB_C10} - ${extra_deps} - ) - - add_test(${test_name} ${test_name}) -endfunction() - -if(TARGET vulkan_backend AND LIB_TORCH) - add_library(test_utils ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cpp) - target_include_directories(test_utils PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - test_utils PRIVATE vulkan_backend ${LIB_TORCH} ${LIB_TORCH_CPU} - ) - - find_library( - CUSTOM_OPS_LIB custom_ops_aot_lib - HINTS ${CMAKE_INSTALL_PREFIX}/executorch/extension/llm/custom_ops - ) - if(CUSTOM_OPS_LIB) - vulkan_op_test( - vulkan_sdpa_test ${CMAKE_CURRENT_SOURCE_DIR}/sdpa_test.cpp - ${CUSTOM_OPS_LIB} test_utils - ) - else() - message( - STATUS "Skip building sdpa_test because custom_ops_aot_lib is not found" - ) - endif() - vulkan_op_test( - vulkan_rope_test ${CMAKE_CURRENT_SOURCE_DIR}/rotary_embedding_test.cpp - test_utils - ) - vulkan_op_test( - quantized_linear_test ${CMAKE_CURRENT_SOURCE_DIR}/quantized_linear_test.cpp - test_utils - ) - - # Only build generated op tests if a path to tags.yaml and - # native_functions.yaml is provided. These files are required for codegen. - if(TORCH_OPS_YAML_PATH) - set(GENERATED_VULKAN_TESTS_CPP_PATH ${CMAKE_CURRENT_BINARY_DIR}/vk_gen_cpp) - - # Generated operator correctness tests - - set(generated_test_cpp ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_tests.cpp) - - add_custom_command( - COMMENT "Generating Vulkan operator correctness tests" - OUTPUT ${generated_test_cpp} - COMMAND - ${PYTHON_EXECUTABLE} - ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_correctness_tests.py - -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path - ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path - ${TORCH_OPS_YAML_PATH}/native_functions.yaml - DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py - ) - - vulkan_op_test(vulkan_op_correctness_tests ${generated_test_cpp}) - - # Generated operator benchmarks (only built in google benchmark is - # installed) - find_package(benchmark CONFIG) - - if(benchmark_FOUND) - set(generated_benchmark_cpp - ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_benchmarks.cpp - ) - - add_custom_command( - COMMENT "Generating Vulkan operator benchmarks" - OUTPUT ${generated_benchmark_cpp} - COMMAND - ${PYTHON_EXECUTABLE} - ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_benchmarks.py - -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path - ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path - ${TORCH_OPS_YAML_PATH}/native_functions.yaml - DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py - ) - - vulkan_op_test(vulkan_op_benchmarks ${generated_benchmark_cpp}) - endif() - else() - message( - STATUS - "Skipping generated operator correctness tests and benchmarks. Please specify TORCH_OPS_YAML_PATH to build these tests." 
- ) - endif() -endif() diff --git a/backends/vulkan/test/op_tests/TARGETS b/backends/vulkan/test/op_tests/TARGETS deleted file mode 100644 index e84397dc20e..00000000000 --- a/backends/vulkan/test/op_tests/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py deleted file mode 100644 index 8c5d0c4797b..00000000000 --- a/backends/vulkan/test/op_tests/cases.py +++ /dev/null @@ -1,1949 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import itertools - -from collections import namedtuple -from typing import Callable - -from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite - - -# Prime numbers dim sizes for testing -XL = 113 -L = 89 -M2 = 41 -M1 = 37 -M = 29 -S2 = 11 -S1 = 7 -S = 5 -XS = 3 - -test_suites = {} - - -def register_test_suite(aten_op): - def test_suite_decorator(fn: Callable) -> Callable: - if isinstance(aten_op, str): - test_suites[aten_op] = fn() - elif isinstance(aten_op, list): - for op in aten_op: - test_suites[op] = fn() - return fn - - return test_suite_decorator - - -@register_test_suite( - ["aten.add.Tensor", "aten.sub.Tensor", "aten.div.Tensor", "aten.mul.Tensor"] -) -def get_binary_elementwise_inputs(): - test_suite = VkTestSuite( - [ - ((M1, M2), (M1, M2)), - ((M1, M2), (M1, 1), 2.0), - ((M1, M2), (1, M2)), - ((S, S1, S2), (S, S1, S2)), - ((S, S1, S2), (S, S1, 1), 2.0), - ((S, S1, S2), (S, 1, S2), 2.0), - ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), - ((3, 64, 1), (1, 64, 1)), - ] - ) - test_suite.storage_types = [ - "utils::kBuffer", - "utils::kTexture3D", - ] - - highdim_test_suite = VkTestSuite( - [ - ((4, 5, 8, 1, 2, 1), (4, 5, 8, 1, 1, 1)), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - - return [test_suite, highdim_test_suite] - - -# Eq requires a different test generator so it was split from the other test case. 
-@register_test_suite( - [ - "aten.eq.Tensor", - "aten.gt.Tensor", - "aten.lt.Tensor", - "aten.ge.Tensor", - "aten.le.Tensor", - ] -) -def get_binary_elementwise_compare_inputs(): - test_suite = VkTestSuite( - [ - ((M1, M2), (M1, M2)), - ((M1, M2), (M1, 1), 2.0), - ((M1, M2), (1, M2)), - ((S, S1, S2), (S, S1, S2)), - ((S, S1, S2), (S, S1, 1), 2.0), - ((S, S1, S2), (S, 1, S2), 2.0), - ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), - ((3, 64, 1), (1, 64, 1)), - ] - ) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kBuffer", - "utils::kTexture3D", - ] - test_suite.data_gen = "make_casted_randint_tensor" - return test_suite - - -@register_test_suite("aten.mm.default") -def get_mm_inputs(): - test_suite = VkTestSuite( - [ - ((M1, L), (L, M2)), - ((S1, S2), (S2, M)), - ((6, 32), (32, 64)), - ], - ) - test_suite.prepacked_args = ["mat2"] - # ATen matmul doesn't support half - test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.bmm.default") -def get_bmm_inputs(): - test_suite = VkTestSuite( - [ - ((S, M1, L), (S, L, M2)), - ((M, S1, S2), (M, S2, M)), - ((4, 6, 32), (4, 32, 16)), - ], - ) - test_suite.prepacked_args = ["mat2"] - # ATen matmul doesn't support half - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.addmm.default") -def get_addmm_inputs(): - test_suite = VkTestSuite( - [ - ((1, S), (S1, S), (S, S), 1.0, 1.5), - ((S, 1), (S, S1), (S1, S1), 1.0, 1.0), - ((M1, M2), (M1, M2), (M2, M2)), - ((M1, M2), (M1, M2), (M2, M2), 4.2, 2.3), - ((M1, 1), (M1, L), (L, L), 2.0, 3.0), - ((M2), (M1, M2), (M2, M2)), - ((6, M2), (6, M2), (M2, M2)), - ] - ) - # ATen matmul doesn't support half - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -common_MKN_list = [ - (S2, M2, M1), - (L, L, M1), -] - - -@register_test_suite("aten.linear.default") -def get_linear_inputs(): - MKN_list = common_MKN_list - - inputs_list = [((M, K), (N, K), None) for M, K, N in MKN_list] - inputs_list += [((M, K), (N, K), (N)) for M, K, N in MKN_list] - inputs_list += [((3, M, K), (N, K), None) for M, K, N in MKN_list] - inputs_list += [((3, M, K), (N, K), (N)) for M, K, N in MKN_list] - inputs_list += [((3, 6, K), (N, K), (N)) for M, K, N in MKN_list] - - test_suite = VkTestSuite(inputs_list) - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - return test_suite - - -@register_test_suite("aten._weight_int8pack_mm.default") -def get_weight_int8pack_mm_inputs(): - MKN_list = [ - [1, 480, 256], - [1, 1024, 1024], - [1, 1024, 256], - [3, 480, 256], - [6, 480, 256], - [6, 256, 1024], - [6, 1024, 256], - [6, 256, 256], - [6, 256, 512], - [4, 768, 4096], - [1024, 1024, 1024], - ] - - inputs_list = [((M, K), (N, K), (N)) for M, K, N in MKN_list] - - test_suite = VkTestSuite(inputs_list) - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = ["utils::kWidthPacked"] - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.prepacked_args = ["mat2", "scales"] - test_suite.requires_prepack = True - - 
test_suite.arg_dtype["mat2"] = "at::kChar" - test_suite.arg_data_range["mat2"] = (0, 100) - - test_suite.arg_data_range["scales"] = (0.0008, 0.001) - - return test_suite - - -@register_test_suite("aten.avg_pool2d.default") -def get_avg_pool2d_inputs(): - Test = namedtuple( - "VkAvgPoolTest", - [ - "self", - "kernel_size", - "stride", - "padding", - "ceil_mode", - "count_include_pad", - "divisor_override", - ], - ) - - test_cases = [] - for ceil_mode in [True, False]: - for count_include_pad in [True, False]: - for divisor_override in [None, 5]: - test_cases += [ - Test( - self=(S, M1, M2), - kernel_size=[2, 2], - stride=[1, 1], - padding=[0, 0], - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - divisor_override=divisor_override, - ), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite( - ["aten.max_pool2d_with_indices.default", "aten.max_pool2d.default"] -) -def get_max_pool2d_inputs(): - test_suite = VkTestSuite( - [ - ((1, 7, 89, 77), [2, 2], [1, 1], [0, 0], [1, 1]), - ] - ) - return test_suite - - -@register_test_suite("aten.convolution.default") -def get_conv_inputs(): - Test = namedtuple( - "ConvTest", - [ - "self", - "weight", - "bias", - "stride", - "padding", - "dilation", - "transposed", - "output_padding", - "groups", - ], - ) - Test.__new__.__defaults__ = ( - None, - None, - None, - [1, 1], - [0, 0], - [1, 1], - False, - [9, 0], - 1, - ) - - test_cases = [ - Test( - self=(1, 64, 256, 256), - weight=(64, 32, 3, 3), - bias=None, - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=2, - ), - Test( - self=(1, 16, 3, 3), - weight=(16, 8, 3, 3), - bias=None, - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=2, - ), - Test( - self=(1, 6, 40, 50), - weight=(8, 6, 3, 3), - bias=(8,), - stride=[1, 2], - padding=[2, 3], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 6, 40, 50), - weight=(6, 8, 3, 3), - bias=(8,), - stride=[1, 2], - padding=[2, 3], - dilation=[1, 1], - transposed=True, - output_padding=[0, 1], - groups=1, - ), - Test( - self=(1, 6, 40, 50), - weight=(8, 6, 3, 3), - bias=None, - stride=[1, 2], - padding=[2, 3], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 6, 7), - weight=(6, 1, 3), - bias=(6,), - stride=[1], - padding=[0], - dilation=[1], - transposed=False, - output_padding=[0], - groups=6, - ), - Test( - self=(2, 20, 30), - weight=(10, 4, 6), - bias=(10,), - stride=[5], - padding=[5], - dilation=[3], - transposed=False, - output_padding=[0], - groups=5, - ), - Test( - self=(1, 9, 11), - weight=(9, 1, 3), - bias=None, - stride=[1], - padding=[0], - dilation=[1], - transposed=False, - output_padding=[0], - groups=9, - ), - Test( - self=(5, 15, 30), - weight=(20, 3, 3), - bias=None, - stride=[3], - padding=[5], - dilation=[7], - transposed=False, - output_padding=[0], - groups=5, - ), - Test( - self=(1, 8, 90, 77), - weight=(1, 8, 3, 3), - bias=(1,), - stride=[1, 1], - padding=[2, 2], - dilation=[2, 2], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - ] - - test_cases_pw = [ - Test( - self=(1, 16, 3, 5), - weight=(4, 16, 1, 1), - bias=(4,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(4, 5, 1, 1), - bias=(4,), - stride=[1, 1], 
- padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[1, 1], - padding=[1, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[1, 1], - padding=[0, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[2, 1], - padding=[1, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 8, 72, 96), - weight=(8, 8, 1, 1), - bias=(8,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[2, 2], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[4, 4], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[1, 1], - padding=[4, 4], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 672, 512), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - ] - - test_cases_dw = [ - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 3, 3), - bias=(XS,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 5, 5), - bias=(XS,), - stride=[1, 1], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 3, 3), - bias=(XS,), - stride=[2, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 5, 5), - bias=(XS,), - stride=[1, 2], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, S2, S, S1), - weight=(S2, 1, 3, 3), - bias=(S2,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=S2, - ), - Test( - self=(1, S2, S, S1), - weight=(S2, 1, 5, 5), - bias=(S2,), - stride=[1, 1], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=S2, - ), - Test( - self=(1, 8, 72, 96), - weight=(8, 1, 3, 3), - bias=(8,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=8, - ), - Test( - self=(1, 8, 72, 96), - weight=(8, 1, 5, 5), - bias=(8,), - stride=[1, 1], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=8, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[2, 1], - 
padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[1, 2], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[2, 2], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - ] - - test_suite = VkTestSuite(test_cases) - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - - test_suite_pw = VkTestSuite(test_cases_pw) - test_suite_pw.layouts = [ - "utils::kChannelsPacked", - ] - test_suite_pw.test_name_suffix = "pw" - - test_suite_dw = VkTestSuite(test_cases_dw) - test_suite_dw.layouts = [ - "utils::kChannelsPacked", - ] - test_suite_dw.test_name_suffix = "dw" - return [test_suite, test_suite_pw, test_suite_dw] - - -@register_test_suite("aten.native_layer_norm.default") -def get_native_layer_norm_inputs(): - test_suite = VkTestSuite( - [ - ((S1, S2), [S2], (S2), (S2), 0.001), - ((M, M1, M2), [M2], (M2), (M2), 0.001), - ((S, XL, M1, M2), [M2], (M2), (M2), 0.001), - ] - ) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.native_group_norm.default") -def get_native_group_norm_inputs(): - test_suite = VkTestSuite( - [ - # (input_shape, weight_shape, bias_shape, N, C, HxW, group, eps) - # General test cases - ((1, 8, 4, 4), (8), (8), 1, 8, 16, 2, 0.001), - ((2, 8, 3, 3), (8), (8), 2, 8, 9, 4, 0.001), - ((1, 12, 2, 2), (12), (12), 1, 12, 4, 3, 0.001), - ((3, 16, 5, 5), (16), (16), 3, 16, 25, 8, 0.001), - ((3, 16, 13, 17), (16), (16), 3, 16, 13 * 17, 4, 0.001), - ((1, 4, 7, 7), (4), (4), 1, 4, 49, 2, 0.001), - ((2, 6, 1, 8), (6), (6), 2, 6, 8, 3, 0.001), - # Single group and prime number sizes - ((3, 7, 13, 11), (7), (7), 3, 7, 13 * 11, 1, 0.001), - # Each channel is it's own group and prime number sizes - ((1, 7, 13, 11), (7), (7), 1, 7, 13 * 11, 7, 0.001), - ] - ) - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kTexture3D", - ] - test_suite.dtypes = [ - "at::kFloat", - "at::kHalf", - ] - test_suite.arg_storage_types = { - "out": [None, "utils::kBuffer", "utils::kBuffer"], - } - - test_suite.prepacked_args = ["weight", "bias"] - test_suite.requires_prepack = True - - return test_suite - - -def get_upsample_inputs(): - inputs_list = [ - # (input tensor shape, output 2D image size (H, W), output scaling factors) - ((2, 2, 2, 2), None, [1, 1]), - ((1, 1, 2, 2), None, [2, 2]), - ((1, 1, 2, 2), None, [2, 4]), - ((1, 1, 2, 2), None, [4, 2]), - ((1, 1, 2, 2), [2, 2], None), - ((1, 1, 2, 2), [2, 4], None), - ((1, 1, 2, 2), [3, 2], None), - ] - return inputs_list - - -@register_test_suite("aten.upsample_nearest2d.vec") -def get_upsample_nearest2d_inputs(): - inputs_list = get_upsample_inputs() - return VkTestSuite(inputs_list) - - -@register_test_suite("aten.upsample_bilinear2d.vec") -def get_upsample_bilinear2d_inputs(): - base_inputs_list = get_upsample_inputs() - inputs_list = [] - for input_case in base_inputs_list: - inputs_list.append((input_case[0], input_case[1], False, input_case[2])) - inputs_list.append((input_case[0], input_case[1], True, input_case[2])) - return VkTestSuite(inputs_list) - - -@register_test_suite(["aten.full.default", "aten.full_like.default"]) -def get_full_inputs(): - test_suite = VkTestSuite( - [ - ([S1, S2], 42.0), - ([M, 
M1, M2], 3.14), - ([L, M, M1, M2], 2.72), - ] - ) - return test_suite - - -@register_test_suite("aten.scalar_tensor.default") -def get_scalar_tensor_inputs(): - test_suite = VkTestSuite( - [ - (42.0,), - (3.14,), - (2.72,), - (0.0,), - (-1.0,), - (100.0,), - ] - ) - return test_suite - - -@register_test_suite( - [ - "aten.zeros.default", - "aten.zeros_like.default", - "aten.ones.default", - "aten.ones_like.default", - ] -) -def get_ones_inputs(): - test_suite = VkTestSuite( - [ - ([S1, S2]), - ([M, M1, M2]), - ([L, M, M1, M2]), - ] - ) - return test_suite - - -@register_test_suite(["aten.select.int", "aten.select_copy.int"]) -def get_select_int_inputs(): - test_suite = VkTestSuite( - [ - ((8, 8, 8), 0, -2), - ((8, 8, 8), 1, -3), - ((8, 8, 8), 2, -4), - ((6, 2, 7), 1, 0), - ((6, 2, 7), 2, 3), - ((6, 10, 7), 0, 3), - ((6, 10, 7), 1, 0), - ((6, 10, 7), 1, 9), - ((6, 10, 7), 2, 6), - ((9, 2, 9, 4), 0, 8), - ((9, 2, 9, 4), 1, 1), - ((9, 2, 9, 4), 2, 0), - ((9, 2, 9, 4), 2, 8), - ((9, 2, 9, 4), 3, 3), - ((8, 6, 1, 1), 0, 4), - ((8, 6, 1, 1), 1, 4), - ] - ) - test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - test_suite.dtypes = ["at::kFloat"] - test_suite.data_gen = "make_seq_tensor" - return test_suite - - -@register_test_suite(["aten.permute.default", "aten.permute_copy.default"]) -def get_permute_inputs(): - batch_tests = [ - ((9, 2, 5, 7), out_axis) for out_axis in itertools.permutations([0, 1, 2, 3]) - ] - channel_tests = [ - ((9, 2, 5), out_axis) for out_axis in itertools.permutations([0, 1, 2]) - ] - wh_tests = [((9, 2), out_axis) for out_axis in itertools.permutations([0, 1])] - test_suite = VkTestSuite(batch_tests + channel_tests + wh_tests) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kBuffer", - "utils::kTexture3D", - ] - test_suite.dtypes = [ - "at::kFloat", - ] - return test_suite - - -@register_test_suite("aten.view_copy.default") -def get_view_inputs(): - test_suite = VkTestSuite( - [ - ((3, 4, 5), [1, 1, -1]), - ((3, 4, 5), [1, -1, 1]), - ((3, 4, 5), [-1, 1, 1]), - ((8, 7, 2, 3), [4, 3, 7, 4]), - ((8, 7, 2, 3), [7, -1, 2, 1]), - ((8, 7, 2, 3), [1, 1, 1, -1]), - ((8, 7, 2, 3), [-1]), - ((2, 3, 3, 7), [2, -1, 1, 1]), - ((3, 5, 2, 7), [7, -1, 2, 1]), - ((2, 2, 8, 6), [2, 6, -1, 1]), - ((2, 2, 8, 6), [6, -1, 1]), - ((S1, S2, S1, S2), [S2, -1, 1, S1]), - ((S1, S2, S1, S2), [S1, 1, -1, S2]), - ((S1, S2, S1, S2), [-1, 1, S1, S2]), - ] - ) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - - highdim_test_suite = VkTestSuite( - [ - ((1, 1, 3, 3, 3), (9, 3)), - ((2, 3, 4, 6, 5, 4), (6, 4, 6, 5, 4)), - ((2, 3, 3, 7, 8), (2, 3, 3, 8 * 7)), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - highdim_test_suite.data_gen = "make_seq_tensor" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - # "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.slice_copy.Tensor") -def get_slice_out_inputs(): - Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) - Test.__new__.__defaults__ = (None, 0, None, None, 1) - - # Slice by width and height - test_cases = [ - Test(self=[1, 1, 4, 10], dim=3, start=3), - Test(self=[1, 1, 4, 10], dim=3, 
start=3, step=2), - Test(self=[1, 1, 4, 10], dim=3, start=3, end=4, step=2), - Test(self=[1, 1, 4, 10], dim=2, start=3), - Test(self=[9, 9, 9, 9], dim=2, start=0, end=9, step=1), - Test(self=[9, 9, 9, 9], dim=2, start=1, end=8, step=1), - Test(self=[9, 9, 9, 9], dim=2, start=1, end=2, step=1), - Test(self=[9, 9, 9, 9], dim=3, start=1, end=5, step=1), - Test(self=[9, 9, 9, 9], dim=3, start=1, end=5, step=2), - Test(self=[9, 9, 9, 9], dim=-1, start=1, end=5, step=2), - Test(self=[9, 9, 9, 9], dim=-2, start=1, end=5, step=2), - Test(self=[9, 9, 9], dim=1, start=2, step=1), - Test(self=[9, 9, 9], dim=1, start=2, step=2), - Test(self=[9, 9, 9], dim=2, start=2, step=1), - Test(self=[9, 9, 9], dim=2, start=2, step=2), - Test(self=[9, 9], dim=0, start=2, step=1), - Test(self=[9, 9], dim=0, start=2, step=2), - Test(self=[9, 9], dim=1, start=2, step=1), - Test(self=[9, 9], dim=1, start=2, step=2), - ] - - # Slice by batch - test_cases += [ - Test(self=[6, 5, 3, 2], dim=0), - Test(self=[6, 5, 3, 2], dim=0, step=2), - Test(self=[13, 13, 3, 2], dim=0, step=2), - Test(self=[13, 13, 3, 2], dim=0, start=1, step=2), - Test(self=[13, 13, 3, 2], dim=0, start=1, step=5), - Test(self=[13, 13, 3, 2], dim=0, start=1, step=20), - Test(self=[13, 2, 3, 2], dim=0, start=1, step=2), - Test(self=[13, 2, 3, 2], dim=0, start=1, step=5), - Test(self=[13, 2, 3, 2], dim=0, start=1, step=20), - ] - - # Slice by channel - test_cases += [ - Test(self=[2, 5, 1, 10], dim=1), - Test(self=[2, 5, 1, 10], dim=1, start=1), - Test(self=[2, 5, 1, 10], dim=1, start=1, step=2), - Test(self=[5, 13, 1, 10], dim=1), - Test(self=[5, 13, 1, 10], dim=1, start=1), - Test(self=[5, 13, 1, 10], dim=1, start=1, step=2), - Test(self=[5, 13, 1, 10], dim=1, start=1, step=5), - Test(self=[5, 13, 1, 10], dim=1, start=1, step=20), - Test(self=[13, 1, 10], dim=0), - Test(self=[13, 1, 10], dim=0, start=1), - Test(self=[13, 1, 10], dim=0, start=1, step=2), - Test(self=[13, 1, 10], dim=0, start=1, step=5), - Test(self=[13, 1, 10], dim=0, start=1, step=20), - ] - - # Slice by negative/unspecified indices - INT64_MAX = 9223372036854775807 # represents arr[:] - test_cases += [ - Test(self=[8, 9], dim=0, start=-2, step=1), - Test(self=[8, 9], dim=0, start=-2, step=2), - Test(self=[8, 9], dim=0, end=-2, step=1), - Test(self=[8, 9], dim=0, end=-2, step=2), - Test(self=[8, 9], dim=0, end=INT64_MAX, step=1), - Test(self=[8, 9], dim=0, end=INT64_MAX, step=2), - Test(self=[8, 9], dim=1, start=-2, step=1), - Test(self=[8, 9], dim=1, start=-2, step=2), - Test(self=[8, 9], dim=1, end=-2, step=1), - Test(self=[8, 9], dim=1, end=-2, step=2), - Test(self=[8, 9], dim=1, end=INT64_MAX, step=1), - Test(self=[8, 9], dim=1, end=INT64_MAX, step=2), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat", "at::kHalf"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - return test_suite - - -def get_slice_view_inputs(): - Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) - Test.__new__.__defaults__ = (None, 0, None, None, 1) - - # Slice by channel - test_cases = [ - Test(self=[1, 17, 1, 10], dim=1, start=0, end=4), - Test(self=[1, 17, 1, 10], dim=1, start=0, end=8), - Test(self=[1, 17, 3, 7], dim=1, start=0, end=12), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - test_suite.layouts 
= ["utils::kWidthPacked"] - test_suite.data_gen = "make_seq_tensor" - test_suite.is_view_op = True - - return test_suite - - -@register_test_suite(["aten.slice.Tensor"]) -def get_slice_inputs(): - texture_test_suite = get_slice_out_inputs() - texture_test_suite.test_name_suffix = "no_view" - - view_test_suite = get_slice_view_inputs() - view_test_suite.test_name_suffix = "view" - - return [view_test_suite, texture_test_suite] - - -@register_test_suite(["aten.transpose.int"]) -def get_transpose_inputs(): - Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) - Test.__new__.__defaults__ = (None, 0, 1) - - test_cases = [ - Test(self=[M1, M2], dim0=0, dim1=1), - Test(self=[M1, S2, M], dim0=0, dim1=1), - Test(self=[M1, S2, M], dim0=0, dim1=2), - Test(self=[M1, S2, M], dim0=2, dim1=1), - Test(self=[S, M, S2, M2], dim0=3, dim1=2), - Test(self=[S, M, S2, M2], dim0=1, dim1=2), - Test(self=[S, M, S2, M2], dim0=3, dim1=1), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] - test_suite.data_gen = "make_seq_tensor" - test_suite.is_view_op = True - return test_suite - - -@register_test_suite("aten.index_select.default") -def get_index_select_inputs(): - Test = namedtuple("VkIndexSelectTest", ["self", "dim", "index"]) - Test.__new__.__defaults__ = (None, 0, None) - - test_cases = [] - - for i in range(4): - test_cases += [ - Test(self=[9, 9, 9, 9], dim=i, index=[0]), - Test(self=[9, 9, 9, 9], dim=i, index=[2]), - Test(self=[9, 9, 9, 9], dim=i, index=[0, 2]), - Test(self=[9, 9, 9, 9], dim=i, index=[3, 1]), - Test(self=[9, 9, 9, 9], dim=i, index=[5, 5]), - Test(self=[9, 9, 9, 9], dim=i, index=[2, 3, 4, 5, 7]), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = ["utils::kChannelsPacked"] - return test_suite - - -@register_test_suite("aten.embedding.default") -def get_embedding_inputs(): - Test = namedtuple("VkEmbeddingTest", ["weight", "indices"]) - Test.__new__.__defaults__ = (None, None) - - test_cases = [ - Test(weight=[10, 9], indices=[0, 2]), - Test(weight=[10, 9], indices=[2, 3, 4, 5, 7]), - Test(weight=[10, 9], indices=[[0, 2], [1, 4], [7, 7]]), - Test(weight=[10, 9], indices=[[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]), - Test(weight=[10, 9], indices=[[[3, 1, 4], [1, 5, 9]], [[2, 6, 5], [3, 5, 8]]]), - ] - - test_suite = VkTestSuite([tuple(tc) + (-1, "false", "false") for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = ["utils::kChannelsPacked"] - return test_suite - - -@register_test_suite("aten.unsqueeze_copy.default") -def get_unsqueeze_inputs(): - test_suite = VkTestSuite( - [ - ((2, 3, 4), 0), - ((1, 1, 1), 0), - ((1, 1, 1), 1), - ((1, 1, 1), 2), - ((1, 1, 1), 3), - ((9, 9, 9), 0), - ((9, 9, 9), 1), - ((9, 9, 9), 2), - ((9, 9, 9), 3), - ((9, 9), 0), - ((9, 9), 1), - ((9, 9), 2), - ((9,), 0), - ((9,), 1), - ((1, 10), -1), - ] - ) - - highdim_test_suite = VkTestSuite( - [ - ((2, 3, 4, 5, 6), 0), - ((2, 3, 4, 5, 6), 1), - ((2, 3, 4, 5, 6), 5), - ((2, 3, 4, 5, 6), -1), - ((2, 3, 4, 5, 6), -2), - ((1, 2, 3, 4, 5), 0), - ((1, 2, 3, 4, 5), 3), - ((1, 2, 3, 4, 5), -1), - ((2, 3, 4, 5), 0), - ((1, 2, 3, 4), 1), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - 
suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - suite.data_gen = "make_seq_tensor" - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.clone.default") -def get_clone_inputs(): - test_suite = VkTestSuite( - [ - ((S2, S1, S2, S1),), - ((S2, S1, S2),), - ((S2, S1),), - ((S2,),), - ((XS, S1, XS, S1),), - ((XS, S1, XS),), - ((S1, XS, S1),), - ((XS, S1),), - ((S1, XS),), - ((S1,),), - ((XS,),), - ] - ) - - highdim_test_suite = VkTestSuite( - [ - ((2, 3, 4, 5, 6),), - ((2, 3, 4, 5, 1),), - ((1, 1, 3, 4, 5),), - ((2, 3, 4, 5, 6, 7),), - ((1, 2, 3, 4, 5, 6),), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - "utils::kChannelsPacked", - ] - suite.data_gen = "make_seq_tensor" - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.repeat.default") -def get_repeat_inputs(): - test_suite_2d = VkTestSuite( - [ - ((2, 3), [1, 4]), - ((2, 3), [4, 1]), - ((2, 3), [4, 4]), - ((2, 3), [3, 1, 4]), - ] - ) - test_suite_2d.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite_2d.storage_types = ["utils::kTexture3D"] - test_suite_2d.data_gen = "make_seq_tensor" - test_suite_2d.dtypes = ["at::kFloat"] - test_suite_2d.test_name_suffix = "2d" - - test_suite_3d = VkTestSuite( - [ - # Repeat channels only (most challenging case) - ((3, XS, S), [2, 1, 1]), - ((7, XS, S), [4, 1, 1]), - ((1, 7, XS, S), [1, 4, 1, 1]), - ((3, 7, XS, S), [1, 4, 1, 1]), - # Repat channels with other dims - ((1, 7, XS, S), [1, 4, 1, 3]), - ((3, 7, XS, S), [1, 4, 1, 3]), - ((3, 7, XS, S), [1, 4, 3, 1]), - ((3, 7, XS, S), [1, 4, 3, 3]), - # Repeat Batch - ((3, 7, XS, S), [3, 4, 3, 3]), - ((3, 7, XS, S), [3, 1, 3, 3]), - # More other cases - ((3, 7, 1, 1), [1, 4, 1, 1]), - ((2, 3), [1, 4]), - ((2, 3), [4, 1]), - ((2, 3), [4, 4]), - ((S1, S2, S2), [1, 3, 1]), - ((S1, S2, S2), [1, 3, 3]), - ((S1, S2, S2), [3, 3, 1]), - ((S1, S2, S2), [3, 3, 3]), - ((S1, S2, S2, S2), [1, 1, 3, 1]), - ((S1, S2, S2, S2), [1, 1, 1, 3]), - ((S1, S2, S2, S2), [1, 1, 3, 3]), - ((S1, S2, S2, S2), [1, 3, 1, 3]), - ((S1, S2, S2, S2), [3, 3, 3, 3]), - ((S1, S2, S2, S2), [3, 3, 1, 1]), - # Expanding cases - ((2, 3), [3, 1, 4]), - ((2, 3), [3, 3, 2, 4]), - ] - ) - test_suite_3d.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite_3d.storage_types = ["utils::kTexture3D"] - test_suite_3d.data_gen = "make_seq_tensor" - test_suite_3d.dtypes = ["at::kFloat"] - test_suite_3d.test_name_suffix = "3d" - - return [test_suite_2d, test_suite_3d] - - -@register_test_suite("aten.repeat_interleave.self_int") -def get_repeat_interleave_inputs(): - test_suite_W = VkTestSuite( - [ - ((4, 32, 256), 3, -2), - # Test repeat on each non-packed dim - ((16, 32, 64), 5, -2), - ((16, 32, 64), 5, -3), - # Test batched inputs - ((3, 5, 32, 64), 4, -2), - ((3, 5, 32, 64), 4, -3), - ] - ) - test_suite_W.layouts = [ - "utils::kWidthPacked", - ] - test_suite_W.data_gen = "make_seq_tensor" - test_suite_W.dtypes = ["at::kFloat"] - test_suite_W.test_name_suffix = "W_packed" - - test_suite_C = VkTestSuite( - [ - # Test repeat on each non-packed dim - ((32, 32, 16), 5, -1), - ((32, 32, 16), 5, -2), - # Test batched inputs - ((3, 16, 8, 64), 4, -1), - ((3, 16, 8, 64), 4, -2), - ] - ) - test_suite_C.layouts = [ - "utils::kChannelsPacked", - ] - test_suite_C.data_gen = "make_seq_tensor" - 
test_suite_C.dtypes = ["at::kFloat"] - test_suite_C.test_name_suffix = "C_packed" - - return [test_suite_W, test_suite_C] - - -@register_test_suite("aten.cat.default") -def get_cat_inputs(): - # TensorList must be specified as list of tuples - suite_inputs = [ - # Cat on Height - ([(M, M, 3, 5), (M, M, 0, 5)], 2), - ([(S1, S1, 3, 5), (S1, S1, 0, 5)], 2), - ([(M, M, 3, 5), (M, M, 4, 5)], 2), - ([(S1, S1, 3, 5), (S1, S1, 4, 5)], 2), - ([(M2, 3, 5), (M2, 4, 5)], 1), - ([(S1, 3, 5), (S1, 4, 5)], 1), - ([(3, 5), (4, 5)], 0), - ([(3, 5), (4, 5), (1, 5)], 0), - ( - [(3, 5)], - 0, - ), - # Cat on Width - ([(M, M, 5, 3), (M, M, 5, 4)], 3), - ([(S1, S1, 5, 3), (S1, S1, 5, 4)], 3), - ([(M, 5, 3), (M, 5, 4)], 2), - ([(S1, 5, 3), (S1, 5, 4)], 2), - ([(5, 0), (5, 4)], 1), - ([(5, 3), (5, 4)], 1), - ([(5, 3), (5, 4), (5, 1)], 1), - ( - [(5, 4)], - 1, - ), - ([(5,), (6,)], 0), - # Cat on Batch - ([(M, S1, 5, 4), (M1, S1, 5, 4)], 0), - ([(S, S1, 5, 4), (S1, S1, 5, 4)], 0), - ([(S, M, 5, 4), (S1, M, 5, 4)], 0), - ([(S, XS, 5, 4), (S1, XS, 5, 4)], 0), - ([(S, S2, 5, 4), (S1, S2, 5, 4)], 0), - ( - [ - (3, 1, 2, 5), - (3, 1, 2, 5), - (3, 1, 2, 5), - ], - 0, - ), - # Cat on Channel - ([(M, 5, 4), (0, 5, 4), (M1, 5, 4)], 0), - ([(S, 5, 4), (0, 5, 4), (S2, 5, 4)], 0), - ([(M, 5, 4), (M1, 5, 4), (M2, 5, 4)], 0), - ([(S, 5, 4), (S1, 5, 4), (S2, 5, 4)], 0), - ([(XS, 5, 4), (XS, 5, 4), (S2, 5, 4)], 0), - ([(XS, S, 5, 4), (XS, S1, 5, 4), (XS, S2, 5, 4)], 1), - ([(XS, XS, 5, 4), (XS, XS, 5, 4), (XS, S2, 5, 4)], 1), - ( - [ - (XS, 1, 2, 5), - (XS, 1, 2, 5), - (XS, 1, 2, 5), - ], - 1, - ), - ] - - high_number_cat_inputs = [] - for num_input in [6, 9]: - odd_size = (3, 7, 29, 31) - even_size = (3, 8, 29, 32) - ones = (3, 1, 1, 1) - - for input_size in [odd_size, even_size, ones]: - input_sizes = [input_size] * num_input - # Test cat on height, width, and batch dim - high_number_cat_inputs.append((input_sizes, 3)) - high_number_cat_inputs.append((input_sizes, 2)) - high_number_cat_inputs.append((input_sizes, 1)) - high_number_cat_inputs.append((input_sizes, 0)) - - test_suite = VkTestSuite(suite_inputs + high_number_cat_inputs) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kTexture3D", - "utils::kBuffer", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split_with_sizes_copy.default") -def get_split_with_sizes_inputs(): - Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"]) - test_cases = [ - # Split on Width - Test(self=(S1, 7, 10, 11), sizes=[1, 3, 2, 5], dim=3), - Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3), - Test(self=(7, 10, 11), sizes=[1, 3, 2, 5], dim=2), - Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2), - Test(self=(7, 10, 11), sizes=[3, 8], dim=2), - Test(self=(7, 10, 10), sizes=[1, 9], dim=2), - Test(self=(10, 10), sizes=[1, 9], dim=1), - Test(self=(10,), sizes=[1, 9], dim=0), - # Split on Height - Test(self=(S1, 7, 11, 10), sizes=[1, 3, 2, 5], dim=2), - Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2), - Test(self=(7, 11, 10), sizes=[1, 3, 2, 5], dim=1), - Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1), - Test(self=(7, 11, 11), sizes=[3, 8], dim=1), - Test(self=(7, 10, 10), sizes=[10], dim=1), - Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1), - Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0), - # Split on Batch - Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0), - Test(self=(10, 7, 10, 10), sizes=[10], dim=0), - # 
Split on Channel - Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1), - Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1), - Test(self=(7, 13, 4, 8), sizes=[3, 2, 2, 5, 1], dim=1), - Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1), - Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0), - Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0), - Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0), - Test(self=(13, 4, 8), sizes=[13], dim=0), - ] - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split.Tensor") -def get_split_tensor_inputs(): - test_suite = VkTestSuite( - [ - # Split on Width - ((S1, 7, 10, 12), 12, 3), - ((S1, 7, 10, 12), 3, 3), - ((S1, 7, 10, 12), 1, 3), - ((7, 10, 12), 12, 2), - ((7, 10, 12), 3, 2), - ((7, 10, 12), 1, 2), - ((10, 12), 12, 1), - ((10, 12), 3, 1), - ((10, 12), 1, 1), - ((12,), 12, 0), - ((12,), 3, 0), - ((12,), 1, 0), - # Split on Height - ((S1, 7, 12, 8), 12, 2), - ((S1, 7, 12, 8), 3, 2), - ((S1, 7, 12, 8), 1, 2), - ((7, 12, 8), 12, 1), - ((7, 12, 8), 3, 1), - ((7, 12, 8), 1, 1), - ((12, 8), 12, 0), - ((12, 8), 3, 0), - ((12, 8), 1, 0), - # Split on Batch - ((12, 7, 10, 10), 12, 0), - ((12, 7, 10, 10), 3, 0), - ((12, 7, 10, 10), 1, 0), - # Split on Channel - ((7, 15, 10, 10), 15, 1), - ((7, 15, 10, 10), 5, 1), - ((7, 15, 10, 10), 3, 1), - ((7, 15, 10, 10), 1, 1), - ((15, 10, 10), 15, 0), - ((15, 10, 10), 5, 0), - ((15, 10, 10), 3, 0), - ((15, 10, 10), 1, 0), - ] - ) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -def get_reduce_inputs(is_softmax: bool = False): - bool_arg = False if is_softmax else True - return [ - ((L), 0, bool_arg), - ((L), -1, bool_arg), - ((M, L), 0, bool_arg), - ((M, L), 1, bool_arg), - ((L, M), -1, bool_arg), - ((M, L), -2, bool_arg), - ((S, S1, S2), 0, bool_arg), - ((S, S1, S2), 1, bool_arg), - ((S, S1, S2), 2, bool_arg), - ((S, S1, S2), -1, bool_arg), - ((S, S1, S2), -2, bool_arg), - ((S, S1, S2), -3, bool_arg), - ((1, S, S1, S2), 1, bool_arg), - ((1, S, S1, S2), 2, bool_arg), - ((1, S, S1, S2), 3, bool_arg), - ((1, S, S1, S2), -1, bool_arg), - ((1, S, S1, S2), -2, bool_arg), - ((1, S, S1, S2), -3, bool_arg), - # Test batches > 1 where the reduction dim is not the concat dim - ((S, S2, S1, 128), -1, bool_arg), - ] - - -@register_test_suite(["aten._softmax.default", "aten._log_softmax.default"]) -def get_softmax_inputs(): - test_suite = VkTestSuite(get_reduce_inputs(is_softmax=True)) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite( - ["aten.amax.default", "aten.amin.default", "aten.sum.dim_IntList", "aten.mean.dim"] -) -def get_reduce_op_inputs(): - test_suite = VkTestSuite(get_reduce_inputs()) - test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - return test_suite - - -@register_test_suite(["aten.var.dim"]) -def get_var_inputs(): - test_cases = [] - shapes_and_dims = [ - ((L), 0), - ((L), -1), - ((M, L), 0), - ((M, L), 1), - ((L, M), -1), - ((M, L), -2), - ((S, S1, S2), 0), - ((S, S1, S2), 1), - ((S, S1, S2), 2), - ((S, S1, S2), -1), - ((S, S1, S2), -2), - ((S, S1, S2), -3), - ((1, S, S1, S2), 1), - ((1, S, S1, 
S2), 2), - ((1, S, S1, S2), 3), - ((1, S, S1, S2), -1), - ((1, S, S1, S2), -2), - ((1, S, S1, S2), -3), - # Test batches > 1 where the reduction dim is not the concat dim - ((S, L, S1, L), -1), - ((S, S2, S1, S), -2), - ((S, S2, M, M), 2), - ((S, M, S1, L), 3), - ] - - for i, (shape, dim) in enumerate(shapes_and_dims): - unbiased = (i % 2) == 0 - test_cases.append((shape, dim, unbiased, True)) - - # Texture-based tests - texture_test_suite = VkTestSuite(test_cases) - texture_test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - texture_test_suite.storage_types = ["utils::kTexture3D"] - texture_test_suite.atol = "1e-4" - texture_test_suite.rtol = "1e-4" - texture_test_suite.test_name_suffix = "texture" - - # Buffer-based tests - buffer_test_suite = VkTestSuite(test_cases) - buffer_test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - buffer_test_suite.storage_types = ["utils::kBuffer"] - buffer_test_suite.atol = "1e-4" - buffer_test_suite.rtol = "1e-4" - buffer_test_suite.test_name_suffix = "buffer" - - return [texture_test_suite, buffer_test_suite] - - -@register_test_suite( - [ - "aten.sqrt.default", - "aten.rsqrt.default", - "aten.exp.default", - "aten.hardshrink.default", - "aten.sin.default", - "aten.neg.default", - "aten.cos.default", - "aten.hardswish.default", - "aten.hardsigmoid.default", - "aten.leaky_relu.default", - "aten.round.default", - "aten.tan.default", - "aten.relu6.default", - ] -) -def get_unary_ops_inputs(): - test_suite = VkTestSuite( - [ - (M1,), - (M1, M2), - (S1, M1, M2), - (S1, S2, S2, M2), - ] - ) - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.atol = "1e-4" - test_suite.rtol = "1e-4" - return test_suite - - -# separate test suite from unary_ops for learning purposes -@register_test_suite("aten.tan.default") -def get_tan_inputs(): - test_suite = VkTestSuite( - [ - (M1,), - (M1, M2), - (S1, M1, M2), - (S1, S2, S2, M2), - ] - ) - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.dtypes = ["at::kFloat", "at::kHalf"] - return test_suite - - -@register_test_suite("aten._native_batch_norm_legit_no_training.default") -def get_native_batch_norm_inputs(): - Test = namedtuple( - "VkSliceTest", ["self", "weight", "bias", "mean", "var", "momentum", "eps"] - ) - - test_cases = [ - Test( - self=(1, 1, 2, 5), - weight=(1,), - bias=(1,), - mean=(1,), - var=(1,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(S2, 1, 2, 5), - weight=(1,), - bias=(1,), - mean=(1,), - var=(1,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(1, S2, 2, 5), - weight=(S2,), - bias=(S2,), - mean=(S2,), - var=(S2,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(9, S1, 2, 5), - weight=(S1,), - bias=(S1,), - mean=(S1,), - var=(S1,), - momentum=0.0, - eps=0.01, - ), - Test( - self=(3, S1, 2, 5), - weight=(S1,), - bias=(S1,), - mean=(S1,), - var=(S1,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(3, S2, 2, 5), - weight=(S2,), - bias=(S2,), - mean=(S2,), - var=(S2,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(3, S2, 2, 5), - weight=(S2,), - bias=(S2,), - mean=(S2,), - var=(S2,), - momentum=0.0, - eps=0.000, - ), - ] - - test_suite = VkTestSuite(test_cases) - test_suite.requires_prepack = True - test_suite.prepacked_args = ["weight", "bias", "mean", "var"] - - return test_suite - - -@register_test_suite("aten.gelu.default") -def get_gelu_inputs(): - test_suite = VkTestSuite( - [ - ((M1), "tanh"), - ((M1, M2), "tanh"), - ((S1, M1, M2), "tanh"), - ((S1, S2, S2, M2), 
"tanh"), - ] - ) - return test_suite - - -@register_test_suite("aten.arange.start_step") -def get_arange_inputs(): - test_suite = VkTestSuite( - [ - (1, 13), - (1.0, 11), - (-13, 3), - (-11.0, 2), - (3, 15, 3), - (3, 23, 2), - (3, 23.0, 4), - (13, 1, -1), - (-3, -13, -2), - (13, -2.0, -4), - ], - ) - - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.constant_pad_nd.default") -def get_constant_pad_nd_inputs(): - test_suite = VkTestSuite( - [ - ([S1, S2], [1, 1], 24.0), - ([M, M1, M2], [2, 2], 23.2), - ([L, M, M1, M2], [3, 5], 12.2), - ([S1, S2], [1, 1, 1, 1], 24.0), - ([M, M1, M2], [2, 2, 2, 2], 23.2), - ([L, M, M1, M2], [3, 5, 3, 5], 12.2), - ([M, M1, M2], [1, 2, 3, 4, 5, 6], 23.2), - ([L, M, M1, M2], [3, 3, 3, 3, 3, 3], 12.2), - ] - ) - return test_suite - - -@register_test_suite("aten.minimum.default") -def get_minimum_inputs(): - test_suite = VkTestSuite( - [ - ((M1, M2), (M2)), - ((M1, M2), (M1, M2)), - ((M1, M2, M), (M2, M)), - ((M1, M1, S1, S2), (M1, M1, S1, S2)), - ((S1, S1, S2, S), (S1, S2, S)), - ((M1, S1, S2), (L, M1, S1, S2)), - ((S1, S2), (L, M1, S1, S2)), - ] - ) - return test_suite - - -@register_test_suite("aten.squeeze_copy.dims") -def get_squeeze_copy_dim_inputs(): - test_suite = VkTestSuite( - [ - ([S, S, S, 1], 3), - ([S, 1, S, S], 1), - ([S, 1, 1, S], [1, 2]), - ([1, S, S, S], 0), - ([S, S, S, S], 3), - ([S, S, S, S], 2), - ([S, S, S, S], 1), - ([M, M1, 1], 2), - ([M, 1, M1], 1), - ([1, M1, M1], 0), - ] - ) - - highdim_test_suite = VkTestSuite( - [ - ([1, 2, 3, 4, 5, 1], 0), - ([1, 2, 3, 4, 5, 1], 5), - ([1, 2, 3, 4, 5, 1], [0, 5]), - ([2, 1, 3, 1, 5, 6], 1), - ([2, 1, 3, 1, 5, 6], 3), - ([2, 1, 3, 1, 5, 6], [1, 3]), - ([1, 1, 3, 4, 5, 6], [0, 1]), - ([2, 3, 4, 1, 1, 6], [3, 4]), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.flip.default") -def get_flip_inputs(): - Test = namedtuple("Flip", ["self", "dim"]) - Test.__new__.__defaults__ = (None, 0) - - test_cases = [ - Test(self=[9], dim=[0]), - Test(self=[9, 9], dim=[0, 1]), - Test(self=[9, 9, 9], dim=[0, 2]), - Test(self=[9, 9, 9], dim=[0, 1, 2]), - Test(self=[9, 9, 9, 9], dim=[0]), - Test(self=[9, 9, 9, 9], dim=[0, 2, 3]), - Test(self=[9, 9, 9, 9], dim=[1, 3]), - Test(self=[9, 9, 9, 9], dim=[0, 1, 2, 3]), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - return test_suite - - -@register_test_suite("aten.expand_copy.default") -def get_expand_inputs(): - test_suite = VkTestSuite( - [ - # Basic expansion cases - ((1,), [5]), - ((1, 1), [3, 4]), - ((1, 3), [2, 3]), - ((3, 1), [3, 4]), - ((1, 1, 1), [2, 3, 4]), - # Expand with same size (no-op) - ((3, 4), [3, 4]), - ((2, 3, 4), [2, 3, 4]), - # Expand with additional dimensions - ((3,), [2, 3]), - ((3, 4), [2, 3, 4]), - ((2, 3), [1, 2, 3]), - # Mixed expansion cases - ((1, 3, 1, 4), [2, 3, 5, 4]), - ((1, 1, 3, 1), [2, 4, 3, 5]), - # Larger tensor cases - ((1, S1), [M, S1]), - ((S2, 1), [S2, M1]), - ((1, 1, S), [S1, S2, S]), - ((1, S1, 1, S2), [M, S1, M1, S2]), - ] - ) - test_suite.storage_types = [ - "utils::kBuffer", - ] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.dtypes = [ - "at::kFloat", - "at::kHalf", - ] - test_suite.data_gen = "make_seq_tensor" - return test_suite - - 
-@register_test_suite("aten.where.self") -def get_where_inputs(): - Test = namedtuple("Where", ["condition", "self", "other"]) - Test.__new__.__defaults__ = (None, None, None) - - test_cases = [ - Test(condition=[11], self=[11], other=[11]), - Test(condition=[10, 9], self=[10, 9], other=[10, 9]), - Test(condition=[10, 5, 3], self=[10, 5, 3], other=[10, 5, 3]), - Test(condition=[2, 10, 5, 3], self=[2, 10, 5, 3], other=[2, 10, 5, 3]), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - test_suite.arg_dtype["condition"] = "at::kBool" - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.atol = "1e-4" - test_suite.rtol = "1e-4" - return test_suite diff --git a/backends/vulkan/test/op_tests/choose_qparams_test.cpp b/backends/vulkan/test/op_tests/choose_qparams_test.cpp deleted file mode 100644 index 3b1094a1e84..00000000000 --- a/backends/vulkan/test/op_tests/choose_qparams_test.cpp +++ /dev/null @@ -1,786 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include -#include - -#include "test_utils.h" - -#include -#include - -namespace torch { -namespace executor { -namespace native { - -// Forward declarations of the functions we're testing -std::tuple choose_qparams_tensor_out( - const Tensor& input, - int64_t quant_min, - int64_t quant_max, - ET_UNUSED double eps, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out); - -std::tuple choose_qparams_per_token_asymmetric_out( - const Tensor& input, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out); - -// Wrapper function for choose_qparams_tensor_out without context -Tensor& choose_qparams_tensor_out_no_context( - const Tensor& input, - int64_t quant_min, - int64_t quant_max, - ET_UNUSED double eps, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out) { - torch::executor::native::choose_qparams_tensor_out( - input, quant_min, quant_max, eps, dtype, scale_out, zero_point_out); - return scale_out; -} - -// Wrapper function for choose_qparams_per_token_asymmetric_out without context -Tensor& choose_qparams_per_token_asymmetric_out_no_context( - const Tensor& input, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out) { - torch::executor::native::choose_qparams_per_token_asymmetric_out( - input, dtype, scale_out, zero_point_out); - return scale_out; -} - -// ATen wrapper for choose_qparams_tensor -std::tuple choose_qparams_tensor_aten( - const at::Tensor& input, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto scale_out = at::empty({}, at::device(at::kCPU).dtype(at::kDouble)); - auto zero_point_out = at::empty({}, at::device(at::kCPU).dtype(at::kLong)); - double eps = 1e-7; - - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - // Use WRAP_TO_ATEN with the wrapper function - WRAP_TO_ATEN(choose_qparams_tensor_out_no_context, 5) - (input, quant_min, quant_max, eps, et_dtype, scale_out, zero_point_out); - - return {scale_out, zero_point_out}; -} - -// ATen wrapper for choose_qparams_per_token_asymmetric -std::tuple choose_qparams_per_token_asymmetric_aten( - const at::Tensor& input, - at::ScalarType dtype) { - // Calculate output sizes for scale and zero_point 
tensors - std::vector output_sizes; - for (int64_t i = 0; i < input.dim() - 1; i++) { - output_sizes.push_back(input.size(i)); - } - output_sizes.push_back(1); - - auto scale_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kDouble)); - auto zero_point_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kLong)); - - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - // Use WRAP_TO_ATEN with the wrapper function - WRAP_TO_ATEN(choose_qparams_per_token_asymmetric_out_no_context, 2) - (input, et_dtype, scale_out, zero_point_out); - - return {scale_out, zero_point_out}; -} - -} // namespace native -} // namespace executor -} // namespace torch - -// -// Reference Implementation -// - -/* - * Reference implementation of choose_qparams_tensor - */ -std::tuple choose_qparams_tensor_reference_impl( - const at::Tensor& input, - int64_t quant_min, - int64_t quant_max) { - // Create output tensors - at::Tensor scale_out = at::empty({}, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_out = - at::empty({}, at::device(at::kCPU).dtype(at::kLong)); - - // Find min and max values in the input tensor - float min_val = input.min().item(); - float max_val = input.max().item(); - - // Extend the [min, max] interval to ensure it contains 0 - min_val = std::min(min_val, 0.f); - max_val = std::max(max_val, 0.f); - - // Calculate scale - double scale = - (static_cast(max_val) - min_val) / (quant_max - quant_min); - - // Handle small scale - constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; - if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { - scale = 0.1; - } - - if (scale < SMALL_SCALE_THRESHOLD) { - float org_scale = scale; - scale = SMALL_SCALE_THRESHOLD; - // Adjust min and max based on new scale - if (min_val == 0.0f) { - max_val = SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else if (max_val == 0.0f) { - min_val = -SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Calculate zero point - double zero_point_from_min = quant_min - min_val / static_cast(scale); - double zero_point_from_max = quant_max - max_val / static_cast(scale); - double zero_point_from_min_error = - std::abs(quant_min) - std::abs(min_val / static_cast(scale)); - double zero_point_from_max_error = - std::abs(quant_max) - std::abs(max_val / static_cast(scale)); - double initial_zero_point = - zero_point_from_min_error < zero_point_from_max_error - ? 
zero_point_from_min - : zero_point_from_max; - - // Nudge zero point to be an integer - int64_t nudged_zero_point = 0; - if (initial_zero_point < quant_min) { - nudged_zero_point = quant_min; - } else if (initial_zero_point > quant_max) { - nudged_zero_point = quant_max; - } else { - nudged_zero_point = std::nearbyint(static_cast(initial_zero_point)); - } - - // Set output values - use item_mutable() for scalar tensors - scale_out.fill_(scale); - zero_point_out.fill_(nudged_zero_point); - - return std::make_tuple(scale_out, zero_point_out); -} - -/* - * Reference implementation of choose_qparams_per_token_asymmetric - */ -std::tuple -choose_qparams_per_token_asymmetric_reference_impl( - const at::Tensor& input, - at::ScalarType dtype) { - // For per-token quantization, we need to compute scale and zero_point for - // each token - int64_t quant_min = -128; - int64_t quant_max = 127; - - // Calculate output sizes - std::vector output_sizes; - for (int64_t i = 0; i < input.dim() - 1; i++) { - output_sizes.push_back(input.size(i)); - } - output_sizes.push_back(1); - - // Create output tensors - at::Tensor scale_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kLong)); - - // Calculate number of tokens - int64_t num_tokens = 1; - for (int64_t i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Reshape input to [num_tokens, last_dim] - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - - // Process each token - for (int64_t token_idx = 0; token_idx < num_tokens; token_idx++) { - at::Tensor token = reshaped_input[token_idx]; - - // Find min and max values for this token - float min_val = token.min().item(); - float max_val = token.max().item(); - - // Extend the [min, max] interval to ensure it contains 0 - min_val = std::min(min_val, 0.f); - max_val = std::max(max_val, 0.f); - - // Calculate scale - double scale = - (static_cast(max_val) - min_val) / (quant_max - quant_min); - - // Handle small scale - constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; - if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { - scale = 0.1; - } - - if (scale < SMALL_SCALE_THRESHOLD) { - float org_scale = scale; - scale = SMALL_SCALE_THRESHOLD; - // Adjust min and max based on new scale - if (min_val == 0.0f) { - max_val = SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else if (max_val == 0.0f) { - min_val = -SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Calculate zero point - double zero_point_from_min = - quant_min - min_val / static_cast(scale); - double zero_point_from_max = - quant_max - max_val / static_cast(scale); - double zero_point_from_min_error = - std::abs(quant_min) - std::abs(min_val / static_cast(scale)); - double zero_point_from_max_error = - std::abs(quant_max) - std::abs(max_val / static_cast(scale)); - double initial_zero_point = - zero_point_from_min_error < zero_point_from_max_error - ? 
zero_point_from_min - : zero_point_from_max; - - // Nudge zero point to be an integer - int64_t nudged_zero_point = 0; - if (initial_zero_point < quant_min) { - nudged_zero_point = quant_min; - } else if (initial_zero_point > quant_max) { - nudged_zero_point = quant_max; - } else { - nudged_zero_point = - std::nearbyint(static_cast(initial_zero_point)); - } - - // Set output values for this token - use index_put_ for safety - scale_out.view({num_tokens, 1}).index_put_({token_idx, 0}, scale); - zero_point_out.view({num_tokens, 1}) - .index_put_({token_idx, 0}, nudged_zero_point); - } - - return std::make_tuple(scale_out, zero_point_out); -} - -// Forward declaration of implementation functions -void test_vulkan_choose_qparams_tensor_impl( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_choose_qparams_per_token_asymmetric_impl( - const std::vector& input_sizes, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_choose_qparams_tensor( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Test with buffer storage - test_vulkan_choose_qparams_tensor_impl( - input_sizes, - quant_min, - quant_max, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_choose_qparams_tensor_impl( - input_sizes, - quant_min, - quant_max, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_choose_qparams_per_token_asymmetric( - const std::vector& input_sizes, - at::ScalarType dtype) { - // Test with buffer storage - test_vulkan_choose_qparams_per_token_asymmetric_impl( - input_sizes, dtype, vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_choose_qparams_per_token_asymmetric_impl( - input_sizes, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_reference_choose_qparams_tensor( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Get reference output - auto [reference_scale, reference_zero_point] = - choose_qparams_tensor_reference_impl(input, quant_min, quant_max); - - // Get implementation output - auto [impl_scale, impl_zero_point] = - torch::executor::native::choose_qparams_tensor_aten( - input, quant_min, quant_max, dtype); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale, impl_scale); - const bool zero_point_correct = - at::equal(reference_zero_point, impl_zero_point); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "implementation scale:" << std::endl; - 
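
The two reference implementations above (per-tensor and per-token) share the same asymmetric quantization-parameter math. The following is a minimal, self-contained restatement of those steps for readers skimming the diff, not the ExecuTorch implementation itself; the variable names and the sample range are illustrative, and the small-scale clamping branch is omitted for brevity.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  float min_val = -0.4f, max_val = 1.2f;   // observed range of the tensor
  const int64_t quant_min = -128, quant_max = 127;

  // The representable range must contain zero.
  min_val = std::min(min_val, 0.0f);
  max_val = std::max(max_val, 0.0f);

  const double scale =
      (static_cast<double>(max_val) - min_val) / (quant_max - quant_min);

  // Pick the zero point whose rounding error is smaller, then clamp and round.
  const double zp_from_min = quant_min - min_val / scale;
  const double zp_from_max = quant_max - max_val / scale;
  const double initial_zp =
      std::abs(quant_min) - std::abs(min_val / scale) <
              std::abs(quant_max) - std::abs(max_val / scale)
          ? zp_from_min
          : zp_from_max;
  const int64_t zero_point = static_cast<int64_t>(std::nearbyint(
      std::min<double>(quant_max, std::max<double>(quant_min, initial_zp))));

  std::printf("scale=%f zero_point=%lld\n", scale, (long long)zero_point);
  return 0;
}
```

The per-token variant in the reference code above applies exactly this computation once per token, where a token is the product of all leading dimensions.
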
std::cout << impl_scale << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "implementation zero_point:" << std::endl; - std::cout << impl_zero_point << std::endl; - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -void test_vulkan_choose_qparams_tensor_impl( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Get reference output - auto [reference_scale, reference_zero_point] = - torch::executor::native::choose_qparams_tensor_aten( - input, quant_min, quant_max, dtype); - - // Build Vulkan choose_qparams_tensor graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - // Output tensors - const ValueRef r_scale = graph.add_tensor({}, vkapi::kFloat, out_storage); - const ValueRef r_zero_point = graph.add_tensor({}, vkapi::kInt, out_storage); - - // Create output tuple - const ValueRef r_out_tuple = graph.add_value_list({r_scale, r_zero_point}); - - // Add eps and dtype parameters to match ATen signature - const ValueRef r_eps = graph.add_scalar(6.1e-5); - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.choose_qparams.tensor") - (graph, - { - r_input.value, - r_quant_min, - r_quant_max, - r_eps, - r_dtype, - r_out_tuple, - }); - - ValueRef staging_scale = graph.set_output_tensor(r_scale); - ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point); - - graph.prepare(); - - graph.prepack(); - - // Run Vulkan choose_qparams_tensor - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - graph.execute(); - - // Create output tensors to hold the results - use types that match GPU output - at::Tensor vk_scale = - at::empty({}, at::device(at::kCPU).dtype(at::kFloat)).contiguous(); - at::Tensor vk_zero_point = - at::empty({}, at::device(at::kCPU).dtype(at::kInt)).contiguous(); - - // Copy results from GPU to CPU - graph.copy_from_staging( - staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel()); - graph.copy_from_staging( - staging_zero_point, - vk_zero_point.mutable_data_ptr(), - vk_zero_point.numel()); - - // Convert reference values to match Vulkan output types for comparison - at::Tensor reference_scale_float = reference_scale.to(at::kFloat); - at::Tensor reference_zero_point_int = reference_zero_point.to(at::kInt); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale_float, vk_scale); - const bool zero_point_correct = - at::equal(reference_zero_point_int, vk_zero_point); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - // make sure that there arent a ton of elements in the input tensor - if (input.numel() < 100) { - std::cout << "input:" << std::endl; - std::cout << input << "\n" << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "vulkan scale:" << std::endl; - std::cout << vk_scale << "\n" << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "vulkan zero_point:" << std::endl; - std::cout << vk_zero_point << std::endl; - } - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -TEST(VulkanChooseQparamsTest, test_reference_choose_qparams_tensor_int8) { - test_reference_choose_qparams_tensor( - {2, 3, 4}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_uint8_4D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {5, 3, 2, 4}, // input sizes - 0, // quant_min - 255, // quant_max - at::kByte); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_2D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {5, 5}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_3D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {12, 8, 2}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_4D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {10, 10, 6, 4}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -void test_reference_choose_qparams_per_token_asymmetric( - const std::vector& input_sizes, - at::ScalarType dtype) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Get reference output - auto [reference_scale, reference_zero_point] = - choose_qparams_per_token_asymmetric_reference_impl(input, dtype); - - // Get implementation output - auto [impl_scale, impl_zero_point] = - torch::executor::native::choose_qparams_per_token_asymmetric_aten( - input, dtype); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale, impl_scale); - const bool zero_point_correct = - at::equal(reference_zero_point, impl_zero_point); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "implementation scale:" << std::endl; - std::cout << impl_scale << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "implementation zero_point:" << std::endl; - std::cout << impl_zero_point << std::endl; - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -void 
test_vulkan_choose_qparams_per_token_asymmetric_impl( - const std::vector& input_sizes, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Calculate output sizes - std::vector output_sizes; - for (int64_t i = 0; i < input.dim() - 1; i++) { - output_sizes.push_back(input.size(i)); - } - output_sizes.push_back(1); - - // Get reference output - auto [reference_scale, reference_zero_point] = - torch::executor::native::choose_qparams_per_token_asymmetric_aten( - input, dtype); - - // Build Vulkan choose_qparams_per_token_asymmetric graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Output tensors - const ValueRef r_scale = - graph.add_tensor(output_sizes, vkapi::kFloat, out_storage); - const ValueRef r_zero_point = - graph.add_tensor(output_sizes, vkapi::kInt, out_storage); - - // Create output tuple - const ValueRef r_out_tuple = graph.add_value_list({r_scale, r_zero_point}); - - // Add dtype parameter to match ATen signature - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN( - "quantized_decomposed.choose_qparams_per_token_asymmetric.default") - (graph, - { - r_input.value, - r_dtype, - r_out_tuple, - }); - - ValueRef staging_scale = graph.set_output_tensor(r_scale); - ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point); - - graph.prepare(); - - graph.prepack(); - - // Run Vulkan choose_qparams_per_token_asymmetric - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - graph.execute(); - - // Create output tensors to hold the results - use types that match GPU output - at::Tensor vk_scale = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kFloat)) - .contiguous(); - at::Tensor vk_zero_point = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kInt)) - .contiguous(); - - // Copy results from GPU to CPU - graph.copy_from_staging( - staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel()); - graph.copy_from_staging( - staging_zero_point, - vk_zero_point.mutable_data_ptr(), - vk_zero_point.numel()); - - // Convert reference values to match Vulkan output types for comparison - at::Tensor reference_scale_float = reference_scale.to(at::kFloat); - at::Tensor reference_zero_point_int = reference_zero_point.to(at::kInt); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale_float, vk_scale); - const bool zero_point_correct = - at::equal(reference_zero_point_int, vk_zero_point); - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - if (input.numel() < 100) { - std::cout << "input:" << std::endl; - std::cout << input << "\n" << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "vulkan scale:" << std::endl; - std::cout << vk_scale << "\n" << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "vulkan zero_point:" << std::endl; - std::cout << vk_zero_point << std::endl; - } - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -TEST( - VulkanChooseQparamsTest, - test_reference_choose_qparams_per_token_asymmetric_int8) { - test_reference_choose_qparams_per_token_asymmetric( - {2, 3, 4}, // input sizes (2*3=6 tokens) - at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_1D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({7}, at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_2D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({2, 2}, at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_3D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({3, 6, 4}, at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_4D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({128, 2, 16, 3}, at::kChar); -} diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp deleted file mode 100644 index 9fca2c632d3..00000000000 --- a/backends/vulkan/test/op_tests/dequantize_test.cpp +++ /dev/null @@ -1,2492 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include -#include - -#include -#include - -#include "test_utils.h" - -#include -#include -#include -#include - -namespace torch { -namespace executor { -namespace native { - -// Forward declarations of the functions we're testing -Tensor& dequantize_per_tensor_out( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out); - -Tensor& dequantize_per_token_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - ScalarType out_dtype, - Tensor& out); - -Tensor& dequantize_per_channel_out( - const Tensor& input, - const Tensor& scale, - const std::optional& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out); - -Tensor& dequantize_per_tensor_tensor_args_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out); - -// Wrapper function for dequantize_per_tensor_out without context -Tensor& dequantize_per_tensor_out_no_context( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_tensor_out( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); -} - -// Wrapper function for dequantize_per_token_out without context -Tensor& dequantize_per_token_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - ScalarType out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_token_out( - input, scale, zero_points, quant_min, quant_max, dtype, out_dtype, out); -} - -// Wrapper function for dequantize_per_channel_out without context -Tensor& dequantize_per_channel_out_no_context( - const Tensor& input, - const Tensor& scale, - const std::optional& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_channel_out( - input, - scale, - zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - out); -} - -// Wrapper function for dequantize_per_tensor_tensor_args_out without context -Tensor& dequantize_per_tensor_tensor_args_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_tensor_tensor_args_out( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); -} - -// ATen wrapper for dequantize_per_tensor -at::Tensor dequantize_per_tensor_aten( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - executorch::aten::optional 
opt_et_out_dtype(et_out_dtype); - - WRAP_TO_ATEN(dequantize_per_tensor_out_no_context, 7) - (input, - scale, - zero_point, - quant_min, - quant_max, - et_dtype, - opt_et_out_dtype, - out); - return out; -} - -// ATen wrapper for dequantize_per_token -at::Tensor dequantize_per_token_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - WRAP_TO_ATEN(dequantize_per_token_out_no_context, 7) - (input, - scale, - zero_points, - quant_min, - quant_max, - et_dtype, - et_out_dtype, - out); - return out; -} - -// ATen wrapper for dequantize_per_channel -at::Tensor dequantize_per_channel_aten( - const at::Tensor& input, - const at::Tensor& scale, - const std::optional& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - executorch::aten::optional opt_et_out_dtype(et_out_dtype); - - WRAP_TO_ATEN(dequantize_per_channel_out_no_context, 8) - (input, - scale, - zero_points, - axis, - quant_min, - quant_max, - et_dtype, - opt_et_out_dtype, - out); - return out; -} - -// ATen wrapper for dequantize_per_tensor with tensor args -at::Tensor dequantize_per_tensor_tensor_args_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - executorch::aten::optional opt_et_out_dtype(et_out_dtype); - - WRAP_TO_ATEN(dequantize_per_tensor_tensor_args_out_no_context, 7) - (input, - scale, - zero_point, - quant_min, - quant_max, - et_dtype, - opt_et_out_dtype, - out); - return out; -} - -} // namespace native -} // namespace executor -} // namespace torch - -void check_dequantize_args( - int64_t quant_min, - int64_t quant_max, - c10::ScalarType in_dtype, - c10::ScalarType out_dtype) { - using namespace vkcompute; - - // Check that quant_min <= quant_max - VK_CHECK_COND( - quant_min <= quant_max, - "quant_min must be <= quant_max, got quant_min: ", - quant_min, - " quant_max: ", - quant_max); - - // Check that input dtype is a quantized type - switch (in_dtype) { - case c10::kByte: - case c10::kChar: - case c10::kShort: - case c10::kInt: - case c10::kLong: - break; - default: - VK_THROW( - "Unsupported input dtype: ", - scalar_type_name(in_dtype), - " (", - static_cast(in_dtype), - ")"); - } - - // Check that output dtype is a floating point type - switch (out_dtype) { - case c10::kHalf: - case c10::kFloat: - case c10::kDouble: - break; - default: - VK_THROW( - "Unsupported output dtype: ", - scalar_type_name(out_dtype), - " (", - static_cast(out_dtype), - ")"); - } -} - -/** - * Helper function to validate dequantize_per_channel arguments - * Similar to the validation in quantize_test.cpp 
- */ -void check_dequantize_per_channel_args( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis) { - // Normalize axis - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input_sizes.size(); - } - - ASSERT_GE(normalized_axis, 0) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be >= 0"; - - ASSERT_LT(normalized_axis, static_cast(input_sizes.size())) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be < input.dim() " << input_sizes.size(); - - int64_t num_channels = input_sizes[normalized_axis]; - - ASSERT_EQ(num_channels, static_cast(scales.size())) - << "Expected scales.size() to match input.size(axis) (" << num_channels - << "), but got " << scales.size(); - - ASSERT_EQ(num_channels, static_cast(zero_points.size())) - << "Expected zero_points.size() to match input.size(axis) (" - << num_channels << "), but got " << zero_points.size(); -} - -// -// Reference Implementation -// - -/* - * Reference implementation of dequantize_per_tensor - */ -at::Tensor dequantize_per_tensor_reference_impl( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, out_dtype); - - // Dequantize the input tensor - at::Tensor flat_input = input.flatten(); - at::Tensor flat_out = out.flatten(); - - // Store casted values to avoid repeated casting - const int32_t zero_point_int32 = static_cast(zero_point); - const float scale_float = static_cast(scale); - - for (int i = 0; i < flat_input.numel(); i++) { - double dequantized_value = 0.0; - - // Extract quantized value and dequantize based on input dtype - // Following the CPU implementation pattern: (input - zero_point) * scale - if (dtype == at::kByte) { - uint8_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kChar) { - int8_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kShort) { - int16_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kInt) { - int32_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kLong) { - int64_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } - - // Store result based on output dtype - if (out_dtype == at::kFloat) { - flat_out[i] = static_cast(dequantized_value); - } else if (out_dtype == at::kDouble) { - flat_out[i] = dequantized_value; - } else if (out_dtype == at::kHalf) { - flat_out[i] = static_cast(dequantized_value); - } - } - - return out.reshape(input.sizes()); -} - -/* - * Reference implementation of dequantize_per_token - */ -at::Tensor dequantize_per_token_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, out_dtype); - - // Calculate number of tokens - int num_tokens = 1; - for (int i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Verify that the number of 
tokens matches the size of scale and zero_point - // tensors - assert(num_tokens == scale.numel()); - assert(num_tokens == zero_point.numel()); - - // Reshape input to [num_tokens, last_dim] - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - at::Tensor reshaped_out = out.reshape({num_tokens, input.size(-1)}); - - // Dequantize each token separately - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - // Get scale and zero_point for this token - float token_scale = scale[token_idx].item(); - int64_t token_zero_point = zero_point[token_idx].item(); - - // Store casted values to avoid repeated casting - const int32_t token_zero_point_int32 = - static_cast(token_zero_point); - - // Dequantize the token - for (int i = 0; i < input.size(-1); i++) { - double dequantized_value = 0.0; - - // Extract quantized value and dequantize based on input dtype - // Following the CPU implementation pattern: (input - zero_point) * scale - if (dtype == at::kByte) { - uint8_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kChar) { - int8_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kShort) { - int16_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kInt) { - int32_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kLong) { - int64_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else { - throw std::runtime_error("Unsupported input dtype"); - } - - // Store result based on output dtype - if (out_dtype == at::kFloat) { - reshaped_out[token_idx][i] = static_cast(dequantized_value); - } else if (out_dtype == at::kDouble) { - reshaped_out[token_idx][i] = dequantized_value; - } else if (out_dtype == at::kHalf) { - reshaped_out[token_idx][i] = static_cast(dequantized_value); - } - } - } - - return out; -} - -/* - * Reference implementation of dequantize_per_channel - */ -at::Tensor dequantize_per_channel_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const std::optional& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Normalize axis to handle negative values - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input.dim(); - } - - // Create output tensor with the same shape as input but with target dtype - at::Tensor output = at::empty_like(input, out_dtype); - - // Get the number of channels along the quantization axis - int64_t num_channels = input.size(normalized_axis); - - // Calculate strides for efficient indexing - std::vector input_strides; - std::vector input_sizes; - for (int64_t i = 0; i < input.dim(); i++) { - input_sizes.push_back(input.size(i)); - input_strides.push_back(input.stride(i)); - } - - // Get data pointers - const double* scale_data = scale.const_data_ptr(); - const int64_t* zero_point_data = nullptr; - if (zero_point.has_value()) { - zero_point_data = zero_point.value().const_data_ptr(); - } - - // Iterate through all elements in the tensor - int64_t total_elements = input.numel(); - - // Helper lambda to convert flat index to multi-dimensional coordinates - auto 
flat_to_coords = [&](int64_t flat_idx, std::vector& coords) { - int64_t remaining = flat_idx; - for (int64_t dim = input.dim() - 1; dim >= 0; dim--) { - coords[dim] = remaining % input_sizes[dim]; - remaining /= input_sizes[dim]; - } - }; - - // Process each element - std::vector coords(input.dim()); - for (int64_t flat_idx = 0; flat_idx < total_elements; flat_idx++) { - // Convert flat index to coordinates - flat_to_coords(flat_idx, coords); - - // Get the channel index for this element - int64_t channel_idx = coords[normalized_axis]; - - // Get the quantization parameters for this channel - double channel_scale = scale_data[channel_idx]; - int64_t channel_zero_point = 0; - if (zero_point_data != nullptr) { - channel_zero_point = zero_point_data[channel_idx]; - } - - // Store casted values to avoid repeated casting - const int32_t channel_zero_point_int32 = - static_cast(channel_zero_point); - const float channel_scale_float = static_cast(channel_scale); - - // Get the input value and dequantize - double dequantized_value = 0.0; - - // Extract quantized value and dequantize based on input dtype - // Following the CPU implementation pattern: (input - zero_point) * scale - if (dtype == at::kByte) { - uint8_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kChar) { - int8_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kShort) { - int16_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kInt) { - int32_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kLong) { - int64_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else { - throw std::runtime_error("Unsupported input dtype"); - } - - // Store the result based on output dtype - if (out_dtype == at::kFloat) { - output.flatten()[flat_idx] = static_cast(dequantized_value); - } else if (out_dtype == at::kDouble) { - output.flatten()[flat_idx] = dequantized_value; - } else if (out_dtype == at::kHalf) { - output.flatten()[flat_idx] = static_cast(dequantized_value); - } - } - - return output; -} - -// Forward declaration of implementation functions -void test_vulkan_dequantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_dequantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_dequantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -// Wrapper 
function to test both buffer and texture storage types -void test_vulkan_dequantize_per_token( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Test with buffer storage - test_vulkan_dequantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Telling the system to expect a float instead of a double - // since the shader can only return 32bit anyways - if (out_dtype == at::kDouble) { - out_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_dequantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_dequantize_per_channel( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Test with buffer storage - test_vulkan_dequantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Telling the system to expect a float instead of a double - // since the shader can only return 32bit anyways - if (out_dtype == at::kDouble) { - out_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_dequantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_dequantize_per_tensor_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Test with buffer storage - test_vulkan_dequantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Telling the system to expect a float instead of a double - // since the shader can only return 32bit anyways - if (out_dtype == at::kDouble) { - out_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_dequantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_reference_dequantize_per_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create a quantized input tensor with values from quant_min to quant_max - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, 
at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - float step = 1.0f; - if (input.numel() > 1) { - step = static_cast(quant_max - quant_min) / (input.numel() - 1); - } - - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - flat_input[i] = static_cast(qvalue); - } - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Get reference output - at::Tensor reference_out = dequantize_per_tensor_reference_impl( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::dequantize_per_tensor_aten( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype); - - // Compare outputs - const bool output_correct = at::allclose(reference_out, impl_out); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "implementation:" << std::endl; - std::cout << impl_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_uint8_to_float) { - test_reference_dequantize_per_tensor( - {2, 3, 4}, // input sizes - 0.1, // scale - 5, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_int8_to_float) { - test_reference_dequantize_per_tensor( - {3, 4, 5}, // input sizes - 0.05, // scale - 0, // zero_point - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_int32_to_float) { - test_reference_dequantize_per_tensor( - {4, 6, 2}, // input sizes - 0.2, // scale - 2, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_uint8_to_half) { - test_reference_dequantize_per_tensor( - {7, 4}, // input sizes - 0.1, // scale - 10, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype (uint8) - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_int32_to_half) { - 
test_reference_dequantize_per_tensor( - {2, 6, 5}, // input sizes - 0.3, // scale - -10, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kHalf); // output dtype -} - -// No Vulkan tests for quantized_decomposed.dequantize_per_tensor.default -// because it is not going to be implemented in Vulkan since we will -// be handling any future calls to this op via the export stage - -void test_reference_dequantize_per_token( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - int num_tokens = 1; - for (int i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - ASSERT_EQ(num_tokens, scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - // Create input tensor with quantized values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - float step = 1.0f; - if (input.size(-1) > 1) { - step = static_cast(quant_max - quant_min) / (input.size(-1) - 1); - } - - for (int i = 0; i < input.size(-1); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } - } - } - - // Reshape back to original dimensions - input = reshaped_input.reshape(input_sizes_int64); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = dequantize_per_token_reference_impl( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::dequantize_per_token_aten( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Compare outputs - const bool output_correct = at::allclose(reference_out, impl_out); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } 
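
As the reference implementations earlier in this file spell out, per-token dequantization is the same affine formula as the per-tensor case, `(q - zero_point) * scale`, applied with one (scale, zero_point) pair per token, where a token is all dimensions except the last collapsed together. The sketch below is a standalone illustration with made-up sizes and values, not the tested kernel.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical input of shape [2, 3, 4] => 2*3 = 6 tokens of width 4.
  const std::vector<int64_t> sizes = {2, 3, 4};
  int64_t num_tokens = 1;
  for (size_t i = 0; i + 1 < sizes.size(); ++i) num_tokens *= sizes[i];
  const int64_t width = sizes.back();

  std::vector<int8_t> q(num_tokens * width, 42);     // quantized values
  std::vector<float> scale(num_tokens, 0.05f);       // one scale per token
  std::vector<int32_t> zero_point(num_tokens, -5);   // one zero point per token
  std::vector<float> out(q.size());

  for (int64_t t = 0; t < num_tokens; ++t) {
    for (int64_t i = 0; i < width; ++i) {
      // Same affine formula as the per-tensor case, with per-token params.
      out[t * width + i] = (q[t * width + i] - zero_point[t]) * scale[t];
    }
  }
  std::printf("out[0]=%f\n", out[0]);  // (42 - (-5)) * 0.05 = 2.35
  return 0;
}
```
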
- std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "implementation:" << std::endl; - std::cout << impl_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_dequantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - int num_tokens = 1; - for (int i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - ASSERT_EQ(num_tokens, scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - // Create input tensor with quantized values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - float step = 1.0f; - if (input.size(-1) > 1) { - step = static_cast(quant_max - quant_min) / (input.size(-1) - 1); - } - - for (int i = 0; i < input.size(-1); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } - } - } - - // Reshape back to original dimensions - input = reshaped_input.reshape(input_sizes_int64); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = torch::executor::native::dequantize_per_token_aten( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Build Vulkan dequantize_per_token graph - using namespace vkcompute; - - GraphConfig config; - 
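
Rather than random data, the dequantize tests above fill their quantized inputs deterministically: each token (or the whole flattened tensor in the per-tensor case) steps evenly from quant_min toward quant_max so every part of the range is exercised. A standalone sketch of that fill pattern, assuming an int8 range and an illustrative element count:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t quant_min = -128, quant_max = 127;
  const int64_t n = 8;  // illustrative element count

  // Step chosen so the first element is quant_min and the last lands at
  // (approximately) quant_max after float truncation.
  const float step =
      n > 1 ? static_cast<float>(quant_max - quant_min) / (n - 1) : 1.0f;

  std::vector<int8_t> values(n);
  for (int64_t i = 0; i < n; ++i) {
    const int64_t qvalue = static_cast<int64_t>(quant_min + i * step);
    values[i] = static_cast<int8_t>(qvalue);
  }
  for (int8_t v : values) std::printf("%d ", v);  // values from -128 up to 127
  std::printf("\n");
  return 0;
}
```
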
config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(dtype), in_storage); - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(out_dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(out_dtype)); - - VK_GET_OP_FN("quantized_decomposed.dequantize_per_token.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs with appropriate tolerance for half precision - bool output_correct; - if (out_dtype == at::kHalf) { - // Use higher tolerance for half precision due to limited precision - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2); - } else { - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - } - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_uint8_to_float) { - std::vector scales = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6}; - std::vector zero_points = {5, 10, 15, 20, 25, 30}; - - test_reference_dequantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int8_to_float) { - std::vector scales = {0.05, 0.1, 0.15, 0.2}; - std::vector zero_points = {0, -5, 5, 10}; - - test_reference_dequantize_per_token( - {2, 2, 5}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int32_to_float) { - std::vector scales = {0.05, 0.1, 0.15, 0.2}; - std::vector zero_points = {0, -5, 5, 10}; - - test_reference_dequantize_per_token( - {2, 2, 10}, // input sizes (2*2=4 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int8_to_half) { - std::vector scales = {0.05, 0.1, 0.15, 0.2}; - std::vector zero_points = {0, -5, 5, 10}; - - test_reference_dequantize_per_token( - {4, 1, 5}, // input sizes (4*1=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype (int8) - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int32_to_half) { - std::vector scales = {0.05, 0.1}; - std::vector zero_points = {0, -5}; - - test_reference_dequantize_per_token( - {2, 2}, // input sizes (2 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_uint8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6}; - std::vector zero_points = {5, 10, 15, 20, 25, 30}; - - test_vulkan_dequantize_per_token( - {2, 3, 6}, // input sizes (2*3=6 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.05, 0.0}; - std::vector zero_points = {10, -5}; - - test_vulkan_dequantize_per_token( - {2, 2}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - 
test_vulkan_dequantize_per_token_int32_to_float) { - std::vector scales = { - 0.0001, 0.0002, 0.0003, 0.0, 0.0011, 0.0102, 0.1003, 0.0}; - std::vector zero_points = {100, -100, 50, -50, 12, -6, 4, -24}; - - test_vulkan_dequantize_per_token( - {2, 2, 2, 12}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -2147483648, // quant_min - 2147483647, // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int8_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.05, 0.2}; - std::vector zero_points = {2, -5}; - - test_vulkan_dequantize_per_token( - {2, 2}, // input sizes (2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int32_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - // Use much smaller scales to avoid overflow to infinity in half precision - // Half precision max value is ~65504, so with int32 values around 2e9, - // we need scales smaller than 65504/2e9 ≈ 3e-5 to avoid overflow - std::vector scales = {1e-5, 2e-5, 1.5e-5}; - std::vector zero_points = {20, -15, 1}; - - test_vulkan_dequantize_per_token( - {3, 6}, // input sizes (3 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int8_to_double) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.05, 0.001}; - std::vector zero_points = {10, -5}; - - test_vulkan_dequantize_per_token( - {2, 2}, // input sizes (2 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kDouble); // output dtype -} - -void test_reference_dequantize_per_channel( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - check_dequantize_per_channel_args(input_sizes, scales, zero_points, axis); - - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create input tensor with quantized values - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - float step = 1.0f; - if (input.numel() > 1) { - step = static_cast(quant_max - quant_min) / (input.numel() - 1); - } - - 
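[Editor's note] The loop that follows fills the flattened input with a linear ramp from quant_min to quant_max, so the reference per-channel test exercises the full quantized range regardless of tensor size. A minimal standalone sketch of the same fill pattern, with illustrative names not taken from the test file:

```cpp
#include <cstdint>
#include <vector>

// Sketch: produce n quantized values evenly spaced between quant_min and quant_max.
std::vector<int64_t> make_ramp(int64_t quant_min, int64_t quant_max, int64_t n) {
  std::vector<int64_t> out(n);
  // Step between consecutive values; a single-element tensor just keeps quant_min.
  const float step =
      n > 1 ? static_cast<float>(quant_max - quant_min) / (n - 1) : 0.0f;
  for (int64_t i = 0; i < n; ++i) {
    out[i] = quant_min + static_cast<int64_t>(i * step);
  }
  return out;
}
```

For example, make_ramp(-128, 127, 6) yields {-128, -77, -26, 25, 76, 127}, matching the "values from quant_min to quant_max in steps" pattern used below.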
auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - flat_input[i] = static_cast(qvalue); - } - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor my_ref = dequantize_per_channel_reference_impl( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype, - out_dtype); - - // Get implementation output - at::Tensor cpu_ref = torch::executor::native::dequantize_per_channel_aten( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype, - out_dtype); - - // Compare outputs - const bool output_correct = at::allclose(my_ref, cpu_ref); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "cpu_ref:" << std::endl; - std::cout << cpu_ref << std::endl; - std::cout << "my_ref:" << std::endl; - std::cout << my_ref << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_dequantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - check_dequantize_per_channel_args(input_sizes, scales, zero_points, axis); - - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create random float tensor - at::Tensor float_x = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt)); - - // Map the dtype to the corresponding quantized type and quantize the float - // tensor - c10::ScalarType qtype; - at::Tensor adjusted_zero_points = zero_point_tensor; - - if (dtype == 
at::kByte) { - qtype = c10::kQUInt8; - // ATEN ONLY: Adjust zero points for unsigned types (must be non-negative) - adjusted_zero_points = at::clamp_min(zero_point_tensor, 0); - } else if (dtype == at::kChar) { - qtype = c10::kQInt8; - } else if (dtype == at::kInt) { - qtype = c10::kQInt32; - } else { - std::cout << "invalid dtype for ATEN: " << dtype << std::endl; - std::cout << " --> Delegating to c10::kQInt32" << std::endl; - qtype = c10::kQInt32; - } - - // Normalize axis for ATen (ATen doesn't handle negative axes in - // quantize_per_channel) - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input_sizes_int64.size(); - } - - // Quantize using ATen - at::Tensor quantized_aten = at::quantize_per_channel( - float_x, scale_tensor, adjusted_zero_points, normalized_axis, qtype); - - // Get ATen dequantized output - at::Tensor aten_out = at::dequantize(quantized_aten).to(out_dtype); - - // Extract the quantized values (int_repr) to use with our implementations - at::Tensor quantized_input = quantized_aten.int_repr().to(dtype); - - // Get reference output using - // torch::executor::native::dequantize_per_channel_aten - at::Tensor reference_out = - torch::executor::native::dequantize_per_channel_aten( - quantized_input, - scale_tensor.to(at::kDouble), - zero_point_tensor.to(at::kLong), - axis, - quant_min, - quant_max, - dtype, - out_dtype); - - // Build Vulkan dequantize_per_channel graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - // Add tensors to graph - IOValueRef r_input = graph.add_input_tensor( - quantized_input.sizes().vec(), - from_at_scalartype(quantized_input.scalar_type()), - in_storage); - - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - - IOValueRef r_zero_point = graph.add_input_tensor( - adjusted_zero_points.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - ValueRef r_out = graph.add_tensor( - quantized_input.sizes().vec(), - from_at_scalartype(out_dtype), - out_storage); - - const ValueRef r_axis = graph.add_scalar(axis); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - const ValueRef r_output_dtype = - graph.add_scalar(static_cast(out_dtype)); - - VK_GET_OP_FN("quantized_decomposed.dequantize_per_channel.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_axis, - r_quant_min, - r_quant_max, - r_dtype, - r_output_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, - quantized_input.const_data_ptr(), - quantized_input.numel()); - - // copy scale tensor to GPU - graph.copy_into_staging( - r_scale.staging, scale_tensor.const_data_ptr(), scale_tensor.numel()); - - // copy zero_point tensor to GPU - graph.copy_into_staging( - r_zero_point.staging, - zero_point_tensor.const_data_ptr(), - zero_point_tensor.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs with appropriate tolerance for half precision - bool output_correct; 
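[Editor's note] For reference, per-channel dequantization applies a separate (scale, zero_point) pair to each slice along `axis`: out = (q - zero_point[c]) * scale[c]. The test takes its reference output from dequantize_per_channel_aten; the sketch below is only an illustration of that formula for a contiguous row-major tensor, with hypothetical names:

```cpp
#include <cstdint>
#include <vector>

// Sketch: dequantize q per channel along a non-negative `axis` of a contiguous
// tensor with the given sizes. outer/inner are element counts before/after axis.
std::vector<float> dequantize_per_channel(
    const std::vector<int8_t>& q,
    const std::vector<int64_t>& sizes,
    int64_t axis,
    const std::vector<double>& scale,
    const std::vector<int64_t>& zero_point) {
  int64_t outer = 1, inner = 1;
  for (int64_t d = 0; d < axis; ++d) outer *= sizes[d];
  for (size_t d = axis + 1; d < sizes.size(); ++d) inner *= sizes[d];
  const int64_t channels = sizes[axis];

  std::vector<float> out(q.size());
  for (int64_t o = 0; o < outer; ++o) {
    for (int64_t c = 0; c < channels; ++c) {
      for (int64_t i = 0; i < inner; ++i) {
        const int64_t idx = (o * channels + c) * inner + i;
        // Each element uses the scale/zero_point of its channel c.
        out[idx] = (q[idx] - zero_point[c]) * static_cast<float>(scale[c]);
      }
    }
  }
  return out;
}
```

The looser rtol/atol of 1e-2 in the half-precision branch that follows reflects fp16's limited precision compared to the 1e-5 tolerance used for float outputs.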
- if (out_dtype == at::kHalf) { - // Use higher tolerance for half precision due to limited precision - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2); - } else { - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - } - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - std::cout << " storage: " << in_storage << std::endl; - std::cout << std::endl; - - std::cout << "\033[91m quantized_input: \033[0m" << std::endl; - std::cout << quantized_input << std::endl; - std::cout << "\033[91m aten: \033[0m" << std::endl; - std::cout << aten_out << std::endl; - std::cout << "\033[91m reference: \033[0m" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "\033[91m vulkan: \033[0m" << std::endl; - std::cout << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_uint8_to_float_3D_axis0) { - std::vector scales = {0.1, 0.2, 0.3}; - std::vector zero_points = {0, 5, -2}; - - test_reference_dequantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 0, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_int8_to_float_3D_axis2) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_dequantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_int8_to_float_3D_axisn1) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_dequantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_int32_to_float_4D_axis0) { - std::vector scales = {0.1, 0.2, 0.00002}; - std::vector zero_points = {0, 5, -4}; - - test_reference_dequantize_per_channel( - {3, 4, 2, 5}, // input sizes - scales, - zero_points, - 0, // axis - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, - at::kFloat); -} - -// END OF REFERENCE TESTS - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis0) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(9, 0.1f); - std::vector zero_points(9, 2); - - // 1D Tensor - test_vulkan_dequantize_per_channel( - {9}, // input 
sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 2D Tensor - test_vulkan_dequantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 3D Tensor - test_vulkan_dequantize_per_channel( - {9, 7, 11}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 17, 5, 5}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 17, 5, 9}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis1) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(14, 0.001f); - std::vector zero_points(14, -5); - - // 2D Tensor - test_vulkan_dequantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 3D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 5}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {9, 7, 14, 5}, // input sizes - scales, - zero_points, - -2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis2) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(11, 0.5f); - std::vector zero_points(11, 12); - - // 3D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {9, 11, 14, 5}, // input sizes - scales, - zero_points, - -3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis3) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(7, 0.5f); - std::vector zero_points(7, 12); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 7}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {7, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - 
VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_uint8_to_float_comprehensive) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.0001, 0.5, 0.02}; - std::vector zero_points = {0, 5, -5, 1, 12}; - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_8bit_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kHalf); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kHalf); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kHalf); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kHalf); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kHalf); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_8bit_to_double) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kDouble); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kDouble); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - 
at::kByte, - at::kDouble); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kDouble); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kDouble); -} - -void test_vulkan_dequantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create a quantized input tensor with values from quant_min to quant_max - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - float step = 1.0f; - if (input.numel() > 1) { - step = static_cast(quant_max - quant_min) / (input.numel() - 1); - } - - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - flat_input[i] = static_cast(qvalue); - } - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Create scale and zero_point as tensors (single element tensors) - at::Tensor scale_tensor = - at::tensor({scale}, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor({zero_point}, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output using tensor variant - at::Tensor reference_out = - torch::executor::native::dequantize_per_tensor_tensor_args_aten( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Build Vulkan dequantize_per_tensor.tensor graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(dtype), in_storage); - - // Add scale and zero_point as tensor inputs (buffer storage, width packed) - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - 
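[Editor's note] The `.tensor` overload being tested here differs from the scalar per-tensor variant only in how the quantization parameters are supplied: `scale` and `zero_point` arrive as one-element tensors, while the math remains the uniform affine mapping out = (q - zero_point) * scale applied to every element. A minimal sketch of that mapping, with illustrative names not taken from the test file:

```cpp
#include <cstdint>
#include <vector>

// Sketch: uniform (per-tensor) dequantization with a single scale/zero_point.
std::vector<float> dequantize_per_tensor(
    const std::vector<int8_t>& q, double scale, int64_t zero_point) {
  std::vector<float> out(q.size());
  for (size_t i = 0; i < q.size(); ++i) {
    out[i] = static_cast<float>((q[i] - zero_point) * scale);
  }
  return out;
}
```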
const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(out_dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - const ValueRef r_out_dtype = - graph.add_scalar(static_cast(out_dtype)); - - VK_GET_OP_FN("quantized_decomposed.dequantize_per_tensor.tensor") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_out_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Run Vulkan dequantize_per_tensor.tensor - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs with appropriate tolerance for half precision - bool output_correct; - if (out_dtype == at::kHalf) { - // Use higher tolerance for half precision due to limited precision - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2); - } else { - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - } - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_int8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.01, // scale - 1, // zero_point - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_uint8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3, 4, 12}, // input sizes - 0.1, // scale - 5, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_int32_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3}, // input sizes - 0.01, // scale - 12, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_uint8_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {3, 4}, // input sizes - 0.3, // scale - 2, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_int8_to_double) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.03, // scale - -2, // zero_point - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kDouble); // output dtype -} diff --git a/backends/vulkan/test/op_tests/generate_op_benchmarks.py b/backends/vulkan/test/op_tests/generate_op_benchmarks.py deleted file mode 100644 index 7f286123df9..00000000000 --- a/backends/vulkan/test/op_tests/generate_op_benchmarks.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import os - -from typing import Dict - -from executorch.backends.vulkan.test.op_tests.cases import test_suites - -from executorch.backends.vulkan.test.op_tests.utils.gen_benchmark_vk import ( - VkBenchmarkFileGen, -) -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite -from torchgen import local - -from torchgen.gen import parse_native_yaml, ParsedYaml -from torchgen.model import DispatchKey, NativeFunction - - -def registry_name(f: NativeFunction) -> str: - name = str(f.namespace) + "." + str(f.func.name) - if len(f.func.name.overload_name) == 0: - name += ".default" - return name - - -def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: - f_map: Dict[str, NativeFunction] = {} - for f in parsed_yaml.native_functions: - f_map[registry_name(f)] = f - return f_map - - -def process_test_suites( - cpp_generator: VkBenchmarkFileGen, - f_map: Dict[str, NativeFunction], - test_suites: Dict[str, TestSuite], -) -> None: - for registry_name, op_test_suites in test_suites.items(): - f = f_map[registry_name] - if isinstance(op_test_suites, list): - for suite in op_test_suites: - cpp_generator.add_suite(registry_name, f, suite) - else: - cpp_generator.add_suite(registry_name, f, op_test_suites) - - -@local.parametrize( - use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False -) -def generate_cpp( - native_functions_yaml_path: str, tags_path: str, output_dir: str -) -> None: - output_file = os.path.join(output_dir, "op_benchmarks.cpp") - cpp_generator = VkBenchmarkFileGen(output_file) - - parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) - f_map = construct_f_map(parsed_yaml) - - ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] - - process_test_suites(cpp_generator, f_map, test_suites) - - with open(output_file, "w") as file: - file.write(cpp_generator.generate_cpp()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--aten-yaml-path", - help="path to native_functions.yaml file.", - ) - parser.add_argument( - "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in gen_correctness_vk system.", - ) - - parser.add_argument("-o", "--output", help="Output directory", required=True) - args = parser.parse_args() - generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py deleted file mode 100644 index 8814070abd3..00000000000 --- a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import os - -from typing import Dict - -from executorch.backends.vulkan.test.op_tests.cases import test_suites -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) - -from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_vk import ( - VkCorrectnessTestFileGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite -from torchgen import local - -from torchgen.gen import parse_native_yaml, ParsedYaml -from torchgen.model import DispatchKey, NativeFunction - - -def registry_name(f: NativeFunction) -> str: - name = str(f.namespace) + "." + str(f.func.name) - if len(f.func.name.overload_name) == 0: - name += ".default" - return name - - -def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: - f_map: Dict[str, NativeFunction] = {} - for f in parsed_yaml.native_functions: - f_map[registry_name(f)] = f - return f_map - - -def process_test_suites( - cpp_generator: VkCorrectnessTestFileGen, - f_map: Dict[str, NativeFunction], - test_suites: Dict[str, TestSuite], -) -> None: - for registry_name, op_test_suites in test_suites.items(): - f = f_map[registry_name] - if isinstance(op_test_suites, list): - for suite in op_test_suites: - cpp_generator.add_suite(registry_name, f, suite) - else: - cpp_generator.add_suite(registry_name, f, op_test_suites) - - -@local.parametrize( - use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False -) -def generate_cpp( - native_functions_yaml_path: str, tags_path: str, output_dir: str -) -> None: - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - output_file = os.path.join(output_dir, "op_tests.cpp") - cpp_generator = VkCorrectnessTestFileGen(output_file) - - parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) - f_map = construct_f_map(parsed_yaml) - - ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] - - process_test_suites(cpp_generator, f_map, test_suites) - - with open(output_file, "w") as file: - file.write(cpp_generator.generate_cpp()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--aten-yaml-path", - help="path to native_functions.yaml file.", - ) - parser.add_argument( - "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in gen_correctness_vk system.", - ) - parser.add_argument("-o", "--output", help="Output directory", required=True) - args = parser.parse_args() - generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/quantize_affine_test.cpp b/backends/vulkan/test/op_tests/quantize_affine_test.cpp deleted file mode 100644 index 1c0a6c2e6b9..00000000000 --- a/backends/vulkan/test/op_tests/quantize_affine_test.cpp +++ /dev/null @@ -1,1376 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include -#include - -#include "test_utils.h" - -#include -#include -#include - -static inline void -_check_dims(c10::string_view name, int64_t expected, int64_t actual) { - VK_CHECK_COND( - expected == actual, - name, - " has rank ", - actual, - " but block_size has length ", - expected); -} - -at::Tensor quantize_affine_reference_impl( - const at::Tensor& input_, - const std::vector& block_size, - const at::Tensor& scale, - const c10::optional& zero_point_opt, - int64_t quant_min, - int64_t quant_max, - at::ScalarType out_dtype, - c10::optional zero_point_domain_opt = std::string("INT")) { - constexpr float kEps = 1e-7f; - - const int64_t ndim = input_.dim(); - _check_dims("input", block_size.size(), ndim); - - VK_CHECK_COND( - input_.scalar_type() == at::kFloat || input_.scalar_type() == at::kHalf || - input_.scalar_type() == at::kBFloat16, - "Unsupported input dtype: ", - input_.dtype()); - - auto zero_point_domain = - zero_point_domain_opt.has_value() ? *zero_point_domain_opt : "INT"; - - bool has_zp = zero_point_opt.has_value(); - VK_CHECK_COND( - has_zp || zero_point_domain == "NONE" || zero_point_domain == "", - "zero_point must be supplied unless zero_point_domain is NONE or null"); - - at::Tensor input = input_.contiguous(); - - std::vector shape_for_reduction; - std::vector reduction_dims; - int64_t cur_dim = 0; - - auto in_sizes = input.sizes(); - for (int64_t i = 0; i < ndim; ++i) { - const int64_t blk = block_size[i]; - const int64_t dim = in_sizes[i]; - - if (blk != dim && blk > 1) { - VK_CHECK_COND( - dim % blk == 0, - "Input size ", - dim, - " is not divisible by block_size ", - blk, - " at dimension ", - i); - shape_for_reduction.push_back(dim / blk); - shape_for_reduction.push_back(blk); - reduction_dims.push_back(cur_dim + 1); - cur_dim += 2; - } else { - shape_for_reduction.push_back(dim); - if (blk != 1) { - reduction_dims.push_back(cur_dim); - } - cur_dim += 1; - } - } - - at::Tensor input_reshaped = input.view(shape_for_reduction); - - std::vector shape_after_reduction = shape_for_reduction; - for (int64_t d : reduction_dims) { - shape_after_reduction[d] = 1; - } - - at::Tensor scale_b = - scale.view(shape_after_reduction).to(input_reshaped.scalar_type()); - - at::Tensor zp_b; - if (has_zp) { - zp_b = (*zero_point_opt).view(shape_after_reduction).toType(at::kFloat); - } - - scale_b = scale_b.clamp_min(kEps); - at::Tensor inv_scale = 1.0f / scale_b; - - at::Tensor q; - if (zero_point_domain == "INT") { - VK_CHECK_COND(has_zp, "INT zero_point_domain requires zero_point tensor"); - q = at::round(input_reshaped * inv_scale) + zp_b; - } else if (zero_point_domain == "NONE" || zero_point_domain.empty()) { - VK_CHECK_COND( - !has_zp, "zero_point must be None when domain is NONE / null"); - q = at::round(input_reshaped * inv_scale); - } else { - VK_CHECK_COND( - has_zp && zero_point_domain == "FLOAT", - "zero_point_domain must be INT, FLOAT, NONE or null"); - const float mid_point = (quant_max + quant_min + 1) * 0.5f; - at::Tensor min_val = zp_b - scale_b * mid_point; - q = at::round((input_reshaped - min_val) / scale_b); - } - - q = at::clamp(q, (double)quant_min, (double)quant_max); - - q = q.view(in_sizes).to(out_dtype); - - return q; -} - -at::Tensor dequantize_affine_reference_impl( - const at::Tensor& input_, - const std::vector& block_size, - const at::Tensor& scale, - const c10::optional& zero_point_opt, - int64_t quant_min, - int64_t quant_max, - at::ScalarType out_dtype, - c10::optional zero_point_domain_opt = 
std::string("INT")) { - const int64_t ndim = input_.dim(); - _check_dims("input", block_size.size(), ndim); - - VK_CHECK_COND( - input_.scalar_type() == at::kByte || input_.scalar_type() == at::kChar || - input_.scalar_type() == at::kShort || - input_.scalar_type() == at::kInt, - "Unsupported input dtype: ", - input_.dtype()); - - VK_CHECK_COND( - out_dtype == at::kFloat || out_dtype == at::kHalf || - out_dtype == at::kBFloat16, - "Unsupported output dtype: ", - out_dtype); - - auto zero_point_domain = - zero_point_domain_opt.has_value() ? *zero_point_domain_opt : "INT"; - - bool has_zp = zero_point_opt.has_value(); - VK_CHECK_COND( - has_zp || zero_point_domain == "NONE" || zero_point_domain == "", - "zero_point must be supplied unless zero_point_domain is NONE or null"); - - at::Tensor input = input_.contiguous(); - - std::vector shape_for_reduction; - std::vector reduction_dims; - int64_t cur_dim = 0; - - auto in_sizes = input.sizes(); - for (int64_t i = 0; i < ndim; ++i) { - const int64_t blk = block_size[i]; - const int64_t dim = in_sizes[i]; - - if (blk != dim && blk > 1) { - VK_CHECK_COND( - dim % blk == 0, - "Input size ", - dim, - " is not divisible by block_size ", - blk, - " at dimension ", - i); - shape_for_reduction.push_back(dim / blk); - shape_for_reduction.push_back(blk); - reduction_dims.push_back(cur_dim + 1); - cur_dim += 2; - } else { - shape_for_reduction.push_back(dim); - if (blk != 1) { - reduction_dims.push_back(cur_dim); - } - cur_dim += 1; - } - } - - at::Tensor input_reshaped = input.view(shape_for_reduction); - - std::vector shape_after_reduction = shape_for_reduction; - for (int64_t d : reduction_dims) { - shape_after_reduction[d] = 1; - } - - at::Tensor scale_b = scale.view(shape_after_reduction).to(out_dtype); - - at::Tensor zp_b; - if (has_zp) { - zp_b = (*zero_point_opt).view(shape_after_reduction).to(out_dtype); - } - - at::Tensor input_fp = input_reshaped.to(out_dtype); - at::Tensor dq; - - if (zero_point_domain == "INT") { - VK_CHECK_COND(has_zp, "INT zero_point_domain requires zero_point tensor"); - dq = (input_fp - zp_b) * scale_b; - } else if (zero_point_domain == "NONE" || zero_point_domain.empty()) { - VK_CHECK_COND( - !has_zp, "zero_point must be None when domain is NONE / null"); - dq = input_fp * scale_b; - } else { - VK_CHECK_COND( - has_zp && zero_point_domain == "FLOAT", - "zero_point_domain must be INT, FLOAT, NONE or null"); - const float mid_point = (quant_max + quant_min + 1) * 0.5f; - at::Tensor min_val = zp_b - scale_b * mid_point; - dq = input_fp * scale_b + min_val; - } - - dq = dq.view(in_sizes); - - return dq; -} - -// Wrapper function to maintain compatibility with existing test code (above is -// a good reference for how the python implementation works) -at::Tensor quantize_affine_reference_impl( - const at::Tensor& input, - const std::vector& block_size, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - return quantize_affine_reference_impl( - input, - block_size, - scale, - c10::optional(zero_point), - quant_min, - quant_max, - dtype, - std::string("INT")); -} - -// Wrapper function for dequantize_affine -at::Tensor dequantize_affine_reference_impl( - const at::Tensor& input, - const std::vector& block_size, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - return dequantize_affine_reference_impl( - input, - block_size, - scale, - c10::optional(zero_point), - quant_min, - 
quant_max, - dtype, - std::string("INT")); -} - -std::tuple choose_qparams_affine_reference_impl( - const at::Tensor& input_, - const std::string& mapping_type, - const std::vector& block_size, - int64_t quant_min, - int64_t quant_max, - double eps) { - const int64_t ndim = input_.dim(); - _check_dims("input", block_size.size(), ndim); - - VK_CHECK_COND( - input_.scalar_type() == at::kFloat || input_.scalar_type() == at::kHalf || - input_.scalar_type() == at::kBFloat16, - "Unsupported input dtype: ", - input_.dtype()); - - at::Tensor input = input_.contiguous(); - - std::vector shape_for_reduction; - std::vector reduction_dims; - int64_t cur_dim = 0; - - auto in_sizes = input.sizes(); - for (int64_t i = 0; i < ndim; ++i) { - const int64_t blk = block_size[i]; - const int64_t dim = in_sizes[i]; - - if (blk != dim && blk > 1) { - VK_CHECK_COND( - dim % blk == 0, - "Input size ", - dim, - " is not divisible by block_size ", - blk, - " at dimension ", - i); - shape_for_reduction.push_back(dim / blk); - shape_for_reduction.push_back(blk); - reduction_dims.push_back(cur_dim + 1); - cur_dim += 2; - } else { - shape_for_reduction.push_back(dim); - if (blk != 1) { - reduction_dims.push_back(cur_dim); - } - cur_dim += 1; - } - } - - at::Tensor input_reshaped = input.view(shape_for_reduction); - - std::vector shape_after_reduction = shape_for_reduction; - for (int64_t d : reduction_dims) { - shape_after_reduction[d] = 1; - } - - at::Tensor min_val = input_reshaped.amin(reduction_dims, /*keepdim=*/true); - at::Tensor max_val = input_reshaped.amax(reduction_dims, /*keepdim=*/true); - - at::Tensor scale, zero_point; - - if (mapping_type == "ASYMMETRIC") { - // Include zero in the range - min_val = at::minimum(min_val, at::zeros_like(min_val)); - max_val = at::maximum(max_val, at::zeros_like(max_val)); - - // Calculate scale - scale = (max_val - min_val) / (quant_max - quant_min); - scale = at::maximum(scale, at::full_like(scale, eps)); - - // Calculate zero_point - zero_point = at::round(quant_min - min_val / scale); - zero_point = at::clamp(zero_point, quant_min, quant_max); - } else if (mapping_type == "SYMMETRIC") { - // Include zero in the range - min_val = at::minimum(min_val, at::zeros_like(min_val)); - max_val = at::maximum(max_val, at::zeros_like(max_val)); - - // Calculate max absolute value - at::Tensor abs_min = at::abs(min_val); - at::Tensor abs_max = at::abs(max_val); - at::Tensor M = at::maximum(abs_min, abs_max); - - // Calculate scale - scale = M / ((quant_max - quant_min) * 0.5); - scale = at::maximum(scale, at::full_like(scale, eps)); - - // Calculate zero_point (mid-point) - zero_point = - at::full_like(scale, (quant_max + quant_min + 1) / 2, at::kInt); - } else if (mapping_type == "SYMMETRIC_NO_CLIPPING_ERR") { - // Include zero in the range - min_val = at::minimum(min_val, at::zeros_like(min_val)); - max_val = at::maximum(max_val, at::zeros_like(max_val)); - - // Calculate scale based on min/max values - at::Tensor s_min = at::abs(min_val) / std::abs(quant_min); - at::Tensor s_max = max_val / quant_max; - scale = at::maximum(s_min, s_max); - scale = at::maximum(scale, at::full_like(scale, eps)); - - // Calculate zero_point (mid-point) - zero_point = - at::full_like(scale, (quant_max + quant_min + 1) / 2, at::kInt); - } else { - VK_CHECK_COND( - false, - "Unsupported mapping_type: ", - mapping_type, - ". 
Expected ASYMMETRIC, SYMMETRIC, or SYMMETRIC_NO_CLIPPING_ERR"); - } - - std::vector output_shape; - for (size_t i = 0; i < shape_after_reduction.size(); ++i) { - if (shape_after_reduction[i] != 1 || - std::find(reduction_dims.begin(), reduction_dims.end(), i) == - reduction_dims.end()) { - output_shape.push_back(shape_after_reduction[i]); - } - } - - // Reshape scale and zero_point to final output shape - scale = scale.view(output_shape); - zero_point = zero_point.view(output_shape); - - return std::make_tuple(scale, zero_point); -} - -void test_vulkan_quantize_affine_impl( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt)); - - // Get reference output - at::Tensor reference_out = quantize_affine_reference_impl( - input, - block_size, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - std::vector block_size_copy(block_size); - const ValueRef r_block_size = - graph.add_scalar_list(std::move(block_size_copy)); - - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_output_dtype = - graph.add_scalar(static_cast(dtype)); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - VK_GET_OP_FN("torchao.quantize_affine.default") - (graph, - { - r_input.value, - r_block_size, - r_scale.value, - r_zero_point.value, - r_output_dtype, - r_quant_min, - r_quant_max, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Copy scale tensor to GPU - graph.copy_into_staging( - r_scale.staging, scale_tensor.const_data_ptr(), scale_tensor.numel()); - - // Copy zero_point tensor to GPU - graph.copy_into_staging( - r_zero_point.staging, - zero_point_tensor.const_data_ptr(), - zero_point_tensor.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare 
outputs - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - std::cout << "\nFailed with parameters:" << std::endl; - std::cout << " input_sizes: ["; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << input_sizes[i] << (i < input_sizes.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " block_size: ["; - for (size_t i = 0; i < block_size.size(); i++) { - std::cout << block_size[i] << (i < block_size.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " scales: ["; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << scales[i] << (i < scales.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " zero_points: ["; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << zero_points[i] << (i < zero_points.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? "buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl << input << std::endl; - std::cout << "reference:" << std::endl << reference_int << std::endl; - std::cout << "vulkan:" << std::endl << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_affine( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_quantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -TEST(VulkanQuantizeAffineTest, test_1d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 1D: 1x1x1x12 Tensor, block_size is 3 - test_vulkan_quantize_affine( - {12}, // input_sizes - {3}, // block_size - {0.1f, 0.2f, 0.15f, 0.25f}, // scales (4 blocks) - {10, -20, 5, 30}, // zero_points (4 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output dtype -} - -TEST(VulkanQuantizeAffineTest, test_2d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 2D: 1x1x8x6 Tensor, block_size is 1x1x2x3 (8/2=4, 6/3=2, so 4*2=8 blocks) - test_vulkan_quantize_affine( - {8, 6}, // input_sizes - {2, 3}, // block_size (1/1=1, 1/1=1, 8/2=4, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f}, // scales (8 blocks) - {-10, 15, 0, 25, -5, 20, 10, -15}, // zero_points (8 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output 
dtype -} - -TEST(VulkanQuantizeAffineTest, test_3d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 3D: 1x6x4x6 Tensor, block_size is 3x2x2 (6/3=2, 4/2=2, 6/2=3, so 2*2*3=12 - // blocks) - test_vulkan_quantize_affine( - {6, 4, 6}, // input_sizes (changed 7->6 so divisible by 3) - {3, - 2, - 2}, // block_size (6 divisible by 3, 4 divisible by 2, 6 divisible by 2) - {0.1f, - 0.2f, - 0.15f, - 0.25f, - 0.3f, - 0.05f, - 0.4f, - 0.35f, - 0.12f, - 0.18f, - 0.22f, - 0.28f}, // scales (12 blocks) - {-15, 10, 5, -25, 20, -10, 15, -5, 8, -12, 18, -8}, // zero_points (12 - // blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output dtype -} - -TEST(VulkanQuantizeAffineTest, test_4d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 4D: 8x6x6x6 Tensor, block_size is 2x3x2x3 (8/2=4, 6/3=2, 6/2=3, 6/3=2, so - // 4*2*3*2=48 blocks) - test_vulkan_quantize_affine( - {8, 6, 6, 6}, // input_sizes - {2, 3, 2, 3}, // block_size (8/2=4, 6/3=2, 6/2=3, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f, 0.12f, 0.18f, - 0.22f, 0.28f, 0.32f, 0.08f, 0.45f, 0.38f, 0.14f, 0.24f, 0.16f, 0.26f, - 0.34f, 0.06f, 0.44f, 0.36f, 0.11f, 0.21f, 0.13f, 0.23f, 0.31f, 0.07f, - 0.41f, 0.37f, 0.19f, 0.29f, 0.17f, 0.27f, 0.33f, 0.09f, 0.43f, 0.39f, - 0.10f, 0.20f, 0.14f, 0.24f, 0.30f, 0.04f, 0.40f, 0.34f}, // scales (48 - // blocks) - {-20, 10, 5, -15, 25, -10, 15, -5, 8, -12, 18, -8, 22, - -18, 12, -22, -25, 15, 0, -20, 30, -5, 20, -10, 5, -25, - 10, -15, 35, -15, 25, -35, -30, 20, -5, -25, 40, 0, 30, - -40, 10, -30, 15, -10, 45, -20, 35, -45}, // zero_points (48 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output dtype -} - -void test_vulkan_dequantize_affine_impl( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kChar, - at::ScalarType out_dtype = at::kFloat, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - // Create input tensor with random integer values within quant_min and - // quant_max - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = at::randint( - quant_min, - quant_max + 1, - input_sizes_int64, - at::device(at::kCPU).dtype(in_dtype)); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt)); - - // Get reference output - at::Tensor reference_out = dequantize_affine_reference_impl( - input, - block_size, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - out_dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Create block_size as IntList instead of Tensor - std::vector block_size_copy(block_size); - const ValueRef r_block_size = - graph.add_scalar_list(std::move(block_size_copy)); - - IOValueRef 
r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - // Create input_dtype scalar - const ValueRef r_input_dtype = - graph.add_scalar(static_cast(in_dtype)); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - const ValueRef r_output_dtype = - graph.add_scalar(static_cast(out_dtype)); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(out_dtype), out_storage); - - // Match the argument order in dequantize_affine_impl in Dequantize.cpp: - // input, block_size, scale, zero_point, input_dtype, quant_min, quant_max, - // output_dtype, output - VK_GET_OP_FN("torchao.dequantize_affine.default") - (graph, - { - r_input.value, - r_block_size, - r_scale.value, - r_zero_point.value, - r_input_dtype, - r_quant_min, - r_quant_max, - r_output_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Copy scale tensor to GPU - graph.copy_into_staging( - r_scale.staging, scale_tensor.const_data_ptr(), scale_tensor.numel()); - - // Copy zero_point tensor to GPU - graph.copy_into_staging( - r_zero_point.staging, - zero_point_tensor.const_data_ptr(), - zero_point_tensor.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - const bool output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - if (!output_correct) { - std::cout << "\nFailed with parameters:" << std::endl; - std::cout << " input_sizes: ["; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << input_sizes[i] << (i < input_sizes.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " block_size: ["; - for (size_t i = 0; i < block_size.size(); i++) { - std::cout << block_size[i] << (i < block_size.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " scales: ["; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << scales[i] << (i < scales.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " zero_points: ["; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << zero_points[i] << (i < zero_points.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl << input << std::endl; - std::cout << "reference:" << std::endl << reference_out << std::endl; - std::cout << "vulkan:" << std::endl << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_dequantize_affine( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kChar, - at::ScalarType out_dtype = at::kFloat) { - // Test with buffer storage - test_vulkan_dequantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_dequantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -TEST(VulkanDequantizeAffineTest, test_1d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 1D: 1x1x1x12 Tensor, block_size is 3 - test_vulkan_dequantize_affine( - {12}, // input_sizes - {3}, // block_size - {0.1f, 0.2f, 0.15f, 0.25f}, // scales (4 blocks) - {10, -20, 5, 30}, // zero_points (4 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST(VulkanDequantizeAffineTest, test_2d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 2D: 1x1x8x6 Tensor, block_size is 1x1x2x3 (8/2=4, 6/3=2, so 4*2=8 blocks) - test_vulkan_dequantize_affine( - {8, 6}, // input_sizes - {2, 3}, // block_size (1/1=1, 1/1=1, 8/2=4, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f}, // scales (8 blocks) - {-10, 15, 0, 25, -5, 20, 10, -15}, // zero_points (8 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST(VulkanDequantizeAffineTest, test_3d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 3D: 1x6x4x6 Tensor, block_size is 3x2x2 (6/3=2, 4/2=2, 6/2=3, so 2*2*3=12 - // blocks) - test_vulkan_dequantize_affine( - {6, 4, 6}, // input_sizes (changed 7->6 so divisible by 3) - {3, - 2, - 2}, // block_size (6 divisible by 3, 4 divisible by 2, 6 divisible by 2) - {0.1f, - 0.2f, - 0.15f, - 0.25f, - 0.3f, - 0.05f, - 0.4f, - 0.35f, - 0.12f, - 0.18f, - 0.22f, - 0.28f}, // scales (12 blocks) - {-15, 10, 5, -25, 20, -10, 15, -5, 8, -12, 18, -8}, // zero_points (12 - // blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST(VulkanDequantizeAffineTest, test_4d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 4D: 8x6x6x6 Tensor, block_size is 2x3x2x3 (8/2=4, 6/3=2, 6/2=3, 6/3=2, so - // 4*2*3*2=48 blocks) - test_vulkan_dequantize_affine( - {8, 6, 6, 6}, // input_sizes - {2, 3, 2, 3}, // block_size (8/2=4, 6/3=2, 6/2=3, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f, 0.12f, 0.18f, - 0.22f, 0.28f, 0.32f, 0.08f, 0.45f, 0.38f, 0.14f, 
0.24f, 0.16f, 0.26f, - 0.34f, 0.06f, 0.44f, 0.36f, 0.11f, 0.21f, 0.13f, 0.23f, 0.31f, 0.07f, - 0.41f, 0.37f, 0.19f, 0.29f, 0.17f, 0.27f, 0.33f, 0.09f, 0.43f, 0.39f, - 0.10f, 0.20f, 0.14f, 0.24f, 0.30f, 0.04f, 0.40f, 0.34f}, // scales (48 - // blocks) - {-20, 10, 5, -15, 25, -10, 15, -5, 8, -12, 18, -8, 22, - -18, 12, -22, -25, 15, 0, -20, 30, -5, 20, -10, 5, -25, - 10, -15, 35, -15, 25, -35, -30, 20, -5, -25, 40, 0, 30, - -40, 10, -30, 15, -10, 45, -20, 35, -45}, // zero_points (48 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -void test_vulkan_choose_qparams_affine_impl( - const std::vector& input_sizes, - const std::vector& block_size, - const std::string& mapping_type, - int64_t quant_min, - int64_t quant_max, - double eps, - at::ScalarType in_dtype = at::kFloat, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kBuffer) { - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Get reference output - auto reference_out = choose_qparams_affine_reference_impl( - input, mapping_type, block_size, quant_min, quant_max, eps); - - at::Tensor reference_scale = std::get<0>(reference_out); - at::Tensor reference_zero_point = std::get<1>(reference_out); - - reference_zero_point = reference_zero_point.to(at::kInt); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Create mapping_type as string - std::string mapping_type_copy = mapping_type; - const ValueRef r_mapping_type = - graph.add_string(std::move(mapping_type_copy)); - - // Create block_size as IntList - std::vector block_size_copy(block_size); - const ValueRef r_block_size = - graph.add_scalar_list(std::move(block_size_copy)); - - // Create target_dtype, quant_min, quant_max, eps - const ValueRef r_target_dtype = - graph.add_scalar(static_cast(at::kChar)); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - const ValueRef r_eps = graph.add_scalar(eps); - - // Create scale_dtype and zero_point_dtype - const ValueRef r_scale_dtype = - graph.add_scalar(static_cast(at::kFloat)); - const ValueRef r_zero_point_dtype = - graph.add_scalar(static_cast(at::kInt)); - - // Create output tuple - std::vector out_tuple; - - // Create scale and zero_point output tensors - const ValueRef r_scale_out = graph.add_tensor( - reference_scale.sizes().vec(), vkapi::kFloat, out_storage); - const ValueRef r_zero_point_out = graph.add_tensor( - reference_zero_point.sizes().vec(), vkapi::kInt, out_storage); - - out_tuple.push_back(r_scale_out); - out_tuple.push_back(r_zero_point_out); - - const ValueRef r_out_tuple = graph.add_value_list(std::move(out_tuple)); - - VK_GET_OP_FN("torchao.choose_qparams_affine.default") - (graph, - { - r_input.value, - r_mapping_type, - r_block_size, - r_target_dtype, - r_quant_min, - r_quant_max, - r_eps, - r_scale_dtype, - r_zero_point_dtype, - r_out_tuple, - }); - - ValueRef staging_scale = graph.set_output_tensor(r_scale_out); - ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point_out); - - 
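- // The rest of this helper follows the same pattern as the other Vulkan op
- // tests: prepare/prepack the graph, stage the CPU input, execute, then read
- // the computed scale and zero_point back from staging and compare them
- // against the reference implementation.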
graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_scale = at::empty_like(reference_scale).contiguous(); - at::Tensor vk_zero_point = at::empty_like(reference_zero_point).contiguous(); - - graph.copy_from_staging( - staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel()); - graph.copy_from_staging( - staging_zero_point, - vk_zero_point.mutable_data_ptr(), - vk_zero_point.numel()); - - // Compare outputs - const bool scale_correct = - at::allclose(reference_scale, vk_scale, /*rtol=*/1e-3, /*atol=*/1e-3); - - // For zero point, we need to compare as integers since zero point should be - // an integer First convert both tensors to int if they aren't already - at::Tensor ref_zp_int = reference_zero_point.to(at::kInt); - at::Tensor vk_zp_int = vk_zero_point.to(at::kInt); - const bool zero_point_correct = at::equal(ref_zp_int, vk_zp_int); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\nFailed with parameters:" << std::endl; - std::cout << " input_sizes: ["; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << input_sizes[i] << (i < input_sizes.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " block_size: ["; - for (size_t i = 0; i < block_size.size(); i++) { - std::cout << block_size[i] << (i < block_size.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " mapping_type: " << mapping_type << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " eps: " << eps << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - if (!scale_correct || !zero_point_correct) { - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - - std::cout << "reference_scale:" << std::endl - << reference_scale << std::endl; - std::cout << "vulkan_scale:" << std::endl << vk_scale << std::endl; - - std::cout << "reference_zero_point:" << std::endl - << reference_zero_point << std::endl; - std::cout << "vulkan_zero_point:" << std::endl - << vk_zero_point << std::endl; - } - } - - ASSERT_TRUE(scale_correct); - ASSERT_TRUE(zero_point_correct); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_choose_qparams_affine( - const std::vector& input_sizes, - const std::vector& block_size, - const std::string& mapping_type, - int64_t quant_min, - int64_t quant_max, - double eps, - at::ScalarType in_dtype = at::kFloat) { - // Test with buffer storage for both input and output - test_vulkan_choose_qparams_affine_impl( - input_sizes, - block_size, - mapping_type, - quant_min, - quant_max, - eps, - in_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage for input and buffer storage for output - // (shader always uses buffer storage for outputs) - test_vulkan_choose_qparams_affine_impl( - input_sizes, - block_size, - mapping_type, - quant_min, - quant_max, - eps, - in_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kBuffer); -} - -TEST(VulkanChooseQParamsAffineTest, test_1d_asymmetric) { - // 1D: 12 Tensor, block_size is 3 - test_vulkan_choose_qparams_affine( - {12}, // input_sizes - {3}, // block_size - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_2d_symmetric) { - // 2D: 8x6 Tensor, block_size is 2x3 - test_vulkan_choose_qparams_affine( - {8, 6}, // input_sizes - {2, 3}, // block_size - "SYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_3d_symmetric_no_clipping) { - // 3D: 6x4x6 Tensor, block_size is 3x2x2 - test_vulkan_choose_qparams_affine( - {6, 4, 6}, // input_sizes - {3, 2, 2}, // block_size - "SYMMETRIC_NO_CLIPPING_ERR", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_4d_asymmetric) { - // 4D: 4x6x6x6 Tensor, block_size is 2x3x2x3 - test_vulkan_choose_qparams_affine( - {4, 6, 6, 6}, // input_sizes (reduced from 8 to 4 to make test faster) - {2, 3, 2, 3}, // block_size - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_tensor) { - // Per-tensor: block_size equals tensor size - test_vulkan_choose_qparams_affine( - {4, 6, 8}, // input_sizes - {4, 6, 8}, // block_size equals tensor size - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_token) { - // Per-token: block_size is all 1s except last dimension - test_vulkan_choose_qparams_affine( - {4, 6, 8}, // input_sizes - {1, 1, 8}, // block_size is all 1s except last dimension - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 
1e-5, // eps - at::kFloat); // input dtype -} - -// Additional tests for choose_qparams_affine - -TEST(VulkanChooseQParamsAffineTest, test_uint8_range) { - // Test with uint8 range (0-255) - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "ASYMMETRIC", // mapping_type - 0, // quant_min (uint8 min) - 255, // quant_max (uint8 max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_int16_range) { - // Test with int16 range (-32768 to 32767) - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "SYMMETRIC", // mapping_type - -32768, // quant_min (int16 min) - 32767, // quant_max (int16 max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_larger_eps) { - // Test with larger epsilon value - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "ASYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-2, // larger eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_channel_first_dim) { - // Per-channel quantization on first dimension - test_vulkan_choose_qparams_affine( - {8, 6, 4}, // input_sizes - {1, 6, 4}, // block_size (per-channel on dim 0) - "SYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_channel_middle_dim) { - // Per-channel quantization on middle dimension - test_vulkan_choose_qparams_affine( - {4, 8, 6}, // input_sizes - {4, 1, 6}, // block_size (per-channel on dim 1) - "SYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_mixed_block_sizes) { - // Mixed block sizes (some dimensions fully quantized, some partially) - test_vulkan_choose_qparams_affine( - {8, 6, 10}, // input_sizes - {4, 6, 2}, // block_size (mixed: partial, full, partial) - "ASYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_small_tensor) { - // Test with a small tensor - test_vulkan_choose_qparams_affine( - {2, 3}, // small input_sizes - {2, 3}, // block_size (full tensor) - "ASYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_asymmetric_narrow_range) { - // Test with a narrow quantization range - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "ASYMMETRIC", // mapping_type - -10, // quant_min (narrow range) - 10, // quant_max (narrow range) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_symmetric_narrow_range) { - // Test with a narrow quantization range with symmetric mapping - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "SYMMETRIC", // mapping_type - -10, // quant_min (narrow range) - 10, // quant_max (narrow range) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_symmetric_no_clipping_narrow_range) { - // Test with a narrow quantization range with symmetric no clipping mapping - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "SYMMETRIC_NO_CLIPPING_ERR", // mapping_type - -10, // quant_min (narrow range) - 10, // quant_max (narrow range) - 
1e-5, // eps - at::kFloat); // input dtype -} \ No newline at end of file diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp deleted file mode 100644 index 86eebcf9b14..00000000000 --- a/backends/vulkan/test/op_tests/quantize_test.cpp +++ /dev/null @@ -1,2188 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include -#include - -#include "test_utils.h" - -#include -#include -#include - -float eps = 1e-7; - -namespace torch { -namespace executor { -namespace native { - -// Forward declarations of the functions we're testing -Tensor& quantize_per_tensor_out( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -Tensor& quantize_per_token_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -Tensor& quantize_per_channel_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -Tensor& quantize_per_tensor_tensor_args_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -// Wrapper function for quantize_per_tensor_out without context -Tensor& quantize_per_tensor_out_no_context( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_tensor_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); -} - -// Wrapper function for quantize_per_token_out without context -Tensor& quantize_per_token_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_token_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); -} - -// Wrapper function for quantize_per_channel_out without context -Tensor& quantize_per_channel_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_channel_out( - input, scale, zero_point, axis, quant_min, quant_max, dtype, out); -} - -// Wrapper function for quantize_per_tensor_tensor_args_out without context -Tensor& quantize_per_tensor_tensor_args_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_tensor_tensor_args_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); -} - -// ATen wrapper for quantize_per_tensor -at::Tensor quantize_per_tensor_aten( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - 
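- // WRAP_TO_ATEN adapts the ExecuTorch out-variant kernel so it can be called
- // with ATen tensors; the numeric argument is the index of the `out` tensor
- // in the wrapped function's argument list (6 here).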
WRAP_TO_ATEN(quantize_per_tensor_out_no_context, 6) - (input, scale, zero_point, quant_min, quant_max, et_dtype, out); - return out; -} - -// ATen wrapper for quantize_per_token -at::Tensor quantize_per_token_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - WRAP_TO_ATEN(quantize_per_token_out_no_context, 6) - (input, scale, zero_point, quant_min, quant_max, et_dtype, out); - return out; -} - -// ATen wrapper for quantize_per_channel -at::Tensor quantize_per_channel_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - WRAP_TO_ATEN(quantize_per_channel_out_no_context, 7) - (input, scale, zero_point, axis, quant_min, quant_max, et_dtype, out); - return out; -} - -// ATen wrapper for quantize_per_tensor with tensor args -at::Tensor quantize_per_tensor_tensor_args_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - WRAP_TO_ATEN(quantize_per_tensor_tensor_args_out_no_context, 6) - (input, scale, zero_point, quant_min, quant_max, et_dtype, out); - return out; -} - -} // namespace native -} // namespace executor -} // namespace torch - -void check_quantize_args( - int64_t quant_min, - int64_t quant_max, - c10::ScalarType out_dtype) { - using namespace vkcompute; - int32_t quant_min_lower_bound = 0, quant_max_upper_bound = 0; - switch (out_dtype) { - case c10::kByte: - quant_min_lower_bound = - static_cast(std::numeric_limits::min()); - quant_max_upper_bound = - static_cast(std::numeric_limits::max()); - break; - case c10::kChar: - quant_min_lower_bound = - static_cast(std::numeric_limits::min()); - quant_max_upper_bound = - static_cast(std::numeric_limits::max()); - break; - case c10::kBits16: - case c10::kUInt16: - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); - break; - case c10::kShort: - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); - break; - case c10::kInt: - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); - break; - default: - VK_CHECK_COND(false, "Unsupported dtype: ", scalar_type_name(out_dtype)); - } - VK_CHECK_COND( - quant_min >= quant_min_lower_bound, - "quant_min out of bound for dtype, expected quant_min_lower_bound: ", - quant_min_lower_bound, - " actual quant_min: ", - quant_min); - - VK_CHECK_COND( - quant_max <= quant_max_upper_bound, - "quant_max out of bound for dtype, expected quant_max_upper_bound: ", - quant_max_upper_bound, - " actual quant_max: ", - quant_max); -} - -/** - * Helper function to validate quantize_per_channel arguments - * Similar to the validation in op_quantize.cpp - */ -void check_quantize_per_channel_args( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis) { - // Normalize axis - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += 
input_sizes.size(); - } - - ASSERT_GE(normalized_axis, 0) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be >= 0"; - - ASSERT_LT(normalized_axis, static_cast(input_sizes.size())) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be < input.dim() " << input_sizes.size(); - - int64_t num_channels = input_sizes[normalized_axis]; - - ASSERT_EQ(num_channels, static_cast(scales.size())) - << "Expected scales.size() to match input.size(axis) (" << num_channels - << "), but got " << scales.size(); - - ASSERT_EQ(num_channels, static_cast(zero_points.size())) - << "Expected zero_points.size() to match input.size(axis) (" - << num_channels << "), but got " << zero_points.size(); -} - -// -// Reference Implementation -// - -/* - * Reference implementation of quantize_per_tensor - */ -at::Tensor quantize_per_tensor_reference_impl( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, dtype); - - // Quantize the input tensor - float inv_scale = 1.0 / scale; - - // Iterate through the tensor and quantize each element - at::Tensor float_input = input.to(at::kFloat); - at::Tensor float_values = float_input.flatten(); - - auto out_flat = out.flatten(); - - for (int i = 0; i < float_values.numel(); i++) { - float value = float_values[i].item(); - int64_t qvalue = zero_point + std::nearbyint(inv_scale * value); - - qvalue = std::max(qvalue, quant_min); - qvalue = std::min(qvalue, quant_max); - - if (dtype == at::kByte) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - out_flat[i] = static_cast(qvalue); - } - } - - return out.reshape(input.sizes()); -} - -/* - * Reference implementation of quantize_per_token - */ -at::Tensor quantize_per_token_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, dtype); - - // Calculate number of tokens - int num_tokens = 1; - for (int i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Verify that the number of tokens matches the size of scale and zero_point - // tensors - assert(num_tokens == scale.numel()); - assert(num_tokens == zero_point.numel()); - - // Reshape input to [num_tokens, last_dim] - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - at::Tensor reshaped_out = out.reshape({num_tokens, input.size(-1)}); - - // Quantize each token separately - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - // Use float for scale since Vulkan doesn't support double - float token_scale = scale[token_idx].item(); - // Use int for zero_point since Vulkan doesn't support int64_t - int token_zero_point = zero_point[token_idx].item(); - - float inv_scale = 1.0 / token_scale; - - // Quantize the token - for (int i = 0; i < input.size(-1); i++) { - float value = reshaped_input[token_idx][i].item(); - int qvalue = token_zero_point + std::nearbyint(inv_scale * value); - - qvalue = std::max(qvalue, quant_min); - qvalue = std::min(qvalue, 
quant_max); - - if (dtype == at::kByte) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } - } - } - - return out; -} - -/* - * Reference implementation of quantize_per_channel - */ -at::Tensor quantize_per_channel_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Normalize axis to handle negative values - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input.dim(); - } - - // Create output tensor with the same shape as input but with target dtype - at::Tensor output = at::empty_like(input, dtype); - - // Get the number of channels along the quantization axis - int64_t num_channels = input.size(normalized_axis); - - // Calculate strides for efficient indexing - std::vector input_strides; - std::vector input_sizes; - for (int64_t i = 0; i < input.dim(); i++) { - input_sizes.push_back(input.size(i)); - input_strides.push_back(input.stride(i)); - } - - // Get data pointers - const float* input_data = input.const_data_ptr(); - const double* scale_data = scale.const_data_ptr(); - const int64_t* zero_point_data = zero_point.const_data_ptr(); - - // Iterate through all elements in the tensor - int64_t total_elements = input.numel(); - - // Helper lambda to convert flat index to multi-dimensional coordinates - auto flat_to_coords = [&](int64_t flat_idx, std::vector& coords) { - int64_t remaining = flat_idx; - for (int64_t dim = input.dim() - 1; dim >= 0; dim--) { - coords[dim] = remaining % input_sizes[dim]; - remaining /= input_sizes[dim]; - } - }; - - // Process each element - std::vector coords(input.dim()); - for (int64_t flat_idx = 0; flat_idx < total_elements; flat_idx++) { - // Convert flat index to coordinates - flat_to_coords(flat_idx, coords); - - // Get the channel index for this element - int64_t channel_idx = coords[normalized_axis]; - - // Get the quantization parameters for this channel - double channel_scale = scale_data[channel_idx]; - int64_t channel_zero_point = zero_point_data[channel_idx]; - - // Get the input value - float input_value = input_data[flat_idx]; - - // Apply quantization formula: round(input / scale) + zero_point - float inv_scale = 1.0f / static_cast(channel_scale); - int64_t quantized_value = static_cast( - static_cast(channel_zero_point) + - std::nearbyint(static_cast(inv_scale * input_value))); - - // Clamp to quantization bounds - quantized_value = std::max(quantized_value, quant_min); - quantized_value = std::min(quantized_value, quant_max); - - // Store the result based on output dtype - switch (dtype) { - case at::kByte: { - uint8_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = static_cast(quantized_value); - break; - } - case at::kChar: { - int8_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = static_cast(quantized_value); - break; - } - case at::kShort: { - int16_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = static_cast(quantized_value); - break; - } - case at::kInt: { - int32_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = 
static_cast(quantized_value); - break; - } - default: - assert(false && "Unsupported output dtype"); - } - } - - return output; -} - -// Forward declaration of implementation functions -void test_vulkan_quantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_quantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_quantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_per_token( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // If the in_dtype is a double, convert to float for texture implementation - // since they don't support 64bit as inputs - if (in_dtype == at::kDouble) { - in_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_quantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_per_channel( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // If the in_dtype is a double, convert to float for texture implementation - // since they don't support 64bit as inputs - if (in_dtype == at::kDouble) { - in_dtype = at::kFloat; - } - - test_vulkan_quantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_per_tensor_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - 
in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // If the in_dtype is a double, convert to float for texture implementation - // since they don't support 64bit as inputs - if (in_dtype == at::kDouble) { - in_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_quantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_reference_quantize_per_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - check_quantize_args(quant_min, quant_max, dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Fill with a simple pattern: values from 0 to 1 in steps - float step = 1.0f / (input.numel() - 1); - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - flat_input[i] = i * step; - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - scale = scale < eps ? eps : scale; - - // Get reference output - at::Tensor reference_out = quantize_per_tensor_reference_impl( - input, scale, zero_point, quant_min, quant_max, dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::quantize_per_tensor_aten( - input, scale, zero_point, quant_min, quant_max, dtype); - - // Convert to int for consistent display regardless of underlying type - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor impl_int = impl_out.to(at::kInt); - - const bool output_correct = at::equal(reference_int, impl_int); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - impl_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "my_reference:" << std::endl; - std::cout << impl_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_float_to_int8) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.1, // scale - 0, // zero_point - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_float_to_int32) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.04, // scale - 5, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_half_to_uint8) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.2, // scale - 2, // zero_point - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_half_to_int32) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.01, // scale - 1, // 
zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kHalf, - at::kInt); -} - -// No Vulkan tests for quantized_decomposed.quantize_per_tensor.default -// because it is not going to be implemented in Vulkan since we will -// be handling any future calls to this op via the export stage - -void test_reference_quantize_per_token( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - check_quantize_args(quant_min, quant_max, dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Fill with a simple pattern: values from 0 to 1 in steps - float step = 1.0 / (input.numel() - 1); - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - flat_input[i] = i * step; - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Calculate number of tokens - int num_tokens = 1; - for (int i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Verify that the number of tokens matches the size of scales and zero_points - ASSERT_EQ(num_tokens, pre_scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? eps : s; - } - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = quantize_per_token_reference_impl( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::quantize_per_token_aten( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - // Convert to int for consistent display regardless of underlying type - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor impl_int = impl_out.to(at::kInt); - - const bool output_correct = at::equal(reference_int, impl_out); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "my_reference:" << std::endl; - std::cout << impl_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_quantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType 
out_storage = - vkcompute::utils::kTexture3D) { - check_quantize_args(quant_min, quant_max, dtype); - int num_tokens = 1; - for (int i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - ASSERT_EQ(num_tokens, pre_scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? eps : s; - } - - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output to show what we would compare against - at::Tensor reference_out = torch::executor::native::quantize_per_token_aten( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.quantize_per_token.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - vk_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout 
<< " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? "buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_float_to_int8) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_float_to_int32) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_half_to_int32) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kHalf, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_half_to_uint8) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_uint8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = { - -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; - std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; - - test_vulkan_quantize_per_token( - {5, 2, 4}, // input sizes (5*2=10 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); -} - -TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_float_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = { - -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; - std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; - - test_vulkan_quantize_per_token( - {5, 2, 4}, // input sizes (5 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_int32) { - std::vector scales = { - -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; - std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; - - 
test_vulkan_quantize_per_token( - {5, 2, 4}, // input sizes (5*2=10 tokens) - scales, - zero_points, - -2147483648, // quant_min - 2147483647, // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_int32_small_scales) { - std::vector scales = { - 0, - 2.9387358770557188e-39f, - 1.40129846e-45f, - 1.17549435e-38f, - 0.0000000000001}; - std::vector zero_points = {20, -10, 15, 200, 50}; - - test_vulkan_quantize_per_token( - {5, 2}, // input sizes (3 tokens) - scales, - zero_points, - -2147483648, // quant_min - 2147483647, // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_uint8_many_tokens) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(18, 0.1); - std::vector zero_points(18, 5); - - // Alternate scale values - for (size_t i = 0; i < scales.size(); i++) { - scales[i] = (i % 2 == 0) ? 0.3 : -0.5; - } - - test_vulkan_quantize_per_token( - {3, 3, 2, 3}, // input sizes (3*3*2=18 tokens) - scales, - zero_points, - 0, // quant_min - 125, // quant_max - at::kFloat, - at::kByte); -} - -TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_half_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_vulkan_quantize_per_token( - {2, 2}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kHalf, // input dtype - at::kChar); // output dtype -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_double_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_vulkan_quantize_per_token( - {2, 2}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kDouble, // input dtype - at::kChar); // output dtype -} - -void test_reference_quantize_per_channel( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - check_quantize_args(quant_min, quant_max, dtype); - check_quantize_per_channel_args(input_sizes, pre_scales, zero_points, axis); - - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Fill with a simple pattern: values from 0 to 1 in steps - float step = 1.0f / (input.numel() - 1); - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - flat_input[i] = i * step; - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? 
eps : s; - } - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor my_ref = quantize_per_channel_reference_impl( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype); - - // Get implementation output - at::Tensor cpu_ref = torch::executor::native::quantize_per_channel_aten( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype); - - // Get direct ATen implementation output - c10::ScalarType aten_dtype = dtype; - if (dtype == at::kChar) { - aten_dtype = c10::kQInt8; - } else if (dtype == at::kByte) { - aten_dtype = c10::kQUInt8; - } - - // Normalize axis for ATen (it doesn't handle negative values) - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input.dim(); - } - - at::Tensor aten_ref = at::quantize_per_channel( - input, scale_tensor, zero_point_tensor, normalized_axis, aten_dtype); - - // Convert to int for consistent display regardless of underlying type - at::Tensor my_ref_int = my_ref.to(at::kInt); - at::Tensor cpu_ref_int = cpu_ref.to(at::kInt); - // For quantized tensors, we need to use int_repr() to get the underlying - // integer values - at::Tensor aten_ref_int = aten_ref.int_repr().to(at::kInt); - - const bool output_correct = at::equal(my_ref_int, cpu_ref_int); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "aten_ref:" << std::endl; - std::cout << aten_ref_int << std::endl; - std::cout << "cpu_ref:" << std::endl; - std::cout << cpu_ref_int << std::endl; - std::cout << "my_ref:" << std::endl; - std::cout << my_ref_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_quantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - check_quantize_args(quant_min, quant_max, dtype); - check_quantize_per_channel_args(input_sizes, pre_scales, zero_points, axis); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? 
eps : s; - } - - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = torch::executor::native::quantize_per_channel_aten( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_axis = graph.add_scalar(axis); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.quantize_per_channel.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_axis, - r_quant_min, - r_quant_max, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - vk_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - 
std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? "buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_3D_axis0) { - std::vector scales = {0.1, 0.2, 0.3}; - std::vector zero_points = {0, 5, -2}; - - test_reference_quantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_3D_axis2) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_quantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_3D_axisn1) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_quantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_4D_axis0) { - std::vector scales = {0.1, 0.2, 0.00002}; - std::vector zero_points = {0, 5, -4}; - - test_reference_quantize_per_channel( - {3, 4, 2, 5}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -// END OF REFERENCE TESTS - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis0) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(9, 0.1f); - std::vector zero_points(9, 2); - - // 1D Tensor - test_vulkan_quantize_per_channel( - {9}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 2D Tensor - test_vulkan_quantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 3D Tensor - test_vulkan_quantize_per_channel( - {9, 7, 11}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 17, 5, 5}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 17, 5, 9}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis1) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(14, 0.001f); - std::vector zero_points(14, -5); - - // 2D Tensor - 
test_vulkan_quantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 3D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 5}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {9, 7, 14, 5}, // input sizes - scales, - zero_points, - -2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis2) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(11, 0.5f); - std::vector zero_points(11, 12); - - // 3D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {9, 11, 14, 5}, // input sizes - scales, - zero_points, - -3, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis3) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(7, 0.5f); - std::vector zero_points(7, 12); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 7}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {7, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.0001, 0.5, 0.02}; - std::vector zero_points = {0, 5, -5, 1, 12}; - - // 4D Tensor - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 
255, // quant_max - at::kFloat, - at::kByte); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_half_to_8bit) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kHalf, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kHalf, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kHalf, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_double_to_8bit) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kDouble, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kDouble, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kDouble, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kDouble, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kDouble, - at::kByte); -} - -void test_vulkan_quantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - check_quantize_args(quant_min, quant_max, dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - scale = scale < eps ? 
eps : scale; - - // Create scale and zero_point as tensors (single element tensors) - at::Tensor scale_tensor = - at::tensor({scale}, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor({zero_point}, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output using tensor variant - at::Tensor reference_out = - torch::executor::native::quantize_per_tensor_tensor_args_aten( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - // Build Vulkan quantize_per_tensor.tensor graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Add scale and zero_point as tensor inputs (buffer storage, width packed) - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.quantize_per_tensor.tensor") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Run Vulkan quantize_per_tensor.tensor - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - // For quantized types, we need to compare the actual integer values - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - vk_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_float_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.01, // scale - 1, // zero_point - -128, // quant_min - 127, // quant_max - at::kFloat, // input dtype - at::kChar); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_float_to_uint8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3, 4, 12}, // input sizes - 0.1, // scale - 5, // zero_point - 0, // quant_min - 255, // quant_max - at::kFloat, // input dtype - at::kByte); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_float_to_int32) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3}, // input sizes - 0.01, // scale - 12, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kFloat, // input dtype - at::kInt); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_half_to_uint8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {3, 4}, // input sizes - 0.3, // scale - 2, // zero_point - 0, // quant_min - 255, // quant_max - at::kHalf, // input dtype - at::kByte); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_double_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.03, // scale - -2, // zero_point - -128, // quant_min - 127, // quant_max - at::kDouble, // input dtype - at::kChar); // output dtype -} diff --git a/backends/vulkan/test/op_tests/quantized_linear_test.cpp b/backends/vulkan/test/op_tests/quantized_linear_test.cpp deleted file mode 100644 index db95f4a793f..00000000000 --- a/backends/vulkan/test/op_tests/quantized_linear_test.cpp +++ /dev/null @@ -1,900 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include -#include - -#include "test_utils.h" - -#include - -class VulkanLinearQCS4WTest : public ::testing::Test { - public: - void SetUp() override { - if (!vkcompute::api::context() - ->adapter_ptr() - ->supports_int16_shader_types()) { - GTEST_SKIP(); - } - } - - void TearDown() override { - // Clean up any resources if needed - } -}; - -class VulkanLinearQTA8AQGA4WTest : public ::testing::Test { - public: - void SetUp() override { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - } - - void TearDown() override { - // Clean up any resources if needed - } -}; - -// -// Reference Implementations -// - -at::Tensor linear_qga4w_reference_impl( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const int64_t groupsize, - const at::Tensor& scales_and_zeros, - const int64_t inner_k_tiles) { - const std::vector original_x_size(x.sizes().vec()); - const size_t ndim = original_x_size.size(); - const int64_t out_features = weights_4x2.size(0); - const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); - at::Tensor out = at::_weight_int4pack_mm_for_cpu( - x_flattened, weights_4x2, groupsize, scales_and_zeros); - std::vector out_shape( - original_x_size.begin(), original_x_size.end()); - out_shape.at(ndim - 1) = out_features; - return out.reshape(out_shape); -} - -at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) { - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_unpacked = - at::empty(weights_shape, at::device(at::kCPU).dtype(at::kInt)); - - const int64_t N = weights_unpacked.size(0); - const int64_t K = weights_unpacked.size(1); - - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - weights_unpacked[n][k] = int(first_val); - weights_unpacked[n][k + 1] = int(second_val); - } - } - - return weights_unpacked; -} - -at::Tensor dequantize_and_linear_qga4w( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const int64_t groupsize, - const at::Tensor& scales_and_zeros, - const int64_t inner_k_tiles) { - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_dequantized = - at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); - - const int64_t N = weights_dequantized.size(0); - const int64_t K = weights_dequantized.size(1); - - const int k_groups = K / groupsize; - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - const int group_idx = k / groupsize; - // const int scale_idx = k_groups * n + group_idx; - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - const float scale = scales_and_zeros[group_idx][n][0].item().to(); - const float zero = scales_and_zeros[group_idx][n][1].item().to(); - - weights_dequantized[n][k] = (float(first_val) - 8.0) * scale + zero; - weights_dequantized[n][k + 1] = (float(second_val) - 8.0) * scale + zero; - } - } - - return at::linear(x, weights_dequantized); -} - -at::Tensor dequantize_and_linear_qcs4w( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const at::Tensor& scales) { - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_dequantized = - 
at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); - - const int64_t N = weights_dequantized.size(0); - const int64_t K = weights_dequantized.size(1); - - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - // const int scale_idx = k_groups * n + group_idx; - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - const float scale = scales[n].item().to(); - - weights_dequantized[n][k] = (float(first_val) - 8.0) * scale; - weights_dequantized[n][k + 1] = (float(second_val) - 8.0) * scale; - } - } - - return at::linear(x, weights_dequantized); -} - -at::Tensor linear_qcs4w_reference_impl( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const at::Tensor& scales) { - const std::vector original_x_size(x.sizes().vec()); - const size_t ndim = original_x_size.size(); - const int64_t out_features = weights_4x2.size(0); - const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); - - const at::Tensor weights_unpacked = - (unpack_weights_4x2(weights_4x2) - 8).to(at::kChar); - at::Tensor out = - at::_weight_int8pack_mm(x_flattened, weights_unpacked, scales); - - std::vector out_shape( - original_x_size.begin(), original_x_size.end()); - out_shape.at(ndim - 1) = out_features; - return out.reshape(out_shape); -} - -at::Tensor linear_qta8a_qga4w_quantized_matmul( - const at::Tensor& quantized_input, // [B, M, K] int8 quantized input - const at::Tensor& input_scale, // [B*M] per-token input scales - const at::Tensor& input_zero_point, // [B*M] per-token input zero points - const at::Tensor& weights_4x2, // [N, K/2] 4-bit packed weights - const int64_t group_size, // Group size for weight quantization - const at::Tensor& weight_scales, // [K/group_size, N] weight scales - const at::Tensor& weight_zeros) { // [K/group_size, N] weight zeros - - const int64_t B = quantized_input.size(0); - const int64_t M = quantized_input.size(1); - const int64_t K = quantized_input.size(2); - const int64_t N = weights_4x2.size(0); - - // Create output tensor for floating point results - at::Tensor float_output = - at::zeros({B, M, N}, at::device(at::kCPU).dtype(at::kFloat)); - - // Accessors for efficient access - auto input_accessor = quantized_input.accessor(); - auto output_accessor = float_output.accessor(); - auto weights_accessor = weights_4x2.accessor(); - auto weight_scales_accessor = weight_scales.accessor(); - auto weight_zeros_accessor = weight_zeros.accessor(); - auto input_scale_accessor = input_scale.accessor(); - auto input_zero_accessor = input_zero_point.accessor(); - - // Perform quantized matrix multiplication following quantization.md equation - // (5): result_real_value = lhs_scale * rhs_scale * Sum_over_k( - // (lhs_quantized_value[k] - lhs_zero_point) * - // (rhs_quantized_value[k] - rhs_zero_point) - // ) - for (int64_t b = 0; b < B; b++) { - for (int64_t m = 0; m < M; m++) { - const int64_t token_idx = b * M + m; - const float lhs_scale = - input_scale_accessor[token_idx]; // Per-token input scale - const int32_t lhs_zero_point = - input_zero_accessor[token_idx]; // Per-token input zero point - - for (int64_t n = 0; n < N; n++) { - float result_real_value = 0.0f; - - for (int64_t k = 0; k < K; k++) { - // Get per-group weight quantization parameters - const int64_t group_idx = k / group_size; - const float rhs_scale = - weight_scales_accessor[group_idx][n]; // Per-group weight scale - const int32_t rhs_zero_point = - 
weight_zeros_accessor[group_idx] - [n]; // Per-group weight zero point - - // Unpack the 4-bit weight for this position - const uint8_t packed_val = weights_accessor[n][k / 2]; - uint8_t weight_4bit; - if (k % 2 == 0) { - weight_4bit = (packed_val & 0xF0) >> 4; // First weight in pair - } else { - weight_4bit = packed_val & 0x0F; // Second weight in pair - } - - // Get quantized values - const int32_t lhs_quantized_value = - static_cast(input_accessor[b][m][k]); - // Convert 4-bit weight to signed: subtract 8 to get range [-8, 7] - const int32_t rhs_quantized_value = - static_cast(weight_4bit) - 8; - - // Apply proper quantization paradigm from quantization.md equation - // (3): real_value = scale * (quantized_value - zero_point) Following - // equation (5): result = lhs_scale * rhs_scale * - // (lhs_quantized - lhs_zero) * (rhs_quantized - rhs_zero) - const float lhs_diff = - static_cast(lhs_quantized_value - lhs_zero_point); - const float rhs_diff = - static_cast(rhs_quantized_value - rhs_zero_point); - - result_real_value += lhs_scale * rhs_scale * lhs_diff * rhs_diff; - } - - output_accessor[b][m][n] = result_real_value; - } - } - } - - return float_output; -} - -at::Tensor linear_qta8a_qga4w_4bit_dequant_impl( - const at::Tensor& quantized_input, - const at::Tensor& input_scale, - const at::Tensor& input_zero_point, - const at::Tensor& weights_4x2, - const int64_t group_size, - const at::Tensor& weight_scales, - const at::Tensor& weight_zeros) { - // Calculate number of input tokens - int64_t input_num_tokens = 1; - for (size_t i = 0; i < quantized_input.sizes().size() - 1; i++) { - input_num_tokens *= quantized_input.size(i); - } - - // Manually dequantize the char tensor using per-token quantization - at::Tensor x_float = at::zeros_like(quantized_input, at::kFloat); - - // Apply per-token dequantization - auto input_accessor = quantized_input.accessor(); - auto output_accessor = x_float.accessor(); - - for (int64_t token_idx = 0; token_idx < input_num_tokens; token_idx++) { - float scale_val = input_scale[token_idx].item(); - int zero_point_val = input_zero_point[token_idx].item(); - - // Calculate batch and sequence indices for this token - int64_t b = token_idx / quantized_input.size(1); - int64_t m = token_idx % quantized_input.size(1); - - // Apply dequantization for all features in this token - for (int64_t k = 0; k < quantized_input.size(-1); k++) { - float dequant_val = - (input_accessor[b][m][k] - zero_point_val) * scale_val; - output_accessor[b][m][k] = dequant_val; - } - } - - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_dequantized = - at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); - - const int64_t N = weights_dequantized.size(0); - const int64_t K = weights_dequantized.size(1); - - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - const int group_idx = k / group_size; - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - const float scale = weight_scales[group_idx][n].item().to(); - const int zero = weight_zeros[group_idx][n].item().to(); - - weights_dequantized[n][k] = - ((float(first_val) - 8.0) - float(zero)) * scale; - weights_dequantized[n][k + 1] = - ((float(second_val) - 8.0) - float(zero)) * scale; - } - } - - at::Tensor linear_result = at::linear(x_float, weights_dequantized); - - return linear_result; -} - -// -// Test functions -// - -void 
test_reference_linear_qga4w( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32, - const int inner_k_tiles = 8) { - assert(K % group_size == 0); - - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - at::Tensor weights_int = unpack_weights_4x2(weights_4x2); - - const int k_groups = K / group_size; - at::Tensor scales_and_zeros = - at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor out = linear_qga4w_reference_impl( - x, - at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), - group_size, - scales_and_zeros, - inner_k_tiles); - - at::Tensor out_ref = dequantize_and_linear_qga4w( - x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); - - ASSERT_TRUE(at::allclose(out, out_ref)); -} - -void test_reference_linear_qcs4w( - const int B, - const int M, - const int K, - const int N) { - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - at::Tensor weights_int = unpack_weights_4x2(weights_4x2); - - at::Tensor scales = at::rand({N}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor out = linear_qcs4w_reference_impl(x, weights_4x2, scales); - - at::Tensor out_ref = dequantize_and_linear_qcs4w(x, weights_4x2, scales); - - ASSERT_TRUE(at::allclose(out, out_ref)); -} - -void test_vulkan_linear_qga4w_impl( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32, - const int inner_k_tiles = 8, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - assert(K % group_size == 0); - - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - - const int k_groups = K / group_size; - at::Tensor scales_and_zeros = - at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor weights_int = unpack_weights_4x2(weights_4x2); - at::Tensor out_ref = linear_qga4w_reference_impl( - x, - at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), - group_size, - scales_and_zeros, - inner_k_tiles); - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(weights_4x2); - MAKE_TENSORREF_FOR(scales_and_zeros); - - IOValueRef r_x = graph.add_input_tensor( - x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); - - const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), - from_at_scalartype(out_ref.scalar_type()), - out_storage); - - VK_GET_OP_FN("et_vk.linear_weight_int4.default") - (graph, - {r_x.value, - r_weights_4x2, - graph.add_scalar(group_size), - r_scales_and_zeros, - kDummyValueRef, - r_out}); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); - - graph.execute(); - - at::Tensor vk_out = 
at::empty_like(out_ref); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); -} - -void test_vulkan_linear_qga4w( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32, - const int inner_k_tiles = 8) { - test_vulkan_linear_qga4w_impl( - B, - M, - K, - N, - group_size, - inner_k_tiles, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - test_vulkan_linear_qga4w_impl( - B, - M, - K, - N, - group_size, - inner_k_tiles, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_vulkan_linear_qcs4w_impl( - const int B, - const int M, - const int K, - const int N, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - - at::Tensor scales = at::rand({N}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor out_ref = linear_qcs4w_reference_impl(x, weights_4x2, scales); - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(weights_4x2); - MAKE_TENSORREF_FOR(scales); - - IOValueRef r_x = graph.add_input_tensor( - x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); - - const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), - from_at_scalartype(out_ref.scalar_type()), - out_storage); - - VK_GET_OP_FN("et_vk.linear_qcs4w.default") - (graph, {r_x.value, r_weights_4x2, r_scales, r_out}); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(out_ref); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); -} - -void test_vulkan_linear_qcs4w( - const int B, - const int M, - const int K, - const int N) { - test_vulkan_linear_qcs4w_impl( - B, M, K, N, vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); - - test_vulkan_linear_qcs4w_impl( - B, M, K, N, vkcompute::utils::kTexture3D, vkcompute::utils::kTexture3D); -} - -void test_vulkan_linear_qta8a_qga4w_impl( - const int B, - const int M, - const int K, - const int N, - const int group_size = 8, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - assert(K % group_size == 0); - - const int64_t input_num_tokens = B * M; - const int k_groups = K / group_size; - - at::Tensor input_scale = - at::rand({input_num_tokens}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor input_zero_point = at::randint( - -10, 10, {input_num_tokens}, at::device(at::kCPU).dtype(at::kInt)); - - at::Tensor float_x = - at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - - // Create a reference quantized tensor using per-token quantization - // Mimic per-token 
quantization using at::quantize_per_channel by reshaping - // [num_tokens, features] - at::Tensor float_x_reshaped = float_x.view({input_num_tokens, K}); - at::Tensor qx_ref_reshaped = at::quantize_per_channel( - float_x_reshaped, - input_scale.to(at::kDouble), - input_zero_point.to(at::kLong), - 0, // axis 0 for per-token (first dimension after reshape) - c10::ScalarType::QInt8); - - at::Tensor x = - at::int_repr(qx_ref_reshaped).view(float_x.sizes()).to(at::kChar); - - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - at::Tensor weight_scales = - at::rand({k_groups, N}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weight_zeros = at::randint( - -128, 128, {k_groups, N}, at::device(at::kCPU).dtype(at::kInt)); - - at::Tensor out_ref = linear_qta8a_qga4w_4bit_dequant_impl( - x, - input_scale, - input_zero_point, - weights_4x2, - group_size, - weight_scales, - weight_zeros); - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(weights_4x2); - MAKE_TENSORREF_FOR(weight_scales); - MAKE_TENSORREF_FOR(weight_zeros); - - IOValueRef r_x = graph.add_input_tensor( - x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); - - IOValueRef r_input_scale = graph.add_input_tensor( - input_scale.sizes().vec(), - from_at_scalartype(input_scale.scalar_type()), - utils::kBuffer); - - IOValueRef r_input_zero_point = graph.add_input_tensor( - input_zero_point.sizes().vec(), - from_at_scalartype(input_zero_point.scalar_type()), - utils::kBuffer); - - const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), - from_at_scalartype(out_ref.scalar_type()), - out_storage); - - VK_GET_OP_FN("et_vk.linear_qta8a_qga4w.default") - (graph, - {r_x.value, - r_input_scale.value, - r_input_zero_point.value, - r_weights_4x2, - graph.add_scalar(group_size), - r_weight_scales, - r_weight_zeros, - r_out}); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); - graph.copy_into_staging( - r_input_scale.staging, input_scale.const_data_ptr(), input_scale.numel()); - graph.copy_into_staging( - r_input_zero_point.staging, - input_zero_point.const_data_ptr(), - input_zero_point.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(out_ref); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // This is a reference implementation that uses the quantized - // matmul paradigm. It should follow closely with how the vulkan - // implementation works, and demonstrates reasonably close results. 
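To make the accumulation described in the comment above concrete, here is a minimal standalone C++ sketch (illustrative only, not part of the deleted test file; the function name and example values are hypothetical) of the per-element reference computation: one output value is lhs_scale * rhs_scale * Σ_k (lhs_q[k] − lhs_zp) * (rhs_q[k] − rhs_zp), with each packed byte holding two 4-bit weights (high nibble for even k), values offset by 8 into [-8, 7], and per-group weight scale/zero-point indexed by k / group_size, matching the packing used by the reference implementation above.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Reference accumulation for one (token, output-channel) pair, assuming the
// same packing as the deleted test: two 4-bit weights per byte, high nibble
// for even k, mapped from [0, 15] to [-8, 7] by subtracting 8.
float quantized_dot(
    const std::vector<int8_t>& lhs_q, // K int8 activations for one token
    float lhs_scale,
    int32_t lhs_zero_point,
    const std::vector<uint8_t>& rhs_packed, // K / 2 bytes of packed 4-bit weights
    const std::vector<float>& rhs_scales, // K / group_size per-group scales
    const std::vector<int32_t>& rhs_zeros, // K / group_size per-group zero points
    size_t group_size) {
  float acc = 0.0f;
  for (size_t k = 0; k < lhs_q.size(); ++k) {
    const size_t group_idx = k / group_size;
    const uint8_t packed = rhs_packed[k / 2];
    const uint8_t nibble = (k % 2 == 0) ? (packed >> 4) : (packed & 0x0F);
    const int32_t rhs_q = static_cast<int32_t>(nibble) - 8;
    const float lhs_diff = static_cast<float>(lhs_q[k] - lhs_zero_point);
    const float rhs_diff = static_cast<float>(rhs_q - rhs_zeros[group_idx]);
    acc += lhs_scale * rhs_scales[group_idx] * lhs_diff * rhs_diff;
  }
  return acc;
}

int main() {
  // K = 4 with group_size = 2: packed bytes 0x9A and 0x07 decode to the
  // quantized weights {1, 2, -8, -1}.
  const std::vector<int8_t> lhs_q = {12, -3, 7, 0};
  const std::vector<uint8_t> rhs_packed = {0x9A, 0x07};
  const std::vector<float> rhs_scales = {0.5f, 0.25f};
  const std::vector<int32_t> rhs_zeros = {0, 1};
  std::printf(
      "%f\n",
      quantized_dot(lhs_q, 0.1f, 2, rhs_packed, rhs_scales, rhs_zeros, 2));
  return 0;
}
```

The full reference simply applies this accumulation over every (batch, token, output-channel) triple, which is what the loop nest in linear_qta8a_qga4w_quantized_matmul above does.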
- at::Tensor qmm_ref = linear_qta8a_qga4w_quantized_matmul( - x, - input_scale, - input_zero_point, - weights_4x2, - group_size, - weight_scales, - weight_zeros); - - // For quantized int8 operations, allow for 1-unit differences due to rounding - bool is_close = at::allclose(vk_out, out_ref, 5e-3, 5e-3); - if (!is_close) { - std::cout << "qmm_ref: \n" << qmm_ref << std::endl; - std::cout << "out_ref: \n" << out_ref << std::endl; - std::cout << "vk_out: \n" << vk_out << std::endl; - } - - ASSERT_TRUE(is_close); -} - -void test_vulkan_linear_qta8a_qga4w( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32) { - test_vulkan_linear_qta8a_qga4w_impl( - B, - M, - K, - N, - group_size, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - test_vulkan_linear_qta8a_qga4w_impl( - B, - M, - K, - N, - group_size, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Test linear_qga4w operator - -TEST(VulkanLinearQGA4WTest, test_reference_impl) { - test_reference_linear_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); -} - -TEST(VulkanLinearQGA4WTest, test_vulkan_impl_small_m) { - test_vulkan_linear_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); - - test_vulkan_linear_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 256, - /*N = */ 256); -} - -TEST(VulkanLinearQGA4WTest, test_vulkan_impl_gemm) { - test_vulkan_linear_qga4w( - /*B = */ 1, - /*M = */ 256, - /*K = */ 256, - /*N = */ 256); -} - -// Test linear_qcs4w operator - -TEST_F(VulkanLinearQCS4WTest, test_reference_impl) { - test_reference_linear_qcs4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); -} - -TEST_F(VulkanLinearQCS4WTest, test_vulkan_impl_small_m) { - test_vulkan_linear_qcs4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); - - test_vulkan_linear_qcs4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 256, - /*N = */ 256); -} - -TEST_F(VulkanLinearQCS4WTest, test_vulkan_impl_gemm) { - test_vulkan_linear_qcs4w( - /*B = */ 1, - /*M = */ 32, - /*K = */ 32, - /*N = */ 32); -} - -// Test linear_qta8a_qga4w operator - -TEST_F( - VulkanLinearQTA8AQGA4WTest, - test_vulkan_linear_quant_gemm_custom_groupsize) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 2, - /*K = */ 8, - /*N = */ 8, - /*group_size = */ 8); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 2, - /*K = */ 16, - /*N = */ 8, - /*group_size = */ 8); -} - -TEST_F(VulkanLinearQTA8AQGA4WTest, test_vulkan_linear_quant_gemm) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 64, - /*N = */ 32); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 8, - /*K = */ 64, - /*N = */ 16); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 256, - /*K = */ 256, - /*N = */ 256); -} - -TEST_F( - VulkanLinearQTA8AQGA4WTest, - test_vulkan_linear_quant_gemv_custom_groupsize) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 8, - /*N = */ 8, - /*group_size = */ 8); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 16, - /*N = */ 8, - /*group_size = */ 8); -} - -TEST_F(VulkanLinearQTA8AQGA4WTest, test_vulkan_linear_quant_gemv) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 32, - /*N = */ 32); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 64, - /*N = */ 16); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = 
*/ 256, - /*N = */ 256); -} diff --git a/backends/vulkan/test/op_tests/rotary_embedding_test.cpp b/backends/vulkan/test/op_tests/rotary_embedding_test.cpp deleted file mode 100644 index 9f9bdef24aa..00000000000 --- a/backends/vulkan/test/op_tests/rotary_embedding_test.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include "test_utils.h" - -#include - -// -// Reference Implementations -// - -std::pair rotary_embedding_impl( - const at::Tensor& xq, - const at::Tensor& xk, - const at::Tensor& freqs_cos, - const at::Tensor& freqs_sin) { - std::vector xq_even_odd = at::unbind( - xq.reshape({xq.size(0), xq.size(1), xq.size(2), xq.size(3) / 2, 2}), -1); - at::Tensor& xq_r = xq_even_odd[0]; - at::Tensor& xq_i = xq_even_odd[1]; - - std::vector xk_even_odd = at::unbind( - xk.reshape({xk.size(0), xk.size(1), xk.size(2), xk.size(3) / 2, 2}), -1); - at::Tensor& xk_r = xk_even_odd[0]; - at::Tensor& xk_i = xk_even_odd[1]; - - at::Tensor freqs_cos_reshape = - freqs_cos.reshape({1, freqs_cos.size(0), 1, freqs_cos.size(1)}); - at::Tensor freqs_sin_reshape = - freqs_sin.reshape({1, freqs_sin.size(0), 1, freqs_sin.size(1)}); - - at::Tensor xq_out_r = xq_r * freqs_cos_reshape - xq_i * freqs_sin_reshape; - at::Tensor xq_out_i = xq_r * freqs_sin_reshape + xq_i * freqs_cos_reshape; - at::Tensor xk_out_r = xk_r * freqs_cos_reshape - xk_i * freqs_sin_reshape; - at::Tensor xk_out_i = xk_r * freqs_sin_reshape + xk_i * freqs_cos_reshape; - - at::Tensor xq_out = at::flatten(at::stack({xq_out_r, xq_out_i}, -1), 3); - at::Tensor xk_out = at::flatten(at::stack({xk_out_r, xk_out_i}, -1), 3); - - return std::make_pair(xq_out, xk_out); -} - -// -// Test functions -// - -void test_reference( - const int n_heads = 4, - const int n_kv_heads = 2, - const int dim = 32, - const int seq_len = 1) { - const int head_dim = dim / n_heads; - - at::Tensor xq = at::rand( - {1, seq_len, n_heads, head_dim}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor xk = at::rand( - {1, seq_len, n_kv_heads, head_dim}, - at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor freqs_cos = - at::rand({seq_len, head_dim / 2}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor freqs_sin = - at::rand({seq_len, head_dim / 2}, at::device(at::kCPU).dtype(at::kFloat)); - - std::pair outs = - rotary_embedding_impl(xq, xk, freqs_cos, freqs_sin); - at::Tensor& xq_out = outs.first; - at::Tensor& xk_out = outs.second; - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_INPUT_FOR(x) \ - IOValueRef r_##x = graph.add_input_tensor( \ - x.sizes().vec(), from_at_scalartype(x.scalar_type())); - - MAKE_INPUT_FOR(xq); - MAKE_INPUT_FOR(xk); - MAKE_INPUT_FOR(freqs_cos); - MAKE_INPUT_FOR(freqs_sin); - - const ValueRef r_xq_out = graph.add_tensor( - xq_out.sizes().vec(), from_at_scalartype(xq_out.scalar_type())); - const ValueRef r_xk_out = graph.add_tensor( - xk_out.sizes().vec(), from_at_scalartype(xk_out.scalar_type())); - - VK_GET_OP_FN("et_vk.apply_rotary_emb.default") - (graph, - {r_xq.value, - r_xk.value, - r_freqs_cos.value, - r_freqs_sin.value, - graph.add_value_list({r_xq_out, r_xk_out})}); - - ValueRef staging_xq_out = graph.set_output_tensor(r_xq_out); - ValueRef 
staging_xk_out = graph.set_output_tensor(r_xk_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_xq.staging, xq.const_data_ptr(), xq.numel()); - graph.copy_into_staging(r_xk.staging, xk.const_data_ptr(), xk.numel()); - graph.copy_into_staging( - r_freqs_cos.staging, freqs_cos.const_data_ptr(), freqs_cos.numel()); - graph.copy_into_staging( - r_freqs_sin.staging, freqs_sin.const_data_ptr(), freqs_sin.numel()); - - graph.execute(); - - at::Tensor vk_xq_out = at::empty_like(xq_out); - graph.copy_from_staging( - staging_xq_out, vk_xq_out.mutable_data_ptr(), vk_xq_out.numel()); - - at::Tensor vk_xk_out = at::empty_like(xk_out); - graph.copy_from_staging( - staging_xk_out, vk_xk_out.mutable_data_ptr(), vk_xk_out.numel()); - - EXPECT_TRUE(at::allclose(xq_out, vk_xq_out, 1e-4, 1e-4)); - EXPECT_TRUE(at::allclose(xk_out, vk_xk_out, 1e-4, 1e-4)); -} - -TEST(VulkanRotaryEmbeddingTest, rotary_embedding_test) { - test_reference(); -} - -TEST(VulkanRotaryEmbeddingTest, rotary_embedding_llama3_params_test) { - test_reference( - /*n_heads=*/32, - /*n_kv_heads=*/8, - /*dim=*/2048); -} - -TEST(VulkanRotaryEmbeddingTest, rotary_embedding_llama3_params_test_seq_len_3) { - test_reference( - /*n_heads=*/32, - /*n_kv_heads=*/8, - /*dim=*/2048, - /*seq_len=*/3); -} diff --git a/backends/vulkan/test/op_tests/sdpa_test.cpp b/backends/vulkan/test/op_tests/sdpa_test.cpp deleted file mode 100644 index e4b3f662c04..00000000000 --- a/backends/vulkan/test/op_tests/sdpa_test.cpp +++ /dev/null @@ -1,839 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include "test_utils.h" - -#include -#include - -namespace torch { -namespace executor { -namespace native { - -// The below are copied from executorch/extension/llm/custom_ops/op_sdpa_aot.cpp -// They are needed because the original definitions are inaccessible due to -// being defined in an anonymous namespace. 
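The reference SDPA path further down builds a causal additive mask with ATen ops (construct_attention_mask and convert_boolean_attn_mask below). As a plain-C++ illustration of that mask, here is a small hedged sketch (illustrative only, not part of the deleted test; the function and example values are hypothetical): rows cover query positions [start_pos, start_pos + seq_len), and an entry is 0 where the key position does not exceed the query position and -inf where attention must be masked out.

```cpp
#include <cstdio>
#include <limits>
#include <vector>

// Builds the additive causal mask for seq_len query rows starting at
// start_pos, over start_pos + seq_len key columns: 0 where attention is
// allowed (key position <= query position), -inf where it is masked out.
std::vector<std::vector<float>> causal_additive_mask(int start_pos, int seq_len) {
  const int length = start_pos + seq_len;
  const float neg_inf = -std::numeric_limits<float>::infinity();
  std::vector<std::vector<float>> mask(
      seq_len, std::vector<float>(length, neg_inf));
  for (int row = 0; row < seq_len; ++row) {
    const int query_pos = start_pos + row;
    for (int col = 0; col <= query_pos; ++col) {
      mask[row][col] = 0.0f; // lower-triangular entries remain unmasked
    }
  }
  return mask;
}

int main() {
  // start_pos = 2, seq_len = 2 -> a 2 x 4 mask.
  for (const auto& row : causal_additive_mask(2, 2)) {
    for (const float v : row) {
      std::printf("%6.1f ", v);
    }
    std::printf("\n");
  }
  return 0;
}
```

The ATen-based helpers below produce the same pattern, with the additional behavior that a non-boolean attn_mask is passed through unchanged as an additive mask.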
- -Tensor& sdpa_with_kv_cache_out_no_context( - const Tensor& q_projected, - const Tensor& k_projected, - const Tensor& v_projected, - Tensor& key_cache, - Tensor& value_cache, - const int64_t start_pos, - const int64_t seq_len, - // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const optional attn_mask, - const double dropout_p, - const bool is_causal, - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const optional scale, - Tensor& output) { - executorch::runtime::KernelRuntimeContext context{}; - return torch::executor::native::sdpa_with_kv_cache_out( - context, - q_projected, - k_projected, - v_projected, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - dropout_p, - is_causal, - scale, - output); -} - -at::Tensor sdpa_with_kv_cache_aten( - const at::Tensor& q_projected, - const at::Tensor& k_projected, - const at::Tensor& v_projected, - at::Tensor& key_cache, - at::Tensor& value_cache, - const int64_t start_pos, - const int64_t seq_len, - // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const std::optional attn_mask, - const double dropout_p, - const bool is_causal, - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const std::optional scale) { - auto output = at::empty_like(q_projected); - WRAP_TO_ATEN(sdpa_with_kv_cache_out_no_context, 11) - (q_projected, - k_projected, - v_projected, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - dropout_p, - is_causal, - scale, - output); - return output; -} - -} // namespace native -} // namespace executor -} // namespace torch - -// -// Reference Implementation -// - -/* - * Converts a boolean mask to an additive mask. Values that are false are - * converted to -inf, and values that are true are converted to 0. - */ -at::Tensor convert_boolean_attn_mask( - const at::Tensor& attn_mask, - caffe2::TypeMeta dtype) { - // Convert boolean mask to additive mask; need to invert mask to indicate what - // to mask *out*. - if (attn_mask.dtype() == at::kBool) { - return at::where( - attn_mask.logical_not(), - -std::numeric_limits::infinity(), - at::scalar_tensor( - 0.0, at::TensorOptions().dtype(dtype).device(attn_mask.device()))); - } - // Otherwise, attn_mask represents an additive attention tensor - return attn_mask; -} - -/* - * Construct an attention mask for SDPA. - * 1. Construct a square matrix of ones with each dim equal to start_pos + - * seq_len - * 2. Keep the lower triangular elements as 1 and set the rest to 0 - * 3. Slice the mask to keep only seq_len rows starting from input_pos - * 4. 
Convert the mask to an additive mask - */ -at::Tensor construct_attention_mask( - const at::Tensor& q, - const at::Tensor& k_cache, - const int start_pos) { - const int max_seq_len = k_cache.size(1); - const int seq_len = q.size(1); - - const int length = start_pos + seq_len; - at::Tensor attn_mask_base = - at::ones({length, length}, q.options().dtype(at::kBool)).tril(); - - at::Tensor attn_mask_sliced = - at::slice(attn_mask_base, 0, start_pos, start_pos + seq_len); - - attn_mask_sliced = convert_boolean_attn_mask(attn_mask_sliced, q.dtype()); - return attn_mask_sliced; -} - -/* - * Reference implementation of SDPA - */ -at::Tensor sdpa_reference_impl( - const at::Tensor& q_projected, - const at::Tensor& k_projected, - const at::Tensor& v_projected, - at::Tensor& key_cache, - at::Tensor& value_cache, - const int64_t start_pos, - const int64_t seq_len, - const std::optional __attn_mask_ignored, - const double dropout_p, - const bool is_causal, - const std::optional scale) { - at::Tensor attn_mask = - construct_attention_mask(q_projected, key_cache, start_pos); - - // Cache update - at::Tensor key_cache_updated = at::slice_scatter( - key_cache, k_projected, 1, start_pos, start_pos + k_projected.size(1)); - at::Tensor value_cache_updated = at::slice_scatter( - value_cache, v_projected, 1, start_pos, start_pos + v_projected.size(1)); - - // Write back to input - key_cache = key_cache_updated; - value_cache = value_cache_updated; - - at::Tensor key_cache_sliced = - at::slice(key_cache_updated, 1, 0, start_pos + q_projected.size(1)); - - at::Tensor value_cache_sliced = - at::slice(value_cache_updated, 1, 0, start_pos + q_projected.size(1)); - - // Since n_heads may not be the same as n_kv_heads, the sliced k and v cache - // matrices need to be "expanded" to match - const int num_repeats = q_projected.size(2) / key_cache.size(2); - at::Tensor key_cache_sliced_repeated = - at::repeat_interleave(key_cache_sliced, num_repeats, 2); - at::Tensor value_cache_sliced_repeated = - at::repeat_interleave(value_cache_sliced, num_repeats, 2); - - at::Tensor q_transposed = q_projected.transpose(1, 2); - at::Tensor k_transposed = key_cache_sliced_repeated.transpose(1, 2); - at::Tensor v_transposed = value_cache_sliced_repeated.transpose(1, 2); - - at::Tensor k_transposed_2 = k_transposed.transpose(-2, -1); - at::Tensor attn_weight_prescale = at::matmul(q_transposed, k_transposed_2); - - float scale_factor = 1.0 / sqrt(q_transposed.size(-1)); - at::Tensor attn_weight = attn_weight_prescale * scale_factor + attn_mask; - - at::Tensor attn_weight_softmax = at::softmax(attn_weight, -1); - at::Tensor out = at::matmul(attn_weight_softmax, v_transposed); - - return out.transpose(1, 2); -} - -// -// Test functions -// - -void test_reference_sdpa( - const int start_input_pos, - const int sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - at::ScalarType dtype = at::kFloat) { - const int head_dim = embedding_dim / num_heads; - - // K and V caches. 
Need an extra set for the reference implementation - - at::Tensor k_cache = at::zeros( - {batch_size, max_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v_cache = at::zeros_like(k_cache); - - at::Tensor k_cache_ref = at::zeros_like(k_cache); - at::Tensor v_cache_ref = at::zeros_like(v_cache); - - for (int input_pos = start_input_pos; input_pos + sequence_len < max_seq_len; - input_pos += sequence_len) { - at::Tensor q = at::rand( - {batch_size, sequence_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor k = at::rand( - {batch_size, sequence_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v = at::rand_like(k); - - at::Tensor reference_impl_out = sdpa_reference_impl( - q, k, v, k_cache, v_cache, input_pos, sequence_len, {}, 0.0, true, {}); - - at::Tensor reference_out = torch::executor::native::sdpa_with_kv_cache_aten( - q, - k, - v, - k_cache_ref, - v_cache_ref, - input_pos, - sequence_len, - {}, - 0.0, - true, - {}); - - ASSERT_TRUE(at::allclose(reference_impl_out, reference_out)); - } -} - -void test_vulkan_sdpa( - const int start_input_pos, - const int base_sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - const bool dynamic_seq_len = true, - at::ScalarType dtype = at::kFloat) { - const int head_dim = embedding_dim / num_heads; - - const int init_seq_len = dynamic_seq_len ? max_seq_len : base_sequence_len; - // K and V caches - - at::Tensor k_cache = at::zeros( - {batch_size, max_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - - at::Tensor v_cache = at::zeros_like(k_cache); - - // Reference input data - at::Tensor q = at::empty( - {batch_size, init_seq_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor k = at::empty( - {batch_size, init_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v = at::empty_like(k); - - // Get reference output - at::Tensor out = at::empty_like(q); - - // Build Vulkan SDPA graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - - // "Data" variant for vulkan initialization - - at::Tensor k_cache_data = at::zeros_like(k_cache); - at::Tensor v_cache_data = at::zeros_like(v_cache); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(k_cache_data); - MAKE_TENSORREF_FOR(v_cache_data); - -#define MAKE_INPUT_FOR(x) \ - IOValueRef r_##x = graph.add_input_tensor( \ - x.sizes().vec(), from_at_scalartype(x.scalar_type())); - - MAKE_INPUT_FOR(q); - MAKE_INPUT_FOR(k); - MAKE_INPUT_FOR(v); -#undef MAKE_INPUT_FOR - - const ValueRef r_input_pos_symint = graph.add_symint(start_input_pos); - const ValueRef r_out = graph.add_tensor( - out.sizes().vec(), from_at_scalartype(out.scalar_type())); - - VK_GET_OP_FN("sdpa_with_kv_cache.default") - (graph, - { - r_q.value, - r_k.value, - r_v.value, - r_k_cache_data, - r_v_cache_data, - r_input_pos_symint, - kDummyValueRef, // sequence_len - kDummyValueRef, // attn_mask - kDummyValueRef, // dropout_p - kDummyValueRef, // is_causal - kDummyValueRef, // scale - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - -#define COPY_INPUT(x) \ - 
graph.copy_into_staging(r_##x.staging, x.const_data_ptr(), x.numel()); - -#define EXTRACT_TENSOR(x) \ - at::Tensor vk_##x = at::zeros_like(x).contiguous(); \ - graph.copy_from_staging( \ - staging_##x, vk_##x.mutable_data_ptr(), vk_##x.numel()); - - int seq_len = base_sequence_len; - for (int i = 0, input_pos = start_input_pos; - input_pos + seq_len < max_seq_len; - input_pos += seq_len, i++) { - q = at::rand( - {batch_size, seq_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - k = at::rand( - {batch_size, seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - v = at::rand_like(k); - - at::Tensor reference_out = sdpa_reference_impl( - q, k, v, k_cache, v_cache, input_pos, seq_len, {}, 0.0, true, {}); - - graph.set_symint(r_input_pos_symint, input_pos); - graph.resize_input(0, q.sizes().vec()); - graph.resize_input(1, k.sizes().vec()); - graph.resize_input(2, v.sizes().vec()); - graph.propagate_resize(); - - // Run Vulkan SDPA - COPY_INPUT(q); - COPY_INPUT(k); - COPY_INPUT(v); - - graph.execute(); - - out = at::empty_like(q); - EXTRACT_TENSOR(out); - - const bool output_correct = at::allclose(reference_out, vk_out); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_out - vk_out); - - std::cout << "Failed at input_pos " << input_pos << " with seq_len " - << seq_len << std::endl; - - std::cout << "Maximum difference: " << std::endl; - std::cout << at::max(diffs).item() << std::endl; - std::cout << "Found at index " << std::endl; - std::cout << at::argmax(diffs).item() << std::endl; - - std::cout << "Maximum value observed: " << std::endl; - std::cout << at::max(at::abs(at::cat({reference_out, vk_out}, -1))).item() - << std::endl; - } - ASSERT_TRUE(output_correct); - - if (dynamic_seq_len) { - seq_len = base_sequence_len + (i % 3); - } - } -} - -TEST(VulkanSDPATest, test_sdpa_op_small_params) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 7; - - test_vulkan_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len, - false); -} - -TEST(VulkanSDPATest, test_sdpa_op_small_params_dynamic) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 12; - - test_vulkan_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_sdpa_op_llama3_params_dynamic) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_vulkan_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_reference_impl) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_reference_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -void test_vulkan_flash_attention_impl( - 
const int start_input_pos, - const int sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - vkcompute::utils::StorageType storage_type, - at::ScalarType dtype = at::kFloat) { - const int head_dim = embedding_dim / num_heads; - - at::Tensor k_cache = at::zeros( - {batch_size, max_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v_cache = at::zeros_like(k_cache); - - at::Tensor q = at::rand( - {batch_size, sequence_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor k = at::rand( - {batch_size, sequence_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v = at::rand_like(k); - - // Get reference output using existing SDPA - at::Tensor reference_out = sdpa_reference_impl( - q, - k, - v, - k_cache, - v_cache, - start_input_pos, - sequence_len, - {}, - 0.0, - true, - {}); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(storage_type); - ComputeGraph graph(config); - - // Create input references - IOValueRef r_q = graph.add_input_tensor( - q.sizes().vec(), from_at_scalartype(q.scalar_type())); - IOValueRef r_k = graph.add_input_tensor( - k.sizes().vec(), from_at_scalartype(k.scalar_type())); - IOValueRef r_v = graph.add_input_tensor( - v.sizes().vec(), from_at_scalartype(v.scalar_type())); - - // Create cache tensors (these would be updated by cache update operations in - // practice) - ValueRef r_k_cache = graph.add_tensorref( - k_cache.sizes().vec(), - from_at_scalartype(k_cache.scalar_type()), - k_cache.const_data_ptr()); - ValueRef r_v_cache = graph.add_tensorref( - v_cache.sizes().vec(), - from_at_scalartype(v_cache.scalar_type()), - v_cache.const_data_ptr()); - - const ValueRef r_input_pos_symint = graph.add_symint(start_input_pos); - const ValueRef r_out = - graph.add_tensor(q.sizes().vec(), from_at_scalartype(q.scalar_type())); - - // Call Flash Attention implementation - VK_GET_OP_FN("llama.flash_attention.default") - (graph, - { - r_q.value, - r_k.value, // Use actual K tensor, not cache - r_v.value, // Use actual V tensor, not cache - r_input_pos_symint, - kDummyValueRef, // attn_mask - kDummyValueRef, // dropout_p - kDummyValueRef, // is_causal - kDummyValueRef, // scale - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy inputs and run - graph.copy_into_staging(r_q.staging, q.const_data_ptr(), q.numel()); - graph.copy_into_staging(r_k.staging, k.const_data_ptr(), k.numel()); - graph.copy_into_staging(r_v.staging, v.const_data_ptr(), v.numel()); - - graph.execute(); - - // Extract output - at::Tensor vk_out = at::zeros_like(q).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare results - const bool output_correct = at::allclose(reference_out, vk_out, 1e-3, 1e-3); - - if (!output_correct) { - at::Tensor diffs = at::abs(reference_out - vk_out); - std::cout << "Maximum difference: " << at::max(diffs).item() << std::endl; - std::cout << "Maximum value observed: " - << at::max(at::abs(at::cat({reference_out, vk_out}, -1))).item() - << std::endl; - } - ASSERT_TRUE(output_correct); -} - -void test_vulkan_flash_attention( - const int start_input_pos, - const int sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - at::ScalarType dtype = at::kFloat) { - 
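  // This wrapper just runs the same flash attention test case twice, once with
  // buffer storage and once with 3D texture storage, so every parameter
  // combination below is exercised against both GPU tensor representations.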
test_vulkan_flash_attention_impl( - start_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len, - vkcompute::utils::kBuffer, - dtype); - - test_vulkan_flash_attention_impl( - start_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len, - vkcompute::utils::kTexture3D, - dtype); -} - -// Flash Attention Tests (both Buffer and Texture) -TEST(VulkanSDPATest, test_flash_attention_small_params) { - const int starting_input_pos = 0; - const int sequence_len = 2; - const int embedding_dim = 4; - const int num_heads = 2; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 4; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_multi_tile) { - const int starting_input_pos = 0; - const int sequence_len = 48; - const int embedding_dim = 32; - const int num_heads = 2; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 64; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_small_params) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 7; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_small_params_dynamic) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 12; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_llama3_params) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_llama3_params_dynamic) { - const int starting_input_pos = 0; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - // Test with different sequence lengths - std::vector sequence_lengths = {1, 3, 5, 7, 16, 32}; - - for (int seq_len : sequence_lengths) { - if (seq_len < max_seq_len) { - test_vulkan_flash_attention( - starting_input_pos, - seq_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); - } - } -} - -TEST(VulkanSDPATest, test_flash_attention_reference_impl) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - 
batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_reference_impl_small) { - const int starting_input_pos = 0; - const int sequence_len = 2; - const int embedding_dim = 32; - const int num_heads = 4; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 16; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_vec4_alignment) { - const int starting_input_pos = 0; - const int sequence_len = 8; - const int embedding_dim = 64; - const int num_heads = 4; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 16; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_edge_cases) { - // Test with single head (no multi-query complexity) - test_vulkan_flash_attention(0, 1, 8, 1, 1, 1, 4); - - // Test with equal heads (no multi-query complexity) - test_vulkan_flash_attention(0, 2, 16, 4, 4, 1, 8); - - // Test with large head dimension - test_vulkan_flash_attention(0, 2, 128, 2, 1, 1, 8); - - // Test with sequence length that exactly matches block size (32) - test_vulkan_flash_attention(0, 32, 64, 2, 1, 1, 64); - - // Test with sequence length slightly larger than block size - test_vulkan_flash_attention( - 0, 33, 68, 2, 1, 1, 64); // 68 = 4*17, good for vec4 -} diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl deleted file mode 100644 index b9386f92772..00000000000 --- a/backends/vulkan/test/op_tests/targets.bzl +++ /dev/null @@ -1,224 +0,0 @@ -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID") -load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_pt_ops_deps") -load("@fbsource//xplat/caffe2:pt_ops.bzl", "pt_operator_library") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "get_platforms", -) - -def define_test_targets(test_name, extra_deps = [], src_file = None, is_fbcode = False): - deps_list = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ] + extra_deps - - src_file_str = src_file if src_file else "{}.cpp".format(test_name) - - runtime.cxx_binary( - name = "{}_bin".format(test_name), - srcs = [ - src_file_str, - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - platforms = get_platforms(), - define_static_target = False, - deps = deps_list, - ) - - runtime.cxx_test( - name = test_name, - srcs = [ - src_file_str, - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = deps_list, - ) - - -def define_common_targets(is_fbcode = False): - if is_fbcode: - return - - runtime.python_library( - name = "generate_op_correctness_tests_lib", - srcs = native.glob(["utils/*.py"]) + [ - "generate_op_correctness_tests.py", - "cases.py", - ], - base_module = "executorch.backends.vulkan.test.op_tests", - deps = [ - "fbsource//third-party/pypi/expecttest:expecttest", - ], - external_deps = ["torchgen"], - ) - - runtime.python_library( - name = "generate_op_benchmarks_lib", - srcs = 
native.glob(["utils/*.py"]) + [ - "generate_op_benchmarks.py", - "cases.py", - ], - base_module = "executorch.backends.vulkan.test.op_tests", - deps = [ - "fbsource//third-party/pypi/expecttest:expecttest", - ], - external_deps = ["torchgen"], - ) - - runtime.python_binary( - name = "generate_op_correctness_tests", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_correctness_tests", - deps = [ - ":generate_op_correctness_tests_lib", - ], - ) - - runtime.python_binary( - name = "generate_op_benchmarks", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_benchmarks", - deps = [ - ":generate_op_benchmarks_lib", - ], - ) - - aten_src_path = runtime.external_dep_location("aten-src-path") - genrule_cmd = [ - "$(exe :generate_op_correctness_tests)", - "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), - "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), - "-o $OUT", - ] - - runtime.genrule( - name = "generated_op_correctness_tests_cpp", - outs = { - "op_tests.cpp": ["op_tests.cpp"], - }, - cmd = " ".join(genrule_cmd), - default_outs = ["."], - ) - - benchmarks_genrule_cmd = [ - "$(exe :generate_op_benchmarks)", - "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), - "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), - "-o $OUT", - ] - - runtime.genrule( - name = "generated_op_benchmarks_cpp", - outs = { - "op_benchmarks.cpp": ["op_benchmarks.cpp"], - }, - cmd = " ".join(benchmarks_genrule_cmd), - default_outs = ["."], - ) - - runtime.cxx_binary( - name = "compute_graph_op_benchmarks_bin", - srcs = [ - ":generated_op_benchmarks_cpp[op_benchmarks.cpp]", - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - define_static_target = False, - deps = [ - "//third-party/benchmark:benchmark", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - platforms = get_platforms(), - ) - - runtime.cxx_library( - name = "test_utils", - srcs = [ - "test_utils.cpp", - ], - headers = [ - "test_utils.h", - ], - exported_headers = [ - "test_utils.h", - ], - deps = [ - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/runtime/core/exec_aten:lib", - runtime.external_dep_location("libtorch"), - ], - visibility = [ - "//executorch/backends/vulkan/test/op_tests/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - define_test_targets( - "compute_graph_op_tests", - src_file=":generated_op_correctness_tests_cpp[op_tests.cpp]" - ) - - define_test_targets( - "sdpa_test", - extra_deps = [ - ":test_utils", - "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", - "//executorch/extension/tensor:tensor", - ] - ) - define_test_targets( - "quantize_test", - extra_deps = [ - ":test_utils", - "//executorch/kernels/quantized/cpu:op_quantize", - "//executorch/extension/tensor:tensor", - "//executorch/extension/aten_util:aten_bridge", - ] - ) - define_test_targets( - "dequantize_test", - extra_deps = [ - ":test_utils", - "//executorch/kernels/quantized/cpu:op_dequantize", - "//executorch/extension/tensor:tensor", - "//executorch/extension/aten_util:aten_bridge", - ] - ) - define_test_targets( - "choose_qparams_test", - extra_deps = [ - ":test_utils", - "//executorch/kernels/quantized/cpu:op_choose_qparams", - "//executorch/extension/tensor:tensor", - "//executorch/extension/aten_util:aten_bridge", - ] - ) - define_test_targets( - "quantized_linear_test", - extra_deps = [ 
- ":test_utils", - ] - ) - define_test_targets( - "rotary_embedding_test", - extra_deps = [ - ":test_utils", - ] - ) - define_test_targets( - "quantize_affine_test", - extra_deps = [ - ":test_utils", - ] - ) diff --git a/backends/vulkan/test/op_tests/test_utils.cpp b/backends/vulkan/test/op_tests/test_utils.cpp deleted file mode 100644 index c5702abd079..00000000000 --- a/backends/vulkan/test/op_tests/test_utils.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "test_utils.h" - -#include - -executorch::aten::ScalarType at_scalartype_to_et_scalartype( - at::ScalarType dtype) { - using ScalarType = executorch::aten::ScalarType; - switch (dtype) { - case at::kByte: - return ScalarType::Byte; - case at::kChar: - return ScalarType::Char; - case at::kShort: - return ScalarType::Short; - case at::kInt: - return ScalarType::Int; - case at::kLong: - return ScalarType::Long; - case at::kHalf: - return ScalarType::Half; - case at::kFloat: - return ScalarType::Float; - case at::kDouble: - return ScalarType::Double; - default: - throw std::runtime_error("Unsupported dtype"); - } -} - -std::string scalar_type_name(c10::ScalarType dtype) { - switch (dtype) { - case c10::kLong: - return "c10::kLong"; - case c10::kShort: - return "c10::kShort"; - case c10::kComplexHalf: - return "c10::kComplexHalf"; - case c10::kComplexFloat: - return "c10::kComplexFloat"; - case c10::kComplexDouble: - return "c10::kComplexDouble"; - case c10::kBool: - return "c10::kBool"; - case c10::kQInt8: - return "c10::kQInt8"; - case c10::kQUInt8: - return "c10::kQUInt8"; - case c10::kQInt32: - return "c10::kQInt32"; - case c10::kBFloat16: - return "c10::kBFloat16"; - case c10::kQUInt4x2: - return "c10::kQUInt4x2"; - case c10::kQUInt2x4: - return "c10::kQUInt2x4"; - case c10::kFloat: - return "c10::kFloat"; - case c10::kHalf: - return "c10::kHalf"; - case c10::kInt: - return "c10::kInt"; - case c10::kChar: - return "c10::kChar"; - case c10::kByte: - return "c10::kByte"; - case c10::kDouble: - return "c10::kDouble"; - case c10::kUInt16: - return "c10::kUInt16"; - case c10::kBits16: - return "c10::kBits16"; - default: - return "Unknown(" + std::to_string(static_cast(dtype)) + ")"; - } -} - -vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - using namespace vkcompute; - switch (at_scalartype) { - case c10::kHalf: - return vkapi::kHalf; - case c10::kFloat: - return vkapi::kFloat; - case c10::kDouble: - return vkapi::kDouble; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - // No support for 64-bit integers - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - case c10::kByte: - return vkapi::kByte; - case c10::kShort: - return vkapi::kShort; - case c10::kUInt16: - return vkapi::kUInt16; - default: - VK_THROW( - "Unsupported at::ScalarType: ", - scalar_type_name(at_scalartype), - " (", - static_cast(at_scalartype), - ")"); - } -} diff --git a/backends/vulkan/test/op_tests/test_utils.h b/backends/vulkan/test/op_tests/test_utils.h deleted file mode 100644 index 369767007e0..00000000000 --- a/backends/vulkan/test/op_tests/test_utils.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -/** - * Convert at::ScalarType to executorch::ScalarType - */ -executorch::aten::ScalarType at_scalartype_to_et_scalartype( - at::ScalarType dtype); - -/** - * Get the string name of a c10::ScalarType for better error messages - */ -std::string scalar_type_name(c10::ScalarType dtype); - -/** - * Convert c10::ScalarType to vkcompute::vkapi::ScalarType - */ -vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype); diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py deleted file mode 100644 index 6ad2f568e91..00000000000 --- a/backends/vulkan/test/op_tests/utils/aten_types.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -#################### -## ATen C++ Types ## -#################### - -AT_INT_ARRAY_REF = "at::IntArrayRef" -AT_SCALAR = "at::Scalar" -AT_TENSOR = "at::Tensor" -AT_TENSOR_LIST = "at::TensorList" -BOOL = "bool" -DOUBLE = "double" -INT = "int64_t" -OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" -OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" -OPT_AT_TENSOR = "::std::optional" -OPT_BOOL = "::std::optional" -OPT_INT64 = "::std::optional" -OPT_DEVICE = "::std::optional" -OPT_LAYOUT = "::std::optional" -OPT_MEMORY_FORMAT = "::std::optional" -OPT_SCALAR_TYPE = "::std::optional" -STRING = "std::string_view" -OLD_STRING = "c10::string_view" -TWO_TENSOR_TUPLE = "::std::tuple" -THREE_TENSOR_TUPLE = "::std::tuple" -TENSOR_VECTOR = "::std::vector" diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py deleted file mode 100644 index 76eb9dbe838..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
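# This module generates Google Benchmark C++ sources for Vulkan operators. The
# VkBenchmarkGen / VkBenchmarkFileGen classes defined below reuse
# ComputeGraphGen to emit, for every operator test suite, a benchmark fixture,
# one benchmark case per input combination, and the querypool plumbing that
# reports per-shader median execution times as benchmark counters.
#
# A rough, hypothetical usage sketch (the op name and output file name are
# placeholders; `f` is a torchgen NativeFunction and `suite` a VkTestSuite
# built from cases.py):
#
#   filegen = VkBenchmarkFileGen("op_benchmarks.cpp")
#   filegen.add_suite("aten.add.Tensor", f, suite)
#   cpp_source = filegen.generate_cpp()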
- -import re - -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) -from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( - CorrectnessTestGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite - -from torchgen.model import NativeFunction - -########################## -## Test Suite Generation ## -########################## - -benchmark_fixture_template = """ -class GeneratedOpBenchmark_{op_name} : public ::benchmark::Fixture {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - {arg_valuerefs} - - void SetUp(::benchmark::State& state) override {{ - GraphConfig config; - config.descriptor_pool_safety_factor = 2.0; - test_dtype = at::ScalarType(state.range(0)); - const utils::StorageType storage_type = utils::StorageType(state.range(1)); - const utils::GPUMemoryLayout memory_layout = utils::GPUMemoryLayout(state.range(2)); - config.set_storage_type_override(storage_type); - config.set_memory_layout_override(memory_layout); - config.enable_querypool = true; - graph = new ComputeGraph(config); - }} - - void TearDown(::benchmark::State& state) override {{ - delete graph; - graph = nullptr; - }} - - {build_graph_fn} - {benchmark_fn} -}}; -""" - -benchmark_template = """ -BENCHMARK_DEFINE_F(GeneratedOpBenchmark_{op_name}, {case_name})(benchmark::State& state) {{ - {skips} - {create_ref_data} - {call_build_graph} - ShaderTimes shader_times; - for (auto _ : state) {{ - {call_benchmark} - graph->context()->querypool().extract_results(); - QueryPoolResults results = graph->context()->querypool().get_shader_timestamp_data(); - process_querypool_results(results, shader_times); - }} - register_shader_time_counters(state, shader_times); -}} - -BENCHMARK_REGISTER_F(GeneratedOpBenchmark_{op_name}, {case_name})->Threads(1)->ArgsProduct({combos}); -""" - - -class VkBenchmarkGen(CorrectnessTestGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen( - self.op_reg_name, self.f, self.suite_def, inputs.force_io - ) - - def gen_call_benchmark(self, prepack=False) -> str: - test_str = f"benchmark_{self.op_name}(" - if prepack: - test_str = f"prepacked_benchmark_{self.op_name}(" - for binding in self.f_sig.arguments(): - arg = binding.argument - test_str += f"{arg.name}, " - test_str = test_str[:-2] + ");" - test_str = re.sub(r"^", " ", test_str, flags=re.M) - return test_str - - def gen_call_build_graph(self, prepack=False) -> str: - test_str = f"build_graph_{self.op_name}(" - if prepack: - test_str = f"prepacked_build_graph_{self.op_name}(" - for binding in self.f_sig.arguments(): - arg = binding.argument - test_str += f"{arg.name}, " - test_str = test_str[:-2] + ");" - test_str = re.sub(r"^", " ", test_str, flags=re.M) - return test_str - - def gen_combos(self, inputs) -> str: - dtypes_list = ", ".join(f"int({dtype})" for dtype in self.suite_def.dtypes) - storage_types_list = ", ".join( - f"int({storage_type})" for storage_type in self.suite_def.storage_types - ) - layouts_list = ", ".join(f"int({layout})" for layout in self.suite_def.layouts) - return f"{{ {{ {dtypes_list} }}, {{ {storage_types_list} }}, {{ {layouts_list} }} }}" - - def generate_benchmark_case(self, inputs, prepack=False) -> str: - return benchmark_template.format( - op_name=f"{self.op_name}", - case_name=self.gen_case_name(inputs, 
prepack), - skips=self.generator.gen_conditional_skips( - 'state.SkipWithError("unsupported type"); return;' - ), - create_ref_data=self.gen_create_ref_data(inputs), - call_build_graph=self.gen_call_build_graph(prepack), - call_benchmark=self.gen_call_benchmark(prepack), - combos=self.gen_combos(inputs), - ) - - def generate_benchmark(self) -> str: - benchmarks_cpp = "" - for inputs in self.suite_def.input_cases: - if not self.suite_def.requires_prepack: - benchmarks_cpp += self.generate_benchmark_case(inputs) - if self.suite_def.supports_prepack(): - benchmarks_cpp += self.generate_benchmark_case(inputs, prepack=True) - return benchmarks_cpp - - def generate_benchmark_fixture(self) -> str: - build_graph_fn = "" - benchmark_fn = "" - if not self.suite_def.requires_prepack: - build_graph_fn = self.generator.gen_build_graph_fn() - benchmark_fn = self.generator.gen_op_exec_graph_fn() - - prepacked_build_graph_fn = "" - prepacked_benchmark_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_build_graph_fn = self.generator.gen_build_graph_fn() - build_graph_fn += "\n\n " - build_graph_fn += prepacked_build_graph_fn - prepacked_benchmark_fn = self.generator.gen_op_exec_graph_fn() - benchmark_fn += "\n\n " - benchmark_fn += prepacked_benchmark_fn - - return benchmark_fixture_template.format( - op_name=self.op_name, - build_graph_fn=build_graph_fn, - benchmark_fn=benchmark_fn, - rtol=self.suite_def.rtol, - arg_valuerefs=self.generator.gen_arg_valueref_decls(), - atol=self.suite_def.atol, - ) - - -########################## -## Test File Generation ## -########################## - -cpp_test_template = """ -#include -#include -#include - -#include -#include -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) {{ - switch (at_scalartype) {{ - case c10::kDouble: - return vkapi::kDouble; - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - case c10::kBool: - return vkapi::kBool; - default: - VK_THROW("Unsupported at::ScalarType!"); - }} -}} - -at::Tensor make_casted_randint_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - int low = 0, - int high = 10) {{ - - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_rand_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - if (high == 1.0 && low == 0.0) - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); - - if (dtype == at::kChar) - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); - - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; -}} - -at::Tensor make_seq_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - (void)low; - (void)high; - - int64_t n = 1; - for (auto size: sizes) {{ - n *= size; - }} - - std::vector values(n); - for (int i=0;i indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{static_cast(indices.size())}}; - - // Clone as original data will be deallocated upon return. 
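  // (at::from_blob only wraps the vector's storage without taking ownership,
  // so the detach().clone() below copies the data into tensor-owned memory
  // before the local vector is destroyed.)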
- return at::from_blob(indices.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_2d(std::vector> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& vec: indices) {{ - acc.insert(acc.end(), vec.begin(), vec.end()); - }} - - // Clone as original data will be deallocated upon return. - return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_3d(std::vector>> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size()), - static_cast(indices[0][0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& v: indices) {{ - for (auto& vv: v) {{ - acc.insert(acc.end(), vv.begin(), vv.end()); - }} - }} - - // Clone as original data will be deallocated upon return. - return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -using QueryPoolResults = std::vector; -using ShaderTimes = std::unordered_map>; - -void process_querypool_results( - QueryPoolResults& results, - ShaderTimes& shader_times) {{ - for (const vkcompute::vkapi::ShaderResult& r : results) {{ - uint64_t duration_ns = r.end_time_ns - r.start_time_ns; - if (shader_times.find(r.kernel_name) == shader_times.end()) {{ - shader_times[r.kernel_name] = std::vector(); - }} - shader_times[r.kernel_name].emplace_back(duration_ns); - }} -}} - -void register_shader_time_counters( - benchmark::State& state, - ShaderTimes& shader_times) {{ - for (auto& times_list : shader_times) {{ - // Filter to_nchw and nchw_to shaders - if (times_list.first.find("to_nchw") != std::string::npos) {{ - continue; - }} - if (times_list.first.find("nchw_to") != std::string::npos) {{ - continue; - }} - - std::sort(times_list.second.begin(), times_list.second.end()); - uint64_t median_time; - median_time = times_list.second[times_list.second.size() / 2]; - state.counters[times_list.first + " median ns"] = median_time; - }} -}} - -{benchmark_fixtures} - -{def_benchmarks} -""" - - -class VkBenchmarkFileGen: - def __init__(self, out_path): - self.out_path = out_path - self.suites_gens = [] - - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkBenchmarkGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) - - def generate_benchmarks_cpp(self) -> str: - return "\n".join([h.generate_benchmark() for h in self.suites_gens]) - - def generate_benchmark_fixtures(self) -> str: - return "\n".join([h.generate_benchmark_fixture() for h in self.suites_gens]) - - def generate_cpp(self) -> str: - return cpp_test_template.format( - benchmark_fixtures=self.generate_benchmark_fixtures(), - def_benchmarks=self.generate_benchmarks_cpp(), - ) diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py deleted file mode 100644 index 490044340d6..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ /dev/null @@ -1,788 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
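# ComputeGraphGen (defined below) is the core of the op test/benchmark code
# generator. Given a torchgen NativeFunction plus a TestSuite definition, it
# emits C++ that: (1) calls the ATen implementation to produce a reference
# `out`, (2) rebuilds the op as a vkcompute ComputeGraph, adding inputs,
# prepacked weights, scalars, and the output ValueRef, (3) stages the input
# data and executes the graph, and (4) copies the result back from staging and
# compares it against the ATen output.
#
# A minimal, hypothetical driver (the op name is a placeholder; `f` is a
# NativeFunction and `suite` a TestSuite):
#
#   gen = ComputeGraphGen("aten.add.Tensor", f, suite)
#   check_fn_cpp = gen.gen_op_check_fn()  # graph build + exec + output check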
- -import re -from dataclasses import dataclass -from typing import List, Optional, Union - -from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( - AT_INT_ARRAY_REF, - AT_SCALAR, - AT_TENSOR, - AT_TENSOR_LIST, - BOOL, - DOUBLE, - INT, - OLD_STRING, - OPT_AT_DOUBLE_ARRAY_REF, - OPT_AT_INT_ARRAY_REF, - OPT_AT_TENSOR, - OPT_BOOL, - OPT_DEVICE, - OPT_INT64, - OPT_LAYOUT, - OPT_MEMORY_FORMAT, - OPT_SCALAR_TYPE, - STRING, - TENSOR_VECTOR, - THREE_TENSOR_TUPLE, - TWO_TENSOR_TUPLE, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite - -from torchgen.api import cpp -from torchgen.api.types import CppSignatureGroup -from torchgen.gen import generate_static_dispatch_backend_call, translate_args -from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature -from torchgen.model import NativeFunction, Variant - -################################### -## Compute Graph Code Generation ## -################################### - - -@dataclass -class ATenArg: - name: str - cpp_type: str - default: Optional[str] - - -@dataclass -class ValueRef: - name: str - src_cpp_name: str - src_cpp_type: str - is_in: bool = False - is_out: bool = False - fixed_storage_type: Optional[str] = None - fixed_memory_layout: Optional[str] = None - requires_prepack: bool = False - supports_prepack: bool = False - # When is_dynamic_size is true, the underlying object size is not known - # during code-gen. Example is the out value for aten.split where the out - # value is a vector. In these cases, we need to use an additional - # vector or at::TensorList to track these values. - is_dynamic_size: bool = False - - @property - def io_value_list_name(self): - assert self.is_dynamic_size - return f"{self.name}_io_value_list" - - @property - def value_list_name(self): - assert self.is_dynamic_size - return f"{self.name}_value_list" - - @property - def vk_out(self): - assert self.is_out - return f"vk_{self.name}" - - -ValueRefList = Union[ValueRef, List[ValueRef]] - -InableCppType = frozenset([AT_TENSOR, AT_TENSOR_LIST]) - - -class ComputeGraphGen: - backend_key = None - - def __init__( - self, - op_reg_name: str, - f: NativeFunction, - suite_def: TestSuite, - include_io: bool = True, - ): - self.op_reg_name = op_reg_name - self.f = f - self.suite_def = suite_def - self.include_io = include_io - - self.f_sig = CppSignatureGroup.from_native_function( - self.f, method=False, fallback_binding=self.f.manual_cpp_binding - ).most_faithful_signature() - - self.graph = "graph" - self.dot = "->" - - self.args = [] - self.refs = {} - - self.should_prepack = False - - for binding in self.f_sig.arguments(): - arg = binding.argument - ctype = cpp.argumenttype_type( - arg.type, mutable=arg.is_write, binds=arg.name - ) - cpp_type = ctype.cpp_type(strip_ref=True) - - self.args.append( - ATenArg(name=arg.name, cpp_type=cpp_type, default=arg.default) - ) - - # These are the argument will be passed as a "weight" tensor, the - # corresponding object will be TensorRef in the compute graph. 
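            # (Prepacked arguments are added to the graph as TensorRef
            # constants via add_tensorref, so their data is baked in when the
            # graph is built; ordinary tensor inputs become IOValueRefs with
            # staging buffers that are re-copied on every execution.)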
- requires_prepack = ( - "weight" in arg.name - or "bias" in arg.name - or "running_mean" in arg.name - or "running_var" in arg.name - ) - supports_prepack = False - if arg.name in self.suite_def.prepacked_args: - supports_prepack = True - - fixed_storage_type = None - if arg.name in self.suite_def.arg_storage_types: - fixed_storage_type = self.suite_def.arg_storage_types[arg.name] - - fixed_memory_layout = None - if arg.name in self.suite_def.arg_memory_layouts: - fixed_memory_layout = self.suite_def.arg_memory_layouts[arg.name] - - self.refs[arg.name] = ValueRef( - name=f"{arg.name}_ref", - src_cpp_name=arg.name, - src_cpp_type=cpp_type, - is_in=(cpp_type in InableCppType), - fixed_storage_type=fixed_storage_type, - fixed_memory_layout=fixed_memory_layout, - requires_prepack=requires_prepack, - supports_prepack=supports_prepack, - ) - - ret_type = cpp.returns_type(self.f.func.returns, symint=False).cpp_type() - self.out = ATenArg(name="out", cpp_type=ret_type, default=None) - - fixed_storage_type = None - if "out" in self.suite_def.arg_storage_types: - fixed_storage_type = self.suite_def.arg_storage_types["out"] - fixed_memory_layout = None - if "out" in self.suite_def.arg_memory_layouts: - fixed_memory_layout = self.suite_def.arg_memory_layouts["out"] - - if ret_type == AT_TENSOR: - self.refs["out"] = ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=True, - fixed_storage_type=fixed_storage_type, - fixed_memory_layout=fixed_memory_layout, - ) - elif ret_type == TWO_TENSOR_TUPLE: - self.refs["out"] = [ - ValueRef( - name="out_ref_first", - src_cpp_name="std::get<0>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[0] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[0] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref_second", - src_cpp_name="std::get<1>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[1] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[1] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=False, - ), - ] - elif ret_type == THREE_TENSOR_TUPLE: - self.refs["out"] = [ - ValueRef( - name="out_ref_first", - src_cpp_name="std::get<0>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[0] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[0] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref_second", - src_cpp_name="std::get<1>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[1] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[1] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref_third", - src_cpp_name="std::get<2>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[2] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[2] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=False, - ), - ] - elif ret_type == TENSOR_VECTOR: - self.refs["out"] = ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=True, - is_dynamic_size=True, - ) - else: - raise NotImplementedError( - f"ret_type: {ret_type} not supported for out value" - ) 
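    # To make the bookkeeping above concrete: for a hypothetical op with
    # `input`, `weight` and `bias` tensor arguments returning a single
    # at::Tensor, self.refs would hold an input ValueRef for `input`
    # (is_in=True), prepack-required refs for `weight` and `bias`, and a
    # single output ValueRef under "out".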
- - ## ATen code generation - - def gen_decl(self, fn_name: str, ret_type: str = "void") -> str: - cpp_args = [a.decl() for a in self.f_sig.arguments()] - cpp_args_str = ", ".join(cpp_args) - return f"{ret_type} {fn_name}({cpp_args_str})" - - def create_aten_fn_call(self) -> str: - func_call = generate_static_dispatch_backend_call( - self.f_sig, self.f, ComputeGraphGen.backend_key - )[7:].replace("::cpu", "") - - return func_call - - def create_aten_method_call(self) -> str: - # For functions with only Method variant, we fallback to the function - # declared in MethodOperators.h - cpp_sig = gen_static_dispatch_backend_call_signature(self.f_sig, self.f) - exprs = translate_args(self.f_sig, cpp_sig) - func_call = f"at::_ops::{self.f_sig.name()}::call({exprs});" - return func_call - - def create_out_src(self, include_declarations: bool = True) -> str: - cpp_type = self.out.cpp_type if include_declarations else "" - if Variant.function in self.f.variants: - return f"{cpp_type} out = " + self.create_aten_fn_call() + "\n" - else: - return f"{cpp_type} out = " + self.create_aten_method_call() + "\n" - - ## Graph code generation utils - - def prepack_ref(self, ref: ValueRef) -> bool: - if ref.requires_prepack: - return True - else: - return ref.supports_prepack and self.should_prepack - - def create_value_decl_for(self, ref: ValueRefList) -> str: # noqa: C901 - if isinstance(ref, list): - ret_str = "" - for r in ref: - ret_str += self.create_value_decl_for(r) - return ret_str - - cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef" - if ref.src_cpp_type == AT_TENSOR_LIST: - ret_str = f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = f"std::vector {ref.io_value_list_name};\n" - ret_str += f"std::vector {ref.value_list_name};\n" - return ret_str - else: - return f"{cpp_type} {ref.name};\n" - - def create_value_for( # noqa: C901 - self, ref: ValueRefList, include_declarations: bool = True - ) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref: - ret_str += self.create_value_for(r) - return ret_str - - prepack = self.prepack_ref(ref) - ref_is_view = self.suite_def.is_view_op and ref.is_out - - # If skipping IO, force is_in to be False - if not self.include_io and ref.is_in: - ref.is_in = False - - cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" - if not include_declarations: - cpp_type = "" - - if ref.src_cpp_type == OPT_AT_TENSOR: - ret_str = f"{cpp_type} {ref.name} = " - if prepack: - ret_str = "" - if include_declarations: - ret_str += f"IOValueRef {ref.name};\n" - ret_str += f"{ref.name}.value = " - ret_str += f"!{ref.src_cpp_name}.has_value() ? 
" - ret_str += f"{self.graph}{self.dot}add_none() : " - if not prepack: - ret_str += f"{self.graph}{self.dot}" - ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" - ret_str += f"{ref.src_cpp_name}->sizes().vec(), " - ret_str += f"from_at_scalartype({ref.src_cpp_name}->scalar_type()" - if ref.fixed_storage_type: - ret_str += f", {ref.fixed_storage_type}" - if ref.fixed_memory_layout: - ret_str += f", {ref.fixed_memory_layout}" - ret_str += "));\n" - elif prepack: - ret_str += f"{self.graph}{self.dot}" - ret_str += f"add_tensorref({ref.src_cpp_name}->sizes().vec(), " - ret_str += f"from_at_scalartype({ref.src_cpp_name}->scalar_type()), " - ret_str += f"{ref.src_cpp_name}->const_data_ptr()); \n" - return ret_str - elif ref.src_cpp_type == OPT_INT64: - ret_str = f"{cpp_type} {ref.name} = " - ret_str += f"!{ref.src_cpp_name}.has_value() ? " - ret_str += f"{self.graph}{self.dot}add_none() : " - ret_str += f"{self.graph}{self.dot}add_scalar" - ret_str += f"({ref.src_cpp_name}.value());\n" - return ret_str - elif ( - ref.src_cpp_type == OPT_AT_DOUBLE_ARRAY_REF - or ref.src_cpp_type == OPT_AT_INT_ARRAY_REF - ): - ret_str = f"{cpp_type} {ref.name} = " - ret_str += f"!{ref.src_cpp_name}.has_value() ? " - ret_str += f"{self.graph}{self.dot}add_none() : " - ret_str += f"{self.graph}{self.dot}add_scalar_list" - ret_str += f"({ref.src_cpp_name}->vec());\n" - return ret_str - elif ref.src_cpp_type == AT_TENSOR_LIST: - assert ref.is_in, "AT_TENSOR_LIST must be an input" - # This logic is a bit convoluted. We need to create a IOValueRef for - # each tensor, to facilate staging. On the other hand, we will - # use the .value tensor to create a ValueList, which will be passed - # to the corresponding ops. - ret_str = "" - if include_declarations: - ret_str += f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" - ret_str += f"for (int i=0; i < {ref.src_cpp_name}.size(); i++) {{\n" - ret_str += ( - f" IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" - ) - ret_str += f" {ref.src_cpp_name}[i].sizes().vec(),\n" - ret_str += ( - f" from_at_scalartype({ref.src_cpp_name}[i].scalar_type())); \n" - ) - ret_str += f" {ref.name}_value_refs.emplace_back(io_value_ref.value);\n" - ret_str += f" {ref.name}_io_value_refs.emplace_back(io_value_ref);\n" - ret_str += "}\n" - ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = "" - if include_declarations: - ret_str += f"std::vector {ref.io_value_list_name};\n" - ret_str += f"std::vector {ref.value_list_name};\n" - ret_str += f""" -for (int i=0; i({ref.src_cpp_name}.toDouble()); \n" - elif ref.src_cpp_type == AT_INT_ARRAY_REF: - ret_str += f"add_scalar_list({ref.src_cpp_name}.vec()); \n" - elif ref.src_cpp_type == BOOL: - ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ref.src_cpp_type == INT: - ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ref.src_cpp_type == DOUBLE: - ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ( - ref.src_cpp_type == OPT_SCALAR_TYPE - or ref.src_cpp_type == OPT_LAYOUT - or ref.src_cpp_type == OPT_DEVICE - or ref.src_cpp_type == OPT_BOOL - or ref.src_cpp_type == OPT_MEMORY_FORMAT - ): - ret_str += "add_none(); \n" - elif ref.src_cpp_type == STRING or ref.src_cpp_type == OLD_STRING: - ret_str += f"add_string(std::string({ref.src_cpp_name})); \n" - elif ref.src_cpp_type == TWO_TENSOR_TUPLE: - ret_str += 
f"add_value_list({{{ref.name}_first, {ref.name}_second}}); \n" - elif ref.src_cpp_type == THREE_TENSOR_TUPLE: - ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second, {ref.name}_third}}); \n" - else: - raise RuntimeError(f"Unsupported cpp type {ref.src_cpp_type}") - - return ret_str - - def create_op_call(self) -> str: - deref = "*" if self.dot == "->" else "" - op_create_code = f'VK_GET_OP_FN("{self.op_reg_name}")({deref}{self.graph}, {{' - - for aten_arg in self.args: - ref = self.refs[aten_arg.name] - if ref.src_cpp_type == AT_TENSOR_LIST: - # Special case. Underlying tensors are input tensors, but the - # container itself is just a normal value. - op_create_code += f"{ref.name}, " - else: - op_create_code += ( - f"{ref.name}.value, " - if ref.is_in or ref.requires_prepack or ref.is_out - else f"{ref.name}, " - ) - # op_create_code += f"{ref.name}, " - - op_create_code += "out_ref});\n" - return op_create_code - - def gen_output_staging_valueref_decl(self, ref: ValueRefList) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.gen_output_staging_valueref_decl(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = "" - return ret_str - - assert ref.src_cpp_type == AT_TENSOR and ref.is_out - return f"ValueRef {ref.name}_staging;\n" - - def set_output(self, ref: ValueRefList, include_declarations: bool = True) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.set_output(r, include_declarations) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -for (int i=0; i str: - assert isinstance(ref, ValueRef) - assert ref.src_cpp_type in InableCppType and ref.is_in - if self.prepack_ref(ref): - return "" - - if ref.src_cpp_type == AT_TENSOR: - ret_str = f"{self.graph}{self.dot}virtual_resize({ref.name}.value, " - ret_str += f"{ref.src_cpp_name}.sizes().vec());\n" - elif ref.src_cpp_type == AT_TENSOR_LIST: - ret_str = "" - ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n" - ret_str += f" {self.graph}{self.dot}virtual_resize({ref.name}_io_value_refs[i].value, " - ret_str += f"{ref.src_cpp_name}[i].sizes().vec());\n" - ret_str += "}\n" - else: - raise AssertionError(f"{ref.src_cpp_type} not expected") - - return ret_str - - def copy_into_staging(self, ref: ValueRefList) -> str: - assert isinstance(ref, ValueRef) - assert ref.src_cpp_type in InableCppType and ref.is_in - - if self.prepack_ref(ref): - return "" - - if ref.src_cpp_type == AT_TENSOR: - ret_str = f"{self.graph}{self.dot}copy_into_staging(" - ret_str += f"{ref.name}.staging, " - ret_str += f"{ref.src_cpp_name}.const_data_ptr(), " - ret_str += f"{ref.src_cpp_name}.numel());\n" - elif ref.src_cpp_type == AT_TENSOR_LIST: - ret_str = "" - ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n" - ret_str += f" {self.graph}{self.dot}copy_into_staging(" - ret_str += f"{ref.name}_io_value_refs[i].staging, " - ret_str += f"{ref.src_cpp_name}[i].const_data_ptr(), " - ret_str += f"{ref.src_cpp_name}[i].numel());\n" - ret_str += "}\n" - else: - raise AssertionError(f"{ref.src_cpp_type} not expected") - return ret_str - - def declare_vk_out_for(self, ref: Union[ValueRef, List[ValueRef]]) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.declare_vk_out_for(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -std::vector {ref.vk_out}; -for (int i=0; i str: 
- if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.copy_from_staging(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -for (int i=0; i str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.check_graph_out(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -for (int i=0; i str: - ret_str = "" - for aten_arg in self.args: - ref = self.refs[aten_arg.name] - ret_str += self.create_value_decl_for(ref) - - ret_str += self.create_value_decl_for(self.refs["out"]) - ret_str += f"{self.out.cpp_type} out;\n" - ret_str += self.gen_output_staging_valueref_decl(self.refs["out"]) - return ret_str - - def gen_graph_build_code(self, include_declarations: bool = True) -> str: - graph_build = self.create_out_src(include_declarations) - for aten_arg in self.args: - graph_build += self.create_value_for( - self.refs[aten_arg.name], include_declarations - ) - - graph_build += self.create_value_for(self.refs["out"], include_declarations) - graph_build += self.create_op_call() - - if self.include_io: - graph_build += self.set_output(self.refs["out"], include_declarations) - - graph_build += f"{self.graph}{self.dot}prepare();\n" - graph_build += f"{self.graph}{self.dot}prepack();\n" - - graph_build += "\n" - return graph_build - - def gen_graph_exec_code(self, check_output=True) -> str: - graph_exec = "" - if self.include_io: - for aten_arg in self.args: - ref = self.refs[aten_arg.name] - if ref.is_in: - graph_exec += self.virtual_resize(ref) - graph_exec += self.copy_into_staging(ref) - - graph_exec += f"{self.graph}{self.dot}propagate_resize();\n" - - graph_exec += f"{self.graph}{self.dot}execute();\n" - - graph_exec += self.declare_vk_out_for(self.refs["out"]) - if self.include_io: - graph_exec += self.copy_from_staging(self.refs["out"]) - - if self.include_io and check_output: - graph_exec += self.check_graph_out(self.refs["out"]) - - graph_exec = re.sub(r"^", " ", graph_exec, flags=re.M) - graph_exec = "{\n" + graph_exec + "\n}" - - return graph_exec - - def gen_conditional_skips(self, skip_str: str = "GTEST_SKIP();") -> str: - fp16_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_float16_buffers_support()) {{\n" - fp16_skip += f" {skip_str}\n" - fp16_skip += "}" - fp16_skip = re.sub(r"^", " ", fp16_skip, flags=re.M) + "\n" - - int8_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_int8_buffers_support()) {{\n" - int8_skip += f" {skip_str};\n" - int8_skip += "}\n" - - skips = "" - - skips += "if (test_dtype == at::kHalf) {\n" - skips += fp16_skip - skips += "}\n" - - for _, dtype in self.suite_def.arg_dtype.items(): - if dtype == "at::kChar" or dtype == "at::kQInt8": - skips += int8_skip - continue - - skips += "\n" - return skips - - def gen_op_check_fn(self) -> str: - op_name = self.f.func.name.unambiguous_name() - if self.suite_def.test_name_suffix is not None: - op_name += "_" + self.suite_def.test_name_suffix - - op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" - if self.should_prepack: - op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" - - op_check_fn_body = "" - op_check_fn_body += self.gen_graph_build_code() - op_check_fn_body += self.gen_graph_exec_code() - - op_check_fn_body = re.sub(r"^", " ", op_check_fn_body, flags=re.M) - - op_check_fn += op_check_fn_body - op_check_fn += "\n }" - - return op_check_fn - - def gen_build_graph_fn(self, include_declarations: bool = False) 
-> str: - op_name = self.f.func.name.unambiguous_name() - if self.suite_def.test_name_suffix is not None: - op_name += "_" + self.suite_def.test_name_suffix - op_build_graph_fn = self.gen_decl(f"build_graph_{op_name}") + " {\n" - if self.should_prepack: - op_build_graph_fn = ( - self.gen_decl(f"prepacked_build_graph_{op_name}") + " {\n" - ) - - op_build_graph_fn_body = "" - op_build_graph_fn_body += self.gen_graph_build_code(include_declarations) - - op_build_graph_fn += op_build_graph_fn_body - op_build_graph_fn += "\n }" - return op_build_graph_fn - - def gen_op_exec_graph_fn(self) -> str: - op_name = self.f.func.name.unambiguous_name() - if self.suite_def.test_name_suffix is not None: - op_name += "_" + self.suite_def.test_name_suffix - op_benchmark_fn = self.gen_decl(f"benchmark_{op_name}") + " {\n" - if self.should_prepack: - op_benchmark_fn = self.gen_decl(f"prepacked_benchmark_{op_name}") + " {\n" - - op_benchmark_fn_body = "" - op_benchmark_fn_body += self.gen_graph_exec_code(False) - - op_benchmark_fn_body = re.sub(r"^", " ", op_benchmark_fn_body, flags=re.M) - - op_benchmark_fn += op_benchmark_fn_body - op_benchmark_fn += "\n }" - return op_benchmark_fn diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py deleted file mode 100644 index 80b4d5dead9..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py +++ /dev/null @@ -1,417 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import re -from typing import Any, List - -from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( - AT_INT_ARRAY_REF, - AT_SCALAR, - AT_TENSOR, - AT_TENSOR_LIST, - BOOL, - DOUBLE, - INT, - OLD_STRING, - OPT_AT_DOUBLE_ARRAY_REF, - OPT_AT_INT_ARRAY_REF, - OPT_AT_TENSOR, - OPT_BOOL, - OPT_DEVICE, - OPT_INT64, - OPT_LAYOUT, - OPT_MEMORY_FORMAT, - OPT_SCALAR_TYPE, - STRING, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite - -from torchgen.api import cpp -from torchgen.api.types import CppSignatureGroup -from torchgen.model import Argument, NativeFunction - -########################## -## Test Suite Generation ## -########################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::Test {{ -}}; -""" - -test_suite_template = """ -TEST_P(GeneratedOpsTest_{op_name}, {case_name}) {{ -{create_ref_data} -try {{ -{create_and_check_out} -}} -catch (const vkcompute::vkapi::ShaderNotSupportedError& e) {{ - GTEST_SKIP() << e.what(); -}} -}} -""" - - -def init_list_str(pylist: Any) -> str: - if pylist == "[]": - return "{" + "}" - - if not isinstance(pylist, (list, tuple)): - pylist = [pylist] - - list_str = "{" - for s in pylist: - if isinstance(s, (list, tuple)): - list_str += f"{init_list_str(s)}, " - else: - list_str += f"{s}, " - list_str = list_str[:-2] + "}" - return list_str - - -def get_or_return_default(arg: Argument, inputs: List[Any], i: int): - if i < len(inputs): - return inputs[i] - else: - assert arg.default is not None - return arg.default - - -class CorrectnessTestGen: - def __init__(self, f: NativeFunction, test_suite: TestSuite): - self.f = f - self.suite_def = test_suite - self.op_name = f.func.name.unambiguous_name() - if test_suite.test_name_suffix is not None: - self.op_name += f"_{test_suite.test_name_suffix}" - - self.f_sig = 
CppSignatureGroup.from_native_function( - self.f, method=False, fallback_binding=self.f.manual_cpp_binding - ).most_faithful_signature() - - def gen_case_name_tuple(self, t) -> str: - return "x".join( - [ - ( - str(e) - if not isinstance(e, (list, tuple)) - else self.gen_case_name_tuple(e) - ) - for e in t - ] - ) - - def gen_case_name(self, inputs: List[Any], prepack: bool = False) -> str: - name_str = self.op_name - if prepack: - name_str += "_prepack" - for arg_sizes_or_val in inputs: - name_str += "_" - if isinstance(arg_sizes_or_val, tuple): - name_str += self.gen_case_name_tuple(arg_sizes_or_val) - elif isinstance(arg_sizes_or_val, list): - lst = [] - for size in arg_sizes_or_val: - if isinstance(size, (list, tuple)): - lst.append(self.gen_case_name_tuple(size)) - else: - lst.append(str(size)) - name_str += "c".join(lst) - else: - name_str += str(arg_sizes_or_val).replace(".", "p") - - # minus sign is a invalid char for test case. change to "n". - name_str = name_str.replace("-", "n") - return name_str - - def call_data_gen_fn(self, arg: Argument, data: Any, terminate: bool = True) -> str: - tensor_dtype = ( - "test_dtype" - if arg.name not in self.suite_def.arg_dtype - else self.suite_def.arg_dtype[arg.name] - ) - - data_range = ( - self.suite_def.data_range - if arg.name not in self.suite_def.arg_data_range - else self.suite_def.arg_data_range[arg.name] - ) - - data_gen_fn = ( - self.suite_def.data_gen - if arg.name not in self.suite_def.arg_data_gen_fn - else self.suite_def.arg_data_gen_fn[arg.name] - ) - - ret_str = f"{data_gen_fn}({init_list_str(data)}, {tensor_dtype}, {data_range[0]}, {data_range[1]})" - if terminate: - ret_str += ";" - - return ret_str - - def create_input_data(self, arg: Argument, data: Any) -> str: # noqa: C901 - ctype = cpp.argumenttype_type(arg.type, mutable=arg.is_write, binds=arg.name) - cpp_type = ctype.cpp_type(strip_ref=True) - - # Short cut exit for TENSORLIST, because it needs multiple lines of - # construction, deviates from the rest. 
- if cpp_type == AT_TENSOR_LIST: - ret_str = f"std::vector<{AT_TENSOR}> tensor_vec;\n" - for elem in data: - ret_str += f"tensor_vec.emplace_back({self.call_data_gen_fn(arg, elem, False)});\n" - ret_str += f"{cpp_type} {arg.name} = tensor_vec;\n" - return ret_str + "\n" - - if cpp_type == AT_INT_ARRAY_REF: - ret_str = f"std::vector {arg.name} = " - elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF and str(data) != "None": - ret_str = f"std::vector {arg.name} = " - elif cpp_type == OPT_AT_INT_ARRAY_REF and str(data) != "None": - ret_str = f"std::vector {arg.name} = " - else: - ret_str = f"{cpp_type} {arg.name} = " - - if cpp_type == AT_TENSOR: - if arg.name == "index" or arg.name == "indices": - args_str = init_list_str(data) - if args_str[:3] == "{{{": - ret_str += f"make_index_tensor_3d({init_list_str(data)});" - elif args_str[:2] == "{{": - ret_str += f"make_index_tensor_2d({init_list_str(data)});" - else: - ret_str += f"make_index_tensor_1d({init_list_str(data)});" - else: - ret_str += self.call_data_gen_fn(arg, data) - elif cpp_type == OPT_AT_TENSOR: - if str(data) == "None": - ret_str += "std::nullopt;" - else: - ret_str += self.call_data_gen_fn(arg, data) - elif cpp_type == AT_SCALAR: - ret_str += f"{data};" - elif cpp_type == AT_INT_ARRAY_REF: - ret_str += f"{init_list_str(data)};" - elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF or cpp_type == OPT_AT_INT_ARRAY_REF: - if str(data) == "None": - ret_str += "std::nullopt;" - else: - ret_str += f"{init_list_str(data)};" - elif cpp_type == BOOL: - ret_str += f"{str(data).lower()};" - elif cpp_type == INT: - ret_str += f"{str(data).lower()};" - elif cpp_type == DOUBLE: - ret_str += f"{str(data).lower()};" - elif cpp_type == OPT_INT64: - if str(data) == "None": - ret_str += "std::nullopt;" - else: - ret_str += f"{str(data)};" - elif cpp_type == STRING or cpp_type == OLD_STRING: - ret_str += f'std::string_view("{data}");' - elif ( - cpp_type == OPT_SCALAR_TYPE - or cpp_type == OPT_LAYOUT - or cpp_type == OPT_DEVICE - or cpp_type == OPT_BOOL - or cpp_type == OPT_MEMORY_FORMAT - ): - ret_str += "std::nullopt;" - else: - raise RuntimeError(f"Unsupported cpp type {cpp_type}") - return ret_str + "\n" - - def gen_create_ref_data(self, inputs: List[Any]) -> str: - ref_code = "" - - for i, binding in enumerate(self.f_sig.arguments()): - arg = binding.argument - arg_data = get_or_return_default(arg, inputs, i) - ref_code += self.create_input_data(arg, arg_data) - - ref_code = re.sub(r"^", " ", ref_code, flags=re.M) - return ref_code - - def gen_create_and_check_out(self, prepack=False) -> str: - test_str = f"check_{self.op_name}(" - if prepack: - test_str = f"prepacked_check_{self.op_name}(" - for binding in self.f_sig.arguments(): - arg = binding.argument - test_str += f"{arg.name}, " - test_str = test_str[:-2] + ");" - test_str = re.sub(r"^", " ", test_str, flags=re.M) - return test_str - - def gen_parameterization(self) -> str: - return "" - - def generate_fixture_cpp(self) -> str: - return test_fixture_template.format(op_name=self.f.func.name) - - def generate_case_cpp(self, inputs, prepack=False) -> str: - return test_suite_template.format( - op_name=f"{self.op_name}", - case_name=self.gen_case_name(inputs, prepack), - create_ref_data=self.gen_create_ref_data(inputs), - create_and_check_out=self.gen_create_and_check_out(prepack), - ) - - def generate_suite_cpp(self) -> str: - suite_cpp = self.generate_fixture_cpp() - for inputs in self.suite_def.input_cases: - if not self.suite_def.requires_prepack: - suite_cpp += self.generate_case_cpp(inputs) - if 
self.suite_def.supports_prepack(): - suite_cpp += self.generate_case_cpp(inputs, prepack=True) - - suite_cpp += self.gen_parameterization() - return suite_cpp - - -########################## -## Test File Generation ## -########################## - -cpp_test_template = """ -#include - -#include - -{preamble} - -at::Tensor make_casted_randint_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - int low = 0, - int high = 10) {{ - - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_rand_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - - if (dtype == at::kChar) - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); - - if (dtype == at::kBool) - return at::rand(sizes, at::device(at::kCPU)) > 0.5; - - if (high == 1.0 && low == 0.0) - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); - - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; -}} - -at::Tensor make_zeros_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - return at::zeros(sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_ones_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - return at::ones(sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_seq_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - (void)low; - (void)high; - - int64_t n = 1; - for (auto size: sizes) {{ - n *= size; - }} - - std::vector values(n); - for (int i=0;i indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{static_cast(indices.size())}}; - - // Clone as original data will be deallocated upon return. - return at::from_blob(indices.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_2d(std::vector> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& vec: indices) {{ - acc.insert(acc.end(), vec.begin(), vec.end()); - }} - - // Clone as original data will be deallocated upon return. - return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_3d(std::vector>> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size()), - static_cast(indices[0][0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& v: indices) {{ - for (auto& vv: v) {{ - acc.insert(acc.end(), vv.begin(), vv.end()); - }} - }} - - // Clone as original data will be deallocated upon return. 
- return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -{test_suites_cpp} -""" - - -class CorrectnessTestFileGen: - def __init__(self, out_path): - self.out_path = out_path - self.suites_gens = [] - - def generate_cpp(self) -> str: - return cpp_test_template.format( - preamble=self.generate_preamble(), - test_suites_cpp=self.generate_test_suites_cpp(), - ) - - def generate_preamble(self) -> str: - return "" - - def generate_test_suites_cpp(self) -> str: - return "\n".join([h.generate_suite_cpp() for h in self.suites_gens]) - - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = CorrectnessTestGen(f, all_input_cases) - self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py deleted file mode 100644 index c368c23c539..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) -from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( - CorrectnessTestFileGen, - CorrectnessTestGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite - -from torchgen.model import NativeFunction - -################################## -## Test Fixture Code Generation ## -################################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - void SetUp() override {{ - GraphConfig config; - utils::StorageType default_storage_type; - utils::GPUMemoryLayout default_memory_layout; - std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); - config.set_storage_type_override(default_storage_type); - config.set_memory_layout_override(default_memory_layout); - graph = new ComputeGraph(config); - - if (test_dtype == at::kHalf) {{ - rtol = 1e-2; - atol = 1e-2; - }} - }} - - void TearDown() override {{ - delete graph; - graph = nullptr; - }} - - {check_fn} -}}; -""" - - -class VkCorrectnessTestGen(CorrectnessTestGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) - - def generate_fixture_cpp(self) -> str: - check_fn = "" - if not self.suite_def.requires_prepack: - check_fn = self.generator.gen_op_check_fn() - - prepacked_check_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_check_fn = self.generator.gen_op_check_fn() - check_fn += "\n\n " - check_fn += prepacked_check_fn - - return test_fixture_template.format( - op_name=self.op_name, - check_fn=check_fn, - rtol=self.suite_def.rtol, - atol=self.suite_def.atol, - ) - - def gen_parameterization(self) -> str: - dtypes = self.suite_def.dtypes - storage_types = self.suite_def.storage_types - layouts = self.suite_def.layouts - - return f""" -INSTANTIATE_TEST_SUITE_P( - Combos_{self.op_name}, - GeneratedOpsTest_{self.op_name}, - 
::testing::Combine( - ::testing::Values({', '.join(dtypes)}), - ::testing::Values({', '.join(storage_types)}), - ::testing::Values({', '.join(layouts)}))); - """ - - -############################## -## Test File Code Generation ## -############################### - -preamble_str = """ -#include -#include -#include - -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - switch (at_scalartype) { - case c10::kDouble: - return vkapi::kDouble; - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - case c10::kBool: - return vkapi::kBool; - default: - VK_THROW("Unsupported at::ScalarType!"); - } -} - -#ifdef USE_VULKAN_FP16_INFERENCE -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { -#else -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { -#endif - // Skip checking index tensors - if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { - return true; - } - bool is_close = at::allclose(t1, t2, rtol, atol); - if (!is_close && t1.numel() < 500) { - std::cout << "reference: " << std::endl; - print(t1, 150); - std::cout << std::endl; - std::cout << "vulkan: " << std::endl; - print(t2, 150); - std::cout << std::endl; - } - return is_close; -} -""" - - -class VkCorrectnessTestFileGen(CorrectnessTestFileGen): - def __init__(self, out_path: str): - super().__init__(out_path) - - def generate_preamble(self) -> str: - return preamble_str - - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkCorrectnessTestGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/test_suite.py b/backends/vulkan/test/op_tests/utils/test_suite.py deleted file mode 100644 index 427864b0d5d..00000000000 --- a/backends/vulkan/test/op_tests/utils/test_suite.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass -from typing import Any, Dict, List, Optional - -################################### -## Generic Test Suite definition ## -################################### - - -class TestSuite: - def __init__(self, input_cases: List[Any]): - self.input_cases: List[Any] = input_cases - self.prepacked_args: List[str] = [] - self.requires_prepack: bool = False - self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] - - self.data_gen: str = "make_rand_tensor" - self.data_range = (0, 1) - - self.arg_dtype = {} - self.arg_data_gen_fn: Dict[str, str] = {} - self.arg_data_range = {} - - self.atol: str = "1e-5" - self.rtol: str = "1e-5" - - self.is_view_op: bool = False - self.test_name_suffix: Optional[str] = None - - def supports_prepack(self): - return len(self.prepacked_args) > 0 - - -################################## -## Vulkan Test Suite Definition ## -################################## - - -@dataclass -class VkTestSuite(TestSuite): - def __init__(self, input_cases: List[Any]): - super().__init__(input_cases) - self.storage_types: List[str] = ["utils::kTexture3D"] - self.layouts: List[str] = ["utils::kChannelsPacked"] - self.data_gen: str = "make_rand_tensor" - self.force_io: bool = True - self.arg_storage_types: Dict[str, str] = {} - self.arg_memory_layouts: Dict[str, str] = {} diff --git a/backends/vulkan/test/scripts/test_model.sh b/backends/vulkan/test/scripts/test_model.sh deleted file mode 100755 index 5f06d2c039b..00000000000 --- a/backends/vulkan/test/scripts/test_model.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -exu - -# Initialize variables -RUN_BUILD=false -RUN_CORRECTNESS_TEST=false -RUN_CLEAN=false -RUN_RECOMPILE=false -MODEL_NAME="" -OUTPUT_DIRECTORY="." - -# Parse arguments -SKIP_NEXT=false -for i in $(seq 1 $#); do - if [[ "$SKIP_NEXT" == true ]]; then - SKIP_NEXT=false - continue - fi - - arg="${!i}" - case $arg in - --build|-b) - RUN_BUILD=true - ;; - --clean|-c) - RUN_CLEAN=true - ;; - --recompile|-rc) - RUN_RECOMPILE=true - ;; - --output_directory|-o) - next_i=$((i + 1)) - if [[ $next_i -le $# ]]; then - OUTPUT_DIRECTORY="${!next_i}" - SKIP_NEXT=true - else - echo "Error: --output_directory|-o requires a value" - exit 1 - fi - ;; - --*|-*) - echo "Unknown argument: $arg" - exit 1 - ;; - *) - if [[ -z "$MODEL_NAME" ]]; then - MODEL_NAME="$arg" - else - echo "Multiple model names provided: $MODEL_NAME and $arg" - exit 1 - fi - ;; - esac -done - -# Determine execution mode based on parsed arguments -if [[ "$RUN_BUILD" == true ]] && [[ -z "$MODEL_NAME" ]]; then - # Build-only mode - RUN_CORRECTNESS_TEST=false -elif [[ "$RUN_BUILD" == true ]] && [[ -n "$MODEL_NAME" ]]; then - # Build and test mode - RUN_CORRECTNESS_TEST=true -elif [[ "$RUN_BUILD" == false ]] && [[ -n "$MODEL_NAME" ]]; then - # Test-only mode - RUN_CORRECTNESS_TEST=true -else - echo "Invalid argument combination. 
Usage:" - echo " $0 --build|-b [--clean|-c] [--recompile|-rc] [-o|--output_directory DIR] # Build-only mode" - echo " $0 model_name [--build|-b] [--clean|-c] [--recompile|-rc] [-o|--output_directory DIR] # Test mode or build+test mode" - exit 1 -fi - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" - -CMAKE_OUTPUT_DIR=cmake-out - -# Only set EXPORTED_MODEL if running correctness test -if [[ "${RUN_CORRECTNESS_TEST}" == true ]]; then - EXPORTED_MODEL=${MODEL_NAME}_vulkan -fi - - -clean_build_directory() { - echo "Cleaning build directory: ${CMAKE_OUTPUT_DIR}" - rm -rf ${CMAKE_OUTPUT_DIR} -} - -recompile() { - cmake --build cmake-out -j64 --target install -} - -build_core_libraries_and_devtools() { - echo "Building core libraries and devtools with comprehensive Vulkan support..." - - # Build core libraries with all required components - cmake . \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -Bcmake-out && \ - cmake --build cmake-out -j64 --target install - - # Build devtools example runner - cmake examples/devtools \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -Bcmake-out/examples/devtools && \ - cmake --build cmake-out/examples/devtools -j16 --config Release -} - -run_example_runner() { - ./${CMAKE_OUTPUT_DIR}/examples/devtools/example_runner -bundled_program_path "${OUTPUT_DIRECTORY}/${EXPORTED_MODEL}.bpte" -output_verification -} - -test_bundled_model_with_vulkan() { - # Export model as bundled program with Vulkan backend - "${PYTHON_EXECUTABLE}" -m examples.vulkan.export --model_name="${MODEL_NAME}" --output_dir="${OUTPUT_DIRECTORY}" --bundled - - # Update exported model name for bundled program - EXPORTED_MODEL="${MODEL_NAME}_vulkan" - - # Verify the exported bundled model exists - if [[ ! -f "${OUTPUT_DIRECTORY}/${EXPORTED_MODEL}.bpte" ]]; then - echo "Error: Failed to export bundled model ${MODEL_NAME} with Vulkan backend" - exit 1 - fi - - # Note: Running bundled programs may require different executor runner - echo "Bundled program created successfully. Use appropriate bundled program runner to test." - - run_example_runner -} - - -# Main execution -if [[ "${RUN_BUILD}" == true ]]; then - if [[ "${RUN_CLEAN}" == true ]]; then - clean_build_directory - fi - build_core_libraries_and_devtools -fi - -if [[ "${RUN_RECOMPILE}" == true ]]; then - recompile -fi - -if [[ "${RUN_CORRECTNESS_TEST}" == true ]]; then - echo "Testing ${MODEL_NAME} with Vulkan backend..." - # Always use bundled program testing - test_bundled_model_with_vulkan - - # Check if test completed successfully - if [[ $? -eq 0 ]]; then - echo "Vulkan model test completed successfully!" - else - echo "Vulkan model test failed!" 
- exit 1 - fi -fi diff --git a/backends/vulkan/test/scripts/test_op.sh b/backends/vulkan/test/scripts/test_op.sh deleted file mode 100755 index 36920cb73cc..00000000000 --- a/backends/vulkan/test/scripts/test_op.sh +++ /dev/null @@ -1,258 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -exu - -# Initialize variables -RUN_BUILD=false -RUN_CLEAN=false -RUN_CLEAN_TESTS=false -RUN_RECOMPILE=false -RUN_TESTS=false -TEST_BINARY="" -ATEN_OP="" - -# Parse arguments -SKIP_NEXT=false -if [[ $# -eq 0 ]]; then - # No arguments provided - run default test - TEST_BINARY="vulkan_op_correctness_tests" - RUN_TESTS=true -else - for i in $(seq 1 $#); do - if [[ "$SKIP_NEXT" == true ]]; then - SKIP_NEXT=false - continue - fi - - arg="${!i}" - case $arg in - --build|-b) - RUN_BUILD=true - ;; - --clean|-c) - RUN_CLEAN=true - RUN_BUILD=true - ;; - --clean_tests|-ct) - RUN_CLEAN_TESTS=true - ;; - --recompile|-rc) - RUN_RECOMPILE=true - ;; - --test|-t) - RUN_TESTS=true - ;; - --aten) - next_i=$((i + 1)) - if [[ $next_i -le $# ]]; then - ATEN_OP="${!next_i}" - TEST_BINARY="vulkan_op_correctness_tests" - RUN_TESTS=true - SKIP_NEXT=true - else - echo "Error: --aten requires an operator name" - exit 1 - fi - ;; - --*|-*) - echo "Unknown argument: $arg" - exit 1 - ;; - *) - if [[ -z "$TEST_BINARY" ]]; then - TEST_BINARY="$arg" - RUN_TESTS=true - else - echo "Multiple test binaries provided: $TEST_BINARY and $arg" - exit 1 - fi - ;; - esac - done -fi - -# Determine execution mode based on parsed arguments -if [[ "$RUN_BUILD" == true ]] && [[ -z "$TEST_BINARY" ]] && [[ "$RUN_TESTS" == false ]]; then - # Build-only mode - echo "Build-only mode" -elif [[ "$RUN_BUILD" == true ]] && [[ -n "$TEST_BINARY" ]]; then - # Build and test mode - echo "Build and test mode for: $TEST_BINARY" -elif [[ "$RUN_BUILD" == false ]] && [[ -n "$TEST_BINARY" ]]; then - # Test-only mode - echo "Test-only mode for: $TEST_BINARY" -elif [[ "$RUN_TESTS" == true ]] && [[ -z "$TEST_BINARY" ]]; then - # Run all available tests - echo "Running all available operator tests" -elif [[ $# -eq 0 ]]; then - # No arguments provided - run default test - TEST_BINARY="vulkan_op_correctness_tests" - RUN_TESTS=true - echo "No arguments provided, running default test: $TEST_BINARY" -else - echo "Invalid argument combination. 
Usage:" - echo " $0 # Run default vulkan_op_correctness_tests" - echo " $0 --build|-b [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Build-only mode" - echo " $0 [test_binary_name] [--build|-b] [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Test mode or build+test mode" - echo " $0 --test|-t [--build|-b] [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Run all tests mode" - echo " $0 --aten [--build|-b] [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Run specific ATen operator test" - echo " $0 --clean_tests|-ct # Clean and rebuild only operator tests" - echo "" - echo "Available test binaries:" - echo " - vulkan_op_correctness_tests" - echo " - vulkan_op_benchmarks" - echo " - compute_graph_op_tests" - echo " - sdpa_test" - exit 1 -fi - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" - -CMAKE_OUTPUT_DIR=cmake-out - -clean_build_directory() { - echo "Cleaning build directory: ${CMAKE_OUTPUT_DIR}" - rm -rf ${CMAKE_OUTPUT_DIR} -} - -clean_test_directory() { - echo "Cleaning test build directory: ${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests" - rm -rf ${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests -} - -build_core_libraries() { - cmake . \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_TESTS=ON \ - -Bcmake-out && \ - cmake --build cmake-out -j64 --target install -} - -build_operator_tests() { - echo "Building Vulkan operator tests..." - - # Check if TORCH_OPS_YAML_PATH is set, if not use default - if [[ -z "${TORCH_OPS_YAML_PATH:-}" ]]; then - TORCH_OPS_YAML_PATH="$HOME/Github/pytorch/aten/src/ATen/native" - echo "Using default TORCH_OPS_YAML_PATH: $TORCH_OPS_YAML_PATH" - fi - - # Verify that TORCH_OPS_YAML_PATH exists - if [[ ! -d "$TORCH_OPS_YAML_PATH" ]]; then - echo "Error: TORCH_OPS_YAML_PATH directory does not exist: $TORCH_OPS_YAML_PATH" - echo "Please set TORCH_OPS_YAML_PATH to a valid PyTorch native operations directory" - echo "Example: export TORCH_OPS_YAML_PATH=/path/to/pytorch/aten/src/ATen/native" - exit 1 - fi - - # Verify required YAML files exist - if [[ ! -f "$TORCH_OPS_YAML_PATH/native_functions.yaml" ]]; then - echo "Error: Required file not found: $TORCH_OPS_YAML_PATH/native_functions.yaml" - exit 1 - fi - - if [[ ! -f "$TORCH_OPS_YAML_PATH/tags.yaml" ]]; then - echo "Error: Required file not found: $TORCH_OPS_YAML_PATH/tags.yaml" - exit 1 - fi - - echo "Using TORCH_OPS_YAML_PATH: $TORCH_OPS_YAML_PATH" - - # Build operator tests - cmake backends/vulkan/test/op_tests \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -DTORCH_OPS_YAML_PATH="$TORCH_OPS_YAML_PATH" \ - -DCMAKE_CXX_STANDARD=17 \ - -Bcmake-out/backends/vulkan/test/op_tests && \ - cmake --build cmake-out/backends/vulkan/test/op_tests -j16 -} - -recompile() { - echo "Recompiling..." 
- cmake --build cmake-out -j64 --target install - cmake --build cmake-out/backends/vulkan/test/op_tests -j16 -} - -run_operator_test() { - local test_name="$1" - local test_binary_path="" - - case "$test_name" in - "aten") - test_binary_path="${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/vulkan_op_correctness_tests" - ;; - *) - # Try to find the binary directly - test_binary_path="${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/${test_name}" - ;; - esac - - if [[ -f "$test_binary_path" ]]; then - echo "Running test binary: $test_binary_path" - - # Add gtest filter if ATEN_OP is specified - if [[ -n "$ATEN_OP" ]]; then - echo "Filtering tests for ATen operator: $ATEN_OP" - "$test_binary_path" --gtest_filter="*${ATEN_OP}*" - else - "$test_binary_path" - fi - else - echo "Error: Test binary not found at $test_binary_path" - echo "Available binaries in ${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/:" - ls -la "${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/" 2>/dev/null || echo "Directory not found" - exit 1 - fi -} - -# Main execution -if [[ "${RUN_CLEAN_TESTS}" == true ]]; then - clean_test_directory - build_operator_tests -fi - -if [[ "${RUN_BUILD}" == true ]]; then - if [[ "${RUN_CLEAN}" == true ]]; then - clean_build_directory - fi - build_core_libraries - build_operator_tests -fi - -if [[ "${RUN_RECOMPILE}" == true ]]; then - recompile -fi - -if [[ "${RUN_TESTS}" == true ]]; then - run_operator_test "$TEST_BINARY" - - # Check if tests completed successfully - if [[ $? -eq 0 ]]; then - echo "Vulkan operator tests completed successfully!" - else - echo "Some Vulkan operator tests failed!" - exit 1 - fi -fi diff --git a/backends/vulkan/test/test_serialization.py b/backends/vulkan/test/test_serialization.py deleted file mode 100644 index c373f5216d2..00000000000 --- a/backends/vulkan/test/test_serialization.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# pyre-strict -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import ctypes -import random -import unittest -from typing import List - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - IntList, - OperatorCall, - String, - VkGraph, - VkValue, -) - -from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( - convert_to_flatbuffer, - flatbuffer_to_vk_graph, - serialize_vulkan_graph, - VulkanDelegateHeader, -) - - -class TestSerialization(unittest.TestCase): - def _generate_random_const_tensors(self, num_tensors: int) -> List[torch.Tensor]: - """ - Helper function to generate `num_tensor` buffers of random sizes and random contents, - we return a tuple of (list_of_buffers, list_of_mem_sizes), - """ - tensors = [] - for _ in range(num_tensors): - width = random.randint(4, 100) - height = random.randint(4, 100) - channels = random.randint(2, 8) - - tensor = torch.randn(channels, width, height) - tensors.append(tensor) - - return tensors - - def test_serialize_vulkan_binary(self) -> None: - vk_graph = VkGraph( - version="0", - chain=[], - values=[], - input_ids=[], - output_ids=[], - constants=[], - shaders=[], - ) - const_tensors = self._generate_random_const_tensors(5) - - serialized_binary = serialize_vulkan_graph(vk_graph, const_tensors, []) - - # Check header - self.assertEqual(serialized_binary[0:4], b"\x00\x00\x00\x00") - self.assertEqual(serialized_binary[VulkanDelegateHeader.MAGIC_IX], b"VH00") - flatbuffer_offset = int.from_bytes( - serialized_binary[VulkanDelegateHeader.FLATBUFFER_OFFSET_IX], - byteorder="little", - ) - constants_offset = int.from_bytes( - serialized_binary[VulkanDelegateHeader.BYTES_OFFSET_IX], - byteorder="little", - ) - constants_size = int.from_bytes( - serialized_binary[VulkanDelegateHeader.BYTES_SIZE_IX], - byteorder="little", - ) - - # Flatbuffer magic should be in the same spot as the Header's magic - self.assertEqual( - serialized_binary[flatbuffer_offset:][VulkanDelegateHeader.MAGIC_IX], - b"VK00", - ) - - constant_data_payload = serialized_binary[ - constants_offset : constants_offset + constants_size - ] - - # We check that constant data indexes stored in the vk_graph correctly index - # into the correct buffer in the constant data section - self.assertEqual(len(vk_graph.constants), len(const_tensors)) - for bytes_range, tensor in zip(vk_graph.constants, const_tensors): - offset = bytes_range.offset - length = bytes_range.length - - constant_data_bytes = constant_data_payload[offset : offset + length] - - array_type = ctypes.c_char * tensor.untyped_storage().nbytes() - array = ctypes.cast( - tensor.untyped_storage().data_ptr(), - ctypes.POINTER(array_type), - ).contents - - tensor_bytes = bytes(array) - self.assertEqual(constant_data_bytes, tensor_bytes) - - def test_serialize_deserialize_vkgraph(self) -> None: - in_vk_graph = VkGraph( - version="1", - chain=[ - OperatorCall(node_id=1, name="foo", args=[1, 2, 3]), - OperatorCall(node_id=2, name="bar", args=[]), - ], - values=[ - VkValue( - value=String( - string_val="abc", - ), - ), - VkValue( - value=IntList( - items=[-1, -4, 2], - ), - ), - ], - input_ids=[], - output_ids=[], - constants=[], - shaders=[], - ) - - bs = convert_to_flatbuffer(in_vk_graph) - out_vk_graph = flatbuffer_to_vk_graph(bs) - - self.assertEqual(in_vk_graph, out_vk_graph) diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py deleted file mode 100644 index 00a357b0b67..00000000000 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ /dev/null @@ -1,2652 +0,0 @@ -# Copyright (c) 
Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import ctypes -import unittest -from typing import Tuple - -import executorch.backends.vulkan.test.utils as test_utils - -import torch - -from executorch.backends.transforms.convert_dtype_pass import I64toI32 - -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner - -from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend - -from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( - get_symmetric_quantization_config, - XNNPACKQuantizer, -) - -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExecutorchProgramManager, - to_edge_transform_and_lower, -) -from executorch.extension.pybindings.portable_lib import ( # @manual - _load_for_executorch_from_buffer, -) -from executorch.extension.pytree import tree_flatten -from torch.export import Dim, export, ExportedProgram - -from torchao.quantization.granularity import PerGroup - -from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e - -from torchao.quantization.pt2e.quantizer import Quantizer -from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_ -from torchao.utils import unwrap_tensor_subclass - -try: - ctypes.CDLL("libvulkan.so.1") -except: - pass - - -def lower_module( - model: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], dynamic_shapes=None -) -> EdgeProgramManager: - compile_options = {} - if dynamic_shapes is not None: - compile_options["require_dynamic_shapes"] = True - - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. - ) - - program: ExportedProgram = export( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ) - - edge_program = to_edge_transform_and_lower( - program, - compile_config=edge_compile_config, - transform_passes=[ - I64toI32(edge_compile_config._skip_dim_order), - ], - partitioner=[VulkanPartitioner(compile_options)], - ) - - return edge_program - - -def quantize_and_lower_module( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - quantizer: Quantizer, - dynamic_shapes=None, -) -> EdgeProgramManager: - compile_options = {} - if dynamic_shapes is not None: - compile_options["require_dynamic_shapes"] = True - - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. - ) - - program = export( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() - - program = prepare_pt2e(program, quantizer) - # Calibrate - program(*sample_inputs) - - program = convert_pt2e(program) - - program = export(program, sample_inputs, dynamic_shapes=dynamic_shapes) - - edge_program = to_edge_transform_and_lower( - program, - compile_config=edge_compile_config, - transform_passes=[ - I64toI32(edge_compile_config._skip_dim_order), - ], - partitioner=[VulkanPartitioner(compile_options)], - ) - - return edge_program - - -class TestVulkanBackend(unittest.TestCase): - def assert_outputs_equal( - self, - model_output, - ref_output, - atol=1e-03, - rtol=1e-03, - first_output_only=False, - equal_nan=True, - ): - """ - Helper testing function that asserts that the model output and the reference output - are equal with some tolerance. 
Due to numerical differences between eager mode and - the Vulkan backend, we relax the tolerances such that the default absolute - tolerance is 1e-3 and the default relative tolerance is 1e-3. - """ - - # Compare the result from the executor and eager mode directly - if isinstance(ref_output, tuple) or isinstance(ref_output, list): - # Multiple outputs: the executor always returns a tuple, even if there is only one output - self.assertTrue(len(ref_output) == len(model_output)) - if first_output_only: - result = torch.allclose( - model_output[0], - ref_output[0], - atol=atol, - rtol=rtol, - equal_nan=equal_nan, - ) - if not result: - test_utils.print_tensor_comparison_errors( - model_output[0], ref_output[0], atol, rtol - ) - self.assertTrue(result) - else: - for i in range(len(ref_output)): - result = torch.allclose( - model_output[i], - ref_output[i], - atol=atol, - rtol=rtol, - equal_nan=equal_nan, - ) - if not result: - print(f"\n=== Output {i} comparison failed ===") - test_utils.print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - self.assertTrue(result) - else: - # If there is one output, eager mode returns a tensor while the executor returns a tuple of size 1 - result = torch.allclose( - model_output[0], - ref_output, - atol=atol, - rtol=rtol, - equal_nan=equal_nan, - ) - if not result: - test_utils.print_tensor_comparison_errors( - model_output[0], ref_output, atol, rtol - ) - self.assertTrue(result) - - def check_no_delegation(self, et_program: ExecutorchProgramManager): - self.assertEqual( - len(et_program.executorch_program.execution_plan[0].delegates), - 0, - ) - return - - def check_vk_delegation(self, et_program: ExecutorchProgramManager): - self.assertEqual( - et_program.executorch_program.execution_plan[0].delegates[0].id, - VulkanBackend.__name__, - ) - - def run_delegated_model_and_check_output( - self, - et_program: ExecutorchProgramManager, - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - test_inputs=None, - first_output_only=False, - ): - executorch_module = _load_for_executorch_from_buffer(et_program.buffer) - inputs_flattened, _ = tree_flatten(sample_inputs) - - model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) - ref_output = model(*sample_inputs) - - self.assert_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - if test_inputs is not None: - for test_input in test_inputs: - test_inputs_flattened, _ = tree_flatten(test_input) - model_output = executorch_module.run_method( - "forward", tuple(test_inputs_flattened) - ) - ref_output = model(*test_input) - - self.assert_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - def lower_module_and_test_output( - self, - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - dynamic_shapes=None, - test_inputs=None, - first_output_only=False, - expect_no_delegates=False, - ): - """ - Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with - the given sample inputs. It then runs the lowered module and compares its - outputs with the outputs of the eager module.
- """ - - # Validate that the model can execute in eager mode - model.eval() - model(*sample_inputs) - - edge_program = lower_module(model, sample_inputs, dynamic_shapes=dynamic_shapes) - - et_program = edge_program.to_executorch() - - if expect_no_delegates: - self.check_no_delegation(et_program) - return - - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - model, - sample_inputs, - atol, - rtol, - test_inputs=test_inputs, - first_output_only=first_output_only, - ) - - def test_vulkan_backend_add(self): - # This test is the simplest test by manually lowering some submodules, we can use paritioner - # for auto detecting lowerable parts. - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y, w): - z = x + y - z = z + x - z = z + x - z = z + w - z = w + z - z = z + 3 # test scalar broadcasting - return z - - add_module = AddModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 1), dtype=torch.float32), # test broadcasting - ) - - self.lower_module_and_test_output(add_module, sample_inputs) - - sample_inputs = ( - torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), - torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), - torch.rand( - size=(2, 3), dtype=torch.float32 - ), # test broadcasting on packed dim - ) - - self.lower_module_and_test_output(add_module, sample_inputs) - - def test_vulkan_backend_add_int(self): - class AddIntModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x + y - return z - - add_int_module = AddIntModule() - sample_inputs = ( - torch.randint(low=-100, high=100, size=(2, 3), dtype=torch.int32), - torch.randint(low=-100, high=100, size=(2, 3), dtype=torch.int32), - ) - - self.lower_module_and_test_output(add_int_module, sample_inputs) - - def test_vulkan_backend_zero_dim_tensor(self): - class ZeroDimModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.zero = torch.full([], 1.3, dtype=torch.float32) - - def forward(self, x): - return x + self.zero - - internal_data_module = ZeroDimModule() - sample_inputs = (torch.rand(size=(2, 3), dtype=torch.float32),) - self.lower_module_and_test_output(internal_data_module, sample_inputs) - - def test_vulkan_backend_internal_data(self): - class InternalDataModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.rand(size=(2, 3), dtype=torch.float32) - - def forward(self, x, y): - inter1 = torch.add(x, y, alpha=2) - inter2 = torch.add(x, y, alpha=3.14) - inter3 = inter1 * self.weight - inter4 = inter2 * self.weight - return inter4 - inter3 - - internal_data_module = InternalDataModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(internal_data_module, sample_inputs) - - def test_vulkan_backend_sub(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.sub(x, y, alpha=2) - z = torch.sub(z, x, alpha=3.14) - z = z - x - return z - - sub_module = SubModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(sub_module, sample_inputs) - - def test_vulkan_backend_mul(self): - class MulModule(torch.nn.Module): - def __init__(self): - super().__init__() - - 
def forward(self, x, y): - z = x * y - z = z * x - z = z * x - return z - - mul_module = MulModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(mul_module, sample_inputs) - - def test_vulkan_backend_div(self): - class DivModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x / y - z = z / x - z = z / x - return z - - div_module = DivModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(div_module, sample_inputs) - - def test_vulkan_backend_arithmetic(self): - class ArithmeticModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.rand(size=(2, 3), dtype=torch.float32) - - def forward(self, x, y): - z = x + y - z = z - x - z = z / x - z = z * self.weight - return z - - arithmetic_module = ArithmeticModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(arithmetic_module, sample_inputs) - - def test_vulkan_backend_floor_div(self): - class FloorDivModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x // y - return z - - floor_div_module = FloorDivModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32) * 10.0, - torch.rand(size=(2, 3), dtype=torch.float32) + 1.0, - ) - - # absolute tolerance is 1 because of flooring - self.lower_module_and_test_output( - floor_div_module, sample_inputs, atol=1.0 + 1e-03 - ) - - def test_vulkan_backend_pow(self): - class PowModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.pow(x, y) - return z - - pow_module = PowModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(pow_module, sample_inputs) - - def lower_unary_module_and_test_output(self, module): - batch = Dim("batch", max=8) - sample_inputs = (torch.randn(8, 16, 96, 92),) - - dynamic_shapes = {"x": {0: batch}} - test_inputs = [ - (torch.randn(3, 14, 15, 92),), - (torch.randn(6, 5, 35, 89),), - (torch.randn(7, 9, 32, 38),), - ] - - self.lower_module_and_test_output( - module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_clamp(self): - class ClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clamp(x, min=-3.14) - - self.lower_unary_module_and_test_output(ClampModule()) - - def test_vulkan_backend_clamp_int(self): - class ClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clamp(x, min=-3) - - sample_inputs = ( - torch.randint(low=-100, high=100, size=(5, 5), dtype=torch.int32), - ) - - self.lower_module_and_test_output(ClampModule(), sample_inputs) - - def test_vulkan_backend_clamp_int64(self): - class ClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clamp(x, min=-3) - - sample_inputs = ( - torch.randint(low=-100, high=100, size=(5, 5), dtype=torch.int64), - ) - - self.lower_module_and_test_output(ClampModule(), sample_inputs) - - def test_vulkan_backend_cos(self): - class CosModule(torch.nn.Module): - def 
__init__(self): - super().__init__() - - def forward(self, x): - return torch.cos(x) - - self.lower_unary_module_and_test_output(CosModule()) - - def test_vulkan_backend_hardtanh(self): - class HardTanHModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.tanh = torch.nn.Hardtanh(min_val=-3.14, max_val=6.28) - - def forward(self, x): - return self.tanh(x) - - self.lower_unary_module_and_test_output(HardTanHModule()) - - def test_vulkan_backend_exp(self): - class ExpModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.exp(x) - - self.lower_unary_module_and_test_output(ExpModule()) - - def test_vulkan_backend_neg(self): - class NegModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.neg(x) - - self.lower_unary_module_and_test_output(NegModule()) - - def test_vulkan_backend_sin(self): - class SinModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sin(x) - - self.lower_unary_module_and_test_output(SinModule()) - - def test_vulkan_backend_relu(self): - class ReLUModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.relu(x) - - self.lower_unary_module_and_test_output(ReLUModule()) - - def test_vulkan_backend_sqrt(self): - class SqrtModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sqrt(x) - - self.lower_unary_module_and_test_output(SqrtModule()) - - def test_vulkan_backend_hardshrink(self): - class HardshrinkModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.hardshrink = torch.nn.Hardshrink(lambd=0.3) - - def forward(self, x): - return self.hardshrink(x) - - self.lower_unary_module_and_test_output(HardshrinkModule()) - - def test_vulkan_backend_max_pool2d(self): - class MaxPool2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.max_pool = torch.nn.MaxPool2d( - kernel_size=(2, 3), - stride=(1, 1), - padding=0, - dilation=1, - ceil_mode=False, - return_indices=True, - ) - - def forward(self, x): - return self.max_pool(x) - - max_pool2d_module = MaxPool2dModule() - sample_inputs = (torch.randn(5, 13, 55, 68),) - - batch = Dim("batch", max=8) - dynamic_shapes = {"x": {0: batch}} - test_inputs = [ - (torch.randn(3, 14, 15, 9),), - (torch.randn(1, 1, 4, 6),), - (torch.randn(5, 10, 50, 40),), - ] - self.lower_module_and_test_output( - max_pool2d_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - first_output_only=True, - ) - - def test_vulkan_backend_avg_pool2d(self): - class AvgPool2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.avg_pool = torch.nn.AvgPool2d( - kernel_size=(4, 4), - stride=(4, 4), - padding=(0, 0), - ceil_mode=True, - count_include_pad=True, - divisor_override=None, - ) - - def forward(self, x): - return self.avg_pool(x) - - avg_pool2d_module = AvgPool2dModule() - sample_inputs = (torch.randn(5, 13, 55, 68),) - - batch = Dim("batch", max=8) - dynamic_shapes = {"x": {0: batch}} - test_inputs = [ - (torch.randn(3, 14, 15, 9),), - (torch.randn(1, 1, 4, 6),), - (torch.randn(5, 10, 50, 40),), - ] - self.lower_module_and_test_output( - avg_pool2d_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_abs(self): - class AbsModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - 
return torch.abs(x) - - self.lower_unary_module_and_test_output(AbsModule()) - - def test_vulkan_backend_sigmoid(self): - class SigmoidModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sigmoid(x) - - self.lower_unary_module_and_test_output(SigmoidModule()) - - def test_vulkan_backend_tanh(self): - class TanhModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.tanh(x) - - self.lower_unary_module_and_test_output(TanhModule()) - - def test_vulkan_backend_linear(self): - class LinearModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(128, 64, bias=False) - - def forward(self, x): - return self.linear(x) - - module = LinearModule() - sample_inputs = (torch.rand(size=(32, 128), dtype=torch.float32),) - batch = Dim("batch", max=32) - dynamic_shapes = {"x": {0: batch}} - - test_inputs = [ - (torch.rand(15, 128),), - (torch.rand(6, 128),), - (torch.rand(30, 128),), - (torch.rand(20, 128),), - (torch.rand(19, 128),), - ] - - self.lower_module_and_test_output( - module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_partial(self): - class SimpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(10, 10) - self.offset_1 = torch.rand(size=(2, 10), dtype=torch.float32) - self.offset_2 = torch.rand(size=(2, 10), dtype=torch.float32) - - def forward(self, x): - return self.linear(x + self.offset_1) - self.offset_2 - - model = SimpleModel() - sample_inputs = (torch.rand(size=(2, 10), dtype=torch.float32),) - - self.lower_module_and_test_output(model, sample_inputs) - - @unittest.skip( - "Currently this test is failing due to weird partitioning because the eq scalar" - "operator is not supported yet. Re-enable when the operator is supported." 
- ) - def test_vulkan_backend_partial_dynamic_shapes(self): - class SimpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.branch1 = torch.nn.Sequential( - torch.nn.Linear(64, 64), torch.nn.ReLU() - ) - self.branch2 = torch.nn.Sequential( - torch.nn.Linear(128, 64), torch.nn.ReLU() - ) - self.buffer_1 = torch.ones((1, 64)) * 0.5 - self.buffer_2 = torch.ones((1, 64)) * 1.4 - - def forward(self, x1, x2): - out1 = self.branch1(x1) - out2 = self.branch2(x2) - return (out1 + self.buffer_1 + out2) * self.buffer_2 - - model = SimpleModel() - sample_inputs = (torch.randn(32, 64), torch.randn(32, 128)) - batch = Dim("batch", max=32) - dynamic_shapes = {"x1": {0: batch}, "x2": {0: batch}} - - test_inputs = [ - (torch.randn(15, 64), torch.randn(15, 128)), - (torch.randn(6, 64), torch.randn(6, 128)), - (torch.randn(30, 64), torch.randn(30, 128)), - (torch.randn(20, 64), torch.randn(20, 128)), - (torch.randn(19, 64), torch.randn(19, 128)), - ] - - self.lower_module_and_test_output( - model, sample_inputs, dynamic_shapes=dynamic_shapes, test_inputs=test_inputs - ) - - def test_vulkan_backend_matmul(self): - class MatMulModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.ones(size=(63, 22), dtype=torch.float32) - - def forward(self, x): - return torch.matmul(x, self.weight) - - module = MatMulModule() - sample_inputs = (torch.ones(size=(31, 63), dtype=torch.float32),) - - self.lower_module_and_test_output(module, sample_inputs) - - def test_vulkan_backend_bmm(self): - class BMMModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.randn(size=(4, 4, 5), dtype=torch.float32) - - def forward(self, x): - return torch.bmm(x, self.weight) - - module = BMMModule() - sample_inputs = (torch.randn(size=(4, 3, 4), dtype=torch.float32),) - - self.lower_module_and_test_output(module, sample_inputs) - - @unittest.skip( - "Reduce shader does not support multiple reduction axes at the moment" - ) - def test_vulkan_backend_sum_dim_list(self): - class SumModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.sum(x, (0, -1), keepdim=True) - x = torch.sum(x, 2, keepdim=False) - return x - - module = SumModule() - sample_inputs = (torch.ones(size=(3, 2, 7, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - module, - sample_inputs, - ) - - @unittest.skip( - "Reduce shader does not support multiple reduction axes at the moment" - ) - def test_vulkan_backend_sum(self): - class SumModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.sum(x, (), keepdim=True) - x = torch.sum(x) - return x - - module = SumModule() - sample_inputs = (torch.rand(size=(3, 2, 7, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=6, - out_channels=8, - kernel_size=(3, 3), - padding=(2, 3), - stride=(1, 2), - dilation=1, - groups=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv_transpose2d(self): - class ConvTranspose2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - 
self.conv = torch.nn.ConvTranspose2d( - in_channels=6, - out_channels=8, - kernel_size=(3, 3), - padding=(2, 3), - stride=(1, 2), - output_padding=(0, 1), - dilation=1, - groups=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv_transpose2d_module = ConvTranspose2dModule() - sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv_transpose2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d_dw(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=8, - out_channels=8, - kernel_size=3, - padding=1, - groups=8, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 8, 72, 96), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d_pw(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=8, - out_channels=8, - kernel_size=1, - padding=1, - groups=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 8, 72, 96), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d_bias_false(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=6, - out_channels=8, - kernel_size=(3, 3), - padding=(2, 3), - stride=(1, 2), - dilation=1, - groups=1, - bias=False, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv1d(self): - class Conv1dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv1d( - in_channels=20, - out_channels=10, - kernel_size=6, - stride=5, - padding=5, - dilation=3, - groups=5, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv1d_module = Conv1dModule() - sample_inputs = (torch.randn(size=(3, 20, 30), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv1d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv1d_bias_false(self): - class Conv1dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv1d( - in_channels=6, - out_channels=6, - kernel_size=3, - groups=6, - bias=False, - ) - - def forward(self, x): - return self.conv(x) - - conv1d_module = Conv1dModule() - sample_inputs = (torch.randn(size=(1, 6, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv1d_module, - sample_inputs, - ) - - @unittest.skip("layer norm compute shader not working with swiftshader") - def test_vulkan_backend_native_layer_norm(self): - class NativeLayerNormModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.layer_norm = torch.nn.LayerNorm(5) - - def forward(self, x): - return self.layer_norm(x) - - sample_inputs = (torch.randn(size=(3, 4, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - NativeLayerNormModule(), - sample_inputs, - ) - - def test_vulkan_backend_batch_norm(self): - class BatchNormModule(torch.nn.Module): - def __init__(self): - 
super().__init__() - self.bn = torch.nn.BatchNorm2d(num_features=3) - - def forward(self, x): - return self.bn(x) - - sample_inputs = (torch.randn(size=(4, 3, 2, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - BatchNormModule(), - sample_inputs, - ) - - def test_vulkan_backend_full(self): - class FullModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.full(x.shape, 42.0) - - class ZerosModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.zeros(x.shape) - - class OnesModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.ones(x.shape) - - sample_inputs = (torch.randn(size=(2, 3, 4, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - FullModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - ZerosModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - OnesModule(), - sample_inputs, - ) - - def test_vulkan_backend_full_like(self): - class FullLikeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.full_like(x, 42.0) - - class ZerosLikeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.zeros_like(x) - - class OnesLikeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.ones_like(x) - - sample_inputs = (torch.randn(size=(2, 3, 4, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - FullLikeModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - ZerosLikeModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - OnesLikeModule(), - sample_inputs, - ) - - def test_vulkan_backend_upsample_nearest2d(self): - class UpsampleNearest2d(torch.nn.Module): - def __init__(self): - super().__init__() - self.upsample = torch.nn.Upsample(scale_factor=2, mode="nearest") - - def forward(self, x): - return self.upsample(x) - - sample_inputs = (torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2),) - - self.lower_module_and_test_output( - UpsampleNearest2d(), - sample_inputs, - ) - - def test_vulkan_backend_minimum(self): - class MinimumModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.minimum(x, y) - - sample_inputs = ( - torch.rand(size=(3, 5, 6, 4), dtype=torch.float32), - torch.rand(size=(6, 4), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - MinimumModule(), - sample_inputs, - ) - - def test_vulkan_backend_reshape(self): - class ReshapeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.reshape(x, [-1, x.size(-1)]) - - sample_inputs = (torch.randn(size=(5, 3, 4), dtype=torch.float32),) - - self.lower_module_and_test_output( - ReshapeModule(), - sample_inputs, - ) - - def test_vulkan_backend_view(self): - class ViewModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x.view([-1, x.size(-1)]) - - sample_inputs = (torch.randn(size=(3, 2, 3, 4), dtype=torch.float32),) - - self.lower_module_and_test_output( - ViewModule(), - sample_inputs, - ) - - def test_vulkan_backend_view_int(self): - class ViewModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x.view([-1, x.size(-1)]) - - sample_inputs = (torch.randint(size=(3, 6, 2, 7), 
high=100, dtype=torch.int32),) - - self.lower_module_and_test_output( - ViewModule(), - sample_inputs, - ) - - def test_vulkan_backend_unsqueeze(self): - class UnsqueezeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.unsqueeze(x, 1) - x = torch.unsqueeze(x, 0) - return x - - sample_inputs = (torch.randn(size=(3,), dtype=torch.float32),) - - self.lower_module_and_test_output( - UnsqueezeModule(), - sample_inputs, - ) - - def test_vulkan_backend_squeeze(self): - class SqueezeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.squeeze(x, 0) - - sample_inputs = (torch.randn(size=(1, 2, 2, 1), dtype=torch.float32),) - - self.lower_module_and_test_output( - SqueezeModule(), - sample_inputs, - ) - - def test_vulkan_backend_select(self): - class SelectModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x[0][3] - - sample_inputs = (torch.randn(size=(3, 6, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - SelectModule(), - sample_inputs, - ) - - def test_vulkan_backend_permute_copy(self): - class PermuteModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.permute(x, [3, 0, 2, 1]) - - sample_inputs = (torch.randn(size=(3, 6, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - PermuteModule(), - sample_inputs, - ) - - def test_vulkan_backend_permute_copy_int(self): - class PermuteModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.permute(x, [3, 0, 2, 1]) - - sample_inputs = (torch.randint(size=(3, 6, 2, 7), high=100, dtype=torch.int32),) - - self.lower_module_and_test_output( - PermuteModule(), - sample_inputs, - ) - - def test_vulkan_backend_cat(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y, z): - return torch.cat([x, y, z], dim=1) - - sample_inputs = ( - torch.randn(size=(3, 6, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 1, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 9, 2, 7), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_cat_with_zero_size(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y, z, w): - return torch.cat([x, y, z, w], dim=1) - - sample_inputs = ( - torch.randn(size=(3, 6, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 0, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 0, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 3, 2, 7), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_slice(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x[:, 2:9:2, :] - - sample_inputs = (torch.randn(size=(3, 13, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_split_with_sizes(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.split(x, (3, 6, 1, 3), dim=1) - - sample_inputs = (torch.randn(size=(3, 13, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def 
test_vulkan_backend_split_tensor(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.tensor_split(x, 2, dim=1) - - sample_inputs = (torch.randn(size=(3, 14, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_clone(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clone(x) - - sample_inputs = (torch.randn(size=(3, 14, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_constant_pad_nd(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.nn.functional.pad(x, (1, 2, 3, 4, 5, 6), "constant", 24.2) - - sample_inputs = (torch.randn(size=(3, 7, 5, 11), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_repeat(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x.repeat([2, 3, 1, 2]) - - sample_inputs = (torch.randn(size=(3, 7, 5, 9), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_t_default(self): - # aten.permute_copy.default is not enabled yet in partitioner - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - # torch.t is actually exported as aten::permute. - return torch.t(x) - - sample_inputs = (torch.randn(size=(3, 14), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - @unittest.skip( - "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" - ) - def test_vulkan_backend_softmax(self): - class SoftmaxModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = x.softmax(dim=0) - x = x.softmax(dim=1) - x = x.softmax(dim=2) - return x - - sample_inputs = (torch.randn(size=(3, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - SoftmaxModule(), - sample_inputs, - ) - - @unittest.skip( - "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" - ) - def test_vulkan_backend_logsoftmax(self): - class LogSoftmaxModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = x.log_softmax(dim=0) - x = x.log_softmax(dim=1) - x = x.log_softmax(dim=2) - return x - - sample_inputs = (torch.randn(size=(3, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - LogSoftmaxModule(), - sample_inputs, - ) - - def test_vulkan_backend_gelu(self): - class GeluModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.gelu = torch.nn.GELU(approximate="tanh") - - def forward(self, x): - return self.gelu(x) - - self.lower_unary_module_and_test_output(GeluModule()) - - @unittest.skip( - "Reduce shader does not support multiple reduction axes at the moment" - ) - def test_vulkan_backend_mean(self): - class MeanModule(torch.nn.Module): - def __init__(self, dims, keepdim=True): - super().__init__() - self.dims = dims - self.keepdim = keepdim - - def forward(self, x): - return torch.mean(x, self.dims, keepdim=self.keepdim) - - sample_inputs = ( - torch.arange(end=2 * 3 * 2 * 5, 
dtype=torch.float32).reshape(2, 3, 2, 5), - ) - - self.lower_module_and_test_output( - MeanModule(dims=[-1, -2]), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[1]), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[0, 1, 2, 3]), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[-1, -2], keepdim=False), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[1], keepdim=False), - sample_inputs, - ) - - def test_vulkan_backend_index_select_int(self): - class IndexSelectModule(torch.nn.Module): - def __init__(self, dim, indices): - super().__init__() - self.dim = dim - self.index = torch.tensor(indices) - - def forward(self, x): - return torch.index_select(x, self.dim, self.index) - - sample_inputs = (torch.arange(96).reshape(2, 8, 2, 3),) - - self.lower_module_and_test_output( - IndexSelectModule(dim=1, indices=[2, 3, 5, 6, 7]), - sample_inputs, - ) - - def test_vulkan_backend_index_select(self): - class IndexSelectModule(torch.nn.Module): - def __init__(self, dim, indices): - super().__init__() - self.dim = dim - self.index = torch.tensor(indices) - - def forward(self, x): - return torch.index_select(x, self.dim, self.index) - - sample_inputs = (torch.arange(144).reshape(12, 1, 3, 4).float(),) - - self.lower_module_and_test_output( - IndexSelectModule(dim=0, indices=[1, 3, 5, 7, 8, 9, 10, 11, 2, 3]), - sample_inputs, - ) - - def test_vulkan_backend_arange_int(self): - class ArangeModule(torch.nn.Module): - def __init__(self, input): - super().__init__() - self.input = input - - def forward(self, x): - return torch.arange(*self.input, dtype=torch.int32) - - # `torch.arange` could take one, two or three arguments as input. - # If only one argument is provided, it will be interpreted as `end`. - # If two arguments are provided, the first one will be interpreted as `start` - # and the second one will be interpreted as `end`. - # If three arguments are provided, the first one will be interpreted as `start`, - # the second one will be interpreted as `end` and the third one will be - # interpreted as `step`. 
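(Editorial aside, not part of the deleted test file: the argument forms described in the comment above behave as follows; this is a small illustrative snippet.)

```python
import torch

# One argument: interpreted as `end`, counting from 0 with step 1.
torch.arange(5)          # tensor([0, 1, 2, 3, 4])
# Two arguments: `start` and `end`.
torch.arange(-3, 5)      # tensor([-3, -2, -1,  0,  1,  2,  3,  4])
# Three arguments: `start`, `end`, and `step` (the step may be negative).
torch.arange(12, 1, -2)  # tensor([12, 10,  8,  6,  4,  2])
```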
- inputs = [ - [1], - [-3, 5], - [1, 11, 2], - [12, 1, -2], - ] - for i in inputs: - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randn(size=(1,), dtype=torch.float32),), # dummy input - ) - - def test_vulkan_backend_arange_float(self): - class ArangeModule(torch.nn.Module): - def __init__(self, input): - super().__init__() - self.input = input - - def forward(self, x): - return torch.arange(*self.input) - - inputs = [ - [1.5], - [-3, 5.0], - [1.0, 11, 2], - [12, 1, -2.0], - ] - for i in inputs: - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randn(size=(1,), dtype=torch.float32),), # dummy input - ) - - def test_vulkan_backend_arange_int64(self): - class ArangeModule(torch.nn.Module): - def __init__(self, input): - super().__init__() - self.input = input - - def forward(self, x): - return torch.arange(*self.input) - - inputs = [ - [1], - [-3, 5], - [1, 11, 2], - [12, 1, -2], - [1.5], - [-3, 5.0], - [1.0, 11, 2], - [12, 1, -2.0], - ] - for i in inputs: - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randn(size=(1,), dtype=torch.float32),), # dummy input - ) - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randint(low=-100, high=100, size=(5, 5)),), # dummy input - ) - - def test_vulkan_backend_embedding_1d(self): - class EmbeddingModule(torch.nn.Module): - def __init__(self, embedding): - super().__init__() - self.embedding = embedding - - def forward(self, x): - return self.embedding(x) - - self.lower_module_and_test_output( - EmbeddingModule(torch.nn.Embedding(5, 4)), - (torch.tensor([0, 1, 0, 4, 2, 0]),), - ) - - def test_vulkan_backend_embedding_2d(self): - class EmbeddingModule(torch.nn.Module): - def __init__(self, embedding): - super().__init__() - self.embedding = embedding - - def forward(self, x): - return self.embedding(x) - - self.lower_module_and_test_output( - EmbeddingModule(torch.nn.Embedding(5, 4)), - (torch.tensor([[0, 1, 0], [4, 2, 0]]),), - ) - - def test_vulkan_backend_embedding_3d(self): - class EmbeddingModule(torch.nn.Module): - def __init__(self, embedding): - super().__init__() - self.embedding = embedding - - def forward(self, x): - return self.embedding(x) - - self.lower_module_and_test_output( - EmbeddingModule(torch.nn.Embedding(5, 4)), - (torch.tensor([[[0, 1], [0, 1]], [[4, 2], [3, 3]]]),), - ) - - # def test_vulkan_backend_conv_with_dim_order(self): - # class Conv2dSequential(torch.nn.Module): - # def __init__(self, bias=True, channel_last=False): - # super().__init__() - # self.first = torch.nn.Conv2d( - # in_channels=1, - # out_channels=3, - # kernel_size=(3, 3), - # padding=1, - # bias=bias, - # ) - # self.second = torch.nn.Conv2d( - # in_channels=3, - # out_channels=2, - # kernel_size=(3, 3), - # padding=1, - # bias=bias, - # ) - - # def forward(self, x): - # x = x.to(memory_format=torch.channels_last) - # return self.second(self.first(x)) - - # self.lower_module_and_test_output( - # Conv2dSequential(), - # (torch.rand(size=[1, 1, 3, 3]),), - # - # ) - - def test_vulkan_backend_flip(self): - class FlipModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.flip(x, [0, 1, 2, 3]) - - self.lower_module_and_test_output( - FlipModule(), - (torch.arange(48).reshape(2, 3, 4, 2),), - ) - - def test_vulkan_backend_conv_with_clamp(self): - class ConvWithClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.randn(6, 8, 3, 3) - self.bias = torch.randn(8) - self.stride = (1, 2) - self.padding = (2, 3) - 
self.dilation = (1, 1) - self.transposed = True - self.output_padding = (0, 1) - self.groups = 1 - self.output_min = 0 - self.output_max = 10 - - def forward(self, x): - return torch.ops.et_vk.conv_with_clamp( - x, - self.weight, - self.bias, - self.stride, - self.padding, - self.dilation, - self.transposed, - self.output_padding, - self.groups, - self.output_min, - self.output_max, - ) - - self.lower_module_and_test_output( - ConvWithClampModule(), - (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),), - ) - - def test_vulkan_backend_grid_priors(self): - class GridPriorsModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.ops.et_vk.grid_priors( - x, - stride=8, - offset=0.5, - ) - - self.lower_module_and_test_output( - GridPriorsModule(), - (torch.rand(size=[1, 5, 2, 3]),), - ) - - def test_vulkan_backend_large_linear_layer(self): - class LinearModel(torch.nn.Module): - def __init__(self, large_out_channels: int) -> None: - super(LinearModel, self).__init__() - self.fc0 = torch.nn.Linear(1024, 128) - self.fc1 = torch.nn.Linear(128, large_out_channels) - - def forward(self, x: torch.Tensor): - x = self.fc0(x) - out = self.fc1(x) - return out - - large_out_channels = 2**16 - - self.lower_module_and_test_output( - LinearModel(large_out_channels), - (torch.ones(1024),), - ) - - def test_vulkan_backend_sym_size_int(self): - """ - Test the sym_size.int operator with a model that: - 1. Takes an input tensor with shape [1, M, K] - 2. Reshapes it to [M, K] - 3. Applies a linear layer - 4. Reshapes the output back to [1, M, N] - """ - K = 64 # Input feature dimension - N = 32 # Output feature dimension - - class SymSizeModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(K, N) - - def forward(self, x): - M = x.size(1) - - reshaped = torch.reshape(x, [M, K]) - output = self.linear(reshaped) - return torch.reshape(output, [1, M, N]) - - sample_inputs = (torch.randn(1, 64, K),) - - batch = Dim("batch", min=1, max=128) - dynamic_shapes = {"x": {1: batch}} - - test_inputs = [ - (torch.randn(1, 32, K),), - (torch.randn(1, 96, K),), - (torch.randn(1, 128, K),), - ] - - self.lower_module_and_test_output( - SymSizeModel(), - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_select_last_height_dynamic_shapes(self): - """ - Test selecting the last element along the height dimension with dynamic shapes. - The height dimension (dim=1) is variable. - """ - - class SelectLastHeightModule(torch.nn.Module): - """ - Module that selects the last element along the height dimension (dim=1) of a 3D tensor. 
- This is equivalent to the operation: x[:, -1, :] - """ - - def __init__(self): - super().__init__() - - def forward(self, x): - # Select the last element along dimension 1 (height) - return x[:, -1, :] - - # Create the module - module = SelectLastHeightModule() - - # Create sample inputs with a specific shape - # Shape: [batch_size, height, width] - sample_inputs = (torch.arange(1, 61).reshape(2, 10, 3).float(),) - - # Define dynamic shapes for the height dimension - height = Dim("height", min=1, max=10) - dynamic_shapes = {"x": {1: height}} - - # Create test inputs with different heights - test_inputs = [ - (torch.arange(1, 7).reshape(2, 1, 3).float(),), # Minimum height - (torch.arange(1, 19).reshape(2, 3, 3).float(),), # Small height - (torch.arange(1, 43).reshape(2, 7, 3).float(),), # Medium height - (torch.arange(1, 31).reshape(2, 5, 3).float(),), # Maximum height - ] - - # Use the testing infrastructure from TestVulkanBackend - test_backend = TestVulkanBackend() - test_backend.lower_module_and_test_output( - module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_group_norm(self): - class ConvGroupNormModule(torch.nn.Module): - def __init__(self): - super().__init__() - # Conv2d: 3 input channels -> 16 output channels - self.conv = torch.nn.Conv2d( - in_channels=3, - out_channels=16, - kernel_size=3, - padding=1, - bias=True, - ) - # GroupNorm: 4 groups for 16 channels (16 % 4 == 0) - self.group_norm = torch.nn.GroupNorm( - num_groups=4, - num_channels=16, - eps=1e-5, - affine=True, - ) - - def forward(self, x): - x = self.conv(x) - x = self.group_norm(x) - return x - - # Create sample inputs: [batch, channels, height, width] - sample_inputs = (torch.randn(size=(1, 3, 32, 32), dtype=torch.float32),) - - # Test with static shapes first - self.lower_module_and_test_output( - ConvGroupNormModule(), - sample_inputs, - ) - - def test_vulkan_backend_group_norm_different_groups(self): - class GroupNormModule(torch.nn.Module): - def __init__(self, num_groups, num_channels): - super().__init__() - self.group_norm = torch.nn.GroupNorm( - num_groups=num_groups, - num_channels=num_channels, - eps=1e-5, - affine=True, - ) - - def forward(self, x): - return self.group_norm(x) - - # Test different group configurations - test_configs = [ - (2, 8), # 2 groups, 8 channels - (4, 16), # 4 groups, 16 channels - (8, 32), # 8 groups, 32 channels - ] - - for num_groups, num_channels in test_configs: - with self.subTest(num_groups=num_groups, num_channels=num_channels): - sample_inputs = ( - torch.randn(size=(2, num_channels, 16, 16), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - GroupNormModule(num_groups, num_channels), - sample_inputs, - ) - - def test_vulkan_backend_full_quantization_workflow(self): - class FullQuantizationWorkflowModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - # Step 1: Choose quantization parameters per tensor - scale, zero_point = ( - torch.ops.quantized_decomposed.choose_qparams.tensor( - x, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - eps=1e-5, - dtype=torch.int32, - ) - ) - - # Step 2: Quantize using the calculated parameters - quantized = torch.ops.quantized_decomposed.quantize_per_tensor.tensor( - x, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - ) - - # Step 3: Dequantize back to float - dequantized = ( - 
torch.ops.quantized_decomposed.dequantize_per_tensor.tensor( - quantized, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - ) - ) - - return dequantized - - full_workflow_module = FullQuantizationWorkflowModule() - sample_inputs = (torch.rand(size=(2, 3, 4), dtype=torch.float32),) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - full_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3 - ) - - def test_vulkan_backend_full_per_token_quantization_workflow(self): - class FullPerTokenQuantizationWorkflowModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - # Step 1: Choose quantization parameters per token - scale, zero_point = ( - torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( - x, - dtype=torch.int32, - ) - ) - - # Step 2: Quantize using the calculated parameters per token - quantized = torch.ops.quantized_decomposed.quantize_per_token.default( - x, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - ) - - # Step 3: Dequantize back to float per token - dequantized = ( - torch.ops.quantized_decomposed.dequantize_per_token.default( - quantized, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - output_dtype=torch.float32, - ) - ) - - return dequantized - - full_per_token_workflow_module = FullPerTokenQuantizationWorkflowModule() - sample_inputs = (torch.rand(size=(6, 4), dtype=torch.float32),) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - full_per_token_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3 - ) - - def test_vulkan_backend_different_required_reprs(self): - class ComplexModule(torch.nn.Module): - """ - This Module tests the tag memory metadata pass. The first few ops executed - are binary ops, which don't require any specific representation for input - and output tensors. - - This is followed by a linear layer, which requires the input tensor to be - width packed. - - Three linear layer outputs are then concatenated, and the result is passed - to a convolution layer which requires channels packing. Finally, group norm - is called and the output is postprocessed by a binary op before returning. - - In addition to requiring memory layout transitions between the linear and - conv stages, the module also contains ops which have "non-standard" - torch.fx.Nodes; cat will contain an argument node that is a list of nodes, - and group norm's node will be associated with multiple output tensors. 
- """ - - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(10, 10) - self.conv = torch.nn.Conv2d( - in_channels=3, # Assuming concatenation triples the channels - out_channels=16, - kernel_size=3, - padding=1, - ) - self.group_norm = torch.nn.GroupNorm(num_groups=4, num_channels=16) - - def forward(self, x, a, b, c, d): - w = a + b - y = a + c - z = a + d - - b1 = x + y - b2 = x + z - b3 = x + w - - l1 = self.linear(b1).unsqueeze(0) - l2 = self.linear(b2).unsqueeze(0) - l3 = self.linear(b3).unsqueeze(0) - - concat = torch.cat([l1, l2, l3], dim=0) # Concatenate along channels - conv = self.conv(concat + a) - g = self.group_norm(conv.unsqueeze(0)) - return g + x - - complex_module = ComplexModule() - sample_inputs = ( - torch.rand(size=(10, 10), dtype=torch.float32), # x - torch.rand(size=(10, 10), dtype=torch.float32), # a - torch.rand(size=(10, 10), dtype=torch.float32), # b - torch.rand(size=(10, 10), dtype=torch.float32), # c - torch.rand(size=(10, 10), dtype=torch.float32), # d - ) - - self.lower_module_and_test_output(complex_module, sample_inputs) - - def test_vulkan_backend_cat_different_reprs(self): - class CustomComplexModule(torch.nn.Module): - """ - This test validates that the memory metadata tagging pass can handle - transitioning arguments to the cat operator. Linear layers require width - packing, while conv layers require channels packing. Before executing the - cat operator, all input tensors should use the same representation. - """ - - def __init__(self): - super().__init__() - self.linear1 = torch.nn.Linear(10, 10) - self.linear2 = torch.nn.Linear(10, 10) - self.conv = torch.nn.Conv2d( - in_channels=4, # Assuming input b has 3 channels - out_channels=8, - kernel_size=3, - padding=1, - ) - - def forward(self, a, b): - x1 = self.linear1(a).unsqueeze(0) - x2 = self.linear2(a).unsqueeze(0) - y = self.conv(b) - return torch.cat([x1, x2, y], dim=0) - - custom_complex_module = CustomComplexModule() - sample_inputs = ( - torch.rand(size=(10, 10), dtype=torch.float32), # a - torch.rand(size=(4, 10, 10), dtype=torch.float32), # b - ) - - self.lower_module_and_test_output(custom_complex_module, sample_inputs) - - def test_vulkan_backend_cat_width_dynamic_shapes(self): - class CatWidthModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x1, x2, x3, x4, x5, x6): - return torch.cat([x1, x2, x3, x4, x5, x6], dim=3) - - cat_width_module = CatWidthModule() - - # Create 6 tensors with different widths but same batch, channel, and height dimensions - sample_inputs = ( - torch.randn(size=(2, 3, 4, 5), dtype=torch.float32), # width=5 - torch.randn(size=(2, 3, 4, 3), dtype=torch.float32), # width=3 - torch.randn(size=(2, 3, 4, 7), dtype=torch.float32), # width=7 - torch.randn(size=(2, 3, 4, 2), dtype=torch.float32), # width=2 - torch.randn(size=(2, 3, 4, 4), dtype=torch.float32), # width=4 - torch.randn(size=(2, 3, 4, 6), dtype=torch.float32), # width=6 - ) - - # Define dynamic shapes for the width dimension (dim=3) for each input - width1 = Dim("width1", min=1, max=10) - width2 = Dim("width2", min=1, max=10) - width3 = Dim("width3", min=1, max=10) - width4 = Dim("width4", min=1, max=10) - width5 = Dim("width5", min=1, max=10) - width6 = Dim("width6", min=1, max=10) - - dynamic_shapes = { - "x1": {3: width1}, - "x2": {3: width2}, - "x3": {3: width3}, - "x4": {3: width4}, - "x5": {3: width5}, - "x6": {3: width6}, - } - - # Create test inputs with different width combinations - test_inputs = [ - ( - torch.randn(2, 3, 4, 2), # width=2 
- torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 3), # width=3 - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 2), # width=2 - torch.randn(2, 3, 4, 4), # width=4 - ), - ( - torch.randn(2, 3, 4, 8), # width=8 - torch.randn(2, 3, 4, 2), # width=2 - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 3), # width=3 - torch.randn(2, 3, 4, 5), # width=5 - torch.randn(2, 3, 4, 1), # width=1 - ), - ( - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 9), # width=9 - torch.randn(2, 3, 4, 2), # width=2 - torch.randn(2, 3, 4, 4), # width=4 - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 3), # width=3 - ), - ] - - self.lower_module_and_test_output( - cat_width_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_cat_channels_dynamic_shapes(self): - class CatChannelsModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x1, x2, x3, x4, x5, x6): - return torch.cat([x1, x2, x3, x4, x5, x6], dim=1) - - cat_channels_module = CatChannelsModule() - - # Create 6 tensors with different channel counts but same batch, height, and width dimensions - sample_inputs = ( - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=4 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=2 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=6 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=1 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=3 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=5 - ) - - # Define dynamic shapes for the channels dimension (dim=1) for each input - channels1 = Dim("channels1", min=1, max=8) - channels2 = Dim("channels2", min=1, max=8) - channels3 = Dim("channels3", min=1, max=8) - channels4 = Dim("channels4", min=1, max=8) - channels5 = Dim("channels5", min=1, max=8) - channels6 = Dim("channels6", min=1, max=8) - - dynamic_shapes = { - "x1": {1: channels1}, - "x2": {1: channels2}, - "x3": {1: channels3}, - "x4": {1: channels4}, - "x5": {1: channels5}, - "x6": {1: channels6}, - } - - # Create test inputs with different channel combinations - test_inputs = [ - ( - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 2, 8, 6), # channels=2 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 3, 8, 6), # channels=3 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 2, 8, 6), # channels=2 - ), - ( - torch.randn(2, 6, 8, 6), # channels=6 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 3, 8, 6), # channels=3 - torch.randn(2, 2, 8, 6), # channels=2 - torch.randn(2, 4, 8, 6), # channels=4 - torch.randn(2, 1, 8, 6), # channels=1 - ), - ( - torch.randn(2, 2, 8, 6), # channels=2 - torch.randn(2, 7, 8, 6), # channels=7 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 3, 8, 6), # channels=3 - torch.randn(2, 2, 8, 6), # channels=2 - ), - ] - - self.lower_module_and_test_output( - cat_channels_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_high_dimensional_tensors(self): - class HighDimTensorModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - # Unsqueeze inputs twice to create 5-dim tensors - x_5d = torch.unsqueeze(torch.unsqueeze(x, 0), 0) - y_5d = torch.unsqueeze(torch.unsqueeze(y, 0), 0) - # Add tensors together - result = x_5d + y_5d - return result - - high_dim_module = HighDimTensorModule() - 
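(Editorial aside, not part of the deleted test file: the dynamic-shape tests above all rely on the `torch.export.Dim` API to mark individual tensor dimensions as variable. The sketch below shows the general pattern with a made-up module and sizes; it is illustrative only.)

```python
import torch
from torch.export import Dim, export

class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

# Declare dim 0 of the input named "x" as dynamic, bounded between 1 and 32.
batch = Dim("batch", min=1, max=32)
exported = export(AddOne(), (torch.randn(8, 4),), dynamic_shapes={"x": {0: batch}})

# The exported program now accepts any batch size within the declared range.
print(exported.module()(torch.randn(3, 4)).shape)  # torch.Size([3, 4])
```

In the tests above, the same kind of `dynamic_shapes` dictionary is presumably forwarded to the export step by the test harness, and the `test_inputs` lists exercise several sizes within each declared range.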
# Create 2 4-dim inputs - sample_inputs = ( - torch.rand(size=(2, 3, 4, 5), dtype=torch.float32), - torch.rand(size=(2, 3, 4, 5), dtype=torch.float32), - ) - - self.lower_module_and_test_output(high_dim_module, sample_inputs) - - def test_vulkan_backend_torchao_wo_quantized_linear(self): - in_features = 1024 - out_features = 512 - bias = False - group_size = 64 - weight_bits = 4 - - class TorchAOQuantizedLinearModule(torch.nn.Module): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = False, - group_size: int = 64, - weight_bits: int = 4, - ): - super().__init__() - self.linear = torch.nn.Linear(in_features, out_features, bias=bias) - self.group_size = group_size - self.weight_bits = weight_bits - - if self.weight_bits == 4: - self.weight_dtype = torch.int4 - else: - self.weight_dtype = torch.int8 - - self.quant_granularity = PerGroup(self.group_size) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.linear(x) - - def apply_quantization(self): - """Apply TorchAO weight-only quantization to the linear layer.""" - q_config = IntxWeightOnlyConfig( - weight_dtype=self.weight_dtype, - granularity=self.quant_granularity, - ) - quantize_(self, q_config) - unwrap_tensor_subclass(self) - return self - - # Test with GEMV pattern (batch_size=1, seq_len=1) - quantized_linear_module = TorchAOQuantizedLinearModule( - in_features=in_features, - out_features=out_features, - bias=bias, - group_size=group_size, - weight_bits=weight_bits, - ) - - # Apply quantization - quantized_linear_module = quantized_linear_module.apply_quantization() - - # Test with 2D input (GEMV pattern) - sample_inputs = (torch.randn(size=(1, in_features), dtype=torch.float32),) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - quantized_linear_module, sample_inputs, atol=1e-2, rtol=1e-2 - ) - - # Test with GEMM pattern (batch_size > 1) - quantized_linear_module_gemm = TorchAOQuantizedLinearModule( - in_features=in_features, - out_features=out_features, - bias=bias, - group_size=group_size, - weight_bits=weight_bits, - ) - - # Apply quantization - quantized_linear_module_gemm = quantized_linear_module_gemm.apply_quantization() - - # Test with 3D input (GEMM pattern) - sample_inputs_gemm = ( - torch.randn(size=(1, 248, in_features), dtype=torch.float32), - ) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - quantized_linear_module_gemm, sample_inputs_gemm, atol=1e-2, rtol=1e-2 - ) - - def test_vulkan_backend_xnnpack_pt2e_quantized_linear_sequence(self): - """ - Test a sequence of linear layers quantized with XNNPACK quantization config. - This test creates a module with multiple linear layers in sequence and applies - XNNPACK symmetric quantization to test the quantized model execution. 
- """ - - import executorch.backends.vulkan.test.utils as test_utils - - class LinearSequenceModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1 = torch.nn.Linear(128, 64, bias=False) - self.linear2 = torch.nn.Linear(64, 32, bias=False) - self.linear3 = torch.nn.Linear(32, 16, bias=False) - - MAX = 0.75 - MIN = -0.25 - self.linear1.weight.data = test_utils.random_uniform_tensor( - self.linear1.weight.shape, MIN, MAX - ) - self.linear2.weight.data = test_utils.random_uniform_tensor( - self.linear2.weight.shape, MIN, MAX - ) - self.linear3.weight.data = test_utils.random_uniform_tensor( - self.linear3.weight.shape, MIN, MAX - ) - - def forward(self, x): - x = self.linear1(x) - x = self.linear2(x) - x = self.linear3(x) - return x - - # Create the module - linear_sequence_module = LinearSequenceModule() - - M = 32 - # Create sample inputs - sample_inputs = ( - ( - test_utils.random_uniform_tensor( - (M, linear_sequence_module.linear1.in_features), - -0.25, - 0.75, - ) - ), - ) - - # Create XNNPACK quantizer with symmetric quantization config - quantizer = XNNPACKQuantizer() - operator_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - quantizer.set_global(operator_config) - - # Test the quantized module using the existing quantize_and_lower_module function - # Use higher tolerance since quantization introduces some error - edge_program = quantize_and_lower_module( - linear_sequence_module, sample_inputs, quantizer - ) - - et_program = edge_program.to_executorch() - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - linear_sequence_module, - sample_inputs, - atol=1e-2, - rtol=1e-1, - ) - - def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence(self): - """ - Test a sequence of convolution layers quantized with PT2E quantization. - This test creates a module with multiple Conv2d layers in sequence and applies - XNNPACK symmetric quantization to test the quantized model execution. - Similar to the linear sequence test but using convolution layers. 
- """ - - import executorch.backends.vulkan.test.utils as test_utils - - class ConvSequenceModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv1 = torch.nn.Conv2d( - in_channels=3, - out_channels=16, - kernel_size=3, - padding=1, - bias=False, - ) - self.conv2 = torch.nn.Conv2d( - in_channels=16, - out_channels=32, - kernel_size=3, - padding=1, - bias=False, - ) - self.conv3 = torch.nn.Conv2d( - in_channels=32, - out_channels=64, - kernel_size=3, - padding=1, - bias=False, - ) - - MAX = 0.75 - MIN = -0.25 - self.conv1.weight.data = test_utils.random_uniform_tensor( - self.conv1.weight.shape, MIN, MAX - ) - self.conv2.weight.data = test_utils.random_uniform_tensor( - self.conv2.weight.shape, MIN, MAX - ) - self.conv3.weight.data = test_utils.random_uniform_tensor( - self.conv3.weight.shape, MIN, MAX - ) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - return x - - # Create the module - conv_sequence_module = ConvSequenceModule() - - input_tensor = test_utils.random_uniform_tensor( - (1, 3, 32, 32), - -0.25, - 0.75, - ) - - # Create sample inputs - sample_inputs = (input_tensor,) - - # Create XNNPACK quantizer with symmetric quantization config - quantizer = XNNPACKQuantizer() - operator_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - quantizer.set_global(operator_config) - - # Test the quantized module using the existing quantize_and_lower_module function - # Use higher tolerance since quantization introduces some error - edge_program = quantize_and_lower_module( - conv_sequence_module, sample_inputs, quantizer - ) - - et_program = edge_program.to_executorch() - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - conv_sequence_module, - sample_inputs, - atol=1e-2, - rtol=1e-1, - ) - - def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence_all_reduced(self): - """ - Test a sequence of convolution layers quantized with PT2E quantization. - This test creates a module with multiple Conv2d layers in sequence and applies - XNNPACK symmetric quantization to test the quantized model execution. - Similar to the linear sequence test but using convolution layers. 
- """ - - import executorch.backends.vulkan.test.utils as test_utils - - class ConvSequenceModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv1 = torch.nn.Conv2d( - in_channels=3, - out_channels=32, - kernel_size=3, - padding=1, - bias=False, - ) - self.conv2 = torch.nn.Conv2d( - in_channels=32, - out_channels=1, - kernel_size=3, - padding=1, - bias=False, - ) - - MAX = 0.75 - MIN = -0.25 - self.conv1.weight.data = test_utils.random_uniform_tensor( - self.conv1.weight.shape, MIN, MAX - ) - self.conv2.weight.data = test_utils.random_uniform_tensor( - self.conv2.weight.shape, MIN, MAX - ) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - return x - - # Create the module - conv_sequence_module = ConvSequenceModule() - - input_tensor = test_utils.random_uniform_tensor( - (1, 3, 32, 32), - -0.25, - 0.75, - ) - - # Create sample inputs - sample_inputs = (input_tensor,) - - # Create XNNPACK quantizer with symmetric quantization config - quantizer = XNNPACKQuantizer() - operator_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - quantizer.set_global(operator_config) - - # Test the quantized module using the existing quantize_and_lower_module function - # Use higher tolerance since quantization introduces some error - edge_program = quantize_and_lower_module( - conv_sequence_module, sample_inputs, quantizer - ) - - et_program = edge_program.to_executorch() - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - conv_sequence_module, - sample_inputs, - atol=1e-2, - rtol=1e-1, - ) diff --git a/backends/vulkan/test/test_vulkan_delegate_header.py b/backends/vulkan/test/test_vulkan_delegate_header.py deleted file mode 100644 index bf8b59fc49d..00000000000 --- a/backends/vulkan/test/test_vulkan_delegate_header.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( - VulkanDelegateHeader, -) - -EXAMPLE_FLATBUFFER_OFFSET: int = 0x11223344 -EXAMPLE_FLATBUFFER_SIZE: int = 0x55667788 -EXAMPLE_BYTES_OFFSET: int = EXAMPLE_FLATBUFFER_OFFSET + EXAMPLE_FLATBUFFER_SIZE -EXAMPLE_BYTES_SIZE: int = 0x99AABBCC99AABBCC - -# If header layout or magic changes, this test must change too. 
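(Editorial aside, not part of the deleted file: the 30-byte layout encoded by the constants above and by `EXAMPLE_HEADER_DATA` just below can be reproduced with `struct`. The field order here is inferred from the byte string in the test and should be treated as illustrative.)

```python
import struct

FLATBUFFER_OFFSET = 0x11223344
FLATBUFFER_SIZE = 0x55667788
BYTES_OFFSET = FLATBUFFER_OFFSET + FLATBUFFER_SIZE  # 0x6688AACC
BYTES_SIZE = 0x99AABBCC99AABBCC

header = (
    b"\x00\x00\x00\x00"  # 4 padding bytes
    + b"VH00"            # magic
    # little-endian: u16 header length (30), u32 flatbuffer offset,
    # u32 flatbuffer size, u32 bytes offset, u64 bytes size
    + struct.pack("<HIIIQ", 30, FLATBUFFER_OFFSET, FLATBUFFER_SIZE, BYTES_OFFSET, BYTES_SIZE)
)
assert len(header) == 30  # should match the EXAMPLE_HEADER_DATA bytes defined below
```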
-# The layout of the header is a contract, not an implementation detail -EXAMPLE_HEADER_DATA: bytes = ( - # zeros - b"\x00\x00\x00\x00" - # magic - + b"VH00" - # All Values below are littl Endian - # header length - + b"\x1E\x00" - # Flatbuffer Offset - + b"\x44\x33\x22\x11" - # Flatbuffer Size - + b"\x88\x77\x66\x55" - # Bytes Data Offset - + b"\xCC\xAA\x88\x66" - # Bytes Data Size - + b"\xCC\xBB\xAA\x99\xCC\xBB\xAA\x99" -) - - -class TestVulkanDelegateHeader(unittest.TestCase): - def test_to_bytes(self) -> None: - header = VulkanDelegateHeader( - EXAMPLE_FLATBUFFER_OFFSET, - EXAMPLE_FLATBUFFER_SIZE, - EXAMPLE_BYTES_OFFSET, - EXAMPLE_BYTES_SIZE, - ) - self.assertEqual(header.to_bytes(), EXAMPLE_HEADER_DATA) - self.assertTrue(header.is_valid()) - - def test_from_bytes(self) -> None: - header = VulkanDelegateHeader.from_bytes(EXAMPLE_HEADER_DATA) - self.assertEqual(header.flatbuffer_offset, EXAMPLE_FLATBUFFER_OFFSET) - self.assertEqual(header.flatbuffer_size, EXAMPLE_FLATBUFFER_SIZE) - self.assertEqual(header.bytes_offset, EXAMPLE_BYTES_OFFSET) - self.assertEqual(header.bytes_size, EXAMPLE_BYTES_SIZE) - - def test_invalid_metadata(self) -> None: - WRONG_MAGIC_DATA = EXAMPLE_HEADER_DATA[0:4] + b"YT01" + EXAMPLE_HEADER_DATA[8:] - with self.assertRaisesRegex( - ValueError, - "Expected magic bytes to be b'VH00', but got b'YT01'", - ): - VulkanDelegateHeader.from_bytes(WRONG_MAGIC_DATA) - - WRONG_LENGTH_DATA = ( - EXAMPLE_HEADER_DATA[0:8] + b"\x1D\x00" + EXAMPLE_HEADER_DATA[10:] - ) - with self.assertRaisesRegex( - ValueError, "Expected header to be 30 bytes, but got 29 bytes." - ): - VulkanDelegateHeader.from_bytes(WRONG_LENGTH_DATA) - - with self.assertRaisesRegex( - ValueError, "Expected header to be 30 bytes, but got 31 bytes." - ): - VulkanDelegateHeader.from_bytes(EXAMPLE_HEADER_DATA + b"\x00") - - def test_invalid_flatbuffer_size(self) -> None: - header = VulkanDelegateHeader( - EXAMPLE_FLATBUFFER_OFFSET, - 0, - EXAMPLE_BYTES_OFFSET, - EXAMPLE_BYTES_SIZE, - ) - - with self.assertRaises(ValueError): - header.to_bytes() - - def test_invalid_constants_offset(self) -> None: - header = VulkanDelegateHeader( - EXAMPLE_FLATBUFFER_OFFSET, - EXAMPLE_FLATBUFFER_SIZE, - EXAMPLE_FLATBUFFER_OFFSET + EXAMPLE_FLATBUFFER_SIZE - 1, - EXAMPLE_BYTES_SIZE, - ) - - with self.assertRaises(ValueError): - header.to_bytes() - - def test_to_bytes_same_as_from_bytes(self) -> None: - header = VulkanDelegateHeader.from_bytes(EXAMPLE_HEADER_DATA) - - to_bytes = header.to_bytes() - self.assertEqual(EXAMPLE_HEADER_DATA, to_bytes) diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py deleted file mode 100644 index b277dff2a76..00000000000 --- a/backends/vulkan/test/test_vulkan_passes.py +++ /dev/null @@ -1,317 +0,0 @@ -import unittest -from typing import Optional, Tuple - -import torch - -from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform -from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform -from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass - -from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_symmetric_quantization_config, - VulkanQuantizer, -) - -from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge - -from executorch.exir.backend.canonical_partitioners.config_partitioner import ( - format_target_name, -) -from torchao.quantization.linear_quant_modules import Int8DynActInt4WeightQuantizer - -from torchao.quantization.pt2e.quantize_pt2e import 
convert_pt2e, prepare_pt2e -from torchao.quantization.pt2e.quantizer import Quantizer - -################### -## Common Models ## -################### - - -class SingleLinearModule(torch.nn.Module): - def __init__(self, K=256, N=128): - super().__init__() - self.K = K - self.N = N - self.linear = torch.nn.Linear(K, N, bias=False) - - def forward(self, x): - return self.linear(x) - - def get_sample_inputs(self): - sample_inputs = (torch.rand(size=(32, self.K), dtype=torch.float32),) - return sample_inputs - - -########### -## Tests ## -########### - - -def quantize_and_lower_module( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - quantizer: Quantizer, - dynamic_shapes=None, -) -> EdgeProgramManager: - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. - _check_ir_validity=False, - ) - - program = torch.export.export_for_training( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() - - program = prepare_pt2e(program, quantizer) # pyre-ignore - # Calibrate - program(*sample_inputs) - - program = convert_pt2e(program) - - program = torch.export.export(program, sample_inputs, dynamic_shapes=dynamic_shapes) - - edge_program = to_edge( - program, - compile_config=edge_compile_config, - ) - - return edge_program - - -def get_target_canonical_name(node: torch.fx.Node) -> Optional[str]: - if node.op != "call_function": - return None - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name - - -def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) -> int: - count = 0 - for node in graph_module.graph.nodes: - canonical_name = get_target_canonical_name(node) - if canonical_name is not None and canonical_name == canonical_op_name: - count += 1 - return count - - -class TestVulkanPasses(unittest.TestCase): - - def test_fuse_int8pack_mm(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=8) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - - def test_fuse_linear_qcs4w(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=4) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - - @unittest.skip( - "linear_qta8a_qga4w currently does not support E2E dynamic quantization" - ) - def test_fuse_linear_qta8a_qga4w(self): - """Test fusion of dynamic activation + grouped weight quantized linear (QTA8A_QGA4W).""" - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = 
model.get_sample_inputs() - - # Use source transform quantizer for dynamic activation + grouped weight quantization - quantizer = Int8DynActInt4WeightQuantizer( - groupsize=128, # Group size for 4-bit weights - padding_allowed=False, - precision=torch.float32, - scales_precision=torch.float32, - device=torch.device("cpu"), - ) - - # Apply source transform quantization - quantized_model = quantizer.quantize(model) - - # Export the quantized model - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, - _check_ir_validity=False, - ) - - program = torch.export.export_for_training( - quantized_model, sample_inputs, strict=True - ).module() - - program = torch.export.export(program, sample_inputs) - - edge_manager = to_edge( - program, - compile_config=edge_compile_config, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - # Check that the linear_qta8a_qga4w operator was created - self.assertEqual(op_node_count(gm, "linear_qta8a_qga4w.default"), 1) - # Check that the original quantization/dequantization nodes were removed - self.assertEqual(op_node_count(gm, "quantize_per_token.default"), 0) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - self.assertEqual(op_node_count(gm, "linear.default"), 0) - - def test_fuse_rotary_emb(self): - """Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op.""" - - class RotaryEmbeddingModel(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor, - ): - # This implementation matches the apply_rotary_emb function in rope.py - # Split into real and imaginary parts - xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) - xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) - - # Reshape frequencies for broadcasting - freqs_cos = self._reshape_for_broadcast(freqs_cos, xq_r) - freqs_sin = self._reshape_for_broadcast(freqs_sin, xq_r) - - # Apply rotary embedding - xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin - xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos - xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin - xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos - - # Recombine real and imaginary parts - xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) - xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) - - return xq_out.type_as(xq), xk_out.type_as(xk) - - def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor): - """Helper function to reshape frequencies for broadcasting""" - ndim = x.ndim - freqs_cis_ndim = freqs_cis.ndim - if freqs_cis_ndim == 3: - # freqs_cis: (seq_len, n_heads, head_dim // 2) - shape = [ - d if (i == ndim - 3 or i == ndim - 2 or i == ndim - 1) else 1 - for i, d in enumerate(x.shape) - ] - else: - # freqs_cis: (seq_len, head_dim // 2) - shape = [ - d if i == 1 or i == ndim - 1 else 1 - for i, d in enumerate(x.shape) - ] - return freqs_cis.view(shape) - - # Create sample inputs based on the test file - batch_size = 1 - seq_len = 5 - n_heads = 32 - n_kv_heads = 8 - head_dim = 2048 - - xq = torch.randn(batch_size, seq_len, n_heads, head_dim, dtype=torch.float) - xk = torch.randn(batch_size, seq_len, n_kv_heads, head_dim, dtype=torch.float) - freqs_cos = torch.randn(seq_len, head_dim // 2, dtype=torch.float) - freqs_sin = torch.randn(seq_len, head_dim // 2, 
dtype=torch.float) - - sample_inputs = (xq, xk, freqs_cos, freqs_sin) - - model = RotaryEmbeddingModel() - - # Export the model - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, - _check_ir_validity=False, - ) - - program = torch.export.export(model, sample_inputs, strict=True) - - edge_manager = to_edge( - program, - compile_config=edge_compile_config, - ) - - # Apply the rotary embedding pass - ep = edge_manager._edge_programs["forward"] - rotary_pass = FusePatternsPass(ep) - result = rotary_pass.call(ep.graph_module) - - # Verify that the pass was successful - self.assertTrue(result.modified) - - # Check that the custom op was created - gm = ep.graph_module - custom_op_count = 0 - for node in gm.graph.nodes: - if ( - node.op == "call_function" - and hasattr(node.target, "__name__") - and "apply_rotary_emb" in str(node.target) - ): - custom_op_count += 1 - - # We expect at least one custom op to be created - self.assertGreater(custom_op_count, 0) diff --git a/backends/vulkan/test/tester.py b/backends/vulkan/test/tester.py deleted file mode 100644 index b2066a06ec0..00000000000 --- a/backends/vulkan/test/tester.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Any, List, Optional, Sequence, Tuple - -import executorch -import executorch.backends.test.harness.stages as BaseStages - -import torch -from executorch.backends.test.harness import Tester as TesterBase -from executorch.backends.test.harness.stages import StageType -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner -from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_symmetric_quantization_config as get_symmetric_quantization_config_vulkan, - VulkanQuantizer, -) -from executorch.exir import EdgeCompileConfig -from executorch.exir.backend.partitioner import Partitioner -from torchao.quantization.pt2e.quantizer import Quantizer - - -class Quantize(BaseStages.Quantize): - def __init__( - self, - quantizer: Optional[Quantizer] = None, - quantization_config: Any | None = None, - calibrate: bool = True, - calibration_samples: Optional[Sequence[Any]] = None, - is_qat: Optional[bool] = False, - ): - super().__init__( - quantizer=quantizer or VulkanQuantizer(), - quantization_config=( - quantization_config or get_symmetric_quantization_config_vulkan() - ), - calibrate=calibrate, - calibration_samples=calibration_samples, - is_qat=is_qat, - ) - - -class Partition(BaseStages.Partition): - def __init__(self, partitioner: Optional[Partitioner] = None): - super().__init__( - partitioner=partitioner or VulkanPartitioner(), - ) - - -class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower): - def __init__( - self, - partitioners: Optional[List[Partitioner]] = None, - edge_compile_config: Optional[EdgeCompileConfig] = None, - ): - super().__init__( - default_partitioner_cls=VulkanPartitioner, - partitioners=partitioners, - edge_compile_config=edge_compile_config - or EdgeCompileConfig(_check_ir_validity=False), - ) - - -class VulkanTester(TesterBase): - def __init__( - self, - module: torch.nn.Module, - example_inputs: Tuple[torch.Tensor], - dynamic_shapes: Optional[Tuple[Any]] = None, - ): - stage_classes = ( - executorch.backends.test.harness.Tester.default_stage_classes() - | { - StageType.PARTITION: Partition, - 
StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower, - } - ) - - super().__init__( - module=module, - stage_classes=stage_classes, - example_inputs=example_inputs, - dynamic_shapes=dynamic_shapes, - ) diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py deleted file mode 100644 index 363ee37058d..00000000000 --- a/backends/vulkan/test/utils.py +++ /dev/null @@ -1,787 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import logging -from collections import OrderedDict -from copy import deepcopy - -from enum import auto, Enum -from typing import Any, List, Optional, Tuple - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner -from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner - -from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( - get_symmetric_quantization_config, - XNNPACKQuantizer, -) -from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend -from executorch.devtools import BundledProgram -from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.devtools.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from executorch.exir import ExecutorchProgramManager, to_edge_transform_and_lower -from executorch.extension.pybindings.portable_lib import ( # @manual - _load_for_executorch_from_buffer, -) -from executorch.extension.pytree import tree_flatten -from torch.export import export, export_for_training - -from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e - - -class QuantizationMode(Enum): - """Enum to describe how a model should be quantized.""" - - NONE = auto() - INT8_STATIC_PER_CHANNEL = auto() - - -def get_exported_graph( - model, - sample_inputs, - dynamic_shapes=None, - qmode=QuantizationMode.NONE, -) -> torch.fx.GraphModule: - export_training_graph = export_for_training( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() - - if qmode == QuantizationMode.NONE: - return export_training_graph - - quantizer = XNNPACKQuantizer() - - operator_config = get_symmetric_quantization_config(is_per_channel=True) - quantizer.set_global(operator_config) - - prepared_graph = prepare_pt2e(export_training_graph, quantizer) - prepared_graph(*sample_inputs) - converted_graph = convert_pt2e(prepared_graph) - - return converted_graph - - -def random_uniform_tensor(shape, low=0.0, high=1.0, device=None, dtype=None): - if dtype is None: - dtype = torch.float32 - - return torch.empty(shape, device=device, dtype=dtype).uniform_(low, high) - - -def export_model_to_vulkan( - model, - sample_inputs, - dynamic_shapes=None, - operator_blocklist=None, - operator_allowlist=None, - nn_module_blocklist=None, - nn_module_allowlist=None, - qmode=QuantizationMode.NONE, -): - compile_options = {} - exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode) - program = export( - exported_graph, - sample_inputs, - dynamic_shapes=dynamic_shapes, - strict=True, - ) - - edge_program = to_edge_transform_and_lower( - program, - partitioner=[ - VulkanPartitioner( - compile_options, - operator_blocklist=operator_blocklist, - 
operator_allowlist=operator_allowlist, - nn_module_blocklist=nn_module_blocklist, - nn_module_allowlist=nn_module_allowlist, - ) - ], - transform_passes=None, - compile_config=None, - ) - - executorch_program = edge_program.to_executorch() - - # Check if the delegate ID matches VulkanBackend - if ( - executorch_program.executorch_program.execution_plan[0].delegates[0].id - != VulkanBackend.__name__ - ): - raise RuntimeError( - f"Expected delegate ID {VulkanBackend.__name__}, but got {executorch_program.executorch_program.execution_plan[0].delegates[0].id}" - ) - - return executorch_program - - -def export_model_to_xnnpack( - model, - sample_inputs, - dynamic_shapes=None, - operator_blocklist=None, - operator_allowlist=None, - nn_module_blocklist=None, - nn_module_allowlist=None, - qmode=QuantizationMode.NONE, -): - compile_options = {} - exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode) - program = export( - exported_graph, - sample_inputs, - dynamic_shapes=dynamic_shapes, - strict=True, - ) - - edge_program = to_edge_transform_and_lower( - program, - partitioner=[XnnpackPartitioner(compile_options)], - transform_passes=None, - compile_config=None, - ) - - executorch_program = edge_program.to_executorch() - - # Check if the delegate ID matches XnnpackBackend - if ( - executorch_program.executorch_program.execution_plan[0].delegates[0].id - != XnnpackBackend.__name__ - ): - raise RuntimeError( - f"Expected delegate ID {XnnpackBackend.__name__}, but got {executorch_program.executorch_program.execution_plan[0].delegates[0].id}" - ) - - return executorch_program - - -def print_tensor_comparison_errors( - tensor1, tensor2, atol=1e-03, rtol=1e-03, max_errors=10 -): - """ - Print the first max_errors tensor indexes that exceed the absolute/relative tolerance - and the error at each of those locations. 
- - Args: - tensor1: First tensor to compare - tensor2: Second tensor to compare - atol: Absolute tolerance - rtol: Relative tolerance - max_errors: Maximum number of errors to print (default: 10) - """ - # Handle lists/tuples of tensors - if isinstance(tensor1, (list, tuple)) and isinstance(tensor2, (list, tuple)): - if len(tensor1) != len(tensor2): - print(f"Tensor count mismatch: {len(tensor1)} vs {len(tensor2)}") - return - - for i, (t1, t2) in enumerate(zip(tensor1, tensor2)): - print(f"\n=== Tensor {i} comparison ===") - print_tensor_comparison_errors(t1, t2, atol, rtol, max_errors) - return - - # Handle single tensor comparison - if not isinstance(tensor1, torch.Tensor) or not isinstance(tensor2, torch.Tensor): - print("Error: Both inputs must be torch.Tensor objects") - return - - if tensor1.shape != tensor2.shape: - print(f"Shape mismatch: {tensor1.shape} vs {tensor2.shape}") - return - - # Calculate absolute and relative errors - abs_diff = torch.abs(tensor1 - tensor2) - rel_diff = abs_diff / ( - torch.abs(tensor2) + 1e-8 - ) # Add small epsilon to avoid division by zero - - # Find locations where tolerance is exceeded - tolerance_mask = (abs_diff > atol) & (rel_diff > rtol) - - if not tolerance_mask.any(): - print("All values are within tolerance") - return - - # Get indices where tolerance is exceeded - error_indices = torch.nonzero(tolerance_mask, as_tuple=False) - total_errors = error_indices.shape[0] - - print(f"Found {total_errors} values exceeding tolerance (atol={atol}, rtol={rtol})") - print(f"Showing first {min(max_errors, total_errors)} errors:") - print("Index -> tensor1_value, tensor2_value, abs_error, rel_error") - - # Print first max_errors locations - for i in range(min(max_errors, total_errors)): - idx = tuple(error_indices[i].tolist()) - val1 = tensor1[idx].item() - val2 = tensor2[idx].item() - abs_err = abs_diff[idx].item() - rel_err = rel_diff[idx].item() - - print( - f"{idx} -> {val1:.6f}, {val2:.6f}, abs_err={abs_err:.6f}, rel_err={rel_err:.6f}" - ) - - -def check_outputs_equal( - model_output, ref_output, atol=1e-03, rtol=1e-03, first_output_only=False -): - """ - Helper function that checks if model output and reference output are equal with some tolerance. - Returns True if equal, False otherwise. 
- """ - # Convert OrderedDict to list if needed - if isinstance(ref_output, OrderedDict): - ref_output = list(ref_output.values()) - - # Compare the result from executor and eager mode directly - if isinstance(ref_output, tuple) or isinstance(ref_output, list): - # Multiple outputs executor always returns tuple, even if there is one output - if len(ref_output) != len(model_output): - print_tensor_comparison_errors(model_output, ref_output, atol, rtol) - return False - if first_output_only: - result = torch.allclose( - model_output[0], ref_output[0], atol=atol, rtol=rtol - ) - if not result: - print_tensor_comparison_errors( - model_output[0], ref_output[0], atol, rtol - ) - return result - else: - for i in range(len(ref_output)): - if not torch.allclose( - model_output[i], ref_output[i], atol=atol, rtol=rtol - ): - print(f"\n=== Output {i} comparison failed ===") - print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - return False - return True - else: - # If one output, eager returns tensor while executor tuple of size 1 - result = torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol) - if not result: - print_tensor_comparison_errors(model_output[0], ref_output, atol, rtol) - return result - - -def run_and_check_output( - reference_model: torch.nn.Module, - executorch_program: ExecutorchProgramManager, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - first_output_only=False, -) -> bool: - """ - Utility function that accepts an already lowered ExecuTorch program, executes it with - the provided sample input, and checks the output for correctness. - - Args: - executorch_program: Already lowered ExecutorchProgramManager - sample_inputs: Sample inputs to run the program with - reference_model: Reference model to generate reference outputs for comparison - atol: Absolute tolerance for output comparison - rtol: Relative tolerance for output comparison - first_output_only: Whether to compare only the first output - - Returns: - bool: True if outputs match within tolerance, False otherwise - """ - # Load the ExecutorTorch program - executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer) - - # Flatten inputs for execution - inputs_flattened, _ = tree_flatten(sample_inputs) - - # Run the ExecutorTorch program - model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) - - # Generate reference outputs using the reference model - ref_output = reference_model(*sample_inputs) - - # Check if outputs are equal - return check_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - -def make_copy_of_inputs(sample_inputs: Tuple[Any]) -> Tuple[Any]: - sample_inputs_copy = [] - for input_val in sample_inputs: - if isinstance(input_val, torch.Tensor): - sample_inputs_copy.append(input_val.clone()) - else: - sample_inputs_copy.append(deepcopy(input_val)) - return tuple(sample_inputs_copy) - - -def lower_module_and_test_output( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - dynamic_shapes=None, - test_inputs=None, - first_output_only=False, - operator_blocklist=None, - operator_allowlist=None, - nn_module_allowlist=None, - nn_module_blocklist=None, - xnnpack=False, -) -> bool: - """ - Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with - the given sample inputs. It then runs the lowered module and compares its - outputs with the outputs of the eager module. 
- - Returns: - bool: True if all comparisons pass, False otherwise. - """ - # Export model to Vulkan using the helper function - if xnnpack: - executorch_program = export_model_to_xnnpack( - model, - make_copy_of_inputs(sample_inputs), - dynamic_shapes, - operator_blocklist, - operator_allowlist, - nn_module_blocklist, - nn_module_allowlist, - ) - else: - executorch_program = export_model_to_vulkan( - model, - make_copy_of_inputs(sample_inputs), - dynamic_shapes, - operator_blocklist=operator_blocklist, - operator_allowlist=operator_allowlist, - nn_module_blocklist=nn_module_blocklist, - nn_module_allowlist=nn_module_allowlist, - ) - - executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer) - - inputs_flattened, _ = tree_flatten(sample_inputs) - - model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) - ref_output = model(*make_copy_of_inputs(sample_inputs)) - - if not check_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ): - return False - - if test_inputs is not None: - for test_input in test_inputs: - test_inputs_flattened, _ = tree_flatten(test_input) - model_output = executorch_module.run_method( - "forward", tuple(test_inputs_flattened) - ) - ref_output = model(*test_input) - - if not check_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ): - return False - - return True - - -def save_bundled_program( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - output_path: str, - method_name: str = "forward", - et_program: Optional[ExecutorchProgramManager] = None, - dynamic_shapes=None, -) -> str: - """ - Export a bundled .pte file containing the model and test cases. - - Args: - model: The PyTorch model to export - sample_inputs: Sample inputs for the model - output_path: Path where the bundled .pte file should be saved (should end with .bpte) - method_name: Name of the method to test (default: "forward") - et_program: Optional pre-exported ExecutorchProgramManager. If None, will export to Vulkan - dynamic_shapes: Optional dynamic shapes for export - - Returns: - str: Path to the saved bundled program file - """ - # If no ExecutorchProgramManager provided, export to Vulkan - if et_program is None: - et_program = export_model_to_vulkan(model, sample_inputs, dynamic_shapes) - - # Generate expected outputs by running the model - expected_outputs = [getattr(model, method_name)(*sample_inputs)] - - # Flatten sample inputs to match expected format - inputs_flattened, _ = tree_flatten(sample_inputs) - - # Create test suite with the sample inputs and expected outputs - test_suites = [ - MethodTestSuite( - method_name=method_name, - test_cases=[ - MethodTestCase( - inputs=inputs_flattened, - expected_outputs=expected_outputs, - ) - ], - ) - ] - - # Create bundled program - bp = BundledProgram(et_program, test_suites) - - # Serialize to flatbuffer - bp_buffer = serialize_from_bundled_program_to_flatbuffer(bp) - - # Ensure output path has correct extension - if not output_path.endswith(".bpte"): - output_path = output_path + ".bpte" - - # Write to file - with open(output_path, "wb") as file: - file.write(bp_buffer) - return output_path - - -def save_executorch_program( - executorch_program: ExecutorchProgramManager, - output_path: str, -) -> str: - """ - Save an ExecutorchProgramManager as a .pte file. 
- - Args: - executorch_program: The ExecutorchProgramManager to save - output_path: Path where the .pte file should be saved (should end with .pte) - - Returns: - str: Path to the saved .pte file - """ - # Ensure output path has correct extension - if not output_path.endswith(".pte"): - output_path = output_path + ".pte" - - # Write to file - with open(output_path, "wb") as file: - executorch_program.write_to_file(file) - - return output_path - - -def print_occurrences(edge_program, operator_list: List): - """ - Print the input/output information for all occurrences of specified operators in the edge program. - - Args: - edge_program: The edge program created by to_edge_transform_and_lower - operator_list: List of operators to search for in the graph - """ - logger = logging.getLogger("") - logger.setLevel(logging.INFO) - - logger.info( - f"Searching for occurrences of {len(operator_list)} operators in the graph..." - ) - - occurrence_count = 0 - - for node in edge_program.exported_program().graph.nodes: - if utils.is_torch_op_node(node): - target = node.target - # Handle auto_functionalized nodes - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - if hasattr(first_arg, "name"): - target = first_arg.name() - elif hasattr(first_arg, "__name__"): - target = first_arg.__name__ - - # Check if this operator is in our list - if target in operator_list: - occurrence_count += 1 - logger.info(f"Occurrence {occurrence_count}: {node.format_node()}") - - # Get the node I/O string using the utils function - try: - io_str = utils.node_io_str(node) - logger.info(f" {io_str}") - except Exception as e: - logger.info(f" Error getting I/O string: {e}") - - if occurrence_count == 0: - logger.info("No occurrences of the specified operators found in the graph.") - else: - logger.info( - f"Found {occurrence_count} total occurrences of the specified operators." - ) - - -def op_ablation_test( # noqa: C901 - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - dynamic_shapes=None, - test_inputs=None, - first_output_only=False, -) -> dict: - """ - Fast binary search utility function to determine which operators work correctly when delegated to Vulkan. - - This function uses a binary search approach to efficiently find bad operators: - 1. Split operators into two halves (least frequent first, most frequent second) - 2. Test each half to see if it produces correct output - 3. Add good halves to known_good_ops and recursively search bad halves - 4. 
Continue until all operators are classified - - Args: - model: The PyTorch model to test - sample_inputs: Sample inputs for the model - atol: Absolute tolerance for output comparison - rtol: Relative tolerance for output comparison - dynamic_shapes: Optional dynamic shapes for export - test_inputs: Optional additional test inputs - first_output_only: Whether to compare only the first output - - Returns: - dict: Dictionary with keys: - - 'good_operators': List of operators that work correctly - - 'bad_operators': List of operators that cause failures - - 'operator_frequencies': Dictionary mapping operators to their occurrence count - - 'all_operators': List of all unique operators found in the graph - - 'test_count': Number of tests performed - """ - logger = logging.getLogger("") - logger.setLevel(logging.INFO) - - logger.info("Starting fast binary search operator ablation test...") - - # Step 1: Export model to get edge_program and extract operators - export_training_graph = export_for_training( - model, sample_inputs, strict=True - ).module() - program = export( - export_training_graph, - sample_inputs, - dynamic_shapes=dynamic_shapes, - strict=True, - ) - edge_program = to_edge_transform_and_lower( - program, - partitioner=[], # No partitioner to get the full graph - transform_passes=None, - compile_config=None, - ) - - # Step 2: Scan edge_program.graph_module to obtain unique operators and their frequencies - operator_frequencies = {} - for node in edge_program.exported_program().graph.nodes: - if utils.is_torch_op_node(node): - target = node.target - # Handle auto_functionalized nodes - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - if hasattr(first_arg, "name"): - target = first_arg.name() - elif hasattr(first_arg, "__name__"): - target = first_arg.__name__ - - if target in operator_frequencies: - operator_frequencies[target] += 1 - else: - operator_frequencies[target] = 1 - - all_operators = list(operator_frequencies.keys()) - logger.info(f"Found {len(all_operators)} unique operators in the graph") - - # Sort operators by frequency (most frequent first for binary search) - operators_by_frequency = sorted( - all_operators, key=lambda op: operator_frequencies[op], reverse=True - ) - - logger.info("Operator frequencies (sorted by occurrence, most frequent first):") - for op in operators_by_frequency: - logger.info(f" {op.name()}: {operator_frequencies[op]} occurrences") - - # Global test counter - test_count = 0 - - def test_operator_set(ops_to_test: List, known_good_ops: List) -> bool: - """Test if a set of operators works correctly when combined with known good operators.""" - nonlocal test_count - test_count += 1 - - test_allowlist = known_good_ops + ops_to_test - logger.info( - f"Test {test_count}: Testing {len(ops_to_test)} operators with {len(known_good_ops)} known good" - ) - - try: - success = lower_module_and_test_output( - model=model, - sample_inputs=sample_inputs, - atol=atol, - rtol=rtol, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - first_output_only=first_output_only, - operator_allowlist=test_allowlist, - ) - logger.info(f" {'✓ PASS' if success else '✗ FAIL'}") - - # Log known good ops - logger.info(" Known good:") - for op in known_good_ops: - logger.info(f" * {op.name()}") - - # Log tested ops - logger.info(" Tested ops:") - for op in ops_to_test: - logger.info(f" * {op.name()}") - - return success - except Exception as e: - logger.info(f" ! 
Error: {e}") - return False - - def find_bad_operators( - ops_to_test: List, known_good_ops: List - ) -> Tuple[List, List]: - """ - Recursively find bad operators using binary search. - - Returns: - Tuple of (good_operators, bad_operators) from ops_to_test - """ - if not ops_to_test: - return [], [] - - if len(ops_to_test) == 1: - # Base case: single operator - op = ops_to_test[0] - if test_operator_set([op], known_good_ops): - logger.info(f" Single operator {op.name()} is GOOD") - return [op], [] - else: - logger.info(f" Single operator {op.name()} is BAD") - return [], [op] - - # Split ops_to_test into two halves - mid = len(ops_to_test) // 2 - first_half = ops_to_test[:mid] # Least frequent operators - second_half = ops_to_test[mid:] # Most frequent operators - - logger.info( - f"Splitting {len(ops_to_test)} operators: {len(first_half)} + {len(second_half)}" - ) - - # Log known good ops - logger.info(" Known good:") - for op in known_good_ops: - logger.info(f" * {op.name()}") - - # Log first half ops - logger.info(" First half ops:") - for op in first_half: - logger.info(f" * {op.name()}") - - # Log second half ops - logger.info(" Second half ops:") - for op in second_half: - logger.info(f" * {op.name()}") - - good_ops = [] - bad_ops = [] - - first_half_good = test_operator_set(first_half, known_good_ops) - if first_half_good: - logger.info( - f"First half ({len(first_half)} ops) is good - adding to known good" - ) - good_ops.extend(first_half) - known_good_ops.extend(first_half) - - second_half_good = test_operator_set(second_half, known_good_ops) - if second_half_good: - logger.info( - f"Second half ({len(second_half)} ops) is good - adding to known good" - ) - good_ops.extend(second_half) - - if not first_half_good: - logger.info(f"First half ({len(first_half)} ops) is bad - recursing") - sub_good, sub_bad = find_bad_operators(first_half, known_good_ops) - good_ops.extend(sub_good) - bad_ops.extend(sub_bad) - known_good_ops.extend(sub_good) - if not second_half_good: - logger.info(f"Second half ({len(second_half)} ops) is bad - recursing") - sub_good, sub_bad = find_bad_operators(second_half, known_good_ops) - good_ops.extend(sub_good) - bad_ops.extend(sub_bad) - - return good_ops, bad_ops - - # Start the binary search - logger.info( - f"\n=== Starting binary search on {len(operators_by_frequency)} operators ===" - ) - good_operators, bad_operators = find_bad_operators(operators_by_frequency, []) - - # Summary of results - logger.info(f"\n=== Binary search complete after {test_count} tests ===") - logger.info(f"Good operators ({len(good_operators)}):") - for op in good_operators: - logger.info(f" ✓ {op.name()} (frequency: {operator_frequencies[op]})") - - logger.info(f"Bad operators ({len(bad_operators)}):") - for op in bad_operators: - logger.info(f" ✗ {op.name()} (frequency: {operator_frequencies[op]})") - - print_occurrences(edge_program, bad_operators) - - efficiency_gain = len(all_operators) - test_count - logger.info( - f"Efficiency: {test_count} tests instead of {len(all_operators)} (saved {efficiency_gain} tests)" - ) - - return { - "good_operators": good_operators, - "bad_operators": bad_operators, - "operator_frequencies": operator_frequencies, - "all_operators": all_operators, - "test_count": test_count, - } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp deleted file mode 100644 index 07d28229221..00000000000 --- a/backends/vulkan/test/utils/test_utils.cpp +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Copyright (c) Meta 
Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -#include -#include -#include - -using namespace vkcompute; - -bool is_bitw8(vkapi::ScalarType dtype) { - return dtype == vkapi::kByte || dtype == vkapi::kChar || - dtype == vkapi::kQInt8 || dtype == vkapi::kQUInt8; -} - -vkapi::ShaderInfo get_nchw_to_tensor_shader( - const api::vTensor& v_dst, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_dst.storage_type()); - add_dtype_suffix(kernel_name, v_dst.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (v_dst.storage_type() == utils::kBuffer) { - kernel_name = "nchw_to_buffer"; - add_dtype_suffix(kernel_name, v_dst.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "nchw_to_image"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_dst.storage_type()); - add_dtype_suffix(kernel_name, v_dst.dtype()); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -vkapi::ShaderInfo get_tensor_to_nchw_shader( - const api::vTensor& v_src, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_src.storage_type()); - add_dtype_suffix(kernel_name, v_src.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (v_src.storage_type() == utils::kBuffer) { - kernel_name = "buffer_to_nchw"; - add_dtype_suffix(kernel_name, v_src.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "image_to_nchw"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_src.storage_type()); - add_dtype_suffix(kernel_name, v_src.dtype()); - - return VK_KERNEL_FROM_STR(kernel_name); -} -// -// Operator Recording Functions -// - -void record_nchw_to_buffer_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; - - context->submit_compute_job( - get_nchw_to_tensor_shader(v_dst, true, false), - pipeline_barrier, - {uint32_t(v_dst.numel()), 1, 1}, - {64, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - v_dst.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - src_buffer, - v_dst.buffer_meta_ubo()); -} - -void record_buffer_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer) { - vkapi::PipelineBarrier pipeline_barrier{}; - context->submit_compute_job( - get_tensor_to_nchw_shader(v_src, true, false), - pipeline_barrier, - {uint32_t(v_src.numel()), 1, 1}, - {64, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - dst_buffer, - v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - 
v_src.buffer_meta_ubo()); -} - -void record_nchw_to_image_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; - - context->submit_compute_job( - get_nchw_to_tensor_shader( - v_dst, - context->adapter_ptr()->has_full_int8_buffers_support(), - false), - pipeline_barrier, - v_dst.logical_limits(), - adaptive_work_group_size(v_dst.logical_limits()), - specialization_constants, - VK_NULL_HANDLE, - 0, - v_dst.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - src_buffer, - v_dst.sizes_ubo()); -} - -void record_image_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()}; - - context->submit_compute_job( - get_tensor_to_nchw_shader(v_src, true, false), - pipeline_barrier, - v_src.logical_limits(), - adaptive_work_group_size(v_src.logical_limits()), - specialization_constants, - VK_NULL_HANDLE, - 0, - dst_buffer, - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); -} - -void record_bitw8_image_to_nchw_nobitw8buffer_op( - api::Context* const context, - api::vTensor& v_src, - api::StagingBuffer& dst_buffer) { - vkapi::PipelineBarrier pipeline_barrier{}; - uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); - utils::uvec3 global_wg_size = {buffer_len, 1, 1}; - - std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer_no_pc"; - add_storage_type_suffix(kernel_name, v_src.storage_type()); - add_dtype_suffix(kernel_name, v_src.dtype()); - - context->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - global_wg_size, - adaptive_work_group_size(global_wg_size), - {v_src.hashed_layout()}, - VK_NULL_HANDLE, - 0, - dst_buffer.buffer(), - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo(), - v_src.numel_ubo()); -} - -void record_binary_op( - api::Context* const context, - const std::string& op_name, - api::vTensor& v_in1, - api::vTensor& v_in2, - api::vTensor& v_dst) { - std::string kernel_name = "binary_" + op_name + "_nobroadcast__test"; - add_dtype_suffix(kernel_name, v_dst.dtype()); - - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - context->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - v_dst.logical_limits(), - adaptive_work_group_size(v_dst.logical_limits()), - specialization_constants, - VK_NULL_HANDLE, - 0, - v_dst.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_dst.sizes_ubo()); -} - -void execute_and_check_add( - api::vTensor& a, - api::vTensor& b, - api::vTensor& c, - float a_val, - float b_val) { - // Fill input tensors - fill_vtensor(a, a_val); - fill_vtensor(b, b_val); - - // a + b = c - record_binary_op(api::context(), "add", a, b, c); - - // Extract output tensor - std::vector data_out = extract_vtensor(c); - - // Check output - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, (a_val + b_val)); - } -} - -void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) { - std::string kernel_name("idx_fill_buffer"); - switch 
(v_ten.dtype()) { - case vkapi::kFloat: - kernel_name += "_float"; - break; - case vkapi::kHalf: - kernel_name += "_half"; - break; - case vkapi::kQInt8: - kernel_name += "_int8"; - break; - case vkapi::kQUInt8: - kernel_name += "_uint8"; - break; - default: - throw std::runtime_error("Unsupported dtype"); - break; - } - - api::ParamsBuffer params(api::context(), int32_t(v_ten.numel())); - - { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - {uint32_t(v_ten.numel()), 1, 1}, - {64, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - v_ten.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::READ), - params.buffer()); - } -} - -void record_scalar_add_buffer( - api::Context* context, - api::vTensor& v_ten, - float offset) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {SV(offset)}; - std::string kernel = "scalar_add_buffer"; - add_dtype_suffix(kernel, v_ten.dtype()); - api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel), - pipeline_barrier, - {uint32_t(v_ten.numel()), 1, 1}, - {64, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - v_ten.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE), - v_ten.numel_ubo()); -} - -void record_reference_matmul( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2) { - vkapi::PipelineBarrier pipeline_barrier{}; - api::context()->submit_compute_job( - VK_KERNEL(reference_matmul), - pipeline_barrier, - {uint32_t(out.size(1)), uint32_t(out.size(0)), 1}, - {64, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - out.sizes_ubo(), - out.strides_ubo(), - mat1.sizes_ubo(), - mat1.strides_ubo(), - mat2.sizes_ubo(), - mat2.strides_ubo()); -} - -void record_matmul_texture3d( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2) { - std::string kernel_name = "matmul_naive"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, out.storage_type()); - add_dtype_suffix(kernel_name, out.dtype()); - - utils::uvec3 global_wg_size = out.logical_limits(); - - vkapi::PipelineBarrier pipeline_barrier{}; - api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - global_wg_size, - {8, 8, 1}, - {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()}, - VK_NULL_HANDLE, - 0, - out.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - out.sizes_ubo(), - out.logical_limits_ubo(), - mat1.sizes_ubo(), - mat2.sizes_ubo()); -} - -// -// Input & Output Utilities -// - -#define FORALL_SUPPORTED_TYPES(_) \ - _(uint8_t, Byte) \ - _(int8_t, Char) \ - _(int32_t, Int) \ - _(executorch::aten::Half, Half) \ - _(float, Float) \ - _(int8_t, QInt8) - -void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - 
std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - staging_buffer.copy_from( \ - data_converted.data(), vten.staging_buffer_nbytes()); \ - } break; - - switch (vten.dtype()) { - FORALL_SUPPORTED_TYPES(CASE) - default: - VK_THROW("Unsupported dtype"); - } - -#undef CASE - - if (vten.storage_type() == utils::StorageType::BUFFER) { - record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten); - } else { - record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten); - } -} - -void fill_vtensor(api::vTensor& vten, float val, bool iota) { - std::vector vten_data(vten.staging_buffer_numel()); - if (iota) { - std::iota(vten_data.begin(), vten_data.end(), val); - } else { - std::fill(vten_data.begin(), vten_data.end(), val); - } - - fill_vtensor(vten, vten_data); -} - -std::vector create_random_float_buffer( - const size_t numel, - const float min, - const float max) { - std::vector data(numel); - std::default_random_engine rng; - std::uniform_real_distribution dist(min, max); - - for (size_t i = 0; i < data.size(); ++i) { - data[i] = dist(rng); - } - return data; -} - -std::vector create_random_uint8_buffer( - const size_t numel, - const uint8_t min, - const uint8_t max) { - std::vector data(numel); - std::default_random_engine rng; - std::uniform_real_distribution dist(min, max); - - for (size_t i = 0; i < data.size(); ++i) { - data[i] = (uint8_t)dist(rng); - } - return data; -} - -void fill_vtensor( - ComputeGraph& graph, - const IOValueRef idx, - float val, - bool iota) { - std::vector data(graph.numel_of(idx.value)); - if (graph.storage_type_of(idx.value) != utils::kBuffer) { - data.resize(graph.staging_buffer_numel_of(idx.value)); - } - if (iota) { - std::iota(data.begin(), data.end(), val); - } else { - std::fill(data.begin(), data.end(), val); - } - - graph.copy_into_staging(idx.staging, data.data(), data.size()); -} - -void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer( - api::context(), vten.dtype(), vten.staging_buffer_numel()); - - if (vten.storage_type() == utils::StorageType::BUFFER) { - record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); - } else { - record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer()); - } - - vkapi::VulkanFence fence = api::context()->fences().get_fence(); - api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - staging_buffer.copy_to( \ - data_converted.data(), vten.staging_buffer_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ - } break; - - switch (vten.dtype()) { - FORALL_SUPPORTED_TYPES(CASE) - default: - VK_THROW("Unsupported dtype"); - } - -#undef CASE -} - -// -// Context Management -// - -void submit_to_gpu() { - vkapi::VulkanFence fence = api::context()->fences().get_fence(); - api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); -} - -vkapi::Allocation allocate_memory_for(const api::vTensor& vten) { - VmaAllocationCreateInfo alloc_create_info = - api::context()->adapter_ptr()->vma().gpuonly_resource_create_info(); - return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), alloc_create_info); -} - -VmaTotalStatistics get_vma_stats() { - return 
api::context()->adapter_ptr()->vma().get_memory_statistics(); -} - -size_t get_vma_allocation_count() { - return get_vma_stats().total.statistics.allocationCount; -} - -// -// Graph Test Utilities -// - -void execute_graph_and_check_output( - ComputeGraph& graph, - std::vector input_vals, - std::vector expected_outputs) { - assert(input_vals.size() == graph.inputs().size()); - assert(expected_outputs.size() == graph.outputs().size()); - - for (size_t i = 0; i < graph.inputs().size(); ++i) { - fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i)); - } - - graph.execute(); - - for (size_t i = 0; i < graph.outputs().size(); ++i) { - IOValueRef out_ioval = graph.outputs().at(i); - std::vector output_data( - graph.staging_buffer_numel_of(out_ioval.value)); - graph.copy_from_staging( - out_ioval.staging, output_data.data(), output_data.size()); - - for (size_t j = 0; j < graph.numel_of(out_ioval.value); ++j) { - CHECK_VALUE(output_data, j, expected_outputs.at(i)); - } - } -} - -vkcompute::ComputeGraph build_mm_graph( - int B, - int M, - int K, - int N, - vkcompute::vkapi::ScalarType dtype, - vkcompute::utils::StorageType in_out_stype, - vkcompute::utils::GPUMemoryLayout memory_layout, - const std::vector& mat2_data, - const bool prepack_mat2) { - using namespace vkcompute; - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - - std::vector mat1_size = {M, K}; - std::vector mat2_size = {K, N}; - std::vector out_size = {M, N}; - if (B > 1) { - mat1_size.resize(3); - mat1_size = {B, M, K}; - mat2_size.resize(3); - mat2_size = {B, K, N}; - out_size.resize(3); - out_size = {B, M, N}; - } - - IOValueRef mat1 = - graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout); - IOValueRef mat2{}; - - ValueRef mat2_w = graph.add_tensorref(mat2_size, dtype, mat2_data.data()); - - if (prepack_mat2) { - mat2.value = mat2_w; - } else { - mat2.value = - graph.add_tensor(mat2_size, dtype, in_out_stype, memory_layout); - mat2.staging = graph.set_input_tensor(mat2.value); - } - - IOValueRef out; - out.value = graph.add_tensor(out_size, dtype, in_out_stype, memory_layout); - - VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - return graph; -} - -bool check_close(float a, float b, float atol, float rtol) { - float max = std::max(std::abs(a), std::abs(b)); - float diff = std::abs(a - b); - return diff <= (atol + rtol * max); -} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h deleted file mode 100644 index 1fd40b6f815..00000000000 --- a/backends/vulkan/test/utils/test_utils.h +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include - -#include - -#include -#include -#include - -#include - -#define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - vkcompute::api::vTensor( \ - vkcompute::api::context(), \ - sizes, \ - vkapi::kFloat, \ - utils::StorageType::TEXTURE_3D, \ - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \ - allocate_memory); - -#define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - vkcompute::api::vTensor( \ - vkcompute::api::context(), \ - sizes, \ - vkapi::kFloat, \ - utils::StorageType::BUFFER, \ - utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ - allocate_memory); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - vkcompute::api::StagingBuffer staging_buffer_##tensor( \ - vkcompute::api::context(), \ - vkapi::kFloat, \ - tensor.staging_buffer_numel()); \ - record_nchw_to_image_op( \ - vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - vkcompute::api::StagingBuffer staging_buffer_##tensor( \ - vkcompute::api::context(), \ - vkapi::kFloat, \ - tensor.staging_buffer_numel()); \ - record_image_to_nchw_op( \ - vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer()); - -#define CHECK_VALUE(data, idx, expected) \ - do { \ - if (data[idx] != expected) { \ - std::cout << "Output at [" << idx << "] = " << data[idx] \ - << ", does not match expected value " << expected \ - << std::endl; \ - } \ - ASSERT_TRUE(data[idx] == expected); \ - } while (false) - -// -// Operator Recording -// - -void record_nchw_to_buffer_op( - vkcompute::api::Context* const context, - vkcompute::vkapi::VulkanBuffer& src_buffer, - vkcompute::api::vTensor& v_dst); - -void record_buffer_to_nchw_op( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_src, - vkcompute::vkapi::VulkanBuffer& dst_buffer); - -void record_nchw_to_image_op( - vkcompute::api::Context* const context, - vkcompute::vkapi::VulkanBuffer& src_buffer, - vkcompute::api::vTensor& v_dst); - -void record_image_to_nchw_op( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_src, - vkcompute::vkapi::VulkanBuffer& dst_buffer); - -void record_bitw8_image_to_nchw_nobitw8buffer_op( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_src, - vkcompute::api::StagingBuffer& dst_buffer); - -void record_conv2d_prepack_weights_op( - vkcompute::api::Context* const context, - vkcompute::vkapi::VulkanBuffer& src_buffer, - vkcompute::api::vTensor& v_dst, - const std::vector& original_sizes, - const bool transposed); - -void record_binary_op( - vkcompute::api::Context* const context, - const std::string& op_name, - vkcompute::api::vTensor& v_in1, - vkcompute::api::vTensor& v_in2, - vkcompute::api::vTensor& v_dst); - -void execute_and_check_add( - vkcompute::api::vTensor& a, - vkcompute::api::vTensor& b, - vkcompute::api::vTensor& c, - float a_val, - float b_val); - -void record_index_fill_buffer( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_ten); - -void record_scalar_add_buffer( - vkcompute::api::Context* context, - vkcompute::api::vTensor& v_ten, - float offset); - -void record_reference_matmul( - vkcompute::api::Context* context, - vkcompute::api::vTensor& out, - vkcompute::api::vTensor& mat1, - vkcompute::api::vTensor& mat2); - -void record_matmul_texture3d( - vkcompute::api::Context* context, - vkcompute::api::vTensor& out, - vkcompute::api::vTensor& mat1, - vkcompute::api::vTensor& mat2); - -// -// Input & Output Utilities -// - -inline 
std::vector create_random_float_vector( - const size_t numel, - const float min = 0.0f, - const float max = 1.0f) { - std::vector result(numel); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(min, max); - - for (size_t i = 0; i < numel; ++i) { - result[i] = dis(gen); - } - - return result; -} - -inline void fill_staging( - vkcompute::api::StagingBuffer& staging, - float val, - int numel = -1) { - if (numel < 0) { - numel = staging.numel(); - } - std::vector data(numel); - std::fill(data.begin(), data.end(), val); - staging.copy_from(data.data(), sizeof(float) * numel); -} - -void fill_vtensor(vkcompute::api::vTensor& vten, std::vector& data); - -void fill_vtensor(vkcompute::api::vTensor& vten, float val, bool iota = false); - -std::vector create_random_float_buffer( - const size_t numel, - const float min = 0, - const float max = 1); - -std::vector create_random_uint8_buffer( - const size_t numel, - const uint8_t min = 0, - const uint8_t max = 255); - -void fill_vtensor( - vkcompute::ComputeGraph& graph, - const vkcompute::IOValueRef idx, - float val, - bool iota = false); - -void extract_vtensor(vkcompute::api::vTensor& vten, std::vector& data); - -inline std::vector extract_vtensor(vkcompute::api::vTensor& vten) { - std::vector data_out(vten.staging_buffer_numel()); - extract_vtensor(vten, data_out); - return data_out; -} - -inline void check_staging_buffer( - vkcompute::api::StagingBuffer& staging, - float val, - int numel = -1) { - if (numel < 0) { - numel = staging.numel(); - } - std::vector data(numel); - staging.copy_to(data.data(), sizeof(float) * numel); - - for (size_t i = 0; i < data.size(); ++i) { - CHECK_VALUE(data, i, val); - } -} - -inline int64_t get_buf_idx( - vkcompute::ComputeGraph& graph, - vkcompute::IOValueRef ref, - const std::vector& tensor_coor) { - const std::vector& sizes = graph.sizes_of(ref.value); - - int64_t c = vkcompute::dim_at(sizes); - int64_t h = vkcompute::dim_at(sizes); - int64_t w = vkcompute::dim_at(sizes); - - int64_t ni = vkcompute::dim_at(tensor_coor); - int64_t ci = vkcompute::dim_at(tensor_coor); - int64_t hi = vkcompute::dim_at(tensor_coor); - int64_t wi = vkcompute::dim_at(tensor_coor); - - return (ni * c * h * w + ci * h * w + hi * w + wi); -} - -// -// Context Management -// - -void submit_to_gpu(); - -vkcompute::vkapi::Allocation allocate_memory_for( - const vkcompute::api::vTensor& vten); - -VmaTotalStatistics get_vma_stats(); - -size_t get_vma_allocation_count(); - -// -// Graph Test Utilities -// - -void execute_graph_and_check_output( - vkcompute::ComputeGraph& graph, - std::vector input_vals, - std::vector expected_outputs); - -#define CREATE_RAND_WEIGHT_TENSOR(name, sizes, dtype) \ - std::vector data_##name = \ - create_random_float_buffer(utils::multiply_integers(sizes)); \ - ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data()); - -vkcompute::ComputeGraph build_mm_graph( - int B, - int M, - int K, - int N, - vkcompute::vkapi::ScalarType dtype, - vkcompute::utils::StorageType in_out_stype, - vkcompute::utils::GPUMemoryLayout memory_layout, - const std::vector& mat2_data, - const bool prepack_mat2 = false); - -// -// Debugging Utilities -// - -#define PRINT_DATA(vec) \ - do { \ - std::cout << #vec << ": "; \ - print_vector(vec); \ - } while (false); - -#define PRINT_DATA_RANGE(vec, start, range) \ - do { \ - std::cout << #vec << "[" << start << ", " << (start + range) << "]: "; \ - print_vector(vec, start, range); \ - } while (false); - -template -void print_vector( - const 
std::vector& data, - size_t start = 0, - size_t range = 20) { - size_t end = data.size(); - if (range >= 1) { - end = std::min(data.size(), start + range); - } - for (size_t i = start; i < end; ++i) { - std::cout << data.at(i) << ", "; - } - std::cout << std::endl; -} - -// -// Misc. Utilities -// - -bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp deleted file mode 100644 index a193d02da88..00000000000 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ /dev/null @@ -1,3232 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -#include - -#include - -#include - -#include - -#include - -using namespace vkcompute; -using namespace vkcompute::api; - -std::vector -transpose_matrix(std::vector& mat, const int H, const int W) { - std::vector out(W * H); - for (int out_y = 0; out_y < H; ++out_y) { - for (int out_x = 0; out_x < W; ++out_x) { - out[out_x * H + out_y] = mat[out_y * W + out_x]; - } - } - return out; -} - -std::vector compute_reference_matmul( - std::vector& mat1, - std::vector& mat2, - const int M, - const int K, - const int N) { - std::vector out(M * N); - for (int out_y = 0; out_y < M; ++out_y) { - for (int out_x = 0; out_x < N; ++out_x) { - out[out_y * N + out_x] = 0; - for (int k = 0; k < K; ++k) { - out[out_y * N + out_x] += mat1[out_y * K + k] * mat2[k * N + out_x]; - } - } - } - return out; -} - -std::vector> standard_sizes_to_test = { - // 2D - {7, 11}, - {13, 6}, - // 3D - {2, 9, 7}, - {9, 15, 19}, - {7, 11, 24}, - {13, 8, 11}, - {12, 11, 19}, - // 4D - {2, 2, 3, 5}, - {9, 13, 11, 17}, - {17, 14, 18, 20}, - {7, 13, 12, 21}, - {3, 8, 13, 17}, -}; - -// -// Compute API Tests -// - -class VulkanComputeAPITest : public ::testing::Test { - public: - void SetUp() override { - // Make sure we are starting with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } - - void TearDown() override { - context()->flush(); - - // Make sure we are ending with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } -}; - -TEST_F(VulkanComputeAPITest, print_adapter) { - std::cout << *(context()->adapter_ptr()) << std::endl; -} - -#if defined(VULKAN_DEBUG) && defined(VK_KHR_pipeline_executable_properties) - -TEST_F(VulkanComputeAPITest, print_shader_executable_properties) { - context()->print_shader_executable_properties( - VK_KERNEL(binary_add_nobroadcast__test_half), {0}); -} - -#endif // VULKAN_DEBUG && VK_KHR_pipeline_executable_properties - -std::vector get_reference_strides( - const std::vector& sizes, - const utils::GPUMemoryLayout layout, - const bool flip_unsqueezed = false) { - int64_t C = utils::val_at(-3, sizes); - int64_t H = utils::val_at(-2, sizes); - int64_t W = utils::val_at(-1, sizes); - - int64_t numel = utils::multiply_integers(sizes); - - switch (layout) { - case utils::kWidthPacked: - switch (sizes.size()) { - case 1: - if (flip_unsqueezed) - return {1, numel, numel, numel}; - return {1}; - case 2: - if (flip_unsqueezed) - return {1, W, numel, numel}; - return {W, 1}; - case 3: - if (flip_unsqueezed) - return {1, W, H * W, numel}; - return {H * W, W, 1}; - case 4: - if (flip_unsqueezed) - return {1, W, H * W, C * H * W}; - return {C * H * 
W, H * W, W, 1}; - default: - return {}; - } - break; - case utils::kHeightPacked: - switch (sizes.size()) { - case 1: - if (flip_unsqueezed) - return {1, numel, numel, numel}; - return {1}; - case 2: - if (flip_unsqueezed) - return {H, 1, numel, numel}; - return {1, H}; - return {1, H}; - case 3: - if (flip_unsqueezed) - return {H, 1, H * W, numel}; - return {W * H, 1, H}; - case 4: - if (flip_unsqueezed) - return {H, 1, W * H, C * W * H}; - return {C * W * H, W * H, 1, H}; - default: - return {}; - } - case utils::kChannelsPacked: - switch (sizes.size()) { - case 1: - if (flip_unsqueezed) - return {1, numel, numel, numel}; - return {1}; - case 2: - if (flip_unsqueezed) - return {1, W, numel, numel}; - return {W, 1}; - case 3: - if (flip_unsqueezed) - return {C, W * C, 1, numel}; - return {1, W * C, C}; - case 4: - if (flip_unsqueezed) - return {C, W * C, 1, H * W * C}; - return {H * W * C, 1, W * C, C}; - default: - return {}; - } - } - return {}; -} - -/* - * Applies the following transformations to a tensor's dim_order vector: - * 1. Reverse the order of elements so that the fastest moving dimensions are - * first. - * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the - * width dimension, 1 represents the height dimension, and 2 represents the - * channels dimension. - * 3. Unsqueeze the dim_order vector to the next multiple of 4. - */ -std::vector create_whcn_dim_order( - const std::vector& dim_order) { - size_t ndim = dim_order.size(); - std::vector whcn_order(ndim); - - // Convert from NCHW to WHCN index, and flip the dim order so that the fastest - // moving dimension is first. - // example: { 1, 2, 0} -> { 2, 0, 1} - // {height, width, channels} -> {channels, width, height} - for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; - ++whcn_i, --nchw_i) { - whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); - } - - // Unsqueeze to the next multiple of 4 - size_t ndim_up4 = utils::align_up_4(ndim); - whcn_order.resize(ndim_up4); - - // Append unsqueezed dimensions - for (size_t i = ndim; i < ndim_up4; ++i) { - whcn_order.at(i) = i; - } - - return whcn_order; -} - -TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { - vkapi::ShaderInfo empty_shader_info; - EXPECT_FALSE(empty_shader_info); - EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr); - EXPECT_TRUE(empty_shader_info.src_code.size == 0u); -} - -bool compare_vectors( - const std::vector& v32, - const std::vector& v64) { - if (v32.size() != v64.size()) { - return false; - } - for (size_t i = 0; i < v32.size(); ++i) { - if (static_cast(v32[i]) != v64[i]) { - return false; - } - } - return true; -} - -TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { - // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> test_cases = { - {1, WHCN::kWidthDim, {0}}, - {1, WHCN::kHeightDim, {0}}, - {1, WHCN::kChannelsDim, {0}}, - {2, WHCN::kWidthDim, {0, 1}}, - {2, WHCN::kHeightDim, {1, 0}}, - {2, WHCN::kChannelsDim, {0, 1}}, - {3, WHCN::kWidthDim, {0, 1, 2}}, - {3, WHCN::kHeightDim, {0, 2, 1}}, - {3, WHCN::kChannelsDim, {1, 2, 0}}, - {4, WHCN::kWidthDim, {0, 1, 2, 3}}, - {4, WHCN::kHeightDim, {0, 1, 3, 2}}, - {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, - }; - - for (const auto& test_case : test_cases) { - const size_t& ndim = std::get<0>(test_case); - const int32_t packed_dim = std::get<1>(test_case); - const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, packed_dim); - - ASSERT_TRUE(dim_order == expected_dim_order); - } -} - 
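
The stride reference values above and the stride tests that follow both rely on the convention that a width-packed (contiguous, NCHW-ordered) tensor with sizes `{C, H, W}` has strides `{H*W, W, 1}`. As a minimal standalone sketch of that convention (illustrative only and not part of the deleted test file; `calc_width_packed_strides` is a hypothetical helper name used here for clarity):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative sketch: contiguous (width-packed) strides are built by walking
// the sizes from the innermost dimension outward, multiplying a running
// product. For sizes {C, H, W} this produces {H*W, W, 1}.
std::vector<int64_t> calc_width_packed_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (size_t i = sizes.size(); i > 0; --i) {
    strides[i - 1] = running;
    running *= sizes[i - 1];
  }
  return strides;
}

int main() {
  // One of the 3D entries from standard_sizes_to_test: {9, 15, 19}.
  for (int64_t s : calc_width_packed_strides({9, 15, 19})) {
    std::cout << s << " "; // prints: 285 19 1
  }
  std::cout << std::endl;
  return 0;
}
```

For the 3D test size `{9, 15, 19}` this yields `{285, 19, 1}`, which is the same value the `kWidthPacked` branch of `get_reference_strides` returns and that the stride test below asserts against.
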
-TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { - vTensor v_tensor_to_resize( - context(), - {25, 25, 25, 25}, - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked, - /*allocate_memory = */ false); - - for (const auto& sizes : standard_sizes_to_test) { - if (sizes.size() < 3) { - continue; - } - for (const auto& layout : - {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { - { - const int32_t packed_dim = static_cast(layout); - std::vector dim_order = - calculate_dim_order(sizes.size(), packed_dim); - std::vector strides = calculate_strides(sizes, dim_order); - int64_t numel = utils::multiply_integers(sizes); - - std::vector ref_strides = get_reference_strides(sizes, layout); - ASSERT_TRUE(strides == ref_strides); - - std::vector unsqueezed_strides = - flip_and_unsqueeze(strides, kTensorStrides, numel); - - std::vector ref_unsqueezed_strides = - get_reference_strides(sizes, layout, true); - - ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); - - std::vector whcn_dim_order = - flip_and_unsqueeze(dim_order, kTensorDimOrder, numel); - - std::vector ref_whcn_dim_order = - create_whcn_dim_order(dim_order); - - ASSERT_TRUE(whcn_dim_order == ref_whcn_dim_order); - - // Create new vTensor and check that the strides are correct - vTensor new_v_tensor( - context(), - sizes, - vkapi::kFloat, - utils::kBuffer, - layout, - /*allocate_memory = */ false); - - ASSERT_TRUE(new_v_tensor.strides() == ref_strides); - - // Resize vtensor and check that updated metadata is correct - v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); - ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); - } - } - } -} - -TEST_F(VulkanComputeAPITest, virtual_transpose_test) { - std::vector sizes = {7, 9, 11, 13}; - // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx - std::vector>> test_cases = { - {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}}, - {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}}, - {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}}, - }; - - for (const auto& test_case : test_cases) { - const int dim0 = test_case.at(0).at(0); - const int dim1 = test_case.at(0).at(1); - - const auto& expected_sizes = test_case.at(1); - const auto& expected_dim_order = test_case.at(2); - const auto& expected_axis_map = test_case.at(3); - const int expected_packed_dim = test_case.at(4).at(0); - - { - vTensor a_buffer = vTensor( - context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked); - - a_buffer.virtual_transpose(dim0, dim1); - EXPECT_TRUE(a_buffer.sizes() == expected_sizes); - EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order); - } - - { - vTensor a_texture = vTensor( - context(), - sizes, - vkapi::kFloat, - utils::kTexture3D, - utils::kWidthPacked); - a_texture.virtual_transpose(dim0, dim1); - EXPECT_TRUE(a_texture.sizes() == expected_sizes); - EXPECT_TRUE(a_texture.axis_map() == expected_axis_map); - EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim); - } - } -} - -TEST_F(VulkanComputeAPITest, view_of_view_test) { - constexpr int N = 3; - constexpr int C = 5; - constexpr int H = 17; - constexpr int W = 19; - - std::vector sizes = {N, C, H, W}; - - vTensor t1 = vTensor( - context(), sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked); - - vTensor t2 = vTensor(t1); - EXPECT_TRUE(t2.sizes() == sizes); - vTensor t3 = vTensor(t2); - EXPECT_TRUE(t2.sizes() == sizes); - - t2.virtual_transpose(1, 2); - std::vector expected_t2_sizes = {N, H, C, W}; - EXPECT_TRUE(t2.sizes() == 
expected_t2_sizes); - - // Because t3 was created before t2's metadata was updated, we need to first - // update t3's metadata to match t2's metadata. Then the transpose will yield - // the correct metadata. - t3.virtual_clone(t2); - t3.virtual_transpose(2, 3); - std::vector expected_t3_sizes = {N, H, W, C}; - EXPECT_TRUE(t3.sizes() == expected_t3_sizes); -} - -utils::ivec3 make_temp_ivec3(int x, int y, int z) { - return utils::ivec3{x, y, z}; -} - -TEST_F(VulkanComputeAPITest, vec_test) { - { - utils::vec3 v3({1, 2, 3}); - ASSERT_TRUE(v3[0] == 1); - ASSERT_TRUE(v3[1] == 2); - ASSERT_TRUE(v3[2] == 3); - v3 = {4, 5, 6}; - ASSERT_TRUE(v3[0] == 4); - ASSERT_TRUE(v3[1] == 5); - ASSERT_TRUE(v3[2] == 6); - } - - { - utils::uvec4 uv4({4, 3, 2, 1}); - ASSERT_TRUE(uv4[0] == 4); - ASSERT_TRUE(uv4[1] == 3); - ASSERT_TRUE(uv4[2] == 2); - ASSERT_TRUE(uv4[3] == 1); - uv4 = {11, 13, 12, 88}; - ASSERT_TRUE(uv4[0] == 11); - ASSERT_TRUE(uv4[1] == 13); - ASSERT_TRUE(uv4[2] == 12); - ASSERT_TRUE(uv4[3] == 88); - } - - // Test copy from same type - { - utils::ivec3 v{5, 6, 8}; - utils::ivec3 v2 = v; - - ASSERT_TRUE(v2[0] == 5); - ASSERT_TRUE(v2[1] == 6); - ASSERT_TRUE(v2[2] == 8); - } - - // Test copy from different type - { - utils::uvec3 v{5, 6, 8}; - utils::ivec3 v2 = v; - - ASSERT_TRUE(v2[0] == 5); - ASSERT_TRUE(v2[1] == 6); - ASSERT_TRUE(v2[2] == 8); - } - - // Test construction from temporary vec - { - utils::uvec3 v{make_temp_ivec3(4, 5, 10)}; - ASSERT_TRUE(v[0] == 4); - ASSERT_TRUE(v[1] == 5); - ASSERT_TRUE(v[2] == 10); - } - - // Test initalization from temporary vec - { - utils::uvec3 v = make_temp_ivec3(4, 5, 10); - ASSERT_TRUE(v[0] == 4); - ASSERT_TRUE(v[1] == 5); - ASSERT_TRUE(v[2] == 10); - } -} - -TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { - // Try to get shader from custom shader library - const vkapi::ShaderInfo& kernel = VK_KERNEL(test_shader); - - ASSERT_TRUE(kernel.kernel_name == "test_shader"); -} - -TEST_F(VulkanComputeAPITest, spec_var_classes_test) { - // Check equality operator - ASSERT_TRUE(SV(1.5f) == SV(1.5f)); - ASSERT_FALSE(SV(15.0f) == SV(15)); - ASSERT_FALSE(SV(1u) == SV(true)); - - size_t sv_size = sizeof(vkapi::SpecVar); - - vkapi::SpecVarList spec_vars = {}; - ASSERT_TRUE(spec_vars.size() == 0); - spec_vars = {SV(1.1f), SV(32), SV(45)}; - ASSERT_TRUE(spec_vars.size() == 3); - vkapi::SpecVarList spec_vars_other = {SV(2.6f), SV(true), SV(78u), SV(5.5f)}; - spec_vars.append(spec_vars_other); - ASSERT_TRUE(spec_vars.size() == 7); - - // Check validity of the data - const vkapi::SpecVar* data = spec_vars.data(); - ASSERT_TRUE(*(reinterpret_cast(data + 3)) == 2.6f); - ASSERT_TRUE(*(reinterpret_cast(data + 1)) == 32); - ASSERT_TRUE(*(reinterpret_cast(data + 5)) == 78u); - - // Check validity of the map entries - std::vector entries = - spec_vars.generate_map_entries(); - - for (size_t i = 0; i < spec_vars.size(); ++i) { - ASSERT_TRUE(entries[i].constantID == i); - ASSERT_TRUE(entries[i].offset == sv_size * i); - if (i != 4) { - ASSERT_TRUE(entries[i].size == 4); - } else { - ASSERT_TRUE(entries[i].size == 1); - } - } - - // Check copy - vkapi::SpecVarList spec_vars_copy(spec_vars); - ASSERT_TRUE(spec_vars_copy.size() == 7); - - // Check validity of the copied data - const vkapi::SpecVar* copy_data = spec_vars_copy.data(); - ASSERT_TRUE(*(reinterpret_cast(copy_data + 4)) == true); - ASSERT_TRUE(*(reinterpret_cast(copy_data + 2)) == 45); - ASSERT_TRUE(*(reinterpret_cast(copy_data + 6)) == 5.5f); -} - -TEST_F(VulkanComputeAPITest, spec_var_shader_test) { - size_t len = 
16; - StagingBuffer buffer(context(), vkapi::kFloat, len); - - float scale = 3.0f; - float offset = 1.5f; - - { - ParamsBuffer params(context(), int32_t(len)); - uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4)); - vkapi::PipelineBarrier pipeline_barrier{}; - context()->submit_compute_job( - VK_KERNEL(fill_buffer), - pipeline_barrier, - {64, 1, 1}, - {len_div4, 1, 1}, - {SV(scale), SV(offset)}, - VK_NULL_HANDLE, - 0, - buffer.buffer(), - params.buffer()); - } - - submit_to_gpu(); - - std::vector data(len); - buffer.copy_to(data.data(), buffer.nbytes()); - - for (size_t i = 0; i < len; ++i) { - CHECK_VALUE(data, i, scale * i + offset); - } -} - -TEST_F(VulkanComputeAPITest, update_params_between_submit) { - context()->set_cmd(/*reusable = */ true); - std::vector sizes = {4, 4, 2}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - std::string kernel_name("fill_texture__test"); - add_dtype_suffix(kernel_name, a.dtype()); - - struct Params final { - utils::ivec3 size; - int32_t fill; - utils::vec4 values; - }; - - Params block{ - {2, 4, 1}, - 0, - {5.0, 5.0, 5.0, 5.0}, - }; - - ParamsBuffer params(context(), block); - - { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - {4, 4, 4}, - {4, 4, 4}, - specialization_constants, - VK_NULL_HANDLE, - 0, - a.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - params.buffer()); - } - - StagingBuffer staging_buffer( - context(), vkapi::kFloat, a.staging_buffer_numel()); - record_image_to_nchw_op(context(), a, staging_buffer.buffer()); - - submit_to_gpu(); - check_staging_buffer(staging_buffer, 5.0f); - - Params new_block{ - {2, 4, 1}, - 0, - {4.0, 4.0, 4.0, 4.0}, - }; - - params.update(new_block); - - submit_to_gpu(); - check_staging_buffer(staging_buffer, 4.0f); -} - -template -void test_storage_buffer_type(const size_t len) { - StagingBuffer buffer(context(), dtype, len); - - std::string kernel_name("idx_fill_buffer"); - switch (dtype) { - case vkapi::kFloat: - kernel_name += "_float"; - break; - case vkapi::kHalf: - kernel_name += "_half"; - break; - case vkapi::kQInt8: - kernel_name += "_int8"; - break; - case vkapi::kQUInt8: - kernel_name += "_uint8"; - break; - default: - throw std::runtime_error("Unsupported dtype"); - break; - } - - ParamsBuffer params(context(), int32_t(len)); - - { - uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4)); - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - {64, 1, 1}, - {len_div4, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - buffer.buffer(), - params.buffer()); - } - - submit_to_gpu(); - - std::vector data(len); - buffer.copy_to(data.data(), buffer.nbytes()); - - for (size_t i = 0; i < len; ++i) { - CHECK_VALUE(data, i, T(i)); - } -} - -TEST_F(VulkanComputeAPITest, test_buffer_float) { - test_storage_buffer_type(16); -} - -TEST_F(VulkanComputeAPITest, test_buffer_float16) { - if (!context()->adapter_ptr()->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - test_storage_buffer_type(16); -} - -TEST_F(VulkanComputeAPITest, test_buffer_int8) { - if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_storage_buffer_type(16); -} - -TEST_F(VulkanComputeAPITest, test_zero_size_tensor) { - // Simple test 
that performs a + b -> c - - std::vector sizes = {0, 5, 7}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - // Fill input tensors - fill_vtensor(a, 2.5f); - fill_vtensor(b, 1.5f); - - // a + b -> c - record_binary_op(context(), "add", a, b, c); - - // Extract output tensor - std::vector data_out = extract_vtensor(c); - - // Assert all tensors are empty - ASSERT_TRUE(a.numel() == 0); - ASSERT_TRUE(b.numel() == 0); - ASSERT_TRUE(c.numel() == 0); - ASSERT_TRUE(a.nbytes() == 0); - ASSERT_TRUE(b.nbytes() == 0); - ASSERT_TRUE(c.nbytes() == 0); - - // Check output - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 4.0f); - } -} - -template -void run_buffer_tensor_sanity_check(vTensor& tensor) { - fill_vtensor(tensor, 0.0f, true); - - record_scalar_add_buffer(context(), tensor, 2.0f); - std::vector data_out = extract_vtensor(tensor); - - // Check output - for (size_t i = 0; i < tensor.numel(); ++i) { - CHECK_VALUE(data_out, i, i + 2.0f); - } -} - -TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) { - for (const auto& sizes : standard_sizes_to_test) { - for (const auto& dtype : {vkapi::kFloat, vkapi::kHalf, vkapi::kChar}) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->has_full_float16_buffers_support()) { - continue; - } - if (dtype == vkapi::kHalf && utils::multiply_integers(sizes) >= 2048) { - continue; - } - if (dtype == vkapi::kChar && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - continue; - } - if (dtype == vkapi::kChar && utils::multiply_integers(sizes) >= 128) { - continue; - } - for (const auto& layout : - {utils::kWidthPacked, - utils::kHeightPacked, - utils::kChannelsPacked}) { - vTensor a = vTensor(context(), sizes, dtype, utils::kBuffer, layout); - switch (dtype) { - case vkapi::kFloat: - run_buffer_tensor_sanity_check(a); - break; - case vkapi::kHalf: - run_buffer_tensor_sanity_check(a); - break; - case vkapi::kChar: - run_buffer_tensor_sanity_check(a); - break; - default: - VK_THROW("Unsupported dtype"); - } - } - } - } -} - -TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { - // Simple test that performs a + b -> c - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - // Fill input tensors - fill_vtensor(a, 2.5f); - fill_vtensor(b, 1.5f); - - // a + b -> c - record_binary_op(context(), "add", a, b, c); - - // Extract output tensor - std::vector data_out = extract_vtensor(c); - - // Check output - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 4.0f); - } -} - -TEST_F(VulkanComputeAPITest, tensor_alias_test) { - for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) { - std::vector sizes = {9, 9}; - - const size_t alloc_count_before = get_vma_allocation_count(); - - vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type); - - vTensor copy = vTensor(original); - - // Two tensors but only one additional allocation. 
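  // (The vTensor copy constructor creates a non-owning view over `original`'s
  // storage rather than allocating new memory, which is why the VMA allocation
  // count below only increases by one, i.e. for `original` itself.)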
- EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1); - EXPECT_TRUE(copy.is_view_of(original)); - - // Fill original tensor with some data - fill_vtensor(original, 2.5f, true); - - std::vector data_out(copy.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(copy, data_out); - - for (size_t i = 0; i < original.numel(); ++i) { - CHECK_VALUE(data_out, i, 2.5f + i); - } - } -} - -TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { - constexpr int M = 11; - constexpr int K = 23; - constexpr int N = 17; - std::vector mat1_sizes = {M, K}; - std::vector mat2_sizes = {N, K}; - std::vector out_sizes = {M, N}; - - for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) { - vTensor mat1 = vTensor( - context(), - mat1_sizes, - vkapi::kFloat, - storage_type, - utils::kWidthPacked); - vTensor mat2 = vTensor( - context(), - mat2_sizes, - vkapi::kFloat, - storage_type, - utils::kWidthPacked); - vTensor out = vTensor( - context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked); - - // Generate data - std::vector mat1_data = - create_random_float_buffer(mat1.staging_buffer_numel()); - std::vector mat2_data = - create_random_float_buffer(mat2.staging_buffer_numel()); - - // Create direct view and modify sizes and strides later - vTensor mat2_t = vTensor(mat2); - // Update sizes and strides of mat2_t to be that of a transposed tensor - mat2_t.virtual_transpose(0, 1); - - EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim); - - std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); - std::vector ref_out = - compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); - - // Fill original tensor with some data - fill_vtensor(mat1, mat1_data); - fill_vtensor(mat2, mat2_data); - - if (storage_type == utils::kTexture3D) { - record_matmul_texture3d(context(), out, mat1, mat2_t); - } else { - record_reference_matmul(context(), out, mat1, mat2_t); - } - - std::vector data_out(out.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(out, data_out); - - for (size_t i = 0; i < ref_out.size(); ++i) { - EXPECT_TRUE(check_close(data_out[i], ref_out[i])); - } - } -} - -TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { - // This test is the same as texture_add_sanity_check, except that the tensor - // memory is allocated in a deferred fashion - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - // No allocations made so far - EXPECT_TRUE(get_vma_allocation_count() == 0); - - std::vector data_a(a.staging_buffer_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.staging_buffer_numel()); - std::fill(data_b.begin(), data_b.end(), 1.5f); - - // Allocate memory at the last possible opportunity - vkapi::Allocation a_mem = allocate_memory_for(a); - a.image().bind_allocation(a_mem); - vkapi::Allocation b_mem = allocate_memory_for(b); - b.image().bind_allocation(b_mem); - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - // One allocation for each tensor - EXPECT_TRUE(get_vma_allocation_count() == 3); - - fill_vtensor(a, data_a); - fill_vtensor(b, data_b); - - record_binary_op(context(), "add", a, b, c); - - std::vector data_c(c.staging_buffer_numel()); - 
extract_vtensor(c, data_c); - - for (size_t i = 0; i < data_c.size(); ++i) { - CHECK_VALUE(data_c, i, 4.0f); - } -} - -TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { - // This test performs the following operations: - // 1. a + b -> c - // 2. c + d -> e - // and share memory between tensors whenever possible. - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor d = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor e = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - // No allocations made so far - EXPECT_TRUE(get_vma_allocation_count() == 0); - - // a and d can share the same memory allocation - vkapi::Allocation a_d_mem = allocate_memory_for(a); - a.image().bind_allocation(a_d_mem); - d.image().bind_allocation(a_d_mem); - // b and e can share the same memory allocation - vkapi::Allocation b_e_mem = allocate_memory_for(b); - b.image().bind_allocation(b_e_mem); - e.image().bind_allocation(b_e_mem); - // c must have its own memory allocation - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - // 3 allocations should be made - EXPECT_TRUE(get_vma_allocation_count() == 3); - - // Specify input data - std::vector data_a(a.staging_buffer_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.staging_buffer_numel()); - std::fill(data_b.begin(), data_b.end(), 1.5f); - std::vector data_d(b.staging_buffer_numel()); - std::fill(data_d.begin(), data_d.end(), 1.0f); - - // First, fill a and b with data - fill_vtensor(a, data_a); - fill_vtensor(b, data_b); - - // a + b -> c - record_binary_op(context(), "add", a, b, c); - - // Now d can be filled with data - fill_vtensor(d, data_d); - - // c + d -> e - record_binary_op(context(), "add", c, d, e); - - // Extract data from e - std::vector data_e(e.staging_buffer_numel()); - extract_vtensor(e, data_e); - - // Sanity check that the values are correct - for (size_t i = 0; i < data_e.size(); ++i) { - CHECK_VALUE(data_e, i, 5.0f); - } -} - -TEST_F(VulkanComputeAPITest, resource_bind_twice_fails) { - // Check that binding a resource that already has memory associated with it - // fails - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - // Try to double bind a resource, which should fail - vkapi::Allocation a_mem = allocate_memory_for(a); - EXPECT_THROW(a.image().bind_allocation(a_mem), vkapi::Error); -} - -TEST_F(VulkanComputeAPITest, resource_destructor_non_owning_memory) { - // Check that the destructor of a vTensor that does not own its memory - // does not free the memory - - vkapi::Allocation memory; - - // Default Allocation constructor should not allocate memory - EXPECT_TRUE(get_vma_allocation_count() == 0); - - std::vector sizes = {4, 4, 1}; - { - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - memory = allocate_memory_for(a); - EXPECT_TRUE(get_vma_allocation_count() == 1); - a.image().bind_allocation(memory); - } - - // Check that the memory is still allocated - EXPECT_TRUE(get_vma_allocation_count() == 1); -} - -TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { - // Try to encode a command buffer with a vTensor that does not have - // memory - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, 
/*allocate_memory = */ false); - - // No allocations yet - EXPECT_TRUE(get_vma_allocation_count() == 0); - - std::vector data_a(a.staging_buffer_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - - // Encoding a command buffer with a vTensor without memory should throw - EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error); -} - -TEST_F(VulkanComputeAPITest, texture_virtual_resize) { - context()->set_cmd(/*reusable = */ true); - std::vector sizes = {8, 12, 12}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(a) - DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(b) - - fill_staging(staging_buffer_a, 11.5f); - fill_staging(staging_buffer_b, 12.5f); - - record_binary_op(context(), "add", a, b, c); - - DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(c) - - submit_to_gpu(); - check_staging_buffer(staging_buffer_c, 24.0f); - - std::vector> new_sizes_list = { - {4, 2, 4}, {4, 3, 6}, {8, 12, 12}, {8, 1, 1}, {8, 11, 10}}; - - for (auto& new_sizes : new_sizes_list) { - a.virtual_resize(new_sizes); - b.virtual_resize(new_sizes); - c.virtual_resize(new_sizes); - - fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel()); - fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel()); - - submit_to_gpu(); - check_staging_buffer( - staging_buffer_c, - float(new_sizes[1] + new_sizes[2] + 56.5f), - c.numel()); - } -} - -// -// Compute Graph Tests -// - -#define EXTRACT_TENSOR(name) \ - std::vector data_##name(graph.staging_buffer_numel_of(name.value)); \ - graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); - -// The purpose of this test is simply to track the size of various classes over -// time, in the interest of making sure that they doesn't grow too large. -TEST_F(VulkanComputeAPITest, print_object_sizes) { -#define PRINT_SIZE(name) \ - std::cout << #name << " size: " << sizeof(name) << " B" << std::endl - PRINT_SIZE(vTensor); - PRINT_SIZE(Value); - PRINT_SIZE(StagingBuffer); - PRINT_SIZE(ComputeGraph); - PRINT_SIZE(DispatchNode); -#undef PRINT_SIZE - - // The actual sizes of each object is dependent on the platform. However, we - // can alert ourselves to any significant changes in the sizes of these - // objects by checking the `sizeof()` the class against some loose thresholds. 
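  // (The thresholds below are intentionally loose: the exact sizes depend on
  // the compiler, standard library, and pointer width, so these checks are
  // only meant to catch large accidental growth rather than small
  // platform-to-platform fluctuations.)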
- - // Current known size on 64 bit system: 1040 B - EXPECT_TRUE(sizeof(vTensor) < 1200); - // Current known size on 64 bit system: 80 B - EXPECT_TRUE(sizeof(Value) < 100); - // Current known size on 64 bit system: 120 B - EXPECT_TRUE(sizeof(StagingBuffer) < 500); - // Current known size on 64 bit system: 608 B - EXPECT_TRUE(sizeof(ComputeGraph) < 700); - // Current known size on 64 bit system: 248 B - EXPECT_TRUE(sizeof(DispatchNode) < 500); -} - -TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) { - const auto w = 16; - const auto h = 12; - const auto d = 1; - const utils::uvec3 image_extents = {w, h, d}; - - vkapi::Adapter* adapter_ptr = context()->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkFormat image_format = VK_FORMAT_R32G32B32A32_SFLOAT; - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type = VK_IMAGE_VIEW_TYPE_3D; - - VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); - - auto image = adapter_ptr->vma().create_image( - context()->device(), - vkapi::create_extent3d(image_extents), - image_format, - image_type, - context()->preferred_image_tiling(), - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ true); - - auto tensor = vTensor(context(), image); - - const auto exp_sizes = std::vector{w, h, d * 4}; - EXPECT_TRUE(tensor.sizes() == exp_sizes); - EXPECT_TRUE(tensor.packed_dim() == 2); - - const auto exp_numel = w * h * d * 4; - EXPECT_TRUE(tensor.numel() == exp_numel); -} - -TEST(VulkanComputeGraphTest, test_values_scalars) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx; - - idx = graph.add_scalar(4); - EXPECT_TRUE(graph.get_int(idx) == 4); - - idx = graph.add_scalar(5.5f); - EXPECT_TRUE(graph.get_double(idx) == 5.5f); -} - -TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); - const auto arr = graph.get_int_list(idx); - EXPECT_TRUE(arr->size() == 4); - for (int i = 0; i < 4; i++) { - EXPECT_TRUE(arr->at(i) == i + 1); - } -} - -TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx; - { - std::vector data = {5.0, 4.0, 3.0, 2.0, 1.0}; - idx = graph.add_scalar_list(std::move(data)); - } - const auto& arr = graph.get_double_list(idx); - EXPECT_TRUE(arr->size() == 5); - for (int i = 0; i < 5; i++) { - EXPECT_TRUE(arr->at(i) == (5 - i)); - } -} - -TEST(VulkanComputeGraphTest, test_values_string) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx; - { - std::string data = "hello, world"; - idx = graph.add_string(std::move(data)); - } - std::string stored = graph.get_string(idx); - EXPECT_TRUE(stored == "hello, world"); -} - -TEST(VulkanComputeGraphTest, empty_init_graphnode_test) { - ExecuteNode node(nullptr, {}); - - GraphConfig config; - ComputeGraph graph(config); - - // Encode an empty ExecuteNode and check that command buffer encoding does not - // crash. 
- graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {})); -} - -TEST(VulkanComputeGraphTest, test_zero_dim_tensor) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector size_big = {7, 3, 5}; - std::vector size_small = {}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat); - IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat); - - IOValueRef out = {}; - - out.value = graph.add_tensor(size_big, vkapi::kFloat); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_c = val_a + val_b; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_c); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_graph_with_buffer) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector sizes = {7, 13, 19}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat, utils::kBuffer); - - IOValueRef out = {}; - - out.value = graph.add_tensor(sizes, vkapi::kFloat, utils::kBuffer); - - auto addFn = VK_GET_OP_FN("aten.abs.default"); - addFn(graph, {a.value, out.value, kDummyValueRef, kDummyValueRef}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val = -i + 2.0f; - float expected_val = std::abs(val); - - fill_vtensor(graph, a, val); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, expected_val); - } - } -} - -TEST(VulkanComputeGraphTest, test_graph_view_of_view) { - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - - constexpr int N = 3; - constexpr int C = 5; - constexpr int H = 17; - constexpr int W = 19; - - std::vector orig_sizes = {N, C, H, W}; - - // Test a common view of view usage pattern. In delegate execution, the values - // of the graph are created first; then operators are added. As a result, - // creating views of views is a bit tricky because metadata updates to a view - // does not update the metadata of the view's views. Nonetheless, view - // operators have an implicit assumption that the metadata of the output is - // equivalent to the metadata of the input. Therefore, view operators must - // account for unseen updates to the input view by first calling - // `virtual_clone()` to make the output equivalent to the input before. - // modifying metadata. 
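  // A rough sketch of the pattern described above, using the vTensor member
  // functions exercised earlier in this file (the tensor names here are
  // placeholders):
  //
  //   out_tensor.virtual_clone(in_tensor);       // sync metadata with the input
  //   out_tensor.virtual_transpose(dim0, dim1);  // then apply the view update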
- - ValueRef t1 = graph.add_tensor(orig_sizes, vkapi::kFloat); - ValueRef t2 = graph.add_tensor_view(t1); - ValueRef t3 = graph.add_tensor_view(t2); - - ValueRef channels = graph.add_scalar(1); - ValueRef height = graph.add_scalar(2); - ValueRef width = graph.add_scalar(3); - - auto opFn = VK_GET_OP_FN("aten.transpose.int"); - - opFn(graph, {t1, channels, height, t2}); - std::vector t2_sizes = graph.sizes_of(t2); - std::vector expected_t2_sizes = {N, H, C, W}; - EXPECT_TRUE(t2_sizes == expected_t2_sizes); - - opFn(graph, {t2, height, width, t3}); - std::vector t3_sizes = graph.sizes_of(t3); - std::vector expected_t3_sizes = {N, H, W, C}; - EXPECT_TRUE(t3_sizes == expected_t3_sizes); -} - -TEST(VulkanComputeGraphTest, test_simple_graph) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector size_big = {1, 8, 8}; - std::vector size_small = {1, 1, 8}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat); - IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat); - - IOValueRef out = {}; - - out.value = graph.add_tensor(size_big, vkapi::kFloat); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_c = val_a + val_b; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_c); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - - std::vector sizes = {8, 64, 124}; - - // Build graph - - ValueRef scalar = graph.add_symint(1); - IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); - - IOValueRef out = {}; - out.value = a.value; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR("scalar_add_texture"), - graph.create_global_wg_size(a.value), - graph.create_local_wg_size(a.value), - // Inputs and Outputs - {{out.value, vkapi::MemoryAccessType::WRITE}}, - // Shader params buffers - {graph.logical_limits_ubo(a.value), - graph.get_or_create_int_param_buffer(scalar)}, - // Push constants - {}, - // Specialization Constants - {}, - // Resizing Logic - {}, - nullptr)); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - int scalar_val = i - 3.0f; - graph.set_symint(scalar, scalar_val); - - int32_t scalar_val_read = graph.read_symint(scalar); - EXPECT_TRUE(scalar_val_read == scalar_val); - - float val_a = i + 2.0f; - float val_out = val_a + scalar_val; - - fill_vtensor(graph, a, val_a); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_out); - } - } -} - -#define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ - std::vector data_##name(utils::multiply_integers(sizes)); \ - std::fill(data_##name.begin(), data_##name.end(), val); \ - ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data()); - -TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { - 
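  // This test prepacks two constant weight tensors (w1 and w2) with the
  // et_vk.prepack.default op and then computes e = (a + w1) * w2. The query
  // pool is enabled so that per-shader timings can be extracted and printed
  // after each execution.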
GraphConfig config; - config.enable_querypool = true; - ComputeGraph graph(config); - - std::vector size_big = {8, 73, 62}; - std::vector size_small = {8, 73, 1}; - - CREATE_WEIGHT_TENSOR(w1, size_small, vkapi::kFloat, 3.5f); - CREATE_WEIGHT_TENSOR(w2, size_small, vkapi::kFloat, 3.0f); - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat); - - ValueRef c = graph.add_tensor(size_big, vkapi::kFloat); - ValueRef e = graph.add_tensor(size_big, vkapi::kFloat); - - ValueRef w1_packed = graph.add_tensor(size_small, vkapi::kFloat); - ValueRef w2_packed = graph.add_tensor(size_small, vkapi::kFloat); - - auto prepackFn = VK_GET_OP_FN("et_vk.prepack.default"); - prepackFn(graph, {w1, w1_packed}); - prepackFn(graph, {w2, w2_packed}); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, w1_packed, kDummyValueRef, c}); - - auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); - mulFn(graph, {c, w2_packed, e}); - - IOValueRef out = {}; - out.value = e; - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_out = (i + 3.5f) * 3.0f; - - fill_vtensor(graph, a, i); - - // Execute graph - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_out); - } - - if (graph.context()->querypool()) { - graph.context()->querypool().extract_results(); - graph.context()->querypool().print_results(); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - size_t expected_vma_allocation_count = 0; - - std::vector size_big = {12, 64, 64}; - std::vector size_small = {12, 64, 64}; - - // Build graph and regularly check allocation counts - - IOValueRef a = graph.add_input_tensor( - size_big, - vkapi::kFloat, - /*shared_object_idx = */ 2); - IOValueRef b = graph.add_input_tensor( - size_small, - vkapi::kFloat, - /*shared_object_idx = */ 4); - - // +2: t.sizes_ubo() for each staging shader - expected_vma_allocation_count += 2; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - ValueRef c = graph.add_tensor( - size_big, - vkapi::kFloat, - /*shared_object_idx = */ 6); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, c}); - - // no new allocations if binary op uses push constants - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - IOValueRef d = graph.add_input_tensor( - size_small, - vkapi::kFloat, - /*shared_object_idx = */ 2); - - // +1: t.sizes_ubo() uniform buffer for staging shader - expected_vma_allocation_count += 1; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - ValueRef e = graph.add_tensor( - size_big, - vkapi::kFloat, - /*shared_object_idx = */ 4); - - auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); - mulFn(graph, {c, d.value, e}); - - // no new allocations if binary op uses push constants - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - IOValueRef out = {}; - out.value = e; - out.staging = graph.set_output_tensor(out.value); - - // +1: staging buffer input tensor - expected_vma_allocation_count += 1; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - graph.prepare(); - graph.prepack(); - - // +3: shared memory allocations for tensors - 
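  // (One allocation per distinct shared object index used above, i.e. 2, 4,
  // and 6; tensors d and e reuse the shared objects already created for a and
  // b, so they do not add further allocations.)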
expected_vma_allocation_count += 3; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - // Run graph - - std::vector> new_sizes_list = { - {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; - - for (auto& new_sizes : new_sizes_list) { - graph.virtual_resize(a.value, new_sizes); - graph.virtual_resize(b.value, new_sizes); - graph.virtual_resize(d.value, new_sizes); - graph.propagate_resize(); - - float val_a = new_sizes[1] + 4.0f; - float val_b = new_sizes[2] + 1.5f; - float val_d = new_sizes[0] + 2.0f; - float val_out = (val_a + val_b) * val_d; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - fill_vtensor(graph, d, val_d); - - // Execute graph - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_out); - } - } - - std::vector> new_sizes_list_2 = { - {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; - - for (auto& new_sizes : new_sizes_list_2) { - graph.resize_input(0, new_sizes); - graph.resize_input(1, new_sizes); - graph.resize_input(2, new_sizes); - graph.propagate_resize(); - - // Check output shape - EXPECT_TRUE(graph.sizes_of(out.value) == new_sizes); - - float val_a = new_sizes[1] + 6.0f; - float val_b = new_sizes[2] + 2.5f; - float val_d = new_sizes[0] + 4.0f; - float val_out = (val_a + val_b) * val_d; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - fill_vtensor(graph, d, val_d); - - // Execute graph - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_out); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; - - // Build graph - - IOValueRef a = graph.add_input_tensor( - size_big, vkapi::kFloat, /*shared_object_idx = */ 0); - IOValueRef b = graph.add_input_tensor( - size_small, vkapi::kFloat, /*shared_object_idx = */ 1); - - IOValueRef out = {}; - - out.value = - graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); - - // Perform the following compute - // - // a, b, out; - // { - // inter; - // { - // tmp = a + b - // tmp2 = tmp + a - // inter = tmp2 + b - // } - // { - // tmp = inter + b; - // tmp2 = tmp + a - // out = tmp2 + b; - // } - // } - { - TmpTensor inter(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(inter.sobj_idx == 3); - { - TmpTensor tmp(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp.sobj_idx == 4); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {a, b, kDummyValueRef, tmp}); - - TmpTensor tmp2(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp2.sobj_idx == 5); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp, a, kDummyValueRef, tmp2}); - - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp2, b, kDummyValueRef, inter}); - } - { - TmpTensor tmp(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp.sobj_idx == 4); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {inter, b, kDummyValueRef, tmp}); - - TmpTensor tmp2(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp2.sobj_idx == 5); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp, a, kDummyValueRef, tmp2}); - - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp2, b, kDummyValueRef, out}); - } - } - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run 
graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_tmp = val_a + val_b; - float val_tmp2 = val_tmp + val_a; - float val_inter = val_tmp2 + val_b; - float val_tmp_2 = val_inter + val_b; - float val_tmp2_2 = val_tmp_2 + val_a; - float val_out = val_tmp2_2 + val_b; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_out); - } - } -} - -TEST(VulkanComputeGraphTest, test_large_graph) { - auto build_start_time = std::chrono::system_clock::now(); - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - - int64_t input_w = 256; - int64_t input_h = 256; - int64_t input_c = 8; - - std::vector size_big = {input_c, input_h, input_w}; - std::vector size_small = {input_c, input_h, 1}; - - std::vector size_big_alt = {input_c / 2, input_h / 2, input_w / 2}; - std::vector size_small_alt = {input_c / 2, input_h / 2, 1}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat, 2); - IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat, 4); - - ValueRef c = graph.add_tensor(size_big, vkapi::kFloat, 6); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, c}); - - int n = 100; - - for (int i = 0; i < n; i++) { - addFn(graph, {c, b.value, kDummyValueRef, a.value}); - - addFn(graph, {a.value, b.value, kDummyValueRef, c}); - } - - IOValueRef out = {}; - out.value = c; - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - auto build_end_time = std::chrono::system_clock::now(); - - auto build_time = std::chrono::duration_cast( - build_end_time - build_start_time); - - std::stringstream ss; - for (int i = 0; i < 10; i++) { - auto resize_start_time = std::chrono::system_clock::now(); - if (i % 2 == 0) { - graph.resize_input(0, size_big_alt); - graph.resize_input(1, size_small_alt); - } else { - graph.resize_input(0, size_big); - graph.resize_input(1, size_small); - } - graph.propagate_resize(); - auto resize_end_time = std::chrono::system_clock::now(); - - auto resize_time = std::chrono::duration_cast( - resize_end_time - resize_start_time); - - float val_a = 1.0f; - float val_b = 2.0f; - - float val_e = val_a + val_b * (2 * n + 1); - - auto inference_start_time = std::chrono::system_clock::now(); - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - auto inference_end_time = std::chrono::system_clock::now(); - - auto inference_time = std::chrono::duration_cast( - inference_end_time - inference_start_time); - - for (int i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_e); - } - - ss << "[ ] Resize: " << std::setw(10) << std::right - << resize_time.count() << " us" << std::endl; - ss << "[ ] Inference: " << std::setw(10) << std::right - << inference_time.count() << " us" << std::endl; - } - ss << "[ ] Model Load:" << std::setw(10) << std::right - << build_time.count() << " us" << std::endl; - std::cout << ss.str(); -} - -void test_clone( - std::vector sizes, - utils::StorageType src_storage, - utils::GPUMemoryLayout src_layout, - utils::StorageType dst_storage, - utils::GPUMemoryLayout dst_layout) { - GraphConfig config; - ComputeGraph graph(config); - - IOValueRef a = - graph.add_input_tensor(sizes, 
vkapi::kFloat, src_storage, src_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout); - - auto copyFn = VK_GET_OP_FN("aten.clone.default"); - copyFn(graph, {a.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.propagate_resize(); - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int i = 0; i < graph.numel_of(a.value); ++i) { - EXPECT_TRUE(data_out[i] == data_a[i]); - } -} - -TEST(VulkanComputeGraphTest, test_clone) { - std::vector> cases{ - {utils::kWidthPacked, utils::kWidthPacked}, - {utils::kWidthPacked, utils::kChannelsPacked}, - {utils::kChannelsPacked, utils::kChannelsPacked}, - }; - - for (std::vector sizes : standard_sizes_to_test) { - for (auto& [src_layout, dst_layout] : cases) { - test_clone( - sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout); - test_clone( - sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout); - test_clone( - sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout); - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. +2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. 
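  // Illustrative check of the mapping described above, using the values chosen
  // for this test (c = 12 and channels packing, so each batch spans c / 4 = 3
  // texel planes along z):
  //   src (n=0, c=4, h=1, w=1) -> {x=1, y=1, z=0 * (c / 4) + 4 / 4 = 1}
  //   dst (n=1, c=8, h=2, w=0) -> {x=0, y=2, z=1 * (c / 4) + 8 / 4 = 5}
  {
    const int64_t src_z = 0 * (c / 4) + 4 / 4;
    const int64_t dst_z = 1 * (c / 4) + 8 / 4;
    EXPECT_TRUE(src_z == 1);
    EXPECT_TRUE(dst_z == c / 4 + 2);
  }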
- int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST( - VulkanComputeGraphTest, - DISABLED_test_etvk_copy_channel_offset_node_clean_boundary) { - // Tricky part for channel copy is handling the boundary across multiple copy. - // For example, when we concat two [3, 1, 1] nchw-tensors along the channel - // dimension, due to channel packing, elements from different source texel - // will be packed into same destination texel at the boundaries. - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - - // Make sure entire out tensor is zeroed. The zero tensor will be filled with - // zero later. 
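  // (Zero-filling the whole output first gives the boundary texels a known
  // value, so the checks at the end of this test can verify that channels
  // outside the two copied ranges are left untouched, i.e. still zero.)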
- copyFn( - graph, - {zero.value, - graph.add_scalar(c), - graph.add_scalar(0), - graph.add_scalar(0), - out.value}); - - int64_t a_src_offset = 0; - int64_t a_dst_offset = 2; - int64_t a_range = 5; - // a will write to channge [2, 7) - copyFn( - graph, - {a.value, - graph.add_scalar(a_range), - graph.add_scalar(a_src_offset), - graph.add_scalar(a_dst_offset), - out.value}); - - // b will write to channel [6, 11) - // Intentional for b to override channel=6 - int64_t b_src_offset = 0; - int64_t b_dst_offset = 6; - int64_t b_range = 5; - - copyFn( - graph, - {b.value, - graph.add_scalar(b_range), - graph.add_scalar(b_src_offset), - graph.add_scalar(b_dst_offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - float a_value = 1.0f; - float b_value = 2.0f; - float zero_value = 0.0f; - fill_vtensor(graph, a, a_value); - fill_vtensor(graph, b, b_value); - fill_vtensor(graph, zero, zero_value); - - graph.execute(); - - EXTRACT_TENSOR(out); - - for (int n_idx = 0; n_idx < n; n_idx++) { - // c_idx only up to a_range-1 because the expected overwrite by b - for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; - c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == a_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == b_value); - } - } - } - } - - // Also verify that data before a_dst_offset and after b_dst_offset + b_range - // are untouched. - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kInt, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. 
+2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. - int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_view_change_packing) { - std::vector> - layout_pairs = { - {utils::kWidthPacked, utils::kChannelsPacked}, - {utils::kWidthPacked, utils::kHeightPacked}, - {utils::kWidthPacked, utils::kWidthPacked}, - {utils::kHeightPacked, 
utils::kChannelsPacked}, - {utils::kHeightPacked, utils::kHeightPacked}, - {utils::kHeightPacked, utils::kHeightPacked}, - {utils::kChannelsPacked, utils::kChannelsPacked}, - {utils::kChannelsPacked, utils::kHeightPacked}, - {utils::kChannelsPacked, utils::kHeightPacked}, - }; - - int64_t n = 3; - int64_t c = 2; - int64_t h = 2; - int64_t w = 5; - std::vector size = {n, c, h, w}; - - for (auto layout_pair : layout_pairs) { - GraphConfig config; - ComputeGraph graph(config); - - IOValueRef in = - graph.add_input_tensor(size, vkapi::kFloat, layout_pair.first); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, layout_pair.second); - - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - viewFn(graph, {in.value, graph.add_none(), out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, in, 0.0, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // The extracted data is a flattened nchw buffer. Hence, should expect the - // all elements inside the out array to match the index. - for (int i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, i); - } - } -} - -class VulkanToFromGPUShaderTest : public ::testing::Test { - public: - void SetUp() override { - // Make sure we are starting with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } - - void TearDown() override { - context()->flush(); - - // Make sure we are ending with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } -}; - -template -void run_from_gpu_test( - std::vector& sizes, - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, - vkapi::ScalarType dtype = vkapi::kFloat, - utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->supports_16bit_storage_buffers()) { - return; - } - vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); - - std::string kernel_name("idx_fill_texture"); - add_dtype_suffix(kernel_name, vten.dtype()); - - int32_t offset = -50; - - { - vkapi::PipelineBarrier pipeline_barrier{}; - context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - vten.logical_limits(), - {4, 4, 4}, - {vten.packed_dim(), offset}, - VK_NULL_HANDLE, - 0, - vten.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - vten.sizes_ubo()); - } - - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); - - if (dtype == vkapi::kChar && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - record_bitw8_image_to_nchw_nobitw8buffer_op( - context(), vten, staging_buffer); - } else { - record_image_to_nchw_op(context(), vten, staging_buffer.buffer()); - } - - submit_to_gpu(); - - std::vector data_out(staging_buffer.numel()); - staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes()); - - for (int i = 0; i < vten.numel(); i++) { - CHECK_VALUE(data_out, i, i + offset); - } -} - -template -void round_trip_test( - std::vector& sizes, - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, - vkapi::ScalarType dtype = vkapi::kFloat, - utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->supports_16bit_storage_buffers()) { - return; - } - - vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); - - // Create and fill input staging 
buffer - StagingBuffer staging_buffer_in( - context(), dtype, vten.staging_buffer_numel()); - - std::vector data_in(staging_buffer_in.numel()); - for (int i = 0; i < staging_buffer_in.numel(); i++) { - data_in[i] = T(i * -1); - } - staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes()); - - // Output staging buffer - StagingBuffer staging_buffer_out( - context(), dtype, vten.staging_buffer_numel()); - - record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); - - // Copy data in and out of the tensor - if (dtype == vkapi::kChar && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - record_bitw8_image_to_nchw_nobitw8buffer_op( - context(), vten, staging_buffer_out); - } else { - record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer()); - } - - // Execute command buffer - submit_to_gpu(); - - // Extract data from output staging buffer - std::vector data_out(staging_buffer_out.numel()); - staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes()); - - // All indices should be equal to the input data - for (int i = 0; i < vten.numel(); i++) { - CHECK_VALUE(data_out, i, data_in[i]); - } -} - -template -void compute_graph_round_trip_test( - std::vector& sizes, - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, - vkapi::ScalarType dtype = vkapi::kFloat, - utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->supports_16bit_storage_buffers()) { - return; - } - - GraphConfig config; - ComputeGraph graph(config); - - ValueRef r_tensor = - graph.add_tensor(sizes, dtype, storage_type, memory_layout); - ValueRef r_staging_in = graph.set_input_tensor(r_tensor); - ValueRef r_staging_out = graph.set_output_tensor(r_tensor); - - graph.prepare(); - graph.prepack(); - - std::vector data_in(graph.numel_of(r_tensor)); - for (int i = 0; i < data_in.size(); i++) { - data_in[i] = T(i * -1); - } - graph.copy_into_staging(r_staging_in, data_in.data(), data_in.size()); - - graph.execute(); - - std::vector data_out(graph.staging_buffer_numel_of(r_tensor)); - graph.copy_from_staging(r_staging_out, data_out.data(), data_out.size()); - - for (int i = 0; i < data_in.size(); i++) { - CHECK_VALUE(data_out, i, data_in[i]); - } -} - -TEST(VulkanToFromGPUShaderTest, round_trip_tests) { - // The below tests will fill each texel element with the value of the linear - // buffer index that corresponds to it. The texel at position (0, 0, 0) will - // be filled with the values [0, 1, 2, 3], the texel at position (1, 0, 0) - // will be filled with the values [4, 5, 6, 7], and so forth. The contents of - // the texture are then written back to the CPU, and to check that the - // transfer has ben performed correctly the value at each index of the CPU - // data buffer should be equal to the index. - // - // The below test cases should ensure that the total number of elements does - // not exceed 2048, or else the tests will fail for FP16 textures due to - // precision issues. Half precision floating point formats can only represent - // integers from 2048 to 4096 using intervals of 2. - std::vector> to_test = { - // 2D sizes - {17, 21}, - {67, 23}, - {55, 33}, - // 3D sizes - {7, 9, 13}, - {21, 2, 19}, - {17, 17, 5}, - // 4D sizes - {7, 3, 13, 7}, - {11, 9, 9, 1}, - {3, 3, 3, 3}, - {3, 1, 7, 13}, - }; - - // These sizes are set such that the total number of elements is less than - // 128 which is the maximum representable value for int8. 
- std::vector> to_test_int8 = { - // 2D sizes - {14, 7}, - // 3D sizes - {3, 7, 5}, - {4, 2, 11}, - // 4D sizes - {3, 3, 3, 3}, - {7, 1, 6, 3}, - }; - -#define RUN_TESTS(ctype, dtype) \ - round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \ - round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \ - round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); \ - compute_graph_round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \ - compute_graph_round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \ - compute_graph_round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); - - for (auto& sizes : to_test) { - RUN_TESTS(float, vkapi::kFloat) - RUN_TESTS(executorch::aten::Half, vkapi::kHalf) - } - - for (auto& sizes : to_test_int8) { - RUN_TESTS(int8_t, vkapi::kChar); - } - -#undef RUN_TESTS -} - -// -// Operator Smoke Tests -// - -void test_binary_op( - std::string op_name, - std::vector sizes_big, - std::vector sizes_small, - vkapi::ScalarType dtype, - utils::GPUMemoryLayout memory_layout) { - GraphConfig config; - ComputeGraph graph(config); - - IOValueRef arg2{}; - - // Build graph - - IOValueRef arg1 = graph.add_input_tensor(sizes_big, dtype, memory_layout); - arg2 = graph.add_input_tensor(sizes_small, dtype, memory_layout); - - IOValueRef out; - out.value = graph.add_tensor(sizes_big, dtype, memory_layout); - - std::stringstream ss; - ss << "aten."; - ss << op_name; - ss << ".Tensor"; - VK_GET_OP_FN(ss.str()) - (graph, {arg1.value, arg2.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_arg1 = i + 1.5; - float val_arg2 = i - 3.5; - - float val_out = val_arg1 + val_arg2; - if (op_name == "sub") { - val_out = val_arg1 - val_arg2; - } - if (op_name == "mul") { - val_out = val_arg1 * val_arg2; - } - if (op_name == "div") { - val_out = val_arg1 / val_arg2; - } - - execute_graph_and_check_output(graph, {val_arg1, val_arg2}, {val_out}); - } -} - -#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked) - -#define CALL_TEST_FN_FOR_W_PACKED(_) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ - _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false) \ - _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true) - -#define CALL_TEST_FN_FOR_C_PACKED(_) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) \ - _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false) \ - _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true) - -TEST(VulkanComputeGraphOpsTest, add_smoke_test) { -#define RUN_TESTS(dtype, storage, layout) \ - test_binary_op("add", {17, 21}, {17, 21}, dtype, layout); \ - test_binary_op("add", {17, 21}, {1, 1}, dtype, layout); \ - test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout); \ - test_binary_op("sub", {11, 22}, {11, 1}, dtype, layout); \ - test_binary_op("add", {7, 17, 17}, {7, 17, 17}, dtype, layout); \ - test_binary_op("add", {7, 17, 17}, {7, 1, 17}, dtype, layout); \ - test_binary_op("sub", {9, 9, 7}, {9, 9, 7}, dtype, layout); 
\ - test_binary_op("sub", {9, 9, 7}, {9, 1, 1}, dtype, layout); - - CALL_TEST_FN_FORALL_CONDITIONS(RUN_TESTS); - -#undef RUN_TESTS -} - -void test_mm( - int B, - int M, - int K, - int N, - vkapi::ScalarType dtype, - utils::StorageType storage_type, - utils::GPUMemoryLayout memory_layout, - bool prepack = true) { - std::vector mat2_size = {B, K, N}; - - std::vector mat2_data(utils::multiply_integers(mat2_size)); - std::fill(mat2_data.begin(), mat2_data.end(), 2.0f); - ComputeGraph graph = build_mm_graph( - B, M, K, N, dtype, storage_type, memory_layout, mat2_data, prepack); - - graph.prepare(); - graph.prepack(); - - for (int i = 1; i < 4; i++) { - if (prepack) { - float val_mat1 = i; - float val_out = K * (val_mat1 * 2.0f); - execute_graph_and_check_output(graph, {val_mat1}, {val_out}); - } else { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = K * (val_mat1 * val_mat2); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } - } -} - -TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { -#define RUN_TESTS(dtype, storage_type, layout, prepack) \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); - - CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); - CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); - -#undef RUN_TESTS -} - -void test_mm_with_resize_reencode( - int B, - int M, - int K, - int N, - vkapi::ScalarType dtype, - utils::StorageType storage_type, - utils::GPUMemoryLayout memory_layout) { - ASSERT_TRUE(M > 1); - - std::vector mat2_size = {B, K, N}; - std::vector mat2_data(utils::multiply_integers(mat2_size)); - std::fill(mat2_data.begin(), mat2_data.end(), 2.0f); - - ComputeGraph graph = build_mm_graph( - B, M, K, N, dtype, storage_type, memory_layout, mat2_data, false); - - graph.prepare(); - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = K * (val_mat1 * val_mat2); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } - - // Switch to GEMV mode - int new_K = K / 2; - std::vector new_mat1_size = {1, new_K}; - std::vector new_mat2_size = {new_K, N}; - graph.resize_input(0, new_mat1_size); - graph.resize_input(1, new_mat2_size); - graph.propagate_resize(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = new_K * (val_mat1 * val_mat2); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } -} - -TEST(VulkanComputeGraphOpsTest, test_graph_resize_reencode) { - test_mm_with_resize_reencode( - /*B = */ 1, - /*M = */ 31, - /*K = */ 127, - /*N = */ 23, - vkapi::kFloat, - utils::kTexture3D, - utils::kWidthPacked); -} - -void test_grid_priors( - std::vector input_sizes, - std::vector output_sizes, - int stride, - double offset, - const std::vector& data_out_expected) { - GraphConfig config; - ComputeGraph graph(config); - - // Build graph - IOValueRef in = graph.add_input_tensor( - input_sizes, - vkapi::kFloat, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - IOValueRef out; - out.value = graph.add_tensor( - output_sizes, - 
vkapi::kFloat, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - VK_GET_OP_FN("et_vk.grid_priors.default") - (graph, - {in.value, - graph.add_scalar(stride), - graph.add_scalar(offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - // Resize input - graph.propagate_resize(); - - // run graph - graph.execute(); - - std::vector output_data(graph.staging_buffer_numel_of(out.value)); - graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); - - // check results - std::vector out_sizes = graph.sizes_of(out.value); - int h_out = utils::val_at(-2, out_sizes); - int w_out = utils::val_at(-1, out_sizes); - for (size_t i = 0; i < h_out; ++i) { - for (size_t j = 0; j < w_out; ++j) { - size_t idx_out = i * w_out + j; - CHECK_VALUE(output_data, idx_out, data_out_expected[idx_out]); - } - } -} - -TEST(VulkanComputeGraphOpsTest, grid_priors_test) { - test_grid_priors( - /*input size = */ {1, 5, 2, 3}, - /*output size = */ {6, 2}, - /*stride = */ 1, - /*offset = */ 0.0, - /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1}); - - test_grid_priors( - /*input size = */ {1, 5, 2, 3}, - /*output size = */ {6, 2}, - /*stride = */ 8, - /*offset = */ 0.5, - /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); -} - -void test_transpose_view_mm( - const int B, - const int M, - const int K, - const int N, - utils::StorageType storage_type) { - GraphConfig config; - config.expect_dynamic_shapes = true; - config.set_storage_type_override(storage_type); - ComputeGraph graph(config); - - std::vector mat1_size = {M, K}; - std::vector mat2_t_size = {N, K}; - std::vector out_size = {M, N}; - - std::vector mat1_small_size = {M - 4, K - 3}; - std::vector mat2_t_small_size = {N - 1, K - 3}; - - if (B > 1) { - mat1_size.resize(3); - mat1_size = {B, M, K}; - mat2_t_size.resize(3); - mat2_t_size = {B, N, K}; - out_size.resize(3); - out_size = {B, M, N}; - - mat1_small_size.resize(3); - mat1_small_size = {B, M - 4, K - 3}; - mat2_t_small_size.resize(3); - mat2_t_small_size = {B, N - 1, K - 3}; - } - - // Build graph; use shared objects to test views of shared objects - - IOValueRef mat1 = - graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked, 0); - IOValueRef mat2_transpose = graph.add_input_tensor( - mat2_t_size, vkapi::kFloat, utils::kWidthPacked, 1); - - ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value); - - ValueRef dim0; - ValueRef dim1; - - if (B > 1) { - dim0 = graph.add_scalar(1); - dim1 = graph.add_scalar(2); - } else { - dim0 = graph.add_scalar(0); - dim1 = graph.add_scalar(1); - } - - IOValueRef out; - out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked, 2); - - VK_GET_OP_FN("aten.transpose.int") - (graph, {mat2_transpose.value, dim0, dim1, mat2}); - VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = K * (val_mat1 * val_mat2); - - // Try at full size - graph.resize_input(0, mat1_size); - graph.resize_input(1, mat2_t_size); - graph.propagate_resize(); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - - // Try at reduced sizes - val_out = (K - 3) * (val_mat1 * val_mat2); - graph.resize_input(0, mat1_small_size); - graph.resize_input(1, mat2_t_small_size); - graph.propagate_resize(); - 
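- // Both inputs are filled with constants, so each output element is a dot
- // product over the reduction dimension: a sum of K' terms of
- // (val_mat1 * val_mat2), i.e. K' * val_mat1 * val_mat2. After the resize
- // above, the reduction dimension shrinks to K' = K - 3, which is why
- // val_out was recomputed before re-running the graph.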
execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } -} - -TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) { - for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { - test_transpose_view_mm(2, 7, 17, 5, storage_type); - } -} - -void test_to_copy() { - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - int M = 8; - int N = 8; - int K = 8; - // Build graph - IOValueRef in = graph.add_input_tensor( - {1, M, N, K}, - vkapi::kFloat, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - std::vector data_in = - create_random_float_buffer(M * N * K, -1024, 1024); - graph.copy_into_staging(in.staging, data_in.data(), data_in.size()); - - IOValueRef out; - out.value = graph.add_tensor( - {1, M, N, K}, - vkapi::kHalf, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - auto op = VK_GET_OP_FN("aten._to_copy.default"); - op(graph, - {in.value, - graph.add_none(), - graph.add_none(), - graph.add_none(), - graph.add_none(), - graph.add_none(), - graph.add_none(), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - graph.propagate_resize(); - graph.execute(); - - std::vector output_data(graph.numel_of(out.value)); - graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); - - EXPECT_EQ(data_in.size(), output_data.size()); - -#ifdef VULKAN_DEBUG - float mse_ex = 0.0f; - float mse_vk = 0.0f; -#endif - - // check results - for (size_t i = 0; i < output_data.size(); ++i) { - float input = data_in[i]; - torch::executor::Half expected_output = - static_cast(input); - uint16_t* expected_bits = reinterpret_cast(&expected_output); - torch::executor::Half output = output_data[i]; - uint16_t* output_bits = reinterpret_cast(&output); - -#ifdef VULKAN_DEBUG - std::string msg; - msg.reserve(64); - msg = "input = " + std::to_string(input) + "(0b" + - std::bitset<32>(*reinterpret_cast(&input)).to_string() + - "), expected output = " + std::to_string(expected_output) + "(0b" + - std::bitset<16>(*expected_bits).to_string() + - "), recieved output = " + std::to_string(output) + "(0b" + - std::bitset<16>(*output_bits).to_string() + ")"; - - std::cout << msg << std::endl; - - mse_ex += std::pow(expected_output - input, 2); - mse_vk += std::pow(output - input, 2); -#endif - - // Note: Torch executor half "rounds up" when converting to fp16 whereas - // most driver implementations of Vulkan's opFConvert() just truncates the - // extra bits for performance (rounding introduces conditional). - // Example: - // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011}, - // mantissa{0b10010011111101111100111}), - // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011}, - // mantissa{0b1001010000}), - // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011}, - // mantissa{0b1001001111}) - // Note: - // The vulkan mantissa exactly matches the first 10 - // bits of the input 23 bit mantissa. But since the 11th bit is 1, the - // torch half output is rounded up (essentially adding a 1). 
- // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000} - - EXPECT_TRUE( - (*output_bits == *expected_bits) || - /*rounding error*/ ((*output_bits + 1u) == *expected_bits)); - } - -#ifdef VULKAN_DEBUG - mse_ex /= output_data.size(); - mse_vk /= output_data.size(); - - std::cout << "=========================================================" - << std::endl; - std::cout << "mse_ex = " << mse_ex << ", mse_vk = " << mse_vk << std::endl; -#endif -} - -TEST(VulkanComputeGraphOpsTest, test_to_copy) { - if (context()->adapter_ptr()->supports_16bit_storage_buffers()) { - test_to_copy(); - } -} - -vkapi::ShaderInfo pick_dynamic_dispatch_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& additional_args) { - const ValueRef mat1 = args[1].refs[0]; - - std::string kernel_name = "dynamic_dispatch_test"; - if (graph->size_at(-2, mat1) == 1) { - kernel_name += "_var1"; - } else { - kernel_name += "_var2"; - } - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 pick_dynamic_dispatch_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef out = args[0].refs[0]; - return graph->logical_limits_of(out); -} - -utils::uvec3 pick_dynamic_dispatch_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)global_workgroup_size; - return {64, 1, 1}; -} - -void resize_dynamic_dispatch_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& additional_args) { - const ValueRef out = args[0].refs[0]; - const ValueRef mat1 = args[1].refs[0]; - - std::vector out_sizes = graph->sizes_of(mat1); - out_sizes.at(out_sizes.size() - 2) = 1; - - graph->virtual_resize(out, out_sizes); -} - -void add_dynamic_dispatch_test_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2, - const ValueRef out) { - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_dynamic_dispatch_shader, - pick_dynamic_dispatch_global_wg_size, - pick_dynamic_dispatch_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(mat1), - graph.sizes_pc_of(mat2)}, - // Specialization constants - {}, - // Resize Logic - {}, - resize_dynamic_dispatch_node)); -} - -vkcompute::ComputeGraph build_dynamic_dispatch_test_graph(int M, int N) { - using namespace vkcompute; - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - - vkapi::ScalarType dtype = vkapi::kFloat; - utils::StorageType in_out_stype = utils::kTexture3D; - utils::GPUMemoryLayout memory_layout = utils::kWidthPacked; - - std::vector mat1_size = {M, N}; - std::vector mat2_size = {M, N}; - std::vector out_size = {1, N}; - - IOValueRef mat1 = - graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout); - IOValueRef mat2{}; - - mat2.value = graph.add_tensor(mat2_size, dtype, in_out_stype, memory_layout); - mat2.staging = graph.set_input_tensor(mat2.value); - - IOValueRef out; - out.value = graph.add_tensor(out_size, dtype, in_out_stype, memory_layout); - - add_dynamic_dispatch_test_node(graph, mat1, mat2, out); - - out.staging = graph.set_output_tensor(out.value); - - return graph; -} - -void test_dynamic_dispatch(int M, int N) { - 
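- // Exercises DynamicDispatchNode: pick_dynamic_dispatch_shader re-selects
- // the shader variant on every dispatch, choosing "dynamic_dispatch_test_var1"
- // when the second-to-last dim of mat1 is 1 and "_var2" otherwise. Resizing
- // the inputs to a GEMV-like shape below therefore swaps shader variants
- // without rebuilding the graph; the expected outputs account for the
- // constant offset each shader variant adds.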
ComputeGraph graph = build_dynamic_dispatch_test_graph(M, N); - - graph.prepare(); - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - // 5.3 is a hardcoded offset in the compute shader - float val_out = M * (val_mat1 * val_mat2) + 5.5; - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } - - // Switch to GEMV mode - int new_N = N / 2; - std::vector new_mat1_size = {1, new_N}; - std::vector new_mat2_size = {1, new_N}; - graph.resize_input(0, new_mat1_size); - graph.resize_input(1, new_mat2_size); - graph.propagate_resize(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = (val_mat1 * val_mat2) + 2.25; - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } -} - -TEST(VulkanComputeGraphOpsTest, test_dynamic_dispatch_graph) { - test_dynamic_dispatch(128, 128); -} diff --git a/backends/vulkan/tools b/backends/vulkan/tools new file mode 120000 index 00000000000..1049695e9e7 --- /dev/null +++ b/backends/vulkan/tools @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/tools \ No newline at end of file diff --git a/backends/vulkan/tools/gpuinfo/TARGETS b/backends/vulkan/tools/gpuinfo/TARGETS deleted file mode 100644 index 10e3acb4b8c..00000000000 --- a/backends/vulkan/tools/gpuinfo/TARGETS +++ /dev/null @@ -1,50 +0,0 @@ -load("@fbcode_macros//build_defs:native_rules.bzl", "buck_filegroup") -load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") -load( - "@fbsource//tools/build_defs:platform_defs.bzl", - "ANDROID", -) -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "vulkan_spv_shader_lib", -) - -oncall("executorch") - -buck_filegroup( - name = "gpuinfo_shaders", - srcs = glob([ - "glsl/*", - ]), - visibility = [ - "PUBLIC", - ], -) - -vulkan_spv_shader_lib( - name = "gpuinfo_shader_lib", - is_fbcode = True, - spv_filegroups = { - ":gpuinfo_shaders": "glsl", - }, -) - -fb_xplat_cxx_binary( - name = "vulkan_gpuinfo", - srcs = glob([ - "**/*.cpp", - ]), - headers = glob([ - "**/*.h", - ]), - header_namespace = "/include", - include_directories = ["/include"], - platforms = ANDROID, - raw_headers = glob([ - "**/*.h", - ]), - deps = [ - ":gpuinfo_shader_lib", - "//executorch/backends/vulkan:vulkan_graph_runtime", - ], -) diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json deleted file mode 100644 index afb5cbc6c59..00000000000 --- a/backends/vulkan/tools/gpuinfo/config.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "reg_count": { - "enabled": true, - "threshold": 3, - "compensate": 0.1 - }, - "buf_cacheline_size": { - "enabled": true, - "threshold": 10, - "compensate": 0.1 - }, - "buffer_bandwidth": { - "enabled": true, - "range": 134217728, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "ubo_bandwidth": { - "enabled": true, - "range": 134217728, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "shared_bandwidth": { - "enabled": true, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "warp_size": { - "enabled": true, - "threshold": 3, - "compensate": 0.1 - }, - "tex_bandwidth": { - "enabled": true, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "tex_cacheline_concurr": { - "enabled": true, - "threshold": 3, - "compensate": 0.1 - } -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl deleted file mode 100644 index 38c9befec6f..00000000000 --- 
a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -$if MEMTYPE == "ubo": - ${layout_declare_ubo(0, "vec4", "A")} -$elif MEMTYPE == "buffer": - ${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)} -$else: - ${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)} - -${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; -layout(constant_id = 4) const int nvec = 1; -layout(constant_id = 5) const int local_group_size = 1; -// The address mask works as a modulo because x % 2^n == x & (2^n - 1). -// This will help us limit address accessing to a specific set of unique -// addresses depending on the access size we want to measure. -layout(constant_id = 6) const int addr_mask = 1; -layout(constant_id = 7) const int workgroup_width = 1; - -$if MEMTYPE == "shared": - shared vec4 A[nvec]; - -void main() { - - $if MEMTYPE == "shared": - A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0]; - memoryBarrierShared(); - - vec4 sum = vec4(0); - uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; - - int i = 0; - for (; i < niter; ++i){ - $for j in range(int(NUNROLL)): - sum *= A[offset]; - - // On each unroll, a new unique address will be accessed through the offset, - // limited by the address mask to a specific set of unique addresses - offset = (offset + local_group_size) & addr_mask; - } - - // This is to ensure no compiler optimizations occur - vec4 zero = vec4(i>>31); - - B[gl_LocalInvocationID[0]] = sum + zero; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml deleted file mode 100644 index b47e6ba2a3d..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -buf_bandwidth: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - NUNROLL: "16" - generate_variant_forall: - MEMTYPE: - - VALUE: ubo - - VALUE: buffer - - VALUE: shared - shader_variants: - - NAME: buf_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl deleted file mode 100644 index d9e36376909..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - - -${layout_declare_buffer(0, "r", "source", DTYPE)} -${layout_declare_buffer(1, "w", "destination", DTYPE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; -layout(constant_id = 4) const int stride = 1; -layout(constant_id = 5) const int pitch = 1; - -void main() { - float c = 0; - for (int i = 0; i < niter; ++i) { - const int zero = i >> 31; - c += source[zero + pitch * gl_GlobalInvocationID[0]]; - c += source[zero + stride + pitch * gl_GlobalInvocationID[0]]; - } - destination[0] = c; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml deleted file mode 100644 index 8570e14ea1b..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -buf_cacheline_size: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - shader_variants: - - NAME: buf_cacheline_size diff --git a/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl b/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl deleted file mode 100644 index cc63ae80c52..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_buffer(0, "w", "out_buff", DTYPE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int NITER = 1; - -void main() { - - $for k in range(int(NREG)): - float reg_data${k} = float(NITER) + ${k}; - - int i = 0; - for (; i < NITER; ++i) { - reg_data0 *= reg_data${int(NREG)-1}; - $for k in range(1, int(NREG)): - reg_data${k} *= reg_data${k-1}; - } - i = i >> 31; - - $for k in range(int(NREG)): - out_buff[${k} * i] = reg_data${k}; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/reg_count.yaml b/backends/vulkan/tools/gpuinfo/glsl/reg_count.yaml deleted file mode 100644 index ecdf87d362e..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/reg_count.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -reg_count: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - NREG: - - RANGE: [1, 512] - - shader_variants: - - NAME: reg_count diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl deleted file mode 100644 index 7ab67bd2d0a..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_sampler(0, "r", "A", DTYPE)} -${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; -layout(constant_id = 4) const int nvec = 1; -layout(constant_id = 5) const int local_group_size = 1; -// The address mask works as a modulo because x % 2^n == x & (2^n - 1). -// This will help us limit address accessing to a specific set of unique -// addresses depending on the access size we want to measure. -layout(constant_id = 6) const int addr_mask = 1; -layout(constant_id = 7) const int workgroup_width = 1; - -void main() { - vec4 sum = vec4(0); - uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; - - int i = 0; - for (; i < niter; ++i){ - VEC4_T in_texel; - $for j in range(int(NUNROLL)): - $if DIM == 0: - in_texel = texelFetch(A, ivec3(offset, 0, 0), 0); - $elif DIM == 1: - in_texel = texelFetch(A, ivec3(0, offset, 0), 0); - $elif DIM == 2: - in_texel = texelFetch(A, ivec3(0, 0, offset), 0); - - sum *= in_texel; - - // On each unroll, a new unique address will be accessed through the offset, - // limited by the address mask to a specific set of unique addresses - offset = (offset + local_group_size) & addr_mask; - } - - // This is to ensure no compiler optimizations occur - vec4 zero = vec4(i>>31); - - B[gl_LocalInvocationID[0]] = sum + zero; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml deleted file mode 100644 index 84da6938fd4..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -tex_bandwidth: - parameter_names_with_default_values: - DTYPE: float - NUNROLL: "16" - generate_variant_forall: - DIM: - - RANGE: [0, 2] - shader_variants: - - NAME: tex_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl deleted file mode 100644 index 62659c7bb88..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_sampler(0, "r", "in_tex", DTYPE)} -${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; - -void main() { - vec4 sum = vec4(0); - int i = 0; - for (; i < niter; ++i){ - $if DIM == 0: - sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0); - $elif DIM == 1: - sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0); - $elif DIM == 2: - sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0); - } - - // This is to ensure no compiler optimizations occur - vec4 zero = vec4(i>>31); - - out_buf[0] = sum + zero; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml deleted file mode 100644 index 6b557c9f66e..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -tex_cacheline_concurr: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DIM: - - RANGE: [0, 2] - shader_variants: - - NAME: tex_cacheline_concurr diff --git a/backends/vulkan/tools/gpuinfo/glsl/warp_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/warp_size.glsl deleted file mode 100644 index 352ce04a5c9..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/warp_size.glsl +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_buffer(0, "w", "out_buff", DTYPE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -$if METHOD == "scheduler": - shared int shared_counter; -$elif METHOD == "physical": - layout(constant_id = 3) const int NITER = 1; -$else: - $raise Exception("Unsupported value for warp_size") - -void main() { - - $if METHOD == "scheduler": - shared_counter = 0; - memoryBarrierShared(); - int i = atomicAdd(shared_counter, 1); - memoryBarrierShared(); - out_buff[gl_GlobalInvocationID[0]] = i; - $else: - int sum = 0; - for (int j = 0; j < NITER; ++j) { - // Integer division is an exemplary multi-cycle instruction that can - // hardly be optimized, thus reducing the impact of latency hiding. - sum += j / 3; - barrier(); - } - out_buff[gl_GlobalInvocationID[0]] = sum; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/warp_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/warp_size.yaml deleted file mode 100644 index 69587bd38d0..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/warp_size.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
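- 
- # The METHOD axis below generates two shader variants, warp_size_scheduler
- # and warp_size_physical; the warp_size() probe in include/architecture.h
- # dispatches them by these names (naming pattern inferred from the other
- # gpuinfo shaders, e.g. buf_bandwidth_<memtype>).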
- -warp_size: - parameter_names_with_default_values: - DTYPE: int32 - STORAGE: buffer - generate_variant_forall: - METHOD: - - VALUE: scheduler - - VALUE: physical - shader_variants: - - NAME: warp_size diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h deleted file mode 100644 index a46e9e6b9ae..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/app.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -#include "utils.h" - -namespace gpuinfo { - -class App { - private: - folly::dynamic config_; - - public: - size_t buf_cache_size; - uint32_t max_shared_mem_size; - uint32_t sm_count; - uint32_t nthread_logic; - uint32_t subgroup_size; - uint32_t max_tex_width; - uint32_t max_tex_height; - uint32_t max_tex_depth; - - App() { - context()->initialize_querypool(); - - std::cout << context()->adapter_ptr()->stringize() << std::endl - << std::endl; - - auto cl_device = get_cl_device(); - - sm_count = cl_device.getInfo(); - nthread_logic = cl_device.getInfo(); - buf_cache_size = cl_device.getInfo(); - max_shared_mem_size = cl_device.getInfo(); - max_tex_width = cl_device.getInfo(); - max_tex_height = cl_device.getInfo(); - max_tex_depth = cl_device.getInfo(); - - VkPhysicalDeviceSubgroupProperties subgroup_props{}; - VkPhysicalDeviceProperties2 props2{}; - - props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - props2.pNext = &subgroup_props; - subgroup_props.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; - vkGetPhysicalDeviceProperties2( - context()->adapter_ptr()->physical_handle(), &props2); - subgroup_size = subgroup_props.subgroupSize; - - std::cout << std::endl; - std::cout << "SM count," << sm_count << std::endl; - std::cout << "Logic Thread Count," << nthread_logic << std::endl; - std::cout << "Cache Size," << buf_cache_size << std::endl; - std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; - std::cout << "SubGroup Size," << subgroup_size << std::endl; - std::cout << "MaxTexWidth," << max_tex_width << std::endl; - std::cout << "MaxTexHeight," << max_tex_height << std::endl; - std::cout << "MaxTexDepth," << max_tex_depth << std::endl; - } - - float get_config(const std::string& test, const std::string& key) const { - if (config_[test].empty()) { - throw std::runtime_error("Missing config for " + test); - } - - if (!config_[test][key].isNumber()) { - throw std::runtime_error( - "Config for " + test + "." + key + " is not a number"); - } - - float value; - if (config_[test][key].isDouble()) { - value = config_[test][key].getDouble(); - } else { - value = config_[test][key].getInt(); - } - - std::cout << "Read value for " << test << "." 
<< key << " = " << value - << std::endl; - return value; - } - - bool enabled(const std::string& test) const { - if (config_.empty() || config_[test].empty() || - !config_[test]["enabled"].isBool()) { - return true; - } - return config_[test]["enabled"].getBool(); - } - - void load_config(std::string file_path) { - std::ifstream file(file_path); - std::stringstream buffer; - buffer << file.rdbuf(); - const std::string json_str = buffer.str(); - if (json_str.empty()) { - throw std::runtime_error( - "Failed to read config file from " + file_path + "."); - } - config_ = folly::parseJson(json_str); - } -}; -} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h deleted file mode 100644 index 9af908eb170..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include "app.h" -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -namespace gpuinfo { - -void reg_count(const App& app) { - if (!app.enabled("reg_count")) { - std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = app.get_config("reg_count", "compensate"); - const double THRESHOLD = app.get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StagingBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 30, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus" - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<3> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t" - << ngrp << "\t, time=\t" << time << "\tus" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent 
single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "MaxRegisters," << nreg_max << std::endl; - std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl; - std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl; - std::cout << "RegisterType," << reg_ty << std::endl; -} - -// Warp size is a difficult metric to obtain because the hardware limitations -// do not always coincide with the way the SM divides the workload. For -// instance, the hardware can have a warp size of 64 threads, but an SM might -// be able to simulate concurrency of 128 threads with a single scheduler. - -// Because of this, it is important to measure the warp size different ways, -// that can evidence both the physical limitations of the hardware, and the -// actual behavior of the driver. - -// Additionally,the SM can behave in two different ways when the assigned -// workload is smaller than the warp size. - -// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty -// threads and maintain a uniform workload. - -// In Case 2, like in Adreno, the driver might decide to pack multiple works -// together and dispatch them at once. -void warp_size(const App& app, const bool verbose = false) { - if (!app.enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = app.get_config("warp_size", "compensate"); - const double THRESHOLD = app.get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = app.subgroup_size; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. 
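- //
- // Illustrative (hypothetical) numbers: on a device with a 16-wide warp,
- // nthread = 1..16 should take roughly the same time since they fit in one
- // warp, while nthread = 17 spills into a second warp and latency jumps.
- // DtJumpFinder (see stats.h) flags the jump when the latest delta-time
- // deviates from its running average by more than THRESHOLD times that
- // average (COMPENSATE is a small additive fudge so sequences of identical
- // timings are handled smoothly); the warp size is then taken as nthread - 1.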
- uint32_t nthread = 1; - for (; nthread <= app.nthread_logic; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= app.nthread_logic) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. - auto bench_sm = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_scheduler"; - - benchmark_on_gpu(shader_name, 1, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - std::vector data(app.nthread_logic); - out_buf.copy_to(data.data(), out_buf.nbytes()); - - if (verbose) { - std::stringstream ss; - for (auto j = 0; j < nthread; ++j) { - ss << data[j] << " "; - } - std::cout << ss.str() << std::endl; - } - - // Check until which point is the data in ascending order. - int32_t last = -1; - int32_t j = 0; - for (; j < nthread; ++j) { - if (last >= data[j]) { - break; - } - last = data[j]; - } - - return j; - }; - - // Test increasing sizes until the data is no longer in ascending order. - uint32_t warp_size_scheduler = warp_size; - int i = 1; - for (; i <= app.nthread_logic; ++i) { - uint32_t nascend = bench_sm(i); - if (nascend != i) { - warp_size_scheduler = nascend; - break; - } - } - if (i > app.nthread_logic) { - std::cout << "Unable to conclude an SM Warp Size." << std::endl; - } - - std::cout << "PhysicalWarpSize," << warp_size << std::endl; - std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; -} -}; // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h deleted file mode 100644 index 31137b11eea..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include "app.h" -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -namespace gpuinfo { - -void buf_cacheline_size(const App& app) { - if (!app.enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate"); - const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = app.buf_cache_size / app.nthread_logic; - const uint32_t BUF_SIZE = app.buf_cache_size; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StagingBuffer out_buf(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {app.nthread_logic, 1, 1}, - {app.nthread_logic, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; -} - -void _bandwidth( - const App& app, - const std::string memtype, - const uint32_t range) { - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); - - auto test_name = memtype_lower + "_bandwidth"; - - // Cache lines flushed - const uint32_t NFLUSH = app.get_config(test_name, "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // buf_bandwidth.yaml - const uint32_t NUNROLL = app.get_config(test_name, "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange for - // higher latency. - const uint32_t NITER = app.get_config(test_name, "niter"); - // Vector dimensions (vec4) - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - // Number of vectors that fit in the selected memory space - const uint32_t NVEC = range / VEC_SIZE; - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read al l vectors - // The thread count doesn't divide by thread workload in shared memory - // because of the limited memory size. - const uint32_t NTHREAD = memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = app.nthread_logic; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const uint32_t addr_mask = nvec_access - 1; - - // This is to distribute the accesses to unique addresses across the - // workgroups, once the size of the access excedes the workgroup width. - const uint32_t workgroup_width = local_x * NITER * NUNROLL; - - StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StagingBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), - SV(nvec_access), - SV(local_x), - SV(addr_mask), - SV(workgroup_width)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; -} - -void buf_bandwidth(const App& app) { - if (!app.enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. 
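- // The shipped config.json sets "range" for this test to 134217728 bytes
- // (128 MiB). _bandwidth() then sweeps the unique working set from a single
- // vec4 (16 B) up to this range, doubling each step, and reports the maximum
- // and minimum bandwidth observed across the sweep.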
- const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); - _bandwidth(app, "Buffer", RANGE); -} - -void ubo_bandwidth(const App& app) { - if (!app.enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); - _bandwidth(app, "UBO", RANGE); -} - -void shared_mem_bandwidth(const App& app) { - if (!app.enabled("shared_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = app.max_shared_mem_size; - _bandwidth(app, "Shared", RANGE); -} -} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/stats.h b/backends/vulkan/tools/gpuinfo/include/stats.h deleted file mode 100644 index 123ed0d8bcb..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/stats.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Portions (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Code sourced from - * https://github.com/microsoft/ArchProbe/blob/main/include/stats.hpp with the - * following MIT license - * - * MIT License - * - * Copyright (c) Microsoft Corporation. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE - */ - -#pragma once -#include -#include - -template -class AvgStats { - T sum_ = 0; - uint64_t n_ = 0; - - public: - typedef T value_t; - - void push(T value) { - sum_ += value; - n_ += 1; - } - inline bool has_value() const { - return n_ != 0; - } - operator T() const { - return sum_ / n_; - } -}; - -template -class NTapAvgStats { - std::array hist_; - size_t cur_idx_; - bool ready_; - - public: - typedef T value_t; - - void push(T value) { - hist_[cur_idx_++] = value; - if (cur_idx_ >= NTap) { - cur_idx_ = 0; - ready_ = true; - } - } - inline bool has_value() const { - return ready_; - } - operator T() const { - double out = 0.0; - for (double x : hist_) { - out += x; - } - out /= NTap; - return out; - } -}; - -template -struct DtJumpFinder { - private: - NTapAvgStats time_avg_; - AvgStats dtime_avg_; - double compensation_; - double threshold_; - - public: - // Compensation is a tiny additive to give on delta time so that the algorithm - // works smoothly when a sequence of identical timing is ingested, which is - // pretty common in our tests. Threshold is simply how many times the new - // delta has to be to be recognized as a deviation. - DtJumpFinder(double compensation = 0.01, double threshold = 10) - : time_avg_(), - dtime_avg_(), - compensation_(compensation), - threshold_(threshold) {} - - // Returns true if the delta time regarding to the last data point seems - // normal; returns false if it seems the new data point is too much away from - // the historical records. - bool push(double time) { - if (time_avg_.has_value()) { - double dtime = std::abs(time - time_avg_) + (compensation_ * time_avg_); - if (dtime_avg_.has_value()) { - double ddtime = std::abs(dtime - dtime_avg_); - if (ddtime > threshold_ * dtime_avg_) { - return true; - } - } - dtime_avg_.push(dtime); - } - time_avg_.push(time); - return false; - } - - double dtime_avg() const { - return dtime_avg_; - } - double compensate_time() const { - return compensation_ * time_avg_; - } -}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h deleted file mode 100644 index c9ff133f1ec..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include "app.h" -#include "stats.h" -#include "utils.h" - -namespace gpuinfo { - -// Textures are drastically different from buffers in terms of data layout. -// While buffers are a contiguous range of memory, textures are opaque objects -// defined by the vendor and it is possible that nearby points of data are not -// neighboring in memory. Likewise, data points are accessed in -// multi-dimensional patches instead of simple lines. This makes the stride -// method for figuring out the cache line size not applicable. To go around -// this, this experiment runs an increasing amount of threads accessing -// different datapoints in the texture and measures latency. 
If the cache line -// is big enough to contain all requested data for the amount of threads, -// latency will be low. When there are more threads and hence more data than -// what a single cache line can handle, a second line must be fetched, -// increasing latency in a measurable way. -void tex_cacheline_concurr(const App& app) { - if (!app.enabled("tex_cacheline_concurr")) { - std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; - return; - } - - const uint32_t TEXEL_WIDTH = 4; - const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; - - const double COMPENSATE = - app.get_config("tex_cacheline_concurr", "compensate"); - const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold"); - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim - << ") ------" << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width - : dim == 1 ? app.max_tex_height - : app.max_tex_depth; - - const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE); - - auto bench = [&](uint32_t nthread) { - std::vector sizes_whd = { - app.max_tex_width, app.max_tex_height, app.max_tex_depth}; - - auto sizes_nchw = whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); - - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nthread = 1; - for (; nthread <= MAX_NTHREAD; ++nthread) { - double time = bench(nthread); - std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - auto max_concurrency = nthread - 1; - std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," - << max_concurrency * TEXEL_SIZE << std::endl; - break; - } - } - if (nthread >= MAX_NTHREAD) { - std::cout - << "Unable to conclude an optimal texture cacheline concurrency for dim " - << dim << std::endl; - }; - } - - // TODO: Use concurrency information to obtain the cache line size for - // textures as done in https://fburl.com/98xiou3g -} - -void tex_bandwidth(const App& app) { - if (!app.enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width - : dim == 1 ? app.max_tex_height - : app.max_tex_depth; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // tex_bandwidth.yaml - const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll"); - // Number of iterations. 
Increasing this value reduces noise in exchange - // for higher latency. - const uint32_t NITER = app.get_config("tex_bandwidth", "niter"); - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read all texells - const uint32_t NTHREAD = NVEC; - // Occupy all threads - const uint32_t local_x = app.nthread_logic; - // Ensure that global is a multiple of local, and distribute across all - // SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; - - auto shader_name = "tex_bandwidth_" + std::to_string(dim); - - std::vector sizes_whd = {MAX_SIZE, 1, 1}; - if (dim == 1) { - sizes_whd = {1, MAX_SIZE, 1}; - } else if (dim == 2) { - sizes_whd = {1, 1, MAX_SIZE}; - } - auto sizes_nchw = whd_to_nchw(sizes_whd); - - vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - auto bench = [&](uint32_t access_size, uint32_t dim) { - // Number of texels that fit in this iteration - const uint32_t ntexel_access = access_size / VEC_SIZE; - - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const uint32_t addr_mask = ntexel_access - 1; - - // This is to distribute the accesses to unique addresses across the - // workgroups, once the size of the access excedes the workgroup width. - const uint32_t workgroup_width = local_x * NITER * NUNROLL; - - StagingBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), - SV(ntexel_access), - SV(local_x), - SV(addr_mask), - SV(workgroup_width)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - double gbps = SIZE_TRANS * 1e-3 / time; - std::cout << "Texture bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < RANGE; - access_size *= 2) { - double gbps = bench(access_size, dim); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth - << std::endl; - std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth - << std::endl; - } -} -} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h deleted file mode 100644 index 887cb443ef4..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/utils.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#define CL_TARGET_OPENCL_VERSION 200 -#define CL_HPP_TARGET_OPENCL_VERSION CL_TARGET_OPENCL_VERSION -#include - -using namespace vkcompute; -using namespace api; - -#define QP context()->querypool() - -auto benchmark_on_gpu( - std::string shader_id, - uint32_t niter, - std::function encode_kernel) { - auto fence = context()->fences().get_fence(); - - for (int i = 0; i < niter; ++i) { - encode_kernel(); - }; - - context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); - QP.extract_results(); - uint64_t count = QP.get_mean_shader_ns(shader_id); - QP.reset_state(); - context()->flush(); - - return count / 1000.f; -} - -void ensure_min_niter( - double min_time_us, - uint32_t& niter, - std::function run) { - const uint32_t DEFAULT_NITER = 100; - niter = DEFAULT_NITER; - for (uint32_t i = 0; i < 100; ++i) { - double t = run(); - if (t > min_time_us * 0.99) { - return; - } - niter = uint32_t(niter * min_time_us / t); - } -} - -std::vector whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; -} - -cl_platform_id get_cl_platform_id() { - cl_uint nplatform_id; - clGetPlatformIDs(0, nullptr, &nplatform_id); - std::vector platform_ids; - platform_ids.resize(nplatform_id); - clGetPlatformIDs(nplatform_id, platform_ids.data(), nullptr); - return platform_ids[0]; -} - -cl_device_id get_cl_dev_id(cl_platform_id platform_id) { - cl_uint ndev_id; - clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, nullptr, &ndev_id); - std::vector dev_ids; - dev_ids.resize(ndev_id); - clGetDeviceIDs( - platform_id, CL_DEVICE_TYPE_ALL, ndev_id, dev_ids.data(), nullptr); - return dev_ids[0]; -} - -cl::Device get_cl_device() { - auto platform_id = get_cl_platform_id(); - auto dev_id = get_cl_dev_id(platform_id); - cl::Device dev(dev_id); - return dev; -} diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp deleted file mode 100644 index f0e29aaf1ae..00000000000 --- a/backends/vulkan/tools/gpuinfo/src/main.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "app.h" -#include "architecture.h" -#include "buffers.h" -#include "textures.h" - -using namespace vkapi; - -int main(int argc, const char** argv) { - gpuinfo::App app; - - std::string file_path = "config.json"; - if (argc > 1) { - file_path = argv[1]; - }; - app.load_config(file_path); - - // Architecture - gpuinfo::reg_count(app); - gpuinfo::warp_size(app); - - // Buffers - gpuinfo::buf_cacheline_size(app); - gpuinfo::buf_bandwidth(app); - gpuinfo::ubo_bandwidth(app); - gpuinfo::shared_mem_bandwidth(app); - - // Textures - gpuinfo::tex_bandwidth(app); - gpuinfo::tex_cacheline_concurr(app); - - return 0; -} diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py deleted file mode 100644 index 1291eb62936..00000000000 --- a/backends/vulkan/utils.py +++ /dev/null @@ -1,1305 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
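To make the arithmetic used by the probes and helpers above concrete, the power-of-two address mask, the GB/s conversion, and the `whd_to_nchw` mapping can be checked in a few lines of Python; every number here is made up for illustration and nothing below is part of the deleted sources:

```
# Address mask: for a power-of-two texel count n, x % n == x & (n - 1).
VEC_SIZE = 4 * 4                        # rgba float texel, in bytes
ntexel_access = 1024 // VEC_SIZE        # texels touched for a 1 KiB access size
addr_mask = ntexel_access - 1
assert all(i % ntexel_access == i & addr_mask for i in range(4096))

# Bandwidth: bytes moved divided by time. With time in microseconds,
# bytes * 1e-3 / us yields GB/s, since 1 GB/s == 1000 bytes per microsecond.
size_trans = 64 * 1024 * 1024           # total bytes read by the dispatch
time_us = 2000.0
print(size_trans * 1e-3 / time_us)      # ~33.55 GB/s for these made-up numbers

# whd_to_nchw: a channels-packed (W, H, D) image corresponds to an NCHW tensor
# of size [1, 4 * D, H, W], because every texel packs four channel values.
def whd_to_nchw(sizes):
    w, h, d = sizes
    return [1, d * 4, h, w]

assert whd_to_nchw([64, 32, 8]) == [1, 32, 32, 64]
```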
- -import operator -from typing import Any, List, Optional, Set, Tuple, Union - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) - -from executorch.exir.backend.canonical_partitioners.config_partitioner import ( - format_target_name, -) - -from executorch.exir.dialects.edge._ops import EdgeOpOverload - -from executorch.exir.tensor import TensorSpec - -from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param - -from torch._subclasses.fake_tensor import FakeTensor, FakeTensorConverter - -from torch.export import ExportedProgram - -from torch.export.exported_program import InputKind -from torch.export.graph_signature import TensorArgument - -TorchOpType = Union[EdgeOpOverload, torch._ops.OpOverload, str] - -_DQ_OPS = { - "dequantize_per_tensor.tensor", - "dequantize_per_tensor.default", - "dequantize_per_channel.default", - "dequantize_per_channel_group.default", - "dequantize_per_token.default", - "dequantize_affine.default", -} - -_Q_OPS = { - "quantize_per_tensor.tensor", - "quantize_per_tensor.default", - "quantize_per_channel.default", - "quantize_per_token.default", - "quantize_affine.default", -} - -## -## Node type determination -## - -# Convenience type -MaybeNodeList = Union[torch.fx.Node, List[torch.fx.Node], Tuple[torch.fx.Node]] - - -def is_torch_op_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - - if isinstance(node.target, EdgeOpOverload): - return True - if isinstance(node.target, torch._ops.OpOverload): - return True - - return False - - -def is_dequant_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name in _DQ_OPS - - -def is_quant_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name in _Q_OPS - - -def is_dequant_per_channel_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name == "dequantize_per_channel.default" - - -def is_linear_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name == "linear.default" - - -def is_get_attr_node(node: torch.fx.Node) -> bool: - return isinstance(node, torch.fx.Node) and node.op == "get_attr" - - -def is_constant(program: ExportedProgram, node: torch.fx.Node) -> bool: - return node.name in program.graph_signature.inputs_to_lifted_tensor_constants - - -def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool: - """ - Check if the given node is a parameter within the exported program - """ - return ( - is_get_attr_node(node) - or is_param(program, node) - or is_buffer(program, node) - or is_constant(program, node) - ) - - -def is_mutable_buffer_node( - node: torch.fx.Node, exported_program: ExportedProgram -) -> bool: - if node.target not in exported_program.graph_signature.inputs_to_buffers: - return False - buf = exported_program.graph_signature.inputs_to_buffers[node.target] - return buf in exported_program.graph_signature.buffers_to_mutate.values() - - -def is_symint_node(node: torch.fx.Node) -> bool: - """ - Returns true if the given node produces a SymInt value - """ - if "val" not in node.meta: - return False - - if 
isinstance(node.meta["val"], torch.SymInt): - return True - - return False - - -def is_single_tensor_node(node: torch.fx.Node) -> bool: - """ - Returns true if the given node produces a single tensor value - """ - if "val" not in node.meta: - return False - - if isinstance(node.meta["val"], FakeTensor): - return True - - return False - - -def is_tensor_collection_node(node: Any) -> bool: - """ - Returns true if the given node produces a collection of tensor values - """ - if not isinstance(node, torch.fx.Node): - return False - - if "val" not in node.meta: - return False - - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - return all(isinstance(x, FakeTensor) for x in node.meta["val"]) - - return False - - -def is_tensor_node(node: Any) -> bool: - """ - Returns true if the given node produces a tensor value, or a collection of tensor values - """ - if not isinstance(node, torch.fx.Node): - return False - - if "val" not in node.meta: - return False - - if isinstance(node.meta["val"], FakeTensor): - return True - - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - return all(isinstance(x, FakeTensor) for x in node.meta["val"]) - - return False - - -def is_tensor_arg_node(node: Any) -> bool: - if isinstance(node, torch.fx.Node): - return is_tensor_node(node) - elif isinstance(node, (list, tuple)): - return all(is_tensor_node(n) for n in node) - - return False - - -def num_tensor_arg_nodes(node: torch.fx.Node) -> int: - """ - For a given node, return the number of argument nodes that are associated with - tensors. - """ - count = 0 - for arg_node in node.args: - if not isinstance(arg_node, torch.fx.Node): - continue - if is_tensor_node(arg_node): - count += 1 - - return count - - -def num_tensors_in_node(node: torch.fx.Node) -> int: - """ - Returns the number of tensors associated a given node - """ - if "val" not in node.meta: - return 0 - - if isinstance(node.meta["val"], FakeTensor): - return 1 - - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - if all(isinstance(x, FakeTensor) for x in node.meta["val"]): - return len(node.meta["val"]) - - return 0 - - -def tensor_node_is_bool(node: torch.fx.Node) -> bool: - """ - Returns true if a given node contains a tensor with bool dtype - """ - if isinstance(node.meta["val"], FakeTensor): - return node.meta["val"].dtype == torch.bool - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - for fake_tensor in node.meta["val"]: - if isinstance(fake_tensor, FakeTensor): - if fake_tensor.dtype == torch.bool: - return True - return False - - -def get_primary_arg_idx(self, node: torch.fx.Node) -> Optional[int]: - primary_arg_idx: Optional[int] = None - for i, arg_node in enumerate(node.args): - if self.is_non_constant_tensor_node(arg_node): - return i - - return primary_arg_idx - - -def node_comes_from_any_nn_module_in_set( - node, - nn_module_typenames: Set[str], -) -> bool: - if isinstance(node, (list, tuple)): - return all( - node_comes_from_any_nn_module_in_set(n, nn_module_typenames) for n in node - ) - - if not isinstance(node, torch.fx.Node): - return False - - nn_module_stack = node.meta.get("nn_module_stack", None) - if nn_module_stack is None: - return False - - for _, packed in nn_module_stack.items(): - _, typename = packed - for partial_name in nn_module_typenames: - if partial_name in typename: - return True - - return False - - -def get_tensor_name(exp_prog: ExportedProgram, node: torch.fx.Node) -> str: - if node is None: - 
return "" - if is_param(exp_prog, node): - return exp_prog.graph_signature.inputs_to_parameters[node.name] - elif is_buffer(exp_prog, node): - return exp_prog.graph_signature.inputs_to_buffers[node.name] - elif is_lifted_tensor_constant(exp_prog, node): - return exp_prog.graph_signature.inputs_to_lifted_tensor_constants[node.name] - else: - assert isinstance(node.target, str) - return node.target - - return "" - - -def find_dequant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: - """ - Search the direct users of the given node and return the first one that is a - dequantization op. Returns None if no dequantization op is found. - """ - for user in node.users: - if is_dequant_node(user): - return user - return None - - -def find_quant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: - """ - Search the direct users of the given node and return the first one that is a - quantization op. Returns None if no quantization op is found. - """ - for user in node.users: - if is_quant_node(user): - return user - - return None - - -## -## Memory Layout, Storage Type Determination -## - -ImageExtents = Tuple[int, int, int] - -DEFAULT_TEXTURE_LIMITS = (16384, 16384, 2048) -DEFAULT_BUFFER_LIMIT = 128 * (1024 * 1024) - -all_storage_types: Set[VkStorageType] = { - VkStorageType.BUFFER, - VkStorageType.TEXTURE_3D, -} - -all_memory_layouts: Set[VkMemoryLayout] = { - VkMemoryLayout.TENSOR_WIDTH_PACKED, - VkMemoryLayout.TENSOR_HEIGHT_PACKED, - VkMemoryLayout.TENSOR_CHANNELS_PACKED, -} - -MemoryLayoutSet = Set[VkMemoryLayout] -MemoryLayoutSetList = Union[MemoryLayoutSet, List[MemoryLayoutSet]] - - -def within_buffer_limit(node: torch.fx.Node, buffer_limit: int) -> int: - """ - Checks whether the tensors produced by the given node can fit within the device's - GPU buffer limit, which represents the maximum number of elements that can be stored - in a GPU buffer. - """ - assert is_tensor_node(node) - - if isinstance(node.meta["val"], FakeTensor): - return node.meta["val"].numel() < buffer_limit - elif isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - return all(x.numel() < buffer_limit for x in node.meta["val"]) - else: - raise RuntimeError(f"Cannot get numel for val of type {type(node.meta['val'])}") - - -def tensor_node_is_high_dim(node: torch.fx.Node) -> bool: - """ - Returns true if a given node contains a tensor with more than 4 dimensions - """ - if isinstance(node.meta["val"], FakeTensor): - return len(node.meta["val"].shape) > 4 - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - for fake_tensor in node.meta["val"]: - if isinstance(fake_tensor, FakeTensor): - if len(fake_tensor.shape) > 4: - return True - return False - - -def required_image_extents(sizes: torch.Size, layout: VkMemoryLayout) -> ImageExtents: - """ - Calculate the image extents that will be used to represent a tensor with the given sizes - and memory layout in the Vulkan Delegate. 
- """ - width = sizes[-1] if len(sizes) >= 1 else 1 - height = sizes[-2] if len(sizes) >= 2 else 1 - channels = sizes[-3] if len(sizes) >= 3 else 1 - batch = sizes[0] if len(sizes) >= 4 else 1 - - if layout == VkMemoryLayout.TENSOR_WIDTH_PACKED: - width = (width + 3) // 4 - elif layout == VkMemoryLayout.TENSOR_HEIGHT_PACKED: - height = (height + 3) // 4 - elif layout == VkMemoryLayout.TENSOR_CHANNELS_PACKED: - channels = (channels + 3) // 4 - else: - raise RuntimeError(f"Unsupported memory layout {layout}") - - return width, height, channels * batch - - -def extents_are_valid(extents: ImageExtents, limits: ImageExtents) -> bool: - return all(extents[i] <= limits[i] for i in range(len(extents))) - - -def valid_texture_memory_layouts( - tensor_sizes: torch.Size, texture_limits: ImageExtents -) -> Set[VkMemoryLayout]: - """ - Given tensor sizes, determine the set of memory layouts which will prodice a texture - that can fit within the specified device limits. - """ - valid_layouts = set() - for layout in list(all_memory_layouts): - extents = required_image_extents(tensor_sizes, layout) - if extents_are_valid(extents, texture_limits): - valid_layouts.add(layout) - - return valid_layouts - - -class TensorRepr: - """ - This class is a wrapper around a pair of VkStorageType and VkMemoryLayout which - describes how a tensor should be represented in the Vulkan Delegate. - """ - - def __init__(self, storage_type: VkStorageType, memory_layout: VkMemoryLayout): - self.storage_type = storage_type - self.memory_layout = memory_layout - - def __str__(self) -> str: - return f"TensorRepr({self.storage_type}, {self.memory_layout})" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TensorRepr): - return NotImplemented - return ( - self.storage_type == other.storage_type - and self.memory_layout == other.memory_layout - ) - - def __ne__(self, other: object) -> bool: - return not self.__eq__(other) - - -class TensorReprList: - """ - This class is a wrapper around a list of TensorRepr instances that automatically - applies a "broadcasting" mechanism. The broadcasting mechanism allows for a single - underlying TensorRepr to be used to represent multiple tensors. - """ - - def __init__(self, tensor_reprs: Union[TensorRepr, List[TensorRepr]]): - self.vals: List[TensorRepr] = ( - tensor_reprs if isinstance(tensor_reprs, list) else [tensor_reprs] - ) - - def __len__(self): - return len(self.vals) - - def __getitem__(self, idx: int) -> TensorRepr: - if idx > 0 and len(self) == 1: - return self.vals[0] - else: - return self.vals[idx] - - def __setitem__(self, idx: int, val: TensorRepr) -> None: - if idx > 0 and len(self) == 1: - self.vals[0] = val - else: - self.vals[idx] = val - - def __str__(self) -> str: - return f"[{', '.join(str(ts) for ts in self.vals)}]" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TensorReprList): - return NotImplemented - - if len(self) == len(other): - for self_val, other_val in zip(self.vals, other.vals): - if self_val != other_val: - return False - - return True - - return False - - def __ne__(self, other: object) -> bool: - return not self.__eq__(other) - - def append(self, val: TensorRepr) -> None: - self.vals.append(val) - - def storage_type(self, idx: int = 0) -> VkStorageType: - return self.vals[idx].storage_type - - def memory_layout(self, idx: int = 0) -> VkMemoryLayout: - return self.vals[idx].memory_layout - - -class TensorRepSet: - """ - This class describes the possible set of representations (i.e. 
TensorRepr) that may - be used to represent a tensor. This set is determined by the implementation of the - operator that the tensor participates in as well as the texture extents of the GPU. - """ - - def __init__( - self, - buffer_memory_layouts: Set[VkMemoryLayout], - texture_memory_layouts: Set[VkMemoryLayout], - ): - self.valid_buffer_layouts = buffer_memory_layouts - self.valid_texture_layouts = texture_memory_layouts - - def __str__(self) -> str: - buffer_layouts = ", ".join(layout.name for layout in self.valid_buffer_layouts) - texture_layouts = ", ".join( - layout.name for layout in self.valid_texture_layouts - ) - return f"TensorRepSet(Buffer Layouts: [{buffer_layouts}], Texture Layouts: [{texture_layouts}])" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TensorRepSet): - return NotImplemented - return ( - self.valid_buffer_layouts == other.valid_buffer_layouts - and self.valid_texture_layouts == other.valid_texture_layouts - ) - - def __ne__(self, other: object) -> bool: - return not self.__eq__(other) - - def is_empty(self) -> bool: - """ - A TensorRepSet is "empty" if there are no valid representations of the tensor. - """ - return ( - len(self.valid_buffer_layouts) == 0 and len(self.valid_texture_layouts) == 0 - ) - - def make_intersect(self, other: "TensorRepSet") -> "TensorRepSet": - """ - Merge this TensorRepr with another TensorRepr, returning a new TensorRepr - with the intersection of the two. - """ - return TensorRepSet( - self.valid_buffer_layouts & other.valid_buffer_layouts, - self.valid_texture_layouts & other.valid_texture_layouts, - ) - - def is_compatible(self, storage: TensorRepr) -> bool: - """ - Check if this TensorRepr is compatible with the given TensorRepSet. - """ - if storage.storage_type == VkStorageType.BUFFER: - return storage.memory_layout in self.valid_buffer_layouts - elif storage.storage_type == VkStorageType.TEXTURE_3D: - return storage.memory_layout in self.valid_texture_layouts - else: - raise RuntimeError(f"Unsupported storage type {storage.storage_type}") - - def any_in_common(self, other: "TensorRepSet") -> bool: - """ - Check if this TensorRepr has any representations in common with another - TensorRepr. - """ - return ( - len(self.valid_buffer_layouts & other.valid_buffer_layouts) > 0 - or len(self.valid_texture_layouts & other.valid_texture_layouts) > 0 - ) - - def texture_is_valid(self): - return len(self.valid_texture_layouts) > 0 - - def buffer_is_valid(self): - return len(self.valid_buffer_layouts) > 0 - - def first_valid_buffer_layout(self): - return list(self.valid_buffer_layouts)[0] - - def first_valid_texture_layout(self): - return list(self.valid_texture_layouts)[0] - - def make_tensor_repr(self) -> TensorRepr: - """ - Pick a representation (i.e. TensorRepr) from the set of possible representations. - If there are multiple valid representations, then: - 1. Prefer texture storage over buffer storage - 2. Pick the first available memory layout. - """ - if self.is_empty(): - # An empty repset typically means that it is associated with a weight tensor - # or non tensor argument. In this case, just return default storage and - # layout as placeholder. 
- return TensorRepr( - VkStorageType.DEFAULT_STORAGE, VkMemoryLayout.DEFAULT_LAYOUT - ) - - if self.texture_is_valid(): - return TensorRepr( - VkStorageType.TEXTURE_3D, self.first_valid_texture_layout() - ) - - else: - return TensorRepr(VkStorageType.BUFFER, self.first_valid_buffer_layout()) - - def is_constrained(self) -> bool: - """ - A "constrained" RepSet is one that has either: - 1. A single valid texture memory layout, and no valid buffer memory layouts - 2. No valid texture memory layouts, and a single valid buffer memory layout - 3. Is empty - - In this case, it is unambiguous which representation should be used for the - tensor. - """ - if self.is_empty(): - return True - elif ( - len(self.valid_texture_layouts) == 1 and len(self.valid_buffer_layouts) == 0 - ): - return True - elif ( - len(self.valid_texture_layouts) == 0 and len(self.valid_buffer_layouts) == 1 - ): - return True - else: - return False - - def is_ambiguous(self) -> bool: - """ - An "ambiguous" RepSet is one that is not constrained. - """ - return not self.is_constrained() - - -def make_tensor_repset(tensor_repr: TensorRepr) -> TensorRepSet: - """ - Given a TensorRepr, return a TensorRepSet that contains only that TensorRepr - """ - if tensor_repr.storage_type == VkStorageType.BUFFER: - return TensorRepSet({tensor_repr.memory_layout}, set()) - elif tensor_repr.storage_type == VkStorageType.TEXTURE_3D: - return TensorRepSet(set(), {tensor_repr.memory_layout}) - else: - raise RuntimeError(f"Unsupported storage type {tensor_repr.storage_type}") - - -def make_filtered_tensor_repset( - tensor_val: FakeTensor, - tensor_repset: TensorRepSet, - texture_limits: ImageExtents, -) -> TensorRepSet: - """ - `tensor_val` represents an actual tensor participating in some operator computation. - - `tensor_repset` represents the set of valid tensor representations that may be used - for that tensor that is supported by the op implementation. - - `texture_limits` represents the maximum texture sizes that is supported by the GPU. - - Given the above, return a new TensorRepSet that contains only texture layouts that - can be used to produce a valid image texture for the given tensor (i.e. fits within - texture limits). 
- """ - valid_texture_layouts = set() - for memory_layout in tensor_repset.valid_texture_layouts: - extents = required_image_extents(tensor_val.shape, memory_layout) - if extents_are_valid(extents, texture_limits): - valid_texture_layouts.add(memory_layout) - - # High dimensional tensors require buffer storage - if len(tensor_val.shape) > 4: - return TensorRepSet(tensor_repset.valid_buffer_layouts, set()) - - # Bool tensors are currently not supported - if tensor_val.dtype == torch.bool: - return NO_STORAGE - - return TensorRepSet(tensor_repset.valid_buffer_layouts, valid_texture_layouts) - - -## Convenience TensorRepSet definitions - -CONTIGUOUS_ANY = TensorRepSet( - {VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} -) -CONTIGUOUS_BUFFER = TensorRepSet({VkMemoryLayout.TENSOR_WIDTH_PACKED}, set()) - -WIDTH_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_WIDTH_PACKED}) -CHANNELS_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_CHANNELS_PACKED}) - -ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts) -ANY_BUFFER = TensorRepSet(all_memory_layouts, set()) - -ANY_STORAGE = TensorRepSet(all_memory_layouts, all_memory_layouts) -NO_STORAGE = TensorRepSet(set(), set()) - - -class TensorRepSetList: - """ - This class is a wrapper around a list of TensorRepSet instances that automatically - applies a "broadcasting" mechanism. The broadcasting mechanism allows for a single - underlying TensorRepSet to be used for multiple tensors. - """ - - def __init__( - self, - tensor_repsets: Union[TensorRepSet, List[TensorRepSet]], - ): - self.vals: List[TensorRepSet] = ( - tensor_repsets if isinstance(tensor_repsets, list) else [tensor_repsets] - ) - - def __len__(self): - return len(self.vals) - - def __getitem__(self, idx: int) -> TensorRepSet: - if idx > 0 and len(self) == 1: - return self.vals[0] - else: - return self.vals[idx] - - def __setitem__(self, idx: int, val: TensorRepSet) -> None: - if idx > 0 and len(self.vals) == 1: - self.vals[0] = val - else: - self.vals[idx] = val - - def __str__(self) -> str: - return f"[{', '.join(str(ts) for ts in self.vals)}]" - - def append(self, val: TensorRepSet) -> None: - return self.vals.append(val) - - def any_is_empty(self) -> bool: - if len(self.vals) == 0: - return True - - return any(tensor_repr.is_empty() for tensor_repr in self.vals) - - -class OpRepSets: - """ - This class is responsible for representing and managing the set of valid tensor - representations that may be used for all input and output tensors of an operator. - It is also responsible for maintaining synchronization rules between tensors - participating in the computation. - - Currently, three synchronization rules exist: - 1. All input tensors must use the same representation (e.g. binary ops) - 2. The "primary" input and output tensors must use the same representation - (e.g. group norm; the output is a tuple of out, mean, rstd; out must be the same - representation as the first input x, but mean and rstd may use different - representations as out) - 3. All output tensors must use the same representation (e.g. choose qparams) - - Note that "primary" input and output tensor refers to the first non-weight input - tensor and the first output tensor. Note that Some operators (such as arange) do not - have any tensor inputs. - - Currently, the above three synchronization rules are sufficient to describe the - representation requirements of all ET-VK operators. 
- - This class also provides utilities to constrain the repsets; when applying the - constraints, the synchronization rules will be maintained. - """ - - def __init__( # noqa: C901 - self, - inputs_repsets: TensorRepSetList, - outputs_repsets: TensorRepSetList, - op_node: torch.fx.Node, - texture_limits: ImageExtents, - ): - self.op_node = op_node - - # inputs_repset_list is received from the operator registration. If a different - # repset is defined for each input tensor, then assume that the input tensor - # representations do not need to be synchronized. - if len(inputs_repsets) > 1: - self.sync_args_repr = False - # Otherwise, default to True - else: - self.sync_args_repr = True - - # outputs_repset_list is received from the operator registration. If a different - # repset is defined for each output tensor, then assume that the output tensor - # representations do not need to be synchronized. - if len(outputs_repsets) > 1: - self.sync_outs_repr = False - else: - self.sync_outs_repr = True - - # Try to determine the index of the "primary" argument, i.e. the first non - # constant tensor argument. For the vast majority of operators with tensor - # arguments, this will be the first argument. - self.primary_arg_idx: Optional[int] = None - for i, arg_node in enumerate(self.op_node.args): - arg_node_repset = inputs_repsets[i] - if not is_tensor_arg_node(arg_node): - continue - if arg_node_repset is None: - continue - if arg_node_repset.is_empty(): - continue - - self.primary_arg_idx = i - break - - # If the repset of the primary input and the primary output are the same, then - # assume they need to be the same. - self.sync_primary_io_repr = self.primary_arg_idx is not None - if self.primary_arg_idx is not None: - if inputs_repsets[self.primary_arg_idx] != outputs_repsets[0]: - self.sync_primary_io_repr = False - - # Now, go through the arguments of the operator and create a filtered repset - # for each based on the actual tensor value. - args_repset_list = TensorRepSetList([]) - common_arg_repset = ANY_STORAGE - for i, arg_node in enumerate(op_node.args): - arg_repset = inputs_repsets[i] - - # Use ANY_STORAGE for non-tensor nodes so they don't cause the op repsets to - # appear empty - if not is_tensor_arg_node(arg_node): - args_repset_list.append(ANY_STORAGE) - # NO_STORAGE is used to denote that an input is either a non tensor arg or - # a weight tensor that is not prepacked. Similar to the above, use - # ANY_STORAGE in this case. - elif arg_repset.is_empty(): - args_repset_list.append(ANY_STORAGE) - else: - assert not arg_repset.is_empty() - - arg_repset = self.make_valid_tensor_repset_for_arg( - arg_repset, arg_node, texture_limits - ) - - args_repset_list.append(arg_repset) - common_arg_repset = common_arg_repset.make_intersect(arg_repset) - - # Repeat for output tensors. 
- outs_repset_list = TensorRepSetList([]) - common_out_repset = ANY_STORAGE - if num_tensors_in_node(op_node) == 1: - common_out_repset = make_filtered_tensor_repset( - op_node.meta["val"], outputs_repsets[0], texture_limits - ) - outs_repset_list.append(common_out_repset) - # Multiple output tensors - else: - for i, val in enumerate(op_node.meta["val"]): - assert isinstance(val, FakeTensor) - out_repset = make_filtered_tensor_repset( - val, outputs_repsets[i], texture_limits - ) - - outs_repset_list.append(out_repset) - common_out_repset = common_out_repset.make_intersect(out_repset) - - # Apply synchronization rules; if either all inputs/outputs must use the same - # representation, then only use a single underlying repset. - if self.sync_args_repr: - args_repset_list = TensorRepSetList([common_arg_repset]) - - if self.sync_outs_repr: - outs_repset_list = TensorRepSetList([common_out_repset]) - - # Finally, apply synchronization rules that sync inputs and outputs. If input - # or output repsets are updated, then maintain synchronization rules. - if self.sync_primary_io_repr: - assert self.primary_arg_idx is not None - - primary_in_repset = args_repset_list[self.primary_arg_idx] - primary_out_repset = outs_repset_list[0] - - primary_repset = primary_in_repset.make_intersect(primary_out_repset) - - if self.sync_args_repr: - args_repset_list = TensorRepSetList([primary_repset]) - else: - assert self.primary_arg_idx is not None - args_repset_list[self.primary_arg_idx] = primary_repset - - if self.sync_outs_repr: - outs_repset_list = TensorRepSetList([primary_repset]) - else: - assert self.primary_arg_idx is not None - outs_repset_list[0] = primary_repset - - # Save the resulting repsets - self.args_repset_list = args_repset_list - self.outs_repset_list = outs_repset_list - - # Check that synchronization rules are respected. - self.assert_sync_contraints() - - def __str__(self) -> str: - return f"OpRepSets(ins={self.args_repset_list}, outs={self.outs_repset_list})" - - def make_valid_tensor_repset_for_node_list_arg( - self, - arg_repsets: TensorRepSet, - arg_node: List[torch.fx.Node], - texture_limits: ImageExtents, - ) -> TensorRepSet: - """ - Wrapper around make_filtered_tensor_repset for a list of nodes. This will happen - for the cat operator, where the first argument is a list of nodes. - """ - # For variable length args, assume that they all need to use the same representation - # only one repset should be defined - common_tensor_repsets = arg_repsets - - for n in arg_node: - assert isinstance(n, torch.fx.Node) - common_tensor_repsets = common_tensor_repsets.make_intersect( - make_filtered_tensor_repset( - n.meta["val"], common_tensor_repsets, texture_limits - ) - ) - - return common_tensor_repsets - - def make_valid_tensor_repset_for_arg( - self, arg_repsets: TensorRepSet, arg_node: Any, texture_limits: ImageExtents - ) -> TensorRepSet: - """ - Helper function to call make_filtered_tensor_repset - """ - if isinstance(arg_node, torch.fx.Node) and is_single_tensor_node(arg_node): - return make_filtered_tensor_repset( - arg_node.meta["val"], arg_repsets, texture_limits - ) - elif isinstance(arg_node, list) and all( - is_single_tensor_node(n) for n in arg_node - ): - return self.make_valid_tensor_repset_for_node_list_arg( - arg_repsets, arg_node, texture_limits - ) - # Special case for getitem; return the repset of the particular val in the - # list of tensors that is being extracted. 
- elif ( - self.op_node.target == operator.getitem and arg_node == self.op_node.args[0] - ): - idx = self.op_node.args[1] - assert isinstance(idx, int) - return make_filtered_tensor_repset( - arg_node.meta["val"][idx], arg_repsets, texture_limits - ) - - raise NotImplementedError(f"Unhandled node type {arg_node}") - - def assert_sync_contraints(self) -> None: - if self.sync_args_repr: - assert len(self.args_repset_list) == 1 - - if self.sync_outs_repr: - assert len(self.outs_repset_list) == 1 - - if self.sync_primary_io_repr: - assert ( - self.args_repset_list[self.primary_arg_idx] == self.outs_repset_list[0] - ) - - def any_is_empty(self) -> bool: - return ( - self.args_repset_list.any_is_empty() or self.outs_repset_list.any_is_empty() - ) - - def get_arg_repset(self, i: int): - return self.args_repset_list[i] - - def get_out_repset(self, i: int): - return self.outs_repset_list[i] - - def try_constrain_with_arg_repset( - self, arg_i: int, source_repset: TensorRepSet - ) -> bool: - """ - Attempt to constrain the repsets of the tensors participating in this operator - based on an "existing" repset of an argument. The existing repset can have two - sources: - * A representation may have been determined for the argument already from a - prior operator - * The output repset of the operator which produces the argument - - If the existing repset of the argument is compatible with the current operator, - then constrain the repsets of this operator and apply synchronization rules. - - This process tries to minimize the number of transition nodes that will need to - be inserted by tag_memory_meta_pass.py by maintaining existing representations - for as long as possible. - """ - arg_current_repset = self.args_repset_list[arg_i] - - if arg_current_repset == source_repset: - return False - - if not arg_current_repset.any_in_common(source_repset): - return False - - if self.sync_primary_io_repr: - if not self.get_out_repset(0).any_in_common(source_repset): - return False - - # If this point is reached, then it is possible to constrain - self.args_repset_list[arg_i] = arg_current_repset.make_intersect(source_repset) - if self.sync_primary_io_repr and ( - arg_i == self.primary_arg_idx or self.sync_args_repr - ): - self.outs_repset_list[0] = arg_current_repset.make_intersect(source_repset) - - self.assert_sync_contraints() - return True - - def pick_representations(self) -> Tuple[TensorReprList, TensorReprList]: - """ - For each tensor participating in the op, pick a representation for it among the - possible represetntation sets. - """ - args_repr_list = TensorReprList([]) - outs_repr_list = TensorReprList([]) - - for i in range(len(self.op_node.args)): - arg_repset = self.args_repset_list[i] - args_repr_list.append(arg_repset.make_tensor_repr()) - - for i in range(num_tensors_in_node(self.op_node)): - out_repset = self.outs_repset_list[i] - outs_repr_list.append(out_repset.make_tensor_repr()) - - return args_repr_list, outs_repr_list - - -## -## TensorSpec Utils -## - - -def has_node_spec_attr(node: torch.fx.Node, attr: str) -> bool: - return "spec" in node.meta and hasattr(node.meta["spec"], attr) - - -def set_node_spec_attr(node: torch.fx.Node, attr: str, value): - assert "spec" in node.meta - spec = node.meta["spec"] - if isinstance(spec, TensorSpec): - setattr(spec, attr, value) - elif isinstance(spec, (list, tuple)): - # Special case if value is a list/tuple of the same length as the - # collection of tensors in the node. 
In this case, treat the value list - # as a list of values to set indivudually for each tensor in the node - if isinstance(value, (list, tuple)) and len(spec) == len(value): - assert len(spec) == len(value) - for s, v in zip(spec, value): - assert isinstance(s, TensorSpec) - setattr(s, attr, v) - # Otherwise, set the attribute to value for all tensors in the list - else: - for s in spec: - assert isinstance(s, TensorSpec) - setattr(s, attr, value) - else: - raise RuntimeError(f"Cannot set attr for spec of type {type(spec)}") - - -def get_node_spec_attr(node: torch.fx.Node, attr: str, return_first: bool = True): - assert "spec" in node.meta - spec = node.meta["spec"] - if isinstance(spec, TensorSpec): - return getattr(spec, attr) if hasattr(spec, attr) else None - elif isinstance(spec, (list, tuple)): - if return_first: - return getattr(spec[0], attr) if hasattr(spec[0], attr) else None - else: - return [getattr(s, attr) if hasattr(s, attr) else None for s in spec] - else: - raise RuntimeError(f"Cannot get attr for spec of type {type(spec)}") - - -def get_node_storage_type(node: torch.fx.Node) -> Optional[VkStorageType]: - return get_node_spec_attr(node, "vk_storage_type") - - -def get_node_memory_layout(node: torch.fx.Node) -> Optional[VkMemoryLayout]: - return get_node_spec_attr(node, "vk_memory_layout") - - -def has_node_repr(node) -> bool: - if isinstance(node, (list, tuple)): - return all(has_node_spec_attr(n, "etvk_node_repr") for n in node) - else: - return has_node_spec_attr(node, "etvk_node_repr") - - -def set_node_repr(node: torch.fx.Node, node_repr: Union[TensorRepr, TensorReprList]): - if isinstance(node_repr, TensorReprList): - # Convert to a regular list so taht `set_node_spec_attr` can attach each entry - # to a separate TensorSpec - node_repr_list = [node_repr[i] for i in range(num_tensors_in_node(node))] - set_node_spec_attr(node, "etvk_node_repr", node_repr_list) - else: - set_node_spec_attr(node, "etvk_node_repr", node_repr) - - -def get_node_repr(node) -> Union[TensorRepr, TensorReprList]: - if isinstance(node, (list, tuple)): - raise NotImplementedError("get_node_repr not implemented for list of nodes") - else: - return get_node_spec_attr(node, "etvk_node_repr", False) - - -## -## Graph Pattern Matching -## - - -def maybe_skip_q_dq_arg_chain( - arg: torch.fx.node.Argument, -) -> Tuple[Optional[torch.fx.Node], Optional[torch.fx.Node], Optional[torch.fx.Node]]: - """ - Check if the given node argument is part of a Quantize/Dequantize chain produced by - the quant workflow. If so, return the source tensor that is the input to the Q/DQ - chain and the quantize/dequantize nodes in the chain. Otherwise, return the argument - as is and None, None - """ - if not isinstance(arg, torch.fx.Node): - return None, None, None - - if is_dequant_node(arg): - dequant_node = arg - quant_node = dequant_node.args[0] - assert isinstance(quant_node, torch.fx.Node) - source_arg = quant_node.args[0] - assert isinstance(source_arg, torch.fx.Node) - return source_arg, quant_node, dequant_node - else: - return arg, None, None - - -def trace_args_until_placeholder( - node: torch.fx.node.Argument, max_search_depth: int = 4 -) -> Tuple[Optional[torch.fx.Node], List[torch.fx.Node]]: - """ - Trace through node.args[0] of a given initial node until a placeholder node is found - then return it and the list of nodes traversed. If no placeholder node is found, - returns None and an empty list. 
- """ - cur_node = node - search_depth = 0 - - if not isinstance(cur_node, torch.fx.Node): - return None, [] - - traversed = [cur_node] - while cur_node.op != "placeholder" and search_depth < max_search_depth: - # Break if cur_node has no args - if len(cur_node.args) == 0: - break - - cur_node = cur_node.args[0] - if not isinstance(cur_node, torch.fx.Node): - break - traversed.append(cur_node) - search_depth += 1 - - if not isinstance(cur_node, torch.fx.Node): - return None, [] - if cur_node.op != "placeholder": - return None, [] - - assert isinstance(cur_node, torch.fx.Node) - return cur_node, traversed - - -def is_in_4bit_range(tensor: torch.Tensor) -> bool: - """ - Check if the given tensor is in the range of 4-bit quantization and is of integer type. - """ - if tensor.dtype not in (torch.int8, torch.uint8): - return False - - return tensor.min().item() >= -8 and tensor.max().item() <= 7 - - -def is_in_8bit_range(tensor: torch.Tensor) -> bool: - """ - Check if the given tensor is in the range of 4-bit quantization and is of integer type. - """ - if tensor.dtype not in (torch.int8, torch.uint8): - return False - - return tensor.min().item() >= -128 and tensor.max().item() <= 127 - - -## -## Misc -## - - -def get_tensor_val_str(tensor_val: FakeTensor) -> str: - return f"{tensor_val.dtype}: {tensor_val.shape}" - - -def get_node_val_str(node: torch.fx.Node) -> str: - if is_single_tensor_node(node): - assert isinstance(node.meta["val"], FakeTensor) - return get_tensor_val_str(node.meta["val"]) - elif is_tensor_collection_node(node): - assert isinstance(node.meta["val"], (list, tuple)) - return f"[{', '.join(get_tensor_val_str(t) for t in node.meta['val'])}]" - else: - if "val" not in node.meta: - return str(node) - return str(node.meta["val"]) - - -def get_arg_node_val_str(arg_node: Any) -> str: - if isinstance(arg_node, torch.fx.Node): - return get_node_val_str(arg_node) - elif isinstance(arg_node, (list, tuple)): - return f"[{', '.join(get_arg_node_val_str(n) for n in arg_node)}]" - else: - return str(arg_node) - - -def node_io_str(node: torch.fx.Node) -> str: - target = node.target - if isinstance(target, EdgeOpOverload): - assert isinstance(target, EdgeOpOverload) - target_name = target.__name__ - elif isinstance(target, torch._ops.OpOverload): - assert isinstance(target, torch._ops.OpOverload) - target_name = target.name() - else: - target_name = str(target) - - out_str = f"{get_node_val_str(node)} = {target_name}(" - for arg in node.args: - out_str += get_arg_node_val_str(arg) + ", " - - out_str += " ...)" - return out_str - - -def update_program_state_dict( - program: ExportedProgram, - buffer_name: str, - updated_tensor: torch.Tensor, -) -> None: - target_name = None - # Iterate over all the tensors in the graph signature, and find - # the one corresponding to the parameter/buffer name - for input_ in program.graph_signature.input_specs: - if ( - input_.kind in (InputKind.BUFFER, InputKind.PARAMETER) - and isinstance(input_.arg, TensorArgument) - and input_.arg.name == buffer_name - ): - target_name = input_.target - break - - # Assert that we found the parameter/buffer - assert ( - target_name is not None - ), f"could not find {buffer_name} in source program signature" - assert target_name in program.state_dict, f"could not find {target_name}" - - # Finally, overwrite the current tensor with updated tensor - program.state_dict[target_name] = updated_tensor - - -def align_width_and_update_state_dict( - ep: ExportedProgram, - node: torch.fx.Node, - cur_tensor: torch.Tensor, - 
align_to: int = 4, - force_update: bool = False, -) -> torch.Tensor: - """ - Align the width of the given tensor to the given alignment value and update the - state dict of the program with the aligned tensor. - """ - added_padding = False - cur_width = cur_tensor.shape[-1] - # Only align the width of the tensor if it is not already aligned - if cur_width % align_to != 0: - num_padding = align_to - (cur_width % align_to) - # Align the width of the tensor to the given alignment value - aligned_tensor = torch.nn.functional.pad( - cur_tensor, (0, num_padding) - ).contiguous() - added_padding = True - else: - aligned_tensor = cur_tensor - - if added_padding or force_update: - update_program_state_dict(ep, node.name, aligned_tensor) - # FakeTensor needs to match updated tensor - cur_fake_tensor = node.meta["val"] - node.meta["val"] = FakeTensorConverter().from_real_tensor( - cur_fake_tensor.fake_mode, - aligned_tensor, - ) - - return aligned_tensor diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py new file mode 120000 index 00000000000..78678ae8191 --- /dev/null +++ b/backends/vulkan/utils.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/utils.py \ No newline at end of file diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py deleted file mode 100644 index 69d3cdef75d..00000000000 --- a/backends/vulkan/vulkan_preprocess.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from functools import partial - -from typing import Any, Dict, final, List - -import executorch.backends.vulkan.utils as utils - -from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform -from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass -from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform -from executorch.backends.transforms.view_copy_to_squeeze_unsqueeze import ( - ViewCopyToSqueezeUnsqueezePass, -) -from executorch.backends.vulkan._passes import ( - FoldQDQPass, - FuseQuantizedOpsTransform, - insert_prepack_nodes, - RemoveLocalScalarDenseOpsTransform, - RemoveRedundantOpsTransform, - SqueezeUnsqueezeInputs, - TagMemoryMetaPass, -) -from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass -from executorch.backends.vulkan._passes.remove_asserts import RemoveAssertsTransform - -from executorch.backends.vulkan.serialization.vulkan_graph_builder import VkGraphBuilder -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) -from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( - serialize_vulkan_graph, -) -from executorch.backends.xnnpack._passes import FuseBatchNormPass - -from executorch.exir.backend.backend_details import ( - BackendDetails, - CompileSpec, - ExportedProgram, - PreprocessResult, -) -from executorch.exir.backend.utils import DelegateMappingBuilder - -from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite -from executorch.exir.pass_base import ExportPass, PassBase - -from executorch.exir.passes import MemoryPlanningPass, SpecPropPass - -from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass - -from executorch.exir.program._program import _copy_module - -from 
torch.export._remove_auto_functionalized_pass import ( - unsafe_remove_auto_functionalized_pass, -) - -DEFAULT_DEBUG_HANDLE = 65535 - - -# pyre-ignore -def apply_passes(program: ExportedProgram, passes) -> ExportedProgram: - for p in passes: - if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase): - new_gm = program.graph_module - # This is a workaround to allow the memory planning pass to work without - # having to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - if isinstance(p, MemoryPlanningPass): - new_gm.encounter_to_out_var_failure = True - - new_gm_res = p(new_gm) - assert new_gm_res is not None - new_gm = new_gm_res.graph_module - - # See the application of this function in exir/program/_program.py for more - # details on why this step is necessary. - if isinstance(p, SpecPropPass): - p.update_placeholder_tensor_specs(program, new_gm) - - _copy_module(program.graph_module, new_gm) - else: - program = p(program) - - return program - - -def parse_compile_spec(compile_specs: List[CompileSpec]) -> Dict[str, Any]: - options = {} - for spec in compile_specs: - if spec.key == "storage_type_override": - options[spec.key] = VkStorageType( - int.from_bytes(spec.value, byteorder="little") - ) - if spec.key == "memory_layout_override": - options[spec.key] = VkMemoryLayout( - int.from_bytes(spec.value, byteorder="little") - ) - if spec.key in {"texture_limits_x", "texture_limits_y", "texture_limits_z"}: - options[spec.key] = int.from_bytes(spec.value, byteorder="little") - - if spec.key == "skip_tag_memory_metadata": - options[spec.key] = bool.from_bytes(spec.value, byteorder="little") - - if spec.key == "downcast_64_bit": - options[spec.key] = bool.from_bytes(spec.value, byteorder="little") - - # Unhandled options are ignored - - return options - - -@final -class VulkanBackend(BackendDetails): - @classmethod - # pyre-ignore - def preprocess( # noqa: C901 - cls, - program: ExportedProgram, - module_compile_spec: List[CompileSpec], - ) -> PreprocessResult: - compile_options = parse_compile_spec(module_compile_spec) - limits_x = compile_options.get( - "texture_limits_x", utils.DEFAULT_TEXTURE_LIMITS[0] - ) - limits_y = compile_options.get( - "texture_limits_y", utils.DEFAULT_TEXTURE_LIMITS[1] - ) - limits_z = compile_options.get( - "texture_limits_z", utils.DEFAULT_TEXTURE_LIMITS[2] - ) - texture_limits = (limits_x, limits_y, limits_z) - - default_storage_type = compile_options.get( - "storage_type_override", VkStorageType.TEXTURE_3D - ) - default_memory_layout = compile_options.get( - "memory_layout_override", VkMemoryLayout.TENSOR_WIDTH_PACKED - ) - downcast_64_bit = compile_options.get("downcast_64_bit", True) - - program = unsafe_remove_auto_functionalized_pass(program) - - # First, apply passes that fuse/remove operators to consolidate the graph - # structure but still preserve an "ATen-compliant" graph structure (i.e. all - # arguments to ATen operators must match the ATen function schema). - program = apply_passes( - program, - [ - FusePatternsPass(program), - RemoveRedundantOpsTransform(), - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(program), - FoldQDQPass(program), - SqueezeUnsqueezeInputs(), - FuseViewCopyTransform(), - ViewCopyToSqueezeUnsqueezePass(), - FuseBatchNormPass(program), - FuseClampPass(), - ], - ) - - # Next annotate tensor nodes with TensorSpec structs which is needed for dynamic - # shapes and memory planning. 
Until this point, the graph must be ATen compliant - # because SpecPropPass will be calling the underlying ATen operators during its - # execution. - program = apply_passes(program, [SpecPropPass()]) - - # Apply graph transforms which either require `TensorSpec`s to have been created - # or would create an non ATen compliant graph structure. - program = apply_passes( - program, - [ - RemoveAssertsTransform(), - # Since this pass may replace a scalar argument with a tensor argument, - # this pass may result in a non ATen compliant graph structure. - RemoveLocalScalarDenseOpsTransform(), - insert_prepack_nodes, - ], - ) - - # Optionally apply the memory metadata tagging pass, which will insert storage - # type and memory layout transition nodes to ensure that all tensor arguments - # to an operator is in a supported or optimal configuration. If this pass is not - # applied, there will be a risk that some operators recieve arguments with - # memory settings that are not supported by the implementation. - if not compile_options.get("skip_tag_memory_metadata", False): - program = apply_passes( - program, - [ - TagMemoryMetaPass( - texture_limits, - default_storage_type=default_storage_type, - default_memory_layout=default_memory_layout, - ), - ], - ) - - # Finally, apply dynamic shape passes and memory planning pass. These passes - # must be applied only when the graph structure is finalized. - greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = MemoryPlanningAlgorithmSuite( - algo_list=[greedy_memory_planning] - ) - program = apply_passes( - program, - [ - ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass(memory_planning_algo=mem_planning_suite), - ], - ) - - graph_builder = VkGraphBuilder( - program, - DelegateMappingBuilder(generated_identifiers=True), - downcast_64_bit=downcast_64_bit, - ) - vk_graph = graph_builder.build_graph() - - return PreprocessResult( - processed_bytes=serialize_vulkan_graph( - vk_graph, graph_builder.const_tensors, [] - ), - debug_handle_map=graph_builder.delegate_mapping_builder.get_delegate_mapping(), - data_store_output=graph_builder.named_data_store.get_named_data_store_output(), - ) diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py new file mode 120000 index 00000000000..a52006818c0 --- /dev/null +++ b/backends/vulkan/vulkan_preprocess.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/vulkan_preprocess.py \ No newline at end of file
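For reference, the options decoded by `parse_compile_spec` above arrive as little-endian byte strings. A minimal sketch of constructing matching compile specs on the export side is shown below; the diff does not show how these specs are actually built, so the helper and the chosen values are illustrative assumptions:

```
from executorch.backends.vulkan.serialization.vulkan_graph_schema import (
    VkMemoryLayout,
    VkStorageType,
)
from executorch.exir.backend.backend_details import CompileSpec


def u32_le(value: int) -> bytes:
    # parse_compile_spec decodes with int.from_bytes(..., byteorder="little")
    return int(value).to_bytes(4, byteorder="little")


compile_specs = [
    # Override the TEXTURE_3D / width-packed defaults used by preprocess().
    CompileSpec("storage_type_override", u32_le(VkStorageType.BUFFER.value)),
    CompileSpec("memory_layout_override", u32_le(VkMemoryLayout.TENSOR_WIDTH_PACKED.value)),
    # Optionally tighten the texture limits (defaults come from utils.py).
    CompileSpec("texture_limits_x", u32_le(4096)),
    CompileSpec("texture_limits_y", u32_le(4096)),
    CompileSpec("texture_limits_z", u32_le(2048)),
    # Booleans are decoded with bool.from_bytes, so a single byte suffices.
    CompileSpec("downcast_64_bit", (True).to_bytes(1, byteorder="little")),
]
```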