diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index aa7be5dfb68..177424ba347 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -861,7 +861,7 @@ jobs: PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh # Run aot examples: - PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh cifar10 + PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh cifar10 PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh mobilenetv2 @@ -901,6 +901,39 @@ jobs: done + test-vulkan-operators-linux: + name: test-vulkan-operators-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.2xlarge + docker-image: ci-image:executorch-ubuntu-22.04-clang12 + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate + source .ci/scripts/setup-vulkan-linux-deps.sh + + # Setup python + PYTHON_EXECUTABLE=python \ + CMAKE_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON" \ + .ci/scripts/setup-linux.sh --build-tool "cmake" + + # Build operator tests, but don't run yet since some test may be failing with + # swiftshader. + # TODO: enable these tests in CI once validated. + PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build + + PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh add nxp-build-test: name: nxp-build-test diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt new file mode 100644 index 00000000000..078a37d069b --- /dev/null +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) +project(prototyping_shaders) + +if(ANDROID) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) +endif() + +find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) + +# Only build if Vulkan was compiled +if(TARGET vulkan_backend) + if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) + endif() + + if(NOT PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) + endif() + + # Include this file to access executorch_target_link_options_shared_lib + include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + include(${EXECUTORCH_ROOT}/backends/vulkan/cmake/ShaderLibrary.cmake) + + # Third party include paths + set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party) + set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) + set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) + set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) + + set(COMMON_INCLUDES ${EXECUTORCH_ROOT}/.. ${VULKAN_HEADERS_PATH} ${VOLK_PATH} + ${VMA_PATH} ${CMAKE_CURRENT_SOURCE_DIR} + ) + + # Prototyping utility files + set(PROTOTYPING_UTILS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}) + set(PROTOTYPING_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp) + + # Prototyping shaders + set(PROTOTYPING_SHADERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/glsl) + gen_vulkan_shader_lib_cpp(${PROTOTYPING_SHADERS_PATH}) + vulkan_shader_lib(prototyping_shaderlib ${generated_spv_cpp}) + + # Operator implementations library + file(GLOB OPERATOR_IMPL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) + add_library(operator_implementations STATIC ${OPERATOR_IMPL_SOURCES}) + target_include_directories( + operator_implementations PRIVATE ${COMMON_INCLUDES} + ) + target_link_libraries( + operator_implementations PRIVATE vulkan_backend executorch_core + prototyping_shaderlib + ) + target_compile_options(operator_implementations PRIVATE ${VULKAN_CXX_FLAGS}) + set_property(TARGET operator_implementations PROPERTY CXX_STANDARD 17) + + executorch_target_link_options_shared_lib(vulkan_backend) + executorch_target_link_options_shared_lib(operator_implementations) + + # Function to create operator prototype binaries + function(add_operator_prototype OPERATOR_NAME) + set(TARGET_NAME ${OPERATOR_NAME}) + set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${OPERATOR_NAME}.cpp) + + add_executable(${TARGET_NAME} ${SOURCE_FILE} ${PROTOTYPING_UTILS_CPP}) + target_include_directories(${TARGET_NAME} PRIVATE ${COMMON_INCLUDES}) + target_link_libraries( + ${TARGET_NAME} PRIVATE vulkan_backend executorch_core + prototyping_shaderlib operator_implementations + ) + target_compile_options(${TARGET_NAME} PRIVATE ${VULKAN_CXX_FLAGS}) + set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 17) + endfunction() + + # Define operator prototypes + add_operator_prototype(add) +endif() diff --git a/backends/vulkan/test/custom_ops/add.cpp b/backends/vulkan/test/custom_ops/add.cpp new file mode 100644 index 00000000000..bc20246a7d1 --- /dev/null +++ b/backends/vulkan/test/custom_ops/add.cpp @@ -0,0 +1,165 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +// Generate test cases for add operation +std::vector generate_add_test_cases() { + std::vector test_cases; + + // Set the data generation type as a local variable + DataGenType data_gen_type = DataGenType::ONES; + + // Define different input size configurations + std::vector> size_configs = { + {1, 64, 64}, // Small square + {1, 128, 128}, // Medium square + {1, 256, 256}, // Large square + {1, 512, 512}, // Very large square + {1, 1, 1024}, // Wide tensor + {1, 1024, 1}, // Tall tensor + {32, 32, 32}, // 3D cube + {16, 128, 64}, // 3D rectangular + }; + + // Storage types to test + std::vector storage_types = { + utils::kTexture3D, utils::kBuffer}; + + // Data types to test + std::vector data_types = {vkapi::kFloat, vkapi::kHalf}; + + // Generate test cases for each combination + for (const auto& sizes : size_configs) { + for (const auto& storage_type : storage_types) { + for (const auto& data_type : data_types) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string size_str = ""; + for (size_t i = 0; i < sizes.size(); ++i) { + size_str += std::to_string(sizes[i]); + if (i < sizes.size() - 1) + size_str += "x"; + } + + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (data_type == vkapi::kFloat) ? "Float" : "Half"; + + // Add data generation type to the name for clarity + std::string test_name = + "Add_" + size_str + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + test_case.set_operator_name("etvk.add_prototype"); + + // Add two input tensors with the same size, type, storage, and data + // generation method + ValueSpec input_a( + sizes, data_type, storage_type, utils::kWidthPacked, data_gen_type); + ValueSpec input_b( + sizes, data_type, storage_type, utils::kWidthPacked, data_gen_type); + + // Add output tensor with the same size, type, and storage as inputs + // (output uses ZEROS by default) + ValueSpec output( + sizes, + data_type, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + + test_case.add_input_spec(input_a); + test_case.add_input_spec(input_b); + test_case.add_output_spec(output); + + test_cases.push_back(test_case); + } + } + } + + return test_cases; +} + +// Custom FLOP calculator for add operation +// Add operation performs 1 FLOP (addition) per element +int64_t add_flop_calculator(const TestCase& test_case) { + // Calculate total elements from the first input tensor + int64_t total_elements = 1; + if (!test_case.empty() && test_case.num_inputs() > 0 && + test_case.inputs()[0].is_tensor()) { + const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); + for (int64_t size : sizes) { + total_elements *= size; + } + } + + // Add operation: 1 FLOP per element (one addition) + return total_elements; +} + +// Reference implementation for add operator +void add_reference_compute(TestCase& test_case) { + const ValueSpec& input_a = test_case.inputs().at(0); + const ValueSpec& input_b = test_case.inputs().at(1); + + ValueSpec& output = test_case.outputs().at(0); + + if (input_a.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Calculate number of elements + int64_t num_elements = input_a.numel(); + + auto& input_a_data = input_a.get_float_data(); + auto& input_b_data = input_b.get_float_data(); + + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(num_elements); + for (int64_t i = 0; i < num_elements; ++i) { + ref_data[i] = input_a_data[i] + input_b_data[i]; + } +} + +int main(int argc, char* argv[]) { + set_print_output(false); // Disable output tensor printing + set_print_latencies(false); // Enable latency timing printing + set_use_gpu_timestamps(true); // Enable GPU timestamps + + print_performance_header(); + std::cout << "Add Operation Prototyping Framework" << std::endl; + print_separator(); + + // Initialize Vulkan context + try { + api::context()->initialize_querypool(); + } catch (const std::exception& e) { + std::cerr << "Failed to initialize Vulkan context: " << e.what() + << std::endl; + return 1; + } + + // Execute test cases using the new framework with custom FLOP calculator and + // reference compute + auto results = execute_test_cases( + generate_add_test_cases, + add_flop_calculator, + "Add", + 3, + 10, + add_reference_compute); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/build_and_run.sh b/backends/vulkan/test/custom_ops/build_and_run.sh new file mode 100755 index 00000000000..2b9ce576e0e --- /dev/null +++ b/backends/vulkan/test/custom_ops/build_and_run.sh @@ -0,0 +1,177 @@ +#!/bin/zsh + +set -eux + +# Check that we're in the executorch directory +current_dir=$(pwd) +if [[ ! "$current_dir" =~ executorch$ ]]; then + echo "Error: This script must be run from a directory ending in 'executorch'" + echo "Current directory: $current_dir" + exit 1 +fi + +# Function to configure and build main project +configure_and_build_main() { + local android_args="" + if [[ "$ANDROID_MODE" == "true" ]]; then + cmake . \ + -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-28 \ + -DGLSLC_PATH=$(which glslc) \ + -B$CMAKE_OUT_DIR + else + cmake . \ + -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DGLSLC_PATH=$(which glslc) \ + -B$CMAKE_OUT_DIR + fi + + cmake --build $CMAKE_OUT_DIR -j16 --target install + # -DCMAKE_CXX_FLAGS="-DVULKAN_DEBUG" \ +} + +# Function to build main project only +build_main() { + cmake --build $CMAKE_OUT_DIR -j16 --target install +} + +# Function to configure and build tests +configure_and_build_tests() { + # Check if glslc is installed + if ! command -v glslc >/dev/null 2>&1; then + echo "Error: glslc is not installed or not found in PATH." + exit 1 + fi + + local android_args="" + if [[ "$ANDROID_MODE" == "true" ]]; then + cmake backends/vulkan/test/custom_ops/ \ + -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-28 \ + -DGLSLC_PATH=$(which glslc) \ + -B$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops + else + cmake backends/vulkan/test/custom_ops/ \ + -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ + -DCMAKE_BUILD_TYPE=Debug \ + -DGLSLC_PATH=$(which glslc) \ + -B$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops + fi + + cmake --build $CMAKE_OUT_DIR/backends/vulkan/test/custom_ops -j16 --target all + +} + +build_tests() { + cmake --build $CMAKE_OUT_DIR/backends/vulkan/test/custom_ops -j16 --target all +} + +# Function to rebuild both main and tests +rebuild_both() { + build_main + build_tests +} + +# Function to clean and rebuild everything +clean_and_rebuild() { + rm -rf $CMAKE_OUT_DIR + configure_and_build_main + configure_and_build_tests +} + +# Function to execute binary if specified +execute_binary() { + local binary_name="$1" + if [[ -n "$binary_name" ]]; then + local binary_path="$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops/$binary_name" + echo "Executing binary: $binary_path" + + if [[ "$ANDROID_MODE" == "true" ]]; then + if [[ -f "$binary_path" ]]; then + echo "Pushing binary to Android device..." + adb push "$binary_path" /data/local/tmp/ + echo "Executing binary on Android device..." + adb shell "cd /data/local/tmp && ./$binary_name" + else + echo "Error: Binary '$binary_path' not found" + exit 1 + fi + else + if [[ -f "$binary_path" && -x "$binary_path" ]]; then + "$binary_path" + else + echo "Error: Binary '$binary_path' not found or not executable" + exit 1 + fi + fi + fi +} + +# Parse command line arguments +BINARY_TO_EXECUTE="" +ANDROID_MODE=false +CMAKE_OUT_DIR="cmake-out" + +# Check for --android flag and adjust arguments accordingly +if [[ "$1" == "--android" ]]; then + ANDROID_MODE=true + CMAKE_OUT_DIR="cmake-android-out" + shift # Remove --android from arguments + echo "Android mode enabled. Using $CMAKE_OUT_DIR as build directory." +fi + +case "${1:-}" in + --rebuild|-r) + echo "Rebuilding both main project and tests..." + BINARY_TO_EXECUTE="${2:-}" + rebuild_both + execute_binary "$BINARY_TO_EXECUTE" + ;; + --rebuild1|-r1) + echo "Rebuilding main project only..." + BINARY_TO_EXECUTE="${2:-}" + build_main + execute_binary "$BINARY_TO_EXECUTE" + ;; + --rebuild2|-r2) + echo "Rebuilding tests only..." + BINARY_TO_EXECUTE="${2:-}" + build_tests + execute_binary "$BINARY_TO_EXECUTE" + ;; + --clean|-c) + echo "WARNING: This will delete the entire $CMAKE_OUT_DIR directory and rebuild everything." + echo -n "Are you sure you want to continue? (y/N): " + read -r response + if [[ "$response" =~ ^[Yy]$ ]]; then + echo "Cleaning and rebuilding everything..." + BINARY_TO_EXECUTE="${2:-}" + clean_and_rebuild + execute_binary "$BINARY_TO_EXECUTE" + else + echo "Clean operation cancelled." + exit 0 + fi + ;; + "") + echo "Running full configure and build..." + configure_and_build_main + configure_and_build_tests + ;; + *) + # If first argument doesn't match any build option, treat it as binary name + # and use default build behavior + echo "Running full configure and build..." + BINARY_TO_EXECUTE="$1" + configure_and_build_main + configure_and_build_tests + execute_binary "$BINARY_TO_EXECUTE" + ;; +esac diff --git a/backends/vulkan/test/custom_ops/glsl/add.yaml b/backends/vulkan/test/custom_ops/glsl/add.yaml new file mode 100644 index 00000000000..dd479cafd31 --- /dev/null +++ b/backends/vulkan/test/custom_ops/glsl/add.yaml @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +add_buffer: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: add_buffer + +add_texture: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: add_texture3d diff --git a/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl b/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl new file mode 100644 index 00000000000..8a0ddc4dba7 --- /dev/null +++ b/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_other", DTYPE, "buffer")} + +layout(push_constant) uniform restrict Block { + int out_numel; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= out_numel) { + return; + } + + // Simple addition without broadcasting + t_out[out_bufi] = t_in[out_bufi] + t_other[out_bufi]; +} \ No newline at end of file diff --git a/backends/vulkan/test/custom_ops/glsl/add_texture.glsl b/backends/vulkan/test/custom_ops/glsl/add_texture.glsl new file mode 100644 index 00000000000..f64c8e25d71 --- /dev/null +++ b/backends/vulkan/test/custom_ops/glsl/add_texture.glsl @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +${define_active_storage_type("texture3d")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_other", DTYPE, "texture3d")} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "out_layout", "0")} + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + // Simple addition without broadcasting - same position for all tensors + VEC4_T in_texel = texelFetch(t_in, pos, 0); + VEC4_T other_texel = texelFetch(t_other, pos, 0); + + imageStore(t_out, pos, in_texel + other_texel); +} diff --git a/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl b/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl new file mode 100644 index 00000000000..f821fa3586f --- /dev/null +++ b/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type("texture3d")} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", "float", "texture3d")} +${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} + +${layout_declare_ubo(B, "ivec3", "out_limits")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(lpos, out_limits))) { + return; + } + + // Placeholder: just copy input to output + vec4 in_texel = vec4(1.0f); + imageStore(t_out, lpos, in_texel); +} diff --git a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl new file mode 100644 index 00000000000..c1d90fadf7e --- /dev/null +++ b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type("texture3d")} + +#extension GL_EXT_debug_printf : enable + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", "int", "texture3d")} +${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} + +${layout_declare_ubo(B, "ivec3", "out_limits")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(lpos, out_limits))) { + return; + } + + // Pack four 8-bit values equal to 1 into a single uint + int packed = (1 << 0) | (1 << 8) | (1 << 16) | (1 << 24); + + debugPrintfEXT( + "t_out[%i, %i] = %i\\n", + lpos.x, lpos.y, + packed); + + + // Placeholder: just copy input to output + ivec4 in_texel = ivec4(packed); + imageStore(t_out, lpos, in_texel); +} diff --git a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl new file mode 100644 index 00000000000..be6717efdaa --- /dev/null +++ b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type("texture2d")} + +#extension GL_EXT_debug_printf : enable + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", "int", "texture3d")} +${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} + +${layout_declare_ubo(B, "ivec3", "out_limits")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(lpos, out_limits))) { + return; + } + + // Pack four 8-bit values equal to 1 into a single uint + int packed = (1 << 0) | (1 << 8) | (1 << 16) | (1 << 24); + + debugPrintfEXT( + "t_out[%i, %i] = %i\\n", + lpos.x, lpos.y, + packed); + + + // Placeholder: just copy input to output + ivec4 in_texel = ivec4(packed); + imageStore(t_out, lpos, in_texel); +} diff --git a/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp b/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp new file mode 100644 index 00000000000..dc35153baf0 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace vkcompute { + +// Shader selection function for add operations +vkapi::ShaderInfo pick_add_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const ValueRef in1 = args.at(1).refs.at(0); + + // Build shader name following the binary_op pattern + std::string kernel_name = "add"; + add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); + add_dtype_suffix(kernel_name, graph->dtype_of(in1)); + + return VK_KERNEL_FROM_STR(kernel_name); +} + +// Global workgroup size function for add operations +utils::uvec3 add_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + return default_pick_global_wg_size(graph, shader, args, resize_args); +} + +// Local workgroup size function for add operations +utils::uvec3 add_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + return default_pick_local_wg_size( + graph, shader, global_workgroup_size, args, resize_args); +} + +void add_prototype(ComputeGraph& graph, const std::vector& args) { + int idx = 0; + const ValueRef input_a = args.at(idx++); + const ValueRef input_b = args.at(idx++); + const ValueRef output = args.at(idx++); + + // Prepare parameter buffers (empty for add operation) + vkapi::ParamsBindList param_buffers; + + // Prepare push constants based on storage type + std::vector push_constants; + push_constants.reserve(graph.is_buffer_storage(output) ? 1 : 1); + + if (graph.is_buffer_storage(output)) { + // Buffer storage: pass numel as push constant + push_constants.emplace_back(graph.numel_pc_of(output)); + } else { + // Texture storage: pass sizes as push constant + push_constants.emplace_back(graph.sizes_pc_of(output)); + } + + // Prepare specialization constants + vkapi::SpecVarList spec_vars; + if (graph.is_buffer_storage(output)) { + spec_vars = { + graph.hashed_layout_of(output), + graph.hashed_layout_of(input_a), + graph.hashed_layout_of(input_b)}; + } else { + spec_vars = {graph.hashed_layout_of(output)}; + } + + // Add the compute node + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_add_shader, + add_global_wg_size, + add_local_wg_size, + // Inputs and Outputs + {{output, vkapi::kWrite}, {{input_a, input_b}, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_vars, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.add_prototype, add_prototype); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp new file mode 100644 index 00000000000..c6fedf3c6e1 --- /dev/null +++ b/backends/vulkan/test/custom_ops/utils.cpp @@ -0,0 +1,1622 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include "utils.h" +#include +#include +#include +#include + +#include + +namespace executorch { +namespace vulkan { +namespace prototyping { + +int get_seed() { + static int seed = 42; + return seed++; +} + +// Forward declarations for data generation utilities +void generate_random_float_data( + std::vector& data, + float min_val = -1.0f, + float max_val = 1.0f); +void generate_random_int_data( + std::vector& data, + int min_val = -10, + int max_val = 10); +void generate_randint_float_data( + std::vector& data, + int min_val = -10, + int max_val = 10); +void generate_randint_half_data( + std::vector& data, + int min_val = -10, + int max_val = 10); +void generate_random_int8_data( + std::vector& data, + int8_t min_val = -10, + int8_t max_val = 10); +void generate_random_uint8_data( + std::vector& data, + uint8_t min_val = 0, + uint8_t max_val = 255); +void generate_random_int4_data( + std::vector& data, + int8_t min_val = -8, + int8_t max_val = 7); +void generate_ones_data(std::vector& data); +void generate_zeros_data(std::vector& data); + +// Output and latency printing utilities +namespace { +static int print_output_enabled = 0; +static int print_latencies_enabled = 0; +static int gpu_timestamps_enabled = 0; +static int debugging_enabled = 0; +} // namespace + +bool print_output() { + return print_output_enabled > 0; +} + +void set_print_output(bool print_output) { + print_output_enabled = print_output ? 1 : 0; +} + +bool print_latencies() { + return print_latencies_enabled > 0; +} + +void set_print_latencies(bool print_latencies) { + print_latencies_enabled = print_latencies ? 1 : 0; +} + +bool use_gpu_timestamps() { + return gpu_timestamps_enabled > 0; +} + +void set_use_gpu_timestamps(bool use_timestamps) { + gpu_timestamps_enabled = use_timestamps ? 1 : 0; +} + +bool debugging() { + return debugging_enabled > 0; +} + +void set_debugging(bool enable_debugging) { + debugging_enabled = enable_debugging ? 1 : 0; +} + +// ValueSpec implementation +void ValueSpec::generate_tensor_data() { + if (spec_type != SpecType::Tensor) { + return; + } + + int64_t num_elements = numel(); + + switch (dtype) { + case vkapi::kFloat: { + float_data.resize(num_elements); + if (data_gen_type == DataGenType::RANDOM) { + generate_random_float_data(float_data); + } else if (data_gen_type == DataGenType::RANDOM_SCALES) { + generate_random_float_data(float_data, 0.005, 0.015); + } else if (data_gen_type == DataGenType::RANDINT) { + generate_randint_float_data(float_data); + } else if (data_gen_type == DataGenType::RANDINT8) { + generate_randint_float_data(float_data, -128, 127); + } else if (data_gen_type == DataGenType::RANDINT4) { + generate_randint_float_data(float_data, -8, 7); + } else if (data_gen_type == DataGenType::ONES) { + generate_ones_data(float_data); + } else if (data_gen_type == DataGenType::ZEROS) { + generate_zeros_data(float_data); + } else { + generate_zeros_data(float_data); + } + break; + } + case vkapi::kHalf: { + half_data.resize(num_elements); + if (data_gen_type == DataGenType::RANDOM) { + // Generate random float data first, then convert to half + std::vector temp_data(num_elements); + generate_random_float_data(temp_data); + for (size_t i = 0; i < temp_data.size(); ++i) { + // Simple conversion to uint16_t representation of half + half_data[i] = static_cast(temp_data[i] * 32767.0f); + } + } else if (data_gen_type == DataGenType::RANDINT) { + generate_randint_half_data(half_data); + } else if (data_gen_type == DataGenType::RANDINT8) { + generate_randint_half_data(half_data, -128, 127); + } else if (data_gen_type == DataGenType::RANDINT4) { + generate_randint_half_data(half_data, -8, 7); + } else if (data_gen_type == DataGenType::ONES) { + std::fill( + half_data.begin(), + half_data.end(), + static_cast(32767)); // 1.0 in half + } else if (data_gen_type == DataGenType::ZEROS) { + std::fill( + half_data.begin(), + half_data.end(), + static_cast(0)); // 0.0 in half + } else { + std::fill( + half_data.begin(), + half_data.end(), + static_cast(0)); // 0.0 in half + } + break; + } + case vkapi::kInt: { + int32_data.resize(num_elements); + if (data_gen_type == DataGenType::RANDOM) { + generate_random_int_data(int32_data); + } else if (data_gen_type == DataGenType::RANDINT) { + generate_random_int_data( + int32_data); // For int type, RANDINT is same as RANDOM + } else if (data_gen_type == DataGenType::RANDINT8) { + generate_random_int_data(int32_data, -128, 127); + } else if (data_gen_type == DataGenType::RANDINT4) { + generate_random_int_data(int32_data, -8, 7); + } else if (data_gen_type == DataGenType::ONES) { + std::fill(int32_data.begin(), int32_data.end(), 1); + } else if (data_gen_type == DataGenType::ZEROS) { + std::fill(int32_data.begin(), int32_data.end(), 0); + } else { + std::fill(int32_data.begin(), int32_data.end(), 0); + } + break; + } + case vkapi::kChar: { + int8_data.resize(num_elements); + if (data_gen_type == DataGenType::RANDOM) { + generate_random_int8_data(int8_data); + } else if (data_gen_type == DataGenType::RANDINT) { + generate_random_int8_data(int8_data); + } else if (data_gen_type == DataGenType::RANDINT8) { + generate_random_int8_data(int8_data, -128, 127); + } else if (data_gen_type == DataGenType::RANDINT4) { + generate_random_int4_data(int8_data); + } else if (data_gen_type == DataGenType::ONES) { + std::fill(int8_data.begin(), int8_data.end(), 1); + } else if (data_gen_type == DataGenType::ZEROS) { + std::fill(int8_data.begin(), int8_data.end(), 0); + } else { + std::fill(int8_data.begin(), int8_data.end(), 0); + } + break; + } + case vkapi::kByte: { + uint8_data.resize(num_elements); + if (data_gen_type == DataGenType::RANDOM) { + generate_random_uint8_data(uint8_data); + } else if (data_gen_type == DataGenType::RANDINT) { + generate_random_uint8_data(uint8_data); + } else if (data_gen_type == DataGenType::RANDINT8) { + generate_random_uint8_data(uint8_data, 0, 255); + } else if (data_gen_type == DataGenType::RANDINT4) { + generate_random_uint8_data(uint8_data, 0, 15); + } else if (data_gen_type == DataGenType::ONES) { + std::fill(uint8_data.begin(), uint8_data.end(), 1); + } else if (data_gen_type == DataGenType::ZEROS) { + std::fill(uint8_data.begin(), uint8_data.end(), 0); + } else { + std::fill(uint8_data.begin(), uint8_data.end(), 0); + } + break; + } + default: + // Default to float + float_data.resize(num_elements); + if (data_gen_type == DataGenType::RANDOM) { + generate_random_float_data(float_data); + } else if (data_gen_type == DataGenType::RANDINT) { + generate_randint_float_data(float_data); + } else if (data_gen_type == DataGenType::ONES) { + generate_ones_data(float_data); + } else if (data_gen_type == DataGenType::ZEROS) { + generate_zeros_data(float_data); + } else { + generate_zeros_data(float_data); + } + break; + } +} + +int64_t ValueSpec::numel() const { + if (spec_type == SpecType::Int || spec_type == SpecType::Float || + spec_type == SpecType::Bool) { + return 1; + } else if (spec_type == SpecType::IntList) { + return sizes.empty() ? 0 : sizes[0]; + } else { // Tensor + int64_t total = 1; + for (int64_t size : sizes) { + total *= size; + } + return total; + } +} + +size_t ValueSpec::nbytes() const { + size_t element_size = 0; + switch (dtype) { + case vkapi::kFloat: + element_size = sizeof(float); + break; + case vkapi::kHalf: + element_size = sizeof(uint16_t); + break; + case vkapi::kInt: + element_size = sizeof(int32_t); + break; + case vkapi::kChar: + element_size = sizeof(int8_t); + break; + case vkapi::kByte: + element_size = sizeof(uint8_t); + break; + default: + element_size = sizeof(float); // Default fallback + break; + } + return numel() * element_size; +} + +std::string ValueSpec::to_string() const { + std::string result = "ValueSpec("; + + switch (spec_type) { + case SpecType::Tensor: + result += "type=Tensor, sizes=["; + break; + case SpecType::IntList: + result += "type=IntList, count="; + result += std::to_string(sizes.empty() ? 0 : sizes[0]); + result += ", data_gen="; + result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; + result += ")"; + return result; + case SpecType::Int: + result += "type=Int, value="; + result += std::to_string(get_int_value()); + result += ", data_gen="; + result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; + result += ")"; + return result; + case SpecType::Float: + result += "type=Float, value="; + result += std::to_string(get_float_value()); + result += ", data_gen="; + result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; + result += ")"; + return result; + case SpecType::Bool: + result += "type=Bool, value="; + result += get_bool_value() ? "true" : "false"; + result += ", data_gen="; + result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; + result += ")"; + return result; + } + + for (size_t i = 0; i < sizes.size(); ++i) { + result += std::to_string(sizes[i]); + if (i < sizes.size() - 1) + result += ", "; + } + result += "]"; + + if (spec_type == SpecType::Tensor) { + result += ", dtype="; + switch (dtype) { + case vkapi::kFloat: + result += "float"; + break; + case vkapi::kHalf: + result += "half"; + break; + case vkapi::kInt: + result += "int32"; + break; + case vkapi::kChar: + result += "int8"; + break; + case vkapi::kByte: + result += "uint8"; + break; + default: + result += "unknown"; + break; + } + + result += ", memory_layout="; + switch (memory_layout) { + case utils::kWidthPacked: + result += "WidthPacked"; + break; + case utils::kHeightPacked: + result += "HeightPacked"; + break; + case utils::kChannelsPacked: + result += "ChannelsPacked"; + break; + default: + result += "unknown"; + break; + } + + result += ", storage_type="; + switch (storage_type) { + case utils::kTexture3D: + result += "Texture3D"; + break; + case utils::kBuffer: + result += "Buffer"; + break; + default: + result += "unknown"; + break; + } + } + + result += ", data_gen="; + switch (data_gen_type) { + case DataGenType::FIXED: + result += "FIXED"; + break; + case DataGenType::RANDOM: + result += "RANDOM"; + break; + case DataGenType::RANDINT: + result += "RANDINT"; + break; + case DataGenType::RANDINT8: + result += "RANDINT8"; + break; + case DataGenType::RANDINT4: + result += "RANDINT4"; + break; + case DataGenType::ONES: + result += "ONES"; + break; + case DataGenType::ZEROS: + result += "ZEROS"; + break; + default: + result += "unknown"; + break; + } + result += ")"; + return result; +} + +// Additional ValueSpec methods +void ValueSpec::resize_data(size_t new_size) { + switch (dtype) { + case vkapi::kFloat: + float_data.resize(new_size); + break; + case vkapi::kHalf: + half_data.resize(new_size); + break; + case vkapi::kInt: + int32_data.resize(new_size); + break; + case vkapi::kChar: + int8_data.resize(new_size); + break; + case vkapi::kByte: + uint8_data.resize(new_size); + break; + default: + float_data.resize(new_size); + break; + } +} + +void* ValueSpec::get_mutable_data_ptr() { + switch (dtype) { + case vkapi::kFloat: + return float_data.data(); + case vkapi::kHalf: + return half_data.data(); + case vkapi::kInt: + return int32_data.data(); + case vkapi::kChar: + return int8_data.data(); + case vkapi::kByte: + return uint8_data.data(); + default: + return float_data.data(); + } +} + +float ValueSpec::get_element(size_t index) const { + if (index >= static_cast(numel())) { + return 0.0f; + } + + switch (dtype) { + case vkapi::kFloat: + return index < float_data.size() ? float_data[index] : 0.0f; + case vkapi::kHalf: + return index < half_data.size() ? (half_data[index] / 32767.0f) : 0.0f; + case vkapi::kInt: + return index < int32_data.size() ? static_cast(int32_data[index]) + : 0.0f; + case vkapi::kChar: + return index < int8_data.size() ? static_cast(int8_data[index]) + : 0.0f; + case vkapi::kByte: + return index < uint8_data.size() ? static_cast(uint8_data[index]) + : 0.0f; + default: + return 0.0f; + } +} + +const void* ValueSpec::get_data_ptr() const { + switch (dtype) { + case vkapi::kFloat: + return float_data.data(); + case vkapi::kHalf: + return half_data.data(); + case vkapi::kInt: + return int32_data.data(); + case vkapi::kChar: + return int8_data.data(); + case vkapi::kByte: + return uint8_data.data(); + default: + throw std::runtime_error("Unsupported data type for get_data_ptr"); + } +} + +void generate_random_float_data( + std::vector& data, + float min_val, + float max_val) { + std::mt19937 gen(get_seed()); + std::uniform_real_distribution dis(min_val, max_val); + for (auto& val : data) { + val = dis(gen); + } +} + +void generate_random_int_data( + std::vector& data, + int min_val, + int max_val) { + std::mt19937 gen(get_seed()); + std::uniform_int_distribution dis(min_val, max_val); + for (auto& val : data) { + val = dis(gen); + } +} + +void generate_randint_float_data( + std::vector& data, + int min_val, + int max_val) { + std::mt19937 gen(get_seed()); + std::uniform_int_distribution dis(min_val, max_val); + for (auto& val : data) { + val = static_cast(dis(gen)); + } +} + +void generate_randint_half_data( + std::vector& data, + int min_val, + int max_val) { + std::mt19937 gen(get_seed()); + std::uniform_int_distribution dis(min_val, max_val); + for (auto& val : data) { + val = static_cast(std::abs(dis(gen)) % 65536); + } +} + +void generate_ones_data(std::vector& data) { + std::fill(data.begin(), data.end(), 1.0f); +} + +void generate_random_int8_data( + std::vector& data, + int8_t min_val, + int8_t max_val) { + std::mt19937 gen(get_seed()); + std::uniform_int_distribution dis(min_val, max_val); + for (auto& val : data) { + val = static_cast(dis(gen)); + } +} + +void generate_random_uint8_data( + std::vector& data, + uint8_t min_val, + uint8_t max_val) { + std::mt19937 gen(get_seed()); + std::uniform_int_distribution dis(min_val, max_val); + for (auto& val : data) { + val = static_cast(dis(gen)); + } +} + +void generate_random_int4_data( + std::vector& data, + int8_t min_val, + int8_t max_val) { + std::mt19937 gen(get_seed()); + std::uniform_int_distribution dis(min_val, max_val); + for (auto& val : data) { + val = static_cast(dis(gen)); + } +} + +void generate_zeros_data(std::vector& data) { + std::fill(data.begin(), data.end(), 0.0f); +} + +// Correctness checking against reference data +bool ValueSpec::validate_against_reference( + float abs_tolerance, + float rel_tolerance) const { + // Only validate float tensors as specified in requirements + if (dtype != vkapi::kFloat || !is_tensor()) { + return true; // Skip validation for non-float or non-tensor types + } + + const auto& computed_data = get_float_data(); + const auto& reference_data = get_ref_float_data(); + + // Skip validation if no reference data is available + if (reference_data.empty()) { + return true; + } + + // Check if sizes match + if (computed_data.size() != reference_data.size()) { + if (debugging()) { + std::cout << "Size mismatch: computed=" << computed_data.size() + << ", reference=" << reference_data.size() << std::endl; + } + return false; + } + + // Element-wise comparison with both absolute and relative tolerance + for (size_t i = 0; i < computed_data.size(); ++i) { + float diff = std::abs(computed_data[i] - reference_data[i]); + float abs_ref = std::abs(reference_data[i]); + + // Check if either absolute or relative tolerance condition is satisfied + bool abs_tolerance_ok = diff <= abs_tolerance; + bool rel_tolerance_ok = diff <= rel_tolerance * abs_ref; + + if (!abs_tolerance_ok && !rel_tolerance_ok) { + std::cout << "Mismatch at element " << i + << ": computed=" << computed_data[i] + << ", reference=" << reference_data[i] << ", diff=" << diff + << ", abs_tolerance=" << abs_tolerance + << ", rel_tolerance=" << rel_tolerance + << ", rel_threshold=" << (rel_tolerance * abs_ref) << std::endl; + return false; + } + } + + if (debugging()) { + std::cout << "Correctness validation PASSED" << std::endl; + } + return true; +} + +// Helper function to collect GPU timing from querypool +float collect_gpu_timing_us(ComputeGraph& graph) { + graph.context()->querypool().extract_results(); + const auto results = graph.context()->querypool().get_shader_timestamp_data(); + if (!results.empty()) { + // Sum durations of all shaders that don't contain nchw_to or to_nchw + float total_duration_us = 0.0f; + for (const auto& shader_result : results) { + if (shader_result.kernel_name.find("nchw_to") == std::string::npos && + shader_result.kernel_name.find("to_nchw") == std::string::npos) { + // Calculate duration from start and end times, convert from ns to μs + uint64_t duration_ns = + shader_result.end_time_ns - shader_result.start_time_ns; + total_duration_us += static_cast(duration_ns) / 1000.0f; + } + } + return total_duration_us; + } + return 0.0f; +} + +// BenchmarkResult implementation +void BenchmarkResult::add_iter_timing(float time_us) { + iter_timings.push_back(time_us); +} + +float BenchmarkResult::get_avg_time_us() const { + if (iter_timings.empty()) { + return 0.0f; + } + + float sum = 0.0f; + for (float timing : iter_timings) { + sum += timing; + } + return sum / iter_timings.size(); +} + +float BenchmarkResult::get_min_time_us() const { + if (iter_timings.empty()) { + return 0.0f; + } + + return *std::min_element(iter_timings.begin(), iter_timings.end()); +} + +float BenchmarkResult::get_max_time_us() const { + if (iter_timings.empty()) { + return 0.0f; + } + + return *std::max_element(iter_timings.begin(), iter_timings.end()); +} + +float BenchmarkResult::get_std_dev_us() const { + if (iter_timings.size() <= 1) { + return 0.0f; + } + + float mean = get_avg_time_us(); + float sum_sq_diff = 0.0f; + + for (float timing : iter_timings) { + float diff = timing - mean; + sum_sq_diff += diff * diff; + } + + return std::sqrt(sum_sq_diff / (iter_timings.size() - 1)); +} + +void BenchmarkResult::print_summary( + int case_number, + const std::string& size_info, + float total_gflops) const { + static constexpr int KERNEL_NAME_WIDTH = 140; + static constexpr int SIZE_INFO_WIDTH = 20; + static constexpr int TIMING_WIDTH = 20; + static constexpr int GFLOPS_WIDTH = 20; + static constexpr int CORRECTNESS_WIDTH = 10; + + std::string correctness_str; + switch (correctness_status_) { + case CorrectnessStatus::SKIPPED: + correctness_str = "SKIPPED"; + break; + case CorrectnessStatus::PASSED: + correctness_str = "PASSED"; + break; + case CorrectnessStatus::FAIL: + correctness_str = "FAIL"; + break; + } + + std::cout << std::left << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() + << std::right << " " << std::setw(SIZE_INFO_WIDTH) << size_info + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) + << std::fixed << std::setprecision(3) << total_gflops << " GFLOP/s " + << std::setw(CORRECTNESS_WIDTH) << correctness_str << std::endl; +} + +// TestResult implementation +void TestResult::add_result(const BenchmarkResult& result) { + results_.push_back(result); +} + +void TestResult::add_result(BenchmarkResult&& result) { + results_.push_back(std::move(result)); +} + +void TestResult::print_summary() const { + static constexpr int CASE_WIDTH = 80; + static constexpr int KERNEL_NAME_WIDTH = 20; + static constexpr int TIMING_WIDTH = 12; + static constexpr int PASS_WIDTH = 8; + + if (results_.empty()) { + std::cout << "No results to display" << std::endl; + return; + } + + std::cout << "\n=== " << operation_name_ + << " Performance Summary ===" << std::endl; + print_separator(); + + std::cout << std::left << std::setw(CASE_WIDTH) << "Case" << std::left + << std::setw(KERNEL_NAME_WIDTH) << "Kernel Name" << std::left + << std::setw(TIMING_WIDTH) << "Avg (μs)" << std::left + << std::setw(TIMING_WIDTH) << "Min (μs)" << std::left + << std::setw(TIMING_WIDTH) << "Max (μs)" << std::left + << std::setw(TIMING_WIDTH) << "Std Dev" << std::left + << std::setw(PASS_WIDTH) << "Pass" << std::endl; + print_separator(); + + for (size_t i = 0; i < results_.size(); ++i) { + const auto& result = results_[i]; + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + std::cout << std::left << std::setw(CASE_WIDTH) << i + 1 << std::left + << std::setw(KERNEL_NAME_WIDTH) + << result.get_kernel_name().substr(0, KERNEL_NAME_WIDTH - 1) + << std::left << std::setw(TIMING_WIDTH) << std::fixed + << std::setprecision(3) << result.get_avg_time_us() << std::left + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << result.get_min_time_us() << std::left + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << result.get_max_time_us() << std::left + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << result.get_std_dev_us() << std::left << std::setw(PASS_WIDTH) + << (vulkan_execute_succeeded ? "✓" : "✗") << std::endl; + } + + print_separator(); + std::cout << "Total cases: " << results_.size() + << ", Passed: " << get_passed_count() + << ", Failed: " << get_failed_count() << std::endl; + std::cout << "Overall GFLOP/s: " << std::fixed << std::setprecision(3) + << gflops_ << std::endl; + std::cout << "Overall correctness: " + << (correctness_passed_ ? "PASSED" : "FAILED") << std::endl; +} + +void TestResult::print_detailed_results() const { + if (results_.empty()) { + std::cout << "No results to display" << std::endl; + return; + } + + std::cout << "\n=== " << operation_name_ + << " Detailed Results ===" << std::endl; + + for (size_t i = 0; i < results_.size(); ++i) { + const auto& result = results_[i]; + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + std::cout << "\nCase " << i + 1 << ": " << result.get_kernel_name() + << std::endl; + std::cout << " Iterations: " << result.get_num_iterations() << std::endl; + std::cout << " Average: " << std::fixed << std::setprecision(3) + << result.get_avg_time_us() << " μs" << std::endl; + std::cout << " Min: " << std::fixed << std::setprecision(3) + << result.get_min_time_us() << " μs" << std::endl; + std::cout << " Max: " << std::fixed << std::setprecision(3) + << result.get_max_time_us() << " μs" << std::endl; + std::cout << " Std Dev: " << std::fixed << std::setprecision(3) + << result.get_std_dev_us() << " μs" << std::endl; + std::cout << " Correctness: " + << (vulkan_execute_succeeded ? "PASSED" : "FAILED") << std::endl; + + if (result.get_num_iterations() > 0) { + std::cout << " Individual timings (μs): "; + const auto& timings = result.get_iter_timings(); + for (size_t j = 0; j < std::min(size_t(10), timings.size()); ++j) { + std::cout << std::fixed << std::setprecision(1) << timings[j]; + if (j < std::min(size_t(10), timings.size()) - 1) + std::cout << ", "; + } + if (timings.size() > 10) { + std::cout << " ... (" << (timings.size() - 10) << " more)"; + } + std::cout << std::endl; + } + } + + std::cout << "\nOverall Results:" << std::endl; + std::cout << " Total GFLOP/s: " << std::fixed << std::setprecision(3) + << gflops_ << std::endl; + std::cout << " Overall correctness: " + << (correctness_passed_ ? "PASSED" : "FAILED") << std::endl; +} + +void TestResult::print_statistics() const { + if (results_.empty()) { + std::cout << "No results to display statistics for" << std::endl; + return; + } + + std::cout << "\n=== " << operation_name_ << " Statistics ===" << std::endl; + std::cout << "Total test cases: " << results_.size() << std::endl; + std::cout << "Passed: " << get_passed_count() << std::endl; + std::cout << "Failed: " << get_failed_count() << std::endl; + std::cout << "Success rate: " << std::fixed << std::setprecision(1) + << (100.0f * get_passed_count() / results_.size()) << "%" + << std::endl; + + if (get_passed_count() > 0) { + std::cout << "Total average time: " << std::fixed << std::setprecision(3) + << get_total_avg_time_us() << " μs" << std::endl; + std::cout << "Total GFLOP/s: " << std::fixed << std::setprecision(3) + << get_total_gflops() << std::endl; + + const auto* fastest = get_fastest_result(); + const auto* slowest = get_slowest_result(); + const auto* highest_gflops = get_highest_gflops_result(); + + if (fastest) { + std::cout << "Fastest case: " << fastest->get_kernel_name() << " (" + << std::fixed << std::setprecision(3) + << fastest->get_avg_time_us() << " μs)" << std::endl; + } + + if (slowest) { + std::cout << "Slowest case: " << slowest->get_kernel_name() << " (" + << std::fixed << std::setprecision(3) + << slowest->get_avg_time_us() << " μs)" << std::endl; + } + + if (highest_gflops) { + std::cout << "Best performing case: " << highest_gflops->get_kernel_name() + << " (" << std::fixed << std::setprecision(3) + << highest_gflops->get_avg_time_us() << " μs)" << std::endl; + } + } +} + +void TestResult::print_brief_summary() const { + print_separator(); + std::cout << "Summary Statistics:" << std::endl; + + if (get_passed_count() > 0) { + std::cout << "Average execution time: " << std::fixed + << std::setprecision(3) << get_total_avg_time_us() << " μs" + << std::endl; + std::cout << "Total throughput: " << std::fixed << std::setprecision(3) + << get_gflops() << " GFLOP/s" << std::endl; + std::cout << "Successful test cases: " << get_passed_count() << "/" + << size() << std::endl; + std::cout << "Overall correctness: " + << (get_correctness_passed() ? "PASSED" : "FAILED") << std::endl; + } else { + std::cout << "No successful test cases to report" << std::endl; + } +} + +float TestResult::get_total_avg_time_us() const { + if (results_.empty()) { + return 0.0f; + } + + float sum = 0.0f; + size_t count = 0; + + for (const auto& result : results_) { + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + if (vulkan_execute_succeeded) { + sum += result.get_avg_time_us(); + count++; + } + } + + return count > 0 ? sum / count : 0.0f; +} + +float TestResult::get_total_gflops() const { + return gflops_; +} + +size_t TestResult::get_passed_count() const { + size_t count = 0; + for (const auto& result : results_) { + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + if (vulkan_execute_succeeded) { + count++; + } + } + return count; +} + +size_t TestResult::get_failed_count() const { + return results_.size() - get_passed_count(); +} + +const BenchmarkResult* TestResult::get_fastest_result() const { + const BenchmarkResult* fastest = nullptr; + + for (const auto& result : results_) { + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + if (vulkan_execute_succeeded) { + if (!fastest || result.get_avg_time_us() < fastest->get_avg_time_us()) { + fastest = &result; + } + } + } + + return fastest; +} + +const BenchmarkResult* TestResult::get_slowest_result() const { + const BenchmarkResult* slowest = nullptr; + + for (const auto& result : results_) { + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + if (vulkan_execute_succeeded) { + if (!slowest || result.get_avg_time_us() > slowest->get_avg_time_us()) { + slowest = &result; + } + } + } + + return slowest; +} + +const BenchmarkResult* TestResult::get_highest_gflops_result() const { + // Since GFLOPS is now a TestResult-level metric rather than per-case, + // this method returns the fastest result as a proxy for highest performance + return get_fastest_result(); +} + +// Default FLOP calculation function (assumes 1 FLOP per element) +int64_t default_flop_calculator(const TestCase& test_case) { + // Calculate total elements from the first input tensor + int64_t total_elements = 1; + if (!test_case.empty() && test_case.num_inputs() > 0 && + test_case.inputs()[0].is_tensor()) { + const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); + for (int64_t size : sizes) { + total_elements *= size; + } + } + + // Assume 1 FLOP per element for basic operations + return total_elements; +} + +ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name) { + GraphConfig config; + config.enable_querypool = true; + ComputeGraph graph(config); + + std::vector input_values; + + // Process input ValueSpecs + for (size_t i = 0; i < test_case.num_inputs(); ++i) { + const ValueSpec& input_spec = test_case.inputs()[i]; + + if (input_spec.is_float()) { + ValueRef input_value = + graph.add_scalar(static_cast(input_spec.get_float_value())); + input_values.push_back(input_value); + } else if (input_spec.is_int()) { + ValueRef input_value = + graph.add_scalar(static_cast(input_spec.get_int_value())); + input_values.push_back(input_value); + } else if (input_spec.is_bool()) { + ValueRef input_value = graph.add_scalar(input_spec.get_bool_value()); + input_values.push_back(input_value); + } else if (input_spec.is_int_list()) { + // Convert int32_t list to int64_t list for ComputeGraph + const auto& int32_list = input_spec.get_int_list(); + std::vector int64_list; + int64_list.reserve(int32_list.size()); + for (int32_t val : int32_list) { + int64_list.push_back(static_cast(val)); + } + ValueRef input_value = graph.add_scalar_list(std::move(int64_list)); + input_values.push_back(input_value); + } else if (input_spec.is_constant()) { + ValueRef input_value = graph.add_tensorref( + input_spec.get_tensor_sizes(), + input_spec.dtype, + input_spec.get_data_ptr()); + input_values.push_back(input_value); + } else { + IOValueRef input_io = graph.add_input_tensor( + input_spec.get_tensor_sizes(), + input_spec.dtype, + input_spec.storage_type, + input_spec.memory_layout); + input_values.push_back(input_io.value); + } + } + + std::vector output_values; + + // Process output ValueSpecs + for (size_t i = 0; i < test_case.num_outputs(); ++i) { + const ValueSpec& output_spec = test_case.outputs()[i]; + + if (!output_spec.is_tensor()) { + throw std::runtime_error("All output specifications must be tensors"); + } + + // Create output tensor + ValueRef output_value = graph.add_tensor( + output_spec.get_tensor_sizes(), + output_spec.dtype, + output_spec.storage_type, + output_spec.memory_layout); + + output_values.push_back(output_value); + } + + // Get the operator function and call it + auto opFn = VK_GET_OP_FN(op_name); + + // Create arguments vector for the operator function + std::vector op_args = input_values; + op_args.insert(op_args.end(), output_values.begin(), output_values.end()); + + opFn(graph, op_args); + + for (size_t i = 0; i < output_values.size(); ++i) { + graph.set_output_value(output_values[i]); + } + return graph; +} + +// Test execution utilities +BenchmarkResult +execute_test_case(TestCase& test_case, int warmup_runs, int benchmark_runs) { + BenchmarkResult result( + test_case.name().empty() ? "unnamed_test_case" : test_case.name()); + + // Initialize querypool if using GPU timestamps + if (use_gpu_timestamps()) { + api::context()->initialize_querypool(); + } + + // Create the compute graph for this test case using setup_compute_graph + ComputeGraph graph = + setup_compute_graph(test_case, test_case.operator_name()); + + // Prepare the graph + graph.prepare(); + graph.prepack(); + + // Copy input data into the graph's staging buffers + for (size_t i = 0; i < test_case.num_inputs(); ++i) { + const ValueSpec& input_spec = test_case.inputs()[i]; + if (input_spec.is_tensor() && i < graph.inputs().size()) { + // Skip copying data for constant tensors + if (input_spec.is_constant()) { + continue; + } + + const auto& input_ref = graph.inputs()[i]; + + // Get the appropriate data based on dtype + const void* data_ptr = nullptr; + size_t data_numel = input_spec.numel(); + + switch (input_spec.dtype) { + case vkapi::kFloat: + data_ptr = input_spec.get_float_data().data(); + break; + case vkapi::kHalf: + data_ptr = input_spec.get_half_data().data(); + break; + case vkapi::kInt: + data_ptr = input_spec.get_int32_data().data(); + break; + case vkapi::kChar: + data_ptr = input_spec.get_int8_data().data(); + break; + case vkapi::kByte: + data_ptr = input_spec.get_uint8_data().data(); + break; + default: + throw std::runtime_error("Unsupported data type for input tensor"); + } + + // Copy data into staging buffer + graph.copy_into_staging(input_ref.staging, data_ptr, data_numel); + } + } + + // Warmup runs + for (int run = 0; run < warmup_runs; ++run) { + graph.execute(); + } + + // Benchmark runs - collect individual iteration timings + float total_cpu_time_us = 0.0f; + float total_gpu_time_us = 0.0f; + + for (int run = 0; run < benchmark_runs; ++run) { + // Measure CPU time for each execute() call + auto cpu_start = std::chrono::high_resolution_clock::now(); + graph.execute(); + auto cpu_end = std::chrono::high_resolution_clock::now(); + + auto cpu_duration = std::chrono::duration_cast( + cpu_end - cpu_start); + float cpu_time_us = static_cast(cpu_duration.count()); + total_cpu_time_us += cpu_time_us; + + // Collect GPU timing using helper function + float gpu_time_us = collect_gpu_timing_us(graph); + total_gpu_time_us += gpu_time_us; + + // Add the appropriate timing based on the flag + float iter_time_us = use_gpu_timestamps() ? gpu_time_us : cpu_time_us; + result.add_iter_timing(iter_time_us); + } + + // Calculate averages for display + float avg_cpu_time_us = total_cpu_time_us / benchmark_runs; + float avg_gpu_time_us = total_gpu_time_us / benchmark_runs; + + // Print both timings if latency printing is enabled + if (print_latencies()) { + if (use_gpu_timestamps()) { + graph.context()->querypool().print_results(); + } + std::cout << " CPU timing: " << std::fixed << std::setprecision(3) + << avg_cpu_time_us << " μs" << std::endl; + std::cout << " GPU timing: " << std::fixed << std::setprecision(3) + << avg_gpu_time_us << " μs" << std::endl; + std::cout << " Using " << (use_gpu_timestamps() ? "GPU" : "CPU") + << " timing for result" << std::endl; + } + + // Copy output data from the graph's staging buffers + for (size_t i = 0; i < test_case.num_outputs(); ++i) { + ValueSpec& output_spec = test_case.outputs()[i]; + + if (output_spec.is_tensor() && i < graph.outputs().size()) { + const auto& output_ref = graph.outputs()[i]; + + // Ensure output data vector is properly sized + size_t data_numel = output_spec.numel(); + output_spec.resize_data(data_numel); + + // Get mutable data pointer for the output + void* data_ptr = output_spec.get_mutable_data_ptr(); + + if (data_ptr != nullptr) { + // Copy data from staging buffer to output spec + graph.copy_from_staging(output_ref.staging, data_ptr, data_numel); + } + + // Print output tensor data if output printing is enabled + if (print_output()) { + std::string output_name = "Output[" + std::to_string(i) + "]"; + print_valuespec_data(output_spec, output_name); + } + } + } + + return result; +} + +TestResult execute_test_cases( + std::function()> test_case_generator, + FlopCalculatorFunc flop_calculator, + const std::string& operation_name, + int warmup_runs, + int benchmark_runs, + ReferenceComputeFunc reference_compute_func) { + TestResult results(operation_name); + + // Generate all test cases + std::vector test_cases = test_case_generator(); + + std::cout << "Executing " << test_cases.size() << " test cases for " + << operation_name << std::endl; + print_separator(); + + bool any_correctness_failed = false; + float total_gflops = 0.0f; + size_t successful_cases = 0; + + for (size_t i = 0; i < test_cases.size(); ++i) { + TestCase& test_case = test_cases[i]; + + // Compute reference data if reference function is provided + bool skipped_reference_fn = true; + if (reference_compute_func) { + try { + reference_compute_func(test_case); + skipped_reference_fn = false; + } catch (const std::invalid_argument& e) { + if (debugging()) { + std::cout << "Compute reference skipped: " << e.what() << std::endl; + } + } + } + + // Execute single test case + BenchmarkResult result = + execute_test_case(test_case, warmup_runs, benchmark_runs); + + // Determine if this test case passed (has valid timing data) + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + + if (!vulkan_execute_succeeded) { + result.set_correctness_status(CorrectnessStatus::FAIL); + } else if (skipped_reference_fn) { + result.set_correctness_status(CorrectnessStatus::SKIPPED); + } else { + // Reference function provided and succeeded - validate outputs + bool correctness_passed = true; + + for (size_t output_idx = 0; output_idx < test_case.num_outputs(); + ++output_idx) { + const ValueSpec& output_spec = test_case.outputs()[output_idx]; + + if (!output_spec.validate_against_reference()) { + correctness_passed = false; + std::cout << " Correctness validation FAILED for test " + << result.get_kernel_name() << std::endl; + print_valuespec_data(output_spec, "vulkan output"); + print_valuespec_data(output_spec, "ref output", true); + + throw std::runtime_error("Correctness validation failed"); + } + } + + if (correctness_passed) { + result.set_correctness_status(CorrectnessStatus::PASSED); + } else { + any_correctness_failed = true; + result.set_correctness_status(CorrectnessStatus::FAIL); + } + } + + // Calculate GFLOPS for this test case using the provided FLOP calculator + float case_gflops = 0.0f; + if (vulkan_execute_succeeded) { + // Use the provided FLOP calculator to get total FLOPs for this test case + int64_t total_flops = flop_calculator(test_case); + float flops = static_cast(total_flops); + float avg_time_us = result.get_avg_time_us(); + if (avg_time_us > 0.0f && total_flops > 0) { + case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f); + } + + total_gflops += case_gflops; + successful_cases++; + } else { + case_gflops = -1.0f; // Indicate failure + } + + // Calculate tensor info for display + std::string size_info = "["; + if (!test_case.empty() && test_case.num_inputs() > 0 && + test_case.inputs()[0].is_tensor()) { + const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); + for (size_t j = 0; j < sizes.size(); ++j) { + size_info += std::to_string(sizes[j]); + if (j < sizes.size() - 1) + size_info += "x"; + } + } + size_info += "]"; + + // Print progress using the BenchmarkResult member function + result.print_summary(i + 1, size_info, case_gflops); + + // Add result to collection + results.add_result(std::move(result)); + } + + // Set the overall results on the TestResult + results.set_correctness_passed(!any_correctness_failed); + results.set_gflops(total_gflops); + + print_separator(); + std::cout << "Completed " << results.size() << " test cases" << std::endl; + + return results; +} + +// Convenience overload that uses the default FLOP calculator +TestResult execute_test_cases( + std::function()> test_case_generator, + const std::string& operation_name, + int warmup_runs, + int benchmark_runs, + ReferenceComputeFunc reference_compute_func) { + return execute_test_cases( + test_case_generator, + default_flop_calculator, + operation_name, + warmup_runs, + benchmark_runs, + reference_compute_func); +} + +// Utility functions for printing +void print_performance_header() { + std::cout << "\n=== Compute Shader Performance Benchmark ===" << std::endl; +} + +void print_separator() { + std::cout << std::string(70, '-') << std::endl; +} + +// ValueSpec data printing utilities +void print_valuespec_data( + const ValueSpec& spec, + const std::string& name, + const bool print_ref_data, + size_t max_elements, + int precision) { + std::cout << "\n" << name << " Data:" << std::endl; + std::cout << " Type: " << spec.to_string() << std::endl; + + if (!spec.is_tensor()) { + if (spec.is_int()) { + std::cout << " Value: " << spec.get_int_value() << std::endl; + } else if (spec.is_int_list()) { + const auto& int_list = spec.get_int_list(); + std::cout << " Values: ["; + size_t print_count = std::min(max_elements, int_list.size()); + for (size_t i = 0; i < print_count; ++i) { + std::cout << int_list[i]; + if (i < print_count - 1) + std::cout << ", "; + } + if (int_list.size() > max_elements) { + std::cout << ", ... (" << (int_list.size() - max_elements) << " more)"; + } + std::cout << "]" << std::endl; + } + return; + } + + // Print tensor data + size_t total_elements = spec.numel(); + size_t print_count = std::min(max_elements, total_elements); + + std::cout << " Total elements: " << total_elements << std::endl; + std::cout << " Data (first " << print_count << " elements): ["; + + std::cout << std::fixed << std::setprecision(precision); + + switch (spec.dtype) { + case vkapi::kFloat: { + auto data = spec.get_float_data().data(); + if (print_ref_data) { + data = spec.get_ref_float_data().data(); + } + for (size_t i = 0; i < print_count; ++i) { + std::cout << data[i]; + if (i < print_count - 1) + std::cout << ", "; + } + break; + } + case vkapi::kHalf: { + const auto& data = spec.get_half_data(); + for (size_t i = 0; i < print_count; ++i) { + // Convert uint16_t back to float for display + float value = data[i] / 32767.0f; + std::cout << value; + if (i < print_count - 1) + std::cout << ", "; + } + break; + } + case vkapi::kInt: { + const auto& data = spec.get_int32_data(); + for (size_t i = 0; i < print_count; ++i) { + std::cout << data[i]; + if (i < print_count - 1) + std::cout << ", "; + } + break; + } + case vkapi::kChar: { + const auto& data = spec.get_int8_data(); + for (size_t i = 0; i < print_count; ++i) { + std::cout << static_cast(data[i]); + if (i < print_count - 1) + std::cout << ", "; + } + break; + } + case vkapi::kByte: { + const auto& data = spec.get_uint8_data(); + for (size_t i = 0; i < print_count; ++i) { + std::cout << static_cast(data[i]); + if (i < print_count - 1) + std::cout << ", "; + } + break; + } + default: + std::cout << "unsupported data type"; + break; + } + + if (total_elements > max_elements) { + std::cout << ", ... (" << (total_elements - max_elements) << " more)"; + } + std::cout << "]" << std::endl; + + // Print some statistics for tensor data + if (total_elements > 0) { + float min_val = 0.0f, max_val = 0.0f, sum = 0.0f; + bool first = true; + + for (size_t i = 0; i < total_elements; ++i) { + float val = spec.get_element(i); + if (first) { + min_val = max_val = val; + first = false; + } else { + min_val = std::min(min_val, val); + max_val = std::max(max_val, val); + } + sum += val; + } + + float mean = sum / total_elements; + std::cout << " Statistics: min=" << std::setprecision(precision) << min_val + << ", max=" << max_val << ", mean=" << mean << ", sum=" << sum + << std::endl; + } +} + +ValueRef quantized_weights_canvas( + ComputeGraph& graph, + const ValueRef weight_ref) { + const auto original_sizes = graph.sizes_of(weight_ref); + + // Get the 2 highest values of original_sizes + std::vector sorted_sizes = original_sizes; + std::sort(sorted_sizes.begin(), sorted_sizes.end(), std::greater()); + int64_t largest1 = sorted_sizes.size() > 0 ? sorted_sizes[0] : 0; + int64_t largest2 = sorted_sizes.size() > 1 ? sorted_sizes[1] : 0; + + std::vector final_sizes = {1, largest1, largest1}; + + // Debug logging if debugging flag is set + if (debugging()) { + std::cout << "Debug: Creating quantized weights canvas tensor" << std::endl; + std::cout << "Debug: Original sizes: ["; + for (size_t i = 0; i < original_sizes.size(); ++i) { + std::cout << original_sizes[i]; + if (i < original_sizes.size() - 1) + std::cout << ", "; + } + std::cout << "]" << std::endl; + std::cout << "Debug: Canvas sizes: ["; + for (size_t i = 0; i < final_sizes.size(); ++i) { + std::cout << final_sizes[i]; + if (i < final_sizes.size() - 1) + std::cout << ", "; + } + std::cout << "]" << std::endl; + } + + ValueRef packed_weight = graph.add_tensor( + final_sizes, vkapi::kInt, utils::kTexture3D, utils::kWidthPacked); + + utils::uvec3 global_wg_size{ + utils::div_up(utils::safe_downcast(largest1), uint32_t(4)), + utils::safe_downcast(largest2), + utils::safe_downcast(std::min(largest1, int64_t(2048)))}; + + std::string kernel_name = "packed_int32_canvas"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(packed_weight), + graph.create_local_wg_size(packed_weight), + weight_ref, + packed_weight, + // UBOs + {graph.logical_limits_ubo(packed_weight)}, + // Specialization constants + {}, + // Push Constants + {})); + + return packed_weight; +} + +ValueRef float_tensor_canvas(ComputeGraph& graph, const ValueRef weight_ref) { + const auto original_sizes = graph.sizes_of(weight_ref); + + // Get the 2 highest values of original_sizes + std::vector sorted_sizes = original_sizes; + std::sort(sorted_sizes.begin(), sorted_sizes.end(), std::greater()); + int64_t largest1 = sorted_sizes.size() > 0 ? sorted_sizes[0] : 0; + int64_t largest2 = sorted_sizes.size() > 1 ? sorted_sizes[1] : 0; + + std::vector final_sizes = {1, largest1, largest1}; + + // Debug logging if debugging flag is set + if (debugging()) { + std::cout << "Debug: Creating float tensor canvas" << std::endl; + std::cout << "Debug: Original sizes: ["; + for (size_t i = 0; i < original_sizes.size(); ++i) { + std::cout << original_sizes[i]; + if (i < original_sizes.size() - 1) + std::cout << ", "; + } + std::cout << "]" << std::endl; + std::cout << "Debug: Canvas sizes: ["; + for (size_t i = 0; i < final_sizes.size(); ++i) { + std::cout << final_sizes[i]; + if (i < final_sizes.size() - 1) + std::cout << ", "; + } + std::cout << "]" << std::endl; + } + + ValueRef packed_weight = graph.add_tensor( + final_sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked); + + utils::uvec3 global_wg_size{ + utils::div_up(utils::safe_downcast(largest1), uint32_t(4)), + utils::safe_downcast(largest2), + utils::safe_downcast(std::min(largest1, int64_t(2048)))}; + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR("float_canvas"), + graph.create_global_wg_size(packed_weight), + graph.create_local_wg_size(packed_weight), + weight_ref, + packed_weight, + // UBOs + {graph.logical_limits_ubo(packed_weight)}, + // Specialization constants + {}, + // Push Constants + {})); + + return packed_weight; +} + +// Compute weight sums for quantized operations (linear and convolution) +void compute_weight_sums( + ValueSpec& weight_sums, + const ValueSpec& quantized_weight, + int64_t out_features, + int64_t elements_per_output_feature) { + auto& weight_sums_data = weight_sums.get_float_data(); + auto& quantized_weight_data = quantized_weight.get_int8_data(); + + weight_sums_data.resize(out_features); + + // For each output feature, compute the sum of quantized weights + for (int64_t out_f = 0; out_f < out_features; ++out_f) { + float sum = 0.0f; + for (int64_t elem = 0; elem < elements_per_output_feature; ++elem) { + // Weight indexing depends on the layout: + // For linear: [in_features, out_features] -> elem * out_features + out_f + // For conv2d: [C_in * K_h * K_w, C_out] -> elem * out_features + out_f + int64_t weight_idx = elem * out_features + out_f; + sum += static_cast(quantized_weight_data[weight_idx]); + } + weight_sums_data[out_f] = sum; + } +} + +} // namespace prototyping +} // namespace vulkan +} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h new file mode 100644 index 00000000000..daa9eeda237 --- /dev/null +++ b/backends/vulkan/test/custom_ops/utils.h @@ -0,0 +1,593 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace vulkan { +namespace prototyping { + +using namespace vkcompute; + +// +// Global configuration options +// + +bool print_output(); +void set_print_output(bool print_output); + +bool print_latencies(); +void set_print_latencies(bool print_latencies); + +bool use_gpu_timestamps(); +void set_use_gpu_timestamps(bool use_timestamps); + +bool debugging(); +void set_debugging(bool enable_debugging); + +// +// ValueSpec class +// + +enum class SpecType { Tensor, IntList, Int, Float, Bool }; + +// Data generation types +enum class DataGenType { + FIXED, + RANDOM, + RANDOM_SCALES, + RANDINT, + RANDINT8, + RANDINT4, + ONES, + ZEROS +}; + +// Value specification struct +struct ValueSpec { + std::vector sizes; + vkapi::ScalarType dtype; + utils::GPUMemoryLayout memory_layout; + utils::StorageType storage_type; + SpecType spec_type; + DataGenType data_gen_type; + bool is_constant_tensor; + + std::vector float_data; + std::vector int32_data; + std::vector half_data; // Using uint16_t as substitute for half + std::vector int8_data; // For kChar (signed 8-bit) + std::vector uint8_data; // For kByte (unsigned 8-bit) + + std::vector ref_float_data; + std::vector ref_int32_data; + std::vector ref_half_data; + std::vector ref_int8_data; + std::vector ref_uint8_data; + + ValueSpec( + const std::vector& sizes, + vkapi::ScalarType dtype, + utils::StorageType storage_type = utils::kTexture3D, + utils::GPUMemoryLayout memory_layout = utils::kWidthPacked) + : sizes(sizes), + dtype(dtype), + memory_layout(memory_layout), + storage_type(storage_type), + spec_type(SpecType::Tensor), + data_gen_type(DataGenType::ZEROS), + is_constant_tensor(false) { + generate_tensor_data(); + } + + // Constructor for tensor with custom data generation type + ValueSpec( + const std::vector& sizes, + vkapi::ScalarType dtype, + utils::StorageType storage_type, + utils::GPUMemoryLayout memory_layout, + DataGenType data_gen_type) + : sizes(sizes), + dtype(dtype), + memory_layout(memory_layout), + storage_type(storage_type), + spec_type(SpecType::Tensor), + data_gen_type(data_gen_type), + is_constant_tensor(false) { + generate_tensor_data(); + } + + // Constructor for single int + ValueSpec(int32_t value) + : sizes({1}), + dtype(vkapi::kInt), + memory_layout(utils::kWidthPacked), + storage_type(utils::kTexture3D), + spec_type(SpecType::Int), + data_gen_type(DataGenType::FIXED), + is_constant_tensor(false) { + int32_data.push_back(value); + } + + // Constructor for single float + ValueSpec(float value) + : sizes({1}), + dtype(vkapi::kFloat), + memory_layout(utils::kWidthPacked), + storage_type(utils::kTexture3D), + spec_type(SpecType::Float), + data_gen_type(DataGenType::FIXED), + is_constant_tensor(false) { + float_data.push_back(value); + } + + // Constructor for single bool + ValueSpec(bool value) + : sizes({1}), + dtype(vkapi::kInt), + memory_layout(utils::kWidthPacked), + storage_type(utils::kTexture3D), + spec_type(SpecType::Bool), + data_gen_type(DataGenType::FIXED), + is_constant_tensor(false) { + int32_data.push_back(value ? 1 : 0); + } + + // Constructor for int list + ValueSpec(const std::vector& values) + : sizes({static_cast(values.size())}), + dtype(vkapi::kInt), + memory_layout(utils::kWidthPacked), + storage_type(utils::kTexture3D), + spec_type(SpecType::IntList), + data_gen_type(DataGenType::FIXED), + is_constant_tensor(false), + int32_data(values) {} + + // Default constructor + ValueSpec() + : dtype(vkapi::kFloat), + memory_layout(utils::kWidthPacked), + storage_type(utils::kTexture3D), + spec_type(SpecType::Tensor), + data_gen_type(DataGenType::ZEROS), + is_constant_tensor(false) {} + + int64_t numel() const; + size_t nbytes() const; + std::string to_string() const; + + bool is_tensor() const { + return spec_type == SpecType::Tensor; + } + bool is_int_list() const { + return spec_type == SpecType::IntList; + } + bool is_int() const { + return spec_type == SpecType::Int; + } + bool is_float() const { + return spec_type == SpecType::Float; + } + bool is_bool() const { + return spec_type == SpecType::Bool; + } + + int32_t get_int_value() const { + return int32_data.empty() ? 0 : int32_data[0]; + } + float get_float_value() const { + return float_data.empty() ? 0.0f : float_data[0]; + } + bool get_bool_value() const { + return int32_data.empty() ? false : (int32_data[0] != 0); + } + const std::vector& get_int_list() const { + return int32_data; + } + const std::vector& get_tensor_sizes() const { + return sizes; + } + + const std::vector& get_float_data() const { + return float_data; + } + const std::vector& get_int32_data() const { + return int32_data; + } + const std::vector& get_half_data() const { + return half_data; + } + const std::vector& get_int8_data() const { + return int8_data; + } + const std::vector& get_uint8_data() const { + return uint8_data; + } + + std::vector& get_float_data() { + return float_data; + } + std::vector& get_int32_data() { + return int32_data; + } + std::vector& get_half_data() { + return half_data; + } + std::vector& get_int8_data() { + return int8_data; + } + std::vector& get_uint8_data() { + return uint8_data; + } + + const std::vector& get_ref_float_data() const { + return ref_float_data; + } + const std::vector& get_ref_int32_data() const { + return ref_int32_data; + } + const std::vector& get_ref_half_data() const { + return ref_half_data; + } + const std::vector& get_ref_int8_data() const { + return ref_int8_data; + } + const std::vector& get_ref_uint8_data() const { + return ref_uint8_data; + } + + std::vector& get_ref_float_data() { + return ref_float_data; + } + std::vector& get_ref_int32_data() { + return ref_int32_data; + } + std::vector& get_ref_half_data() { + return ref_half_data; + } + std::vector& get_ref_int8_data() { + return ref_int8_data; + } + std::vector& get_ref_uint8_data() { + return ref_uint8_data; + } + + void resize_data(size_t new_size); + void* get_mutable_data_ptr(); + float get_element(size_t index) const; + + // Set/get constant flag + bool is_constant() const { + return is_constant_tensor; + } + void set_constant(bool is_constant) { + is_constant_tensor = is_constant; + } + + const void* get_data_ptr() const; + + // Correctness checking against reference data + // Returns true if computed data matches reference data within tolerance + // Only validates float tensors as specified in requirements + bool validate_against_reference( + float abs_tolerance = 2e-3f, + float rel_tolerance = 1e-3f) const; + + private: + void generate_tensor_data(); +}; + +// +// TestCase +// + +class TestCase { + public: + TestCase() = default; + TestCase(const std::string& name) : name_(name) {} + + void set_name(const std::string& name) { + name_ = name; + } + const std::string& name() const { + return name_; + } + + void set_operator_name(const std::string& op_name) { + operator_name_ = op_name; + } + const std::string& operator_name() const { + return operator_name_; + } + + void add_input_spec(const ValueSpec& spec) { + inputs_.push_back(spec); + } + + const std::vector& inputs() const { + return inputs_; + } + + std::vector& inputs() { + return inputs_; + } + + size_t num_inputs() const { + return inputs_.size(); + } + + void add_output_spec(const ValueSpec& spec) { + outputs_.push_back(spec); + } + + const std::vector& outputs() const { + return outputs_; + } + + std::vector& outputs() { + return outputs_; + } + + size_t num_outputs() const { + return outputs_.size(); + } + + bool empty() const { + return inputs_.empty() && outputs_.empty(); + } + void clear() { + inputs_.clear(); + outputs_.clear(); + name_.clear(); + operator_name_.clear(); + } + + private: + std::string name_; + std::string operator_name_; + std::vector inputs_; + std::vector outputs_; +}; + +// +// BenchmarkResult +// + +enum class CorrectnessStatus { + SKIPPED, // No reference function provided + PASSED, // Reference function provided and validation passed + FAIL // Reference function provided but validation failed +}; + +class BenchmarkResult { + public: + BenchmarkResult() : correctness_status_(CorrectnessStatus::SKIPPED) {} + + BenchmarkResult(const std::string& name) + : kernel_name(name), correctness_status_(CorrectnessStatus::SKIPPED) {} + + // Add timing for a single iteration + void add_iter_timing(float time_us); + + // Getters + const std::string& get_kernel_name() const { + return kernel_name; + } + float get_avg_time_us() const; + size_t get_num_iterations() const { + return iter_timings.size(); + } + const std::vector& get_iter_timings() const { + return iter_timings; + } + CorrectnessStatus get_correctness_status() const { + return correctness_status_; + } + + // Setters + void set_kernel_name(const std::string& name) { + kernel_name = name; + } + void set_correctness_status(CorrectnessStatus status) { + correctness_status_ = status; + } + + // Statistics + float get_min_time_us() const; + float get_max_time_us() const; + float get_std_dev_us() const; + + // Clear all timings + void clear_timings() { + iter_timings.clear(); + } + + // Print progress for this benchmark result + void print_summary( + int case_number, + const std::string& size_info, + float total_gflops) const; + + private: + std::string kernel_name; + std::vector + iter_timings; // Individual iteration timings in microseconds + CorrectnessStatus correctness_status_; +}; + +// Test result collection and processing +class TestResult { + public: + TestResult() : gflops_(0.0f), correctness_passed_(true) {} + TestResult(const std::string& operation_name) + : operation_name_(operation_name), + gflops_(0.0f), + correctness_passed_(true) {} + + // Add a benchmark result + void add_result(const BenchmarkResult& result); + void add_result(BenchmarkResult&& result); + + // Getters + const std::string& get_operation_name() const { + return operation_name_; + } + float get_gflops() const { + return gflops_; + } + bool get_correctness_passed() const { + return correctness_passed_; + } + size_t size() const { + return results_.size(); + } + bool empty() const { + return results_.empty(); + } + + // Setters + void set_gflops(float gflops_val) { + gflops_ = gflops_val; + } + void set_correctness_passed(bool passed) { + correctness_passed_ = passed; + } + + // Access results + const BenchmarkResult& operator[](size_t index) const { + return results_[index]; + } + BenchmarkResult& operator[](size_t index) { + return results_[index]; + } + const std::vector& get_results() const { + return results_; + } + + // Iterator support + std::vector::iterator begin() { + return results_.begin(); + } + std::vector::iterator end() { + return results_.end(); + } + std::vector::const_iterator begin() const { + return results_.begin(); + } + std::vector::const_iterator end() const { + return results_.end(); + } + + // Processing and analysis + void print_summary() const; + void print_detailed_results() const; + void print_statistics() const; + void print_brief_summary() const; + + // Get aggregate statistics + float get_total_avg_time_us() const; + float get_total_gflops() const; + size_t get_passed_count() const; + size_t get_failed_count() const; + + // Find best/worst performing results + const BenchmarkResult* get_fastest_result() const; + const BenchmarkResult* get_slowest_result() const; + const BenchmarkResult* get_highest_gflops_result() const; + + // Clear all results + void clear() { + results_.clear(); + } + + // Set operation name + void set_operation_name(const std::string& name) { + operation_name_ = name; + } + + private: + std::string operation_name_; + std::vector results_; + float gflops_; + bool correctness_passed_; +}; + +// +// Test case execution +// + +using FlopCalculatorFunc = std::function; + +// Default FLOP calculation function (assumes 1 FLOP per element) +int64_t default_flop_calculator(const TestCase& test_case); + +using ReferenceComputeFunc = std::function; + +BenchmarkResult execute_test_case( + TestCase& test_case, + int warmup_runs = 3, + int benchmark_runs = 10); + +TestResult execute_test_cases( + std::function()> test_case_generator, + FlopCalculatorFunc flop_calculator, + const std::string& operation_name = "Operation", + int warmup_runs = 3, + int benchmark_runs = 10, + ReferenceComputeFunc reference_compute_func = nullptr); + +TestResult execute_test_cases( + std::function()> test_case_generator, + const std::string& operation_name = "Operation", + int warmup_runs = 3, + int benchmark_runs = 10, + ReferenceComputeFunc reference_compute_func = nullptr); + +// +// Print utilities +// + +void print_performance_header(); +void print_separator(); + +void print_valuespec_data( + const ValueSpec& spec, + const std::string& name = "ValueSpec", + const bool print_ref_data = false, + size_t max_elements = 20, + int precision = 6); + +ValueRef quantized_weights_canvas( + ComputeGraph& graph, + const ValueRef weight_ref); + +ValueRef float_tensor_canvas(ComputeGraph& graph, const ValueRef weight_ref); + +// Compute weight sums for quantized operations (linear and convolution) +void compute_weight_sums( + ValueSpec& weight_sums, + const ValueSpec& quantized_weight, + int64_t out_features, + int64_t elements_per_output_feature); + +// Setup compute graph based on TestCase and operation name +ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name); + +} // namespace prototyping +} // namespace vulkan +} // namespace executorch