diff --git a/examples/11_xe20_cutlass_library/CMakeLists.txt b/examples/11_xe20_cutlass_library/CMakeLists.txt new file mode 100644 index 0000000000..22a0a77daa --- /dev/null +++ b/examples/11_xe20_cutlass_library/CMakeLists.txt @@ -0,0 +1,97 @@ +# Copyright (C) 2025 Intel Corporation, All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Example 11: XE20 CUTLASS Library BF16 GEMM +# This example creates a shared library (.so) that exports CUTLASS BF16 GEMM +# functionality for use with Python via ctypes. + +# Create shared library for Python integration +add_library(xe20_cutlass_library_bf16 SHARED + xe_20_cutlass_library_b16.cpp +) + +# Set library properties +set_target_properties(xe20_cutlass_library_bf16 PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + VERSION 1.0 + SOVERSION 1 + OUTPUT_NAME "xe20_cutlass_library_bf16" +) + +# Include directories +target_include_directories(xe20_cutlass_library_bf16 PRIVATE + ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR} + ${CUTLASS_EXAMPLES_UTILS_DIR} + ${CUTLASS_APPLICATIONS_DIR} +) + +# Link libraries +target_link_libraries(xe20_cutlass_library_bf16 PRIVATE + CUTLASS + cutlass_tools_util_includes +) + +# Add compile definitions +target_compile_definitions(xe20_cutlass_library_bf16 PRIVATE + CUTLASS_ENABLE_SYCL=1 + SYCL_INTEL_TARGET=1 + DPCPP_SYCL_TARGET=intel_gpu_bmg_g21 +) + +# Add Intel-specific SYCL compiler flags for XE20 optimization +if(CUTLASS_ENABLE_SYCL AND SYCL_INTEL_TARGET) + target_compile_options(xe20_cutlass_library_bf16 PRIVATE + -Xspirv-translator + -spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate + ) + add_onemkl_to_target(TARGET xe20_cutlass_library_bf16) + add_sycl_to_target(TARGET xe20_cutlass_library_bf16) +endif() + +# Link against CUTLASS XE20 GEMM library if available +if(TARGET cutlass_gemm_xe20_gemm) + target_link_libraries(xe20_cutlass_library_bf16 PRIVATE cutlass_gemm_xe20_gemm) +endif() + +# Install the shared library +install(TARGETS xe20_cutlass_library_bf16 + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) + +# Add to examples target +add_dependencies(cutlass_examples xe20_cutlass_library_bf16) + +# Custom target for building just this library +add_custom_target(xe20_cutlass_library + DEPENDS xe20_cutlass_library_bf16 + COMMENT "Building XE20 CUTLASS Library BF16 GEMM Shared Library (.so)" +) + +message(STATUS "Added shared library xe20_cutlass_library_bf16 for Python integration") \ No newline at end of file diff --git a/examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp b/examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp new file mode 100644 index 0000000000..812af797d7 --- /dev/null +++ b/examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp @@ -0,0 +1,225 @@ +/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************************************/ + + + +#include +#include +#include +#include +#include + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/device_memory.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +//#include "cutlass/gemm/device/gemm_sparse.h" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" + + +// We compile all models with -fvisibility=hidden. Any symbols that need to be +// exposed in the final shared library must be declared with PT_EXPORT to make +// them visible. +#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++) +#define PT_EXPORT __attribute__((__visibility__("default"))) +#else +#ifdef _WIN32 +#define PT_EXPORT __declspec(dllexport) +#else +#define PT_EXPORT +#endif +#endif + +using namespace cute; +#define CUTLASS_CHECK(status) \ +{ \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \ + throw std::runtime_error(msg); \ + } \ +} + +// Used as pass-through functor in EVT just for type casting / rounding +template +struct identity_op { + CUTLASS_HOST_DEVICE + T operator()(T val) const { return val; } +}; + + + +using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp, + cute::Shape, + cute::Shape, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + float, cutlass::layout::RowMajor, 4, + float, cutlass::layout::RowMajor, 4, + cutlass::epilogue::collective::EpilogueScheduleAuto, + cutlass::epilogue::fusion::LinearCombination< + float, + float, + float, + float + > + >::CollectiveOp; + +using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8, + float, + cute::Shape, + cute::Shape, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + +// Gemm operator cutlass3x_xe11_tensorop_gemm_bf16_128x256_16x0_tn_align2 +using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base = cutlass::gemm::kernel::GemmUniversal< + cute::Shape, + cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop, + cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue, + cutlass::gemm::PersistentScheduler>; + +// Define named type +struct cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8 : +public cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base { }; + + + using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type = cutlass::gemm::device::GemmUniversalAdapter; + +// When workspace_size is not a nullptr, populates requested workspace_size and returns. +// Otherwise, computes the Gemm kernel using the given workspace ptr. +extern "C" { +PT_EXPORT int sycl_tla_gemm_xe20_bf16(const uint16_t* X, const uint16_t* W, uint16_t* Y, const int M, const int N, const int K, const int B, const int lda, const int ldb, const int ldc, const int ldd, const int X_offset, const int W_offset, const int Y_offset, const uint8_t swizzle, size_t* workspace_size, uint8_t* workspace, sycl::queue* stream) { + try { + using ElementComputeEpilogue = cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::ElementAccumulator; + using coord_t = cutlass::gemm::GemmCoord::Index; + static cutlass::KernelHardwareInfo hw_info; + if (hw_info.sm_count == 0) { + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(0); + CUTLASS_TRACE_HOST("Query result for SM count per device: " << hw_info.sm_count); + } + + // Initialize GemmUniversal3xInstance arguments using constructor + cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, // GemmUniversalMode mode + { + static_cast(M), + static_cast(N), + static_cast(K), + static_cast(B) + }, // ProblemShape problem_shape + { + (cutlass::bfloat16_t*)(X + X_offset), // ElementA const* ptr_A + cute::make_tuple(cute::Int<1>{}, int64_t(lda), int64_t(0)), // StrideA dA (column-major: stride_m=1, stride_n=lda, batch=0) + (cutlass::bfloat16_t*)(W + W_offset), // ElementB const* ptr_B + cute::make_tuple(int64_t(ldb), cute::Int<1>{}, int64_t(0)), // StrideB dB (column-major: stride_m=ldb, stride_n=1, batch=0) + }, // MainloopArguments mainloop + + // see https://tinyurl.com/4rk89z48 + { + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, // thread, typename FusionCallbacks::Arguments ( EVT ) or ThreadEpilogueOp::Params (non-EVT ) + nullptr, // ElementC const* ptr_C + cute::make_tuple(int64_t(0), cute::Int<1>{}, int64_t(0)), // StrideC dC (row-major: stride_m, stride_n=1, batch=0) + (float*)(Y + Y_offset), // ElementD ptr_D (output is float, not bfloat16) + cute::make_tuple(int64_t(ldd), cute::Int<1>{}, int64_t(0)), // StrideD dD (row-major: stride_m=ldd, stride_n=1, batch=0) + }, // EpilogueArguments epilogue, + hw_info + }; + arguments.scheduler.max_swizzle_size = swizzle; + cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type gemm_op; + if (workspace_size) { + *workspace_size = gemm_op.get_workspace_size(arguments); + return 0; + } + // check for null pointers after workspace size, since querying workspace size doesn't require valid data pointers +#ifndef CUTLASS_BACKEND_DISABLE_CHECKS + { + auto status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + } +#endif +#ifdef CUTLASS_DEBUG_TRACE_LEVEL +#if CUTLASS_DEBUG_TRACE_LEVEL == 1 + { + // Print the maximum number of active blocks per SM for the kernel if CUTLASS_DEBUG_TRACE_LEVEL == 1 + // we don't need a print statement, it's happening inside the function. + gemm_op.maximum_active_blocks(); + } +#endif +#endif + { + auto status = gemm_op.initialize(arguments, workspace, stream); + CUTLASS_CHECK(status); + } + { + auto status = gemm_op(stream); + CUTLASS_CHECK(status); + } + } + catch (std::exception& e) { + std::cerr << "Runtime error: " << e.what() << std::endl; + return -1; + } + catch (...) { + return -1; + } + return 0; +} +} + +// configuration name: cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8 \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d141f5b7de..9d51802ecf 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -111,6 +111,7 @@ if(CUTLASS_ENABLE_SYCL) 08_bmg_gemm_f8 09_bmg_grouped_gemm_f8 10_bmg_grouped_gemm_mixed_dtype + 11_xe20_cutlass_library ) add_subdirectory(${EXAMPLE}) endforeach() diff --git a/examples/python/cutlass_library/xe20_gemm_bf16.py b/examples/python/cutlass_library/xe20_gemm_bf16.py new file mode 100644 index 0000000000..93205b69ef --- /dev/null +++ b/examples/python/cutlass_library/xe20_gemm_bf16.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (C) 2025 Intel Corporation, All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +############################################################################### + +""" +Test the generated CUTLASS GEMM kernel (sycl_tla_gemm_xe20_bf16) +""" + +import ctypes +from ctypes import c_void_p, c_int, c_size_t, c_uint8, c_uint16, POINTER, byref +import numpy as np +import time +from pathlib import Path + + +def test_sycl_tla_gemm_xe20_bf16(): + """Test the compiled sycl_tla_gemm_xe20_bf16 function""" + + # Load the shared library + lib_path = Path(__file__).parent / '../../../build/examples/11_xe20_cutlass_library/libxe20_cutlass_library_bf16.so' + if not lib_path.exists(): + print(f"Error: {lib_path} not found!") + print("Please build the library first: ninja xe20_cutlass_library_bf16") + return + + lib = ctypes.CDLL(str(lib_path)) + + # Define function signature + # int sycl_tla_gemm_xe20_bf16( + # const uint16_t* X, const uint16_t* W, uint16_t* Y, + # const int M, const int N, const int K, const int B, + # const int lda, const int ldb, const int ldc, const int ldd, + # const int X_offset, const int W_offset, const int Y_offset, + # const uint8_t swizzle, + # size_t* workspace_size, uint8_t* workspace, sycl::queue* stream) + lib.sycl_tla_gemm_xe20_bf16.argtypes = [ + c_void_p, # X (input A) + c_void_p, # W (input B) + c_void_p, # Y (output) + c_int, # M + c_int, # N + c_int, # K + c_int, # B (batch) + c_int, # lda + c_int, # ldb + c_int, # ldc + c_int, # ldd + c_int, # X_offset + c_int, # W_offset + c_int, # Y_offset + c_uint8, # swizzle + POINTER(c_size_t), # workspace_size + c_void_p, # workspace + c_void_p, # stream (sycl::queue*) + ] + lib.sycl_tla_gemm_xe20_bf16.restype = c_int + + print("="*80) + print("Testing sycl_tla_gemm_xe20_bf16 (BF16 256x256x32 GEMM)") + print("="*80) + + # Problem dimensions (matching the kernel tile: 256x256x32) + M = 256 + N = 256 + K = 32 + B = 1 # batch size + + print(f"\nProblem size: M={M}, N={N}, K={K}, B={B}") + print(f" A: {M} x {K} (bfloat16, column-major)") + print(f" B: {K} x {N} (bfloat16, column-major)") + print(f" C: {M} x {N} (float, row-major)") + + # Leading dimensions (column-major for inputs, row-major for output) + lda = M # column-major: leading dimension is M + ldb = K # column-major: leading dimension is K + ldc = 0 # not used (ptr_C is nullptr) + ldd = N # row-major: leading dimension is N + + print(f"\nLeading dimensions: lda={lda}, ldb={ldb}, ldd={ldd}") + + # Allocate input/output matrices + # Note: Using uint16 to represent bfloat16 in memory + X = np.random.randint(0, 100, size=(M * K), dtype=np.uint16) + W = np.random.randint(0, 100, size=(K * N), dtype=np.uint16) + Y = np.zeros(M * N, dtype=np.float32) # Output is float32 + + print(f"\nAllocated matrices:") + print(f" X: {X.nbytes} bytes") + print(f" W: {W.nbytes} bytes") + print(f" Y: {Y.nbytes} bytes") + + # Query workspace size + print("\n1. Querying workspace size...") + workspace_size = c_size_t(0) + result = lib.sycl_tla_gemm_xe20_bf16( + c_void_p(), # X (not needed for workspace query) + c_void_p(), # W + c_void_p(), # Y + M, N, K, B, + lda, ldb, ldc, ldd, + 0, 0, 0, # offsets + 1, # swizzle + byref(workspace_size), + c_void_p(), # workspace + c_void_p(), # stream (NULL = use default) + ) + + if result != 0: + print(f" ✗ Workspace query failed with code {result}") + return + + print(f" ✓ Workspace required: {workspace_size.value} bytes") + + # Allocate workspace if needed + workspace = None + workspace_ptr = c_void_p() + if workspace_size.value > 0: + workspace = np.zeros(workspace_size.value, dtype=np.uint8) + workspace_ptr = workspace.ctypes.data_as(c_void_p) + print(f" ✓ Workspace allocated") + + # Run GEMM + print("\n2. Executing GEMM...") + + X_ptr = X.ctypes.data_as(c_void_p) + W_ptr = W.ctypes.data_as(c_void_p) + Y_ptr = Y.ctypes.data_as(c_void_p) + + # Warmup run + result = lib.sycl_tla_gemm_xe20_bf16( + X_ptr, W_ptr, Y_ptr, + M, N, K, B, + lda, ldb, ldc, ldd, + 0, 0, 0, # offsets + 1, # swizzle + None, # workspace_size (None = execute mode, not query) + workspace_ptr, + c_void_p(), # stream (NULL = use default) + ) + + if result != 0: + print(f" ✗ GEMM execution failed with code {result}") + return + + print(f" ✓ Warmup run completed") + + # Benchmark + print("\n3. Benchmarking...") + num_runs = 10 + times = [] + + for i in range(num_runs): + start = time.time() + result = lib.sycl_tla_gemm_xe20_bf16( + X_ptr, W_ptr, Y_ptr, + M, N, K, B, + lda, ldb, ldc, ldd, + 0, 0, 0, + 1, + None, # workspace_size (None = execute mode) + workspace_ptr, + c_void_p(), + ) + elapsed = time.time() - start + + if result != 0: + print(f" ✗ Run {i+1} failed with code {result}") + continue + + times.append(elapsed) + + if not times: + print(" ✗ All runs failed!") + return + + # Calculate statistics + avg_time = np.mean(times) + min_time = np.min(times) + max_time = np.max(times) + std_time = np.std(times) + + # Calculate FLOPS (2*M*N*K for GEMM) + flops = 2 * M * N * K + avg_gflops = flops / avg_time / 1e9 + peak_gflops = flops / min_time / 1e9 + + print(f"\n{'='*80}") + print(f"Performance Results ({num_runs} runs)") + print(f"{'='*80}") + print(f" Average time: {avg_time*1000:.3f} ms") + print(f" Min time: {min_time*1000:.3f} ms") + print(f" Max time: {max_time*1000:.3f} ms") + print(f" Std dev: {std_time*1000:.3f} ms") + print(f"") + print(f" Average GFLOPS: {avg_gflops:.2f}") + print(f" Peak GFLOPS: {peak_gflops:.2f}") + print(f"{'='*80}") + + # Check output (basic sanity check) + non_zero = np.count_nonzero(Y) + print(f"\nOutput sanity check:") + print(f" Non-zero elements: {non_zero}/{Y.size}") + print(f" Output range: [{Y.min():.3f}, {Y.max():.3f}]") + + return avg_gflops + + +def benchmark_multiple_sizes(): + """Benchmark different problem sizes""" + + print("\n" + "="*80) + print("Benchmarking Multiple Problem Sizes") + print("="*80) + + # Test different sizes (all should be compatible with 256x256x32 tile) + sizes = [ + (256, 256, 32), + (512, 512, 32), + (256, 256, 64), + (512, 512, 64), + (1024, 1024, 32), + ] + + # Note: This would require modifying the function to accept variable sizes + # For now, the kernel is hard-coded to 256x256x32 + print("\nNote: Current kernel is optimized for 256x256x32 tile size") + print("Multi-size benchmarking would require different kernel configurations") + + +if __name__ == "__main__": + try: + gflops = test_sycl_tla_gemm_xe20_bf16() + if gflops: + print(f"\n✓ Test completed successfully!") + print(f" Average performance: {gflops:.2f} GFLOPS") + except Exception as e: + print(f"\n✗ Test failed with exception:") + print(f" {e}") + import traceback + traceback.print_exc() diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index 3e4f55c5bd..e344b2922a 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -123,6 +123,17 @@ struct IntelXe { static int const kMinComputeCapability = 0; }; +// Intel Xe architecture aliases for library generation compatibility +// Xe12 = PVC (Ponte Vecchio) +struct Xe12 : IntelXe { + static int const kIntelXeArch = 12; +}; + +// Xe20 = BMG (Battlemage) +struct Xe20 : IntelXe { + static int const kIntelXeArch = 20; +}; + struct Agnostic { static int const kMinComputeCapability = 1; }; diff --git a/include/cutlass/epilogue/collective/builders/xe_builder.inl b/include/cutlass/epilogue/collective/builders/xe_builder.inl index 809cede6f7..495244c6e2 100644 --- a/include/cutlass/epilogue/collective/builders/xe_builder.inl +++ b/include/cutlass/epilogue/collective/builders/xe_builder.inl @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -163,8 +164,8 @@ template < "Trying to use Intel pipeline on Non Intel hardware"); #endif static_assert(is_static::value); - static_assert(cute::is_any_of_v, - "ElementC needs to be one of: float, bfloat, half for the Intel pipeline"); + static_assert(cute::is_any_of_v, + "ElementC needs to be one of: float, bfloat, half, int32, or void for the Intel pipeline"); using EpilogueSchedule = std::conditional_t, IntelXeXMX16, @@ -211,4 +212,110 @@ template < CopyOpR2S_ >; }; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Xe12 (PVC) Epilogue CollectiveBuilder - forwards to IntelXe +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class TileShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class EpilogueScheduleType, + class FusionOpOrCallbacks + > +struct CollectiveBuilder< + arch::Xe12, + arch::OpClassTensorOp, + TileShape_MNK, + Shape<_1, _1, _1>, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + EpilogueScheduleType, + FusionOpOrCallbacks + > : CollectiveBuilder< + arch::IntelXe, // Forward to IntelXe + arch::OpClassTensorOp, + TileShape_MNK, + Shape<_1, _1, _1>, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + EpilogueScheduleType, + FusionOpOrCallbacks + > {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Xe20 (BMG) Epilogue CollectiveBuilder - forwards to IntelXe +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class TileShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class EpilogueScheduleType, + class FusionOpOrCallbacks + > +struct CollectiveBuilder< + arch::Xe20, + arch::OpClassTensorOp, + TileShape_MNK, + Shape<_1, _1, _1>, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + EpilogueScheduleType, + FusionOpOrCallbacks + > : CollectiveBuilder< + arch::IntelXe, // Forward to IntelXe + arch::OpClassTensorOp, + TileShape_MNK, + Shape<_1, _1, _1>, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + EpilogueScheduleType, + FusionOpOrCallbacks + > {}; + } // namespace cutlass::epilogue::collective + diff --git a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl index c2ffaa5a5f..71a2101329 100644 --- a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl +++ b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -54,32 +55,36 @@ constexpr auto get_num_atoms(T_m tile_m, T_n tile_n){ template constexpr auto select_copy_atom_16b(T_m tile_m, T_n tile_n){ + // Extract compile-time constant values from cute::Int<> types + constexpr int tile_m_val = decltype(tile_m)::value; + constexpr int tile_n_val = decltype(tile_n)::value; + #define RETURN_ATOM(WIDTH, HEIGHT, LETTER) \ return XE_2D_U16x##WIDTH##x##HEIGHT##_LD_##LETTER {}; if constexpr(is_t){ // tile_m and tile_n have swapped role in case of _T - static_assert(tile_n % 16 == 0 && "Invalid tile_m"); - if constexpr(tile_m == 8){ + static_assert(tile_n_val % 16 == 0 && "Invalid tile_m"); + if constexpr(tile_m_val == 8){ RETURN_ATOM(16, 8, T) - } else if constexpr(tile_m % 16 == 0){ + } else if constexpr(tile_m_val % 16 == 0){ RETURN_ATOM(16, 16, T) } else{ static_assert(dependent_false && "Invalid tile_n"); } } else if constexpr(is_v){ #define SELECT_HEIGHT_V(WIDTH) \ - if constexpr(tile_n == 16){ \ + if constexpr(tile_n_val == 16){ \ RETURN_ATOM(WIDTH, 16, V) \ - } else if constexpr(tile_n % 32 == 0){ \ + } else if constexpr(tile_n_val % 32 == 0){ \ RETURN_ATOM(WIDTH, 32, V) \ } else{ \ static_assert(dependent_false && "Invalid tile_n"); \ } - if constexpr(tile_m == 16){ + if constexpr(tile_m_val == 16){ SELECT_HEIGHT_V(16) - } else if constexpr(tile_m % 32 == 0){ + } else if constexpr(tile_m_val % 32 == 0){ SELECT_HEIGHT_V(32) } else{ static_assert(dependent_false && "Invalid tile_m"); @@ -87,25 +92,25 @@ constexpr auto select_copy_atom_16b(T_m tile_m, T_n tile_n){ #undef SELECT_HEIGHT_V } else{ // _N #define SELECT_WIDTH_N(HEIGHT) \ - if constexpr(tile_m == 1){ \ + if constexpr(tile_m_val == 1){ \ RETURN_ATOM(1, HEIGHT, N) \ - } else if constexpr(tile_m == 2){ \ + } else if constexpr(tile_m_val == 2){ \ RETURN_ATOM(2, HEIGHT, N) \ - } else if constexpr(tile_m == 4){ \ + } else if constexpr(tile_m_val == 4){ \ RETURN_ATOM(4, HEIGHT, N) \ - } else if constexpr(tile_m == 8){ \ + } else if constexpr(tile_m_val == 8){ \ RETURN_ATOM(8, HEIGHT, N) \ - } else if constexpr(tile_m == 16){ \ + } else if constexpr(tile_m_val == 16){ \ RETURN_ATOM(16, HEIGHT, N) \ - } else if constexpr(tile_m % 32 == 0){ \ + } else if constexpr(tile_m_val % 32 == 0){ \ RETURN_ATOM(32, HEIGHT, N) \ } else { \ static_assert(dependent_false && "Invalid tile_m"); \ } - if constexpr(tile_n == 16){ + if constexpr(tile_n_val == 16){ SELECT_WIDTH_N(16) - } else if constexpr(tile_n % 32 == 0){ + } else if constexpr(tile_n_val % 32 == 0){ SELECT_WIDTH_N(32) } else { static_assert(dependent_false && "Invalid tile_n"); @@ -130,6 +135,11 @@ PICK_MMA(bfloat16_t, float, XE_8x16x16_F32BF16BF16F32_TT); PICK_MMA(bfloat16_t, bfloat16_t, XE_8x16x16_BF16BF16BF16BF16_TT); PICK_MMA(half_t, float, XE_8x16x16_F32F16F16F32_TT); PICK_MMA(half_t, half_t, XE_8x16x16_F16F16F16F16_TT); +// FP8 types use FP16 accumulation, the conversion happens in the collective +PICK_MMA(float_e4m3_t, float, XE_8x16x16_F32F16F16F32_TT); +PICK_MMA(float_e5m2_t, float, XE_8x16x16_F32F16F16F32_TT); +// INT8 types use INT32 accumulation (note: K=32 for INT8, not K=16) +PICK_MMA(int8_t, int32_t, XE_8x16x32_S32S8S8S32_TT); #undef PICK_MMA } @@ -171,8 +181,8 @@ struct CollectiveBuilder< "Trying to use Intel pipeline on Non Intel hardware"); #endif static_assert(is_static::value); - static_assert(cute::is_any_of_v, - "Intel multi-stage pipeline requires ElementC to be of type float, bfloat or half"); + static_assert(cute::is_any_of_v, + "Intel multi-stage pipeline requires ElementC to be of type float, bfloat, half, or int32"); static constexpr bool isAtypeBig = cute::sizeof_bits_v > cute::sizeof_bits_v; using MMAType = std::conditional_t; @@ -218,6 +228,7 @@ struct CollectiveBuilder< using ElementA_ = std::conditional_t <= 8, cute::tuple, ElementA>; using ElementB_ = std::conditional_t <= 8, cute::tuple, ElementB>; + using CollectiveOp = cutlass::gemm::collective::CollectiveMma< DispatchPolicy, TileShape_MNK, @@ -236,4 +247,96 @@ struct CollectiveBuilder< TransformB >; }; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Xe12 (PVC) CollectiveBuilder - forwards to IntelXe +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class KernelScheduleType + > +struct CollectiveBuilder< + arch::Xe12, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + Shape<_1, _1, _1>, + cutlass::gemm::collective::StageCountAuto, + KernelScheduleType + > : CollectiveBuilder< + arch::IntelXe, // Forward to IntelXe + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + Shape<_1, _1, _1>, + cutlass::gemm::collective::StageCountAuto, + KernelScheduleType + > {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Xe20 (BMG) CollectiveBuilder - forwards to IntelXe +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class KernelScheduleType + > +struct CollectiveBuilder< + arch::Xe20, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + Shape<_1, _1, _1>, + cutlass::gemm::collective::StageCountAuto, + KernelScheduleType + > : CollectiveBuilder< + arch::IntelXe, // Forward to IntelXe + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + Shape<_1, _1, _1>, + cutlass::gemm::collective::StageCountAuto, + KernelScheduleType + > {}; + } + diff --git a/media/docs/python/xe_cutlass_library.md b/media/docs/python/xe_cutlass_library.md new file mode 100644 index 0000000000..b348db46d8 --- /dev/null +++ b/media/docs/python/xe_cutlass_library.md @@ -0,0 +1,192 @@ + + +# Kernel Generation and Manifest + +This is a code/kernel generation system that creates a searchable catalog of CUTLASS kernel operations, bridging build-time generation and runtime selection. + +## Architecture Overview + +**Two-Phase System:** +1. **Build Time (Python)**: `manifest.py` generates C++ initialization code +2. **Runtime (C++)**: Generated code registers operations into a searchable `Manifest` + +``` +Python Generator → C++ Files → Compiled Library → Runtime Catalog +``` + +## Key Components + +### Python Generator (`manifest.py`) + +**Responsibilities:** +- Filter kernels by GPU architecture (SM/Xe), operation type, patterns +- Group operations by kind/architecture/instruction type +- Generate C++ initialization functions and CMake files + +### Generated File Structure +``` +build/tools/library/generated/ +├── initialize_all.cpp +├── gemm/20/tensorop/cutlass3x_xe20_tensorop_gemm_bf16_*.cpp +└── manifest.cmake +``` + +### Architecture Naming +| GPU | Prefix | ID | Example | +|-----|--------|----|---------| +| CUDA | `sm` | 70-90 | `sm80` | +| Intel Xe | `xe` | 12,20 | `xe20` | + +## Runtime API + +### Core Classes + +```cpp +// Manifest: Operation catalog +class Manifest { + Status initialize(); + void append(Operation *op); + OperationVector const& operations() const; +}; + +// Operation: Base kernel interface +class Operation { + virtual Status can_implement(void const *config, void const *args) const = 0; + virtual Status run(void const *args, void *workspace, Stream stream) const = 0; +}; +``` + +### Initialization Hierarchy +```cpp +namespace cutlass::library { + void initialize_all(Manifest &manifest); // All operations + void initialize_all_gemm_operations(Manifest &manifest); // GEMM only + void initialize_all_xe20_gemm_operations(Manifest &manifest); // XE20 GEMM +} +``` + +## Usage Examples + +### Basic Usage +```cpp +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +cutlass::library::Manifest manifest; +cutlass::library::initialize_all(manifest); + +// Find BF16 GEMM +for (auto& op : manifest.operations()) { + if (op->description().name.find("bf16") != std::string::npos) { + // Use operation... + } +} +``` + +### Python Integration +```python +# Use extern "C" wrappers for ctypes integration +from ctypes import CDLL +lib = CDLL("libcutlass_gemm_xe20_gemm.so") +# Call exported C functions that wrap C++ manifest APIs +``` + +**Example Implementation:** See `examples/11_xe20_cutlass_library/` for a complete CMake-based shared library that exports CUTLASS kernels for Python usage via ctypes. + +## Common Patterns + +### Lazy Initialization +```cpp +class LazyManifest { + cutlass::library::Manifest manifest_; + bool initialized_ = false; +public: + cutlass::library::Manifest& get() { + if (!initialized_) { + cutlass::library::initialize_all(manifest_); + initialized_ = true; + } + return manifest_; + } +}; +``` + +### Operation Caching +```cpp +class OperationCache { + std::map cache_; +public: + cutlass::library::Operation* find(const std::string& pattern) { + if (cache_.count(pattern)) return cache_[pattern]; + // Search manifest and cache result... + } +}; +``` + +## Build Integration + +### CMake Configuration +```bash +# Generate for Intel XE20 +cmake .. -DCUTLASS_LIBRARY_GENERATOR_ARCHS="20" +ninja cutlass_library +``` + +### Python Generator +```bash +python3 generator.py --operations=gemm --architectures=20 --build-dir=. +``` + +## Performance Tips + +- **Selective Initialization**: Only initialize needed operation kinds +- **Operation Caching**: Cache frequently used operations +- **Kernel Filtering**: Use build-time filtering to reduce library size +- **Lazy Loading**: Initialize manifest only when needed + +## Debugging + +```bash +# List generated operations +nm -D libcutlass_gemm_xe20_gemm.so | grep initialize + +# Enable Python debug logging +python3 -c "import logging; logging.basicConfig(level=logging.DEBUG)" +``` + +## References + +- **Source**: `python/cutlass_library/manifest.py` +- **Headers**: `tools/library/include/cutlass/library/` +- **Generated**: `build/tools/library/generated/` +- **Examples**: + - `examples/11_xe20_cutlass_library/` - CMake-based shared library for Python integration + - `examples/python/cutlass_library/xe20_gemm_bf16.py` - Python test script using ctypes diff --git a/media/docs/python/xe_library_generation.md b/media/docs/python/xe_library_generation.md new file mode 100644 index 0000000000..63c22c088f --- /dev/null +++ b/media/docs/python/xe_library_generation.md @@ -0,0 +1,225 @@ + + +# Intel SYCL*TLA Library Generation Guide + +**Complete Reference for Intel Xe GPU Architecture Support** + +--- + +## Quick Start + +```bash +# Configure for BMG (Xe2) +cd build +cmake .. -GNinja -DCUTLASS_NVCC_ARCHS="" -DCUTLASS_ENABLE_SYCL=ON -DSYCL_INTEL_TARGET -DCUTLASS_LIBRARY_GENERATOR_ARCHS="20" + +# Build libraries +ninja cutlass_library + +# Test generation +cd python/cutlass_library +python3 test_simple_generation.py --build-dir ./test_build --arch 20 +``` + +**Expected Output:** ✅ 24 operations, 31 .cpp files generated + +--- + +## Architecture Support + +| GPU | Arch | Compute Cap | File Ext | Arch Tag | +|-----|------|-------------|----------|----------| +| **BMG** (Xe2) | 20 | 12-50 | `.cpp` | `cutlass::arch::Xe20` | +| **PVC** (Xe-HPC) | 12 | 12-50 | `.cpp` | `cutlass::arch::Xe12` | + +**Key Differences from CUDA:** +- Architecture prefix: `xe` (not `sm`) +- File extension: `.cpp` (not `.cu`) +- Compute capability: 12-50 (vs 50-120 for CUDA) + +--- + +## Supported Kernel Types + +### ✅ Homogeneous Types (A == B) + +| Type | A × B → C/D | Math Inst | Tile | Align | Status | +|------|-------------|-----------|------|-------|--------| +| **FP16** | half × half → float | [8,16,16] | 256×256×32 | 8 | ✅ | +| **BF16** | bf16 × bf16 → float | [8,16,16] | 256×256×32 | 8 | ✅ | +| **FP8-E4M3** | e4m3 × e4m3 → float | [8,16,32] | 256×256×64 | 16 | ✅ | +| **FP8-E5M2** | e5m2 × e5m2 → float | [8,16,32] | 256×256×64 | 16 | ✅ | +| **INT8** | int8 × int8 → int32 | [8,16,32] | 256×256×64 | 16 | ✅ | + +**Layout Combinations:** RR, RC, CR, CC (4 variants per type) + +### ❌ Mixed Precision (A ≠ B) + +Mixed precision infrastructure is not supported now: +- FP16 × E4M3/E5M2 → FP32 +- BF16 × E4M3/E5M2 → FP32 +- FP16 × INT4 → FP32 + +--- + +## Generated Libraries + +```bash +$ ls -lh build/tools/library/libcutlass*.so +-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_bf16.so # BF16 kernels +-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_e4m3.so # FP8 E4M3 +-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_e5m2.so # FP8 E5M2 +-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_f16.so # FP16 kernels +-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_s8.so # INT8 kernels +-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm.so # Generic +-rwxrwxr-x 19K libcutlass.so # Base library +``` + +### Kernel Naming Convention + +``` +cutlass3x_xe{arch}_{opclass}_{operation}_{dtype}_{tile}_{warp}_{layout}_align{N} +``` + +**Examples:** +```cpp +cutlass3x_xe20_tensorop_gemm_f16_256x256_32x0_nn_align8 // FP16, Row×Row +cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nt_align8 // BF16, Row×Column +cutlass3x_xe20_tensorop_gemm_e4m3_256x256_64x0_tn_align16 // E4M3, Column×Row +``` + +**Layout Codes:** `nn`=Row×Row, `nt`=Row×Column, `tn`=Column×Row, `tt`=Column×Column + +--- + +## Build & Usage + +### CMake Configuration + +```bash +# BMG (Xe2) +cmake .. -GNinja -DCUTLASS_ENABLE_SYCL=ON -DCUTLASS_LIBRARY_GENERATOR_ARCHS="20" + +# PVC (Xe-HPC) +cmake .. -GNinja -DCUTLASS_ENABLE_SYCL=ON -DCUTLASS_LIBRARY_GENERATOR_ARCHS="12" +``` + +### Build Targets + +```bash +ninja cutlass_library # All libraries +ninja cutlass_library_gemm_xe20_gemm_bf16 # BF16 only +ninja cutlass_library_gemm_xe20_gemm_f16 # FP16 only +``` + +### Python Generator (Direct) + +```bash +cd build +python3 ../python/cutlass_library/generator.py --operations=gemm --architectures=20 --build-dir=. +``` + +### Python Integration Example + +For Python integration via ctypes, see: +- **`examples/11_xe20_cutlass_library/`** - Complete CMake-based shared library example +- **`examples/python/cutlass_library/xe20_gemm_bf16.py`** - Python test script using ctypes + +**Build and test:** +```bash +# Build the shared library +ninja xe20_cutlass_library_bf16 + +# Test with Python +cd examples/python/cutlass_library +python3 xe20_gemm_bf16.py +``` + +## Troubleshooting + +### No Operations Generated +**Check:** `GenerateIntelXe()` called for arch in [12, 20] in `generator.py` + +### Library Link Errors +``` +undefined reference to `initialize_all_xe20_gemm_bf16_gemm_operations()` +``` +**Solution:** Build and link the specific library: `-lcutlass_gemm_xe20_gemm_bf16` + +## Summary + +### ✅ What Works +- **5 data type libraries** (FP16, BF16, E4M3, E5M2, INT8) +- **~24 operations, 31 .cpp files** generated +- **Homogeneous type kernels** compile cleanly +- **INT32 accumulator** for INT8 +- **FP8→FP16 conversion** in MMA + +### ❌ Limitations +- **Mixed precision** requires grouped GEMM +- **Regular library** only supports ElementA == ElementB +- **No INT4** in regular GEMM + +### 📊 Quick Reference +| Feature | Value | +|---------|-------| +| Arch Numbers | BMG=20, PVC=12 | +| File Ext | `.cpp` | +| Arch Prefix | `xe` | +| CC Range | 12-50 | +| Total Libraries | 7 | +| Total Kernels | ~24 | +| Supported Types | FP16, BF16, E4M3, E5M2, INT8 | + +## Examples and References + +### Practical Examples +- **`examples/11_xe20_cutlass_library/`** - CMake-based shared library for Python integration + - Exports `sycl_tla_gemm_xe20_bf16()` function via extern "C" + - Builds `libxe20_cutlass_library_bf16.so` with proper CMake integration + - Integrated into main examples build system (`ninja cutlass_examples`) + +- **`examples/python/cutlass_library/xe20_gemm_bf16.py`** - Python ctypes integration + - Complete test script using the shared library + - Demonstrates workspace querying, execution, and benchmarking + - Shows proper error handling and performance measurement + +### Build Integration +```bash +# Build the example library +ninja xe20_cutlass_library_bf16 + +# Run Python test +cd examples/python/cutlass_library +python3 xe20_gemm_bf16.py +``` + +--- diff --git a/python/cutlass_library/arch_constants.py b/python/cutlass_library/arch_constants.py new file mode 100644 index 0000000000..36e14a7773 --- /dev/null +++ b/python/cutlass_library/arch_constants.py @@ -0,0 +1,47 @@ +################################################################################################# +# +# Copyright (C) 2025 Intel Corporation, All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +""" +Architecture range constants for CUTLASS library generation. +Shared across manifest.py and gemm_operation.py to avoid circular imports. +""" + +################################################################################################### +# Architecture range constants +# Intel Xe architectures use the range [INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX) +# CUDA architectures use values >= CUDA_ARCH_MIN +################################################################################################### +INTEL_XE_ARCH_MIN = 12 # Minimum Intel Xe architecture (PVC = 12, BMG = 20) +INTEL_XE_ARCH_MAX = 50 # Upper bound (exclusive) for Intel Xe range +CUDA_ARCH_MIN = 50 # Minimum CUDA architecture (sm_50, sm_60, etc.) + +################################################################################################### diff --git a/python/cutlass_library/gemm_operation.py b/python/cutlass_library/gemm_operation.py index 6dc9a0456b..d7ce9cb5dd 100644 --- a/python/cutlass_library/gemm_operation.py +++ b/python/cutlass_library/gemm_operation.py @@ -1,6 +1,7 @@ - +################################################################################################# # # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (C) 2025 Intel Corporation, All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -47,8 +48,10 @@ if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True: raise ImportError("Disabling attempt to import cutlass_library") from cutlass_library.library import * + from cutlass_library.arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN except ImportError: from library import * + from arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN _LOGGER = logging.getLogger(__name__) @@ -87,7 +90,8 @@ def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, self.B = B self.C = C self.D = D - self.is_xe = self.arch == 11 + # Intel Xe architectures: PVC (12), BMG/Xe2 (20), etc. + self.is_xe = self.arch >= INTEL_XE_ARCH_MIN and self.arch < INTEL_XE_ARCH_MAX if is_block_scaled(gemm_kind): self.ScaleFactorA = ScaleFactorA @@ -388,6 +392,7 @@ def _procedural_name(self): l = self.layout_name(), a = str(max(self.A.alignment, self.B.alignment))) else: + # Intel Xe architectures use xe{cc} naming (e.g., xe20 for BMG, xe12 for PVC) threadblock = self.tile_description.procedural_name() return "cutlass{p}_xe{ar}_{op}_{ex}_{tb}_{l}_align{a}".format( p = self.prefix, @@ -1156,9 +1161,11 @@ def emit(self, operation): 'blockwise_prepare_code' : blockwise_prepare_code } - # Overriding values for Intel Xe + # Overriding values for Intel Xe architectures if operation.is_xe: - values['arch'] = "cutlass::arch::IntelXe" + # Use specific compute capability for Intel Xe GPUs + # e.g., cutlass::arch::Xe20 for BMG, cutlass::arch::Xe12 for PVC + values['arch'] = "cutlass::arch::Xe%d" % operation.arch return SubstituteTemplate(self.gemm_template, values) @@ -1473,7 +1480,13 @@ def emit(self, operation): class EmitGemmConfigurationLibrary: def __init__(self, operation_path, configuration_name): self.configuration_name = configuration_name - self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/') + + # Determine file extension based on architecture + # Intel Xe architectures (12=PVC, 20=BMG) use .cpp, CUDA uses .cu + # Check if operation_path contains /12/, /20/, xe12, or xe20 + is_xe_arch = any(marker in operation_path for marker in ['/12/', '\\12\\', 'xe12', '/20/', '\\20\\', 'xe20']) + file_extension = "cpp" if is_xe_arch else "cu" + self.configuration_path = os.path.join(operation_path, "%s.%s" % (configuration_name, file_extension)).replace('\\', '/') self.instance_emitter = { GemmKind.Gemm: EmitGemmInstance, diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py index aa73fb8b13..e7e1c4bad1 100644 --- a/python/cutlass_library/generator.py +++ b/python/cutlass_library/generator.py @@ -1,6 +1,7 @@ ################################################################################################# # # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (C) 2025 Intel Corporation, All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -11769,7 +11770,292 @@ def GeneratePVC_TensorOp_16b_gemm(manifest, cuda_version): CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent]) def GeneratePVC(manifest, cuda_version): - GeneratePVC_TensorOp_16b_gemm(manifest, cuda_version) + """ + Generate CUTLASS kernels for PVC (Ponte Vecchio) architecture. + + PVC is Intel's Xe-HPC GPU architecture with compute capability 12. + + This is a legacy wrapper that calls GenerateIntelXe with arch=12. + """ + GenerateIntelXe(manifest, cuda_version, arch=12) + +################################################################################################### +def GenerateXe_TensorOp_16b_DPAS_gemm(manifest, cuda_version, min_cc=20): + """Generate FP16/BF16 GEMM kernels for Intel Xe architecture using DPAS. + + :param min_cc: Architecture number (12 for PVC, 20 for BMG) + """ + layout_list = [ + [[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8]], + [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8]], + [[LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8]], + [[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8]], + ] + + math_instructions = [ + MathInstruction( + [8, 16, 16], + DataType.f16, DataType.f16, DataType.f32, + OpcodeClass.TensorOp, + MathOperation.multiply_add), + MathInstruction( + [8, 16, 16], + DataType.f16, DataType.f16, DataType.f16, + OpcodeClass.TensorOp, + MathOperation.multiply_add), + MathInstruction( + [8, 16, 16], + DataType.bf16, DataType.bf16, DataType.f32, + OpcodeClass.TensorOp, + MathOperation.multiply_add), + MathInstruction( + [8, 16, 16], + DataType.bf16, DataType.bf16, DataType.bf16, + OpcodeClass.TensorOp, + MathOperation.multiply_add) + ] + + max_cc = min_cc + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 256, 32], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 256, 32], + 0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([256, 128, 32], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 128, 32], + 0, [4, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([64, 128, 32], + 0, [2, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + ] + + data_type = { + "a_type": math_inst.element_a, + "b_type": math_inst.element_b, + "c_type": math_inst.element_accumulator, + "d_type": math_inst.element_accumulator, + "acc_type": math_inst.element_accumulator, + "epi_type": math_inst.element_accumulator + } + + schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]] + + CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent]) + + +def GenerateXe_TensorOp_fp8_DPAS_gemm(manifest, cuda_version, min_cc=20): + """Generate FP8 (E4M3/E5M2) GEMM kernels for Intel Xe architecture using DPAS. + + Supported combinations for regular GEMM: + - [e4m3, e4m3, fp32]: E4M3 x E4M3 -> FP32 (homogeneous) + - [e5m2, e5m2, fp32]: E5M2 x E5M2 -> FP32 (homogeneous) + + Note: Mixed precision (FP16/BF16 x FP8) requires grouped GEMM infrastructure + and is NOT supported for regular library generation. + + :param min_cc: Architecture number (12 for PVC, 20 for BMG) + """ + layout_list = [ + [[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 8]], + [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 8]], + [[LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 8]], + [[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 8]], + ] + + # FP8 math instructions for Intel Xe + # Only homogeneous types (same A and B type) for regular GEMM + math_instructions = [ + # Homogeneous FP8 (same type for A and B) - SUPPORTED + MathInstruction( + [8, 16, 32], + DataType.e4m3, DataType.e4m3, DataType.f32, # E4M3 x E4M3 -> FP32 + OpcodeClass.TensorOp, + MathOperation.multiply_add), + MathInstruction( + [8, 16, 32], + DataType.e5m2, DataType.e5m2, DataType.f32, # E5M2 x E5M2 -> FP32 + OpcodeClass.TensorOp, + MathOperation.multiply_add), + + # DISABLED: Mixed precision FP16/BF16 x FP8 requires grouped GEMM + # These would need MainloopIntelXeXMX16GroupMixedPrecision which is only + # activated when IsGroup=true (KernelXePtrArrayCooperative schedule). + # Regular library GEMMs use MainloopIntelXeXMX16 which requires ElementA == ElementB. + # + # MathInstruction([8, 16, 32], DataType.f16, DataType.e5m2, DataType.f32, ...), + # MathInstruction([8, 16, 32], DataType.f16, DataType.e4m3, DataType.f32, ...), + # MathInstruction([8, 16, 32], DataType.bf16, DataType.e5m2, DataType.f32, ...), + # MathInstruction([8, 16, 32], DataType.bf16, DataType.e4m3, DataType.f32, ...), + ] + + max_cc = min_cc + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 256, 64], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 256, 64], + 0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([256, 128, 64], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 128, 64], + 0, [4, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + ] + + data_type = { + "a_type": math_inst.element_a, + "b_type": math_inst.element_b, + "c_type": math_inst.element_accumulator, + "d_type": math_inst.element_accumulator, + "acc_type": math_inst.element_accumulator, + "epi_type": math_inst.element_accumulator + } + + schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]] + + CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent]) + +def GenerateXe_TensorOp_int8_DPAS_gemm(manifest, cuda_version, min_cc=20): + """Generate INT8 GEMM kernels for Intel Xe architecture using DPAS. + + Supported: [int8, int8, int32] -> INT32 accumulator (hardware requirement) + + :param min_cc: Architecture number (12 for PVC, 20 for BMG) + """ + layout_list = [ + [[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 4]], + [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 4]], + [[LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 4]], + [[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 4]], + ] + + # INT8 x INT8 -> INT32 (hardware requirement for Intel Xe) + math_instructions = [ + MathInstruction( + [8, 16, 32], + DataType.s8, DataType.s8, DataType.s32, # Changed from f32 to s32 + OpcodeClass.TensorOp, + MathOperation.multiply_add), + ] + + max_cc = min_cc + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 256, 64], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 256, 64], + 0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([256, 128, 64], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 128, 64], + 0, [4, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + ] + + data_type = { + "a_type": math_inst.element_a, + "b_type": math_inst.element_b, + "c_type": math_inst.element_accumulator, + "d_type": math_inst.element_accumulator, + "acc_type": math_inst.element_accumulator, + "epi_type": math_inst.element_accumulator + } + + schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]] + + CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent]) + + +def GenerateXe_TensorOp_mixed_dtype_DPAS_gemm(manifest, cuda_version, min_cc=20): + """Generate mixed-precision GEMM kernels for Intel Xe architecture using DPAS. + + Supported: [fp16, int4, fp32] -> FP16 x INT4 with FP32 accumulator + + :param min_cc: Architecture number (12 for PVC, 20 for BMG) + """ + layout_list = [ + [[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 32], [LayoutType.RowMajor, 8]], + [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 32], [LayoutType.RowMajor, 8]], + [[LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 32], [LayoutType.RowMajor, 8]], + [[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 32], [LayoutType.RowMajor, 8]], + ] + + # Mixed precision: FP16 x INT4 -> FP32 (hardware requirement for Intel Xe) + math_instructions = [ + MathInstruction( + [8, 16, 32], + DataType.f16, DataType.s4, DataType.f32, # Changed from [s8, f16, f32] to [f16, s4, f32] + OpcodeClass.TensorOp, + MathOperation.multiply_add), + ] + + max_cc = min_cc + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 256, 64], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([128, 256, 64], + 0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + TileDescription([256, 128, 64], + 0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]), + ] + + data_type = { + "a_type": math_inst.element_a, + "b_type": math_inst.element_b, + "c_type": math_inst.element_accumulator, + "d_type": math_inst.element_accumulator, + "acc_type": math_inst.element_accumulator, + "epi_type": math_inst.element_accumulator + } + + schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]] + + CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent]) + + +def GenerateBMG(manifest, cuda_version): + """ + Generate CUTLASS kernels for BMG (Battlemage/Xe2) architecture. + + BMG is Intel's Xe2 GPU architecture with compute capability 20. + Supports DPAS operations with FP16, BF16, FP8, and INT8 data types. + + This is a legacy wrapper that calls GenerateIntelXe with arch=20. + """ + GenerateIntelXe(manifest, cuda_version, arch=20) + +def GenerateIntelXe(manifest, cuda_version, arch=20): + """ + Unified generator for Intel Xe GPU architectures. + + Supports both PVC (arch 12) and BMG (arch 20) with the same generation code. + The operations are identical, only the architecture number differs. + + Supported data types: + - FP16/BF16: [fp16/bf16, fp16/bf16, fp32] + - INT8: [int8, int8, int32] + - FP8: [fp8, fp8, fp32] (E4M3 or E5M2, same types only) + - Mixed: [fp16, int4, fp32] + + :param manifest: Manifest object to add operations to + :param cuda_version: CUDA version string (used for compatibility) + :param arch: Architecture number (12 for PVC, 20 for BMG) + """ + if arch not in [12, 20]: + raise ValueError(f"Unsupported Intel Xe architecture: {arch}. Supported: 12 (PVC), 20 (BMG)") + + # All Intel Xe architectures use the same generation functions + # Only the min_cc (architecture number) differs + GenerateXe_TensorOp_16b_DPAS_gemm(manifest, cuda_version, min_cc=arch) + GenerateXe_TensorOp_fp8_DPAS_gemm(manifest, cuda_version, min_cc=arch) + GenerateXe_TensorOp_int8_DPAS_gemm(manifest, cuda_version, min_cc=arch) + # DISABLED: Mixed precision (FP16 x INT4) requires grouped GEMM infrastructure + # Regular library generation uses MainloopIntelXeXMX16 which requires ElementA == ElementB + # GenerateXe_TensorOp_mixed_dtype_DPAS_gemm(manifest, cuda_version, min_cc=arch) ################################################################################################### @@ -11865,6 +12151,21 @@ def define_parser(): GenerateSM100(manifest, args.cuda_version) GenerateSM120(manifest, args.cuda_version) + # Intel Xe GPU architectures - unified handling for PVC and BMG + # Both architectures share the same generation code, just different arch numbers + + # Check for BMG (architecture 20) + bmg_arch_list = ["20", "bmg", "xe2", "intel_gpu_bmg_g21"] + bmg_enabled_arch = any(arch.lower() in [x.lower() for x in bmg_arch_list] for arch in archs) + if bmg_enabled_arch: + GenerateIntelXe(manifest, args.cuda_version, arch=20) + + # Check for PVC (architecture 12) + pvc_arch_list = ["12", "pvc", "intel_gpu_pvc"] + pvc_enabled_arch = any(arch.lower() in [x.lower() for x in pvc_arch_list] for arch in archs) + if pvc_enabled_arch: + GenerateIntelXe(manifest, args.cuda_version, arch=12) + if 'library' in args.generator_target.split(','): manifest.emit(GeneratorTarget.Library) diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py index baaaac28a8..e0ddf91d43 100644 --- a/python/cutlass_library/manifest.py +++ b/python/cutlass_library/manifest.py @@ -1,6 +1,7 @@ ################################################################################################# # # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (C) 2025 Intel Corporation, All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -65,6 +66,16 @@ ################################################################################################### _LOGGER = logging.getLogger(__name__) +################################################################################################### +# Import architecture range constants from shared module +################################################################################################### +try: + from cutlass_library.arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN +except ImportError: + from arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN + +################################################################################################### + class EmitOperationKindAll: """ @@ -136,7 +147,27 @@ def __enter__(self): str(self.operation_path)); os.makedirs(self.operation_path, exist_ok=True) - self.top_level_path = os.path.join(self.operation_path, f"all_{OperationKindNames[self.kind]}_operations.cu") + # Determine file extension based on architecture + # Check if any Intel Xe target is present in the architectures + file_extension = "cu" # Default to CUDA + if self.args and hasattr(self.args, 'architectures'): + archs = self.args.architectures.split(';') if len(self.args.architectures) else [] + for arch in archs: + arch_lower = arch.lower() + # Check for Intel Xe targets + if any(xe_target in arch_lower for xe_target in ['pvc', 'bmg', 'intel_gpu']): + file_extension = "cpp" + break + # Check for numeric Xe architecture in the Intel Xe range + try: + arch_num = int(arch.split('a')[0].split('f')[0]) + if arch_num >= INTEL_XE_ARCH_MIN and arch_num < INTEL_XE_ARCH_MAX: + file_extension = "cpp" + break + except (ValueError, AttributeError): + pass + + self.top_level_path = os.path.join(self.operation_path, f"all_{OperationKindNames[self.kind]}_operations.{file_extension}") _LOGGER.debug(f"*** top_level_path (file to write): {str(self.top_level_path)}") self.top_level_file = open(self.top_level_path, "w") @@ -184,9 +215,10 @@ class EmitOperationKindLibrary: for min_cc=90 and OperationKind=Gemm), in the file all_sm{min_cc}_{operation_kind}_operations.cu (e.g., all_sm90_gemm_operations.cu for min_cc=90 and OperationKind=Gemm). + For Intel Xe targets, uses xe{min_cc} prefix instead of sm{min_cc}. The min_cc variable here indicates the minimum GPU architecture version that the things to be initialized require. - For example, min_cc=90 indicates sm90. + For example, min_cc=90 indicates sm90 for CUDA, min_cc=20 indicates Xe2/BMG for Intel. That file declares several functions in namespace cutlass::library. The functions all have this form, @@ -207,11 +239,23 @@ class EmitOperationKindLibrary: of what happens in each of those subdirectories. """ + @staticmethod + def get_arch_prefix(min_cc): + """Get architecture prefix based on compute capability. + Returns 'sm' for CUDA architectures, 'xe' for Intel Xe architectures. + Intel Xe: 12 (PVC), 20 (BMG) + CUDA: 50+ for CUDA architectures""" + if min_cc >= INTEL_XE_ARCH_MIN and min_cc < INTEL_XE_ARCH_MAX: + return 'xe' + else: + return 'sm' + def __init__(self, generated_path, min_cc, kind, args): self.generated_path = generated_path self.min_cc = min_cc self.kind = kind self.args = args + self.arch_prefix = self.get_arch_prefix(min_cc) self.emitters = { OperationKind.Gemm: EmitGemmConfigurationLibrary, OperationKind.Conv2d: EmitConv2dConfigurationLibrary, @@ -242,12 +286,12 @@ def __init__(self, generated_path, min_cc, kind, args): // // Entry point to construct operations // -void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest) { +void initialize_all_${arch_prefix}${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest) { """ self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" self.configuration_template = " initialize_${configuration_name}(manifest);\n" - self.subclass_call_template = " initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(manifest);\n" - self.subclass_prototype_template = "void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n" + self.subclass_call_template = " initialize_all_${arch_prefix}${min_cc}_${subclass_name}_${operation_name}_operations(manifest);\n" + self.subclass_prototype_template = "void initialize_all_${arch_prefix}${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n" self.epilogue_template ="""} /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -268,7 +312,9 @@ def __enter__(self): _LOGGER.debug(f"*** operation_path (directory to make): {str(self.operation_path)}") os.makedirs(self.operation_path) - self.top_level_path = os.path.join(self.operation_path, f"all_sm{self.min_cc}_{OperationKindNames[self.kind]}_operations.cu") + # Use .cpp extension for Intel Xe architectures, .cu for CUDA + file_extension = "cpp" if (self.min_cc >= INTEL_XE_ARCH_MIN and self.min_cc < INTEL_XE_ARCH_MAX) else "cu" + self.top_level_path = os.path.join(self.operation_path, f"all_{self.arch_prefix}{self.min_cc}_{OperationKindNames[self.kind]}_operations.{file_extension}") _LOGGER.debug(f"*** top_level_path (file to write): {str(self.top_level_path)}") self.top_level_file = open(self.top_level_path, "w") @@ -307,9 +353,11 @@ def emit(self, configuration_name, operations): self.subclass_configurations[extended_name] = [] + # Use .cpp extension for Intel Xe architectures, .cu for CUDA + file_extension = "cpp" if (self.min_cc >= INTEL_XE_ARCH_MIN and self.min_cc < INTEL_XE_ARCH_MAX) else "cu" # Open a new top-level file for this sub class subclass_top_level_path = os.path.join( - subclass_path, f"all_sm{self.min_cc}_{extended_name}_{OperationKindNames[self.kind]}_operations.cu") + subclass_path, f"all_{self.arch_prefix}{self.min_cc}_{extended_name}_{OperationKindNames[self.kind]}_operations.{file_extension}") _LOGGER.debug('*** subclass_top_level_path (min_cc, extended_name, ' + 'OperationKind): ' + str(subclass_top_level_path)) @@ -337,6 +385,7 @@ def __exit__(self, exception_type, exception_value, traceback): _LOGGER.debug("*** EmitOperationKindLibrary::__exit__") for subclass_name, subclass_file in sorted(self.subclass_files.items()): subclass_cfg = { + 'arch_prefix': self.arch_prefix, 'min_cc': str(self.min_cc), 'subclass_name': subclass_name, 'operation_name': OperationKindNames[self.kind] @@ -345,6 +394,7 @@ def __exit__(self, exception_type, exception_value, traceback): self.top_level_file.write( SubstituteTemplate(self.entry_template, { + 'arch_prefix': self.arch_prefix, 'min_cc': str(self.min_cc), 'subclass_name': '', 'operation_name': OperationKindNames[self.kind] @@ -353,6 +403,7 @@ def __exit__(self, exception_type, exception_value, traceback): # Finish and close all subclass files for subclass_name, subclass_file in sorted(self.subclass_files.items()): subclass_cfg = { + 'arch_prefix': self.arch_prefix, 'min_cc': str(self.min_cc), 'subclass_name': subclass_name, 'operation_name': OperationKindNames[self.kind] @@ -511,6 +562,7 @@ def __init__(self, args = None): self.compute_capabilities_feature_set = ['50',] self.curr_build_dir = '.' self.filter_by_cc = True + self.is_xe_target = False # Track if building for Intel Xe if self.args: self.kernel_filter = self.args.kernels @@ -518,10 +570,43 @@ def __init__(self, args = None): # A common user error is to use commas instead of semicolons. if ',' in args.architectures: - raise RuntimeError("The list of architectures (CMake option CUTLASS_NVCC_ARCHS) must be semicolon-delimited.\nDon't use commas to separate the architectures; use semicolons.\nYou specified the list as: " + args.architectures) + raise RuntimeError("The list of architectures (CMake option CUTLASS_NVCC_ARCHS or DPCPP_SYCL_TARGET) must be semicolon-delimited.\nDon't use commas to separate the architectures; use semicolons.\nYou specified the list as: " + args.architectures) self.compute_capabilities_feature_set = args.architectures.split(';') if len(args.architectures) else ['50',] - self.compute_capabilities_baseline = sorted(set(int(arch.split('a')[0].split('f')[0]) for arch in self.compute_capabilities_feature_set)) + + # Parse architecture identifiers - support both CUDA SM and Intel Xe targets + baseline_archs = [] + for arch in self.compute_capabilities_feature_set: + # Check if this is an Intel Xe target (pvc, bmg, etc.) + # Support both string names ('pvc', 'bmg') and numeric values + arch_lower = arch.lower() + is_xe_named = any(xe_target in arch_lower for xe_target in ['pvc', 'bmg', 'intel_gpu']) + + # Also check if it's a numeric Xe architecture in the Intel Xe range + try: + arch_num = int(arch.split('a')[0].split('f')[0]) + is_xe_numeric = (arch_num >= INTEL_XE_ARCH_MIN and arch_num < INTEL_XE_ARCH_MAX) + except (ValueError, AttributeError): + arch_num = None + is_xe_numeric = False + + if is_xe_named or is_xe_numeric: + self.is_xe_target = True + # Map Intel Xe architectures to numeric identifiers for compatibility + # PVC (Ponte Vecchio) -> 12 + # BMG (Battlemage/Xe2) -> 20 + if 'pvc' in arch_lower or arch_num == 12: + baseline_archs.append(12) + elif 'bmg' in arch_lower or 'xe2' in arch_lower or arch_num == 20: + baseline_archs.append(20) + else: + # Generic Intel GPU target - default to BMG + baseline_archs.append(20) + else: + # CUDA SM architecture + baseline_archs.append(arch_num if arch_num is not None else int(arch.split('a')[0].split('f')[0])) + + self.compute_capabilities_baseline = sorted(set(baseline_archs)) if args.filter_by_cc in ['false', 'False', '0']: self.filter_by_cc = False @@ -740,18 +825,24 @@ def emit_manifest_cmake(self, manifest_path, top_level_path, source_files): manifest_file.write(target_text + '\n\n') manifest_file.write(" %s\n" % str(top_level_path.replace('\\', '/'))) generated_path = os.path.join(self.curr_build_dir, 'generated') + + # Determine file extension based on whether we're targeting Intel Xe + file_extension = "cpp" if self.is_xe_target else "cu" + for kind in self.operations.keys(): kind_str = OperationKindNames[kind] - all_kind_file = os.path.join(generated_path, kind_str, f"all_{kind_str}_operations.cu").replace('\\', '/') + all_kind_file = os.path.join(generated_path, kind_str, f"all_{kind_str}_operations.{file_extension}").replace('\\', '/') manifest_file.write(f" {all_kind_file}\n") manifest_file.write(')\n\n') for kind in self.operations.keys(): for min_cc in sorted(self.operations[kind].keys()): for subclass in sorted(source_files[kind][min_cc].keys()): + # Use appropriate prefix (sm for CUDA, xe for Intel) + arch_prefix = 'xe' if (min_cc >= INTEL_XE_ARCH_MIN and min_cc < INTEL_XE_ARCH_MAX) else 'sm' target_text = SubstituteTemplate("""cutlass_add_cutlass_library( - SUFFIX ${kind}_sm${min_cc}_${subclass} -""", { 'min_cc': str(min_cc), 'kind': OperationKindNames[kind], 'subclass': subclass }) + SUFFIX ${kind}_${arch_prefix}${min_cc}_${subclass} +""", { 'arch_prefix': arch_prefix, 'min_cc': str(min_cc), 'kind': OperationKindNames[kind], 'subclass': subclass }) manifest_file.write(target_text + '\n\n') for source_file in source_files[kind][min_cc][subclass]: @@ -759,7 +850,8 @@ def emit_manifest_cmake(self, manifest_path, top_level_path, source_files): manifest_file.write(")\n") - if self.disable_full_archs_compilation: + # Only apply CUDA-specific arch compilation settings for CUDA targets + if self.disable_full_archs_compilation and min_cc < INTEL_XE_ARCH_MIN: self.emit_disable_full_archs_compilation(manifest_file, source_files) def emit_disable_full_archs_compilation(manifest_file, source_files): diff --git a/python/setup_cutlass.py b/python/setup_cutlass.py index 8122b7a6a1..bd1926e03a 100644 --- a/python/setup_cutlass.py +++ b/python/setup_cutlass.py @@ -1,6 +1,7 @@ ################################################################################################# # # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (C) 2025 Intel Corporation, All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/python/setup_library.py b/python/setup_library.py index 875ba62d55..3257eb1b99 100644 --- a/python/setup_library.py +++ b/python/setup_library.py @@ -1,6 +1,7 @@ ################################################################################################# # # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (C) 2025 Intel Corporation, All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -35,9 +36,9 @@ def perform_setup(): setup( - name='cutlass_library', + name='cutlass_library_xe', version='4.1.0', - description='CUTLASS library generation scripts', + description='SYL*TLA library generation scripts', packages=['cutlass_library'] ) diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 98e97bc5da..014ec02db5 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -83,6 +83,11 @@ target_link_libraries( ################################################################################ +function(cutlass_target_sources target) + # Wrapper function for target_sources to maintain compatibility with generated manifests + target_sources(${target} ${ARGN}) +endfunction() + function(cutlass_add_cutlass_library) # # Generates static and shared libraries with the given SOURCES. The public CMake @@ -120,6 +125,11 @@ function(cutlass_add_cutlass_library) PRIVATE cutlass_library_internal_interface ) + # Add SYCL-specific compile options when building for SYCL + if (CUTLASS_ENABLE_SYCL) + target_compile_options(${__NAME}_objs PRIVATE -fsycl) + endif() + if (CUTLASS_BUILD_MONO_LIBRARY AND __SUFFIX) # If we're only building a single monolithic library then we @@ -150,9 +160,26 @@ function(cutlass_add_cutlass_library) ${__NAME} PUBLIC cutlass_library_includes PRIVATE $ - cuda_driver ) + # Link with appropriate runtime library + if (CUTLASS_ENABLE_SYCL) + # For SYCL builds, explicitly link with libsycl.so + # We use find_library to locate it in the oneAPI installation + find_library(SYCL_LIBRARY NAMES sycl sycl8 PATHS ENV LD_LIBRARY_PATH NO_DEFAULT_PATH) + if(NOT SYCL_LIBRARY) + find_library(SYCL_LIBRARY NAMES sycl sycl8) + endif() + if(SYCL_LIBRARY) + target_link_libraries(${__NAME} PRIVATE ${SYCL_LIBRARY}) + else() + message(WARNING "libsycl.so not found - runtime may fail to load") + endif() + else() + # For CUDA builds, link with cuda_driver + target_link_libraries(${__NAME} PRIVATE cuda_driver) + endif() + set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}") cutlass_add_library( @@ -181,9 +208,20 @@ function(cutlass_add_cutlass_library) ${__NAME}_static PUBLIC cutlass_library_includes PRIVATE $ - cuda_driver ) + # Link with appropriate runtime library + if (CUTLASS_ENABLE_SYCL) + # For SYCL builds, explicitly link with libsycl.so + # Note: SYCL_LIBRARY should already be found from shared library linking above + if(SYCL_LIBRARY) + target_link_libraries(${__NAME}_static PRIVATE ${SYCL_LIBRARY}) + endif() + else() + # For CUDA builds, link with cuda_driver + target_link_libraries(${__NAME}_static PRIVATE cuda_driver) + endif() + set_target_properties(${__NAME}_static PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}") install( @@ -272,6 +310,24 @@ if (NOT CUTLASS_ENABLE_SYCL) # For backward compatibility with the old name add_library(cutlass_lib ALIAS cutlass_library) add_library(cutlass_lib_static ALIAS cutlass_library_static) + +else() + # SYCL-enabled library generation + # Create base library targets for SYCL that will be populated by generated kernels + # Note: .cu files will be compiled with SYCL compiler (icpx) for Intel Xe GPUs + + cutlass_add_cutlass_library( + src/handle.cu + src/manifest.cpp + src/operation_table.cu + src/singleton.cu + src/util.cu + ) + + # For backward compatibility with the old name + add_library(cutlass_lib ALIAS cutlass_library) + add_library(cutlass_lib_static ALIAS cutlass_library_static) + endif() ################################################################################ @@ -307,6 +363,13 @@ if(CUTLASS_LIBRARY_HEURISTICS_PROBLEMS_FILE) endif() endif() +# Set architecture parameter based on whether SYCL or CUDA is enabled +if (CUTLASS_ENABLE_SYCL) + set(CUTLASS_LIBRARY_GENERATOR_ARCHS "20" CACHE STRING "Intel Xe architectures (12=PVC, 20=BMG)") +else() + set(CUTLASS_LIBRARY_GENERATOR_ARCHS "${CUTLASS_NVCC_ARCHS_ENABLED}") +endif() + # --log-level is set to DEBUG to enable printing information about which kernels were excluded # from generation in /python/cutlass_library/manifest.py. To avoid having this information appear # in ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log, set this parameter to INFO @@ -318,7 +381,7 @@ execute_process( --build-dir ${PROJECT_BINARY_DIR} --curr-build-dir ${CMAKE_CURRENT_BINARY_DIR} --generator-target library - --architectures "${CUTLASS_NVCC_ARCHS_ENABLED}" + --architectures "${CUTLASS_LIBRARY_GENERATOR_ARCHS}" --kernels "${CUTLASS_LIBRARY_KERNELS}" --instantiation-level "${CUTLASS_LIBRARY_INSTANTIATION_LEVEL}" --ignore-kernels "${CUTLASS_LIBRARY_IGNORE_KERNELS}" @@ -341,14 +404,13 @@ endif() message(STATUS "Completed generation of library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log for more information.") -if (NOT CUTLASS_ENABLE_SYCL) - # include auto-instantiated kernels in he CUTLASS Deliverables Library - set(CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE ${CMAKE_CURRENT_BINARY_DIR}/generated/manifest.cmake) - if(EXISTS "${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}") - include(${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}) - else() - message(STATUS "auto-generated library manifest cmake file (${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}) not found.") - endif() +# Include auto-instantiated kernels in the CUTLASS Deliverables Library +# Now enabled for both CUDA and SYCL +set(CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE ${CMAKE_CURRENT_BINARY_DIR}/generated/manifest.cmake) +if(EXISTS "${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}") + include(${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}) +else() + message(STATUS "auto-generated library manifest cmake file (${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}) not found.") endif() ################################################################################ diff --git a/tools/library/include/cutlass/library/arch_mappings.h b/tools/library/include/cutlass/library/arch_mappings.h index df241e3ca6..e6e31f0f9f 100644 --- a/tools/library/include/cutlass/library/arch_mappings.h +++ b/tools/library/include/cutlass/library/arch_mappings.h @@ -1,5 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -148,6 +150,39 @@ template struct ArchMap { static int const kMax = 121; }; +// Intel Xe architecture mappings +template struct ArchMap { + static int const kMin = 12; + static int const kMax = 50; +}; + +template <> struct ArchMap { + static int const kMin = 12; + static int const kMax = 50; +}; + +// Xe12 (PVC) alias +template struct ArchMap { + static int const kMin = 12; + static int const kMax = 50; +}; + +template <> struct ArchMap { + static int const kMin = 12; + static int const kMax = 50; +}; + +// Xe20 (BMG) alias +template struct ArchMap { + static int const kMin = 20; + static int const kMax = 50; +}; + +template <> struct ArchMap { + static int const kMin = 20; + static int const kMax = 50; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace library diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index 6764d9a6d8..5564325d4f 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -52,7 +52,10 @@ #include #include #include + +#if !defined(CUTLASS_ENABLE_SYCL) #include +#endif #include "cutlass/cutlass.h" #include "cutlass/library/types.h" diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h index f537421751..efd788e22f 100644 --- a/tools/library/include/cutlass/library/util.h +++ b/tools/library/include/cutlass/library/util.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -224,21 +225,35 @@ NumericTypeID dynamic_datatype_to_id(RuntimeDatatype type); } \ } while (0) -// RAII CUDA buffer container +// RAII device buffer container (CUDA/SYCL compatible) class CudaBuffer { public: CudaBuffer() : size_(0), d_ptr_(nullptr) {} explicit CudaBuffer(size_t size) : size_(size), d_ptr_(nullptr) { +#if defined(CUTLASS_ENABLE_SYCL) + // SYCL memory allocation using malloc_device + auto q = compat::get_default_queue(); + d_ptr_ = sycl::malloc_device(size_, q); + if (d_ptr_ == nullptr) { + throw std::runtime_error("sycl::malloc_device failed"); + } +#else cudaError_t err = cudaMalloc(&d_ptr_, size_); if (err != cudaSuccess) { throw std::runtime_error("cudaMalloc failed: " + std::string(cudaGetErrorString(err))); } +#endif } ~CudaBuffer() { if (d_ptr_) { +#if defined(CUTLASS_ENABLE_SYCL) + auto q = compat::get_default_queue(); + sycl::free(d_ptr_, q); +#else cudaFree(d_ptr_); +#endif } } @@ -253,7 +268,12 @@ class CudaBuffer { CudaBuffer& operator=(CudaBuffer&& other) noexcept { if (this != &other) { if (d_ptr_) { +#if defined(CUTLASS_ENABLE_SYCL) + auto q = compat::get_default_queue(); + sycl::free(d_ptr_, q); +#else cudaFree(d_ptr_); +#endif } d_ptr_ = other.d_ptr_; size_ = other.size_; diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h index 880cb4bf34..1d87f3ecf0 100644 --- a/tools/library/src/gemm_operation.h +++ b/tools/library/src/gemm_operation.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -36,13 +37,18 @@ #include "cutlass/cutlass.h" #include "cutlass/gemm/device/gemm.h" + +#if !defined(CUTLASS_ENABLE_SYCL) +// CUDA-only kernel types - not compatible with SYCL #include "cutlass/gemm/device/gemm_sparse.h" #include "cutlass/gemm/device/gemm_complex.h" #include "cutlass/gemm/device/gemm_batched.h" #include "cutlass/gemm/device/gemm_array.h" +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" +#endif + #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/kernel/default_gemm_universal.h" -#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" #include "cutlass/library/library.h" #include "library_internal.h" diff --git a/tools/library/src/gemm_operation_3x.hpp b/tools/library/src/gemm_operation_3x.hpp index 7b27913df9..ebd555e7c8 100644 --- a/tools/library/src/gemm_operation_3x.hpp +++ b/tools/library/src/gemm_operation_3x.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -46,6 +47,7 @@ #include "cutlass/util/device_memory.h" #include "cutlass/util/reference/device/tensor_fill.h" #include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/mixed_dtype_utils.hpp" #include "cute/tensor.hpp" #include @@ -193,10 +195,16 @@ class GemmUniversal3xOperation : public GemmOperation3xBase { cute::size<2>(typename Operator::GemmKernel::ClusterShape{})); uint32_t threads_per_block = Operator::GemmKernel::MaxThreadsPerBlock; void const* kernel_ptr = (void*)(device_kernel); +#if !defined(CUTLASS_ENABLE_SYCL) + // query_device_max_active_clusters is CUDA-specific max_active_clusters = cutlass::KernelHardwareInfo::query_device_max_active_clusters( cluster_dims, threads_per_block, kernel_ptr); +#else + // For SYCL, set a default value (will be overridden if needed) + max_active_clusters = 1; +#endif } } diff --git a/tools/library/src/grouped_gemm_operation_3x.hpp b/tools/library/src/grouped_gemm_operation_3x.hpp index 91f618d4fa..1089cb0175 100644 --- a/tools/library/src/grouped_gemm_operation_3x.hpp +++ b/tools/library/src/grouped_gemm_operation_3x.hpp @@ -441,13 +441,18 @@ class GroupedGemmUniversal3xOperation : public GroupedGemmOperation3xBase); +#if !defined(CUTLASS_ENABLE_SYCL) args->max_active_clusters = cutlass::KernelHardwareInfo::query_device_max_active_clusters( cluster_dims, threads_per_block, kernel_ptr); +#else + // For SYCL, set a default value + args->max_active_clusters = 1; +#endif if (args->max_active_clusters == 0) { - std::cerr << "Max Active Clusters could not be queried. " + std::cerr << "Max Active Clusters could not be queried. " << "Falling back to heuristics mode (static cluster shape) or preferred cluster mode.\n"; } diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index e8bd77397f..a6f343be08 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -181,7 +181,11 @@ template <> struct NumericTypeMap { static NumericTypeID const kId = NumericTypeID::kTF32; }; - +// Handle cute::tuple-wrapped types (used in some collectives) +template +struct NumericTypeMap> { + static NumericTypeID const kId = NumericTypeMap::kId; +}; template <> struct NumericTypeMap { diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index b9c04de71d..d622060b83 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -43,7 +44,27 @@ namespace library { ////////////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef CUTLASS_ENABLE_SYCL +// For CUDA builds, reference operations are defined in initialize_reference_operations.cu void initialize_reference_operations(Manifest &manifest); +#else +// For SYCL builds, provide a stub implementation since reference ops are not yet supported +inline void initialize_reference_operations(Manifest &manifest) { + // Reference operations not yet implemented for SYCL + // This is a stub to allow the library to compile +} +#endif + +#ifndef CUTLASS_ENABLE_SYCL +// For CUDA builds, reduction operations are defined in init_reduction_operations.cu +// Declaration is in manifest.h +#else +// For SYCL builds, provide a stub implementation since reduction ops are not yet supported +inline void initialize_all_reduction_op(Manifest &manifest) { + // Reduction operations not yet implemented for SYCL + // This is a stub to allow the library to compile +} +#endif ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/sparse_gemm_operation_3x.hpp b/tools/library/src/sparse_gemm_operation_3x.hpp index 34da25b9a6..6cb836b89a 100644 --- a/tools/library/src/sparse_gemm_operation_3x.hpp +++ b/tools/library/src/sparse_gemm_operation_3x.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,6 +35,9 @@ #pragma once +// Sparse GEMM operations are CUDA-only (not supported in SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) + #include "cutlass/cutlass.h" #include "cutlass/detail/collective.hpp" #include "cutlass/array.h" @@ -501,4 +505,6 @@ class SparseGemmUniversal3xOperation : public GemmOperation3xBase { } // namespace cutlass::library +#endif // !defined(CUTLASS_ENABLE_SYCL) + ///////////////////////////////////////////////////////////////////////////////////////////////////