Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions examples/11_xe20_cutlass_library/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Copyright (C) 2025 Intel Corporation, All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Example 11: XE20 CUTLASS Library BF16 GEMM
# This example creates a shared library (.so) that exports CUTLASS BF16 GEMM
# functionality for use with Python via ctypes.

# Create shared library for Python integration
add_library(xe20_cutlass_library_bf16 SHARED
xe_20_cutlass_library_b16.cpp
)

# Set library properties
set_target_properties(xe20_cutlass_library_bf16 PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
VERSION 1.0
SOVERSION 1
OUTPUT_NAME "xe20_cutlass_library_bf16"
)

# Include directories
target_include_directories(xe20_cutlass_library_bf16 PRIVATE
${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR}
${CUTLASS_EXAMPLES_UTILS_DIR}
${CUTLASS_APPLICATIONS_DIR}
)

# Link libraries
target_link_libraries(xe20_cutlass_library_bf16 PRIVATE
CUTLASS
cutlass_tools_util_includes
)

# Add compile definitions
target_compile_definitions(xe20_cutlass_library_bf16 PRIVATE
CUTLASS_ENABLE_SYCL=1
SYCL_INTEL_TARGET=1
DPCPP_SYCL_TARGET=intel_gpu_bmg_g21
)

# Add Intel-specific SYCL compiler flags for XE20 optimization
if(CUTLASS_ENABLE_SYCL AND SYCL_INTEL_TARGET)
target_compile_options(xe20_cutlass_library_bf16 PRIVATE
-Xspirv-translator
-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate
)
add_onemkl_to_target(TARGET xe20_cutlass_library_bf16)
add_sycl_to_target(TARGET xe20_cutlass_library_bf16)
endif()

# Link against CUTLASS XE20 GEMM library if available
if(TARGET cutlass_gemm_xe20_gemm)
target_link_libraries(xe20_cutlass_library_bf16 PRIVATE cutlass_gemm_xe20_gemm)
endif()

# Install the shared library
install(TARGETS xe20_cutlass_library_bf16
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)

# Add to examples target
add_dependencies(cutlass_examples xe20_cutlass_library_bf16)

# Custom target for building just this library
add_custom_target(xe20_cutlass_library
DEPENDS xe20_cutlass_library_bf16
COMMENT "Building XE20 CUTLASS Library BF16 GEMM Shared Library (.so)"
)

message(STATUS "Added shared library xe20_cutlass_library_bf16 for Python integration")
225 changes: 225 additions & 0 deletions examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
/***************************************************************************************************
* Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************************************/



#include <exception>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_fill.h"
#include "cutlass/util/device_memory.h"

#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/device/gemm_universal.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
//#include "cutlass/gemm/device/gemm_sparse.h"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"


// We compile all models with -fvisibility=hidden. Any symbols that need to be
// exposed in the final shared library must be declared with PT_EXPORT to make
// them visible.
#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
#define PT_EXPORT __attribute__((__visibility__("default")))
#else
#ifdef _WIN32
#define PT_EXPORT __declspec(dllexport)
#else
#define PT_EXPORT
#endif
#endif

using namespace cute;
#define CUTLASS_CHECK(status) \
{ \
cutlass::Status error = status; \
if (error != cutlass::Status::kSuccess) { \
auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \
cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \
throw std::runtime_error(msg); \
} \
}

// Used as pass-through functor in EVT just for type casting / rounding
template <typename T>
struct identity_op {
CUTLASS_HOST_DEVICE
T operator()(T val) const { return val; }
};



using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_256, cute::_256, cute::_32>,
cute::Shape<cute::_1, cute::_1, cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;

using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
float,
cute::Shape<cute::_256, cute::_256, cute::_32>,
cute::Shape<cute::_1, cute::_1, cute::_1>,
cutlass::gemm::collective::StageCountAuto,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;

// Gemm operator cutlass3x_xe11_tensorop_gemm_bf16_128x256_16x0_tn_align2
using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop,
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue,
cutlass::gemm::PersistentScheduler>;

// Define named type
struct cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8 :
public cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base { };


using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type = cutlass::gemm::device::GemmUniversalAdapter<cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8>;

// When workspace_size is not a nullptr, populates requested workspace_size and returns.
// Otherwise, computes the Gemm kernel using the given workspace ptr.
extern "C" {
PT_EXPORT int sycl_tla_gemm_xe20_bf16(const uint16_t* X, const uint16_t* W, uint16_t* Y, const int M, const int N, const int K, const int B, const int lda, const int ldb, const int ldc, const int ldd, const int X_offset, const int W_offset, const int Y_offset, const uint8_t swizzle, size_t* workspace_size, uint8_t* workspace, sycl::queue* stream) {
try {
using ElementComputeEpilogue = cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::ElementAccumulator;
using coord_t = cutlass::gemm::GemmCoord::Index;
static cutlass::KernelHardwareInfo hw_info;
if (hw_info.sm_count == 0) {
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(0);
CUTLASS_TRACE_HOST("Query result for SM count per device: " << hw_info.sm_count);
}

// Initialize GemmUniversal3xInstance arguments using constructor
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm, // GemmUniversalMode mode
{
static_cast<coord_t>(M),
static_cast<coord_t>(N),
static_cast<coord_t>(K),
static_cast<coord_t>(B)
}, // ProblemShape problem_shape
{
(cutlass::bfloat16_t*)(X + X_offset), // ElementA const* ptr_A
cute::make_tuple(cute::Int<1>{}, int64_t(lda), int64_t(0)), // StrideA dA (column-major: stride_m=1, stride_n=lda, batch=0)
(cutlass::bfloat16_t*)(W + W_offset), // ElementB const* ptr_B
cute::make_tuple(int64_t(ldb), cute::Int<1>{}, int64_t(0)), // StrideB dB (column-major: stride_m=ldb, stride_n=1, batch=0)
}, // MainloopArguments mainloop

// see https://tinyurl.com/4rk89z48
{
{ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, // thread, typename FusionCallbacks::Arguments ( EVT ) or ThreadEpilogueOp::Params (non-EVT )
nullptr, // ElementC const* ptr_C
cute::make_tuple(int64_t(0), cute::Int<1>{}, int64_t(0)), // StrideC dC (row-major: stride_m, stride_n=1, batch=0)
(float*)(Y + Y_offset), // ElementD ptr_D (output is float, not bfloat16)
cute::make_tuple(int64_t(ldd), cute::Int<1>{}, int64_t(0)), // StrideD dD (row-major: stride_m=ldd, stride_n=1, batch=0)
}, // EpilogueArguments epilogue,
hw_info
};
arguments.scheduler.max_swizzle_size = swizzle;
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type gemm_op;
if (workspace_size) {
*workspace_size = gemm_op.get_workspace_size(arguments);
return 0;
}
// check for null pointers after workspace size, since querying workspace size doesn't require valid data pointers
#ifndef CUTLASS_BACKEND_DISABLE_CHECKS
{
auto status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
}
#endif
#ifdef CUTLASS_DEBUG_TRACE_LEVEL
#if CUTLASS_DEBUG_TRACE_LEVEL == 1
{
// Print the maximum number of active blocks per SM for the kernel if CUTLASS_DEBUG_TRACE_LEVEL == 1
// we don't need a print statement, it's happening inside the function.
gemm_op.maximum_active_blocks();
}
#endif
#endif
{
auto status = gemm_op.initialize(arguments, workspace, stream);
CUTLASS_CHECK(status);
}
{
auto status = gemm_op(stream);
CUTLASS_CHECK(status);
}
}
catch (std::exception& e) {
std::cerr << "Runtime error: " << e.what() << std::endl;
return -1;
}
catch (...) {
return -1;
}
return 0;
}
}

// configuration name: cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ if(CUTLASS_ENABLE_SYCL)
08_bmg_gemm_f8
09_bmg_grouped_gemm_f8
10_bmg_grouped_gemm_mixed_dtype
11_xe20_cutlass_library
)
add_subdirectory(${EXAMPLE})
endforeach()
Expand Down
Loading
Loading