diff --git a/examples/11_xe20_cutlass_library/CMakeLists.txt b/examples/11_xe20_cutlass_library/CMakeLists.txt
new file mode 100644
index 0000000000..22a0a77daa
--- /dev/null
+++ b/examples/11_xe20_cutlass_library/CMakeLists.txt
@@ -0,0 +1,97 @@
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Example 11: XE20 CUTLASS Library BF16 GEMM
+# This example creates a shared library (.so) that exports CUTLASS BF16 GEMM
+# functionality for use with Python via ctypes.
+
+# Create shared library for Python integration
+add_library(xe20_cutlass_library_bf16 SHARED
+  xe_20_cutlass_library_b16.cpp
+)
+
+# Set library properties
+set_target_properties(xe20_cutlass_library_bf16 PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+  VERSION 1.0
+  SOVERSION 1
+  OUTPUT_NAME "xe20_cutlass_library_bf16"
+)
+
+# Include directories
+target_include_directories(xe20_cutlass_library_bf16 PRIVATE
+  ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR}
+  ${CUTLASS_EXAMPLES_UTILS_DIR}
+  ${CUTLASS_APPLICATIONS_DIR}
+)
+
+# Link libraries
+target_link_libraries(xe20_cutlass_library_bf16 PRIVATE
+  CUTLASS
+  cutlass_tools_util_includes
+)
+
+# Add compile definitions
+target_compile_definitions(xe20_cutlass_library_bf16 PRIVATE
+  CUTLASS_ENABLE_SYCL=1
+  SYCL_INTEL_TARGET=1
+  DPCPP_SYCL_TARGET=intel_gpu_bmg_g21
+)
+
+# Add Intel-specific SYCL compiler flags for XE20 optimization
+if(CUTLASS_ENABLE_SYCL AND SYCL_INTEL_TARGET)
+  target_compile_options(xe20_cutlass_library_bf16 PRIVATE
+    -Xspirv-translator
+    -spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate
+  )
+  add_onemkl_to_target(TARGET xe20_cutlass_library_bf16)
+  add_sycl_to_target(TARGET xe20_cutlass_library_bf16)
+endif()
+
+# Link against CUTLASS XE20 GEMM library if available
+if(TARGET cutlass_gemm_xe20_gemm)
+  target_link_libraries(xe20_cutlass_library_bf16 PRIVATE cutlass_gemm_xe20_gemm)
+endif()
+
+# Install the shared library
+install(TARGETS xe20_cutlass_library_bf16
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
+
+# Add to examples target
+add_dependencies(cutlass_examples xe20_cutlass_library_bf16)
+
+# Custom target for building just this library
+add_custom_target(xe20_cutlass_library 
+  DEPENDS xe20_cutlass_library_bf16
+  COMMENT "Building XE20 CUTLASS Library BF16 GEMM Shared Library (.so)"
+)
+
+message(STATUS "Added shared library xe20_cutlass_library_bf16 for Python integration")
\ No newline at end of file
diff --git a/examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp b/examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp
new file mode 100644
index 0000000000..812af797d7
--- /dev/null
+++ b/examples/11_xe20_cutlass_library/xe_20_cutlass_library_b16.cpp
@@ -0,0 +1,225 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************************************/
+
+
+
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/device_memory.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+//#include "cutlass/gemm/device/gemm_sparse.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+
+
+// We compile all models with -fvisibility=hidden. Any symbols that need to be
+// exposed in the final shared library must be declared with PT_EXPORT to make
+// them visible.
+#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
+#define PT_EXPORT __attribute__((__visibility__("default")))
+#else
+#ifdef _WIN32
+#define PT_EXPORT __declspec(dllexport)
+#else
+#define PT_EXPORT
+#endif
+#endif
+
+using namespace cute;
+#define CUTLASS_CHECK(status)                                                      \
+{                                                                                  \
+  cutlass::Status error = status;                                                  \
+  if (error != cutlass::Status::kSuccess) {                                        \
+    auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +             \
+        cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);        \
+    throw std::runtime_error(msg);                                                 \
+  }                                                                                \
+}
+
+// Used as pass-through functor in EVT just for type casting / rounding
+template <typename T>
+struct identity_op {
+  CUTLASS_HOST_DEVICE
+  T operator()(T val) const { return val; }
+};
+
+
+
+using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue =
+  typename cutlass::epilogue::collective::CollectiveBuilder<
+    cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
+    cute::Shape<cute::_256, cute::_256, cute::_32>,
+    cute::Shape<cute::_1, cute::_1, cute::_1>,
+    cutlass::epilogue::collective::EpilogueTileAuto,
+    float, float,
+    float, cutlass::layout::RowMajor, 4,
+    float, cutlass::layout::RowMajor, 4,
+    cutlass::epilogue::collective::EpilogueScheduleAuto,
+    cutlass::epilogue::fusion::LinearCombination<
+      float,
+      float,
+      float,
+      float
+    >
+  >::CollectiveOp;
+
+using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop =
+  typename cutlass::gemm::collective::CollectiveBuilder<
+    cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
+    cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
+    cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
+    float,
+    cute::Shape<cute::_256, cute::_256, cute::_32>,
+    cute::Shape<cute::_1, cute::_1, cute::_1>,
+    cutlass::gemm::collective::StageCountAuto,
+    cutlass::gemm::collective::KernelScheduleAuto
+  >::CollectiveOp;
+
+// Gemm operator cutlass3x_xe11_tensorop_gemm_bf16_128x256_16x0_tn_align2
+using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base = cutlass::gemm::kernel::GemmUniversal<
+    cute::Shape<int,int,int,int>,
+    cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop,
+    cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue,
+    cutlass::gemm::PersistentScheduler>;
+
+// Define named type
+struct cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8 :
+public cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base { };
+
+
+  using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type = cutlass::gemm::device::GemmUniversalAdapter<cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8>;
+
+// When workspace_size is not a nullptr, populates requested workspace_size and returns.
+// Otherwise, computes the Gemm kernel using the given workspace ptr.
+extern "C" {
+PT_EXPORT int sycl_tla_gemm_xe20_bf16(const uint16_t* X, const uint16_t* W, uint16_t* Y, const int M, const int N, const int K, const int B, const int lda, const int ldb, const int ldc, const int ldd, const int X_offset, const int W_offset, const int Y_offset, const uint8_t swizzle, size_t* workspace_size, uint8_t* workspace, sycl::queue* stream) {
+  try {
+  using ElementComputeEpilogue = cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::ElementAccumulator;
+  using coord_t = cutlass::gemm::GemmCoord::Index;
+  static cutlass::KernelHardwareInfo hw_info;
+  if (hw_info.sm_count == 0) {
+    hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(0);
+    CUTLASS_TRACE_HOST("Query result for SM count per device: " << hw_info.sm_count);
+  }
+
+  // Initialize GemmUniversal3xInstance arguments using constructor
+  cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::Arguments arguments{
+    cutlass::gemm::GemmUniversalMode::kGemm,  // GemmUniversalMode mode
+    {
+      static_cast<coord_t>(M),
+      static_cast<coord_t>(N),
+      static_cast<coord_t>(K),
+      static_cast<coord_t>(B)
+    }, // ProblemShape problem_shape
+    {
+      (cutlass::bfloat16_t*)(X + X_offset),  // ElementA const* ptr_A
+      cute::make_tuple(cute::Int<1>{}, int64_t(lda), int64_t(0)),  // StrideA dA (column-major: stride_m=1, stride_n=lda, batch=0)
+      (cutlass::bfloat16_t*)(W + W_offset),  // ElementB const* ptr_B
+      cute::make_tuple(int64_t(ldb), cute::Int<1>{}, int64_t(0)),  // StrideB dB (column-major: stride_m=ldb, stride_n=1, batch=0)
+    },  // MainloopArguments mainloop
+
+    // see https://tinyurl.com/4rk89z48
+    {
+      {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},  // thread, typename FusionCallbacks::Arguments ( EVT ) or ThreadEpilogueOp::Params (non-EVT )
+      nullptr,  // ElementC const* ptr_C
+      cute::make_tuple(int64_t(0), cute::Int<1>{}, int64_t(0)),  // StrideC dC (row-major: stride_m, stride_n=1, batch=0)
+      (float*)(Y + Y_offset),  // ElementD ptr_D (output is float, not bfloat16)
+      cute::make_tuple(int64_t(ldd), cute::Int<1>{}, int64_t(0)),  // StrideD dD (row-major: stride_m=ldd, stride_n=1, batch=0)
+    },  // EpilogueArguments epilogue,
+    hw_info
+  };
+  arguments.scheduler.max_swizzle_size = swizzle;
+  cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type gemm_op;
+  if (workspace_size) {
+    *workspace_size = gemm_op.get_workspace_size(arguments);
+    return 0;
+  }
+  // check for null pointers after workspace size, since querying workspace size doesn't require valid data pointers
+#ifndef CUTLASS_BACKEND_DISABLE_CHECKS
+  {
+    auto status = gemm_op.can_implement(arguments);
+    CUTLASS_CHECK(status);
+  }
+#endif
+#ifdef CUTLASS_DEBUG_TRACE_LEVEL
+#if CUTLASS_DEBUG_TRACE_LEVEL == 1
+  {
+    // Print the maximum number of active blocks per SM for the kernel if CUTLASS_DEBUG_TRACE_LEVEL == 1
+    // we don't need a print statement, it's happening inside the function.
+    gemm_op.maximum_active_blocks();
+  }
+#endif
+#endif
+  {
+    auto status = gemm_op.initialize(arguments, workspace, stream);
+    CUTLASS_CHECK(status);
+  }
+  {
+    auto status = gemm_op(stream);
+    CUTLASS_CHECK(status);
+  }
+  }
+  catch (std::exception& e) {
+    std::cerr << "Runtime error: " << e.what() << std::endl;
+    return -1;
+  }
+  catch (...) {
+    return -1;
+  }
+  return 0;
+}
+}
+
+// configuration name: cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d141f5b7de..9d51802ecf 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -111,6 +111,7 @@ if(CUTLASS_ENABLE_SYCL)
       08_bmg_gemm_f8
       09_bmg_grouped_gemm_f8
       10_bmg_grouped_gemm_mixed_dtype
+      11_xe20_cutlass_library
       )
       add_subdirectory(${EXAMPLE})
     endforeach()
diff --git a/examples/python/cutlass_library/xe20_gemm_bf16.py b/examples/python/cutlass_library/xe20_gemm_bf16.py
new file mode 100644
index 0000000000..93205b69ef
--- /dev/null
+++ b/examples/python/cutlass_library/xe20_gemm_bf16.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+"""
+Test the generated CUTLASS GEMM kernel (sycl_tla_gemm_xe20_bf16)
+"""
+
+import ctypes
+from ctypes import c_void_p, c_int, c_size_t, c_uint8, c_uint16, POINTER, byref
+import numpy as np
+import time
+from pathlib import Path
+
+
+def test_sycl_tla_gemm_xe20_bf16():
+    """Test the compiled sycl_tla_gemm_xe20_bf16 function"""
+    
+    # Load the shared library
+    lib_path = Path(__file__).parent / '../../../build/examples/11_xe20_cutlass_library/libxe20_cutlass_library_bf16.so'
+    if not lib_path.exists():
+        print(f"Error: {lib_path} not found!")
+        print("Please build the library first: ninja xe20_cutlass_library_bf16")
+        return
+    
+    lib = ctypes.CDLL(str(lib_path))
+    
+    # Define function signature
+    # int sycl_tla_gemm_xe20_bf16(
+    #   const uint16_t* X, const uint16_t* W, uint16_t* Y,
+    #   const int M, const int N, const int K, const int B,
+    #   const int lda, const int ldb, const int ldc, const int ldd,
+    #   const int X_offset, const int W_offset, const int Y_offset,
+    #   const uint8_t swizzle,
+    #   size_t* workspace_size, uint8_t* workspace, sycl::queue* stream)
+    lib.sycl_tla_gemm_xe20_bf16.argtypes = [
+        c_void_p,  # X (input A)
+        c_void_p,  # W (input B)
+        c_void_p,  # Y (output)
+        c_int,     # M
+        c_int,     # N
+        c_int,     # K
+        c_int,     # B (batch)
+        c_int,     # lda
+        c_int,     # ldb
+        c_int,     # ldc
+        c_int,     # ldd
+        c_int,     # X_offset
+        c_int,     # W_offset
+        c_int,     # Y_offset
+        c_uint8,   # swizzle
+        POINTER(c_size_t),  # workspace_size
+        c_void_p,  # workspace
+        c_void_p,  # stream (sycl::queue*)
+    ]
+    lib.sycl_tla_gemm_xe20_bf16.restype = c_int
+    
+    print("="*80)
+    print("Testing sycl_tla_gemm_xe20_bf16 (BF16 256x256x32 GEMM)")
+    print("="*80)
+    
+    # Problem dimensions (matching the kernel tile: 256x256x32)
+    M = 256
+    N = 256
+    K = 32
+    B = 1  # batch size
+    
+    print(f"\nProblem size: M={M}, N={N}, K={K}, B={B}")
+    print(f"  A: {M} x {K} (bfloat16, column-major)")
+    print(f"  B: {K} x {N} (bfloat16, column-major)")
+    print(f"  C: {M} x {N} (float, row-major)")
+    
+    # Leading dimensions (column-major for inputs, row-major for output)
+    lda = M   # column-major: leading dimension is M
+    ldb = K   # column-major: leading dimension is K
+    ldc = 0   # not used (ptr_C is nullptr)
+    ldd = N   # row-major: leading dimension is N
+    
+    print(f"\nLeading dimensions: lda={lda}, ldb={ldb}, ldd={ldd}")
+    
+    # Allocate input/output matrices
+    # Note: Using uint16 to represent bfloat16 in memory
+    X = np.random.randint(0, 100, size=(M * K), dtype=np.uint16)
+    W = np.random.randint(0, 100, size=(K * N), dtype=np.uint16)
+    Y = np.zeros(M * N, dtype=np.float32)  # Output is float32
+    
+    print(f"\nAllocated matrices:")
+    print(f"  X: {X.nbytes} bytes")
+    print(f"  W: {W.nbytes} bytes")
+    print(f"  Y: {Y.nbytes} bytes")
+    
+    # Query workspace size
+    print("\n1. Querying workspace size...")
+    workspace_size = c_size_t(0)
+    result = lib.sycl_tla_gemm_xe20_bf16(
+        c_void_p(),  # X (not needed for workspace query)
+        c_void_p(),  # W
+        c_void_p(),  # Y
+        M, N, K, B,
+        lda, ldb, ldc, ldd,
+        0, 0, 0,  # offsets
+        1,  # swizzle
+        byref(workspace_size),
+        c_void_p(),  # workspace
+        c_void_p(),  # stream (NULL = use default)
+    )
+    
+    if result != 0:
+        print(f"   ✗ Workspace query failed with code {result}")
+        return
+    
+    print(f"   ✓ Workspace required: {workspace_size.value} bytes")
+    
+    # Allocate workspace if needed
+    workspace = None
+    workspace_ptr = c_void_p()
+    if workspace_size.value > 0:
+        workspace = np.zeros(workspace_size.value, dtype=np.uint8)
+        workspace_ptr = workspace.ctypes.data_as(c_void_p)
+        print(f"   ✓ Workspace allocated")
+    
+    # Run GEMM
+    print("\n2. Executing GEMM...")
+    
+    X_ptr = X.ctypes.data_as(c_void_p)
+    W_ptr = W.ctypes.data_as(c_void_p)
+    Y_ptr = Y.ctypes.data_as(c_void_p)
+    
+    # Warmup run
+    result = lib.sycl_tla_gemm_xe20_bf16(
+        X_ptr, W_ptr, Y_ptr,
+        M, N, K, B,
+        lda, ldb, ldc, ldd,
+        0, 0, 0,  # offsets
+        1,  # swizzle
+        None,  # workspace_size (None = execute mode, not query)
+        workspace_ptr,
+        c_void_p(),  # stream (NULL = use default)
+    )
+    
+    if result != 0:
+        print(f"   ✗ GEMM execution failed with code {result}")
+        return
+    
+    print(f"   ✓ Warmup run completed")
+    
+    # Benchmark
+    print("\n3. Benchmarking...")
+    num_runs = 10
+    times = []
+    
+    for i in range(num_runs):
+        start = time.time()
+        result = lib.sycl_tla_gemm_xe20_bf16(
+            X_ptr, W_ptr, Y_ptr,
+            M, N, K, B,
+            lda, ldb, ldc, ldd,
+            0, 0, 0,
+            1,
+            None,  # workspace_size (None = execute mode)
+            workspace_ptr,
+            c_void_p(),
+        )
+        elapsed = time.time() - start
+        
+        if result != 0:
+            print(f"   ✗ Run {i+1} failed with code {result}")
+            continue
+        
+        times.append(elapsed)
+    
+    if not times:
+        print("   ✗ All runs failed!")
+        return
+    
+    # Calculate statistics
+    avg_time = np.mean(times)
+    min_time = np.min(times)
+    max_time = np.max(times)
+    std_time = np.std(times)
+    
+    # Calculate FLOPS (2*M*N*K for GEMM)
+    flops = 2 * M * N * K
+    avg_gflops = flops / avg_time / 1e9
+    peak_gflops = flops / min_time / 1e9
+    
+    print(f"\n{'='*80}")
+    print(f"Performance Results ({num_runs} runs)")
+    print(f"{'='*80}")
+    print(f"  Average time: {avg_time*1000:.3f} ms")
+    print(f"  Min time:     {min_time*1000:.3f} ms")
+    print(f"  Max time:     {max_time*1000:.3f} ms")
+    print(f"  Std dev:      {std_time*1000:.3f} ms")
+    print(f"")
+    print(f"  Average GFLOPS: {avg_gflops:.2f}")
+    print(f"  Peak GFLOPS:    {peak_gflops:.2f}")
+    print(f"{'='*80}")
+    
+    # Check output (basic sanity check)
+    non_zero = np.count_nonzero(Y)
+    print(f"\nOutput sanity check:")
+    print(f"  Non-zero elements: {non_zero}/{Y.size}")
+    print(f"  Output range: [{Y.min():.3f}, {Y.max():.3f}]")
+    
+    return avg_gflops
+
+
+def benchmark_multiple_sizes():
+    """Benchmark different problem sizes"""
+    
+    print("\n" + "="*80)
+    print("Benchmarking Multiple Problem Sizes")
+    print("="*80)
+    
+    # Test different sizes (all should be compatible with 256x256x32 tile)
+    sizes = [
+        (256, 256, 32),
+        (512, 512, 32),
+        (256, 256, 64),
+        (512, 512, 64),
+        (1024, 1024, 32),
+    ]
+    
+    # Note: This would require modifying the function to accept variable sizes
+    # For now, the kernel is hard-coded to 256x256x32
+    print("\nNote: Current kernel is optimized for 256x256x32 tile size")
+    print("Multi-size benchmarking would require different kernel configurations")
+
+
+if __name__ == "__main__":
+    try:
+        gflops = test_sycl_tla_gemm_xe20_bf16()
+        if gflops:
+            print(f"\n✓ Test completed successfully!")
+            print(f"  Average performance: {gflops:.2f} GFLOPS")
+    except Exception as e:
+        print(f"\n✗ Test failed with exception:")
+        print(f"  {e}")
+        import traceback
+        traceback.print_exc()
diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h
index 3e4f55c5bd..e344b2922a 100644
--- a/include/cutlass/arch/arch.h
+++ b/include/cutlass/arch/arch.h
@@ -123,6 +123,17 @@ struct IntelXe {
   static int const kMinComputeCapability = 0;
 };
 
+// Intel Xe architecture aliases for library generation compatibility
+// Xe12 = PVC (Ponte Vecchio)
+struct Xe12 : IntelXe {
+  static int const kIntelXeArch = 12;
+};
+
+// Xe20 = BMG (Battlemage) 
+struct Xe20 : IntelXe {
+  static int const kIntelXeArch = 20;
+};
+
 struct Agnostic {
   static int const kMinComputeCapability = 1;
 };
diff --git a/include/cutlass/epilogue/collective/builders/xe_builder.inl b/include/cutlass/epilogue/collective/builders/xe_builder.inl
index 809cede6f7..495244c6e2 100644
--- a/include/cutlass/epilogue/collective/builders/xe_builder.inl
+++ b/include/cutlass/epilogue/collective/builders/xe_builder.inl
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -163,8 +164,8 @@ template <
           "Trying to use Intel pipeline on Non Intel hardware");
       #endif
       static_assert(is_static<TileShape_MNK>::value);
-      static_assert(cute::is_any_of_v<ElementC, float, bfloat16_t, half_t, void>,
-        "ElementC needs to be one of: float, bfloat, half for the Intel pipeline");
+      static_assert(cute::is_any_of_v<ElementC, float, bfloat16_t, half_t, int32_t, void>,
+        "ElementC needs to be one of: float, bfloat, half, int32, or void for the Intel pipeline");
       
       using EpilogueSchedule = std::conditional_t<cute::is_same_v<EpilogueScheduleType, EpilogueScheduleAuto>, 
                                                   IntelXeXMX16,
@@ -211,4 +212,110 @@ template <
             CopyOpR2S_
         >;
     };
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Xe12 (PVC) Epilogue CollectiveBuilder - forwards to IntelXe
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class TileShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC,
+  class GmemLayoutTagC,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class EpilogueScheduleType,
+  class FusionOpOrCallbacks
+  >
+struct CollectiveBuilder<
+  arch::Xe12,
+  arch::OpClassTensorOp,
+  TileShape_MNK,
+  Shape<_1, _1, _1>,
+  EpilogueTileType,
+  ElementAccumulator,
+  ElementCompute,
+  ElementC,
+  GmemLayoutTagC,
+  AlignmentC,
+  ElementD,
+  GmemLayoutTagD,
+  AlignmentD,
+  EpilogueScheduleType,
+  FusionOpOrCallbacks
+  > : CollectiveBuilder<
+      arch::IntelXe,  // Forward to IntelXe
+      arch::OpClassTensorOp,
+      TileShape_MNK,
+      Shape<_1, _1, _1>,
+      EpilogueTileType,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      GmemLayoutTagC,
+      AlignmentC,
+      ElementD,
+      GmemLayoutTagD,
+      AlignmentD,
+      EpilogueScheduleType,
+      FusionOpOrCallbacks
+  > {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Xe20 (BMG) Epilogue CollectiveBuilder - forwards to IntelXe
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class TileShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC,
+  class GmemLayoutTagC,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class EpilogueScheduleType,
+  class FusionOpOrCallbacks
+  >
+struct CollectiveBuilder<
+  arch::Xe20,
+  arch::OpClassTensorOp,
+  TileShape_MNK,
+  Shape<_1, _1, _1>,
+  EpilogueTileType,
+  ElementAccumulator,
+  ElementCompute,
+  ElementC,
+  GmemLayoutTagC,
+  AlignmentC,
+  ElementD,
+  GmemLayoutTagD,
+  AlignmentD,
+  EpilogueScheduleType,
+  FusionOpOrCallbacks
+  > : CollectiveBuilder<
+      arch::IntelXe,  // Forward to IntelXe
+      arch::OpClassTensorOp,
+      TileShape_MNK,
+      Shape<_1, _1, _1>,
+      EpilogueTileType,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      GmemLayoutTagC,
+      AlignmentC,
+      ElementD,
+      GmemLayoutTagD,
+      AlignmentD,
+      EpilogueScheduleType,
+      FusionOpOrCallbacks
+  > {};
+
 } // namespace cutlass::epilogue::collective
+
diff --git a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl
index c2ffaa5a5f..71a2101329 100644
--- a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl
+++ b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -54,32 +55,36 @@ constexpr auto get_num_atoms(T_m tile_m, T_n tile_n){
 
 template<bool is_t, bool is_v, typename T_m, typename T_n>
 constexpr auto select_copy_atom_16b(T_m tile_m, T_n tile_n){
+  // Extract compile-time constant values from cute::Int<> types
+  constexpr int tile_m_val = decltype(tile_m)::value;
+  constexpr int tile_n_val = decltype(tile_n)::value;
+  
   #define RETURN_ATOM(WIDTH, HEIGHT, LETTER) \
     return XE_2D_U16x##WIDTH##x##HEIGHT##_LD_##LETTER {};
 
   if constexpr(is_t){
     // tile_m and tile_n have swapped role in case of _T
-    static_assert(tile_n % 16 == 0 && "Invalid tile_m");
-    if constexpr(tile_m == 8){
+    static_assert(tile_n_val % 16 == 0 && "Invalid tile_m");
+    if constexpr(tile_m_val == 8){
       RETURN_ATOM(16, 8, T)
-    } else if constexpr(tile_m % 16 == 0){
+    } else if constexpr(tile_m_val % 16 == 0){
       RETURN_ATOM(16, 16, T)
     } else{
       static_assert(dependent_false<T_m> && "Invalid tile_n");
     }
   } else if constexpr(is_v){
     #define SELECT_HEIGHT_V(WIDTH) \
-      if constexpr(tile_n == 16){ \
+      if constexpr(tile_n_val == 16){ \
         RETURN_ATOM(WIDTH, 16, V) \
-      } else if constexpr(tile_n % 32 == 0){ \
+      } else if constexpr(tile_n_val % 32 == 0){ \
         RETURN_ATOM(WIDTH, 32, V) \
       } else{ \
         static_assert(dependent_false<T_n> && "Invalid tile_n"); \
       }
 
-    if constexpr(tile_m == 16){
+    if constexpr(tile_m_val == 16){
       SELECT_HEIGHT_V(16)
-    } else if constexpr(tile_m % 32 == 0){
+    } else if constexpr(tile_m_val % 32 == 0){
       SELECT_HEIGHT_V(32)
     } else{
       static_assert(dependent_false<T_m> && "Invalid tile_m");
@@ -87,25 +92,25 @@ constexpr auto select_copy_atom_16b(T_m tile_m, T_n tile_n){
     #undef SELECT_HEIGHT_V
   } else{ // _N
     #define SELECT_WIDTH_N(HEIGHT) \
-      if constexpr(tile_m == 1){ \
+      if constexpr(tile_m_val == 1){ \
         RETURN_ATOM(1, HEIGHT, N) \
-      } else if constexpr(tile_m == 2){ \
+      } else if constexpr(tile_m_val == 2){ \
         RETURN_ATOM(2, HEIGHT, N) \
-      } else if constexpr(tile_m == 4){ \
+      } else if constexpr(tile_m_val == 4){ \
         RETURN_ATOM(4, HEIGHT, N) \
-      } else if constexpr(tile_m == 8){ \
+      } else if constexpr(tile_m_val == 8){ \
         RETURN_ATOM(8, HEIGHT, N) \
-      } else if constexpr(tile_m == 16){ \
+      } else if constexpr(tile_m_val == 16){ \
         RETURN_ATOM(16, HEIGHT, N) \
-      } else if constexpr(tile_m % 32 == 0){ \
+      } else if constexpr(tile_m_val % 32 == 0){ \
         RETURN_ATOM(32, HEIGHT, N) \
       } else { \
         static_assert(dependent_false<T_m> && "Invalid tile_m"); \
       }
 
-    if constexpr(tile_n == 16){
+    if constexpr(tile_n_val == 16){
       SELECT_WIDTH_N(16)
-    } else if constexpr(tile_n % 32 == 0){
+    } else if constexpr(tile_n_val % 32 == 0){
       SELECT_WIDTH_N(32)
     } else {
       static_assert(dependent_false<T_n> && "Invalid tile_n");
@@ -130,6 +135,11 @@ PICK_MMA(bfloat16_t, float, XE_8x16x16_F32BF16BF16F32_TT);
 PICK_MMA(bfloat16_t, bfloat16_t, XE_8x16x16_BF16BF16BF16BF16_TT);
 PICK_MMA(half_t, float, XE_8x16x16_F32F16F16F32_TT);
 PICK_MMA(half_t, half_t, XE_8x16x16_F16F16F16F16_TT);
+// FP8 types use FP16 accumulation, the conversion happens in the collective
+PICK_MMA(float_e4m3_t, float, XE_8x16x16_F32F16F16F32_TT);
+PICK_MMA(float_e5m2_t, float, XE_8x16x16_F32F16F16F32_TT);
+// INT8 types use INT32 accumulation (note: K=32 for INT8, not K=16)
+PICK_MMA(int8_t, int32_t, XE_8x16x32_S32S8S8S32_TT);
 
 #undef PICK_MMA
 }
@@ -171,8 +181,8 @@ struct CollectiveBuilder<
           "Trying to use Intel pipeline on Non Intel hardware");
       #endif
       static_assert(is_static<TileShape_MNK>::value);
-      static_assert(cute::is_any_of_v<ElementAccumulator, float, bfloat16_t, half_t>,
-        "Intel multi-stage pipeline requires ElementC to be of type float, bfloat or half");
+      static_assert(cute::is_any_of_v<ElementAccumulator, float, bfloat16_t, half_t, int32_t>,
+        "Intel multi-stage pipeline requires ElementC to be of type float, bfloat, half, or int32");
 
       static constexpr bool isAtypeBig = cute::sizeof_bits_v<ElementA> > cute::sizeof_bits_v<ElementB>;
       using MMAType = std::conditional_t<isAtypeBig, ElementA, ElementB>;
@@ -218,6 +228,7 @@ struct CollectiveBuilder<
       using ElementA_ = std::conditional_t<cute::sizeof_bits_v<ElementA> <= 8, cute::tuple<ElementA>, ElementA>;
       using ElementB_ = std::conditional_t<cute::sizeof_bits_v<ElementB> <= 8, cute::tuple<ElementB>, ElementB>;
 
+
       using CollectiveOp = cutlass::gemm::collective::CollectiveMma<
               DispatchPolicy,
               TileShape_MNK,
@@ -236,4 +247,96 @@ struct CollectiveBuilder<
               TransformB
           >;
     };
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Xe12 (PVC) CollectiveBuilder - forwards to IntelXe
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class KernelScheduleType
+  >
+struct CollectiveBuilder<
+  arch::Xe12,
+  arch::OpClassTensorOp,
+  ElementA,
+  GmemLayoutATag,
+  AlignmentA,
+  ElementB,
+  GmemLayoutBTag,
+  AlignmentB,
+  ElementAccumulator,
+  TileShape_MNK,
+  Shape<_1, _1, _1>,    
+  cutlass::gemm::collective::StageCountAuto,
+  KernelScheduleType
+  > : CollectiveBuilder<
+      arch::IntelXe,  // Forward to IntelXe
+      arch::OpClassTensorOp,
+      ElementA,
+      GmemLayoutATag,
+      AlignmentA,
+      ElementB,
+      GmemLayoutBTag,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape_MNK,
+      Shape<_1, _1, _1>,
+      cutlass::gemm::collective::StageCountAuto,
+      KernelScheduleType
+  > {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Xe20 (BMG) CollectiveBuilder - forwards to IntelXe
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class KernelScheduleType
+  >
+struct CollectiveBuilder<
+  arch::Xe20,
+  arch::OpClassTensorOp,
+  ElementA,
+  GmemLayoutATag,
+  AlignmentA,
+  ElementB,
+  GmemLayoutBTag,
+  AlignmentB,
+  ElementAccumulator,
+  TileShape_MNK,
+  Shape<_1, _1, _1>,    
+  cutlass::gemm::collective::StageCountAuto,
+  KernelScheduleType
+  > : CollectiveBuilder<
+      arch::IntelXe,  // Forward to IntelXe
+      arch::OpClassTensorOp,
+      ElementA,
+      GmemLayoutATag,
+      AlignmentA,
+      ElementB,
+      GmemLayoutBTag,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape_MNK,
+      Shape<_1, _1, _1>,
+      cutlass::gemm::collective::StageCountAuto,
+      KernelScheduleType
+  > {};
+
 }
+
diff --git a/media/docs/python/xe_cutlass_library.md b/media/docs/python/xe_cutlass_library.md
new file mode 100644
index 0000000000..b348db46d8
--- /dev/null
+++ b/media/docs/python/xe_cutlass_library.md
@@ -0,0 +1,192 @@
+<!--
+Copyright (C) 2025 Intel Corporation, All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+# Kernel Generation and Manifest
+
+This is a code/kernel generation system that creates a searchable catalog of CUTLASS kernel operations, bridging build-time generation and runtime selection.
+
+## Architecture Overview
+
+**Two-Phase System:**
+1. **Build Time (Python)**: `manifest.py` generates C++ initialization code
+2. **Runtime (C++)**: Generated code registers operations into a searchable `Manifest`
+
+```
+Python Generator → C++ Files → Compiled Library → Runtime Catalog
+```
+
+## Key Components
+
+### Python Generator (`manifest.py`)
+
+**Responsibilities:**
+- Filter kernels by GPU architecture (SM/Xe), operation type, patterns
+- Group operations by kind/architecture/instruction type  
+- Generate C++ initialization functions and CMake files
+
+### Generated File Structure
+```
+build/tools/library/generated/
+├── initialize_all.cpp
+├── gemm/20/tensorop/cutlass3x_xe20_tensorop_gemm_bf16_*.cpp
+└── manifest.cmake
+```
+
+### Architecture Naming
+| GPU | Prefix | ID | Example |
+|-----|--------|----|---------|
+| CUDA | `sm` | 70-90 | `sm80` |
+| Intel Xe | `xe` | 12,20 | `xe20` |
+
+## Runtime API
+
+### Core Classes
+
+```cpp
+// Manifest: Operation catalog
+class Manifest {
+  Status initialize();
+  void append(Operation *op);
+  OperationVector const& operations() const;
+};
+
+// Operation: Base kernel interface  
+class Operation {
+  virtual Status can_implement(void const *config, void const *args) const = 0;
+  virtual Status run(void const *args, void *workspace, Stream stream) const = 0;
+};
+```
+
+### Initialization Hierarchy
+```cpp
+namespace cutlass::library {
+  void initialize_all(Manifest &manifest);                    // All operations
+  void initialize_all_gemm_operations(Manifest &manifest);    // GEMM only
+  void initialize_all_xe20_gemm_operations(Manifest &manifest); // XE20 GEMM
+}
+```
+
+## Usage Examples
+
+### Basic Usage
+```cpp
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+
+cutlass::library::Manifest manifest;
+cutlass::library::initialize_all(manifest);
+
+// Find BF16 GEMM
+for (auto& op : manifest.operations()) {
+  if (op->description().name.find("bf16") != std::string::npos) {
+    // Use operation...
+  }
+}
+```
+
+### Python Integration
+```python
+# Use extern "C" wrappers for ctypes integration
+from ctypes import CDLL
+lib = CDLL("libcutlass_gemm_xe20_gemm.so")
+# Call exported C functions that wrap C++ manifest APIs
+```
+
+**Example Implementation:** See `examples/11_xe20_cutlass_library/` for a complete CMake-based shared library that exports CUTLASS kernels for Python usage via ctypes.
+
+## Common Patterns
+
+### Lazy Initialization
+```cpp
+class LazyManifest {
+  cutlass::library::Manifest manifest_;
+  bool initialized_ = false;
+public:
+  cutlass::library::Manifest& get() {
+    if (!initialized_) {
+      cutlass::library::initialize_all(manifest_);
+      initialized_ = true;
+    }
+    return manifest_;
+  }
+};
+```
+
+### Operation Caching
+```cpp
+class OperationCache {
+  std::map<std::string, cutlass::library::Operation*> cache_;
+public:
+  cutlass::library::Operation* find(const std::string& pattern) {
+    if (cache_.count(pattern)) return cache_[pattern];
+    // Search manifest and cache result...
+  }
+};
+```
+
+## Build Integration
+
+### CMake Configuration
+```bash
+# Generate for Intel XE20
+cmake .. -DCUTLASS_LIBRARY_GENERATOR_ARCHS="20"
+ninja cutlass_library
+```
+
+### Python Generator
+```bash
+python3 generator.py --operations=gemm --architectures=20 --build-dir=.
+```
+
+## Performance Tips
+
+- **Selective Initialization**: Only initialize needed operation kinds
+- **Operation Caching**: Cache frequently used operations
+- **Kernel Filtering**: Use build-time filtering to reduce library size
+- **Lazy Loading**: Initialize manifest only when needed
+
+## Debugging
+
+```bash
+# List generated operations
+nm -D libcutlass_gemm_xe20_gemm.so | grep initialize
+
+# Enable Python debug logging
+python3 -c "import logging; logging.basicConfig(level=logging.DEBUG)"
+```
+
+## References
+
+- **Source**: `python/cutlass_library/manifest.py`
+- **Headers**: `tools/library/include/cutlass/library/`
+- **Generated**: `build/tools/library/generated/`
+- **Examples**: 
+  - `examples/11_xe20_cutlass_library/` - CMake-based shared library for Python integration
+  - `examples/python/cutlass_library/xe20_gemm_bf16.py` - Python test script using ctypes
diff --git a/media/docs/python/xe_library_generation.md b/media/docs/python/xe_library_generation.md
new file mode 100644
index 0000000000..63c22c088f
--- /dev/null
+++ b/media/docs/python/xe_library_generation.md
@@ -0,0 +1,225 @@
+<!--
+Copyright (C) 2025 Intel Corporation, All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+# Intel SYCL*TLA Library Generation Guide
+
+**Complete Reference for Intel Xe GPU Architecture Support**
+
+---
+
+## Quick Start
+
+```bash
+# Configure for BMG (Xe2)
+cd build
+cmake .. -GNinja -DCUTLASS_NVCC_ARCHS="" -DCUTLASS_ENABLE_SYCL=ON -DSYCL_INTEL_TARGET -DCUTLASS_LIBRARY_GENERATOR_ARCHS="20"
+
+# Build libraries
+ninja cutlass_library
+
+# Test generation
+cd python/cutlass_library
+python3 test_simple_generation.py --build-dir ./test_build --arch 20
+```
+
+**Expected Output:** ✅ 24 operations, 31 .cpp files generated
+
+---
+
+## Architecture Support
+
+| GPU | Arch | Compute Cap | File Ext | Arch Tag |
+|-----|------|-------------|----------|----------|
+| **BMG** (Xe2) | 20 | 12-50 | `.cpp` | `cutlass::arch::Xe20` |
+| **PVC** (Xe-HPC) | 12 | 12-50 | `.cpp` | `cutlass::arch::Xe12` |
+
+**Key Differences from CUDA:**
+- Architecture prefix: `xe` (not `sm`)
+- File extension: `.cpp` (not `.cu`)
+- Compute capability: 12-50 (vs 50-120 for CUDA)
+
+---
+
+## Supported Kernel Types
+
+### ✅ Homogeneous Types (A == B)
+
+| Type | A × B → C/D | Math Inst | Tile | Align | Status |
+|------|-------------|-----------|------|-------|--------|
+| **FP16** | half × half → float | [8,16,16] | 256×256×32 | 8 | ✅ |
+| **BF16** | bf16 × bf16 → float | [8,16,16] | 256×256×32 | 8 | ✅ |
+| **FP8-E4M3** | e4m3 × e4m3 → float | [8,16,32] | 256×256×64 | 16 | ✅ |
+| **FP8-E5M2** | e5m2 × e5m2 → float | [8,16,32] | 256×256×64 | 16 | ✅ |
+| **INT8** | int8 × int8 → int32 | [8,16,32] | 256×256×64 | 16 | ✅ |
+
+**Layout Combinations:** RR, RC, CR, CC (4 variants per type)
+
+### ❌ Mixed Precision (A ≠ B)
+
+Mixed precision infrastructure is not supported now:
+- FP16 × E4M3/E5M2 → FP32
+- BF16 × E4M3/E5M2 → FP32  
+- FP16 × INT4 → FP32
+
+---
+
+## Generated Libraries
+
+```bash
+$ ls -lh build/tools/library/libcutlass*.so
+-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_bf16.so    # BF16 kernels
+-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_e4m3.so    # FP8 E4M3 
+-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_e5m2.so    # FP8 E5M2
+-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_f16.so     # FP16 kernels
+-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm_s8.so      # INT8 kernels
+-rwxrwxr-x 186K libcutlass_gemm_xe20_gemm.so         # Generic
+-rwxrwxr-x  19K libcutlass.so                        # Base library
+```
+
+### Kernel Naming Convention
+
+```
+cutlass3x_xe{arch}_{opclass}_{operation}_{dtype}_{tile}_{warp}_{layout}_align{N}
+```
+
+**Examples:**
+```cpp
+cutlass3x_xe20_tensorop_gemm_f16_256x256_32x0_nn_align8    // FP16, Row×Row
+cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nt_align8   // BF16, Row×Column
+cutlass3x_xe20_tensorop_gemm_e4m3_256x256_64x0_tn_align16  // E4M3, Column×Row
+```
+
+**Layout Codes:** `nn`=Row×Row, `nt`=Row×Column, `tn`=Column×Row, `tt`=Column×Column
+
+---
+
+## Build & Usage
+
+### CMake Configuration
+
+```bash
+# BMG (Xe2)
+cmake .. -GNinja -DCUTLASS_ENABLE_SYCL=ON -DCUTLASS_LIBRARY_GENERATOR_ARCHS="20"
+
+# PVC (Xe-HPC)  
+cmake .. -GNinja -DCUTLASS_ENABLE_SYCL=ON -DCUTLASS_LIBRARY_GENERATOR_ARCHS="12"
+```
+
+### Build Targets
+
+```bash
+ninja cutlass_library                              # All libraries
+ninja cutlass_library_gemm_xe20_gemm_bf16         # BF16 only
+ninja cutlass_library_gemm_xe20_gemm_f16          # FP16 only
+```
+
+### Python Generator (Direct)
+
+```bash
+cd build
+python3 ../python/cutlass_library/generator.py --operations=gemm --architectures=20 --build-dir=.
+```
+
+### Python Integration Example
+
+For Python integration via ctypes, see:
+- **`examples/11_xe20_cutlass_library/`** - Complete CMake-based shared library example
+- **`examples/python/cutlass_library/xe20_gemm_bf16.py`** - Python test script using ctypes
+
+**Build and test:**
+```bash
+# Build the shared library
+ninja xe20_cutlass_library_bf16
+
+# Test with Python
+cd examples/python/cutlass_library
+python3 xe20_gemm_bf16.py
+```
+
+## Troubleshooting
+
+### No Operations Generated
+**Check:** `GenerateIntelXe()` called for arch in [12, 20] in `generator.py`
+
+### Library Link Errors
+```
+undefined reference to `initialize_all_xe20_gemm_bf16_gemm_operations()`
+```
+**Solution:** Build and link the specific library: `-lcutlass_gemm_xe20_gemm_bf16`
+
+## Summary
+
+### ✅ What Works
+- **5 data type libraries** (FP16, BF16, E4M3, E5M2, INT8)
+- **~24 operations, 31 .cpp files** generated
+- **Homogeneous type kernels** compile cleanly
+- **INT32 accumulator** for INT8
+- **FP8→FP16 conversion** in MMA
+
+### ❌ Limitations  
+- **Mixed precision** requires grouped GEMM
+- **Regular library** only supports ElementA == ElementB
+- **No INT4** in regular GEMM
+
+### 📊 Quick Reference
+| Feature | Value |
+|---------|-------|
+| Arch Numbers | BMG=20, PVC=12 |
+| File Ext | `.cpp` |
+| Arch Prefix | `xe` |
+| CC Range | 12-50 |
+| Total Libraries | 7 |
+| Total Kernels | ~24 |
+| Supported Types | FP16, BF16, E4M3, E5M2, INT8 |
+
+## Examples and References
+
+### Practical Examples
+- **`examples/11_xe20_cutlass_library/`** - CMake-based shared library for Python integration
+  - Exports `sycl_tla_gemm_xe20_bf16()` function via extern "C"
+  - Builds `libxe20_cutlass_library_bf16.so` with proper CMake integration
+  - Integrated into main examples build system (`ninja cutlass_examples`)
+
+- **`examples/python/cutlass_library/xe20_gemm_bf16.py`** - Python ctypes integration
+  - Complete test script using the shared library
+  - Demonstrates workspace querying, execution, and benchmarking
+  - Shows proper error handling and performance measurement
+
+### Build Integration
+```bash
+# Build the example library
+ninja xe20_cutlass_library_bf16
+
+# Run Python test
+cd examples/python/cutlass_library
+python3 xe20_gemm_bf16.py
+```
+
+---
diff --git a/python/cutlass_library/arch_constants.py b/python/cutlass_library/arch_constants.py
new file mode 100644
index 0000000000..36e14a7773
--- /dev/null
+++ b/python/cutlass_library/arch_constants.py
@@ -0,0 +1,47 @@
+#################################################################################################
+#
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Architecture range constants for CUTLASS library generation.
+Shared across manifest.py and gemm_operation.py to avoid circular imports.
+"""
+
+###################################################################################################
+# Architecture range constants
+# Intel Xe architectures use the range [INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX)
+# CUDA architectures use values >= CUDA_ARCH_MIN
+###################################################################################################
+INTEL_XE_ARCH_MIN = 12  # Minimum Intel Xe architecture (PVC = 12, BMG = 20)
+INTEL_XE_ARCH_MAX = 50  # Upper bound (exclusive) for Intel Xe range
+CUDA_ARCH_MIN = 50      # Minimum CUDA architecture (sm_50, sm_60, etc.)
+
+###################################################################################################
diff --git a/python/cutlass_library/gemm_operation.py b/python/cutlass_library/gemm_operation.py
index 6dc9a0456b..d7ce9cb5dd 100644
--- a/python/cutlass_library/gemm_operation.py
+++ b/python/cutlass_library/gemm_operation.py
@@ -1,6 +1,7 @@
-
+#################################################################################################
 #
 # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -47,8 +48,10 @@
   if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
     raise ImportError("Disabling attempt to import cutlass_library")
   from cutlass_library.library import *
+  from cutlass_library.arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN
 except ImportError:
   from library import *
+  from arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -87,7 +90,8 @@ def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue,
     self.B = B
     self.C = C
     self.D = D
-    self.is_xe = self.arch == 11
+    # Intel Xe architectures: PVC (12), BMG/Xe2 (20), etc.
+    self.is_xe = self.arch >= INTEL_XE_ARCH_MIN and self.arch < INTEL_XE_ARCH_MAX
 
     if is_block_scaled(gemm_kind):
       self.ScaleFactorA = ScaleFactorA
@@ -388,6 +392,7 @@ def _procedural_name(self):
           l = self.layout_name(),
           a = str(max(self.A.alignment, self.B.alignment)))
     else:
+      # Intel Xe architectures use xe{cc} naming (e.g., xe20 for BMG, xe12 for PVC)
       threadblock = self.tile_description.procedural_name()
       return "cutlass{p}_xe{ar}_{op}_{ex}_{tb}_{l}_align{a}".format(
           p = self.prefix,
@@ -1156,9 +1161,11 @@ def emit(self, operation):
       'blockwise_prepare_code' : blockwise_prepare_code
     }
 
-    # Overriding values for Intel Xe
+    # Overriding values for Intel Xe architectures
     if operation.is_xe:
-      values['arch'] = "cutlass::arch::IntelXe"
+      # Use specific compute capability for Intel Xe GPUs
+      # e.g., cutlass::arch::Xe20 for BMG, cutlass::arch::Xe12 for PVC
+      values['arch'] = "cutlass::arch::Xe%d" % operation.arch
 
     return SubstituteTemplate(self.gemm_template, values)
 
@@ -1473,7 +1480,13 @@ def emit(self, operation):
 class EmitGemmConfigurationLibrary:
   def __init__(self, operation_path, configuration_name):
     self.configuration_name = configuration_name
-    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+    
+    # Determine file extension based on architecture
+    # Intel Xe architectures (12=PVC, 20=BMG) use .cpp, CUDA uses .cu
+    # Check if operation_path contains /12/, /20/, xe12, or xe20
+    is_xe_arch = any(marker in operation_path for marker in ['/12/', '\\12\\', 'xe12', '/20/', '\\20\\', 'xe20'])
+    file_extension = "cpp" if is_xe_arch else "cu"
+    self.configuration_path = os.path.join(operation_path, "%s.%s" % (configuration_name, file_extension)).replace('\\', '/')
 
     self.instance_emitter = {
       GemmKind.Gemm: EmitGemmInstance,
diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py
index aa73fb8b13..e7e1c4bad1 100644
--- a/python/cutlass_library/generator.py
+++ b/python/cutlass_library/generator.py
@@ -1,6 +1,7 @@
 #################################################################################################
 #
 # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -11769,7 +11770,292 @@ def GeneratePVC_TensorOp_16b_gemm(manifest, cuda_version):
       CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent])
 
 def GeneratePVC(manifest, cuda_version):
-    GeneratePVC_TensorOp_16b_gemm(manifest, cuda_version)
+    """
+    Generate CUTLASS kernels for PVC (Ponte Vecchio) architecture.
+    
+    PVC is Intel's Xe-HPC GPU architecture with compute capability 12.
+    
+    This is a legacy wrapper that calls GenerateIntelXe with arch=12.
+    """
+    GenerateIntelXe(manifest, cuda_version, arch=12)
+
+###################################################################################################
+def GenerateXe_TensorOp_16b_DPAS_gemm(manifest, cuda_version, min_cc=20):
+    """Generate FP16/BF16 GEMM kernels for Intel Xe architecture using DPAS.
+    
+    :param min_cc: Architecture number (12 for PVC, 20 for BMG)
+    """
+    layout_list = [
+        [[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8]],
+        [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8]],
+        [[LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8]],
+        [[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8]],
+    ]
+
+    math_instructions = [
+        MathInstruction(
+            [8, 16, 16],
+            DataType.f16, DataType.f16, DataType.f32,
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+        MathInstruction(
+            [8, 16, 16],
+            DataType.f16, DataType.f16, DataType.f16,
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+        MathInstruction(
+            [8, 16, 16],
+            DataType.bf16, DataType.bf16, DataType.f32,
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+        MathInstruction(
+            [8, 16, 16],
+            DataType.bf16, DataType.bf16, DataType.bf16,
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add)
+    ]
+
+    max_cc = min_cc
+
+    for math_inst in math_instructions:
+        tile_descriptions = [
+            TileDescription([256, 256, 32],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 256, 32],
+                0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([256, 128, 32],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 128, 32],
+                0, [4, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([64, 128, 32],
+                0, [2, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+        ]
+
+        data_type = {
+            "a_type": math_inst.element_a,
+            "b_type": math_inst.element_b,
+            "c_type": math_inst.element_accumulator,
+            "d_type": math_inst.element_accumulator,
+            "acc_type": math_inst.element_accumulator,
+            "epi_type": math_inst.element_accumulator
+        }
+
+        schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]]
+
+        CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent])
+
+
+def GenerateXe_TensorOp_fp8_DPAS_gemm(manifest, cuda_version, min_cc=20):
+    """Generate FP8 (E4M3/E5M2) GEMM kernels for Intel Xe architecture using DPAS.
+    
+    Supported combinations for regular GEMM:
+    - [e4m3, e4m3, fp32]: E4M3 x E4M3 -> FP32 (homogeneous)
+    - [e5m2, e5m2, fp32]: E5M2 x E5M2 -> FP32 (homogeneous)
+    
+    Note: Mixed precision (FP16/BF16 x FP8) requires grouped GEMM infrastructure
+    and is NOT supported for regular library generation.
+    
+    :param min_cc: Architecture number (12 for PVC, 20 for BMG)
+    """
+    layout_list = [
+        [[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 8]],
+        [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 8]],
+        [[LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 8]],
+        [[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 8]],
+    ]
+
+    # FP8 math instructions for Intel Xe
+    # Only homogeneous types (same A and B type) for regular GEMM
+    math_instructions = [
+        # Homogeneous FP8 (same type for A and B) - SUPPORTED
+        MathInstruction(
+            [8, 16, 32],
+            DataType.e4m3, DataType.e4m3, DataType.f32,  # E4M3 x E4M3 -> FP32
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+        MathInstruction(
+            [8, 16, 32],
+            DataType.e5m2, DataType.e5m2, DataType.f32,  # E5M2 x E5M2 -> FP32
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+        
+        # DISABLED: Mixed precision FP16/BF16 x FP8 requires grouped GEMM
+        # These would need MainloopIntelXeXMX16GroupMixedPrecision which is only
+        # activated when IsGroup=true (KernelXePtrArrayCooperative schedule).
+        # Regular library GEMMs use MainloopIntelXeXMX16 which requires ElementA == ElementB.
+        #
+        # MathInstruction([8, 16, 32], DataType.f16, DataType.e5m2, DataType.f32, ...),
+        # MathInstruction([8, 16, 32], DataType.f16, DataType.e4m3, DataType.f32, ...),
+        # MathInstruction([8, 16, 32], DataType.bf16, DataType.e5m2, DataType.f32, ...),
+        # MathInstruction([8, 16, 32], DataType.bf16, DataType.e4m3, DataType.f32, ...),
+    ]
+
+    max_cc = min_cc
+
+    for math_inst in math_instructions:
+        tile_descriptions = [
+            TileDescription([256, 256, 64],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 256, 64],
+                0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([256, 128, 64],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 128, 64],
+                0, [4, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+        ]
+
+        data_type = {
+            "a_type": math_inst.element_a,
+            "b_type": math_inst.element_b,
+            "c_type": math_inst.element_accumulator,
+            "d_type": math_inst.element_accumulator,
+            "acc_type": math_inst.element_accumulator,
+            "epi_type": math_inst.element_accumulator
+        }
+
+        schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]]
+
+        CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent])
+
+def GenerateXe_TensorOp_int8_DPAS_gemm(manifest, cuda_version, min_cc=20):
+    """Generate INT8 GEMM kernels for Intel Xe architecture using DPAS.
+    
+    Supported: [int8, int8, int32] -> INT32 accumulator (hardware requirement)
+    
+    :param min_cc: Architecture number (12 for PVC, 20 for BMG)
+    """
+    layout_list = [
+        [[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 4]],
+        [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 4]],
+        [[LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 4]],
+        [[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 4]],
+    ]
+
+    # INT8 x INT8 -> INT32 (hardware requirement for Intel Xe)
+    math_instructions = [
+        MathInstruction(
+            [8, 16, 32],
+            DataType.s8, DataType.s8, DataType.s32,  # Changed from f32 to s32
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+    ]
+
+    max_cc = min_cc
+
+    for math_inst in math_instructions:
+        tile_descriptions = [
+            TileDescription([256, 256, 64],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 256, 64],
+                0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([256, 128, 64],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 128, 64],
+                0, [4, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+        ]
+
+        data_type = {
+            "a_type": math_inst.element_a,
+            "b_type": math_inst.element_b,
+            "c_type": math_inst.element_accumulator,
+            "d_type": math_inst.element_accumulator,
+            "acc_type": math_inst.element_accumulator,
+            "epi_type": math_inst.element_accumulator
+        }
+
+        schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]]
+
+        CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent])
+
+
+def GenerateXe_TensorOp_mixed_dtype_DPAS_gemm(manifest, cuda_version, min_cc=20):
+    """Generate mixed-precision GEMM kernels for Intel Xe architecture using DPAS.
+    
+    Supported: [fp16, int4, fp32] -> FP16 x INT4 with FP32 accumulator
+    
+    :param min_cc: Architecture number (12 for PVC, 20 for BMG)
+    """
+    layout_list = [
+        [[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 32], [LayoutType.RowMajor, 8]],
+        [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 32], [LayoutType.RowMajor, 8]],
+        [[LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 32], [LayoutType.RowMajor, 8]],
+        [[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 32], [LayoutType.RowMajor, 8]],
+    ]
+
+    # Mixed precision: FP16 x INT4 -> FP32 (hardware requirement for Intel Xe)
+    math_instructions = [
+        MathInstruction(
+            [8, 16, 32],
+            DataType.f16, DataType.s4, DataType.f32,  # Changed from [s8, f16, f32] to [f16, s4, f32]
+            OpcodeClass.TensorOp,
+            MathOperation.multiply_add),
+    ]
+
+    max_cc = min_cc
+
+    for math_inst in math_instructions:
+        tile_descriptions = [
+            TileDescription([256, 256, 64],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([128, 256, 64],
+                0, [4, 8, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+            TileDescription([256, 128, 64],
+                0, [8, 4, 1], math_inst, min_cc, max_cc, [1, 1, 1]),
+        ]
+
+        data_type = {
+            "a_type": math_inst.element_a,
+            "b_type": math_inst.element_b,
+            "c_type": math_inst.element_accumulator,
+            "d_type": math_inst.element_accumulator,
+            "acc_type": math_inst.element_accumulator,
+            "epi_type": math_inst.element_accumulator
+        }
+
+        schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]]
+
+        CreateGemmUniversal3xOperator(manifest, layout_list, tile_descriptions, data_type, schedules, tile_schedulers=[TileSchedulerType.Persistent])
+
+
+def GenerateBMG(manifest, cuda_version):
+    """
+    Generate CUTLASS kernels for BMG (Battlemage/Xe2) architecture.
+    
+    BMG is Intel's Xe2 GPU architecture with compute capability 20.
+    Supports DPAS operations with FP16, BF16, FP8, and INT8 data types.
+    
+    This is a legacy wrapper that calls GenerateIntelXe with arch=20.
+    """
+    GenerateIntelXe(manifest, cuda_version, arch=20)
+
+def GenerateIntelXe(manifest, cuda_version, arch=20):
+    """
+    Unified generator for Intel Xe GPU architectures.
+    
+    Supports both PVC (arch 12) and BMG (arch 20) with the same generation code.
+    The operations are identical, only the architecture number differs.
+    
+    Supported data types:
+    - FP16/BF16: [fp16/bf16, fp16/bf16, fp32]
+    - INT8: [int8, int8, int32]
+    - FP8: [fp8, fp8, fp32] (E4M3 or E5M2, same types only)
+    - Mixed: [fp16, int4, fp32]
+    
+    :param manifest: Manifest object to add operations to
+    :param cuda_version: CUDA version string (used for compatibility)
+    :param arch: Architecture number (12 for PVC, 20 for BMG)
+    """
+    if arch not in [12, 20]:
+        raise ValueError(f"Unsupported Intel Xe architecture: {arch}. Supported: 12 (PVC), 20 (BMG)")
+    
+    # All Intel Xe architectures use the same generation functions
+    # Only the min_cc (architecture number) differs
+    GenerateXe_TensorOp_16b_DPAS_gemm(manifest, cuda_version, min_cc=arch)
+    GenerateXe_TensorOp_fp8_DPAS_gemm(manifest, cuda_version, min_cc=arch)
+    GenerateXe_TensorOp_int8_DPAS_gemm(manifest, cuda_version, min_cc=arch)
+    # DISABLED: Mixed precision (FP16 x INT4) requires grouped GEMM infrastructure
+    # Regular library generation uses MainloopIntelXeXMX16 which requires ElementA == ElementB
+    # GenerateXe_TensorOp_mixed_dtype_DPAS_gemm(manifest, cuda_version, min_cc=arch)
 
 ###################################################################################################
 
@@ -11865,6 +12151,21 @@ def define_parser():
     GenerateSM100(manifest, args.cuda_version)
     GenerateSM120(manifest, args.cuda_version)
 
+  # Intel Xe GPU architectures - unified handling for PVC and BMG
+  # Both architectures share the same generation code, just different arch numbers
+  
+  # Check for BMG (architecture 20)
+  bmg_arch_list = ["20", "bmg", "xe2", "intel_gpu_bmg_g21"]
+  bmg_enabled_arch = any(arch.lower() in [x.lower() for x in bmg_arch_list] for arch in archs)
+  if bmg_enabled_arch:
+    GenerateIntelXe(manifest, args.cuda_version, arch=20)
+
+  # Check for PVC (architecture 12)
+  pvc_arch_list = ["12", "pvc", "intel_gpu_pvc"]
+  pvc_enabled_arch = any(arch.lower() in [x.lower() for x in pvc_arch_list] for arch in archs)
+  if pvc_enabled_arch:
+    GenerateIntelXe(manifest, args.cuda_version, arch=12)
+
   if 'library' in args.generator_target.split(','):
     manifest.emit(GeneratorTarget.Library)
 
diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py
index baaaac28a8..e0ddf91d43 100644
--- a/python/cutlass_library/manifest.py
+++ b/python/cutlass_library/manifest.py
@@ -1,6 +1,7 @@
 #################################################################################################
 #
 # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -65,6 +66,16 @@
 ###################################################################################################
 _LOGGER = logging.getLogger(__name__)
 
+###################################################################################################
+# Import architecture range constants from shared module
+###################################################################################################
+try:
+  from cutlass_library.arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN
+except ImportError:
+  from arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN
+
+###################################################################################################
+
 
 class EmitOperationKindAll:
   """
@@ -136,7 +147,27 @@ def __enter__(self):
                   str(self.operation_path));
     os.makedirs(self.operation_path, exist_ok=True)
 
-    self.top_level_path = os.path.join(self.operation_path, f"all_{OperationKindNames[self.kind]}_operations.cu")
+    # Determine file extension based on architecture
+    # Check if any Intel Xe target is present in the architectures
+    file_extension = "cu"  # Default to CUDA
+    if self.args and hasattr(self.args, 'architectures'):
+      archs = self.args.architectures.split(';') if len(self.args.architectures) else []
+      for arch in archs:
+        arch_lower = arch.lower()
+        # Check for Intel Xe targets
+        if any(xe_target in arch_lower for xe_target in ['pvc', 'bmg', 'intel_gpu']):
+          file_extension = "cpp"
+          break
+        # Check for numeric Xe architecture in the Intel Xe range
+        try:
+          arch_num = int(arch.split('a')[0].split('f')[0])
+          if arch_num >= INTEL_XE_ARCH_MIN and arch_num < INTEL_XE_ARCH_MAX:
+            file_extension = "cpp"
+            break
+        except (ValueError, AttributeError):
+          pass
+
+    self.top_level_path = os.path.join(self.operation_path, f"all_{OperationKindNames[self.kind]}_operations.{file_extension}")
     _LOGGER.debug(f"***   top_level_path (file to write): {str(self.top_level_path)}")
 
     self.top_level_file = open(self.top_level_path, "w")
@@ -184,9 +215,10 @@ class EmitOperationKindLibrary:
   for min_cc=90 and OperationKind=Gemm), in the file
   all_sm{min_cc}_{operation_kind}_operations.cu
   (e.g., all_sm90_gemm_operations.cu for min_cc=90 and OperationKind=Gemm).
+  For Intel Xe targets, uses xe{min_cc} prefix instead of sm{min_cc}.
   The min_cc variable here indicates the minimum GPU architecture version
   that the things to be initialized require.
-  For example, min_cc=90 indicates sm90.
+  For example, min_cc=90 indicates sm90 for CUDA, min_cc=20 indicates Xe2/BMG for Intel.
 
   That file declares several functions in namespace cutlass::library.
   The functions all have this form,
@@ -207,11 +239,23 @@ class EmitOperationKindLibrary:
   of what happens in each of those subdirectories.
   """
 
+  @staticmethod
+  def get_arch_prefix(min_cc):
+    """Get architecture prefix based on compute capability.
+    Returns 'sm' for CUDA architectures, 'xe' for Intel Xe architectures.
+    Intel Xe: 12 (PVC), 20 (BMG)
+    CUDA: 50+ for CUDA architectures"""
+    if min_cc >= INTEL_XE_ARCH_MIN and min_cc < INTEL_XE_ARCH_MAX:
+      return 'xe'
+    else:
+      return 'sm'
+
   def __init__(self, generated_path, min_cc, kind, args):
     self.generated_path = generated_path
     self.min_cc = min_cc
     self.kind = kind
     self.args = args
+    self.arch_prefix = self.get_arch_prefix(min_cc)
     self.emitters = {
       OperationKind.Gemm: EmitGemmConfigurationLibrary,
       OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
@@ -242,12 +286,12 @@ def __init__(self, generated_path, min_cc, kind, args):
 //
 // Entry point to construct operations
 //
-void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest) {
+void initialize_all_${arch_prefix}${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest) {
 """
     self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
     self.configuration_template = "  initialize_${configuration_name}(manifest);\n"
-    self.subclass_call_template = "  initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(manifest);\n"
-    self.subclass_prototype_template = "void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n"
+    self.subclass_call_template = "  initialize_all_${arch_prefix}${min_cc}_${subclass_name}_${operation_name}_operations(manifest);\n"
+    self.subclass_prototype_template = "void initialize_all_${arch_prefix}${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n"
     self.epilogue_template ="""}
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -268,7 +312,9 @@ def __enter__(self):
     _LOGGER.debug(f"***   operation_path (directory to make): {str(self.operation_path)}")
     os.makedirs(self.operation_path)
 
-    self.top_level_path = os.path.join(self.operation_path, f"all_sm{self.min_cc}_{OperationKindNames[self.kind]}_operations.cu")
+    # Use .cpp extension for Intel Xe architectures, .cu for CUDA
+    file_extension = "cpp" if (self.min_cc >= INTEL_XE_ARCH_MIN and self.min_cc < INTEL_XE_ARCH_MAX) else "cu"
+    self.top_level_path = os.path.join(self.operation_path, f"all_{self.arch_prefix}{self.min_cc}_{OperationKindNames[self.kind]}_operations.{file_extension}")
     _LOGGER.debug(f"***   top_level_path (file to write): {str(self.top_level_path)}")
 
     self.top_level_file = open(self.top_level_path, "w")
@@ -307,9 +353,11 @@ def emit(self, configuration_name, operations):
 
       self.subclass_configurations[extended_name] = []
 
+      # Use .cpp extension for Intel Xe architectures, .cu for CUDA
+      file_extension = "cpp" if (self.min_cc >= INTEL_XE_ARCH_MIN and self.min_cc < INTEL_XE_ARCH_MAX) else "cu"
       # Open a new top-level file for this sub class
       subclass_top_level_path = os.path.join(
-        subclass_path, f"all_sm{self.min_cc}_{extended_name}_{OperationKindNames[self.kind]}_operations.cu")
+        subclass_path, f"all_{self.arch_prefix}{self.min_cc}_{extended_name}_{OperationKindNames[self.kind]}_operations.{file_extension}")
       _LOGGER.debug('***     subclass_top_level_path (min_cc, extended_name, ' +
                     'OperationKind): ' + str(subclass_top_level_path))
 
@@ -337,6 +385,7 @@ def __exit__(self, exception_type, exception_value, traceback):
     _LOGGER.debug("*** EmitOperationKindLibrary::__exit__")    
     for subclass_name, subclass_file in sorted(self.subclass_files.items()):
       subclass_cfg = {
+        'arch_prefix': self.arch_prefix,
         'min_cc': str(self.min_cc),
         'subclass_name': subclass_name,
         'operation_name': OperationKindNames[self.kind]
@@ -345,6 +394,7 @@ def __exit__(self, exception_type, exception_value, traceback):
 
     self.top_level_file.write(
       SubstituteTemplate(self.entry_template, {
+        'arch_prefix': self.arch_prefix,
         'min_cc': str(self.min_cc),
         'subclass_name': '',
         'operation_name': OperationKindNames[self.kind]
@@ -353,6 +403,7 @@ def __exit__(self, exception_type, exception_value, traceback):
     # Finish and close all subclass files
     for subclass_name, subclass_file in sorted(self.subclass_files.items()):
       subclass_cfg = {
+        'arch_prefix': self.arch_prefix,
         'min_cc': str(self.min_cc),
         'subclass_name': subclass_name,
         'operation_name': OperationKindNames[self.kind]
@@ -511,6 +562,7 @@ def __init__(self, args = None):
     self.compute_capabilities_feature_set = ['50',]
     self.curr_build_dir = '.'
     self.filter_by_cc = True
+    self.is_xe_target = False  # Track if building for Intel Xe
 
     if self.args:
       self.kernel_filter = self.args.kernels
@@ -518,10 +570,43 @@ def __init__(self, args = None):
 
       # A common user error is to use commas instead of semicolons.
       if ',' in args.architectures:
-        raise RuntimeError("The list of architectures (CMake option CUTLASS_NVCC_ARCHS) must be semicolon-delimited.\nDon't use commas to separate the architectures; use semicolons.\nYou specified the list as: " + args.architectures)
+        raise RuntimeError("The list of architectures (CMake option CUTLASS_NVCC_ARCHS or DPCPP_SYCL_TARGET) must be semicolon-delimited.\nDon't use commas to separate the architectures; use semicolons.\nYou specified the list as: " + args.architectures)
       
       self.compute_capabilities_feature_set = args.architectures.split(';') if len(args.architectures) else ['50',]
-      self.compute_capabilities_baseline = sorted(set(int(arch.split('a')[0].split('f')[0]) for arch in self.compute_capabilities_feature_set))
+      
+      # Parse architecture identifiers - support both CUDA SM and Intel Xe targets
+      baseline_archs = []
+      for arch in self.compute_capabilities_feature_set:
+        # Check if this is an Intel Xe target (pvc, bmg, etc.)
+        # Support both string names ('pvc', 'bmg') and numeric values
+        arch_lower = arch.lower()
+        is_xe_named = any(xe_target in arch_lower for xe_target in ['pvc', 'bmg', 'intel_gpu'])
+        
+        # Also check if it's a numeric Xe architecture in the Intel Xe range
+        try:
+          arch_num = int(arch.split('a')[0].split('f')[0])
+          is_xe_numeric = (arch_num >= INTEL_XE_ARCH_MIN and arch_num < INTEL_XE_ARCH_MAX)
+        except (ValueError, AttributeError):
+          arch_num = None
+          is_xe_numeric = False
+        
+        if is_xe_named or is_xe_numeric:
+          self.is_xe_target = True
+          # Map Intel Xe architectures to numeric identifiers for compatibility
+          # PVC (Ponte Vecchio) -> 12
+          # BMG (Battlemage/Xe2) -> 20  
+          if 'pvc' in arch_lower or arch_num == 12:
+            baseline_archs.append(12)
+          elif 'bmg' in arch_lower or 'xe2' in arch_lower or arch_num == 20:
+            baseline_archs.append(20)
+          else:
+            # Generic Intel GPU target - default to BMG
+            baseline_archs.append(20)
+        else:
+          # CUDA SM architecture
+          baseline_archs.append(arch_num if arch_num is not None else int(arch.split('a')[0].split('f')[0]))
+      
+      self.compute_capabilities_baseline = sorted(set(baseline_archs))
 
       if args.filter_by_cc in ['false', 'False', '0']:
         self.filter_by_cc = False
@@ -740,18 +825,24 @@ def emit_manifest_cmake(self, manifest_path, top_level_path, source_files):
       manifest_file.write(target_text + '\n\n')
       manifest_file.write("    %s\n" % str(top_level_path.replace('\\', '/')))
       generated_path = os.path.join(self.curr_build_dir, 'generated')
+      
+      # Determine file extension based on whether we're targeting Intel Xe
+      file_extension = "cpp" if self.is_xe_target else "cu"
+      
       for kind in self.operations.keys():
         kind_str = OperationKindNames[kind]
-        all_kind_file = os.path.join(generated_path, kind_str, f"all_{kind_str}_operations.cu").replace('\\', '/')
+        all_kind_file = os.path.join(generated_path, kind_str, f"all_{kind_str}_operations.{file_extension}").replace('\\', '/')
         manifest_file.write(f"    {all_kind_file}\n")
       manifest_file.write(')\n\n')
 
       for kind in self.operations.keys():
         for min_cc in sorted(self.operations[kind].keys()):
           for subclass in sorted(source_files[kind][min_cc].keys()):
+            # Use appropriate prefix (sm for CUDA, xe for Intel)
+            arch_prefix = 'xe' if (min_cc >= INTEL_XE_ARCH_MIN and min_cc < INTEL_XE_ARCH_MAX) else 'sm'
             target_text = SubstituteTemplate("""cutlass_add_cutlass_library(
-      SUFFIX ${kind}_sm${min_cc}_${subclass}
-""", { 'min_cc': str(min_cc), 'kind': OperationKindNames[kind], 'subclass': subclass })
+      SUFFIX ${kind}_${arch_prefix}${min_cc}_${subclass}
+""", { 'arch_prefix': arch_prefix, 'min_cc': str(min_cc), 'kind': OperationKindNames[kind], 'subclass': subclass })
             manifest_file.write(target_text + '\n\n')
 
             for source_file in source_files[kind][min_cc][subclass]:
@@ -759,7 +850,8 @@ def emit_manifest_cmake(self, manifest_path, top_level_path, source_files):
 
             manifest_file.write(")\n")
 
-          if self.disable_full_archs_compilation:
+          # Only apply CUDA-specific arch compilation settings for CUDA targets
+          if self.disable_full_archs_compilation and min_cc < INTEL_XE_ARCH_MIN:
             self.emit_disable_full_archs_compilation(manifest_file, source_files)
 
   def emit_disable_full_archs_compilation(manifest_file, source_files):
diff --git a/python/setup_cutlass.py b/python/setup_cutlass.py
index 8122b7a6a1..bd1926e03a 100644
--- a/python/setup_cutlass.py
+++ b/python/setup_cutlass.py
@@ -1,6 +1,7 @@
 #################################################################################################
 #
 # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/setup_library.py b/python/setup_library.py
index 875ba62d55..3257eb1b99 100644
--- a/python/setup_library.py
+++ b/python/setup_library.py
@@ -1,6 +1,7 @@
 #################################################################################################
 #
 # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (C) 2025 Intel Corporation, All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -35,9 +36,9 @@
 
 def perform_setup():
     setup(
-        name='cutlass_library',
+        name='cutlass_library_xe',
         version='4.1.0',
-        description='CUTLASS library generation scripts',
+        description='SYL*TLA library generation scripts',
         packages=['cutlass_library']
     )
 
diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt
index 98e97bc5da..014ec02db5 100644
--- a/tools/library/CMakeLists.txt
+++ b/tools/library/CMakeLists.txt
@@ -83,6 +83,11 @@ target_link_libraries(
 
 ################################################################################
 
+function(cutlass_target_sources target)
+  # Wrapper function for target_sources to maintain compatibility with generated manifests
+  target_sources(${target} ${ARGN})
+endfunction()
+
 function(cutlass_add_cutlass_library)
 #
 # Generates static and shared libraries with the given SOURCES. The public CMake
@@ -120,6 +125,11 @@ function(cutlass_add_cutlass_library)
     PRIVATE cutlass_library_internal_interface
     )
 
+  # Add SYCL-specific compile options when building for SYCL
+  if (CUTLASS_ENABLE_SYCL)
+    target_compile_options(${__NAME}_objs PRIVATE -fsycl)
+  endif()
+
   if (CUTLASS_BUILD_MONO_LIBRARY AND __SUFFIX)
 
     # If we're only building a single monolithic library then we
@@ -150,9 +160,26 @@ function(cutlass_add_cutlass_library)
       ${__NAME}
       PUBLIC cutlass_library_includes
       PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>
-      cuda_driver
       )
     
+    # Link with appropriate runtime library
+    if (CUTLASS_ENABLE_SYCL)
+      # For SYCL builds, explicitly link with libsycl.so
+      # We use find_library to locate it in the oneAPI installation
+      find_library(SYCL_LIBRARY NAMES sycl sycl8 PATHS ENV LD_LIBRARY_PATH NO_DEFAULT_PATH)
+      if(NOT SYCL_LIBRARY)
+        find_library(SYCL_LIBRARY NAMES sycl sycl8)
+      endif()
+      if(SYCL_LIBRARY)
+        target_link_libraries(${__NAME} PRIVATE ${SYCL_LIBRARY})
+      else()
+        message(WARNING "libsycl.so not found - runtime may fail to load")
+      endif()
+    else()
+      # For CUDA builds, link with cuda_driver
+      target_link_libraries(${__NAME} PRIVATE cuda_driver)
+    endif()
+    
     set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}")
     
     cutlass_add_library(
@@ -181,9 +208,20 @@ function(cutlass_add_cutlass_library)
       ${__NAME}_static
       PUBLIC cutlass_library_includes
       PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>
-      cuda_driver
       )
     
+    # Link with appropriate runtime library
+    if (CUTLASS_ENABLE_SYCL)
+      # For SYCL builds, explicitly link with libsycl.so
+      # Note: SYCL_LIBRARY should already be found from shared library linking above
+      if(SYCL_LIBRARY)
+        target_link_libraries(${__NAME}_static PRIVATE ${SYCL_LIBRARY})
+      endif()
+    else()
+      # For CUDA builds, link with cuda_driver
+      target_link_libraries(${__NAME}_static PRIVATE cuda_driver)
+    endif()
+    
     set_target_properties(${__NAME}_static PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}")
     
     install(
@@ -272,6 +310,24 @@ if (NOT CUTLASS_ENABLE_SYCL)
   # For backward compatibility with the old name
   add_library(cutlass_lib ALIAS cutlass_library)
   add_library(cutlass_lib_static ALIAS cutlass_library_static)
+
+else()
+  # SYCL-enabled library generation
+  # Create base library targets for SYCL that will be populated by generated kernels
+  # Note: .cu files will be compiled with SYCL compiler (icpx) for Intel Xe GPUs
+  
+  cutlass_add_cutlass_library(
+    src/handle.cu
+    src/manifest.cpp
+    src/operation_table.cu
+    src/singleton.cu
+    src/util.cu
+  )
+  
+  # For backward compatibility with the old name
+  add_library(cutlass_lib ALIAS cutlass_library)
+  add_library(cutlass_lib_static ALIAS cutlass_library_static)
+  
 endif()
 ################################################################################
 
@@ -307,6 +363,13 @@ if(CUTLASS_LIBRARY_HEURISTICS_PROBLEMS_FILE)
   endif()
 endif()
 
+# Set architecture parameter based on whether SYCL or CUDA is enabled
+if (CUTLASS_ENABLE_SYCL)
+  set(CUTLASS_LIBRARY_GENERATOR_ARCHS "20" CACHE STRING "Intel Xe architectures (12=PVC, 20=BMG)")
+else()
+  set(CUTLASS_LIBRARY_GENERATOR_ARCHS "${CUTLASS_NVCC_ARCHS_ENABLED}")
+endif()
+
 # --log-level is set to DEBUG to enable printing information about which kernels were excluded
 # from generation in /python/cutlass_library/manifest.py. To avoid having this information appear
 # in ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log, set this parameter to INFO
@@ -318,7 +381,7 @@ execute_process(
     --build-dir ${PROJECT_BINARY_DIR}
     --curr-build-dir ${CMAKE_CURRENT_BINARY_DIR}
     --generator-target library
-    --architectures "${CUTLASS_NVCC_ARCHS_ENABLED}"
+    --architectures "${CUTLASS_LIBRARY_GENERATOR_ARCHS}"
     --kernels "${CUTLASS_LIBRARY_KERNELS}"
     --instantiation-level "${CUTLASS_LIBRARY_INSTANTIATION_LEVEL}"
     --ignore-kernels "${CUTLASS_LIBRARY_IGNORE_KERNELS}"
@@ -341,14 +404,13 @@ endif()
 
 message(STATUS "Completed generation of library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log for more information.")
 
-if (NOT CUTLASS_ENABLE_SYCL)
-  # include auto-instantiated kernels in he CUTLASS Deliverables Library
-  set(CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE ${CMAKE_CURRENT_BINARY_DIR}/generated/manifest.cmake)
-  if(EXISTS "${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}")
-    include(${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE})
-  else()
-    message(STATUS "auto-generated library manifest cmake file (${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}) not found.")
-  endif()
+# Include auto-instantiated kernels in the CUTLASS Deliverables Library
+# Now enabled for both CUDA and SYCL
+set(CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE ${CMAKE_CURRENT_BINARY_DIR}/generated/manifest.cmake)
+if(EXISTS "${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}")
+  include(${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE})
+else()
+  message(STATUS "auto-generated library manifest cmake file (${CUTLASS_LIBRARY_MANIFEST_CMAKE_FILE}) not found.")
 endif()
 
 ################################################################################
diff --git a/tools/library/include/cutlass/library/arch_mappings.h b/tools/library/include/cutlass/library/arch_mappings.h
index df241e3ca6..e6e31f0f9f 100644
--- a/tools/library/include/cutlass/library/arch_mappings.h
+++ b/tools/library/include/cutlass/library/arch_mappings.h
@@ -1,5 +1,7 @@
 /***************************************************************************************************
  * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ *
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -148,6 +150,39 @@ template <typename OperatorClass> struct ArchMap<arch::Sm120, OperatorClass> {
   static int const kMax = 121;
 };
 
+// Intel Xe architecture mappings
+template <typename OperatorClass> struct ArchMap<arch::IntelXe, OperatorClass> {
+  static int const kMin = 12;
+  static int const kMax = 50;
+};
+
+template <> struct ArchMap<arch::IntelXe, arch::OpClassTensorOp> {
+  static int const kMin = 12;
+  static int const kMax = 50;
+};
+
+// Xe12 (PVC) alias
+template <typename OperatorClass> struct ArchMap<arch::Xe12, OperatorClass> {
+  static int const kMin = 12;
+  static int const kMax = 50;
+};
+
+template <> struct ArchMap<arch::Xe12, arch::OpClassTensorOp> {
+  static int const kMin = 12;
+  static int const kMax = 50;
+};
+
+// Xe20 (BMG) alias
+template <typename OperatorClass> struct ArchMap<arch::Xe20, OperatorClass> {
+  static int const kMin = 20;
+  static int const kMax = 50;
+};
+
+template <> struct ArchMap<arch::Xe20, arch::OpClassTensorOp> {
+  static int const kMin = 20;
+  static int const kMax = 50;
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace library
diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h
index 6764d9a6d8..5564325d4f 100644
--- a/tools/library/include/cutlass/library/library.h
+++ b/tools/library/include/cutlass/library/library.h
@@ -52,7 +52,10 @@
 #include <string>
 #include <cstdint>
 #include <stdexcept>
+
+#if !defined(CUTLASS_ENABLE_SYCL)
 #include <cuda_runtime.h>
+#endif
 
 #include "cutlass/cutlass.h"
 #include "cutlass/library/types.h"
diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h
index f537421751..efd788e22f 100644
--- a/tools/library/include/cutlass/library/util.h
+++ b/tools/library/include/cutlass/library/util.h
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -224,21 +225,35 @@ NumericTypeID dynamic_datatype_to_id(RuntimeDatatype type);
     }                                                                                              \
   } while (0)
 
-// RAII CUDA buffer container
+// RAII device buffer container (CUDA/SYCL compatible)
 class CudaBuffer {
 public:
   CudaBuffer() : size_(0), d_ptr_(nullptr) {}
 
   explicit CudaBuffer(size_t size) : size_(size), d_ptr_(nullptr) {
+#if defined(CUTLASS_ENABLE_SYCL)
+    // SYCL memory allocation using malloc_device
+    auto q = compat::get_default_queue();
+    d_ptr_ = sycl::malloc_device(size_, q);
+    if (d_ptr_ == nullptr) {
+      throw std::runtime_error("sycl::malloc_device failed");
+    }
+#else
     cudaError_t err = cudaMalloc(&d_ptr_, size_);
     if (err != cudaSuccess) {
       throw std::runtime_error("cudaMalloc failed: " + std::string(cudaGetErrorString(err)));
     }
+#endif
   }
 
   ~CudaBuffer() {
     if (d_ptr_) {
+#if defined(CUTLASS_ENABLE_SYCL)
+      auto q = compat::get_default_queue();
+      sycl::free(d_ptr_, q);
+#else
       cudaFree(d_ptr_);
+#endif
     }
   }
 
@@ -253,7 +268,12 @@ class CudaBuffer {
   CudaBuffer& operator=(CudaBuffer&& other) noexcept {
     if (this != &other) {
       if (d_ptr_) {
+#if defined(CUTLASS_ENABLE_SYCL)
+        auto q = compat::get_default_queue();
+        sycl::free(d_ptr_, q);
+#else
         cudaFree(d_ptr_);
+#endif
       }
       d_ptr_ = other.d_ptr_;
       size_ = other.size_;
diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h
index 880cb4bf34..1d87f3ecf0 100644
--- a/tools/library/src/gemm_operation.h
+++ b/tools/library/src/gemm_operation.h
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -36,13 +37,18 @@
 #include "cutlass/cutlass.h"
 
 #include "cutlass/gemm/device/gemm.h"
+
+#if !defined(CUTLASS_ENABLE_SYCL)
+// CUDA-only kernel types - not compatible with SYCL
 #include "cutlass/gemm/device/gemm_sparse.h"
 #include "cutlass/gemm/device/gemm_complex.h"
 #include "cutlass/gemm/device/gemm_batched.h"
 #include "cutlass/gemm/device/gemm_array.h"
+#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h"
+#endif
+
 #include "cutlass/gemm/device/gemm_universal_adapter.h"
 #include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h"
 
 #include "cutlass/library/library.h"
 #include "library_internal.h"
diff --git a/tools/library/src/gemm_operation_3x.hpp b/tools/library/src/gemm_operation_3x.hpp
index 7b27913df9..ebd555e7c8 100644
--- a/tools/library/src/gemm_operation_3x.hpp
+++ b/tools/library/src/gemm_operation_3x.hpp
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -46,6 +47,7 @@
 #include "cutlass/util/device_memory.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
 #include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/mixed_dtype_utils.hpp"
 #include "cute/tensor.hpp"
 #include <unordered_map>
 
@@ -193,10 +195,16 @@ class GemmUniversal3xOperation : public GemmOperation3xBase<Operator_> {
         cute::size<2>(typename Operator::GemmKernel::ClusterShape{}));
       uint32_t threads_per_block = Operator::GemmKernel::MaxThreadsPerBlock;
       void const* kernel_ptr = (void*)(device_kernel<typename Operator::GemmKernel>);
+#if !defined(CUTLASS_ENABLE_SYCL)
+      // query_device_max_active_clusters is CUDA-specific
       max_active_clusters = cutlass::KernelHardwareInfo::query_device_max_active_clusters(
         cluster_dims,
         threads_per_block,
         kernel_ptr);
+#else
+      // For SYCL, set a default value (will be overridden if needed)
+      max_active_clusters = 1;
+#endif
     }
   }
 
diff --git a/tools/library/src/grouped_gemm_operation_3x.hpp b/tools/library/src/grouped_gemm_operation_3x.hpp
index 91f618d4fa..1089cb0175 100644
--- a/tools/library/src/grouped_gemm_operation_3x.hpp
+++ b/tools/library/src/grouped_gemm_operation_3x.hpp
@@ -441,13 +441,18 @@ class GroupedGemmUniversal3xOperation : public GroupedGemmOperation3xBase<Operat
 
     uint32_t threads_per_block = Operator::GemmKernel::MaxThreadsPerBlock;
     void const* kernel_ptr = (void*)(device_kernel<typename Operator::GemmKernel>);
+#if !defined(CUTLASS_ENABLE_SYCL)
     args->max_active_clusters = cutlass::KernelHardwareInfo::query_device_max_active_clusters(
       cluster_dims,
       threads_per_block,
       kernel_ptr);
+#else
+    // For SYCL, set a default value
+    args->max_active_clusters = 1;
+#endif
 
     if (args->max_active_clusters == 0) {
-      std::cerr << "Max Active Clusters could not be queried. " 
+      std::cerr << "Max Active Clusters could not be queried. "
                 << "Falling back to heuristics mode (static cluster shape) or preferred cluster mode.\n";
     }
 
diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h
index e8bd77397f..a6f343be08 100644
--- a/tools/library/src/library_internal.h
+++ b/tools/library/src/library_internal.h
@@ -181,7 +181,11 @@ template <> struct NumericTypeMap<cutlass::tfloat32_t> {
   static NumericTypeID const kId = NumericTypeID::kTF32;
 };
 
-
+// Handle cute::tuple-wrapped types (used in some collectives)
+template <typename T>
+struct NumericTypeMap<cute::tuple<T>> {
+  static NumericTypeID const kId = NumericTypeMap<T>::kId;
+};
 
 
 template <> struct NumericTypeMap<cutlass::type_erased_dynamic_float8_t> {
diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp
index b9c04de71d..d622060b83 100644
--- a/tools/library/src/manifest.cpp
+++ b/tools/library/src/manifest.cpp
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -43,7 +44,27 @@ namespace library {
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+#ifndef CUTLASS_ENABLE_SYCL
+// For CUDA builds, reference operations are defined in initialize_reference_operations.cu
 void initialize_reference_operations(Manifest &manifest);
+#else
+// For SYCL builds, provide a stub implementation since reference ops are not yet supported
+inline void initialize_reference_operations(Manifest &manifest) {
+  // Reference operations not yet implemented for SYCL
+  // This is a stub to allow the library to compile
+}
+#endif
+
+#ifndef CUTLASS_ENABLE_SYCL
+// For CUDA builds, reduction operations are defined in init_reduction_operations.cu
+// Declaration is in manifest.h
+#else
+// For SYCL builds, provide a stub implementation since reduction ops are not yet supported
+inline void initialize_all_reduction_op(Manifest &manifest) {
+  // Reduction operations not yet implemented for SYCL
+  // This is a stub to allow the library to compile
+}
+#endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/tools/library/src/sparse_gemm_operation_3x.hpp b/tools/library/src/sparse_gemm_operation_3x.hpp
index 34da25b9a6..6cb836b89a 100644
--- a/tools/library/src/sparse_gemm_operation_3x.hpp
+++ b/tools/library/src/sparse_gemm_operation_3x.hpp
@@ -1,5 +1,6 @@
 /***************************************************************************************************
  * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -34,6 +35,9 @@
 
 #pragma once
 
+// Sparse GEMM operations are CUDA-only (not supported in SYCL)
+#if !defined(CUTLASS_ENABLE_SYCL)
+
 #include "cutlass/cutlass.h"
 #include "cutlass/detail/collective.hpp"
 #include "cutlass/array.h"
@@ -501,4 +505,6 @@ class SparseGemmUniversal3xOperation : public GemmOperation3xBase<Operator_> {
 
 } // namespace cutlass::library
 
+#endif // !defined(CUTLASS_ENABLE_SYCL)
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////