Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/common/sycl_cute_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,22 @@ zero_fill(InTensor &X)
X(i) = T(0);
}

template <typename T>
void
random_fill(std::vector<T> &X) {

for (int i = 0; i < X.size(); i++)
X[i] = random_value<T>();
}

template <typename T>
void
zero_fill(std::vector<T> &X) {
for (int i = 0; i < X.size(); i++)
X[i] = T(0);
}


// Pack sub-byte types in a gmem tensor.
// On input, the backing array holds one sub-byte value per byte.
// On exit, the backing array contains packed values.
Expand Down
5 changes: 5 additions & 0 deletions examples/cute/tutorial/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ if (CUTLASS_ENABLE_SYCL)
tiled_copy_sycl.cpp
)

cutlass_example_add_executable(
cute_tutorial_tiled_transpose
transpose/tiled_transpose_sycl.cpp
)

cutlass_example_add_executable(
cute_tutorial_tiled_copy_if
tiled_copy_if_sycl.cpp
Expand Down
141 changes: 141 additions & 0 deletions examples/cute/tutorial/transpose/copy_direct.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#pragma once

/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

// copy kernel adapted from
// https://github.com/NVIDIA/cutlass/blob/main/examples/cute/tutorial/tiled_copy.cu

#include <cute/util/compat.hpp>
#include <sycl/sycl.hpp>

#include <cute/tensor.hpp>

#include "cutlass/util/print_error.hpp"
#include "util.h"

#include <iomanip>

template <class TensorS, class TensorD, class ThreadLayout>
void copy_kernel(TensorS S, TensorD D, ThreadLayout) {
using namespace cute;

// Slice the tiled tensors
Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)
Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)

// Construct a partitioning of the tile among threads with the given thread
// arrangement.

// Concept: Tensor ThrLayout ThrIndex
Tensor thr_tile_S = local_partition(
tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)
Tensor thr_tile_D = local_partition(
tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)
//

// Construct a register-backed Tensor with the same shape as each thread's
// partition Use make_tensor to try to match the layout of thr_tile_S
Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN)

// Copy from GMEM to RMEM and from RMEM to GMEM
copy(thr_tile_S, fragment);
copy(fragment, thr_tile_D);
}

template <typename Element> void copy_direct(TransposeParams<Element> params) {
//
// Given a 2D shape, perform an efficient copy
//

using namespace cute;

//
// Make tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

//
// Tile tensors
//

// Define a statically sized block (M, N).
// Note, by convention, capital letters are used to represent static modes.
auto block_shape = make_shape(Int<1>{}, Int<16384>{});

if ((size<0>(tensor_shape) % size<0>(block_shape)) ||
(size<1>(tensor_shape) % size<1>(block_shape))) {
std::cerr << "The tensor shape must be divisible by the block shape."
<< std::endl;
}
// Equivalent check to the above
if (not evenly_divides(tensor_shape, block_shape)) {
std::cerr << "Expected the block_shape to evenly divide the tensor shape."
<< std::endl;
}

// Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile
// shape, and modes (m', n') correspond to the number of tiles.
//
// These will be used to determine the CUDA kernel grid dimensions.
Tensor tiled_tensor_S =
tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
Tensor tiled_tensor_D =
tiled_divide(tensor_D, block_shape); // ((M, N), m', n')

// Thread arrangement
Layout thr_layout =
make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});

//
// Determine grid and block dimensions
//

auto gridDim = compat::dim3(
size<1>(tiled_tensor_S),
size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
auto blockDim = compat::dim3(size(thr_layout));

//
// Launch the kernel
//
compat::launch<copy_kernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
decltype(thr_layout)>>(
gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
}
148 changes: 148 additions & 0 deletions examples/cute/tutorial/transpose/copy_smem.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#pragma once
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

#include <cute/util/compat.hpp>
#include <sycl/sycl.hpp>

#include <cute/tensor.hpp>

#include "cutlass/util/print_error.hpp"
#include "util.h"

#include "cutlass/detail/layout.hpp"

// Shared Storage for aligned addresses
template <class Element, class SmemLayout> struct SharedStorageCopy {
cute::array_aligned<Element, cute::cosize_v<SmemLayout>> smem;
};

template <class TensorS, class TensorD, class ThreadLayout, class SmemLayout,
int smem_size>
void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
SmemLayout) {
using namespace cute;
using Element = typename TensorS::value_type;

// Use Shared Storage structure to allocate aligned SMEM addresses.
using SharedStorage = SharedStorageCopy<Element, SmemLayout>;
auto smem = compat::local_mem<Element[smem_size]>();
SharedStorage &shared_storage = *reinterpret_cast<SharedStorage *>(smem);

Tensor gS = S(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (bM, bN)
Tensor gD = D(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (bN, bM)

Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()),
SmemLayout{}); // (bN, bM)

auto tiled_copy_load = make_tiled_copy(
Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
ThreadLayout{});

auto tiled_copy_store = make_tiled_copy(
Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
ThreadLayout{});
//
// Construct a Tensor corresponding to each thread's slice.
auto thr_copy_load = tiled_copy_load.get_thread_slice(compat::local_id::x());
auto thr_copy_store =
tiled_copy_store.get_thread_slice(compat::local_id::x());

Tensor tSgS = thr_copy_load.partition_S(gS);
Tensor tSsS = thr_copy_load.partition_D(sS);
//
Tensor tDsS = thr_copy_store.partition_D(sS);
Tensor tDgD = thr_copy_store.partition_D(gD);

copy(tiled_copy_load, tSgS, tSsS);

cp_async_fence();
cp_async_wait<0>();
syncthreads();
//
copy(tiled_copy_store, tDsS, tDgD);
}

template <typename Element> void copy_smem(TransposeParams<Element> params) {

using namespace cute;

//
// Make tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

//
// Tile tensors
using bM = Int<1>;
using bN = Int<8192>;

auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)

auto smem_layout = make_layout(block_shape, LayoutRight{});

Tensor tiled_tensor_S =
tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
Tensor tiled_tensor_D =
tiled_divide(tensor_D, block_shape); // ((bN, bM), n', m')

auto threadLayout =
make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});

//
// Determine grid and block dimensions
//

dim3 gridDim(
size<1>(tiled_tensor_S),
size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
dim3 blockDim(size(threadLayout)); // 256 threads

constexpr int smem_size =
int(sizeof(SharedStorageCopy<Element, decltype(smem_layout)>));

//
// Launch the kernel
//
compat::launch<
copySmemKernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
decltype(threadLayout), decltype(smem_layout), smem_size>>(
gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, threadLayout,
smem_layout);
}
29 changes: 29 additions & 0 deletions examples/cute/tutorial/transpose/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "copy_direct.h"
#include "copy_smem.h"
#include "transpose_naive.h"
#include "transpose_smem.h"
#include "util.h"

int main(int argc, char const **argv) {

using Element = float;

int size = 16384;
int M = size, N = size, iterations = 10;

std::cout << "Matrix size: " << M << " x " << N << std::endl;

printf("Baseline copy.\n");
benchmark<Element, false>(copy_direct<Element>, M, N, iterations);

printf("\nNaive transpose (no smem):\n");
benchmark<Element>(transpose_naive<Element>, M, N, iterations);

printf("\nCopy through SMEM.\n");
benchmark<Element, false>(copy_smem<Element>, M, N, iterations);

printf("\nTranspose through SMEM.:\n");
benchmark<Element>(transpose_smem<Element>, M, N, iterations);

return 0;
}
Loading
Loading