Skip to content

Commit 8a3ddea

Browse files
committed
remove syclcompat dependency
1 parent 0079b6e commit 8a3ddea

File tree

3 files changed

+183
-13
lines changed

3 files changed

+183
-13
lines changed

src/sycl/chunked_prefill.cpp

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,10 @@
44
#include <torch/all.h>
55

66
#include <cute/tensor.hpp>
7-
#include <random>
87

98
#include "cutlass/epilogue/collective/default_epilogue.hpp"
10-
#include "cutlass/gemm/device/gemm_universal_adapter.h"
11-
#include "cutlass/util/GPU_Clock.hpp"
12-
#include "cutlass/util/command_line.h"
139
#include "cutlass/util/device_memory.h"
1410
#include "cutlass/util/packed_stride.hpp"
15-
#include "cutlass/util/reference/device/gemm_complex.h"
16-
#include "cutlass/util/reference/device/tensor_compare.h"
1711
#include "cutlass/util/sycl_event_manager.hpp"
1812
#include "flash_attention_v2/collective/fmha_fusion.hpp"
1913
#include "flash_attention_v2/collective/xe_flash_attn_chunk_prefill_epilogue.hpp"
@@ -283,21 +277,21 @@ struct KernelRunner {
283277
// configure smem size and carveout
284278
int smem_size = FMHAChunkPrefillKernel::SharedStorageSize;
285279

286-
const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
287-
const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
280+
const auto sycl_block = compat::dim3(block.x, block.y, block.z);
281+
const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z);
288282

289-
syclcompat::experimental::launch_properties launch_props{
283+
using namespace compat::experimental;
284+
compat::experimental::launch_properties launch_props{
290285
sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size),
291286
};
292-
syclcompat::experimental::kernel_properties kernel_props{
287+
compat::experimental::kernel_properties kernel_props{
293288
sycl::ext::oneapi::experimental::sub_group_size<FMHAChunkPrefillKernel::DispatchPolicy::SubgroupSize>};
294-
syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
295-
// auto event = syclcompat::experimental::launch<cutlass::device_kernel<FMHAChunkPrefillKernel>>(policy, params);
289+
compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
296290

297291
sycl::ext::oneapi::experimental::launch_config config(policy.get_range(), policy.get_launch_properties());
298292
auto cgf = [&](::sycl::handler& cgh) {
299293
auto KernelFunctor =
300-
syclcompat::experimental::detail::build_kernel_functor<cutlass::device_kernel<FMHAChunkPrefillKernel>>(
294+
compat::experimental::detail::build_kernel_functor<cutlass::device_kernel<FMHAChunkPrefillKernel>>(
301295
cgh, policy, params);
302296
sycl::ext::oneapi::experimental::detail::
303297
LaunchConfigAccess<sycl::nd_range<3>, decltype(policy.get_launch_properties())>

src/sycl/helper.h

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/***************************************************************************************************
2+
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
3+
*reserved. SPDX-License-Identifier: BSD-3-Clause
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are met:
7+
*
8+
* 1. Redistributions of source code must retain the above copyright notice,
9+
*this list of conditions and the following disclaimer.
10+
*
11+
* 2. Redistributions in binary form must reproduce the above copyright notice,
12+
* this list of conditions and the following disclaimer in the documentation
13+
* and/or other materials provided with the distribution.
14+
*
15+
* 3. Neither the name of the copyright holder nor the names of its
16+
* contributors may be used to endorse or promote products derived from
17+
* this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22+
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23+
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24+
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25+
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26+
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27+
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28+
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29+
*POSSIBILITY OF SUCH DAMAGE.
30+
*
31+
**************************************************************************************************/
32+
#pragma once
33+
34+
#if defined(CUTLASS_ENABLE_SYCL)
35+
#include "cutlass/util/sycl_timer.hpp"
36+
#else
37+
#include <cuda_runtime.h>
38+
#endif
39+
#include <iostream>
40+
41+
/**
42+
* Panic wrapper for unwinding CUTLASS errors
43+
*/
44+
#define CUTLASS_CHECK(status) \
45+
{ \
46+
cutlass::Status error = status; \
47+
if (error != cutlass::Status::kSuccess) { \
48+
std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ << std::endl; \
49+
exit(EXIT_FAILURE); \
50+
} \
51+
}
52+
53+
/**
54+
* Panic wrapper for unwinding CUDA runtime errors
55+
*/
56+
#define CUDA_CHECK(status) \
57+
{ \
58+
cudaError_t error = status; \
59+
if (error != cudaSuccess) { \
60+
std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) << " at line: " << __LINE__ << std::endl; \
61+
exit(EXIT_FAILURE); \
62+
} \
63+
}
64+
65+
/**
66+
* GPU timer for recording the elapsed time across kernel(s) launched in GPU
67+
* stream
68+
*/
69+
struct GpuTimer {
70+
#if defined(CUTLASS_ENABLE_SYCL)
71+
using cudaStream_t = int;
72+
SYCLTimer syclTimer;
73+
#else
74+
cudaEvent_t _start;
75+
cudaEvent_t _stop;
76+
#endif
77+
cudaStream_t _stream_id;
78+
79+
/// Constructor
80+
GpuTimer() : _stream_id(0) {
81+
#if !defined(CUTLASS_ENABLE_SYCL)
82+
CUDA_CHECK(cudaEventCreate(&_start));
83+
CUDA_CHECK(cudaEventCreate(&_stop));
84+
#endif
85+
}
86+
87+
/// Destructor
88+
~GpuTimer() {
89+
#if !defined(CUTLASS_ENABLE_SYCL)
90+
CUDA_CHECK(cudaEventDestroy(_start));
91+
CUDA_CHECK(cudaEventDestroy(_stop));
92+
#endif
93+
}
94+
95+
/// Start the timer for a given stream (defaults to the default stream)
96+
void start(cudaStream_t stream_id = 0) {
97+
_stream_id = stream_id;
98+
#if defined(CUTLASS_ENABLE_SYCL)
99+
syclTimer.start();
100+
#else
101+
CUDA_CHECK(cudaEventRecord(_start, _stream_id));
102+
#endif
103+
}
104+
105+
/// Stop the timer
106+
void stop() {
107+
#if defined(CUTLASS_ENABLE_SYCL)
108+
syclTimer.stop();
109+
#else
110+
CUDA_CHECK(cudaEventRecord(_stop, _stream_id));
111+
#endif
112+
}
113+
114+
/// Return the elapsed time (in milliseconds)
115+
float elapsed_millis() {
116+
#if defined(CUTLASS_ENABLE_SYCL)
117+
return syclTimer.milliseconds();
118+
#else
119+
float elapsed = 0.0;
120+
CUDA_CHECK(cudaEventSynchronize(_stop));
121+
CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop));
122+
return elapsed;
123+
#endif
124+
}
125+
};

src/sycl/sycl_common.hpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/***************************************************************************************************
2+
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
3+
* SPDX-License-Identifier: BSD-3-Clause
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are met:
7+
*
8+
* 1. Redistributions of source code must retain the above copyright notice, this
9+
* list of conditions and the following disclaimer.
10+
*
11+
* 2. Redistributions in binary form must reproduce the above copyright notice,
12+
* this list of conditions and the following disclaimer in the documentation
13+
* and/or other materials provided with the distribution.
14+
*
15+
* 3. Neither the name of the copyright holder nor the names of its
16+
* contributors may be used to endorse or promote products derived from
17+
* this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
*
30+
**************************************************************************************************/
31+
32+
#pragma once
33+
34+
#include "cutlass/cutlass.h"
35+
#include "cutlass/util/device_memory.h"
36+
#include "cutlass/util/mixed_dtype_utils.hpp"
37+
#include "cutlass/util/reference/device/sycl_tensor_fill.h"
38+
39+
template <typename T>
40+
inline bool is_close(T a, T b, float atol, float rtol) {
41+
return std::abs((float)a - (float)b) <= atol + rtol * std::abs((float)b);
42+
}
43+
44+
// TODO(Codeplay): use on device initialisation for this
45+
46+
template <typename SrcT, typename DstT>
47+
void convert_dtype(const SrcT* d_src, DstT* d_dst, size_t size) {
48+
syclcompat::get_default_queue()
49+
.parallel_for(size, [=](auto indx) { d_dst[indx] = static_cast<DstT>(d_src[indx]); })
50+
.wait();
51+
}

0 commit comments

Comments
 (0)