Commit be5d1b4 (parent: eab32ee)

Fix all errors in clangd/vscode.
8 files changed: 108 additions, 85 deletions


.clangd

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+CompileFlags:
+  Remove:
+    - --expt-relaxed-constexpr
+    - --compiler-options
+    - --expt-extended-lambda
+    - --use_fast_math
+    - --threads
+    - -gencode
+  Add:
+    - --no-cuda-version-check
+---
+If:
+  PathMatch: .*\.cu
+CompileFlags:
+  Add:
+    - "--cuda-gpu-arch=sm_80"

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -30,6 +30,8 @@ var/
 
 # Dev
 venv
+env
 .tup
 tup.config
-local
+local
+compile_commands.json

csrc/flash_attn/src/alibi.h

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
+#pragma once
+
 #include <cmath>
 
 #include <cute/tensor.hpp>
 
 #include <cutlass/cutlass.h>
 #include <cutlass/array.h>
 
-#include "utils.h"
-
 namespace flash {
 
 using namespace cute;
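Besides dropping the utils.h include (presumably unneeded here, or part of an include cycle), the header gains #pragma once. Without a guard, a translation unit that reaches alibi.h twice through different include chains sees every definition twice, and clangd reports the redefinitions in the editor. A toy header (hypothetical name) showing the classic equivalent of #pragma once:

// geometry.h -- hypothetical header. #pragma once is the terse equivalent of
// the classic guard shown here; either one makes repeat inclusion a no-op.
#ifndef GEOMETRY_H
#define GEOMETRY_H

struct Point { float x, y; };   // without a guard, a second inclusion of this
                                // file would be a redefinition error

#endif  // GEOMETRY_H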

csrc/flash_attn/src/dropout.h

Lines changed: 7 additions & 7 deletions
@@ -23,23 +23,23 @@ struct Dropout {
     }
 
     template <bool encode_dropout_in_sign_bit=false, typename Engine, typename Layout>
-    __forceinline__ __device__ void apply_dropout(Tensor<Engine, Layout> &tensor_,
+    __forceinline__ __device__ void apply_dropout(cute::Tensor<Engine, Layout> &tensor_,
                                                   int block_row_start, int block_col_start, int block_row_stride) {
         // convert shape from (4, MMA_M, MMA_N) to (8, MMA_M, MMA_N / 2)
-        Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout()));
+        cute::Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout()));
         using T = typename Engine::value_type;
         auto encode_dropout = [](bool keep, T val) {
             return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0));
         };
-        static_assert(decltype(size<2>(tensor))::value % 2 == 0);
+        static_assert(decltype(cute::size<2>(tensor))::value % 2 == 0);
         const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t);
         const uint32_t p_dropout_8bit_in_uint32_t = (uint32_t(p_dropout_8bit_in_uint16_t) << 16) | uint32_t(p_dropout_8bit_in_uint16_t);
         // if (cute::thread0()) { printf("threshold2 = 0x%x\n", p_dropout_8bit_in_uint32_t); }
         #pragma unroll
-        for (int m = 0; m < size<1>(tensor); ++m, block_row_start += block_row_stride) {
+        for (int m = 0; m < cute::size<1>(tensor); ++m, block_row_start += block_row_stride) {
             uint2 rowcol = make_uint2(block_row_start, block_col_start);
             #pragma unroll
-            for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) {
+            for (int n = 0; n < cute::size<2>(tensor) / 2; ++n, ++rowcol.y) {
                 // if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col = %d\n", m, n, int(rowcol.x), int(rowcol.y));}
                 uint4 random_uint4 = flash::philox(seed, reinterpret_cast<unsigned long long&>(rowcol), offset);
                 // if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n", random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);}
@@ -60,7 +60,7 @@ struct Dropout {
                     uint32_t (&rnd_32)[8] = reinterpret_cast<uint32_t (&)[8]>(rnd_16);
                     #pragma unroll
                     for (int j = 0; j < 2; j++) {
-                        Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+                        cute::Tensor tensor_uint32 = cute::recast<uint32_t>(tensor(cute::_, m, n * 2 + j));
                         // if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j * 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); }
                         // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
                         #pragma unroll
@@ -78,7 +78,7 @@ struct Dropout {
                     for (int i = 0; i < 8; i++) {
                         tensor(i, m, n * 2 + j) = encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t, tensor(i, m, n * 2 + j));
                     }
-                    Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+                    cute::Tensor tensor_uint32 = cute::recast<uint32_t>(tensor(cute::_, m, n * 2 + j));
                     // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
                 }
             }
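These edits do not change behavior: they qualify Tensor, size, recast, and the placeholder _ with cute:: so the names resolve when clangd parses the header on its own (under nvcc they presumably resolved through a using-directive, like the `using namespace cute;` visible in alibi.h). A small self-contained sketch of the same pattern; the cute_like namespace below is a stand-in, not the real library:

#include <cstdio>

namespace cute_like {                   // stand-in for the real cute namespace
    template <typename T> struct Tensor { T data[4]; };
    template <int I, typename T> constexpr int size(const Tensor<T> &) { return 4; }
}

// Unqualified Tensor/size only resolve here if a using-directive happens to be
// in scope, which can differ between the real build and clangd's standalone
// parse of a header. Fully qualified names resolve identically everywhere,
// which is what this commit switches dropout.h to.
template <typename T>
int count(const cute_like::Tensor<T> &t) {
    return cute_like::size<0>(t);
}

int main() {
    cute_like::Tensor<float> t{};
    std::printf("size = %d\n", count(t));
    return 0;
}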

csrc/flash_attn/src/flash_bwd_kernel.h

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 #include <cutlass/numeric_types.h>
 
 #include "block_info.h"
-#include "kernel_traits.h"
+#include "flash_bwd_preprocess_kernel.h"
 #include "utils.h"
 #include "softmax.h"
 #include "mask.h"

csrc/flash_attn/src/flash_fwd_kernel.h

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 #include <cutlass/numeric_types.h>
 
 #include "block_info.h"
-#include "kernel_traits.h"
+#include "flash.h"
 #include "utils.h"
 #include "softmax.h"
 #include "mask.h"

csrc/flash_attn/src/mask.h

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "utils.h"
 #include <cute/tensor.hpp>
 
 namespace flash {
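The converse of the alibi.h fix: mask.h presumably used something from utils.h without including it, which only works when every includer happens to pull utils.h in first. Since clangd parses each header in isolation, the missing include surfaced as unknown-identifier errors. A toy illustration with made-up file names:

// helpers.h (hypothetical)
#pragma once
inline int twice(int x) { return 2 * x; }

// mask_like.h (hypothetical) -- it calls twice(), so it must include
// helpers.h itself rather than hoping its includers already did:
#pragma once
#include "helpers.h"
inline int masked(int x, bool keep) { return keep ? twice(x) : 0; }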
