Fix all errors reported by clangd in VS Code.

nshepperd · nshepperd · commit be5d1b46cb2c · 2024-07-17T14:49:50.000+10:00
diff --git a/.clangd b/.clangd
@@ -0,0 +1,16 @@
+CompileFlags:
+  Remove:
+   - --expt-relaxed-constexpr
+   - --compiler-options
+   - --expt-extended-lambda
+   - --use_fast_math
+   - --threads
+   - -gencode
+  Add:
+    - --no-cuda-version-check
+---
+If:
+  PathMatch: .*\.cu
+CompileFlags:
+  Add:
+    - "--cuda-gpu-arch=sm_80"
diff --git a/.gitignore b/.gitignore
@@ -30,6 +30,8 @@ var/
 
 # Dev
 venv
+env
 .tup
 tup.config
-local
+local
+compile_commands.json
diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h
@@ -1,12 +1,12 @@
+#pragma once
+
 #include <cmath>
 
 #include <cute/tensor.hpp>
 
 #include <cutlass/cutlass.h>
 #include <cutlass/array.h>
 
-#include "utils.h"
-
 namespace flash {
 
 using namespace cute;
diff --git a/csrc/flash_attn/src/dropout.h b/csrc/flash_attn/src/dropout.h
@@ -23,23 +23,23 @@ struct Dropout {
     }
 
     template <bool encode_dropout_in_sign_bit=false, typename Engine, typename Layout>
-    __forceinline__ __device__ void apply_dropout(Tensor<Engine, Layout> &tensor_,
+    __forceinline__ __device__ void apply_dropout(cute::Tensor<Engine, Layout> &tensor_,
                                          int block_row_start, int block_col_start, int block_row_stride) {
         // convert shape from (4, MMA_M, MMA_N) to (8, MMA_M, MMA_N / 2)
-        Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout()));
+        cute::Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout()));
         using T = typename Engine::value_type;
         auto encode_dropout = [](bool keep, T val) {
             return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0));
         };
-        static_assert(decltype(size<2>(tensor))::value % 2 == 0);
+        static_assert(decltype(cute::size<2>(tensor))::value % 2 == 0);
         const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t);
         const uint32_t p_dropout_8bit_in_uint32_t = (uint32_t(p_dropout_8bit_in_uint16_t) << 16) | uint32_t(p_dropout_8bit_in_uint16_t);
         // if (cute::thread0()) { printf("threshold2 = 0x%x\n", p_dropout_8bit_in_uint32_t); }
         #pragma unroll
-        for (int m = 0; m < size<1>(tensor); ++m, block_row_start += block_row_stride) {
+        for (int m = 0; m < cute::size<1>(tensor); ++m, block_row_start += block_row_stride) {
             uint2 rowcol = make_uint2(block_row_start, block_col_start);
             #pragma unroll
-            for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) {
+            for (int n = 0; n < cute::size<2>(tensor) / 2; ++n, ++rowcol.y) {
                 // if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col = %d\n", m, n, int(rowcol.x), int(rowcol.y));}
                 uint4 random_uint4 = flash::philox(seed, reinterpret_cast<unsigned long long&>(rowcol), offset);
                 // if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n", random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);}
@@ -60,7 +60,7 @@ struct Dropout {
                     uint32_t (&rnd_32)[8] = reinterpret_cast<uint32_t (&)[8]>(rnd_16);
                     #pragma unroll
                     for (int j = 0; j < 2; j++) {
-                        Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+                        cute::Tensor tensor_uint32 = cute::recast<uint32_t>(tensor(cute::_, m, n * 2 + j));
                         // if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j * 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); }
                         // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
                         #pragma unroll
@@ -78,7 +78,7 @@ struct Dropout {
                         for (int i = 0; i < 8; i++) {
                             tensor(i, m, n * 2 + j) = encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t, tensor(i, m, n * 2 + j));
                         }
-                        Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+                        cute::Tensor tensor_uint32 = cute::recast<uint32_t>(tensor(cute::_, m, n * 2 + j));
                         // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
                     }
                 }
diff --git a/csrc/flash_attn/src/flash_bwd_kernel.h b/csrc/flash_attn/src/flash_bwd_kernel.h
@@ -11,7 +11,7 @@
 #include <cutlass/numeric_types.h>
 
 #include "block_info.h"
-#include "kernel_traits.h"
+#include "flash_bwd_preprocess_kernel.h"
 #include "utils.h"
 #include "softmax.h"
 #include "mask.h"
diff --git a/csrc/flash_attn/src/flash_fwd_kernel.h b/csrc/flash_attn/src/flash_fwd_kernel.h
@@ -11,7 +11,7 @@
 #include <cutlass/numeric_types.h>
 
 #include "block_info.h"
-#include "kernel_traits.h"
+#include "flash.h"
 #include "utils.h"
 #include "softmax.h"
 #include "mask.h"
diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "utils.h"
 #include <cute/tensor.hpp>
 
 namespace flash {
diff --git a/csrc/flash_attn/src/utils.h b/csrc/flash_attn/src/utils.h