
Commit 4d1fde0

Cut flash attention from CUDA again
If you really want it anyway, just say:

    ./llamafile -ngl 999 -fa --recompile ...

and it'll build ggml-cuda with flash attention for your system.
1 parent 629e208 commit 4d1fde0

File tree (3 files changed: +13 −5)

  llama.cpp/common.cpp
  llama.cpp/ggml-cuda.cu
  llamafile/cuda.c

llama.cpp/common.cpp (1 addition, 0 deletions)

```diff
@@ -901,6 +901,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "-fa" || arg == "--flash-attn") {
         params.flash_attn = true;
+        FLAG_flash_attn = true; // [jart]
         return true;
     }
     if (arg == "--color") {
```

llama.cpp/ggml-cuda.cu (9 additions, 4 deletions)

```diff
@@ -23,7 +23,6 @@
 #include <string>
 #include <vector>
 
-
 ////////////////////////////////////////////////////////////////////////////////
 //
 // ROLLUP acc.cu
@@ -3608,6 +3607,8 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_padded_row_size);
 }
 
+#ifndef GGML_MINIMIZE_CODE_SIZE
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // ROLLUP fattn.cu
@@ -5098,7 +5099,6 @@ void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_ten
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-
 template<int D, int ncols, int parallel_blocks> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
@@ -5432,7 +5432,6 @@ void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, gg
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-
 template<int D, int ncols, int parallel_blocks> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
@@ -5709,6 +5708,8 @@ void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tens
     launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
 }
 
+#endif // GGML_MINIMIZE_CODE_SIZE
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // ROLLUP getrows.cu
@@ -13096,7 +13097,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_op_argsort(ctx, dst);
             break;
         case GGML_OP_FLASH_ATTN_EXT:
+#ifndef GGML_MINIMIZE_CODE_SIZE
             ggml_cuda_flash_attn_ext(ctx, dst);
+#endif
             break;
         default:
             return false;
@@ -13649,7 +13652,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_LEAKY_RELU:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_MINIMIZE_CODE_SIZE)
+            return false;
+#elif defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
             return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
 #else
             if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
```
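These hunks implement a compile-time opt-out: when GGML_MINIMIZE_CODE_SIZE is defined, the flash-attention kernel rollups are excluded from the translation unit, supports_op() reports GGML_OP_FLASH_ATTN_EXT as unsupported, and the now-empty case in compute_forward() is never reached. A minimal sketch of the pattern, with the ggml types and names simplified to keep it self-contained:

```cpp
#include <cstdio>

enum op_t { OP_ARGSORT, OP_FLASH_ATTN_EXT };

#ifndef GGML_MINIMIZE_CODE_SIZE
// In ggml-cuda.cu this region holds the ROLLUP fattn*.cu kernels.
static void flash_attn_ext() { std::puts("flash attention kernel"); }
#endif // GGML_MINIMIZE_CODE_SIZE

static bool supports_op(op_t op) {
    switch (op) {
        case OP_FLASH_ATTN_EXT:
#if defined(GGML_MINIMIZE_CODE_SIZE)
            return false;  // op never offered, so compute() is never asked
#else
            return true;   // the real code also checks head sizes here
#endif
        default:
            return true;
    }
}

static bool compute(op_t op) {
    switch (op) {
        case OP_FLASH_ATTN_EXT:
#ifndef GGML_MINIMIZE_CODE_SIZE
            flash_attn_ext();
#endif
            return true;   // the guarded case still ends in break/return
        default:
            return false;
    }
}

int main() {
    if (supports_op(OP_FLASH_ATTN_EXT)) {
        compute(OP_FLASH_ATTN_EXT);
    } else {
        std::puts("flash attention compiled out; scheduler falls back");
    }
}
```

Compile the sketch with -DGGML_MINIMIZE_CODE_SIZE and the fallback branch runs; compile without it and the stub "kernel" executes.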

llamafile/cuda.c (3 additions, 1 deletion)

```diff
@@ -61,7 +61,8 @@ __static_yoink("llama.cpp/ggml-backend-impl.h");
     /* "-DNDEBUG", */ "-DGGML_BUILD=1", "-DGGML_SHARED=1", "-DGGML_MULTIPLATFORM", \
     "-DGGML_CUDA_DMMV_X=32", "-DK_QUANTS_PER_ITERATION=2", \
     "-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128", "-DGGML_CUDA_MMV_Y=1", \
-    (FLAG_tinyblas ? "-DGGML_USE_TINYBLAS" : "-DGGML_USE_CUBLAS")
+    (FLAG_tinyblas ? "-DGGML_USE_TINYBLAS" : "-DGGML_USE_CUBLAS"), \
+    (FLAG_flash_attn ? "-DTEHFLASH" : "-DGGML_MINIMIZE_CODE_SIZE")
 
 #define NVCC_FLAGS \
     "-std=c++11", "-O3", "--shared", "--use_fast_math", "--forward-unknown-to-host-compiler", \
@@ -567,6 +568,7 @@ static bool compile_amd_windows(const char *clangxx, const char *dso, const char
         "-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128",
         "-DGGML_CUDA_MMV_Y=1",
         "-DGGML_USE_TINYBLAS",
+        FLAG_flash_attn ? "-DTEHFLASH" : "-DGGML_MINIMIZE_CODE_SIZE",
         "-o",
         (char *)tmpdso,
         (char *)src,
```
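A rough sketch of how that flag list takes shape (simplified; the real file splices these strings into the compiler's argument vector, and the FLAG_* defaults below are made up for illustration):

```cpp
#include <cstdio>
#include <vector>

// Illustrative stand-ins for llamafile's globals.
static bool FLAG_tinyblas   = true;
static bool FLAG_flash_attn = false;  // set by -fa / --flash-attn

int main() {
    std::vector<const char *> defines = {
        "-DGGML_BUILD=1",
        "-DGGML_CUDA_MMV_Y=1",
        FLAG_tinyblas ? "-DGGML_USE_TINYBLAS" : "-DGGML_USE_CUBLAS",
        // -DTEHFLASH keeps the flash-attention kernels; the default
        // -DGGML_MINIMIZE_CODE_SIZE compiles them out (see ggml-cuda.cu).
        FLAG_flash_attn ? "-DTEHFLASH" : "-DGGML_MINIMIZE_CODE_SIZE",
    };
    for (const char *d : defines) {
        std::printf("%s\n", d);
    }
}
```

So -fa on the command line flows through FLAG_flash_attn into -DTEHFLASH at --recompile time, while the default build gets -DGGML_MINIMIZE_CODE_SIZE and sheds the flash-attention kernels.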
