4 files changed (+6, -8 lines)

@@ -139,10 +139,10 @@ if (LLAMA_CUBLAS)
 elseif (CUDAToolkit_VERSION VERSION_GREATER 12)
     add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
     add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=800)
-    set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+    set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
 else ()
     add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
-    set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+    set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
 endif ()
 endif ()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
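For context, here is a minimal host-side sketch (not the actual ggml code) of how a device's compute capability could resolve against the trimmed architecture list. It assumes `ggml_cuda_highest_compiled_arch()` picks the highest compiled arch that does not exceed the device's cc, with ggml-style encoding where Volta is 700; both are assumptions, not verified against this repo.

```cpp
// Minimal sketch, not the ggml implementation: assumes highest-compiled-arch
// resolution picks the largest compiled arch <= the device's compute capability,
// with ggml-style encoding (e.g. Volta == 700).
#include <algorithm>
#include <cstdio>
#include <vector>

static int highest_compiled_arch(int device_cc, const std::vector<int> & compiled) {
    int best = 0;
    for (int arch : compiled) {
        if (arch <= device_cc) {
            best = std::max(best, arch); // highest arch the device can still load
        }
    }
    return best;
}

int main() {
    // New CU12 list from the CMake change above: compute_70 is gone.
    const std::vector<int> cu12_archs = {500, 610, 750, 800};
    for (int cc : {610, 700, 750, 860}) {
        printf("device cc %d -> uses code built for arch %d\n",
               cc, highest_compiled_arch(cc, cu12_archs));
    }
    // Under these assumptions a Volta (700) card falls back to the compute_61 PTX,
    // so the ">= GGML_CUDA_CC_VOLTA" checks patched below would no longer see 700 for it.
    return 0;
}
```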
@@ -226,15 +226,13 @@ NVCCFLAGS += -Wno-deprecated-gpu-targets \
             -gencode arch=compute_35,code=compute_35 \
             -gencode arch=compute_50,code=compute_50 \
             -gencode arch=compute_61,code=compute_61 \
-            -gencode arch=compute_70,code=compute_70 \
             -gencode arch=compute_75,code=compute_75 \
             -DKCPP_LIMIT_CUDA_MAX_ARCH=750
 
 else ifdef LLAMA_ARCHES_CU12
 NVCCFLAGS += -Wno-deprecated-gpu-targets \
             -gencode arch=compute_50,code=compute_50 \
             -gencode arch=compute_61,code=compute_61 \
-            -gencode arch=compute_70,code=compute_70 \
             -gencode arch=compute_75,code=compute_75 \
             -gencode arch=compute_80,code=compute_80 \
             -DKCPP_LIMIT_CUDA_MAX_ARCH=800
@@ -111,7 +111,7 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  =  dst->nb[3] / ts_dst;
 
-    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc);
 
     if (!ids) {
@@ -221,7 +221,7 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc))
         && src1_ncols == ne11;
     const mmq_args args = {
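The comparison change is small, so here is a hedged, self-contained harness (hypothetical values, not ggml code) that tabulates the old `>=` and new `>` forms of the stream-k predicate. It assumes `GGML_CUDA_CC_VOLTA == 700` and that `ggml_cuda_highest_compiled_arch(cc)` never exceeds the device's cc.

```cpp
// Hypothetical harness, not ggml code: how the stream-k predicate behaves for a
// few (device cc, highest compiled arch) pairs, assuming GGML_CUDA_CC_VOLTA == 700.
#include <cstdio>

int main() {
    constexpr int CC_VOLTA = 700; // assumed value of GGML_CUDA_CC_VOLTA
    const int cases[][2] = {
        {700, 700}, // Volta on a build that still ships compute_70
        {700, 610}, // Volta on this build (compute_70 removed, falls back to 61)
        {750, 750}, // Turing
        {800, 800}, // Ampere
    };
    for (const auto & c : cases) {
        printf("cc %d, compiled %d: stream-k with >= : %d, with > : %d\n",
               c[0], c[1], c[1] >= CC_VOLTA, c[1] > CC_VOLTA);
    }
    // Only the first case flips: a Volta-compiled build no longer opts into stream-k,
    // consistent with the comment that the decomposition only pays off on newer GPUs.
    return 0;
}
```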
@@ -94,7 +94,7 @@ struct tile_x_sizes {
 
 static int get_mmq_x_max_host(const int cc) {
     return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
-        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
+        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA ?
 #ifdef GGML_CUDA_FORCE_MMQ
             128 : 64;
 #else
@@ -127,7 +127,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
 
 static int get_mmq_y_host(const int cc) {
     return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
-        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
+        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA) ? 128 : 64);
 }
 
 static constexpr __device__ int get_mmq_y_device() {
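Likewise for the MMQ tile heights: a rough stand-in for the NVIDIA branch of `get_mmq_y_host()` (illustrative only, again assuming `GGML_CUDA_CC_VOLTA == 700`) showing which device/build combination shifts from 128-row to 64-row tiles.

```cpp
// Illustrative stand-in for the NVIDIA branch of get_mmq_y_host(); assumes
// GGML_CUDA_CC_VOLTA == 700. Not the real ggml logic, only the comparison.
#include <cstdio>

constexpr int CC_VOLTA = 700; // assumed value of GGML_CUDA_CC_VOLTA

static int mmq_y(int highest_compiled_arch, bool strict) {
    const bool newer_than_volta = strict ? highest_compiled_arch >  CC_VOLTA
                                         : highest_compiled_arch >= CC_VOLTA;
    return newer_than_volta ? 128 : 64; // 128-row tiles only for newer-than-Volta builds
}

int main() {
    // Volta on a build that still compiles compute_70: the only case that changes.
    printf("Volta w/ compute_70: old mmq_y=%d, new mmq_y=%d\n", mmq_y(700, false), mmq_y(700, true));
    // Turing and newer keep their 128-row tiles under either comparison.
    printf("Turing:              old mmq_y=%d, new mmq_y=%d\n", mmq_y(750, false), mmq_y(750, true));
    return 0;
}
```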