
Commit 2df689a

try remove volta as a dedicated target
1 parent 1a14ae1 commit 2df689a

File tree:
  CMakeLists.txt
  Makefile
  ggml/src/ggml-cuda/mmq.cu
  ggml/src/ggml-cuda/mmq.cuh

4 files changed, +6 -8 lines changed

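A hedged reading of this commit: the build no longer emits a dedicated sm_70 (Volta) target, and every runtime check of the form ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA is tightened to a strict >, so Volta devices stop selecting the Turing-era fast paths regardless of which targets a given build includes (700 > 700 is false even when sm_70 is compiled). The standalone sketch below is a hypothetical mock of that lookup, not the real ggml function; the architecture lists mirror the CU12 CMake change in the first diff.

// Hypothetical mock of ggml_cuda_highest_compiled_arch(): pick the highest
// compiled arch that does not exceed the device's compute capability.
#include <algorithm>
#include <cstdio>
#include <vector>

static int highest_compiled_arch(const std::vector<int> & archs, int device_cc) {
    int best = 0;
    for (int arch : archs) {
        if (arch <= device_cc) {
            best = std::max(best, arch);
        }
    }
    return best;
}

int main() {
    const int GGML_CUDA_CC_VOLTA = 700;
    const std::vector<int> before = {500, 610, 700, 750, 800}; // CU12 list before this commit
    const std::vector<int> after  = {500, 610, 750, 800};      // after: no dedicated sm_70

    for (int cc : {610, 700, 750}) {
        const bool old_gate = highest_compiled_arch(before, cc) >= GGML_CUDA_CC_VOLTA;
        const bool new_gate = highest_compiled_arch(after,  cc) >  GGML_CUDA_CC_VOLTA;
        std::printf("cc %d: old gate %d, new gate %d\n", cc, old_gate, new_gate);
    }
    // cc 700 (Volta):  old gate 1, new gate 0 -> falls back to the pre-Volta path
    // cc 750 (Turing): old gate 1, new gate 1 -> unchanged
    return 0;
}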

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -139,10 +139,10 @@ if (LLAMA_CUBLAS)
     elseif(CUDAToolkit_VERSION VERSION_GREATER 12)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
         add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=800)
-        set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
     else()
         add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
-        set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

Makefile

Lines changed: 0 additions & 2 deletions
@@ -226,15 +226,13 @@ NVCCFLAGS += -Wno-deprecated-gpu-targets \
 	-gencode arch=compute_35,code=compute_35 \
 	-gencode arch=compute_50,code=compute_50 \
 	-gencode arch=compute_61,code=compute_61 \
-	-gencode arch=compute_70,code=compute_70 \
 	-gencode arch=compute_75,code=compute_75 \
 	-DKCPP_LIMIT_CUDA_MAX_ARCH=750

 else ifdef LLAMA_ARCHES_CU12
 NVCCFLAGS += -Wno-deprecated-gpu-targets \
 	-gencode arch=compute_50,code=compute_50 \
 	-gencode arch=compute_61,code=compute_61 \
-	-gencode arch=compute_70,code=compute_70 \
 	-gencode arch=compute_75,code=compute_75 \
 	-gencode arch=compute_80,code=compute_80 \
 	-DKCPP_LIMIT_CUDA_MAX_ARCH=800
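Note that both build files drop only the PTX-level sm_70 entry: in CMake, "70-virtual" requests compute_70 PTX with no SASS, and in the Makefile, -gencode arch=compute_70,code=compute_70 likewise embeds compute_70 PTX for JIT compilation. With those gone, a Volta card presumably runs code JIT-compiled from the next-lower PTX (compute_61), which is why the runtime checks in the diffs below also stop treating Volta as a compiled target.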

ggml/src/ggml-cuda/mmq.cu

Lines changed: 2 additions & 2 deletions
@@ -111,7 +111,7 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  =  dst->nb[3] / ts_dst;

-    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc);

     if (!ids) {
@@ -221,7 +221,7 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA)
         || GGML_CUDA_CC_IS_CDNA(cc))
         && src1_ncols == ne11;
     const mmq_args args = {
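A minimal check of what the flipped comparison does to use_stream_k on NVIDIA hardware, assuming GGML_CUDA_CC_VOLTA is 700 (as defined in ggml's CUDA common header) and a build with only the 50/61/75/80 targets:

#include <cassert>

int main() {
    const int GGML_CUDA_CC_VOLTA = 700;
    // Stubbed results of ggml_cuda_highest_compiled_arch() for a 50/61/75/80 build:
    const int volta_highest  = 610; // highest compiled arch <= 700
    const int turing_highest = 750; // highest compiled arch <= 750
    assert(!(volta_highest  > GGML_CUDA_CC_VOLTA)); // Volta: stream-k disabled
    assert(  turing_highest > GGML_CUDA_CC_VOLTA ); // Turing and newer: unchanged
    // Even a build that still compiles sm_70 fails the strict test (700 > 700),
    // so '>' removes Volta from the stream-k path regardless of the target list.
    return 0;
}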

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 2 additions & 2 deletions
@@ -94,7 +94,7 @@ struct tile_x_sizes {

 static int get_mmq_x_max_host(const int cc) {
     return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
-        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
+        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA ?
 #ifdef GGML_CUDA_FORCE_MMQ
         128 : 64;
 #else
@@ -127,7 +127,7 @@ static constexpr __device__ int get_mmq_x_max_device() {

 static int get_mmq_y_host(const int cc) {
     return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
-        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
+        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) > GGML_CUDA_CC_VOLTA) ? 128 : 64);
 }

 static constexpr __device__ int get_mmq_y_device() {
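The same flip also shrinks the host-side MMQ tile sizes on Volta. Below is a hypothetical standalone mock of get_mmq_y_host (NVIDIA branch only; the real function also handles AMD), with the compiled-arch lookup stubbed to a 50/61/75/80 build:

#include <cstdio>

// Stub: highest compiled arch <= cc for a build with 50/61/75/80 targets.
static int highest_compiled_arch_stub(int cc) {
    return cc >= 800 ? 800 : cc >= 750 ? 750 : cc >= 610 ? 610 : 500;
}

// NVIDIA-only mock of get_mmq_y_host() with the new strict comparison.
static int mmq_y_host(int cc) {
    return highest_compiled_arch_stub(cc) > 700 /*GGML_CUDA_CC_VOLTA*/ ? 128 : 64;
}

int main() {
    std::printf("Volta  (cc 700): mmq_y = %d\n", mmq_y_host(700)); // 64, previously 128
    std::printf("Turing (cc 750): mmq_y = %d\n", mmq_y_host(750)); // 128, unchanged
    return 0;
}

get_mmq_x_max_host follows the same pattern, so under GGML_CUDA_FORCE_MMQ a Volta device now also gets an mmq_x max of 64 instead of 128.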
