Commit d7af092

Revert "CUDA: fix non-cont. inputs for batched mat mul (ggml-org#13155)"
This reverts commit cdf7658.
1 parent fa2a7a5 · commit d7af092

4 files changed, +42 -94 lines

ggml/src/ggml-cuda/convert.cu

Lines changed: 12 additions & 41 deletions
@@ -1,8 +1,6 @@
 #include "convert.cuh"
 #include "dequantize.cuh"
 
-#include <cstdint>
-
 #define CUDA_Q8_0_NE_ALIGN 2048
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -572,46 +570,30 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }
 
 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(
-    const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t s01, const int64_t s02, const int64_t s03) {
-    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i00 >= ne00) {
+    if (i >= k) {
         return;
     }
 
-    const int64_t i01 = blockIdx.y;
-    const int64_t i02 = blockIdx.z % ne02;
-    const int64_t i03 = blockIdx.z / ne02;
-
     const src_t * x = (const src_t *) vx;
 
-    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
-    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
-    y[iy] = float(x[ix]);
+    y[i] = float(x[i]);
 }
 
 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * vx, dst_t * y,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-    const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
-    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
-        (vx, y, ne00, ne01, ne02, s01, s02, s03);
-}
-
-template <typename src_t, typename dst_t>
-static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
-    convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
-            return convert_unary_cont_cuda<float>;
+            return convert_unary_cuda<float>;
         case GGML_TYPE_F16:
-            return convert_unary_cont_cuda<half>;
+            return convert_unary_cuda<half>;
         default:
            return nullptr;
    }
@@ -661,9 +643,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
-            return convert_unary_cont_cuda<float>;
+            return convert_unary_cuda<float>;
         case GGML_TYPE_BF16:
-            return convert_unary_cont_cuda<nv_bfloat16>;
+            return convert_unary_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
@@ -710,18 +692,7 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
-            return convert_unary_cont_cuda<half>;
-        case GGML_TYPE_BF16:
-            return convert_unary_cont_cuda<nv_bfloat16>;
-        default:
-            return nullptr;
-    }
-}
-
-to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cuda<half>;
         case GGML_TYPE_BF16:
             return convert_unary_cuda<nv_bfloat16>;
         default:

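For reference, the restored convert_unary is the plain contiguous pattern: one thread per element and a single flat index shared by source and destination, with no per-dimension strides. Below is a standalone, illustrative sketch of that pattern; the launcher, block size, element count, and main() are assumptions made for the sake of a compilable example, not code from this commit.

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

#define BLOCK_SIZE 256

// One thread per element; valid only for contiguous tensors, since the same
// flat index i is used on both the source and the destination.
template <typename src_t, typename dst_t>
static __global__ void convert_unary(const src_t * __restrict__ x, dst_t * __restrict__ y, const int64_t k) {
    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    y[i] = (dst_t) float(x[i]);
}

int main() {
    const int64_t k = 1 << 20; // number of elements (illustrative)
    float * x = nullptr;
    half  * y = nullptr;
    cudaMalloc((void **) &x, k*sizeof(float));
    cudaMalloc((void **) &y, k*sizeof(half));

    const int num_blocks = (int)((k + BLOCK_SIZE - 1) / BLOCK_SIZE);
    convert_unary<float, half><<<num_blocks, BLOCK_SIZE>>>(x, y, k); // F32 -> F16
    cudaDeviceSynchronize();
    printf("convert_unary: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(x);
    cudaFree(y);
    return 0;
}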
ggml/src/ggml-cuda/convert.cuh

Lines changed: 1 addition & 11 deletions
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 template<typename T>
-using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
 
 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -14,13 +14,3 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
 
 to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
-
-// TODO more general support for non-contiguous inputs
-
-template<typename T>
-using to_t_nc_cuda_t = void (*)(const void * x, T * y,
-    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
-    int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
-
-typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
-to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);

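After the revert the header exposes only the contiguous converter type: a switch over the source ggml_type that returns a typed function pointer, or nullptr if no converter exists. A host-only sketch of that dispatch shape follows; the toy enum, converter functions, and names are illustrative stand-ins, not ggml's API.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins for ggml_type and the converter signature.
enum toy_type { TOY_TYPE_F32, TOY_TYPE_I32, TOY_TYPE_COUNT };

template <typename T>
using to_t_toy_t = void (*)(const void * x, T * y, int64_t k);

static void convert_f32(const void * x, float * y, int64_t k) {
    const float * xf = (const float *) x;
    for (int64_t i = 0; i < k; ++i) { y[i] = xf[i]; }
}

static void convert_i32(const void * x, float * y, int64_t k) {
    const int32_t * xi = (const int32_t *) x;
    for (int64_t i = 0; i < k; ++i) { y[i] = (float) xi[i]; }
}

// Same shape as ggml_get_to_fp32_cuda: pick a converter for the type, or nullptr.
static to_t_toy_t<float> toy_get_to_f32(toy_type type) {
    switch (type) {
        case TOY_TYPE_F32: return convert_f32;
        case TOY_TYPE_I32: return convert_i32;
        default:           return nullptr;
    }
}

int main() {
    int32_t src[4] = {1, 2, 3, 4};
    float   dst[4] = {0};
    to_t_toy_t<float> conv = toy_get_to_f32(TOY_TYPE_I32);
    if (conv) {
        conv(src, dst, 4);
    }
    printf("%.1f %.1f %.1f %.1f\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}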
ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 28 additions & 42 deletions
@@ -1721,15 +1721,15 @@ static __global__ void k_compute_batched_ptrs(
         size_t nb12, size_t nb13,
         size_t nbd2, size_t nbd3,
         int64_t r2, int64_t r3) {
-    const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-    const int64_t i03 = i13 / r3;
-    const int64_t i02 = i12 / r2;
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
@@ -1743,10 +1743,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
-    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
-    // As long as dst is contiguous this does not matter though.
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
@@ -1755,31 +1751,21 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 
     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
 
-    const half * src0_f16 = (const half *) src0->data;
-    float * dst_ddf = (float *) dst->data;
-
-    const half * src1_f16 = (const half *) src1->data;
-    const size_t ts_src1 = ggml_type_size(src1->type);
-    GGML_ASSERT(nb10 == ts_src1);
-    int64_t s11 = nb11 / ts_src1;
-    int64_t s12 = nb12 / ts_src1;
-    int64_t s13 = nb13 / ts_src1;
-    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+    void * src0_ddq = src0->data;
+    half * src0_f16 = (half *) src0_ddq;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     // convert src1 to fp16
+    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_cuda != nullptr);
-
-        to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
-
-        src1_f16 = src1_f16_alloc.get();
-        s11 = ne10;
-        s12 = ne11*s11;
-        s13 = ne12*s12;
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
     }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
     char * dst_t;
@@ -1839,13 +1825,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 int i02 = i12 / r2;
 
                 CUBLAS_CHECK(
-                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
-                    ne01, ne11, ne10,
-                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
-                           src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
-                    beta,  ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
-                    cu_compute_type,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                    ne01, ne11, ne10,
+                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3], CUDA_R_16F, nb01/sizeof(half),
+                           (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                    beta,  ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+                    cu_compute_type,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
         }
     }
@@ -1856,15 +1842,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
-                       src1_f16, CUDA_R_16F, s11, s12,             // strideB
-                beta,  dst_t, cu_data_type, ne0, ne1*ne0,          // strideC
+                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
+                beta,  ( char *) dst_t, cu_data_type, ne01, nb2/nb0,              // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        const int64_t ne23 = ne12*ne13;
+        const int ne23 = ne12*ne13;
 
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
@@ -1876,8 +1862,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
-                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
+                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -1886,8 +1872,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
-                beta,  ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
+                beta,  ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -1951,7 +1937,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {

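The cuBLAS calls above take element strides and leading dimensions, while ggml tensors carry byte strides nb[i]; the restored code derives them by dividing byte strides by the element size (nb01/nb00, nb02/nb00, nb11/nb10, and so on). A small host-side sketch of that arithmetic; the toy_tensor struct and the example shape are illustrative, not ggml's types.

#include <cstdint>
#include <cstdio>

// Toy stand-in for a ggml-style tensor: ne[i] = elements in dimension i,
// nb[i] = bytes needed to step one index in dimension i.
struct toy_tensor {
    int64_t ne[4];
    size_t  nb[4];
};

int main() {
    // a contiguous FP16 operand of shape {ne0 = 64, ne1 = 32, ne2 = 8, ne3 = 1}
    toy_tensor t;
    t.ne[0] = 64; t.ne[1] = 32; t.ne[2] = 8; t.ne[3] = 1;
    t.nb[0] = 2; // bytes per half-precision element
    for (int i = 1; i < 4; ++i) {
        t.nb[i] = t.nb[i - 1]*t.ne[i - 1];
    }

    // what the batched path passes to cuBLAS for such an operand:
    const int64_t ld           = t.nb[1]/t.nb[0]; // leading dimension, cf. nb01/nb00
    const int64_t batch_stride = t.nb[2]/t.nb[0]; // per-matrix stride,  cf. nb02/nb00
    printf("ld = %lld, batch stride = %lld\n", (long long) ld, (long long) batch_stride);
    // prints: ld = 64, batch stride = 2048 for this contiguous layout
    return 0;
}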
src/llama-model.cpp

Lines changed: 1 addition & 0 deletions
@@ -10298,6 +10298,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
 
                 // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
                 ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+                ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
                 cb(q_nope_absorbed, "q_nope_absorbed", il);
 
                 // {kv_lora_rank, n_head, n_tokens}

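The added line requests FP32 precision for this mat mul node; together with the dst->op_params[0] == GGML_PREC_DEFAULT check restored in ggml-cuda.cu above, it keeps this DeepSeek-2 multiplication off the FP16 batched-cuBLAS path. A minimal graph-building sketch of the same pattern; the shapes and context size are illustrative, and backend setup and graph execution are omitted.

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // an F16 weight {128, 512} and an F32 activation {128, 4};
    // ggml_mul_mat contracts over ne[0], so the result y has shape {512, 4}
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 128, 512);
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128,   4);

    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(y, GGML_PREC_F32); // request FP32 precision for this node

    ggml_free(ctx);
    return 0;
}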