130 changes: 87 additions & 43 deletions ggml/src/ggml-cuda/cpy.cu
@@ -1,6 +1,7 @@
#include "cpy.cuh"
#include "dequantize.cuh"
#include "cpy-utils.cuh"
#include <climits> // for INT_MAX
#include <cstdint> // for SIZE_MAX
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
#include "ggml-musa/mudnn.cuh"
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
@@ -140,69 +141,110 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des

template<typename src_t, typename dst_t>
static void ggml_cpy_flt_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
Suggested change
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {


const int64_t max_chunk = INT_MAX;
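// process the copy in chunks of at most max_chunk elements so that each kernel launch's
// element count and grid size still fit in an int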
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int num_blocks = (chunk + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx + offset * sizeof(src_t), cdst + offset * sizeof(dst_t), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
}

static void ggml_cpy_f32_q8_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

GGML_ASSERT(ne % QK8_0 == 0);
const int num_blocks = ne / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
const int64_t max_chunk = (INT_MAX / QK8_0) * QK8_0; // largest int-sized chunk that stays aligned to QK8_0 blocks
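// offset counts f32 elements: the f32 side advances by offset * sizeof(float) bytes, while
// the q8_0 side advances by whole quant blocks, (offset / QK8_0) * sizeof(block_q8_0) bytes;
// this is exact only while offset stays a multiple of QK8_0, hence the block-aligned max_chunk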
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<chunk_blocks, 1, 0, stream>>>
(cx + offset * sizeof(float), cdst + (offset / QK8_0) * sizeof(block_q8_0), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
}

static void ggml_cpy_q8_0_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

const int num_blocks = ne;
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

const int64_t max_chunk = (INT_MAX / QK8_0) * QK8_0; // keep chunk offsets aligned to QK8_0 blocks
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk;
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<chunk_blocks, 1, 0, stream>>>
(cx + (offset / QK8_0) * sizeof(block_q8_0), cdst + offset * sizeof(float), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
}

static void ggml_cpy_f32_q4_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

GGML_ASSERT(ne % QK4_0 == 0);
const int num_blocks = ne / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
const int64_t max_chunk = (INT_MAX / QK4_0) * QK4_0; // keep chunk offsets aligned to QK4_0 blocks
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<chunk_blocks, 1, 0, stream>>>
(cx + offset * sizeof(float), cdst + (offset / QK4_0) * sizeof(block_q4_0), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
}

static void ggml_cpy_q4_0_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02,
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
const int64_t max_chunk = (INT_MAX / QK4_0) * QK4_0; // keep chunk offsets aligned to QK4_0 blocks
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<chunk_blocks, 1, 0, stream>>>(
cx + (offset / QK4_0) * sizeof(block_q4_0), cdst + offset * sizeof(float), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
}

static void ggml_cpy_f32_q4_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

GGML_ASSERT(ne % QK4_1 == 0);
const int num_blocks = ne / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
const int64_t max_chunk = (INT_MAX / QK4_1) * QK4_1; // keep chunk offsets aligned to QK4_1 blocks
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<chunk_blocks, 1, 0, stream>>>
(cx + offset * sizeof(float), cdst + (offset / QK4_1) * sizeof(block_q4_1), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
}

static void ggml_cpy_q4_1_f32_cuda(
@@ -281,8 +323,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));

GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
// The previous INT_MAX limit is removed: ggml_nbytes may exceed 2 GB for large contexts,
// and the underlying cudaMemcpyAsync takes a size_t length.
GGML_ASSERT(ggml_nbytes(src0) <= SIZE_MAX / 4); // generous upper bound as a sanity check
GGML_ASSERT(ggml_nbytes(src1) <= SIZE_MAX / 4); // generous upper bound as a sanity check
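// the per-type copy helpers above split their work into launches of at most INT_MAX
// elements each, so only this total-size check needs to allow tensors larger than 2 GB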

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];