@@ -1151,8 +1151,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
 
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
-    char * src_ptr = (char *) src->data;
-    char * dst_ptr = (char *) dst;
+    const char * src_ptr = (const char *) src->data;
+    char * dst_ptr = (char *) dst;
 
     const int64_t ne0 = src->ne[0];
     const int64_t nb0 = src->nb[0];
@@ -1162,7 +1162,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const enum ggml_type type = src->type;
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
+    const int64_t i1_diff = i1_high - i1_low;
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
@@ -1479,13 +1479,17 @@ static void ggml_cuda_op_mul_mat(
         if (src0_is_contiguous) {
             dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
         } else {
-            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
+            // If src0 is not contiguous it will be copied to a temporary buffer, it may then be necessary to clear padding.
+            const size_t nbytes_data    = ggml_nbytes(src0);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
         }
 
-        // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
+        // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
-            const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
-            const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
         }
 
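For context on the new allocation size: below is a minimal standalone sketch of how the `nbytes_data`/`nbytes_padding` values used above are derived. It assumes `MATRIX_ROW_PADDING` is 512 as defined in ggml-cuda and approximates `ggml_row_size()` with a Q4_0-like layout (32 elements per 18-byte block); the helper name `row_size` and the concrete numbers are illustrative only, not part of this patch.

```cpp
// Minimal sketch (not part of the diff): how nbytes_data / nbytes_padding are computed.
// Assumptions: MATRIX_ROW_PADDING is 512 as in ggml-cuda, and row_size() mimics
// ggml_row_size() for a Q4_0-like quantization (blocks of 32 elements, 18 bytes each).
#include <cstdint>
#include <cstdio>

constexpr int64_t MATRIX_ROW_PADDING = 512;

// stand-in for ggml_row_size(type, ne): bytes needed to store ne elements of a row
static size_t row_size(int64_t ne) {
    constexpr int64_t block_size = 32; // elements per quantization block
    constexpr int64_t type_size  = 18; // bytes per quantization block
    return (size_t) (ne/block_size*type_size);
}

int main() {
    const int64_t ne00  = 4128; // row length, not a multiple of MATRIX_ROW_PADDING
    const int64_t nrows = 4;

    const size_t nbytes_data    = row_size(ne00)*nrows;
    const size_t nbytes_padding = row_size(MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);

    // The temporary buffer is allocated as nbytes_data + nbytes_padding and the tail
    // is zeroed (cudaMemsetAsync in the actual code) so that kernels operating on rows
    // padded to MATRIX_ROW_PADDING never read uninitialized memory.
    printf("data: %zu bytes, padding to clear: %zu bytes\n", nbytes_data, nbytes_padding);
    return 0;
}
```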