@@ -1170,8 +1170,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
 
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
-    char * src_ptr = (char *) src->data;
-    char * dst_ptr = (char *) dst;
+    const char * src_ptr = (const char *) src->data;
+    char * dst_ptr = (char *) dst;
 
     const int64_t ne0 = src->ne[0];
     const int64_t nb0 = src->nb[0];
@@ -1182,7 +1182,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const int64_t ts = ggml_type_size(type);
     const int64_t rs = ggml_row_size(type, ne0);
     const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
+    const int64_t i1_diff = i1_high - i1_low;
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == rs) {
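Note on the fast path these context lines lead into: when the element stride nb0 equals the type size ts and the row stride nb1 equals the row size rs, the requested rows i1_low..i1_high form one contiguous block and a single cudaMemcpyAsync suffices; otherwise the copy has to honour the source pitch nb1. A simplified sketch of that branch is below, assuming a non-quantized type; the view2d struct and copy_rows name are illustrative stand-ins, not ggml API.

// Simplified sketch of the contiguity fast path in ggml_cuda_cpy_tensor_2d.
// view2d and copy_rows are illustrative stand-ins, not ggml API.
#include <cuda_runtime.h>
#include <cstdint>

struct view2d {
    const char * data;   // device pointer (const, matching the src_ptr change above)
    int64_t ne0;         // elements per row
    int64_t nb0, nb1;    // byte strides: per element, per row
    int64_t ts;          // bytes per element (non-quantized type assumed)
};

static cudaError_t copy_rows(void * dst, const view2d & src,
                             int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
    const int64_t i1_diff = i1_high - i1_low;          // number of rows to copy
    const char *  x       = src.data + i1_low*src.nb1; // first requested row
    const int64_t rs      = src.ne0*src.ts;            // bytes of payload per row

    if (src.nb0 == src.ts && src.nb1 == rs) {
        // rows are densely packed: one flat device-to-device copy
        return cudaMemcpyAsync(dst, x, i1_diff*src.nb1, cudaMemcpyDeviceToDevice, stream);
    }
    // rows are strided: pitched copy, reading nb1 bytes apart, writing rs bytes apart
    return cudaMemcpy2DAsync(dst, rs, x, src.nb1, rs, i1_diff,
                             cudaMemcpyDeviceToDevice, stream);
}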
@@ -1532,10 +1532,14 @@ static void ggml_cuda_op_mul_mat(
         if (src0_is_contiguous) {
             dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
         } else {
-            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
+            // If src0 is not contiguous it will be copied to a temporary buffer; it may then be necessary to clear padding.
+            const size_t nbytes_data    = ggml_nbytes(src0);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
         }
 
-        // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
+        // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
             const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
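For context on the new else-branch above: kernels for quantized types can read past the logical end of the last row (that is what MATRIX_ROW_PADDING is for), so the temporary copy of a non-contiguous src0 is now allocated with that extra tail and zeroed up front. A standalone sketch of the same size arithmetic and clearing is below; cudaMalloc stands in for the ggml CUDA pool allocator, and the MATRIX_ROW_PADDING value of 512 plus the Q4_0 block layout are assumptions mirroring ggml's defaults.

// Standalone sketch of the allocate-and-clear pattern added above.
// MATRIX_ROW_PADDING = 512 and the Q4_0 block layout are assumptions mirroring
// ggml's defaults; cudaMalloc stands in for the ggml CUDA pool allocator.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

constexpr int64_t MATRIX_ROW_PADDING = 512; // quantized rows are padded to a multiple of this
constexpr int64_t QK4_0 = 32;               // values per Q4_0 block
constexpr int64_t QS4_0 = 18;               // bytes per Q4_0 block (16 packed bytes + fp16 scale)

// Stand-in for ggml_row_size(GGML_TYPE_Q4_0, n): bytes needed to store n values.
static int64_t q4_0_row_size(int64_t n) {
    return (n / QK4_0) * QS4_0;
}

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    const int64_t ne00  = 11008;            // example row length that is not a multiple of 512
    const int64_t nrows = 32;

    const int64_t nbytes_data    = nrows*q4_0_row_size(ne00);
    const int64_t nbytes_padding = q4_0_row_size(MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);

    char * src0_dd = nullptr;
    cudaMalloc(&src0_dd, nbytes_data + nbytes_padding);                 // .alloc() on the pool in the real code
    cudaMemsetAsync(src0_dd, 0, nbytes_data + nbytes_padding, stream);  // clear data + padding before the copy

    printf("padding after the last row: %lld bytes\n", (long long) nbytes_padding); // 144 for this example

    cudaStreamSynchronize(stream);
    cudaFree(src0_dd);
    cudaStreamDestroy(stream);
    return 0;
}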