@@ -7830,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
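+    // wait for all queued work on every device to finish before peer access is toggled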
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
@@ -7881,8 +7886,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
@@ -8781,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
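+    // byte strides of one row in src1 and dst, used for the row copies below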
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
     const int32_t n_as = ((int32_t *) dst->op_params)[1];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
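+    // all copies in this function go through stream 0 of the main device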
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -8804,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.ne[1] = 1;
-    dst_row.ne[1] = 1;
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
 
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
+    char * src1_original = (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  = (char *)  dst_extra->data_device[g_main_device];
 
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
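+    // fast path for a single token: no copies, multiply directly on row views into src1 and dst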
+    if (src1->ne[1] == 1) {
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
 
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        //int32_t row_id;
-        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
 
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
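+        // batched path: for each expert, gather its assigned rows into a contiguous
+        // buffer, run one matrix multiplication, then scatter the results back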
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
 
-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
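+            // gather the src1 rows that were routed to this expert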
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
 
-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                           nb11, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
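+            // view the gathered rows as one contiguous 2D batch for a single mul_mat call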
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
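+            // scatter the results back to their original row positions in dst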
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                           nb1, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
     }
 }
 
@@ -9370,6 +9434,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
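+    // peer access is only needed when src0 is split across multiple GPUs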
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }