@@ -963,13 +963,15 @@ NDArray_Broadcast(NDArray *a, NDArray *b) {
963963 rtn_p = rtn_p + (sizeof (float ) * NDArray_SHAPE (src )[0 ]);
964964 }
965965 }
966+ #ifdef HAVE_CUBLAS
966967 if (NDArray_DEVICE (dst ) == NDARRAY_DEVICE_GPU ) {
967968 for (i = 0 ; i < NDArray_SHAPE (dst )[NDArray_NDIM (dst ) - 2 ]; i ++ ) {
968969 NDArray_VMEMCPY_D2D (NDArray_DATA (src ), rtn_p ,
969970 sizeof (float ) * NDArray_SHAPE (dst )[NDArray_NDIM (dst ) - 1 ]);
970971 rtn_p = rtn_p + (sizeof (float ) * NDArray_SHAPE (src )[0 ]);
971972 }
972973 }
974+ #endif
973975 }
974976 }
975977 int j ;
@@ -982,6 +984,7 @@ NDArray_Broadcast(NDArray *a, NDArray *b) {
982984 }
983985 }
984986 }
987+ #ifdef HAVE_CUBLAS
985988 if (NDArray_DEVICE (dst ) == NDARRAY_DEVICE_GPU ) {
986989 for (i = 0 ; i < NDArray_SHAPE (dst )[NDArray_NDIM (dst ) - 2 ]; i ++ ) {
987990 for (j = 0 ; j < NDArray_SHAPE (dst )[NDArray_NDIM (dst ) - 1 ]; j ++ ) {
@@ -990,8 +993,8 @@ NDArray_Broadcast(NDArray *a, NDArray *b) {
990993 NDArray_VMEMCPY_D2D (tmp_p , rtn_p , sizeof (float ));
991994 }
992995 }
993- NDArray_Print (rtn ,0 );
994996 }
997+ #endif
995998 }
996999 if (NDArray_SHAPE (src )[NDArray_NDIM (dst ) - 1 ] == NDArray_SHAPE (dst )[NDArray_NDIM (dst ) - 2 ]) {
9971000 if (NDArray_DEVICE (dst ) == NDARRAY_DEVICE_CPU ) {
@@ -1001,6 +1004,7 @@ NDArray_Broadcast(NDArray *a, NDArray *b) {
10011004 rtn_p = rtn_p + (sizeof (float ) * NDArray_SHAPE (src )[NDArray_NDIM (dst ) - 1 ]);
10021005 }
10031006 }
1007+ #ifdef HAVE_CUBLAS
10041008 if (NDArray_DEVICE (dst ) == NDARRAY_DEVICE_GPU ) {
10051009 for (i = 0 ; i < NDArray_SHAPE (dst )[NDArray_NDIM (dst ) - 2 ]; i ++ ) {
10061010 NDArray_VMEMCPY_D2D (NDArray_DATA (src ), rtn_p ,
@@ -1009,6 +1013,7 @@ NDArray_Broadcast(NDArray *a, NDArray *b) {
10091013 (i * NDArray_STRIDES (rtn )[NDArray_NDIM (rtn ) - 2 ] / NDArray_ELSIZE (rtn )) + j );
10101014 }
10111015 }
1016+ #endif
10121017 }
10131018 }
10141019 return rtn ;
0 commit comments