@@ -161,24 +161,25 @@ NDArray_Median_Float(NDArray* a) {
161161
162162NDArray *
163163NDArray_Add_Float (NDArray * a , NDArray * b ) {
164- if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
164+ NDArray * a_temp = NULL , * b_temp = NULL ;
165+ if (NDArray_DEVICE (a ) != NDArray_DEVICE (b ) && NDArray_NDIM (a ) != 0 && NDArray_NDIM (b ) != 0 ) {
165166 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
166167 return NULL ;
167168 }
168169
169- if ( NDArray_NDIM ( a ) == 0 && NDArray_NDIM ( b ) == 0 ) {
170- int * shape = ecalloc ( 1 , sizeof ( int ));
171- NDArray * rtn = NDArray_Zeros ( shape , 0 , NDARRAY_TYPE_FLOAT32 , NDArray_DEVICE ( a )) ;
172- #ifdef HAVE_CUBLAS
173- if ( NDArray_DEVICE ( a ) == NDARRAY_DEVICE_GPU ) {
174- cuda_add_float ( 2 , NDArray_FDATA ( a ), NDArray_FDATA (b ), NDArray_FDATA ( rtn ), 1 );
175- } else {
176- #endif
177- NDArray_FDATA ( rtn )[ 0 ] = NDArray_FDATA ( a )[ 0 ] + NDArray_FDATA ( b )[ 0 ] ;
178- #ifdef HAVE_CUBLAS
179- }
180- #endif
181- return rtn ;
170+ // If a or b are scalars, reshape
171+ if ( NDArray_NDIM ( a ) == 0 && NDArray_NDIM ( b ) > 0 ) {
172+ a_temp = a ;
173+ int * n_shape = emalloc ( sizeof ( int ) * NDArray_NDIM ( b ));
174+ copy ( NDArray_SHAPE ( b ), n_shape , NDArray_NDIM ( b ));
175+ a = NDArray_Zeros ( n_shape , NDArray_NDIM ( b ), NDArray_TYPE (b ), NDArray_DEVICE ( b ) );
176+ a = NDArray_Fill ( a , NDArray_FDATA ( a_temp )[ 0 ]);
177+ } else if ( NDArray_NDIM ( b ) == 0 && NDArray_NDIM ( a ) > 0 ) {
178+ b_temp = b ;
179+ int * n_shape = emalloc ( sizeof ( int ) * NDArray_NDIM ( a ));
180+ copy ( NDArray_SHAPE ( a ), n_shape , NDArray_NDIM ( a ));
181+ b = NDArray_Zeros ( n_shape , NDArray_NDIM ( a ), NDArray_TYPE ( a ), NDArray_DEVICE ( a ));
182+ b = NDArray_Fill ( b , NDArray_FDATA ( b_temp )[ 0 ]) ;
182183 }
183184
184185 NDArray * broadcasted = NULL ;
@@ -209,72 +210,69 @@ NDArray_Add_Float(NDArray* a, NDArray* b) {
209210 }
210211
211212 // Create a new NDArray to store the result
212- NDArray * result = (NDArray * ) emalloc (sizeof (NDArray ));
213- result -> strides = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
214- result -> dimensions = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
213+ NDArray * result = (NDArray * ) emalloc (sizeof (NDArray ));
214+ result -> strides = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
215+ result -> dimensions = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
215216 result -> ndim = a_broad -> ndim ;
216217 if (NDArray_DEVICE (a_broad ) == NDARRAY_DEVICE_GPU ) {
217218#if HAVE_CUBLAS
218219 vmalloc ((void * * ) & result -> data , NDArray_NUMELEMENTS (a_broad ) * sizeof (float ));
220+ cudaDeviceSynchronize ();
219221 result -> device = NDARRAY_DEVICE_GPU ;
220222#endif
221223 } else {
222224 result -> data = (char * ) emalloc (a_broad -> descriptor -> numElements * sizeof (float ));
223225 }
224226 result -> base = NULL ;
225227 result -> flags = 0 ; // Set appropriate flags
226- result -> descriptor = (NDArrayDescriptor * ) emalloc (sizeof (NDArrayDescriptor ));
228+ result -> descriptor = (NDArrayDescriptor * ) emalloc (sizeof (NDArrayDescriptor ));
227229 result -> descriptor -> type = NDARRAY_TYPE_FLOAT32 ;
228230 result -> descriptor -> elsize = sizeof (float );
229- result -> device = NDArray_DEVICE (a_broad );
230231 result -> descriptor -> numElements = a_broad -> descriptor -> numElements ;
231232 result -> refcount = 1 ;
233+ result -> device = NDArray_DEVICE (a_broad );
232234
233- // Perform element-wise addition
235+ // Perform element-wise subtraction
234236 result -> strides = memcpy (result -> strides , a_broad -> strides , a_broad -> ndim * sizeof (int ));
235237 result -> dimensions = memcpy (result -> dimensions , a_broad -> dimensions , a_broad -> ndim * sizeof (int ));
236- float * resultData = (float * ) result -> data ;
237- float * aData = (float * ) a_broad -> data ;
238- float * bData = (float * ) b_broad -> data ;
238+ float * resultData = (float * ) result -> data ;
239+ float * aData = (float * ) a_broad -> data ;
240+ float * bData = (float * ) b_broad -> data ;
239241 int numElements = a_broad -> descriptor -> numElements ;
240242 NDArrayIterator_INIT (result );
241243 if (NDArray_DEVICE (a_broad ) == NDARRAY_DEVICE_GPU && NDArray_DEVICE (b_broad ) == NDARRAY_DEVICE_GPU ) {
242244#if HAVE_CUBLAS
243- cuda_add_float (NDArray_NUMELEMENTS (a_broad ), NDArray_FDATA (a_broad ), NDArray_FDATA (b_broad ), NDArray_FDATA (result ), NDArray_NUMELEMENTS ( a_broad ));
244- result -> device = NDARRAY_DEVICE_GPU ;
245+ cuda_add_float (NDArray_NUMELEMENTS (a_broad ), NDArray_FDATA (a_broad ), NDArray_FDATA (b_broad ), NDArray_FDATA (result ),
246+ NDArray_NUMELEMENTS ( a_broad )) ;
245247#endif
246248 } else {
247249#ifdef HAVE_AVX2
248250 int i ;
249- __m256 vec1 , vec2 , mul ;
251+ __m256 vec1 , vec2 , sub ;
250252
251253 for (i = 0 ; i < NDArray_NUMELEMENTS (a ) - 7 ; i += 8 ) {
252254 vec1 = _mm256_loadu_ps (& aData [i ]);
253255 vec2 = _mm256_loadu_ps (& bData [i ]);
254- mul = _mm256_add_ps (vec1 , vec2 );
255- _mm256_storeu_ps (& resultData [i ], mul );
256+ sub = _mm256_add_ps (vec1 , vec2 );
257+ _mm256_storeu_ps (& resultData [i ], sub );
256258 }
259+
257260 // Handle remaining elements if the length is not a multiple of 4
258261 for (; i < numElements ; i ++ ) {
259262 resultData [i ] = aData [i ] + bData [i ];
260263 }
261- #elif HAVE_CBLAS
262- if (NDArray_NUMELEMENTS (a_broad ) == NDArray_NUMELEMENTS (b_broad )) {
263- memcpy (resultData , NDArray_FDATA (b_broad ), NDArray_ELSIZE (b_broad ) * NDArray_NUMELEMENTS (b_broad ));
264- cblas_saxpy (NDArray_NUMELEMENTS (a_broad ), 1.0F , NDArray_FDATA (a_broad ), 1 , resultData ,
265- 1 );
266- } else {
267- for (int i = 0 ; i < numElements ; i ++ ) {
268- resultData [i ] = aData [i ] + bData [i ];
269- }
270- }
271264#else
272265 for (int i = 0 ; i < numElements ; i ++ ) {
273266 resultData [i ] = aData [i ] + bData [i ];
274267 }
275268#endif
276269 }
277-
270+ if (a_temp != NULL ) {
271+ NDArray_FREE (a );
272+ }
273+ if (b_temp != NULL ) {
274+ NDArray_FREE (b );
275+ }
278276 if (broadcasted != NULL ) {
279277 NDArray_FREE (broadcasted );
280278 }
@@ -292,7 +290,7 @@ NDArray*
292290NDArray_Multiply_Float (NDArray * a , NDArray * b ) {
293291 NDArray * broadcasted = NULL ;
294292 NDArray * a_temp = NULL , * b_temp = NULL ;
295- if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
293+ if (NDArray_DEVICE (a ) != NDArray_DEVICE (b ) && NDArray_NDIM ( a ) != 0 && NDArray_NDIM ( b ) != 0 ) {
296294 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
297295 return NULL ;
298296 }
@@ -433,7 +431,7 @@ NDArray_Multiply_Float(NDArray* a, NDArray* b) {
433431NDArray *
434432NDArray_Subtract_Float (NDArray * a , NDArray * b ) {
435433 NDArray * a_temp = NULL , * b_temp = NULL ;
436- if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
434+ if (NDArray_DEVICE (a ) != NDArray_DEVICE (b ) && NDArray_NDIM ( a ) != 0 && NDArray_NDIM ( b ) != 0 ) {
437435 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
438436 return NULL ;
439437 }
@@ -561,7 +559,7 @@ NDArray*
561559NDArray_Divide_Float (NDArray * a , NDArray * b ) {
562560 NDArray * a_temp = NULL , * b_temp = NULL ;
563561
564- if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
562+ if (NDArray_DEVICE (a ) != NDArray_DEVICE (b ) && NDArray_NDIM ( a ) != 0 && NDArray_NDIM ( b ) != 0 ) {
565563 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
566564 return NULL ;
567565 }
@@ -694,18 +692,11 @@ NDArray_Divide_Float(NDArray* a, NDArray* b) {
694692NDArray *
695693NDArray_Mod_Float (NDArray * a , NDArray * b ) {
696694 NDArray * a_temp = NULL , * b_temp = NULL ;
697- if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
695+ if (NDArray_DEVICE (a ) != NDArray_DEVICE (b ) && NDArray_NDIM ( a ) != 0 && NDArray_NDIM ( b ) != 0 ) {
698696 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
699697 return NULL ;
700698 }
701699
702- if (NDArray_NDIM (a ) == 0 ) {
703- int * shape = ecalloc (1 , sizeof (int ));
704- NDArray * rtn = NDArray_Zeros (shape , 0 , NDARRAY_TYPE_FLOAT32 , NDArray_DEVICE (a ));
705- NDArray_FDATA (rtn )[0 ] = NDArray_FDATA (a )[0 ] + NDArray_FDATA (b )[0 ];
706- return rtn ;
707- }
708-
709700 // If a or b are scalars, reshape
710701 if (NDArray_NDIM (a ) == 0 && NDArray_NDIM (b ) > 0 ) {
711702 a_temp = a ;
@@ -723,6 +714,7 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
723714
724715 NDArray * broadcasted = NULL ;
725716 NDArray * a_broad = NULL , * b_broad = NULL ;
717+
726718 if (NDArray_NUMELEMENTS (a ) < NDArray_NUMELEMENTS (b )) {
727719 broadcasted = NDArray_Broadcast (a , b );
728720 a_broad = broadcasted ;
@@ -748,9 +740,9 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
748740 }
749741
750742 // Create a new NDArray to store the result
751- NDArray * result = (NDArray * ) emalloc (sizeof (NDArray ));
752- result -> strides = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
753- result -> dimensions = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
743+ NDArray * result = (NDArray * ) emalloc (sizeof (NDArray ));
744+ result -> strides = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
745+ result -> dimensions = (int * ) emalloc (a_broad -> ndim * sizeof (int ));
754746 result -> ndim = a_broad -> ndim ;
755747 if (NDArray_DEVICE (a_broad ) == NDARRAY_DEVICE_GPU ) {
756748#if HAVE_CUBLAS
@@ -763,7 +755,7 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
763755 }
764756 result -> base = NULL ;
765757 result -> flags = 0 ; // Set appropriate flags
766- result -> descriptor = (NDArrayDescriptor * ) emalloc (sizeof (NDArrayDescriptor ));
758+ result -> descriptor = (NDArrayDescriptor * ) emalloc (sizeof (NDArrayDescriptor ));
767759 result -> descriptor -> type = NDARRAY_TYPE_FLOAT32 ;
768760 result -> descriptor -> elsize = sizeof (float );
769761 result -> descriptor -> numElements = a_broad -> descriptor -> numElements ;
@@ -773,9 +765,9 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
773765 // Perform element-wise subtraction
774766 result -> strides = memcpy (result -> strides , a_broad -> strides , a_broad -> ndim * sizeof (int ));
775767 result -> dimensions = memcpy (result -> dimensions , a_broad -> dimensions , a_broad -> ndim * sizeof (int ));
776- float * resultData = (float * ) result -> data ;
777- float * aData = (float * ) a_broad -> data ;
778- float * bData = (float * ) b_broad -> data ;
768+ float * resultData = (float * ) result -> data ;
769+ float * aData = (float * ) a_broad -> data ;
770+ float * bData = (float * ) b_broad -> data ;
779771 int numElements = a_broad -> descriptor -> numElements ;
780772 NDArrayIterator_INIT (result );
781773 if (NDArray_DEVICE (a_broad ) == NDARRAY_DEVICE_GPU && NDArray_DEVICE (b_broad ) == NDARRAY_DEVICE_GPU ) {
@@ -784,11 +776,27 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
784776 NDArray_NUMELEMENTS (a_broad ));
785777#endif
786778 } else {
779+ #ifdef HAVE_AVX2
780+ int i ;
781+ __m256 vec1 , vec2 , vout ;
782+
783+ for (i = 0 ; i < NDArray_NUMELEMENTS (a ) - 7 ; i += 8 ) {
784+ vec1 = _mm256_loadu_ps (& aData [i ]);
785+ vec2 = _mm256_loadu_ps (& bData [i ]);
786+ vout = _mm256_sub_ps (vec1 , _mm256_mul_ps (_mm256_floor_ps (_mm256_div_ps (vec1 , vec2 )), vec2 ));
787+ _mm256_storeu_ps (& resultData [i ], vout );
788+ }
789+
790+ // Handle remaining elements if the length is not a multiple of 4
791+ for (; i < numElements ; i ++ ) {
792+ resultData [i ] = fmodf (aData [i ], bData [i ]);
793+ }
794+ #else
787795 for (int i = 0 ; i < numElements ; i ++ ) {
788796 resultData [i ] = fmodf (aData [i ], bData [i ]);
789797 }
798+ #endif
790799 }
791-
792800 if (a_temp != NULL ) {
793801 NDArray_FREE (a );
794802 }
@@ -809,18 +817,11 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
809817NDArray *
810818NDArray_Pow_Float (NDArray * a , NDArray * b ) {
811819 NDArray * a_temp = NULL , * b_temp = NULL ;
812- if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
820+ if (NDArray_DEVICE (a ) != NDArray_DEVICE (b ) && NDArray_NDIM ( a ) != 0 && NDArray_NDIM ( b ) != 0 ) {
813821 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
814822 return NULL ;
815823 }
816824
817- if (NDArray_NDIM (a ) == 0 ) {
818- int * shape = ecalloc (1 , sizeof (int ));
819- NDArray * rtn = NDArray_Zeros (shape , 0 , NDARRAY_TYPE_FLOAT32 , NDArray_DEVICE (a ));
820- NDArray_FDATA (rtn )[0 ] = NDArray_FDATA (a )[0 ] + NDArray_FDATA (b )[0 ];
821- return rtn ;
822- }
823-
824825 // If a or b are scalars, reshape
825826 if (NDArray_NDIM (a ) == 0 && NDArray_NDIM (b ) > 0 ) {
826827 a_temp = a ;
@@ -870,12 +871,12 @@ NDArray_Pow_Float(NDArray* a, NDArray* b) {
870871 result -> ndim = a_broad -> ndim ;
871872 if (NDArray_DEVICE (a_broad ) == NDARRAY_DEVICE_GPU ) {
872873#if HAVE_CUBLAS
873- vmalloc ((void * * ) & result -> data , NDArray_NUMELEMENTS (a ) * sizeof (float ));
874+ vmalloc ((void * * ) & result -> data , NDArray_NUMELEMENTS (a_broad ) * sizeof (float ));
874875 cudaDeviceSynchronize ();
875876 result -> device = NDARRAY_DEVICE_GPU ;
876877#endif
877878 } else {
878- result -> data = (char * ) emalloc (a -> descriptor -> numElements * sizeof (float ));
879+ result -> data = (char * ) emalloc (a_broad -> descriptor -> numElements * sizeof (float ));
879880 }
880881 result -> base = NULL ;
881882 result -> flags = 0 ; // Set appropriate flags
@@ -886,13 +887,13 @@ NDArray_Pow_Float(NDArray* a, NDArray* b) {
886887 result -> refcount = 1 ;
887888 result -> device = NDArray_DEVICE (a_broad );
888889
889- // Perform element-wise
890+ // Perform element-wise subtraction
890891 result -> strides = memcpy (result -> strides , a_broad -> strides , a_broad -> ndim * sizeof (int ));
891892 result -> dimensions = memcpy (result -> dimensions , a_broad -> dimensions , a_broad -> ndim * sizeof (int ));
892893 float * resultData = (float * ) result -> data ;
893894 float * aData = (float * ) a_broad -> data ;
894895 float * bData = (float * ) b_broad -> data ;
895- int numElements = a -> descriptor -> numElements ;
896+ int numElements = a_broad -> descriptor -> numElements ;
896897 NDArrayIterator_INIT (result );
897898 if (NDArray_DEVICE (a_broad ) == NDARRAY_DEVICE_GPU && NDArray_DEVICE (b_broad ) == NDARRAY_DEVICE_GPU ) {
898899#if HAVE_CUBLAS
0 commit comments