@@ -185,15 +185,12 @@ NDArray_Median_Float(NDArray* a) {
185185
186186NDArray *
187187NDArray_Add_Float (NDArray * a , NDArray * b ) {
188- NDArray * broadcasted = NULL ;
189188 if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
190189 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
191190 return NULL ;
192191 }
193192
194- NDArray * a_broad = NULL , * b_broad = NULL ;
195-
196- if (NDArray_NDIM (a ) == 0 ) {
193+ if (NDArray_NDIM (a ) == 0 && NDArray_NDIM (b ) == 0 ) {
197194 int * shape = ecalloc (1 , sizeof (int ));
198195 NDArray * rtn = NDArray_Zeros (shape , 0 , NDARRAY_TYPE_FLOAT32 , NDArray_DEVICE (a ));
199196#ifdef HAVE_CUBLAS
@@ -208,6 +205,9 @@ NDArray_Add_Float(NDArray* a, NDArray* b) {
208205 return rtn ;
209206 }
210207
208+ NDArray * broadcasted = NULL ;
209+ NDArray * a_broad = NULL , * b_broad = NULL ;
210+
211211 if (NDArray_NUMELEMENTS (a ) < NDArray_NUMELEMENTS (b )) {
212212 broadcasted = NDArray_Broadcast (a , b );
213213 a_broad = broadcasted ;
@@ -250,6 +250,7 @@ NDArray_Add_Float(NDArray* a, NDArray* b) {
250250 result -> descriptor = (NDArrayDescriptor * )emalloc (sizeof (NDArrayDescriptor ));
251251 result -> descriptor -> type = NDARRAY_TYPE_FLOAT32 ;
252252 result -> descriptor -> elsize = sizeof (float );
253+ result -> device = NDArray_DEVICE (a_broad );
253254 result -> descriptor -> numElements = a_broad -> descriptor -> numElements ;
254255 result -> refcount = 1 ;
255256
@@ -278,7 +279,7 @@ NDArray_Add_Float(NDArray* a, NDArray* b) {
278279 _mm256_storeu_ps (& resultData [i ], mul );
279280 }
280281 // Handle remaining elements if the length is not a multiple of 8 (one 256-bit AVX register holds 8 floats)
281- for (; i < NDArray_NUMELEMENTS ( a ) ; i ++ ) {
282+ for (; i < numElements ; i ++ ) {
282283 resultData [i ] = aData [i ] + bData [i ];
283284 }
284285#elif HAVE_CBLAS
@@ -365,7 +366,6 @@ NDArray_Multiply_Float(NDArray* a, NDArray* b) {
365366 b_broad = b ;
366367 a_broad = a ;
367368 }
368-
369369 if (b_broad == NULL || a_broad == NULL ) {
370370 zend_throw_error (NULL , "Can't broadcast arrays." );
371371 return NULL ;
@@ -415,18 +415,18 @@ NDArray_Multiply_Float(NDArray* a, NDArray* b) {
415415#endif
416416 } else {
417417#ifdef HAVE_AVX2
418- int i ;
418+ int i = 0 ;
419419 __m256 vec1 , vec2 , mul ;
420420
421- for (i = 0 ; i < NDArray_NUMELEMENTS (a ) - 7 ; i += 8 ) {
421+ for (; i < NDArray_NUMELEMENTS (a ) - 7 ; i += 8 ) {
422422 vec1 = _mm256_loadu_ps (& aData [i ]);
423423 vec2 = _mm256_loadu_ps (& bData [i ]);
424424 mul = _mm256_mul_ps (vec1 , vec2 );
425425 _mm256_storeu_ps (& resultData [i ], mul );
426426 }
427427
428428 // Handle remaining elements if the length is not a multiple of 8 (the AVX loop above consumes 8 floats per iteration)
429- for (; i < NDArray_NUMELEMENTS ( a ) ; i ++ ) {
429+ for (; i < numElements ; i ++ ) {
430430 resultData [i ] = aData [i ] * bData [i ];
431431 }
432432#else
@@ -553,7 +553,7 @@ NDArray_Subtract_Float(NDArray* a, NDArray* b) {
553553 }
554554
555555 // Handle remaining elements if the length is not a multiple of 4
556- for (; i < NDArray_NUMELEMENTS ( a ) ; i ++ ) {
556+ for (; i < numElements ; i ++ ) {
557557 resultData [i ] = aData [i ] - bData [i ];
558558 }
559559#else
@@ -584,6 +584,7 @@ NDArray_Subtract_Float(NDArray* a, NDArray* b) {
584584NDArray *
585585NDArray_Divide_Float (NDArray * a , NDArray * b ) {
586586 NDArray * a_temp = NULL , * b_temp = NULL ;
587+
587588 if (NDArray_DEVICE (a ) != NDArray_DEVICE (b )) {
588589 zend_throw_error (NULL , "Device mismatch, both NDArray MUST be in the same device." );
589590 return NULL ;
@@ -658,6 +659,7 @@ NDArray_Divide_Float(NDArray* a, NDArray* b) {
658659 result -> descriptor = (NDArrayDescriptor * ) emalloc (sizeof (NDArrayDescriptor ));
659660 result -> descriptor -> type = NDARRAY_TYPE_FLOAT32 ;
660661 result -> descriptor -> elsize = sizeof (float );
662+ result -> device = NDArray_DEVICE (a_broad );
661663 result -> descriptor -> numElements = a_broad -> descriptor -> numElements ;
662664 result -> refcount = 1 ;
663665
@@ -687,7 +689,7 @@ NDArray_Divide_Float(NDArray* a, NDArray* b) {
687689 }
688690
689691 // Handle remaining elements if the length is not a multiple of 4
690- for (; i < NDArray_NUMELEMENTS ( a ) ; i ++ ) {
692+ for (; i < numElements ; i ++ ) {
691693 resultData [i ] = aData [i ] / bData [i ];
692694 }
693695#else
0 commit comments