Skip to content

Commit 52a746c

Browse files
Merge branch 'fix/stable-deploy'
2 parents 454abe2 + 236dcbe commit 52a746c

File tree

4 files changed

+113
-70
lines changed

4 files changed

+113
-70
lines changed

numpower.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3157,6 +3157,41 @@ PHP_METHOD(NDArray, expand_dims) {
31573157
RETURN_NDARRAY(rtn, return_value);
31583158
}
31593159

3160+
/**
3161+
* NDArray::expand_dims
3162+
*/
3163+
ZEND_BEGIN_ARG_INFO(arginfo_ndarray_flip, 0)
3164+
ZEND_ARG_INFO(0, a)
3165+
ZEND_ARG_INFO(0, axis)
3166+
ZEND_END_ARG_INFO()
3167+
PHP_METHOD(NDArray, flip) {
3168+
NDArray *rtn = NULL;
3169+
zval *a;
3170+
zval *axis = NULL;
3171+
ZEND_PARSE_PARAMETERS_START(1, 2)
3172+
Z_PARAM_ZVAL(a)
3173+
Z_PARAM_ZVAL(axis)
3174+
ZEND_PARSE_PARAMETERS_END();
3175+
3176+
if (Z_TYPE_P(axis) != IS_ARRAY && Z_TYPE_P(axis) != IS_LONG && Z_TYPE_P(axis) != IS_OBJECT) {
3177+
zend_throw_error(NULL, "expected array, integer or ndarray");
3178+
return;
3179+
}
3180+
NDArray *nda = ZVAL_TO_NDARRAY(a);
3181+
NDArray *ndaxis = ZVAL_TO_NDARRAY(axis);
3182+
if (nda == NULL || ndaxis == NULL) {
3183+
return;
3184+
}
3185+
rtn = NDArray_Flip(nda, ndaxis);
3186+
3187+
CHECK_INPUT_AND_FREE(a, nda);
3188+
CHECK_INPUT_AND_FREE(axis, ndaxis);
3189+
if (rtn == NULL) {
3190+
return;
3191+
}
3192+
RETURN_NDARRAY(rtn, return_value);
3193+
}
3194+
31603195
/**
31613196
* NDArray::append
31623197
*/

src/manipulation.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,3 +545,9 @@ NDArray_AtLeast3D(NDArray *a) {
545545
return output;
546546
}
547547

548+
NDArray*
549+
NDArray_Flip(NDArray *a, NDArray *axis)
550+
{
551+
552+
}
553+

src/manipulation.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,5 @@ NDArray* NDArray_AtLeast1D(NDArray *a);
1717
NDArray* NDArray_AtLeast2D(NDArray *a);
1818
NDArray* NDArray_AtLeast3D(NDArray *a);
1919
NDArray* NDArray_ConcatenateFlat(NDArray **arrays, int num_arrays);
20+
NDArray* NDArray_Flip(NDArray *a, NDArray *axis);
2021
#endif //PHPSCI_NDARRAY_MANIPULATION_H

src/ndmath/arithmetics.c

Lines changed: 71 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -161,24 +161,25 @@ NDArray_Median_Float(NDArray* a) {
161161

162162
NDArray*
163163
NDArray_Add_Float(NDArray* a, NDArray* b) {
164-
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
164+
NDArray *a_temp = NULL, *b_temp = NULL;
165+
if (NDArray_DEVICE(a) != NDArray_DEVICE(b) && NDArray_NDIM(a) != 0 && NDArray_NDIM(b) != 0) {
165166
zend_throw_error(NULL, "Device mismatch, both NDArray MUST be in the same device.");
166167
return NULL;
167168
}
168169

169-
if (NDArray_NDIM(a) == 0 && NDArray_NDIM(b) == 0) {
170-
int* shape = ecalloc(1, sizeof(int));
171-
NDArray *rtn = NDArray_Zeros(shape, 0, NDARRAY_TYPE_FLOAT32, NDArray_DEVICE(a));
172-
#ifdef HAVE_CUBLAS
173-
if (NDArray_DEVICE(a) == NDARRAY_DEVICE_GPU) {
174-
cuda_add_float(2, NDArray_FDATA(a), NDArray_FDATA(b), NDArray_FDATA(rtn), 1);
175-
} else {
176-
#endif
177-
NDArray_FDATA(rtn)[0] = NDArray_FDATA(a)[0] + NDArray_FDATA(b)[0];
178-
#ifdef HAVE_CUBLAS
179-
}
180-
#endif
181-
return rtn;
170+
// If a or b are scalars, reshape
171+
if (NDArray_NDIM(a) == 0 && NDArray_NDIM(b) > 0) {
172+
a_temp = a;
173+
int *n_shape = emalloc(sizeof(int) * NDArray_NDIM(b));
174+
copy(NDArray_SHAPE(b), n_shape, NDArray_NDIM(b));
175+
a = NDArray_Zeros(n_shape, NDArray_NDIM(b), NDArray_TYPE(b), NDArray_DEVICE(b));
176+
a = NDArray_Fill(a, NDArray_FDATA(a_temp)[0]);
177+
} else if (NDArray_NDIM(b) == 0 && NDArray_NDIM(a) > 0) {
178+
b_temp = b;
179+
int *n_shape = emalloc(sizeof(int) * NDArray_NDIM(a));
180+
copy(NDArray_SHAPE(a), n_shape, NDArray_NDIM(a));
181+
b = NDArray_Zeros(n_shape, NDArray_NDIM(a), NDArray_TYPE(a), NDArray_DEVICE(a));
182+
b = NDArray_Fill(b, NDArray_FDATA(b_temp)[0]);
182183
}
183184

184185
NDArray *broadcasted = NULL;
@@ -209,72 +210,69 @@ NDArray_Add_Float(NDArray* a, NDArray* b) {
209210
}
210211

211212
// Create a new NDArray to store the result
212-
NDArray* result = (NDArray*)emalloc(sizeof(NDArray));
213-
result->strides = (int*)emalloc(a_broad->ndim * sizeof(int));
214-
result->dimensions = (int*)emalloc(a_broad->ndim * sizeof(int));
213+
NDArray *result = (NDArray *) emalloc(sizeof(NDArray));
214+
result->strides = (int *) emalloc(a_broad->ndim * sizeof(int));
215+
result->dimensions = (int *) emalloc(a_broad->ndim * sizeof(int));
215216
result->ndim = a_broad->ndim;
216217
if (NDArray_DEVICE(a_broad) == NDARRAY_DEVICE_GPU) {
217218
#if HAVE_CUBLAS
218219
vmalloc((void **) &result->data, NDArray_NUMELEMENTS(a_broad) * sizeof(float));
220+
cudaDeviceSynchronize();
219221
result->device = NDARRAY_DEVICE_GPU;
220222
#endif
221223
} else {
222224
result->data = (char *) emalloc(a_broad->descriptor->numElements * sizeof(float));
223225
}
224226
result->base = NULL;
225227
result->flags = 0; // Set appropriate flags
226-
result->descriptor = (NDArrayDescriptor*)emalloc(sizeof(NDArrayDescriptor));
228+
result->descriptor = (NDArrayDescriptor *) emalloc(sizeof(NDArrayDescriptor));
227229
result->descriptor->type = NDARRAY_TYPE_FLOAT32;
228230
result->descriptor->elsize = sizeof(float);
229-
result->device = NDArray_DEVICE(a_broad);
230231
result->descriptor->numElements = a_broad->descriptor->numElements;
231232
result->refcount = 1;
233+
result->device = NDArray_DEVICE(a_broad);
232234

233-
// Perform element-wise addition
235+
// Perform element-wise subtraction
234236
result->strides = memcpy(result->strides, a_broad->strides, a_broad->ndim * sizeof(int));
235237
result->dimensions = memcpy(result->dimensions, a_broad->dimensions, a_broad->ndim * sizeof(int));
236-
float* resultData = (float*)result->data;
237-
float* aData = (float*)a_broad->data;
238-
float* bData = (float*)b_broad->data;
238+
float *resultData = (float *) result->data;
239+
float *aData = (float *) a_broad->data;
240+
float *bData = (float *) b_broad->data;
239241
int numElements = a_broad->descriptor->numElements;
240242
NDArrayIterator_INIT(result);
241243
if (NDArray_DEVICE(a_broad) == NDARRAY_DEVICE_GPU && NDArray_DEVICE(b_broad) == NDARRAY_DEVICE_GPU) {
242244
#if HAVE_CUBLAS
243-
cuda_add_float(NDArray_NUMELEMENTS(a_broad), NDArray_FDATA(a_broad), NDArray_FDATA(b_broad), NDArray_FDATA(result), NDArray_NUMELEMENTS(a_broad));
244-
result->device = NDARRAY_DEVICE_GPU;
245+
cuda_add_float(NDArray_NUMELEMENTS(a_broad), NDArray_FDATA(a_broad), NDArray_FDATA(b_broad), NDArray_FDATA(result),
246+
NDArray_NUMELEMENTS(a_broad));
245247
#endif
246248
} else {
247249
#ifdef HAVE_AVX2
248250
int i;
249-
__m256 vec1, vec2, mul;
251+
__m256 vec1, vec2, sub;
250252

251253
for (i = 0; i < NDArray_NUMELEMENTS(a) - 7; i += 8) {
252254
vec1 = _mm256_loadu_ps(&aData[i]);
253255
vec2 = _mm256_loadu_ps(&bData[i]);
254-
mul = _mm256_add_ps(vec1, vec2);
255-
_mm256_storeu_ps(&resultData[i], mul);
256+
sub = _mm256_add_ps(vec1, vec2);
257+
_mm256_storeu_ps(&resultData[i], sub);
256258
}
259+
257260
// Handle remaining elements if the length is not a multiple of 4
258261
for (; i < numElements; i++) {
259262
resultData[i] = aData[i] + bData[i];
260263
}
261-
#elif HAVE_CBLAS
262-
if (NDArray_NUMELEMENTS(a_broad) == NDArray_NUMELEMENTS(b_broad)) {
263-
memcpy(resultData, NDArray_FDATA(b_broad), NDArray_ELSIZE(b_broad) * NDArray_NUMELEMENTS(b_broad));
264-
cblas_saxpy(NDArray_NUMELEMENTS(a_broad), 1.0F, NDArray_FDATA(a_broad), 1, resultData,
265-
1);
266-
} else {
267-
for (int i = 0; i < numElements; i++) {
268-
resultData[i] = aData[i] + bData[i];
269-
}
270-
}
271264
#else
272265
for (int i = 0; i < numElements; i++) {
273266
resultData[i] = aData[i] + bData[i];
274267
}
275268
#endif
276269
}
277-
270+
if (a_temp != NULL) {
271+
NDArray_FREE(a);
272+
}
273+
if (b_temp != NULL) {
274+
NDArray_FREE(b);
275+
}
278276
if (broadcasted != NULL) {
279277
NDArray_FREE(broadcasted);
280278
}
@@ -292,7 +290,7 @@ NDArray*
292290
NDArray_Multiply_Float(NDArray* a, NDArray* b) {
293291
NDArray *broadcasted = NULL;
294292
NDArray *a_temp = NULL, *b_temp = NULL;
295-
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
293+
if (NDArray_DEVICE(a) != NDArray_DEVICE(b) && NDArray_NDIM(a) != 0 && NDArray_NDIM(b) != 0) {
296294
zend_throw_error(NULL, "Device mismatch, both NDArray MUST be in the same device.");
297295
return NULL;
298296
}
@@ -433,7 +431,7 @@ NDArray_Multiply_Float(NDArray* a, NDArray* b) {
433431
NDArray*
434432
NDArray_Subtract_Float(NDArray* a, NDArray* b) {
435433
NDArray *a_temp = NULL, *b_temp = NULL;
436-
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
434+
if (NDArray_DEVICE(a) != NDArray_DEVICE(b) && NDArray_NDIM(a) != 0 && NDArray_NDIM(b) != 0) {
437435
zend_throw_error(NULL, "Device mismatch, both NDArray MUST be in the same device.");
438436
return NULL;
439437
}
@@ -561,7 +559,7 @@ NDArray*
561559
NDArray_Divide_Float(NDArray* a, NDArray* b) {
562560
NDArray *a_temp = NULL, *b_temp = NULL;
563561

564-
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
562+
if (NDArray_DEVICE(a) != NDArray_DEVICE(b) && NDArray_NDIM(a) != 0 && NDArray_NDIM(b) != 0) {
565563
zend_throw_error(NULL, "Device mismatch, both NDArray MUST be in the same device.");
566564
return NULL;
567565
}
@@ -694,18 +692,11 @@ NDArray_Divide_Float(NDArray* a, NDArray* b) {
694692
NDArray*
695693
NDArray_Mod_Float(NDArray* a, NDArray* b) {
696694
NDArray *a_temp = NULL, *b_temp = NULL;
697-
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
695+
if (NDArray_DEVICE(a) != NDArray_DEVICE(b) && NDArray_NDIM(a) != 0 && NDArray_NDIM(b) != 0) {
698696
zend_throw_error(NULL, "Device mismatch, both NDArray MUST be in the same device.");
699697
return NULL;
700698
}
701699

702-
if (NDArray_NDIM(a) == 0) {
703-
int* shape = ecalloc(1, sizeof(int));
704-
NDArray *rtn = NDArray_Zeros(shape, 0, NDARRAY_TYPE_FLOAT32, NDArray_DEVICE(a));
705-
NDArray_FDATA(rtn)[0] = NDArray_FDATA(a)[0] + NDArray_FDATA(b)[0];
706-
return rtn;
707-
}
708-
709700
// If a or b are scalars, reshape
710701
if (NDArray_NDIM(a) == 0 && NDArray_NDIM(b) > 0) {
711702
a_temp = a;
@@ -723,6 +714,7 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
723714

724715
NDArray *broadcasted = NULL;
725716
NDArray *a_broad = NULL, *b_broad = NULL;
717+
726718
if (NDArray_NUMELEMENTS(a) < NDArray_NUMELEMENTS(b)) {
727719
broadcasted = NDArray_Broadcast(a, b);
728720
a_broad = broadcasted;
@@ -748,9 +740,9 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
748740
}
749741

750742
// Create a new NDArray to store the result
751-
NDArray* result = (NDArray*)emalloc(sizeof(NDArray));
752-
result->strides = (int*)emalloc(a_broad->ndim * sizeof(int));
753-
result->dimensions = (int*)emalloc(a_broad->ndim * sizeof(int));
743+
NDArray *result = (NDArray *) emalloc(sizeof(NDArray));
744+
result->strides = (int *) emalloc(a_broad->ndim * sizeof(int));
745+
result->dimensions = (int *) emalloc(a_broad->ndim * sizeof(int));
754746
result->ndim = a_broad->ndim;
755747
if (NDArray_DEVICE(a_broad) == NDARRAY_DEVICE_GPU) {
756748
#if HAVE_CUBLAS
@@ -763,7 +755,7 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
763755
}
764756
result->base = NULL;
765757
result->flags = 0; // Set appropriate flags
766-
result->descriptor = (NDArrayDescriptor*)emalloc(sizeof(NDArrayDescriptor));
758+
result->descriptor = (NDArrayDescriptor *) emalloc(sizeof(NDArrayDescriptor));
767759
result->descriptor->type = NDARRAY_TYPE_FLOAT32;
768760
result->descriptor->elsize = sizeof(float);
769761
result->descriptor->numElements = a_broad->descriptor->numElements;
@@ -773,9 +765,9 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
773765
// Perform element-wise subtraction
774766
result->strides = memcpy(result->strides, a_broad->strides, a_broad->ndim * sizeof(int));
775767
result->dimensions = memcpy(result->dimensions, a_broad->dimensions, a_broad->ndim * sizeof(int));
776-
float* resultData = (float*)result->data;
777-
float* aData = (float*)a_broad->data;
778-
float* bData = (float*)b_broad->data;
768+
float *resultData = (float *) result->data;
769+
float *aData = (float *) a_broad->data;
770+
float *bData = (float *) b_broad->data;
779771
int numElements = a_broad->descriptor->numElements;
780772
NDArrayIterator_INIT(result);
781773
if (NDArray_DEVICE(a_broad) == NDARRAY_DEVICE_GPU && NDArray_DEVICE(b_broad) == NDARRAY_DEVICE_GPU) {
@@ -784,11 +776,27 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
784776
NDArray_NUMELEMENTS(a_broad));
785777
#endif
786778
} else {
779+
#ifdef HAVE_AVX2
780+
int i;
781+
__m256 vec1, vec2, vout;
782+
783+
for (i = 0; i < NDArray_NUMELEMENTS(a) - 7; i += 8) {
784+
vec1 = _mm256_loadu_ps(&aData[i]);
785+
vec2 = _mm256_loadu_ps(&bData[i]);
786+
vout = _mm256_sub_ps(vec1, _mm256_mul_ps(_mm256_floor_ps(_mm256_div_ps(vec1, vec2)), vec2));
787+
_mm256_storeu_ps(&resultData[i], vout);
788+
}
789+
790+
// Handle remaining elements if the length is not a multiple of 4
791+
for (; i < numElements; i++) {
792+
resultData[i] = fmodf(aData[i], bData[i]);
793+
}
794+
#else
787795
for (int i = 0; i < numElements; i++) {
788796
resultData[i] = fmodf(aData[i], bData[i]);
789797
}
798+
#endif
790799
}
791-
792800
if (a_temp != NULL) {
793801
NDArray_FREE(a);
794802
}
@@ -809,18 +817,11 @@ NDArray_Mod_Float(NDArray* a, NDArray* b) {
809817
NDArray*
810818
NDArray_Pow_Float(NDArray* a, NDArray* b) {
811819
NDArray *a_temp = NULL, *b_temp = NULL;
812-
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
820+
if (NDArray_DEVICE(a) != NDArray_DEVICE(b) && NDArray_NDIM(a) != 0 && NDArray_NDIM(b) != 0) {
813821
zend_throw_error(NULL, "Device mismatch, both NDArray MUST be in the same device.");
814822
return NULL;
815823
}
816824

817-
if (NDArray_NDIM(a) == 0) {
818-
int *shape = ecalloc(1, sizeof(int));
819-
NDArray *rtn = NDArray_Zeros(shape, 0, NDARRAY_TYPE_FLOAT32, NDArray_DEVICE(a));
820-
NDArray_FDATA(rtn)[0] = NDArray_FDATA(a)[0] + NDArray_FDATA(b)[0];
821-
return rtn;
822-
}
823-
824825
// If a or b are scalars, reshape
825826
if (NDArray_NDIM(a) == 0 && NDArray_NDIM(b) > 0) {
826827
a_temp = a;
@@ -870,12 +871,12 @@ NDArray_Pow_Float(NDArray* a, NDArray* b) {
870871
result->ndim = a_broad->ndim;
871872
if (NDArray_DEVICE(a_broad) == NDARRAY_DEVICE_GPU) {
872873
#if HAVE_CUBLAS
873-
vmalloc((void **) &result->data, NDArray_NUMELEMENTS(a) * sizeof(float));
874+
vmalloc((void **) &result->data, NDArray_NUMELEMENTS(a_broad) * sizeof(float));
874875
cudaDeviceSynchronize();
875876
result->device = NDARRAY_DEVICE_GPU;
876877
#endif
877878
} else {
878-
result->data = (char *) emalloc(a->descriptor->numElements * sizeof(float));
879+
result->data = (char *) emalloc(a_broad->descriptor->numElements * sizeof(float));
879880
}
880881
result->base = NULL;
881882
result->flags = 0; // Set appropriate flags
@@ -886,13 +887,13 @@ NDArray_Pow_Float(NDArray* a, NDArray* b) {
886887
result->refcount = 1;
887888
result->device = NDArray_DEVICE(a_broad);
888889

889-
// Perform element-wise
890+
// Perform element-wise subtraction
890891
result->strides = memcpy(result->strides, a_broad->strides, a_broad->ndim * sizeof(int));
891892
result->dimensions = memcpy(result->dimensions, a_broad->dimensions, a_broad->ndim * sizeof(int));
892893
float *resultData = (float *) result->data;
893894
float *aData = (float *) a_broad->data;
894895
float *bData = (float *) b_broad->data;
895-
int numElements = a->descriptor->numElements;
896+
int numElements = a_broad->descriptor->numElements;
896897
NDArrayIterator_INIT(result);
897898
if (NDArray_DEVICE(a_broad) == NDARRAY_DEVICE_GPU && NDArray_DEVICE(b_broad) == NDARRAY_DEVICE_GPU) {
898899
#if HAVE_CUBLAS

0 commit comments

Comments
 (0)