Skip to content

Commit 2152d7b

Browse files
feat: Use AVX2 for image loading and saving
1 parent 812d4bc commit 2152d7b

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

src/ndarray.c

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,33 @@ NDArray_FromGD(zval *a) {
101101
i_shape[1] = (int)img_ptr->sy;
102102
i_shape[2] = (int)img_ptr->sx;
103103
rtn = NDArray_Zeros(i_shape, 3, NDARRAY_TYPE_FLOAT32, NDARRAY_DEVICE_CPU);
104+
int elsize = NDArray_ELSIZE(rtn);
105+
106+
#ifdef HAVE_AVX2
107+
__m256i red_mask = _mm256_set_epi32(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
108+
for (int i = 0; i < img_ptr->sy; i++) {
109+
for (int j = 0; j < img_ptr->sx; j += 8) {
110+
offset_red = (NDArray_STRIDES(rtn)[0] / elsize * 0) +
111+
((NDArray_STRIDES(rtn)[1] / elsize) * i) +
112+
((NDArray_STRIDES(rtn)[2] / elsize) * j);
113+
offset_green = ((NDArray_STRIDES(rtn)[0] / elsize) * 1) +
114+
((NDArray_STRIDES(rtn)[1] / elsize) * i) +
115+
((NDArray_STRIDES(rtn)[2] / elsize) * j);
116+
offset_blue = ((NDArray_STRIDES(rtn)[0] / elsize) * 2) +
117+
((NDArray_STRIDES(rtn)[1] / elsize) * i) +
118+
((NDArray_STRIDES(rtn)[2] / elsize) * j);
119+
120+
__m256i color_indices = _mm256_loadu_si256((__m256i*)&img_ptr->tpixels[i][j]);
121+
__m256i red_shifted = _mm256_and_si256(_mm256_srli_epi32(color_indices, 16), red_mask);
122+
__m256i green_shifted = _mm256_and_si256(_mm256_srli_epi32(color_indices, 8), red_mask);
123+
__m256i blue_shifted = _mm256_and_si256(color_indices, red_mask);
124+
125+
_mm256_storeu_ps(&NDArray_FDATA(rtn)[offset_red], _mm256_cvtepi32_ps(red_shifted));
126+
_mm256_storeu_ps(&NDArray_FDATA(rtn)[offset_green], _mm256_cvtepi32_ps(green_shifted));
127+
_mm256_storeu_ps(&NDArray_FDATA(rtn)[offset_blue], _mm256_cvtepi32_ps(blue_shifted));
128+
}
129+
}
130+
#else
104131
for (int i = 0; i < img_ptr->sy; i++) {
105132
for (int j = 0; j < img_ptr->sx; j++) {
106133
offset_red = (NDArray_STRIDES(rtn)[0]/ NDArray_ELSIZE(rtn) * 0) +
@@ -121,6 +148,7 @@ NDArray_FromGD(zval *a) {
121148
NDArray_FDATA(rtn)[offset_green] = (float)green;
122149
}
123150
}
151+
#endif
124152
return rtn;
125153
}
126154

@@ -135,6 +163,64 @@ NDArray_ToGD(NDArray *a, NDArray *n_alpha, zval *output) {
135163
int red, green, blue, alpha;
136164
char *tmp_red, *tmp_blue, *tmp_green;
137165
gdImagePtr im = gdImageCreateTrueColor_(NDArray_SHAPE(a)[2], NDArray_SHAPE(a)[1]);
166+
167+
#ifdef HAVE_AVX2
168+
int elsize = NDArray_ELSIZE(a);
169+
__m256i alpha_mask = _mm256_set_epi32(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
170+
171+
for (int i = 0; i < im->sy; i++) {
172+
for (int j = 0; j < im->sx; j += 8) {
173+
offset_alpha = (NDArray_STRIDES(a)[0] / elsize * i) +
174+
((NDArray_STRIDES(a)[1] / elsize) * j);
175+
offset_red = (NDArray_STRIDES(a)[0] / elsize * 0) +
176+
((NDArray_STRIDES(a)[1] / elsize) * i) +
177+
((NDArray_STRIDES(a)[2] / elsize) * j);
178+
offset_green = ((NDArray_STRIDES(a)[0] / elsize) * 1) +
179+
((NDArray_STRIDES(a)[1] / elsize) * i) +
180+
((NDArray_STRIDES(a)[2] / elsize) * j);
181+
offset_blue = ((NDArray_STRIDES(a)[0] / elsize) * 2) +
182+
((NDArray_STRIDES(a)[1] / elsize) * i) +
183+
((NDArray_STRIDES(a)[2] / elsize) * j);
184+
185+
__m256 red_values = _mm256_loadu_ps(&NDArray_FDATA(a)[offset_red]);
186+
__m256 green_values = _mm256_loadu_ps(&NDArray_FDATA(a)[offset_green]);
187+
__m256 blue_values = _mm256_loadu_ps(&NDArray_FDATA(a)[offset_blue]);
188+
189+
if (n_alpha != NULL) {
190+
__m256i alpha_values = _mm256_cvtps_epi32(_mm256_loadu_ps(&NDArray_FDATA(n_alpha)[offset_alpha]));
191+
alpha_values = _mm256_and_si256(alpha_values, alpha_mask);
192+
} else {
193+
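// n_alpha is NULL: no alpha channel was supplied by the caller.
194+
// NOTE(review): the alpha_values declared just below (255 in every lane) is local to this
195+
// branch and never read; the packing step further down re-checks n_alpha and packs only red, green and blue. Whether 255 means "opaque" for libgd's alpha range is unverified - TODO confirm.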
196+
__m256i alpha_values = _mm256_set1_epi32(255);
197+
}
198+
199+
__m256i red_int = _mm256_cvtps_epi32(red_values);
200+
__m256i green_int = _mm256_cvtps_epi32(green_values);
201+
__m256i blue_int = _mm256_cvtps_epi32(blue_values);
202+
203+
__m256i color_indices;
204+
205+
if (n_alpha != NULL) {
206+
__m256i alpha_values = _mm256_cvtps_epi32(_mm256_loadu_ps(&NDArray_FDATA(n_alpha)[offset_alpha]));
207+
alpha_values = _mm256_and_si256(alpha_values, alpha_mask);
208+
209+
color_indices = _mm256_or_si256(_mm256_or_si256(_mm256_slli_epi32(alpha_values, 24),
210+
_mm256_slli_epi32(red_int, 16)),
211+
_mm256_or_si256(_mm256_slli_epi32(green_int, 8), blue_int));
212+
} else {
213+
// Handle the case when n_alpha is NULL
214+
// Set color_indices using only red, green, and blue values
215+
color_indices = _mm256_or_si256(_mm256_or_si256(_mm256_slli_epi32(red_int, 16),
216+
_mm256_slli_epi32(green_int, 8)),
217+
blue_int);
218+
}
219+
220+
_mm256_storeu_si256((__m256i*)&im->tpixels[i][j], color_indices);
221+
}
222+
}
223+
#else
138224
for (int i = 0; i < im->sy; i++) {
139225
for (int j = 0; j < im->sx; j++) {
140226
offset_alpha = (NDArray_STRIDES(a)[0]/ NDArray_ELSIZE(a) * i) +
@@ -160,8 +246,10 @@ NDArray_ToGD(NDArray *a, NDArray *n_alpha, zval *output) {
160246
im->tpixels[i][j] = color_index;
161247
}
162248
}
249+
#endif
163250
php_gd_assign_libgdimageptr_as_extgdimage(output, im);
164251
}
252+
165253
#endif
166254

167255
void apply_reduce(NDArray* result, NDArray *target, NDArray* (*operation)(NDArray*, NDArray*)) {

src/ndmath/linalg.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1080,7 +1080,7 @@ convolve2d_same_float(const float* a, const float* b, const int* shape_a,
10801080
NDArray*
10811081
NDArray_Convolve2D(NDArray *a, NDArray *b, char mode, char boundary, float fill_value) {
10821082
if (NDArray_DEVICE(a) != NDArray_DEVICE(b)) {
1083-
zend_throw_error(NULL, "Device error.");
1083+
zend_throw_error(NULL, "Both arrays must be at the same device.");
10841084
return NULL;
10851085
}
10861086

0 commit comments

Comments
 (0)