
Commit 5258307

Dogancan Ozturk committed:

rgbyuv neon v2
1 parent c2216ef commit 5258307

File tree

1 file changed: +163 -28 lines changed

H264SharpNative/Rgb2YuvNEON.cpp

Lines changed: 163 additions & 28 deletions
@@ -37,7 +37,7 @@ namespace H264Sharp
         RGB2YUVP_ParallelBody_SIMD<2,1,0,3>(rgb, dst, width, height, stride, 0, height);
     }
 
-
+
     template <int R_INDEX, int G_INDEX, int B_INDEX, int NUM_CH>
     inline void RGB2YUVP_ParallelBody_SIMD(
         const unsigned char* src,
@@ -58,19 +58,19 @@ namespace H264Sharp
         buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
         */
         // SIMD constants for YUV conversion
-        const uint16x8_t kB_Y = vdupq_n_u16(25);
-        const uint16x8_t kG_Y = vdupq_n_u16(129);
-        const uint16x8_t kR_Y = vdupq_n_u16(66);
+        const uint16_t kB_Y = 25;
+        const uint16_t kG_Y = 129;
+        const uint16_t kR_Y = 66;
         const uint16x8_t offset_Y = vdupq_n_u16(16);
 
 
-        const int16x8_t kR_U = vdupq_n_s16(112);
-        const int16x8_t kG_U = vdupq_n_s16(-94);
-        const int16x8_t kB_U = vdupq_n_s16(-18);
+        const int16_t kR_U = 112;
+        const int16_t kG_U = 94;
+        const int16_t kB_U = 18;
 
-        const int16x8_t kR_V = vdupq_n_s16(-38);
-        const int16x8_t kG_V = vdupq_n_s16(-74);
-        const int16x8_t kB_V = vdupq_n_s16(112);
+        const int16_t kR_V = 38;
+        const int16_t kG_V = 74;
+        const int16_t kB_V = 112;
 
         const int16x8_t offset_UV = vdupq_n_s16(128);
 
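The constants become plain scalars because they now feed vmulq_n_u16 / vmulq_n_s16, which multiply every lane by a scalar immediate, so the per-lane vdupq_n_* broadcasts are no longer needed. For reference, here is a minimal scalar sketch of the same fixed-point conversion the constants encode, mirroring the comment block quoted above (rgb2yuv_pixel is an illustrative helper name, not part of the commit):

    #include <cstdint>

    // Scalar model of the fixed-point RGB -> YUV math used by the SIMD path.
    inline void rgb2yuv_pixel(uint8_t r, uint8_t g, uint8_t b,
                              uint8_t& y, uint8_t& u, uint8_t& v)
    {
        y = static_cast<uint8_t>(((66 * r + 129 * g + 25 * b) >> 8) + 16);
        u = static_cast<uint8_t>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
        v = static_cast<uint8_t>(((-38 * r - 74 * g + 112 * b) >> 8) + 128);
    }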
@@ -109,31 +109,32 @@ namespace H264Sharp
                uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
 
                // Y Channel (Unsigned because we can overflow)
-                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
-                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
+                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_n_u16(r_low, kR_Y), vmulq_n_u16(g_low, kG_Y)), vmulq_n_u16(b_low, kB_Y));
+                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_n_u16(r_high, kR_Y), vmulq_n_u16(g_high, kG_Y)), vmulq_n_u16(b_high, kB_Y));
                // div 256 (shift 8) + offset
-                y_low = vshrq_n_u16(y_low, 8);
-                y_high = vshrq_n_u16(y_high, 8);
-                y_low = vaddq_u16(y_low, offset_Y);
-                y_high = vaddq_u16(y_high, offset_Y);
+                y_low = vaddq_u16(y_low >> 8, offset_Y);
+                y_high = vaddq_u16(y_high >> 8, offset_Y);
 
                // shrink, combine, store
                vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
                yIndex += 16;
 
+                //buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
+                //buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
+
                // we need signed here
                int16x8_t r_low_signed = vreinterpretq_s16_u16(r_low);
                int16x8_t g_low_signed = vreinterpretq_s16_u16(g_low);
                int16x8_t b_low_signed = vreinterpretq_s16_u16(b_low);
 
                // Compute U channel (average over 2x2 blocks)
-                int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U));
-                u = vaddq_s16(vshrq_n_s16(u, 8), offset_UV);
+                int16x8_t u = vsubq_s16(vsubq_s16(vmulq_n_s16(r_low_signed, kR_U), vmulq_n_s16(g_low_signed, kG_U)), vmulq_n_s16(b_low_signed, kB_U));
+                u = vaddq_s16(u >> 8, offset_UV);
 
                // Compute V channel
-                int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(b_low_signed, kB_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(r_low_signed, kR_V));
-                v = vaddq_s16(vshrq_n_s16(v, 8), offset_UV);
+                int16x8_t v = vsubq_s16(vsubq_s16(vmulq_n_s16(b_low_signed, kB_V), vmulq_n_s16(g_low_signed, kG_V)), vmulq_n_s16(r_low_signed, kR_V));
+                v = vaddq_s16(v >> 8, offset_UV);
 
                // Store U and V
                vst1_u8(&buffer[uIndex], vqmovun_s16(u));
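Note the sign handling in this hunk: the U/V coefficients are now stored as positive magnitudes and the negative terms are applied with vsubq_s16, so 112·r − 94·g − 18·b replaces 112·r + (−94)·g + (−18)·b; the two forms are algebraically identical while vmulq_n_s16 keeps the multipliers out of vector registers. A trivial scalar check of the equivalence, assuming nothing beyond standard C++ (illustrative values):

    #include <cassert>

    // Subtracting positive products matches adding negative ones.
    int main() {
        int r = 200, g = 100, b = 50;
        int u_add = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
        int u_sub = (((112 * r) - (94 * g) - (18 * b)) >> 8) + 128;
        assert(u_add == u_sub);
        return 0;
    }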
@@ -147,7 +148,7 @@ namespace H264Sharp
            // add stride offset here..
            index = stride * (row+1);
            // second row only Y
-            for (int i = 0; i < width; i += 16) {
+            for (int i = 0; i < width/16; i++) {
                // Load 16 pixels (48 bytes) from src
                uint8x16_t r, g, b;
                if constexpr (NUM_CH == 4) {
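The rewritten bound runs the same width/16 iterations as before when width is a multiple of 16; the counter i is not used as a pixel offset (index tracks the byte position), so behavior is unchanged under that assumption. A small illustrative check:

    #include <cassert>

    // Both loop forms execute width / 16 iterations when 16 divides width.
    int main() {
        const int width = 1280; // illustrative value
        int a = 0, b = 0;
        for (int i = 0; i < width; i += 16) ++a;  // old form
        for (int i = 0; i < width / 16; i++) ++b; // new form
        assert(a == b && a == width / 16);
        return 0;
    }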
@@ -165,7 +166,6 @@ namespace H264Sharp
                    b = pixels.val[B_INDEX];
                }
 
-
                // Widen to 16 bits unsigned
                uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
                uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
@@ -175,15 +175,14 @@ namespace H264Sharp
                uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
 
                // Y Channel (Unsigned because we can overflow)
-                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
-                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
+                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_n_u16(r_low, kR_Y), vmulq_n_u16(g_low, kG_Y)), vmulq_n_u16(b_low, kB_Y));
+                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_n_u16(r_high, kR_Y), vmulq_n_u16(g_high, kG_Y)), vmulq_n_u16(b_high, kB_Y));
                // div 256 (shift 8) + offset
-                y_low = vshrq_n_u16(y_low, 8);
-                y_high = vshrq_n_u16(y_high, 8);
-                y_low = vaddq_u16(y_low, offset_Y);
-                y_high = vaddq_u16(y_high, offset_Y);
+                y_low = vaddq_u16(y_low >> 8, offset_Y);
+                y_high = vaddq_u16(y_high >> 8, offset_Y);
 
                // shrink, combine, store
+
                vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
                yIndex += 16;
 
@@ -197,6 +196,142 @@ namespace H264Sharp
 
 
 
+    template <int R_INDEX, int G_INDEX, int B_INDEX, int NUM_CH>
+    inline void RGB2YUVP_ParallelBody_SIMDv2(
+        const unsigned char* src,
+        unsigned char* dst,
+        const int width,
+        const int height,
+        const int stride,
+        const int begin,
+        const int end
+    ) {
+        unsigned char* buffer = dst;
+        /*
+        buffer[yIndex++] = ((25 * b + 129 * g + 66 * r) >> 8) + 16;
+        buffer[yIndex++] = ((25 * b1 + 129 * g1 + 66 * r1) >> 8) + 16;
+
+        buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
+        buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
+        */
+        // SIMD constants for YUV conversion
+        const uint16_t kB_Y = 25;
+        const uint16_t kG_Y = 129;
+        const uint16_t kR_Y = 66;
+        const uint16x8_t offset_Y = vdupq_n_u16(16);
+
+        const int16_t kR_U = 112;
+        const int16_t kG_U = -94;
+        const int16_t kB_U = -18;
+
+        const int16_t kR_V = -38;
+        const int16_t kG_V = -74;
+        const int16_t kB_V = 112;
+
+        const int16x8_t offset_UV = vdupq_n_s16(128);
+
+        int index = 0;
+        int nextLineindex = stride;
+
+        int yIndex = 0;
+        int yIndexn = yIndex + width;
+
+        int uIndex = width * height;
+        int vIndex = uIndex + (uIndex >> 2);
+        // Loop over the specified range of rows, two rows per iteration
+        for (int row = begin; row < end; row += 2) {
+            // each pass emits Y for both rows and one UV pair per 2x2 block
+            for (int i = 0; i < width; i += 16) {
+                // Load 16 pixels from each of the two rows (48 or 64 bytes each)
+                uint8x16_t r, g, b, rn, gn, bn;
+                if constexpr (NUM_CH == 4) {
+                    uint8x16x4_t pixels = vld4q_u8(&src[index]);
+                    uint8x16x4_t pixelsn = vld4q_u8(&src[nextLineindex]);
+
+                    r = pixels.val[R_INDEX];
+                    g = pixels.val[G_INDEX];
+                    b = pixels.val[B_INDEX];
+
+                    rn = pixelsn.val[R_INDEX];
+                    gn = pixelsn.val[G_INDEX];
+                    bn = pixelsn.val[B_INDEX];
+                }
+                else {
+                    uint8x16x3_t pixels = vld3q_u8(&src[index]);
+                    uint8x16x3_t pixelsn = vld3q_u8(&src[nextLineindex]);
+
+                    r = pixels.val[R_INDEX];
+                    g = pixels.val[G_INDEX];
+                    b = pixels.val[B_INDEX];
+
+                    rn = pixelsn.val[R_INDEX];
+                    gn = pixelsn.val[G_INDEX];
+                    bn = pixelsn.val[B_INDEX];
+                }
+
+                // Y: widening multiply-accumulate on the 8-bit halves, then saturating narrow (>> 8)
+                uint8x8_t y00 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(vget_low_u8(r), vdup_n_u8(kR_Y)), vget_low_u8(g), vdup_n_u8(kG_Y)), vget_low_u8(b), vdup_n_u8(kB_Y)), 8);
+                uint8x8_t y01 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(vget_high_u8(r), vdup_n_u8(kR_Y)), vget_high_u8(g), vdup_n_u8(kG_Y)), vget_high_u8(b), vdup_n_u8(kB_Y)), 8);
+                uint8x8_t y10 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(vget_low_u8(rn), vdup_n_u8(kR_Y)), vget_low_u8(gn), vdup_n_u8(kG_Y)), vget_low_u8(bn), vdup_n_u8(kB_Y)), 8);
+                uint8x8_t y11 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(vget_high_u8(rn), vdup_n_u8(kR_Y)), vget_high_u8(gn), vdup_n_u8(kG_Y)), vget_high_u8(bn), vdup_n_u8(kB_Y)), 8);
+
+                // combine halves and add the +16 luma offset with saturation
+                uint8x16_t y000 = vqaddq_u8(vcombine_u8(y00, y01), vdupq_n_u8(16));
+                uint8x16_t y100 = vqaddq_u8(vcombine_u8(y10, y11), vdupq_n_u8(16));
+
+                vst1q_u8(&buffer[yIndex], y000); yIndex += 16;
+                vst1q_u8(&buffer[yIndexn], y100); yIndexn += 16;
+
+                // 2x2 block sums: pairwise add within the row, accumulate the next row
+                int16x8_t r000 = vreinterpretq_s16_u16(vpadalq_u8(vpaddlq_u8(r), rn));
+                int16x8_t g000 = vreinterpretq_s16_u16(vpadalq_u8(vpaddlq_u8(g), gn));
+                int16x8_t b000 = vreinterpretq_s16_u16(vpadalq_u8(vpaddlq_u8(b), bn));
+
+                // quarter-scaled constants fold the /4 average into the multiply
+                uint8x8_t u00 = vreinterpret_u8_s8(vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, vdupq_n_s16(kR_U >> 2)), g000, vdupq_n_s16(kG_U >> 2)), b000, vdupq_n_s16(kB_U >> 2)), 8));
+                uint8x8_t v00 = vreinterpret_u8_s8(vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, vdupq_n_s16(kR_V >> 2)), g000, vdupq_n_s16(kG_V >> 2)), b000, vdupq_n_s16(kB_V >> 2)), 8));
+
+                u00 = vadd_u8(u00, vdup_n_u8(128));
+                v00 = vadd_u8(v00, vdup_n_u8(128));
+
+                vst1_u8(&buffer[uIndex], u00);
+                vst1_u8(&buffer[vIndex], v00);
+                uIndex += 8;
+                vIndex += 8;
+
+                index += NUM_CH * 16;
+                nextLineindex += NUM_CH * 16;
+            }
+
+            // skip over the row the other Y pointer just wrote
+            yIndex += width;
+            yIndexn += width;
+
+            index = stride * (row + 2);
+            nextLineindex = index + stride;
+        }
+    }
+
+
 }
 
 #endif
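In the v2 luma path, vmull_u8 / vmlal_u8 fuse the widening and the multiply-accumulate (8-bit by 8-bit producing 16-bit lanes), vqshrn_n_u16 performs the >> 8 as a saturating narrow back to 8 bits, and vqaddq_u8 adds the +16 offset with saturation. A one-sample scalar model of that pipeline, assuming only the formulas quoted in the file (luma_v2_model is a hypothetical name):

    #include <algorithm>
    #include <cstdint>

    // One-sample model of the v2 luma path: widening MAC, narrowing shift, saturating +16.
    inline uint8_t luma_v2_model(uint8_t r, uint8_t g, uint8_t b)
    {
        uint16_t acc = uint16_t(66 * r + 129 * g + 25 * b); // vmull_u8 + two vmlal_u8; max 220*255 fits in 16 bits
        uint16_t y = acc >> 8;                              // vqshrn_n_u16(acc, 8)
        return uint8_t(std::min<int>(y + 16, 255));         // vqaddq_u8(..., vdupq_n_u8(16))
    }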

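For chroma, vpaddlq_u8 sums horizontally adjacent pixels into 16-bit lanes and vpadalq_u8 accumulates the second row on top, so each lane holds the sum of one 2x2 block; multiplying by the quarter-scaled constants (k >> 2) folds the divide-by-4 average into the fixed-point multiply, and vrshrn_n_s16(..., 8) applies a rounded >> 8 before the +128 bias. Quartering with >> loses a little precision on the negative constants (-94 >> 2 is -24, not -23.5). A scalar model of the U computation for one 2x2 block, with hypothetical names:

    #include <cstdint>

    // rSum4/gSum4/bSum4 are the per-channel sums of the block's four samples,
    // i.e. what vpaddlq_u8 + vpadalq_u8 produce per lane.
    inline uint8_t chroma_u_v2_model(int16_t rSum4, int16_t gSum4, int16_t bSum4)
    {
        // (k >> 2) * sum-of-4 approximates k * average-of-4
        int acc = (112 >> 2) * rSum4 + (-94 >> 2) * gSum4 + (-18 >> 2) * bSum4;
        int u = (acc + 128) >> 8;   // vrshrn_n_s16(acc, 8): rounding shift right
        return uint8_t(u + 128);    // +128 bias, wrapping like vadd_u8
    }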