@@ -37,7 +37,7 @@ namespace H264Sharp
         RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 3>(rgb, dst, width, height, stride, 0, height);
     }
 
-
+
     template <int R_INDEX, int G_INDEX, int B_INDEX, int NUM_CH>
     inline void RGB2YUVP_ParallelBody_SIMD(
         const unsigned char* src,
@@ -58,19 +58,19 @@ namespace H264Sharp
         buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
         */
         // SIMD constants for YUV conversion
-        const uint16x8_t kB_Y = vdupq_n_u16(25);
-        const uint16x8_t kG_Y = vdupq_n_u16(129);
-        const uint16x8_t kR_Y = vdupq_n_u16(66);
+        const uint16_t kB_Y = 25;
+        const uint16_t kG_Y = 129;
+        const uint16_t kR_Y = 66;
         const uint16x8_t offset_Y = vdupq_n_u16(16);
 
 
-        const int16x8_t kR_U = vdupq_n_s16(112);
-        const int16x8_t kG_U = vdupq_n_s16(-94);
-        const int16x8_t kB_U = vdupq_n_s16(-18);
+        const int16_t kR_U = 112;
+        const int16_t kG_U = 94;
+        const int16_t kB_U = 18;
 
-        const int16x8_t kR_V = vdupq_n_s16(-38);
-        const int16x8_t kG_V = vdupq_n_s16(-74);
-        const int16x8_t kB_V = vdupq_n_s16(112);
+        const int16_t kR_V = 38;
+        const int16_t kG_V = 74;
+        const int16_t kB_V = 112;
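+        // These are the BT.601 studio-swing weights scaled by 256 (0.257*256 ≈ 66,
+        // 0.504*256 ≈ 129, 0.098*256 ≈ 25, 0.439*256 ≈ 112); the signs are folded
+        // into the subtractions below. Sanity check: a white pixel gives
+        // ((66 + 129 + 25) * 255 >> 8) + 16 = 235, the nominal Y maximum.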
 
         const int16x8_t offset_UV = vdupq_n_s16(128);
 
@@ -109,31 +109,32 @@ namespace H264Sharp
                 uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
 
                 // Y channel (unsigned, since the weighted sum overflows 8 bits)
-                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
-                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
+                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_n_u16(r_low, kR_Y), vmulq_n_u16(g_low, kG_Y)), vmulq_n_u16(b_low, kB_Y));
+                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_n_u16(r_high, kR_Y), vmulq_n_u16(g_high, kG_Y)), vmulq_n_u16(b_high, kB_Y));
                 // div 256 (shift 8) + offset
-                y_low = vshrq_n_u16(y_low, 8);
-                y_high = vshrq_n_u16(y_high, 8);
-                y_low = vaddq_u16(y_low, offset_Y);
-                y_high = vaddq_u16(y_high, offset_Y);
+                y_low = vaddq_u16(y_low >> 8, offset_Y);
+                y_high = vaddq_u16(y_high >> 8, offset_Y);
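+                // the weighted sum peaks at (66 + 129 + 25) * 255 = 56100, inside uint16_t,
+                // so nothing overflows before the >> 8; note that operator>> on NEON vector
+                // types relies on the GCC/Clang vector extensions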
 
                 // shrink, combine, store
 
                 vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
                 yIndex += 16;
 
+                // buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
+                // buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
+
                 // we need signed values for the U/V math
                 int16x8_t r_low_signed = vreinterpretq_s16_u16(r_low);
                 int16x8_t g_low_signed = vreinterpretq_s16_u16(g_low);
                 int16x8_t b_low_signed = vreinterpretq_s16_u16(b_low);
 
                 // Compute U channel (average over 2x2 blocks)
-                int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U));
-                u = vaddq_s16(vshrq_n_s16(u, 8), offset_UV);
+                int16x8_t u = vsubq_s16(vsubq_s16(vmulq_n_s16(r_low_signed, kR_U), vmulq_n_s16(g_low_signed, kG_U)), vmulq_n_s16(b_low_signed, kB_U));
+                u = vaddq_s16(u >> 8, offset_UV);
 
                 // Compute V channel
-                int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(b_low_signed, kB_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(r_low_signed, kR_V));
-                v = vaddq_s16(vshrq_n_s16(v, 8), offset_UV);
+                int16x8_t v = vsubq_s16(vsubq_s16(vmulq_n_s16(b_low_signed, kB_V), vmulq_n_s16(g_low_signed, kG_V)), vmulq_n_s16(r_low_signed, kR_V));
+                v = vaddq_s16(v >> 8, offset_UV);
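+                // with the +128 offset, u and v land in roughly [17, 239]; the saturating
+                // narrow vqmovun_s16 below clamps anything outside [0, 255]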
 
                 // Store U and V
                 vst1_u8(&buffer[uIndex], vqmovun_s16(u));
@@ -147,7 +148,7 @@ namespace H264Sharp
             // add stride offset here..
             index = stride * (row + 1);
             // second row: Y only
-            for (int i = 0; i < width; i += 16) {
+            for (int i = 0; i < width / 16; i++) {
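+                // assumes width is a multiple of 16; a scalar tail loop would be needed otherwise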
                 // Load 16 pixels (48 or 64 bytes) from src
                 uint8x16_t r, g, b;
                 if constexpr (NUM_CH == 4) {
@@ -165,7 +166,6 @@ namespace H264Sharp
                     b = pixels.val[B_INDEX];
                 }
 
-
                 // Widen to 16 bits unsigned
                 uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
                 uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
@@ -175,15 +175,14 @@ namespace H264Sharp
                 uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
 
                 // Y channel (unsigned, since the weighted sum overflows 8 bits)
-                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
-                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
+                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_n_u16(r_low, kR_Y), vmulq_n_u16(g_low, kG_Y)), vmulq_n_u16(b_low, kB_Y));
+                uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_n_u16(r_high, kR_Y), vmulq_n_u16(g_high, kG_Y)), vmulq_n_u16(b_high, kB_Y));
                 // div 256 (shift 8) + offset
-                y_low = vshrq_n_u16(y_low, 8);
-                y_high = vshrq_n_u16(y_high, 8);
-                y_low = vaddq_u16(y_low, offset_Y);
-                y_high = vaddq_u16(y_high, offset_Y);
+                y_low = vaddq_u16(y_low >> 8, offset_Y);
+                y_high = vaddq_u16(y_high >> 8, offset_Y);
 
                 // shrink, combine, store
+
                 vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
                 yIndex += 16;
 
@@ -197,6 +196,142 @@ namespace H264Sharp
 
 
 
+    template <int R_INDEX, int G_INDEX, int B_INDEX, int NUM_CH>
+    inline void RGB2YUVP_ParallelBody_SIMDv2(
+        const unsigned char* src,
+        unsigned char* dst,
+        const int width,
+        const int height,
+        const int stride,
+        const int begin,
+        const int end
+    ) {
+
+        unsigned char* buffer = dst;
+        /*
+        buffer[yIndex++] = ((25 * b + 129 * g + 66 * r) >> 8) + 16;
+        buffer[yIndex++] = ((25 * b1 + 129 * g1 + 66 * r1) >> 8) + 16;
+
+        buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
+        buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
+        */
+        // SIMD constants for YUV conversion
+        const uint16_t kB_Y = 25;
+        const uint16_t kG_Y = 129;
+        const uint16_t kR_Y = 66;
+        const uint16x8_t offset_Y = vdupq_n_u16(16);
+
+
+        const int16_t kR_U = 112;
+        const int16_t kG_U = -94;
+        const int16_t kB_U = -18;
+
+        const int16_t kR_V = -38;
+        const int16_t kG_V = -74;
+        const int16_t kB_V = 112;
+
+        const int16x8_t offset_UV = vdupq_n_s16(128);
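+        // unlike the body above, the signs stay in these constants so that
+        // vmlaq_s16 can accumulate all three terms directly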
+
+        int index = 0;
+        int nextLineindex = stride;
+
+        int yIndex = 0;
+        int yIndexn = yIndex + width;
+
+        int uIndex = width * height;
+        int vIndex = uIndex + (uIndex >> 2);
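+        // planar 4:2:0 layout: the Y plane is width*height bytes, U follows it,
+        // and V starts after U's (width/2)*(height/2) = (width*height) >> 2 bytes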
+        // Loop over the specified range of rows, two at a time
+        for (int row = begin; row < end; row += 2) {
+            // both rows in one pass; U/V come from 2x2 block averages
+
+            for (int i = 0; i < width; i += 16) {
+                // Load 16 pixels from this row and the row below
+                uint8x16_t r, g, b, rn, gn, bn;
+                if constexpr (NUM_CH == 4) {
+                    uint8x16x4_t pixels = vld4q_u8(&src[index]);
+                    uint8x16x4_t pixelsn = vld4q_u8(&src[nextLineindex]);
+
+                    r = pixels.val[R_INDEX];
+                    g = pixels.val[G_INDEX];
+                    b = pixels.val[B_INDEX];
+
+                    rn = pixelsn.val[R_INDEX];
+                    gn = pixelsn.val[G_INDEX];
+                    bn = pixelsn.val[B_INDEX];
+                }
+                else {
+                    uint8x16x3_t pixels = vld3q_u8(&src[index]);
+                    uint8x16x3_t pixelsn = vld3q_u8(&src[nextLineindex]);
+
+                    r = pixels.val[R_INDEX];
+                    g = pixels.val[G_INDEX];
+                    b = pixels.val[B_INDEX];
+
+                    rn = pixelsn.val[R_INDEX];
+                    gn = pixelsn.val[G_INDEX];
+                    bn = pixelsn.val[B_INDEX];
+                }
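+                // vld3q/vld4q deinterleave the channels into one uint8x16_t per channel;
+                // rn/gn/bn hold the row below, needed for the 2x2 chroma averages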
+
+
+                // Split into low/high 8-lane halves; vmull_u8/vmlal_u8 below do the widening
+                uint8x8_t r_low = vget_low_u8(r), r_high = vget_high_u8(r);
+                uint8x8_t g_low = vget_low_u8(g), g_high = vget_high_u8(g);
+                uint8x8_t b_low = vget_low_u8(b), b_high = vget_high_u8(b);
+
+                uint8x8_t r_lown = vget_low_u8(rn), r_highn = vget_high_u8(rn);
+                uint8x8_t g_lown = vget_low_u8(gn), g_highn = vget_high_u8(gn);
+                uint8x8_t b_lown = vget_low_u8(bn), b_highn = vget_high_u8(bn);
+
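+                // vmull_u8/vmlal_u8 fuse the widening with the multiply-accumulate (8-bit lanes
+                // into 16-bit sums), and vqshrn_n_u16(sum, 8) performs the >> 8 together with a
+                // saturating narrow back to 8 bits, so the +16 offset can be added in 8-bit lanes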
+                uint8x8_t y00 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r_low, vdup_n_u8(kR_Y)), g_low, vdup_n_u8(kG_Y)), b_low, vdup_n_u8(kB_Y)), 8);
+                uint8x8_t y01 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r_high, vdup_n_u8(kR_Y)), g_high, vdup_n_u8(kG_Y)), b_high, vdup_n_u8(kB_Y)), 8);
+                uint8x8_t y10 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r_lown, vdup_n_u8(kR_Y)), g_lown, vdup_n_u8(kG_Y)), b_lown, vdup_n_u8(kB_Y)), 8);
+                uint8x8_t y11 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r_highn, vdup_n_u8(kR_Y)), g_highn, vdup_n_u8(kG_Y)), b_highn, vdup_n_u8(kB_Y)), 8);
+
+                // combine the halves, add the +16 offset (saturating), store 16 Y values per row
+                uint8x16_t y000 = vqaddq_u8(vcombine_u8(y00, y01), vdupq_n_u8(16));
+                uint8x16_t y100 = vqaddq_u8(vcombine_u8(y10, y11), vdupq_n_u8(16));
+
+                vst1q_u8(&buffer[yIndex], y000); yIndex += 16;
+                vst1q_u8(&buffer[yIndexn], y100); yIndexn += 16;
+
+
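+                // 2x2 chroma block sums: vpaddlq_u8 adds horizontal pairs within this row and
+                // vpadalq_u8 accumulates the pairs of the row below, i.e. per lane:
+                // r000[i] = r[2i] + r[2i+1] + rn[2i] + rn[2i+1]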
+                int16x8_t r000 = vreinterpretq_s16_u16(vpadalq_u8(vpaddlq_u8(r), rn));
+                int16x8_t g000 = vreinterpretq_s16_u16(vpadalq_u8(vpaddlq_u8(g), gn));
+                int16x8_t b000 = vreinterpretq_s16_u16(vpadalq_u8(vpaddlq_u8(b), bn));
+
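+                // the 2x2 sums carry a factor of 4, so the coefficients are pre-divided by four;
+                // 112 >> 2 = 28 restores 112 exactly, while -94 >> 2 = -24 (i.e. -96/4) costs
+                // about 2% of accuracy on the green term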
+                int8x8_t u00s = vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, vdupq_n_s16(kR_U >> 2)), g000, vdupq_n_s16(kG_U >> 2)), b000, vdupq_n_s16(kB_U >> 2)), 8);
+                int8x8_t v00s = vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, vdupq_n_s16(kR_V >> 2)), g000, vdupq_n_s16(kG_V >> 2)), b000, vdupq_n_s16(kB_V >> 2)), 8);
+
+                uint8x8_t u00 = vadd_u8(vreinterpret_u8_s8(u00s), vdup_n_u8(128));
+                uint8x8_t v00 = vadd_u8(vreinterpret_u8_s8(v00s), vdup_n_u8(128));
+
+                vst1_u8(&buffer[uIndex], u00);
+                vst1_u8(&buffer[vIndex], v00);
+                uIndex += 8;
+                vIndex += 8;
+
+                index += NUM_CH * 16;
+                nextLineindex += NUM_CH * 16;
+            }
+
+            // jump to the next row pair; the Y indices skip over the row already
+            // written through yIndexn
+            index = stride * (row + 2);
+            nextLineindex = stride * (row + 3);
+            yIndex += width;
+            yIndexn += width;
+
+        }
+    }
+
+
+
 }
 
 #endif