Skip to content

Commit d8de7d8

Browse files
author
Dogancan Ozturk
committed
added rgb 2 yuv neon first version
1 parent c3b4528 commit d8de7d8

File tree

4 files changed

+282
-0
lines changed

4 files changed

+282
-0
lines changed

H264SharpNative/H264SharpNative.vcxproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@
345345
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">Create</PrecompiledHeader>
346346
</ClCompile>
347347
<ClCompile Include="Rgb2Yuv.cpp" />
348+
<ClCompile Include="Rgb2YuvNEON.cpp" />
348349
<ClCompile Include="ThreadPool.cpp" />
349350
<ClCompile Include="Yuv2RgbNEON.cpp" />
350351
<ClCompile Include="Yuv2RgbSSE.cpp" />

H264SharpNative/H264SharpNative.vcxproj.filters

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@
9595
<ClCompile Include="Yuv2RgbNEON.cpp">
9696
<Filter>Source Files</Filter>
9797
</ClCompile>
98+
<ClCompile Include="Rgb2YuvNEON.cpp">
99+
<Filter>Source Files</Filter>
100+
</ClCompile>
98101
</ItemGroup>
99102
<ItemGroup>
100103
<Text Include="CMakeLists.txt" />

H264SharpNative/Rgb2Yuv.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ namespace H264Sharp
99
static void BGRtoYUV420Planar(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount);
1010
static void RGBAtoYUV420Planar(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount);
1111
static void RGBtoYUV420Planar(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount);
12+
13+
static void BGRAtoYUV420PlanarNeon(const unsigned char* bgra, unsigned char* dst, int width, int height, int stride, int threadCount);
14+
static void BGRtoYUV420PlanarNeon(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount);
15+
static void RGBAtoYUV420PlanarNeon(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount);
16+
static void RGBtoYUV420PlanarNeon(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount);
1217
};
1318

1419
}

H264SharpNative/Rgb2YuvNEON.cpp

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
#include "Rgb2Yuv.h"
2+
#include <arm_neon.h>
3+
#include <cstdint>
4+
5+
namespace H264Sharp
6+
{
7+
void Rgb2Yuv::BGRAtoYUV420PlanarNeon(const unsigned char* bgra, unsigned char* dst, int width, int height, int stride, int threadCount)
8+
{
9+
RGBA2YUVP_ParallelBody_SIMD<2,1,0>(bgra, dst, width, height, stride, 0, height);
10+
}
11+
void Rgb2Yuv::BGRtoYUV420PlanarNeon(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount)
12+
{
13+
RGBA2YUVP_ParallelBody_SIMD<2,1,0>(bgr, dst, width, height, stride, 0, height);
14+
}
15+
void Rgb2Yuv::RGBAtoYUV420PlanarNeon(unsigned char* rgba, unsigned char* dst, int width, int height, int stride, int threadCount)
16+
{
17+
RGBA2YUVP_ParallelBody_SIMD<0, 1, 2>(rgba, dst, width, height, stride, 0, height);
18+
}
19+
void Rgb2Yuv::RGBtoYUV420PlanarNeon(unsigned char* rgb, unsigned char* dst, int width, int height, int stride, int threadCount)
20+
{
21+
RGB2YUVP_ParallelBody_SIMD<0, 1, 2>(rgb, dst, width, height, stride, 0, height);
22+
}
23+
24+
25+
// Template function for SIMD-based RGB to YUV conversion
26+
template <int R_INDEX, int G_INDEX, int B_INDEX>
27+
inline void RGB2YUVP_ParallelBody_SIMD(
28+
const unsigned char* src,
29+
unsigned char* dst,
30+
const int width,
31+
const int height,
32+
const int stride,
33+
const int begin,
34+
const int end
35+
) {
36+
const int wi = width / 2; // Width for U/V channels
37+
const int yPlaneSize = width * height;
38+
const int uvPlaneSize = yPlaneSize / 4;
39+
40+
unsigned char* buffer = dst;
41+
42+
// SIMD constants for YUV conversion
43+
const uint16x8_t kR_Y = vdupq_n_u16(66);
44+
const uint16x8_t kG_Y = vdupq_n_u16(129);
45+
const uint16x8_t kB_Y = vdupq_n_u16(25);
46+
const uint16x8_t kR_U = vdupq_n_u16(112);
47+
const int16x8_t kG_U = vdupq_n_s16(-94);
48+
const int16x8_t kB_U = vdupq_n_s16(-18);
49+
const int16x8_t kR_V = vdupq_n_s16(-38);
50+
const int16x8_t kG_V = vdupq_n_s16(-74);
51+
const uint16x8_t kB_V = vdupq_n_u16(112);
52+
const uint16x8_t offset_Y = vdupq_n_u16(16);
53+
const uint16x8_t offset_UV = vdupq_n_u16(128);
54+
55+
// Loop over the specified range of rows
56+
for (int row = begin; row < end; row+=2) {
57+
const int uvLineIndex = row * wi;
58+
const int yIndexStart = row * width;
59+
const int readIndexStart = row * 2 * stride;
60+
61+
int yIndex = yIndexStart;
62+
int uIndex = yPlaneSize + uvLineIndex;
63+
int vIndex = yPlaneSize + uvPlaneSize + uvLineIndex;
64+
int index = readIndexStart;
65+
// first row includes UV
66+
for (int i = 0; i < wi; i += 8) {
67+
// Load 16 pixels (48 bytes) from src
68+
uint8x16x3_t pixels = vld3q_u8(&src[index]);
69+
70+
uint8x16_t r = pixels.val[R_INDEX];
71+
uint8x16_t g = pixels.val[G_INDEX];
72+
uint8x16_t b = pixels.val[B_INDEX];
73+
74+
//Widen to 16 bits unsigned
75+
uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
76+
uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
77+
uint16x8_t g_low = vmovl_u8(vget_low_u8(g));
78+
uint16x8_t g_high = vmovl_u8(vget_high_u8(g));
79+
uint16x8_t b_low = vmovl_u8(vget_low_u8(b));
80+
uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
81+
82+
// Y Channel (Unsigned because we can overflow)
83+
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
84+
uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
85+
// div 256(shift 8) + offset
86+
y_low = vshrq_n_u16(y_low, 8);
87+
y_high = vshrq_n_u16(y_high, 8);
88+
y_low = vaddq_u16(y_low, offset_Y);
89+
y_high = vaddq_u16(y_high, offset_Y);
90+
91+
// shrink combine strore
92+
vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
93+
yIndex += 16;
94+
95+
// we need signed here
96+
int16x8_t r_low_signed = vreinterpretq_s16_u16(r_low);
97+
int16x8_t g_low_signed = vreinterpretq_s16_u16(g_low);
98+
int16x8_t b_low_signed = vreinterpretq_s16_u16(b_low);
99+
100+
// Compute U channel (average over 2x2 blocks)
101+
int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U));
102+
u = vshrq_n_s16(u, 8);
103+
u = vaddq_s16(u, offset_UV);
104+
105+
// Compute V channel
106+
int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(b_low_signed, kB_V));
107+
v = vshrq_n_s16(v, 8);
108+
v = vaddq_s16(v, offset_UV);
109+
110+
// Store U and V
111+
vst1_u8(&buffer[uIndex], vqmovun_s16(u));
112+
vst1_u8(&buffer[vIndex], vqmovun_s16(v));
113+
uIndex += 8;
114+
vIndex += 8;
115+
116+
index += 48;
117+
}
118+
//second row only Y
119+
for (int i = 0; i < wi; i += 8) {
120+
// Load 16 pixels (48 bytes) from src
121+
uint8x16x3_t pixels = vld3q_u8(&src[index]);
122+
123+
uint8x16_t r = pixels.val[R_INDEX];
124+
uint8x16_t g = pixels.val[G_INDEX];
125+
uint8x16_t b = pixels.val[B_INDEX];
126+
127+
uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
128+
uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
129+
uint16x8_t g_low = vmovl_u8(vget_low_u8(g));
130+
uint16x8_t g_high = vmovl_u8(vget_high_u8(g));
131+
uint16x8_t b_low = vmovl_u8(vget_low_u8(b));
132+
uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
133+
134+
// Y Channel (Unsigned because we can overflow the signed)
135+
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
136+
uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
137+
y_low = vshrq_n_u16(y_low, 8);
138+
y_high = vshrq_n_u16(y_high, 8);
139+
y_low = vaddq_u16(y_low, offset_Y);
140+
y_high = vaddq_u16(y_high, offset_Y);
141+
142+
vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
143+
yIndex += 16;
144+
145+
index += 48;
146+
}
147+
}
148+
}
149+
150+
// Template function for SIMD-based RGB to YUV conversion
151+
template <int R_INDEX, int G_INDEX, int B_INDEX>
152+
inline void RGBA2YUVP_ParallelBody_SIMD(
153+
const unsigned char* src,
154+
unsigned char* dst,
155+
const int width,
156+
const int height,
157+
const int stride,
158+
const int begin,
159+
const int end
160+
) {
161+
const int wi = width / 2; // Width for U/V channels
162+
const int yPlaneSize = width * height;
163+
const int uvPlaneSize = yPlaneSize / 4;
164+
165+
unsigned char* buffer = dst;
166+
167+
// SIMD constants for YUV conversion
168+
const uint16x8_t kR_Y = vdupq_n_u16(66);
169+
const uint16x8_t kG_Y = vdupq_n_u16(129);
170+
const uint16x8_t kB_Y = vdupq_n_u16(25);
171+
const uint16x8_t kR_U = vdupq_n_u16(112);
172+
const int16x8_t kG_U = vdupq_n_s16(-94);
173+
const int16x8_t kB_U = vdupq_n_s16(-18);
174+
const int16x8_t kR_V = vdupq_n_s16(-38);
175+
const int16x8_t kG_V = vdupq_n_s16(-74);
176+
const uint16x8_t kB_V = vdupq_n_u16(112);
177+
const uint16x8_t offset_Y = vdupq_n_u16(16);
178+
const uint16x8_t offset_UV = vdupq_n_u16(128);
179+
180+
// Loop over the specified range of rows
181+
for (int row = begin; row < end; row += 2) {
182+
const int uvLineIndex = row * wi;
183+
const int yIndexStart = row * width;
184+
const int readIndexStart = row * 2 * stride;
185+
186+
int yIndex = yIndexStart;
187+
int uIndex = yPlaneSize + uvLineIndex;
188+
int vIndex = yPlaneSize + uvPlaneSize + uvLineIndex;
189+
int index = readIndexStart;
190+
// first row includes UV
191+
for (int i = 0; i < wi; i += 8)
192+
{
193+
// Load 24 pixels (64 bytes) from src
194+
uint8x16x4_t pixels = vld3q_u8(&src[index]);
195+
196+
uint8x16_t r = pixels.val[R_INDEX];
197+
uint8x16_t g = pixels.val[G_INDEX];
198+
uint8x16_t b = pixels.val[B_INDEX];
199+
200+
201+
uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
202+
uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
203+
uint16x8_t g_low = vmovl_u8(vget_low_u8(g));
204+
uint16x8_t g_high = vmovl_u8(vget_high_u8(g));
205+
uint16x8_t b_low = vmovl_u8(vget_low_u8(b));
206+
uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
207+
208+
// Y Channel (Unsigned because we can overflow the signed)
209+
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
210+
uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
211+
y_low = vshrq_n_u16(y_low, 8);
212+
y_high = vshrq_n_u16(y_high, 8);
213+
y_low = vaddq_u16(y_low, offset_Y);
214+
y_high = vaddq_u16(y_high, offset_Y);
215+
216+
vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
217+
yIndex += 16;
218+
219+
// we need signed here
220+
int16x8_t r_low_signed = vreinterpretq_s16_u16(r_low);
221+
int16x8_t g_low_signed = vreinterpretq_s16_u16(g_low);
222+
int16x8_t b_low_signed = vreinterpretq_s16_u16(b_low);
223+
224+
// Compute U channel (average over 2x2 blocks)
225+
int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U));
226+
u = vshrq_n_s16(u, 8);
227+
u = vaddq_s16(u, offset_UV);
228+
229+
// Compute V channel
230+
int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(b_low_signed, kB_V));
231+
v = vshrq_n_s16(v, 8);
232+
v = vaddq_s16(v, offset_UV);
233+
234+
// Store U and V
235+
vst1_u8(&buffer[uIndex], vqmovun_s16(u));
236+
vst1_u8(&buffer[vIndex], vqmovun_s16(v));
237+
uIndex += 8;
238+
vIndex += 8;
239+
240+
index += 64;
241+
}
242+
//second row only Y
243+
for (int i = 0; i < wi; i += 8) {
244+
// Load 24 pixels (64 bytes) from src
245+
uint8x16x3_t pixels = vld3q_u8(&src[index]);
246+
247+
uint8x16_t r = pixels.val[R_INDEX];
248+
uint8x16_t g = pixels.val[G_INDEX];
249+
uint8x16_t b = pixels.val[B_INDEX];
250+
251+
uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
252+
uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
253+
uint16x8_t g_low = vmovl_u8(vget_low_u8(g));
254+
uint16x8_t g_high = vmovl_u8(vget_high_u8(g));
255+
uint16x8_t b_low = vmovl_u8(vget_low_u8(b));
256+
uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
257+
258+
// Y Channel (Unsigned because we can overflow the signed)
259+
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
260+
uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
261+
y_low = vshrq_n_u16(y_low, 8);
262+
y_high = vshrq_n_u16(y_high, 8);
263+
y_low = vaddq_u16(y_low, offset_Y);
264+
y_high = vaddq_u16(y_high, offset_Y);
265+
266+
vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
267+
yIndex += 16;
268+
index += 64;
269+
}
270+
}
271+
}
272+
273+
}

0 commit comments

Comments
 (0)