Skip to content

Commit 1e368b6

Browse files
committed
fixed alignment issue on Yuv2RGB Neon
1 parent b2a40af commit 1e368b6

File tree

5 files changed

+124
-11
lines changed

5 files changed

+124
-11
lines changed

Examples/CrossPlatformTest/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ internal class Program
1515
static void Main(string[] args)
1616
{
1717

18-
Converter.EnableNEON = false;
18+
Converter.EnableNEON = true;
1919
Converter.NumThreads = 1;
2020

2121
H264Encoder encoder = new H264Encoder();

H264SharpNative/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ add_library (CMakeProject1 SHARED
6262
if (CMAKE_VERSION VERSION_GREATER 3.12)
6363
set_property(TARGET CMakeProject1 PROPERTY CXX_STANDARD 17)
6464
endif()
65-
#output name
65+
#output name
6666
#H264SharpNative-linux64.so
6767
#H264SharpNative-linux64.so
6868

H264SharpNative/Converter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ namespace H264Sharp {
4949
width,
5050
height, numThreads);
5151
else
52-
Yuv2Rgb::ConvertYUVToRGB_NEON(
52+
Yuv2Rgb::ConvertYUVToRGB_NEONv2(
5353
y_ptr,
5454
u_ptr,
5555
v_ptr,

H264SharpNative/Yuv2Rgb.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ extern const unsigned int yuv2rgb565_table1[];
4747

4848
static void ConvertYUVToRGB_NEON(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
4949
uint8_t* rgb_buffer, int width, int height);
50+
static void ConvertYUVToRGB_NEONv2(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
51+
uint8_t* rgb_buffer, int width, int height);
5052
static void ConvertYUVToRGB_NEON_Parallel(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
5153
uint8_t* rgb_buffer, int width, int height, int numThreads);
5254

H264SharpNative/Yuv2RgbNEON.cpp

Lines changed: 119 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,132 @@
88

99
namespace H264Sharp
1010
{
11-
11+
/*
12+
* R = CLAMP((Y-16)*1.164 + 1.596*V)
13+
G = CLAMP((Y-16)*1.164 - 0.391*U - 0.813*V)
14+
B = CLAMP((Y-16)*1.164 + 2.018*U )
15+
*/
1216
// BT.601-7 studio range constants
1317
const int16x8_t const_16 = vdupq_n_s16(16);
1418
const int16x8_t const_128 = vdupq_n_s16(128);
15-
const int16x8_t const_0 = vdupq_n_s16(0);
16-
const int16x8_t const_255 = vdupq_n_s16(255);
19+
const uint8x8_t const_125 = vdup_n_u8(125);
1720
// divide them by 2 here
1821
// Precalculated fixed-point coefficients
1922
// FP scaling: the Y factor is scaled by 1 << 7 (128); the U/V coefficients are scaled by 1 << 6 (64) and shifted right by 6 after multiplying
2023
const int16_t y_factor = 149; // 1.164 * 128
21-
const int16_t v_to_r_coeff = 204; // 1.596 * 128
22-
const int16_t u_to_g_coeff = 50; // 0.391 * 128
23-
const int16_t v_to_g_coeff = 104; // 0.813 * 128
24-
const int16_t u_to_b_coeff = 258; // 2.018 * 128
25-
24+
const int16_t v_to_r_coeff = 102; // 1.596 * 64
25+
const int16_t u_to_g_coeff = 25; // 0.391 * 64
26+
const int16_t v_to_g_coeff = 52; // 0.813 * 64
27+
const int16_t u_to_b_coeff = 129; // 2.018 * 64
28+
29+
void Yuv2Rgb::ConvertYUVToRGB_NEONv2(
30+
const uint8_t* y_plane,
31+
const uint8_t* u_plane,
32+
const uint8_t* v_plane,
33+
uint8_t* rgb_buffer,
34+
int width,
35+
int height)
36+
{
37+
const int uv_width = width / 2;
38+
for (int y = 0; y < height; y += 2) {
39+
const uint8_t* y_row1 = y_plane + y * width;
40+
const uint8_t* y_row2 = y_row1 + width;
41+
const uint8_t* u_row = u_plane + (y / 2) * uv_width;
42+
const uint8_t* v_row = v_plane + (y / 2) * uv_width;
43+
uint8_t* rgb_row1 = rgb_buffer + y * width * 3;
44+
uint8_t* rgb_row2 = rgb_row1 + width * 3;
45+
46+
for (int x = 0; x < width; x += 16) {
47+
48+
// Load 8 U and V values
49+
uint8x8_t u_vals8 = vld1_u8(u_row + (x / 2));
50+
uint8x8_t v_vals8 = vld1_u8(v_row + (x / 2));
51+
52+
// Process U/V (widen then -128)
53+
int16x8_t u_vals = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(u_vals8)), const_128);
54+
int16x8_t v_vals = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(v_vals8)), const_128);
55+
56+
// duplicate [1,2,3,4,5,6,7,8]=> [1,1,2,2,3,3,4,4] , [5,5,6,6,7,7,8,8]
57+
int16x8_t u_valsl = vzip1q_s16(u_vals, u_vals);
58+
int16x8_t u_valsh = vzip2q_s16(u_vals, u_vals);
59+
60+
int16x8_t v_valsl = vzip1q_s16(v_vals, v_vals);
61+
int16x8_t v_valsh = vzip2q_s16(v_vals, v_vals);
62+
63+
// multiply UV with the scaling
64+
int16x8_t u_vals_ugl = vshrq_n_s16(vmulq_n_s16(u_valsl, u_to_g_coeff),6);
65+
int16x8_t u_vals_ubl = vshrq_n_s16(vmulq_n_s16(u_valsl, u_to_b_coeff),6);
66+
int16x8_t v_vals_vgl = vshrq_n_s16(vmulq_n_s16(v_valsl, v_to_g_coeff), 6);
67+
int16x8_t v_vals_vrl = vshrq_n_s16(vmulq_n_s16(v_valsl, v_to_r_coeff), 6);
68+
69+
int16x8_t u_vals_ugh = vshrq_n_s16(vmulq_n_s16(u_valsh, u_to_g_coeff), 6);
70+
int16x8_t u_vals_ubh = vshrq_n_s16(vmulq_n_s16(u_valsh, u_to_b_coeff), 6);
71+
int16x8_t v_vals_vgh = vshrq_n_s16(vmulq_n_s16(v_valsh, v_to_g_coeff), 6);
72+
int16x8_t v_vals_vrh = vshrq_n_s16(vmulq_n_s16(v_valsh, v_to_r_coeff), 6);
73+
74+
// Load 16 Y values for two rows
75+
uint8x16_t y_vals1 = vld1q_u8(y_row1 + x);
76+
uint8x16_t y_vals2 = vld1q_u8(y_row2 + x);
77+
78+
79+
// Convert Y to 16-bit and adjust range
80+
int16x8_t y_vals_16_1l = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals1))), const_16);
81+
int16x8_t y_vals_16_1h = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals1))), const_16);
82+
//second row
83+
int16x8_t y_vals_16_2l = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals2))), const_16);
84+
int16x8_t y_vals_16_2h = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals2))), const_16);
85+
86+
// Scale Y (-16 and multiply by 1.164)
87+
y_vals_16_1l = vshrq_n_s16(vmulq_n_s16(y_vals_16_1l, y_factor), 7);
88+
y_vals_16_1h = vshrq_n_s16(vmulq_n_s16(y_vals_16_1h, y_factor), 7);
89+
y_vals_16_2l = vshrq_n_s16(vmulq_n_s16(y_vals_16_2l, y_factor), 7);
90+
y_vals_16_2h = vshrq_n_s16(vmulq_n_s16(y_vals_16_2h, y_factor), 7);
91+
92+
// Calculate RGB for first 8 pixels
93+
int16x8_t r1l = vaddq_s16(y_vals_16_1l, v_vals_vrl);
94+
int16x8_t g1l = vsubq_s16(vsubq_s16(y_vals_16_1l, u_vals_ugl), v_vals_vgl);
95+
int16x8_t b1l = vaddq_s16(y_vals_16_1l, u_vals_ubl);
96+
97+
int16x8_t r1h = vaddq_s16(y_vals_16_1h, v_vals_vrh);
98+
int16x8_t g1h = vsubq_s16(vsubq_s16(y_vals_16_1h, u_vals_ugh), v_vals_vgh);
99+
int16x8_t b1h = vaddq_s16(y_vals_16_1h, u_vals_ubh);
100+
101+
// Calculate RGB for second row
102+
int16x8_t r2l = vaddq_s16(y_vals_16_2l, v_vals_vrl);
103+
int16x8_t g2l = vsubq_s16(vsubq_s16(y_vals_16_2l, u_vals_ugl), v_vals_vgl);
104+
int16x8_t b2l = vaddq_s16(y_vals_16_2l, u_vals_ubl);
105+
106+
int16x8_t r2h = vaddq_s16(y_vals_16_2h, v_vals_vrh);
107+
int16x8_t g2h = vsubq_s16(vsubq_s16(y_vals_16_2h, u_vals_ugh), v_vals_vgh);
108+
int16x8_t b2h = vaddq_s16(y_vals_16_2h, u_vals_ubh);
109+
110+
// Clamp values between 0 and 255
111+
// Store first row (in BGR order)
112+
uint8x8x3_t rgb1l, rgb1h;
113+
rgb1l.val[0] = vqmovun_s16(b1l);
114+
rgb1l.val[1] = vqmovun_s16(g1l);
115+
rgb1l.val[2] = vqmovun_s16(r1l);
116+
117+
rgb1h.val[0] = vqmovun_s16(b1h);
118+
rgb1h.val[1] = vqmovun_s16(g1h);
119+
rgb1h.val[2] = vqmovun_s16(r1h);
120+
121+
vst3_u8(rgb_row1 + x * 3, rgb1l);
122+
vst3_u8(rgb_row1 + (x * 3) + 24, rgb1h);
123+
124+
rgb1l.val[0] = vqmovun_s16(b2l);
125+
rgb1l.val[1] = vqmovun_s16(g2l);
126+
rgb1l.val[2] = vqmovun_s16(r2l);
127+
128+
rgb1h.val[0] = vqmovun_s16(b2h);
129+
rgb1h.val[1] = vqmovun_s16(g2h);
130+
rgb1h.val[2] = vqmovun_s16(r2h);
131+
132+
vst3_u8(rgb_row2 + x * 3, rgb1l);
133+
vst3_u8(rgb_row2 + (x * 3) + 24, rgb1h);
134+
}
135+
}
136+
}
26137

27138
void Yuv2Rgb::ConvertYUVToRGB_NEON(
28139
const uint8_t* y_plane,

0 commit comments

Comments
 (0)