Skip to content

Commit c2216ef

Browse files
committed
-
1 parent a8aa138 commit c2216ef

File tree

3 files changed

+35
-20
lines changed

3 files changed

+35
-20
lines changed

Examples/CrossPlatformTest/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ internal class Program
1515
static void Main(string[] args)
1616
{
1717

18-
Converter.EnableNEON = true;
18+
Converter.EnableNEON = false;
1919
Converter.NumThreads = 1;
2020

2121
H264Encoder encoder = new H264Encoder();

H264SharpNative/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ add_library (CMakeProject1 SHARED
5252
"Encoder.cpp"
5353
"ImageTypes.cpp"
5454
"pch.cpp"
55-
"Yuv2Rgb.cpp"
55+
"Yuv2Rgb.cpp"
5656
"Yuv2RgbSSE.cpp"
5757
"Yuv2RgbNEON.cpp"
5858
"ThreadPool.cpp"

H264SharpNative/Rgb2YuvNEON.cpp

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,24 +50,34 @@ namespace H264Sharp
5050
) {
5151

5252
unsigned char* buffer = dst;
53+
/*
54+
* buffer[yIndex++] = ((25 * b + 129 * g + 66 * r) >> 8) + 16;
55+
buffer[yIndex++] = ((25 * b1 + 129 * g1 + 66 * r1) >> 8) + 16;
5356
57+
buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
58+
buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
59+
*/
5460
// SIMD constants for YUV conversion
55-
const uint16x8_t kR_Y = vdupq_n_u16(66);
56-
const uint16x8_t kG_Y = vdupq_n_u16(129);
5761
const uint16x8_t kB_Y = vdupq_n_u16(25);
58-
const uint16x8_t kR_U = vdupq_n_u16(112);
62+
const uint16x8_t kG_Y = vdupq_n_u16(129);
63+
const uint16x8_t kR_Y = vdupq_n_u16(66);
64+
const uint16x8_t offset_Y = vdupq_n_u16(16);
65+
66+
67+
const int16x8_t kR_U = vdupq_n_s16(112);
5968
const int16x8_t kG_U = vdupq_n_s16(-94);
60-
const int16x8_t kB_U = vdupq_n_s16(-18);
69+
const int16x8_t kB_U = vdupq_n_s16(-18);
70+
6171
const int16x8_t kR_V = vdupq_n_s16(-38);
6272
const int16x8_t kG_V = vdupq_n_s16(-74);
63-
const uint16x8_t kB_V = vdupq_n_u16(112);
64-
const uint16x8_t offset_Y = vdupq_n_u16(16);
65-
const uint16x8_t offset_UV = vdupq_n_u16(128);
73+
const int16x8_t kB_V = vdupq_n_s16(112);
74+
75+
const int16x8_t offset_UV = vdupq_n_s16(128);
6676

6777
int index = 0;
6878
int yIndex = 0;
6979
int uIndex = width* height;
70-
int vIndex = uIndex + width * height/4;
80+
int vIndex = uIndex + (uIndex>>2);
7181
// Loop over the specified range of rows
7282
for (int row = begin; row < end; row+=2) {
7383
// first row includes UV
@@ -88,7 +98,7 @@ namespace H264Sharp
8898
g = pixels.val[G_INDEX];
8999
b = pixels.val[B_INDEX];
90100
}
91-
101+
92102

93103
//Widen to 16 bits unsigned
94104
uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
@@ -99,7 +109,7 @@ namespace H264Sharp
99109
uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
100110

101111
// Y Channel (Unsigned because we can overflow)
102-
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
112+
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
103113
uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
104114
// div 256(shift 8) + offset
105115
y_low = vshrq_n_u16(y_low, 8);
@@ -108,6 +118,7 @@ namespace H264Sharp
108118
y_high = vaddq_u16(y_high, offset_Y);
109119

110120
// shrink, combine, store
121+
111122
vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
112123
yIndex += 16;
113124

@@ -118,13 +129,11 @@ namespace H264Sharp
118129

119130
// Compute U channel (average over 2x2 blocks)
120131
int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U));
121-
u = vshrq_n_s16(u, 8);
122-
u = vaddq_s16(u, offset_UV);
132+
u = vaddq_s16(vshrq_n_s16(u, 8), offset_UV);
123133

124134
// Compute V channel
125-
int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(b_low_signed, kB_V));
126-
v = vshrq_n_s16(v, 8);
127-
v = vaddq_s16(v, offset_UV);
135+
int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(b_low_signed, kB_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(r_low_signed, kR_V));
136+
v = vaddq_s16(vshrq_n_s16(v, 8), offset_UV);
128137

129138
// Store U and V
130139
vst1_u8(&buffer[uIndex], vqmovun_s16(u));
@@ -136,12 +145,11 @@ namespace H264Sharp
136145
}
137146

138147
// add stride offset here..
139-
//index += row*stride - index;
148+
index = stride * (row+1);
140149
//second row only Y
141150
for (int i = 0; i < width; i += 16) {
142151
// Load 16 pixels (48 bytes) from src
143152
uint8x16_t r, g, b;
144-
145153
if constexpr (NUM_CH == 4) {
146154
uint8x16x4_t pixels = vld4q_u8(&src[index]);
147155

@@ -157,26 +165,33 @@ namespace H264Sharp
157165
b = pixels.val[B_INDEX];
158166
}
159167

168+
169+
//Widen to 16 bits unsigned
160170
uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
161171
uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
162172
uint16x8_t g_low = vmovl_u8(vget_low_u8(g));
163173
uint16x8_t g_high = vmovl_u8(vget_high_u8(g));
164174
uint16x8_t b_low = vmovl_u8(vget_low_u8(b));
165175
uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
166176

167-
// Y Channel (Unsigned because we can overflow the signed)
177+
// Y Channel (Unsigned because we can overflow)
168178
uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
169179
uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
180+
// div 256(shift 8) + offset
170181
y_low = vshrq_n_u16(y_low, 8);
171182
y_high = vshrq_n_u16(y_high, 8);
172183
y_low = vaddq_u16(y_low, offset_Y);
173184
y_high = vaddq_u16(y_high, offset_Y);
174185

186+
// shrink, combine, store
175187
vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
176188
yIndex += 16;
177189

178190
index += NUM_CH * 16;
179191
}
192+
193+
index = stride * (row + 2);
194+
180195
}
181196
}
182197

0 commit comments

Comments
 (0)