Skip to content

Commit 66ef693

Browse files
committed
Neon converter parallelisation
1 parent 1e368b6 commit 66ef693

File tree

6 files changed

+179
-361
lines changed

6 files changed

+179
-361
lines changed

H264SharpNative/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ add_library (CMakeProject1 SHARED
5757
"Yuv2RgbNEON.cpp"
5858
"ThreadPool.cpp"
5959
"Rgb2YuvNEON.cpp"
60-
"Rgb2Yuv.cpp")
60+
"Rgb2Yuv.cpp")
6161

6262
if (CMAKE_VERSION VERSION_GREATER 3.12)
6363
set_property(TARGET CMakeProject1 PROPERTY CXX_STANDARD 17)

H264SharpNative/Converter.cpp

Lines changed: 55 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ namespace H264Sharp {
2121
{
2222

2323
int numThreads = Converter::NumThreads;
24-
25-
if (Converter::EnableSSE>0 && width % 32 == 0)
24+
numThreads = width * height < minSize ? 1 : numThreads;
25+
#ifndef __arm__
26+
27+
if (Converter::EnableSSE > 0 && width % 32 == 0)
2628
{
27-
28-
#ifndef __arm__
29+
2930
// SSE, may parallel, not arm
3031
Yuv2Rgb::yuv420_rgb24_sse(width,
3132
height,
@@ -38,8 +39,25 @@ namespace H264Sharp {
3839
dst_span,
3940
numThreads);
4041
return;
42+
}
43+
else
44+
{
45+
46+
Yuv2Rgb::Yuv420P2RGBDefault(dst_ptr,
47+
y_ptr,
48+
u_ptr,
49+
v_ptr,
50+
width,
51+
height,
52+
y_span,
53+
uv_span,
54+
dst_span,
55+
numThreads);
56+
}
57+
4158
#elif defined(__aarch64__)
42-
if (Converter::EnableNEON > 0) {
59+
if (Converter::EnableNEON > 0 && width % 16 == 0)
60+
{
4361
if(numThreads>1)
4462
Yuv2Rgb::ConvertYUVToRGB_NEON_Parallel(
4563
y_ptr,
@@ -49,15 +67,14 @@ namespace H264Sharp {
4967
width,
5068
height, numThreads);
5169
else
52-
Yuv2Rgb::ConvertYUVToRGB_NEONv2(
70+
Yuv2Rgb::ConvertYUVToRGB_NEON(
5371
y_ptr,
5472
u_ptr,
5573
v_ptr,
5674
dst_ptr,
5775
width,
5876
height);
59-
}
60-
77+
}
6178
else
6279
{
6380
Yuv2Rgb::Yuv420P2RGBDefault(dst_ptr,
@@ -85,47 +102,13 @@ namespace H264Sharp {
85102
numThreads);
86103
#endif
87104

88-
}
89-
else
90-
{
91-
92-
Yuv2Rgb::Yuv420P2RGBDefault(dst_ptr,
93-
y_ptr,
94-
u_ptr,
95-
v_ptr,
96-
width,
97-
height,
98-
y_span,
99-
uv_span,
100-
dst_span,
101-
numThreads);
102-
}
105+
106+
103107

104108

105109
}
106110

107-
void Converter::Yuv420PtoRGB(YuvNative& yuv, unsigned char* destination)
108-
{
109-
int numThreads = Converter::NumThreads;
110-
if (Converter::EnableSSE>0 && yuv.width % 32 == 0)
111-
{
112-
113-
#ifndef __arm__
114-
// SSE, may parallel, not arm
115-
Yuv2Rgb::yuv420_rgb24_sse(yuv, destination,
116-
numThreads);
117-
#else
118-
Yuv2Rgb::Yuv420P2RGBDefault(yuv, destination,
119-
numThreads);
120-
#endif
121-
}
122-
else
123-
{
124-
Yuv2Rgb::Yuv420P2RGBDefault(yuv,destination,
125-
numThreads);
126-
}
127-
}
128-
;
111+
129112

130113

131114
#pragma endregion
@@ -135,38 +118,32 @@ namespace H264Sharp {
135118
// Dispatcher for the BGRA -> YUV420-planar conversion (per the callee names).
// This block is a rendered diff: a line following an `N-` gutter marker was removed
// by the commit, a line following `N+` was added, and a bare number pair is context.
//
// Post-commit behaviour visible here:
//   * numThreads starts at Converter::NumThreads and is forced to 1 when
//     width * height < minSize.
//   * On __aarch64__ builds the NEON routine is used only when Converter::EnableNEON > 0;
//     otherwise, and on every other target, the generic Rgb2Yuv::BGRAtoYUV420Planar is called.
// Pre-commit behaviour (removed lines): aarch64 always called the NEON routine, and the
// generic path used numThreads only when width * height > minSize, so the single-thread
// threshold differs at exactly width * height == minSize.
// NOTE(review): width * height is evaluated in int; presumably frame sizes stay far below
// INT_MAX - confirm against callers.
void Converter::BGRAtoYUV420Planar(const unsigned char* bgra, unsigned char* dst, const int width, const int height, const int stride)
136119
{
137120
int numThreads = Converter::NumThreads;
121+
numThreads= width* height < minSize ? 1 : numThreads;
138122

139123
#if defined(__aarch64__)
140-
Rgb2Yuv::BGRAtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
141-
142-
#else
143-
if (width * height > minSize)
144-
{
145-
Rgb2Yuv::BGRAtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
146-
}
124+
if(Converter::EnableNEON>0)
125+
Rgb2Yuv::BGRAtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
147126
else
148-
{
149-
Rgb2Yuv::BGRAtoYUV420Planar(bgra, dst, width, height, stride, 1);
150-
}
127+
Rgb2Yuv::BGRAtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
128+
#else
129+
Rgb2Yuv::BGRAtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
151130
#endif
152131
}
153132

154133
void Converter::RGBAtoYUV420Planar(unsigned char* bgra, unsigned char* dst, int width, int height, int stride)
155134
{
156135
int numThreads = Converter::NumThreads;
136+
numThreads = width * height < minSize ? 1 : numThreads;
157137

158138
#if defined(__aarch64__)
159-
Rgb2Yuv::RGBAtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
139+
if (Converter::EnableNEON > 0)
140+
Rgb2Yuv::RGBAtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
141+
else
142+
Rgb2Yuv::RGBAtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
160143

161144
#else
162-
if (width * height > minSize)
163-
{
164-
Rgb2Yuv::RGBAtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
165-
}
166-
else
167-
{
168-
Rgb2Yuv::RGBAtoYUV420Planar(bgra, dst, width, height, stride, 1);
169-
}
145+
146+
Rgb2Yuv::RGBAtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
170147
#endif
171148

172149

@@ -175,39 +152,36 @@ namespace H264Sharp {
175152
// Dispatcher for the BGR -> YUV420-planar conversion (per the callee names).
// This block is a rendered diff: a line following an `N-` gutter marker was removed
// by the commit, a line following `N+` was added, and a bare number pair is context.
//
// Post-commit behaviour visible here:
//   * numThreads starts at Converter::NumThreads and is forced to 1 when
//     width * height < minSize.
//   * On __aarch64__ builds Rgb2Yuv::BGRtoYUV420PlanarNeon is used only when
//     Converter::EnableNEON > 0; otherwise, and on every other target, the generic
//     Rgb2Yuv::BGRtoYUV420Planar is called with the same arguments.
// The removed lines show the earlier form: unconditional NEON on aarch64 and an explicit
// `width * height > minSize` branch that passed 1 as the thread count for small frames.
// NOTE(review): the source pointer parameter is named `bgra` although the function name
// says BGR - presumably a copy/paste leftover; confirm against the header declaration.
void Converter::BGRtoYUV420Planar(unsigned char* bgra, unsigned char* dst, int width, int height, int stride)
176153
{
177154
int numThreads = Converter::NumThreads;
155+
numThreads = width * height < minSize ? 1 : numThreads;
178156

179157
#if defined(__aarch64__)
180-
Rgb2Yuv::BGRtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
158+
if (Converter::EnableNEON > 0)
159+
Rgb2Yuv::BGRtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
160+
else
161+
Rgb2Yuv::BGRtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
181162

182163
#else
183-
if (width * height > minSize)
184-
{
164+
185165
Rgb2Yuv::BGRtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
186-
}
187-
else
188-
{
189-
Rgb2Yuv::BGRtoYUV420Planar(bgra, dst, width, height, stride, 1);
190-
}
166+
191167
#endif
192168

193169
}
194170

195171
// Dispatcher for the RGB -> YUV420-planar conversion (per the callee names).
// This block is a rendered diff: a line following an `N-` gutter marker was removed
// by the commit, a line following `N+` was added, and a bare number pair is context.
//
// Post-commit behaviour visible here:
//   * numThreads starts at Converter::NumThreads and is forced to 1 when
//     width * height < minSize.
//   * On __aarch64__ builds Rgb2Yuv::RGBtoYUV420PlanarNeon is used only when
//     Converter::EnableNEON > 0; otherwise, and on every other target, the generic
//     Rgb2Yuv::RGBtoYUV420Planar is called with the same arguments.
// The removed lines show the earlier form: unconditional NEON on aarch64 and an explicit
// `width * height > minSize` branch that passed 1 as the thread count for small frames.
// NOTE(review): the source pointer parameter is named `bgra` although the function name
// says RGB - presumably a copy/paste leftover; confirm against the header declaration.
void Converter::RGBtoYUV420Planar(unsigned char* bgra, unsigned char* dst, int width, int height, int stride)
196172
{
197173
int numThreads = Converter::NumThreads;
174+
numThreads = width * height < minSize ? 1 : numThreads;
198175

199176
#if defined(__aarch64__)
200-
Rgb2Yuv::RGBtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
201-
177+
if (Converter::EnableNEON > 0)
178+
Rgb2Yuv::RGBtoYUV420PlanarNeon(bgra, dst, width, height, stride, numThreads);
179+
else
180+
Rgb2Yuv::RGBtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
202181
#else
203-
if (width * height > minSize)
204-
{
182+
205183
Rgb2Yuv::RGBtoYUV420Planar(bgra, dst, width, height, stride, numThreads);
206-
}
207-
else
208-
{
209-
Rgb2Yuv::RGBtoYUV420Planar(bgra, dst, width, height, stride, 1);
210-
}
184+
211185
#endif
212186

213187
}

H264SharpNative/Converter.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ namespace H264Sharp
3030
signed int uv_span,
3131
signed int dst_span);
3232

33-
static void Yuv420PtoRGB(YuvNative& yub, unsigned char* dst);
3433

3534
static void BGRAtoYUV420Planar(const unsigned char* bgra, unsigned char* dst, int width, int height, int stride);
3635
static void BGRtoYUV420Planar(unsigned char* bgr, unsigned char* dst, int width, int height, int stride);

H264SharpNative/Rgb2YuvNEON.cpp

Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,22 +30,101 @@ namespace H264Sharp
3030

3131
// NEON entry point for BGRA input; forwards to RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 4>.
// This block is a rendered diff: the line after `33-` is the removed single-call body,
// lines after `N+` markers are the row-partitioned version added by the commit.
//
// Post-commit behaviour visible here:
//   * threadCount > 1: rows [0, height) are split into threadCount contiguous ranges of
//     chunkLen = height / threadCount rows; the last range is extended to `height` so the
//     integer-division remainder is not dropped. Each range is passed as (bgn, end) to the
//     SIMD body from inside ThreadPool::For(0, threadCount, ...).
//   * otherwise: one call covering (0, height), identical to the removed line.
// The template arguments are presumably R_INDEX=2, G_INDEX=1, B_INDEX=0, NUM_CH=4, going by
// the template parameter list that follows these functions in the file - confirm.
// NOTE(review): chunkLen can be odd, so bgn/end may fall on odd rows. If the SIMD body
// consumes rows in pairs (as a 4:2:0 chroma layout would suggest), chunkLen probably needs
// rounding down to an even value - confirm against RGB2YUVP_ParallelBody_SIMD.
// NOTE(review): the lambda captures locals by reference ([&]); this relies on
// ThreadPool::For not returning before every iteration has finished - confirm.
// NOTE(review): when threadCount > height, chunkLen is 0 and only the last index gets a
// non-empty range, so the work runs on a single iteration.
void Rgb2Yuv::BGRAtoYUV420PlanarNeon(const unsigned char* bgra, unsigned char* dst, int width, int height, int stride, int threadCount)
3232
{
33-
RGB2YUVP_ParallelBody_SIMD<2,1,0,4>(bgra, dst, width, height, stride, 0, height);
33+
if (threadCount > 1) {
34+
int chunkLen = height / threadCount;
35+
ThreadPool::For(int(0), threadCount, [&](int j)
36+
{
37+
int bgn = chunkLen * j;
38+
int end = chunkLen * (j + 1);
39+
if (j == threadCount - 1)
40+
{
41+
end = height;
42+
}
3443

44+
RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 4>(bgra, dst, width, height, stride, bgn, end);
45+
46+
47+
});
48+
}
49+
else {
50+
RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 4>(bgra, dst, width, height, stride, 0, height);
51+
52+
}
3553
}
3654
// NEON entry point for BGR input; forwards to RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 3>.
// This block is a rendered diff: the line after `38-` is the removed single-call body,
// lines after `N+` markers are the row-partitioned version added by the commit.
//
// Post-commit behaviour visible here:
//   * threadCount > 1: rows [0, height) are split into threadCount contiguous ranges of
//     chunkLen = height / threadCount rows; the last range is extended to `height` so the
//     integer-division remainder is not dropped. Each range is passed as (bgn, end) to the
//     SIMD body from inside ThreadPool::For(0, threadCount, ...).
//   * otherwise: one call covering (0, height), identical to the removed line.
// The template arguments are presumably R_INDEX=2, G_INDEX=1, B_INDEX=0, NUM_CH=3, going by
// the template parameter list that follows these functions in the file - confirm.
// NOTE(review): chunkLen can be odd, so bgn/end may fall on odd rows. If the SIMD body
// consumes rows in pairs (as a 4:2:0 chroma layout would suggest), chunkLen probably needs
// rounding down to an even value - confirm against RGB2YUVP_ParallelBody_SIMD.
// NOTE(review): the lambda captures locals by reference ([&]); this relies on
// ThreadPool::For not returning before every iteration has finished - confirm.
void Rgb2Yuv::BGRtoYUV420PlanarNeon(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount)
3755
{
38-
RGB2YUVP_ParallelBody_SIMD<2,1,0,3>(bgr, dst, width, height, stride, 0, height);
56+
if (threadCount > 1) {
57+
int chunkLen = height / threadCount;
58+
ThreadPool::For(int(0), threadCount, [&](int j)
59+
{
60+
int bgn = chunkLen * j;
61+
int end = chunkLen * (j + 1);
62+
if (j == threadCount - 1)
63+
{
64+
end = height;
65+
}
66+
67+
RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 3>(bgr, dst, width, height, stride, bgn, end);
68+
69+
70+
71+
});
72+
}
73+
else {
74+
RGB2YUVP_ParallelBody_SIMD<2, 1, 0, 3>(bgr, dst, width, height, stride, 0, height);
75+
}
76+
3977

4078
}
4179
// NEON entry point for RGBA input; forwards to RGB2YUVP_ParallelBody_SIMD<0, 1, 2, 4>.
// This block is a rendered diff: the line after `43-` is the removed single-call body,
// lines after `N+` markers are the row-partitioned version added by the commit.
//
// Post-commit behaviour visible here:
//   * threadCount > 1: rows [0, height) are split into threadCount contiguous ranges of
//     chunkLen = height / threadCount rows; the last range is extended to `height` so the
//     integer-division remainder is not dropped. Each range is passed as (bgn, end) to the
//     SIMD body from inside ThreadPool::For(0, threadCount, ...).
//   * otherwise: one call covering (0, height), identical to the removed line.
// The template arguments are presumably R_INDEX=0, G_INDEX=1, B_INDEX=2, NUM_CH=4, going by
// the template parameter list that follows these functions in the file - confirm.
// NOTE(review): chunkLen can be odd, so bgn/end may fall on odd rows. If the SIMD body
// consumes rows in pairs (as a 4:2:0 chroma layout would suggest), chunkLen probably needs
// rounding down to an even value - confirm against RGB2YUVP_ParallelBody_SIMD.
// NOTE(review): the lambda captures locals by reference ([&]); this relies on
// ThreadPool::For not returning before every iteration has finished - confirm.
void Rgb2Yuv::RGBAtoYUV420PlanarNeon(unsigned char* rgba, unsigned char* dst, int width, int height, int stride, int threadCount)
4280
{
43-
RGB2YUVP_ParallelBody_SIMD<0,1,2,4>(rgba, dst, width, height, stride, 0, height);
81+
82+
if (threadCount > 1) {
83+
int chunkLen = height / threadCount;
84+
ThreadPool::For(int(0), threadCount, [&](int j)
85+
{
86+
int bgn = chunkLen * j;
87+
int end = chunkLen * (j + 1);
88+
if (j == threadCount - 1)
89+
{
90+
end = height;
91+
}
92+
93+
RGB2YUVP_ParallelBody_SIMD<0, 1, 2, 4>(rgba, dst, width, height, stride, bgn, end);
94+
95+
96+
97+
98+
});
99+
}
100+
else {
101+
RGB2YUVP_ParallelBody_SIMD<0, 1, 2, 4>(rgba, dst, width, height, stride, 0, height);
102+
}
44103

45104
}
46105
// NEON entry point for RGB input; forwards to RGB2YUVP_ParallelBody_SIMD<0, 1, 2, 3>.
// This block is a rendered diff: the line after `48-` is the removed single-call body,
// lines after `N+` markers are the row-partitioned version added by the commit.
//
// Post-commit behaviour visible here:
//   * threadCount > 1: rows [0, height) are split into threadCount contiguous ranges of
//     chunkLen = height / threadCount rows; the last range is extended to `height` so the
//     integer-division remainder is not dropped. Each range is passed as (bgn, end) to the
//     SIMD body from inside ThreadPool::For(0, threadCount, ...).
//   * otherwise: one call covering (0, height), identical to the removed line.
// The template arguments are presumably R_INDEX=0, G_INDEX=1, B_INDEX=2, NUM_CH=3, going by
// the template parameter list that follows these functions in the file - confirm.
// NOTE(review): chunkLen can be odd, so bgn/end may fall on odd rows. If the SIMD body
// consumes rows in pairs (as a 4:2:0 chroma layout would suggest), chunkLen probably needs
// rounding down to an even value - confirm against RGB2YUVP_ParallelBody_SIMD.
// NOTE(review): the lambda captures locals by reference ([&]); this relies on
// ThreadPool::For not returning before every iteration has finished - confirm.
void Rgb2Yuv::RGBtoYUV420PlanarNeon(unsigned char* rgb, unsigned char* dst, int width, int height, int stride, int threadCount)
47106
{
48-
RGB2YUVP_ParallelBody_SIMD<0,1,2,3>(rgb, dst, width, height, stride, 0, height);
107+
if (threadCount > 1) {
108+
int chunkLen = height / threadCount;
109+
ThreadPool::For(int(0), threadCount, [&](int j)
110+
{
111+
int bgn = chunkLen * j;
112+
int end = chunkLen * (j + 1);
113+
if (j == threadCount - 1)
114+
{
115+
end = height;
116+
}
117+
118+
RGB2YUVP_ParallelBody_SIMD<0, 1, 2, 3>(rgb, dst, width, height, stride, bgn, end);
119+
120+
121+
122+
123+
});
124+
}
125+
else {
126+
RGB2YUVP_ParallelBody_SIMD<0, 1, 2, 3>(rgb, dst, width, height, stride, 0, height);
127+
}
49128
}
50129

51130
template <int R_INDEX, int G_INDEX, int B_INDEX, int NUM_CH>

H264SharpNative/Yuv2Rgb.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ extern const unsigned int yuv2rgb565_table1[];
4747

4848
static void ConvertYUVToRGB_NEON(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
4949
uint8_t* rgb_buffer, int width, int height);
50-
static void ConvertYUVToRGB_NEONv2(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
51-
uint8_t* rgb_buffer, int width, int height);
50+
5251
static void ConvertYUVToRGB_NEON_Parallel(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
5352
uint8_t* rgb_buffer, int width, int height, int numThreads);
5453

0 commit comments

Comments
 (0)