Skip to content

Commit ee21d94

Browse files
committed
Split unit tests and performance tests, some AMD-related optimizations
1 parent 0aec8be commit ee21d94

File tree

11 files changed

+507
-61
lines changed

11 files changed

+507
-61
lines changed

CHANGELOG

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
=== 1.0.15 ===
66
* Fixed syntax error in C interface, covered with tests.
77
* Bugfix in horizontal summing functions (invalid register clobber list).
8+
* Some AMD-related optimizations.
89

910
=== 1.0.14 ===
1011
* Implemented pcomplex_r2c instruction set.

include/private/dsp/arch/x86/cpuid.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3-
* (C) 2020 Vladimir Sadovnikov <[email protected]>
2+
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2023 Vladimir Sadovnikov <[email protected]>
44
*
55
* This file is part of lsp-dsp-lib
66
* Created on: 31 мар. 2020 г.
@@ -116,12 +116,18 @@
116116

117117
//-------------------------------------------------------------------------
118118
// Different processor families
119+
120+
#define INTEL_FAMILY_686_CORE 0x06
121+
119122
#define AMD_FAMILY_K8_HAMMER 0x0f
120123
#define AMD_FAMILY_K10 0x10
121124
#define AMD_FAMILY_BOBCAT 0x14
122125
#define AMD_FAMILY_BULLDOZER 0x15
123126
#define AMD_FAMILY_JAGUAR 0x16
124127
#define AMD_FAMILY_ZEN_1_2 0x17
128+
#define AMD_FAMILY_DHYANA 0x18
129+
#define AMD_FAMILY_ZEN_3_4 0x19
130+
#define AMD_FAMILY_ZEN_5 0x1a
125131

126132
#define AMD_MODEL_ZEN_2 0x31
127133

@@ -191,8 +197,9 @@ namespace lsp
191197
}
192198

193199
uint64_t read_xcr(umword_t xcr_id);
194-
}
195-
}
200+
201+
} /* namespace x86 */
202+
} /* namespace lsp */
196203

197204

198205
#endif /* PRIVATE_DSP_ARCH_X86_CPUID_H_ */

include/private/dsp/arch/x86/features.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@
8989
{
9090
FEAT_FAST_MOVS, // Processor implements optimized MOVS instruction
9191
FEAT_FAST_AVX, // Fast AVX implementation
92-
FEAT_FAST_FMA3 // Fast FMA3 implementation
92+
FEAT_FAST_FMA3, // Fast FMA3 implementation
93+
FEAT_BELOW_ZEN3 // CPU has AMD architecture and is below Zen3
9394
};
9495

9596
/**

src/main/x86/avx.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3-
* (C) 2020 Vladimir Sadovnikov <[email protected]>
2+
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2023 Vladimir Sadovnikov <[email protected]>
44
*
55
* This file is part of lsp-dsp-lib
66
* Created on: 31 мар. 2020 г.
@@ -127,8 +127,9 @@
127127

128128
// This routine sucks on AMD Bulldozer processor family but is pretty great on Intel
129129
// Not tested on AMD Processors above Bulldozer family
130-
bool favx = feature_check(f, FEAT_FAST_AVX);
131-
bool ffma = favx && feature_check(f, FEAT_FAST_FMA3);
130+
bool favx = feature_check(f, FEAT_FAST_AVX);
131+
bool ffma = favx && feature_check(f, FEAT_FAST_FMA3);
132+
bool below_zen3 = feature_check(f, FEAT_BELOW_ZEN3);
132133

133134
CEXPORT2_X64(favx, reverse1, reverse1);
134135
CEXPORT2_X64(favx, reverse2, reverse2);
@@ -448,7 +449,10 @@
448449
CEXPORT2(favx, pcomplex_rdiv2, pcomplex_rdiv2_fma3);
449450
CEXPORT2(favx, pcomplex_div3, pcomplex_div3_fma3);
450451

451-
CEXPORT2(favx, h_sqr_sum, h_sqr_sum_fma3);
452+
if (!below_zen3)
453+
{
454+
CEXPORT2(favx, h_sqr_sum, h_sqr_sum_fma3);
455+
}
452456

453457
CEXPORT2(favx, direct_fft, direct_fft_fma3);
454458
CEXPORT2(favx, reverse_fft, reverse_fft_fma3);

src/main/x86/x86.cpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -516,21 +516,45 @@
516516
case FEAT_FAST_MOVS:
517517
if (f->vendor == CPU_VENDOR_INTEL)
518518
{
519-
if ((f->family == 0x6) && (f->model >= 0x5e)) // Should be some Core i3 microarchitecture...
519+
// Should be some Core i3 microarchitecture...
520+
if ((f->family == INTEL_FAMILY_686_CORE) && (f->model >= 0x5e))
520521
return true;
521522
}
522523
break;
523524
case FEAT_FAST_AVX:
524525
if (f->vendor == CPU_VENDOR_INTEL) // Any Intel CPU is good enough with AVX
525526
return true;
527+
// Only starting with ZEN 1 architecture AMD's implementation of AVX is fast enough
526528
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON))
527-
return (f->family >= AMD_FAMILY_ZEN_1_2); // Only starting with ZEN 1 architecture AMD's implementation of AVX is fast enough
529+
{
530+
if (f->family < AMD_FAMILY_ZEN_1_2)
531+
return false;
532+
if (f->family == AMD_FAMILY_DHYANA)
533+
return false;
534+
return true;
535+
}
528536
break;
529537
case FEAT_FAST_FMA3:
530538
if (f->vendor == CPU_VENDOR_INTEL) // Any Intel CPU is good enough with AVX
531539
return true;
532-
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON)) // Starting with ZEN 2 FMA3 operations are fast enough on AMD
533-
return (f->family >= AMD_FAMILY_ZEN_1_2) && (f->model >= AMD_MODEL_ZEN_2);
540+
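// FMA3 is treated as fast on AMD/Hygon starting with the Zen 1/2 family (0x17), Dhyana (0x18) excluded. NOTE(review): the former 'model >= Zen 2' check is gone - confirm that Zen 1 is intended to pass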
541+
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON))
542+
{
543+
if (f->family < AMD_FAMILY_ZEN_1_2)
544+
return false;
545+
if (f->family == AMD_FAMILY_DHYANA)
546+
return false;
547+
return true;
548+
}
549+
break;
550+
551+
case FEAT_BELOW_ZEN3: // Test that this is AMD and below Zen 3 architecture
552+
if ((f->vendor == CPU_VENDOR_AMD) || (f->vendor == CPU_VENDOR_HYGON))
553+
{
554+
if (f->family < AMD_FAMILY_ZEN_3_4)
555+
return true;
556+
return false;
557+
}
534558
break;
535559
default:
536560
break;

src/test/ptest/hmath/h_abs_sum.cpp

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2023 Vladimir Sadovnikov <[email protected]>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 31 мар. 2020 г.
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#include <lsp-plug.in/dsp/dsp.h>
23+
#include <lsp-plug.in/test-fw/ptest.h>
24+
#include <lsp-plug.in/test-fw/helpers.h>
25+
#include <lsp-plug.in/common/alloc.h>
26+
27+
#define MIN_RANK 8
28+
#define MAX_RANK 16
29+
30+
namespace lsp
31+
{
32+
namespace generic
33+
{
34+
float h_abs_sum(const float *src, size_t count);
35+
}
36+
37+
IF_ARCH_X86(
38+
namespace sse
39+
{
40+
float h_abs_sum(const float *src, size_t count);
41+
}
42+
43+
namespace avx
44+
{
45+
float h_abs_sum(const float *src, size_t count);
46+
}
47+
)
48+
49+
IF_ARCH_ARM(
50+
namespace neon_d32
51+
{
52+
float h_abs_sum(const float *src, size_t count);
53+
}
54+
)
55+
56+
IF_ARCH_AARCH64(
57+
namespace asimd
58+
{
59+
float h_abs_sum(const float *src, size_t count);
60+
}
61+
)
62+
63+
typedef float (* h_sum_t)(const float *src, size_t count);
64+
}
65+
66+
PTEST_BEGIN("dsp.hmath", hsum, 5, 10000)
67+
68+
void call(const char *label, float *src, size_t count, h_sum_t func)
69+
{
70+
if (!PTEST_SUPPORTED(func))
71+
return;
72+
73+
char buf[80];
74+
sprintf(buf, "%s x %d", label, int(count));
75+
printf("Testing %s numbers...\n", buf);
76+
77+
PTEST_LOOP(buf,
78+
func(src, count);
79+
);
80+
}
81+
82+
PTEST_MAIN
83+
{
84+
size_t buf_size = 1 << MAX_RANK;
85+
uint8_t *data = NULL;
86+
float *src = alloc_aligned<float>(data, buf_size, 64);
87+
88+
for (size_t i=0; i < buf_size; ++i)
89+
src[i] = randf(0.0f, 1.0f);
90+
91+
#define CALL(func) \
92+
call(#func, src, count, func)
93+
94+
for (size_t i=MIN_RANK; i <= MAX_RANK; ++i)
95+
{
96+
size_t count = 1 << i;
97+
98+
CALL(generic::h_abs_sum);
99+
IF_ARCH_X86(CALL(sse::h_abs_sum));
100+
IF_ARCH_X86(CALL(avx::h_abs_sum));
101+
IF_ARCH_ARM(CALL(neon_d32::h_abs_sum));
102+
IF_ARCH_AARCH64(CALL(asimd::h_abs_sum));
103+
PTEST_SEPARATOR;
104+
}
105+
106+
free_aligned(data);
107+
}
108+
109+
PTEST_END
110+

src/test/ptest/hmath/hsum.cpp renamed to src/test/ptest/hmath/h_sqr_sum.cpp

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3-
* (C) 2020 Vladimir Sadovnikov <[email protected]>
2+
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2023 Vladimir Sadovnikov <[email protected]>
44
*
55
* This file is part of lsp-dsp-lib
66
* Created on: 31 мар. 2020 г.
@@ -31,50 +31,40 @@ namespace lsp
3131
{
3232
namespace generic
3333
{
34-
float h_sum(const float *src, size_t count);
3534
float h_sqr_sum(const float *src, size_t count);
36-
float h_abs_sum(const float *src, size_t count);
3735
}
3836

3937
IF_ARCH_X86(
4038
namespace sse
4139
{
42-
float h_sum(const float *src, size_t count);
4340
float h_sqr_sum(const float *src, size_t count);
44-
float h_abs_sum(const float *src, size_t count);
4541
}
4642

4743
namespace avx
4844
{
49-
float h_sum(const float *src, size_t count);
5045
float h_sqr_sum(const float *src, size_t count);
5146
float h_sqr_sum_fma3(const float *src, size_t count);
52-
float h_abs_sum(const float *src, size_t count);
5347
}
5448
)
5549

5650
IF_ARCH_ARM(
5751
namespace neon_d32
5852
{
59-
float h_sum(const float *src, size_t count);
6053
float h_sqr_sum(const float *src, size_t count);
61-
float h_abs_sum(const float *src, size_t count);
6254
}
6355
)
6456

6557
IF_ARCH_AARCH64(
6658
namespace asimd
6759
{
68-
float h_sum(const float *src, size_t count);
6960
float h_sqr_sum(const float *src, size_t count);
70-
float h_abs_sum(const float *src, size_t count);
7161
}
7262
)
7363

7464
typedef float (* h_sum_t)(const float *src, size_t count);
7565
}
7666

77-
PTEST_BEGIN("dsp.hmath", hsum, 5, 10000)
67+
PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 10000)
7868

7969
void call(const char *label, float *src, size_t count, h_sum_t func)
8070
{
@@ -106,27 +96,13 @@ PTEST_BEGIN("dsp.hmath", hsum, 5, 10000)
10696
{
10797
size_t count = 1 << i;
10898

109-
CALL(generic::h_sum);
110-
IF_ARCH_X86(CALL(sse::h_sum));
111-
IF_ARCH_X86(CALL(avx::h_sum));
112-
IF_ARCH_ARM(CALL(neon_d32::h_sum));
113-
IF_ARCH_AARCH64(CALL(asimd::h_sum));
114-
PTEST_SEPARATOR;
115-
11699
CALL(generic::h_sqr_sum);
117100
IF_ARCH_X86(CALL(sse::h_sqr_sum));
118101
IF_ARCH_X86(CALL(avx::h_sqr_sum));
119102
IF_ARCH_X86(CALL(avx::h_sqr_sum_fma3));
120103
IF_ARCH_ARM(CALL(neon_d32::h_sqr_sum));
121104
IF_ARCH_AARCH64(CALL(asimd::h_sqr_sum));
122105
PTEST_SEPARATOR;
123-
124-
CALL(generic::h_abs_sum);
125-
IF_ARCH_X86(CALL(sse::h_abs_sum));
126-
IF_ARCH_X86(CALL(avx::h_abs_sum));
127-
IF_ARCH_ARM(CALL(neon_d32::h_abs_sum));
128-
IF_ARCH_AARCH64(CALL(asimd::h_abs_sum));
129-
PTEST_SEPARATOR2;
130106
}
131107

132108
free_aligned(data);

0 commit comments

Comments
 (0)