Skip to content

Commit dd1a799

Browse files
committed
CPU (Linux): detects march on aarch64
1 parent 7611d67 commit dd1a799

File tree

3 files changed

+256
-70
lines changed

3 files changed

+256
-70
lines changed

src/detection/cpu/cpu.c

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,252 @@ const char* ffCPUQualcommCodeToName(uint32_t code)
6060
default: return NULL;
6161
}
6262
}
63+
64+
#if defined(__x86_64__) || defined(__i386__)
65+
66+
#include <cpuid.h>
67+
68+
/**
 * Queries CPUID for clock frequencies and the x86-64 micro-architecture level.
 *
 * Leaf 0x16 supplies the base and maximum frequency; zero values are ignored
 * and leave the corresponding field untouched. When leaf 1 is available,
 * cpu->march is set to one of "x86_64-v1" .. "x86_64-v4", or to "unknown"
 * when none of the feature sets below matches.
 *
 * @param cpu Result structure; frequencyBase, frequencyMax and march may be written.
 */
void ffCPUDetectByCpuid(FFCPUResult* cpu)
{
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (__get_cpuid(0x16, &eax, &ebx, &ecx, &edx))
    {
        // WARNING: CPUID may report frequencies of efficient cores
        // cpuid returns 0 MHz when hypervisor is enabled
        if (eax) cpu->frequencyBase = eax;
        if (ebx) cpu->frequencyMax = ebx;
    }

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return;

    // Feature tests (leaf1.edx, leaf1.ecx, leaf7.ebx)
    // SSE2 lives in leaf1.EDX; the same bit position in leaf1.ECX means XSAVE,
    // so testing ECX here would misreport CPUs that lack XSAVE.
    bool sse2 = (edx & bit_SSE2) != 0;
    bool sse4_2 = (ecx & bit_SSE4_2) != 0;
    bool pclmul = (ecx & bit_PCLMUL) != 0;
    bool popcnt = (ecx & bit_POPCNT) != 0;
    bool fma = (ecx & bit_FMA) != 0;
    bool osxsave = (ecx & bit_OSXSAVE) != 0;

    // Return value intentionally ignored: the outputs are zero-initialized,
    // so every leaf 7 feature test below is false if nothing is written.
    unsigned int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0;
    __get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7);

    bool avx2 = (ebx7 & bit_AVX2) != 0;
    bool bmi2 = (ebx7 & bit_BMI2) != 0;
    bool avx512f = (ebx7 & bit_AVX512F) != 0;
    bool avx512bw = (ebx7 & bit_AVX512BW) != 0;
    bool avx512dq = (ebx7 & bit_AVX512DQ) != 0;

    // OS support for AVX/AVX512: check XGETBV (requires OSXSAVE)
    bool avx_os = false;
    bool avx512_os = false;
    if (osxsave)
    {
        // Dedicated outputs so the CPUID registers above are not clobbered
        uint32_t xcr0Lo = 0, xcr0Hi = 0;
        __asm__ __volatile__(
            "xgetbv"
            : "=a"(xcr0Lo), "=d"(xcr0Hi)
            : "c"(0)
        );
        uint64_t xcr0 = ((uint64_t) xcr0Hi << 32) | xcr0Lo;

        // AVX requires XCR0 bits 1 and 2 (XMM and YMM state)
        avx_os = (xcr0 & 0x6ULL) == 0x6ULL;
        // AVX512 additionally requires XCR0 bits 5, 6 and 7; combined mask 0xE6
        avx512_os = (xcr0 & 0xE6ULL) == 0xE6ULL;
    }

    // Highest level first; the first matching feature set wins.
    // NOTE(review): the psABI x86-64-v2 feature list does not include PCLMUL;
    // confirm whether requiring it for "x86_64-v2" is intended.
    cpu->march = "unknown";
    if (avx512f && avx512bw && avx512dq && avx512_os) cpu->march = "x86_64-v4";
    else if (avx2 && fma && bmi2 && avx_os) cpu->march = "x86_64-v3";
    else if (sse4_2 && popcnt && pclmul) cpu->march = "x86_64-v2";
    else if (sse2) cpu->march = "x86_64-v1";
}
124+
125+
#elif defined(__aarch64__)
126+
127+
// This detection is only approximate: many features that are mandatory in one architecture version are already optional in earlier versions
128+
// https://developer.arm.com/documentation/109697/2025_06/Feature-descriptions?lang=en
129+
// https://en.wikipedia.org/wiki/AArch64#ARM-A_(application_architecture)
130+
// Worth noting: Apple M1 is marked as ARMv8.5-A on Wikipedia, but it lacks BTI (mandatory in v8.5)
131+
132+
#ifdef __linux__
133+
#include "common/io/io.h"
134+
#include <elf.h>
135+
#include <asm/hwcap.h>
136+
137+
#ifndef HWCAP2_SME
138+
#define HWCAP2_SME (1UL << 23)
139+
#endif
140+
#ifndef HWCAP2_SME2
141+
#define HWCAP2_SME2 (1UL << 37)
142+
#endif
143+
#ifndef HWCAP2_CSSC
144+
#define HWCAP2_CSSC (1UL << 34)
145+
#endif
146+
#ifndef HWCAP2_SME2P1
147+
#define HWCAP2_SME2P1 (1UL << 38)
148+
#endif
149+
#ifndef HWCAP2_MOPS
150+
#define HWCAP2_MOPS (1UL << 43)
151+
#endif
152+
#ifndef HWCAP2_F8E4M3
153+
#define HWCAP2_F8E4M3 (1UL << 55)
154+
#endif
155+
#ifndef HWCAP2_F8E5M2
156+
#define HWCAP2_F8E5M2 (1UL << 56)
157+
#endif
158+
#ifndef HWCAP_CMPBR
159+
#define HWCAP_CMPBR (1UL << 33)
160+
#endif
161+
#ifndef HWCAP_FPRCVT
162+
#define HWCAP_FPRCVT (1UL << 34)
163+
#endif
164+
165+
void ffCPUDetectByCpuid(FFCPUResult* cpu)
166+
{
167+
char buf[PROC_FILE_BUFFSIZ];
168+
ssize_t nRead = ffReadFileData("/proc/self/auxv", ARRAY_SIZE(buf), buf);
169+
170+
if (nRead < (ssize_t) sizeof(Elf64_auxv_t)) return;
171+
172+
uint64_t hwcap = 0, hwcap2 = 0;
173+
174+
for (Elf64_auxv_t* auxv = (Elf64_auxv_t*)buf; (char*)auxv < buf + nRead; ++auxv)
175+
{
176+
if (auxv->a_type == AT_HWCAP)
177+
{
178+
hwcap = auxv->a_un.a_val;
179+
}
180+
else if (auxv->a_type == AT_HWCAP2)
181+
{
182+
hwcap2 = auxv->a_un.a_val;
183+
}
184+
}
185+
186+
if (!hwcap) return;
187+
188+
cpu->march = "unknown";
189+
190+
// ARMv8-A
191+
bool has_fp = (hwcap & HWCAP_FP) != 0;
192+
bool has_asimd = (hwcap & HWCAP_ASIMD) != 0;
193+
194+
// ARMv8.1-A
195+
bool has_atomics = (hwcap & HWCAP_ATOMICS) != 0; // optional from v8.0
196+
bool has_crc32 = (hwcap & HWCAP_CRC32) != 0; // optional from v8.0
197+
bool has_asimdrdm = (hwcap & HWCAP_ASIMDRDM) != 0; // optional from v8.0
198+
199+
// ARMv8.2-A
200+
bool has_fphp = (hwcap & HWCAP_FPHP) != 0; // optional
201+
bool has_dcpop = (hwcap & HWCAP_DCPOP) != 0; // DC CVAP, optional from v8.1
202+
203+
// ARMv8.3-A
204+
bool has_paca = (hwcap & HWCAP_PACA) != 0; // optional from v8.2
205+
bool has_lrcpc = (hwcap & HWCAP_LRCPC) != 0; // optional from v8.2
206+
bool has_fcma = (hwcap & HWCAP_FCMA) != 0; // optional from v8.2
207+
bool has_jscvt = (hwcap & HWCAP_JSCVT) != 0; // optional from v8.2
208+
209+
// ARMv8.4-A
210+
bool has_dit = (hwcap & HWCAP_DIT) != 0; // optional from v8.3
211+
bool has_flagm = (hwcap & HWCAP_FLAGM) != 0; // optional from v8.1
212+
bool has_ilrcpc = (hwcap & HWCAP_ILRCPC) != 0; // optional from v8.2
213+
214+
// ARMv8.5-A
215+
bool has_bti = (hwcap2 & HWCAP2_BTI) != 0; // optional from v8.4
216+
bool has_sb = (hwcap & HWCAP_SB) != 0; // optional from v8.0
217+
bool has_dcpodp = (hwcap2 & HWCAP2_DCPODP) != 0; // optional from v8.1
218+
bool has_flagm2 = (hwcap2 & HWCAP2_FLAGM2) != 0; // optional from v8.4
219+
bool has_frint = (hwcap2 & HWCAP2_FRINT) != 0; // optional from v8.4
220+
221+
// ARMv9.0-A
222+
bool has_sve2 = (hwcap2 & HWCAP2_SVE2) != 0;
223+
224+
// ARMv9.1-A
225+
// ARMv8.6-A
226+
bool has_bf16 = (hwcap2 & HWCAP2_BF16) != 0; // optional from v8.2
227+
bool has_i8mm = (hwcap2 & HWCAP2_I8MM) != 0; // optional from v8.1
228+
229+
// ARMv8.7-A
230+
bool has_afp = (hwcap2 & HWCAP2_AFP) != 0; // optional from v8.6
231+
232+
// ARMv9.2-A
233+
bool has_sme = (hwcap2 & HWCAP2_SME) != 0;
234+
235+
// ARMv9.3-A
236+
bool has_sme2 = (hwcap2 & HWCAP2_SME2) != 0; // optional from v9.2
237+
238+
// ARMv8.8-A
239+
bool has_mops = (hwcap2 & HWCAP2_MOPS) != 0; // optional from v8.7
240+
241+
// ARMv8.9-A
242+
bool has_cssc = (hwcap2 & HWCAP2_CSSC) != 0; // optional from v8.7
243+
244+
// ARMv9.4-A
245+
bool has_sme2p1 = (hwcap2 & HWCAP2_SME2P1) != 0; // optional from v9.2
246+
247+
// ARMv9.5-A
248+
bool has_f8e4m3 = (hwcap2 & HWCAP2_F8E4M3) != 0; // optional from v9.2
249+
bool has_f8e5m2 = (hwcap2 & HWCAP2_F8E5M2) != 0; // optional from v9.2
250+
251+
// ARMv9.6-A
252+
bool has_cmpbr = (hwcap & HWCAP_CMPBR) != 0; // optional from v9.5
253+
bool has_fprcvt = (hwcap & HWCAP_FPRCVT) != 0; // optional from v9.5
254+
255+
if (has_sve2 || has_sme) {
256+
// ARMv9
257+
if (has_cmpbr && has_fprcvt) {
258+
cpu->march = "ARMv9.6-A";
259+
} else if (has_f8e5m2 && has_f8e4m3) {
260+
cpu->march = "ARMv9.5-A";
261+
} else if (has_sme2p1) {
262+
cpu->march = "ARMv9.4-A";
263+
} else if (has_sme2) {
264+
cpu->march = "ARMv9.3-A";
265+
} else if (has_sme) {
266+
cpu->march = "ARMv9.2-A";
267+
} else if (has_i8mm && has_bf16) {
268+
cpu->march = "ARMv9.1-A";
269+
} else {
270+
cpu->march = "ARMv9.0-A";
271+
}
272+
} else {
273+
// ARMv8
274+
if (has_cssc) {
275+
cpu->march = "ARMv8.9-A";
276+
} else if (has_mops) {
277+
cpu->march = "ARMv8.8-A";
278+
} else if (has_afp) {
279+
cpu->march = "ARMv8.7-A";
280+
} else if (has_i8mm && has_bf16) {
281+
cpu->march = "ARMv8.6-A";
282+
} else if (has_bti && has_sb && has_dcpodp && has_flagm2 && has_frint) {
283+
cpu->march = "ARMv8.5-A";
284+
} else if (has_dit && has_flagm && has_ilrcpc) {
285+
cpu->march = "ARMv8.4-A";
286+
} else if (has_paca && has_lrcpc && has_fcma && has_jscvt) {
287+
cpu->march = "ARMv8.3-A";
288+
} else if (has_fphp && has_dcpop) {
289+
cpu->march = "ARMv8.2-A";
290+
} else if (has_atomics && has_crc32 && has_asimdrdm) {
291+
cpu->march = "ARMv8.1-A";
292+
} else if (has_asimd && has_fp) {
293+
cpu->march = "ARMv8-A";
294+
}
295+
}
296+
}
297+
#else
298+
/**
 * Placeholder used on aarch64 builds that are not Linux.
 * Performs no detection and leaves cpu unmodified.
 */
void ffCPUDetectByCpuid(FF_MAYBE_UNUSED FFCPUResult* cpu)
{
    // Nothing to do: no detection is implemented for this platform
}
302+
#endif // __linux__
303+
304+
#else
305+
306+
/**
 * Placeholder for architectures other than x86 and aarch64.
 * Performs no detection and leaves cpu unmodified.
 */
void ffCPUDetectByCpuid(FF_MAYBE_UNUSED FFCPUResult* cpu)
{
    // Nothing to do: no detection is implemented for this architecture
}
310+
311+
#endif

src/detection/cpu/cpu.h

Lines changed: 1 addition & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -33,73 +33,4 @@ typedef struct FFCPUResult
3333
const char* ffDetectCPU(const FFCPUOptions* options, FFCPUResult* cpu);
3434
const char* ffCPUAppleCodeToName(uint32_t code);
3535
const char* ffCPUQualcommCodeToName(uint32_t code);
36-
37-
#if defined(__x86_64__) || defined(__i386__)
38-
39-
#include <cpuid.h>
40-
41-
/**
 * Queries CPUID for clock frequencies and the x86-64 micro-architecture level.
 *
 * Leaf 0x16 supplies the base and maximum frequency; zero values are ignored.
 * When leaf 1 is available, cpu->march is set to one of "x86_64-v1" ..
 * "x86_64-v4", or to "unknown" when none of the feature sets below matches.
 *
 * @param cpu Result structure; frequencyBase, frequencyMax and march may be written.
 */
inline static void ffCPUDetectByCpuid(FFCPUResult* cpu)
{
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (__get_cpuid(0x16, &eax, &ebx, &ecx, &edx))
    {
        // WARNING: CPUID may report frequencies of efficient cores
        // cpuid returns 0 MHz when hypervisor is enabled
        if (eax) cpu->frequencyBase = eax;
        if (ebx) cpu->frequencyMax = ebx;
    }

    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    {
        // Feature tests (leaf1.edx, leaf1.ecx, leaf7.ebx)
        // SSE2 is an EDX flag; the same bit position in ECX means XSAVE
        bool sse2 = (edx & bit_SSE2) != 0;
        bool sse4_2 = (ecx & bit_SSE4_2) != 0;
        bool pclmul = (ecx & bit_PCLMUL) != 0;
        bool popcnt = (ecx & bit_POPCNT) != 0;
        bool fma = (ecx & bit_FMA) != 0;
        bool osxsave = (ecx & bit_OSXSAVE) != 0;

        // Outputs are zero-initialized, so the leaf 7 tests are false if nothing is written
        unsigned int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0;
        __get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7);

        bool avx2 = (ebx7 & bit_AVX2) != 0;
        bool bmi2 = (ebx7 & bit_BMI2) != 0;
        bool avx512f = (ebx7 & bit_AVX512F) != 0;
        bool avx512bw = (ebx7 & bit_AVX512BW) != 0;
        bool avx512dq = (ebx7 & bit_AVX512DQ) != 0;

        // OS support for AVX/AVX512: check XGETBV (requires OSXSAVE)
        bool avx_os = false;
        bool avx512_os = false;
        if (osxsave)
        {
            __asm__ __volatile__(
                "xgetbv"
                : "=a"(eax), "=d"(edx)
                : "c"(0)
                :
            );
            uint64_t xcr0 = ((uint64_t)edx << 32) | eax;

            // AVX requires XCR0 bits 1 and 2 (XMM and YMM state)
            avx_os = (xcr0 & 0x6ULL) == 0x6ULL;
            // AVX512 additionally requires XCR0 bits 5, 6 and 7; combined mask 0xE6
            avx512_os = (xcr0 & 0xE6ULL) == 0xE6ULL;
        }

        // Highest level first; the first matching feature set wins
        cpu->march = "unknown";
        if (avx512f && avx512bw && avx512dq && avx512_os) cpu->march = "x86_64-v4";
        else if (avx2 && fma && bmi2 && avx_os) cpu->march = "x86_64-v3";
        else if (sse4_2 && popcnt && pclmul) cpu->march = "x86_64-v2";
        else if (sse2) cpu->march = "x86_64-v1";
    }
}
97-
98-
#else
99-
100-
/**
 * Placeholder for non-x86 builds.
 * Performs no detection and leaves cpu unmodified.
 */
inline static void ffCPUDetectByCpuid(FF_MAYBE_UNUSED FFCPUResult* cpu)
{
    // Nothing to do: CPUID is not available on this platform
}
104-
105-
#endif
36+
void ffCPUDetectByCpuid(FFCPUResult* cpu);

src/detection/cpu/cpu_linux.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,13 @@ static double parseHwmonDir(FFstrbuf* dir, FFstrbuf* buffer)
4646

4747
if(
4848
ffStrbufContainS(buffer, "cpu") ||
49+
#if __x86_64__ || __i386__
4950
ffStrbufEqualS(buffer, "k10temp") || // AMD
5051
ffStrbufEqualS(buffer, "fam15h_power") || // AMD
5152
ffStrbufEqualS(buffer, "coretemp") // Intel
53+
#else
54+
ffStrbufEqualS(buffer, "temp") // Asahi
55+
#endif
5256
) return value / 1000.;
5357

5458
return FF_CPU_TEMP_UNSET;
@@ -734,6 +738,8 @@ FF_MAYBE_UNUSED static const char* detectCPUOthers(const FFCPUOptions* options,
734738
if (cpu->coresPhysical == 0)
735739
detectPhysicalCores(cpu);
736740

741+
ffCPUDetectByCpuid(cpu);
742+
737743
return NULL;
738744
}
739745
#endif

0 commit comments

Comments
 (0)