Skip to content

Commit 5e36544

Browse files
authored
Add WAITPKG checks, add support for TPAUSE within SpinPause (microsoft#24524)
### Description This change introduces `TPAUSE` support in the `SpinPause()` function in Windows and Linux to reduce power consumption and improve efficiency during spin-wait periods. `TPAUSE` is a lightweight power/performance ISA that goes into an optimized C0 power state while waiting on a delay event, compared to `_mm_pause()` which is a NOP-like instruction that provides a small delay in the CPU Pipeline. With this change, performance of First Inference Latency across certain models can also improve. Models that were tested internally have shown up to ~2x improvement in First Inference Latency and up to ~20% lower overall power consumption. Genuine Intel CPUID detection logic was also refactored into a shared utility (`CheckIntel()`), enabling consistent platform checks across the codebase. Here `TPAUSE` is enabled by default for architectures that support it. [Intel Intrinsics Guide (TPAUSE)](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=tpause&techs=MMX,SSE_ALL,AVX_ALL,AVX_512,AMX,SVML,Other&ig_expand=6888,6888) ### Motivation and Context Performance and power efficiency gains - Previous PR was created which initially introduced the TPAUSE instruction in `SpinPause()` with measured improvements in power (please see previous TPAUSE PR here: [Add WAITPKG checks, add support for TPAUSE in ThreadPool spin microsoft#16935](microsoft#16935)). Additional performance testing and measurements were done across Mobile, Desktop, and Server, influencing enhancements to the PR such as a tweak to the `spin_delay_cycles`, Linux support and the refactored Intel CPUID detection logic.
1 parent c7f86b3 commit 5e36544

File tree

9 files changed

+191
-41
lines changed

9 files changed

+191
-41
lines changed

cmake/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,6 @@ option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is
258258
option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
259259
option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)
260260

261-
262261
if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 11.1)
263262
message(FATAL_ERROR "GCC version must be greater than or equal to 11.1")
264263
endif()

cmake/onnxruntime_common.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ set(onnxruntime_common_src_patterns
1111
"${ONNXRUNTIME_ROOT}/core/common/logging/*.cc"
1212
"${ONNXRUNTIME_ROOT}/core/common/logging/sinks/*.h"
1313
"${ONNXRUNTIME_ROOT}/core/common/logging/sinks/*.cc"
14+
"${ONNXRUNTIME_ROOT}/core/platform/check_intel.h"
15+
"${ONNXRUNTIME_ROOT}/core/platform/check_intel.cc"
1416
"${ONNXRUNTIME_ROOT}/core/platform/device_discovery.h"
1517
"${ONNXRUNTIME_ROOT}/core/platform/device_discovery.cc"
1618
"${ONNXRUNTIME_ROOT}/core/platform/env.h"
@@ -100,6 +102,14 @@ if(WIN32)
100102
target_compile_options(onnxruntime_common PRIVATE "/Zc:char8_t-")
101103
endif()
102104
endif()
105+
106+
if(NOT WIN32 AND NOT APPLE AND NOT ANDROID AND CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
107+
set_source_files_properties(
108+
${ONNXRUNTIME_ROOT}/core/common/spin_pause.cc
109+
PROPERTIES COMPILE_FLAGS "-mwaitpkg"
110+
)
111+
endif()
112+
103113
if (onnxruntime_USE_TELEMETRY)
104114
set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
105115
endif()

include/onnxruntime/core/common/spin_pause.h

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,11 @@
33

44
#pragma once
55

6-
#if defined(_M_AMD64)
7-
#include <intrin.h>
8-
#endif
9-
10-
#if defined(__x86_64__)
11-
#include <xmmintrin.h>
12-
#endif
13-
146
namespace onnxruntime {
15-
167
namespace concurrency {
178

189
// Intrinsic to use in spin-loops
19-
20-
inline void SpinPause() {
21-
#if defined(_M_AMD64) || defined(__x86_64__)
22-
_mm_pause();
23-
#endif
24-
}
10+
void SpinPause();
2511

2612
} // namespace concurrency
27-
2813
} // namespace onnxruntime

onnxruntime/core/common/cpuid_info.cc

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
#include "core/common/cpuid_info.h"
44
#include "core/common/logging/logging.h"
55
#include "core/common/logging/severity.h"
6+
#include "core/platform/check_intel.h"
67

78
#ifdef __linux__
8-
9+
#if (defined(_M_AMD64) || defined(__x86_64__)) && !defined(__ANDROID__)
10+
#include <x86intrin.h>
11+
#endif
912
#include <unistd.h>
1013
#include <sys/syscall.h>
1114
#if !defined(__NR_getcpu)
@@ -133,6 +136,17 @@ void CPUIDInfo::X86Init() {
133136
// avx512_skylake = avx512f | avx512vl | avx512cd | avx512bw | avx512dq
134137
has_avx512_skylake_ = has_avx512 && (data[1] & ((1 << 16) | (1 << 17) | (1 << 28) | (1 << 30) | (1 << 31)));
135138
is_hybrid_ = (data[3] & (1 << 15));
139+
// Check for TPAUSE
140+
CheckIntelResult check_intel = CheckIntel();
141+
if (check_intel.is_intel) {
142+
#ifdef __linux__
143+
#if !defined(__ANDROID__)
144+
has_tpause_ = __builtin_cpu_supports("waitpkg") != 0;
145+
#endif
146+
#else
147+
has_tpause_ = (data[2] & (1 << 5)) != 0;
148+
#endif
149+
}
136150
if (max_SubLeaves >= 1) {
137151
GetCPUID(7, 1, data);
138152
has_avx512_bf16_ = has_avx512 && (data[0] & (1 << 5));

onnxruntime/core/common/cpuid_info.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class CPUIDInfo {
3333
bool HasSSE3() const { return has_sse3_; }
3434
bool HasSSE4_1() const { return has_sse4_1_; }
3535
bool IsHybrid() const { return is_hybrid_; }
36+
bool HasTPAUSE() const { return has_tpause_; }
3637

3738
// ARM
3839
bool HasArmNeonDot() const { return has_arm_neon_dot_; }
@@ -112,6 +113,7 @@ class CPUIDInfo {
112113
bool has_sse3_{false};
113114
bool has_sse4_1_{false};
114115
bool is_hybrid_{false};
116+
bool has_tpause_{false};
115117

116118
std::vector<uint32_t> core_uarchs_; // micro-arch of each core
117119

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#include "core/common/spin_pause.h"
5+
6+
#if defined(_M_AMD64)
7+
#include <intrin.h>
8+
#endif
9+
10+
#if defined(__x86_64__)
11+
#include <xmmintrin.h>
12+
#endif
13+
14+
#if defined(_M_AMD64) || defined(__x86_64__)
15+
#include "core/common/cpuid_info.h"
16+
#if defined(__linux__)
17+
#include <x86intrin.h>
18+
#include <immintrin.h>
19+
#endif
20+
#endif
21+
22+
namespace onnxruntime {
23+
namespace concurrency {
24+
25+
// Intrinsic to use in spin-loops
26+
void SpinPause() {
27+
#if (defined(_M_AMD64) || defined(__x86_64__)) && \
28+
!defined(__ANDROID__) && \
29+
!defined(__APPLE__)
30+
31+
static const bool has_tpause = CPUIDInfo::GetCPUIDInfo().HasTPAUSE();
32+
static constexpr uint64_t tpause_spin_delay_cycles = 1000;
33+
if (has_tpause) {
34+
#if defined(_WIN32)
35+
_tpause(0x0, __rdtsc() + tpause_spin_delay_cycles);
36+
#elif defined(__linux__)
37+
__builtin_ia32_tpause(0x0, __rdtsc() + tpause_spin_delay_cycles);
38+
#else
39+
_mm_pause();
40+
#endif
41+
} else {
42+
_mm_pause();
43+
}
44+
#endif
45+
}
46+
47+
} // namespace concurrency
48+
} // namespace onnxruntime
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#include "core/platform/check_intel.h"
5+
6+
#if (defined(_M_AMD64) || defined(__x86_64__))
7+
#if defined(__linux__)
8+
#include <cpuid.h>
9+
#elif defined(_WIN32)
10+
#include <intrin.h>
11+
#endif
12+
#endif
13+
14+
namespace onnxruntime {
15+
16+
CheckIntelResult CheckIntel() {
17+
CheckIntelResult intel_check = {false, false};
18+
bool is_intel = false;
19+
bool is_intel_specified_platform = false;
20+
21+
#if (defined(_M_AMD64) || defined(__x86_64__))
22+
#if defined(_WIN32)
23+
constexpr unsigned int kVendorID_Intel[] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
24+
constexpr unsigned int kVendorID_IntelSpecifiedPlatformIDs[] = {
25+
// ExtendedModel, ExtendedFamily, Family Code, and Model Number
26+
0xa06a, // MTL
27+
0xc065, // ARL-H
28+
0xb065 // ARL-U
29+
};
30+
31+
int regs_leaf0[4];
32+
int regs_leaf1[4];
33+
__cpuid(regs_leaf0, 0);
34+
__cpuid(regs_leaf1, 0x1);
35+
36+
is_intel =
37+
(kVendorID_Intel[0] == static_cast<unsigned int>(regs_leaf0[1])) &&
38+
(kVendorID_Intel[1] == static_cast<unsigned int>(regs_leaf0[2])) &&
39+
(kVendorID_Intel[2] == static_cast<unsigned int>(regs_leaf0[3]));
40+
41+
if (!is_intel) {
42+
return intel_check; // if not an Intel CPU, return early
43+
}
44+
45+
for (auto intel_specified_platform : kVendorID_IntelSpecifiedPlatformIDs) {
46+
if ((static_cast<unsigned int>(regs_leaf1[0]) >> 4) == intel_specified_platform) {
47+
is_intel_specified_platform = true;
48+
break;
49+
}
50+
}
51+
52+
#elif defined(__linux__)
53+
constexpr unsigned int kVendorID_Intel[] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
54+
unsigned int regs[4] = {0};
55+
__get_cpuid(0, &regs[0], &regs[1], &regs[2], &regs[3]);
56+
57+
is_intel = (regs[1] == kVendorID_Intel[0] &&
58+
regs[2] == kVendorID_Intel[1] &&
59+
regs[3] == kVendorID_Intel[2]);
60+
if (!is_intel) {
61+
return intel_check; // if not an Intel CPU, return early
62+
}
63+
64+
__get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
65+
66+
unsigned int base_family = (regs[0] >> 8) & 0xF;
67+
unsigned int base_model = (regs[0] >> 4) & 0xF;
68+
unsigned int extended_model = (regs[0] >> 16) & 0xF;
69+
70+
unsigned int model =
71+
(base_family == 0x6 || base_family == 0xF)
72+
? (base_model + (extended_model << 4))
73+
: base_model;
74+
75+
constexpr unsigned int kVendorID_IntelSpecifiedPlatformIDs[] = {
76+
// ExtendedModel, ExtendedFamily, Family Code, and Model Number
77+
170, // MTL (0xAA)
78+
197, // ARL-H (0xC5)
79+
198 // ARL-U (0xC6)
80+
};
81+
82+
for (auto id : kVendorID_IntelSpecifiedPlatformIDs) {
83+
if (model == id) {
84+
is_intel_specified_platform = true;
85+
break;
86+
}
87+
}
88+
#endif //__linux__
89+
#endif // (_M_AMD64) || (__x86_64__)
90+
91+
intel_check.is_intel = is_intel;
92+
intel_check.is_intel_specified_platform = is_intel_specified_platform;
93+
94+
return intel_check;
95+
}
96+
97+
} // namespace onnxruntime
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
namespace onnxruntime {
7+
typedef struct {
8+
bool is_intel;
9+
bool is_intel_specified_platform;
10+
} CheckIntelResult;
11+
12+
CheckIntelResult CheckIntel();
13+
} // namespace onnxruntime

onnxruntime/core/platform/windows/hardware_core_enumerator.cc

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "hardware_core_enumerator.h"
55
#include "core/platform/windows/env.h"
6+
#include "core/platform/check_intel.h"
67
#include <memory>
78
#include <Windows.h>
89
#include <assert.h>
@@ -85,30 +86,11 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
8586
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
8687
auto cores = GetCoreInfo();
8788
#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__)
88-
const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
89-
bool isIntelSpecifiedPlatform = false;
90-
const int kVendorID_IntelSpecifiedPlatformIDs[3] = {
91-
// ExtendedModel, ExtendedFamily, Family Code, and Model Number
92-
0xa06a, // MTL
93-
0xc065, // ARL-H
94-
0xb065 // ARL-U
95-
};
96-
97-
int regs_leaf0[4];
98-
int regs_leaf1[4];
99-
__cpuid(regs_leaf0, 0);
100-
__cpuid(regs_leaf1, 0x1);
101-
102-
auto isIntel = (kVendorID_Intel[0] == regs_leaf0[1]) && (kVendorID_Intel[1] == regs_leaf0[2]) && (kVendorID_Intel[2] == regs_leaf0[3]);
103-
104-
for (int intelSpecifiedPlatform : kVendorID_IntelSpecifiedPlatformIDs) {
105-
if ((regs_leaf1[0] >> 4) == intelSpecifiedPlatform) {
106-
isIntelSpecifiedPlatform = true;
107-
}
108-
}
10989

110-
if (isIntel) {
111-
if (isIntelSpecifiedPlatform) {
90+
CheckIntelResult check_intel = CheckIntel();
91+
92+
if (check_intel.is_intel) {
93+
if (check_intel.is_intel_specified_platform) {
11294
// We want to exclude cores without an LLC
11395
return cores.LLCCores;
11496
} else {

0 commit comments

Comments
 (0)