-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[win][aarch64] Add support for detecting the Host CPU on Arm64 Windows #151596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
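@llvm/pr-subscribers-platform-windows Author: Daniel Paoliello (dpaoliello) Changes: Uses the `CP 4000` registry key under `HKLM\HARDWARE\DESCRIPTION\System\CentralProcessor\0` to get the Implementer and Part, which is then provided to a modified form of `getHostCPUNameForARM` to map to a CPU. On my local Surface Pro 11, `llc --version` reports: [output omitted in this capture]. Full diff: https://github.com/llvm/llvm-project/pull/151596.diff 2 Files Affected: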
diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h
index 40a9b6cc13902..8f9ecbfc69029 100644
--- a/llvm/include/llvm/TargetParser/Host.h
+++ b/llvm/include/llvm/TargetParser/Host.h
@@ -13,6 +13,8 @@
#ifndef LLVM_TARGETPARSER_HOST_H
#define LLVM_TARGETPARSER_HOST_H
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/Support/Compiler.h"
#include <string>
@@ -63,6 +65,10 @@ namespace detail {
/// Helper functions to extract HostCPUName from /proc/cpuinfo on linux.
LLVM_ABI StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
LLVM_ABI StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForARM(StringRef Implementer,
+ StringRef Hardware, StringRef Part,
+ ArrayRef<StringRef> Parts,
+ function_ref<unsigned()> GetVariant);
LLVM_ABI StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
LLVM_ABI StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent);
LLVM_ABI StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent);
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 7e09d30bf3d55..b1478dfbd0457 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -197,6 +197,22 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
llvm::sort(Parts);
Parts.erase(llvm::unique(Parts), Parts.end());
+ auto GetVariant = [&]() {
+ unsigned Variant = 0;
+ for (auto I : Lines)
+ if (I.consume_front("CPU variant"))
+ I.ltrim("\t :").getAsInteger(0, Variant);
+ return Variant;
+ };
+
+ return getHostCPUNameForARM(Implementer, Hardware, Part, Parts, GetVariant);
+}
+
+StringRef
+sys::detail::getHostCPUNameForARM(StringRef Implementer, StringRef Hardware,
+ StringRef Part, ArrayRef<StringRef> Parts,
+ function_ref<unsigned()> GetVariant) {
+
auto MatchBigLittle = [](auto const &Parts, StringRef Big, StringRef Little) {
if (Parts.size() == 2)
return (Parts[0] == Big && Parts[1] == Little) ||
@@ -343,21 +359,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
if (Implementer == "0x53") { // Samsung Electronics Co., Ltd.
// The Exynos chips have a convoluted ID scheme that doesn't seem to follow
// any predictive pattern across variants and parts.
- unsigned Variant = 0, Part = 0;
// Look for the CPU variant line, whose value is a 1 digit hexadecimal
// number, corresponding to the Variant bits in the CP15/C0 register.
- for (auto I : Lines)
- if (I.consume_front("CPU variant"))
- I.ltrim("\t :").getAsInteger(0, Variant);
+ unsigned Variant = GetVariant();
- // Look for the CPU part line, whose value is a 3 digit hexadecimal
- // number, corresponding to the PartNum bits in the CP15/C0 register.
- for (auto I : Lines)
- if (I.consume_front("CPU part"))
- I.ltrim("\t :").getAsInteger(0, Part);
+ // Convert the CPU part line, whose value is a 3 digit hexadecimal number,
+ // corresponding to the PartNum bits in the CP15/C0 register.
+ unsigned PartAsInt;
+ Part.getAsInteger(0, PartAsInt);
- unsigned Exynos = (Variant << 12) | Part;
+ unsigned Exynos = (Variant << 12) | PartAsInt;
switch (Exynos) {
default:
// Default by falling through to Exynos M3.
@@ -1450,6 +1462,48 @@ StringRef sys::getHostCPUName() {
return "generic";
}
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+union MIDR_EL1 {
+ uint64_t Raw;
+ struct _Parts {
+ uint64_t Revision : 4;
+ uint64_t Partnum : 12;
+ uint64_t Architecture : 4;
+ uint64_t Variant : 4;
+ uint64_t Implementer : 8;
+ uint64_t Reserved : 32;
+ } Parts;
+};
+
+StringRef sys::getHostCPUName() {
+ StringRef CPU = "generic";
+
+ // The "CP 4000" registry key contains a cached copy of the MIDR_EL1 register.
+ HKEY Key;
+ if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
+ "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0,
+ KEY_READ, &Key) == ERROR_SUCCESS) {
+ MIDR_EL1 RegValue;
+ DWORD ActualType;
+ DWORD RegValueSize = sizeof(RegValue);
+ if ((RegQueryValueExA(Key, "CP 4000", nullptr, &ActualType,
+ (PBYTE)&RegValue, &RegValueSize) == ERROR_SUCCESS) &&
+ (ActualType == REG_QWORD) && RegValueSize == sizeof(RegValue)) {
+ auto Part = "0x" + utohexstr(RegValue.Parts.Partnum, /*LowerCase*/ true,
+ /*Width*/ 3);
+ CPU = detail::getHostCPUNameForARM(
+ "0x" + utohexstr(RegValue.Parts.Implementer, /*LowerCase*/ true,
+ /*Width*/ 2),
+ /*Hardware*/ "", Part, ArrayRef<StringRef>{Part},
+ [=]() { return RegValue.Parts.Variant; });
+ }
+ RegCloseKey(Key);
+ }
+
+ return CPU;
+}
+
#elif defined(__APPLE__) && defined(__powerpc__)
StringRef sys::getHostCPUName() {
host_basic_info_data_t hostInfo;
|
|
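@llvm/pr-subscribers-backend-aarch64 Author: Daniel Paoliello (dpaoliello) Changes: Uses the `CP 4000` registry key under `HKLM\HARDWARE\DESCRIPTION\System\CentralProcessor\0` to get the Implementer and Part, which is then provided to a modified form of `getHostCPUNameForARM` to map to a CPU. On my local Surface Pro 11, `llc --version` reports: [output omitted in this capture]. Full diff: https://github.com/llvm/llvm-project/pull/151596.diff 2 Files Affected: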
diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h
index 40a9b6cc13902..8f9ecbfc69029 100644
--- a/llvm/include/llvm/TargetParser/Host.h
+++ b/llvm/include/llvm/TargetParser/Host.h
@@ -13,6 +13,8 @@
#ifndef LLVM_TARGETPARSER_HOST_H
#define LLVM_TARGETPARSER_HOST_H
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/Support/Compiler.h"
#include <string>
@@ -63,6 +65,10 @@ namespace detail {
/// Helper functions to extract HostCPUName from /proc/cpuinfo on linux.
LLVM_ABI StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
LLVM_ABI StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForARM(StringRef Implementer,
+ StringRef Hardware, StringRef Part,
+ ArrayRef<StringRef> Parts,
+ function_ref<unsigned()> GetVariant);
LLVM_ABI StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
LLVM_ABI StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent);
LLVM_ABI StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent);
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 7e09d30bf3d55..b1478dfbd0457 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -197,6 +197,22 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
llvm::sort(Parts);
Parts.erase(llvm::unique(Parts), Parts.end());
+ auto GetVariant = [&]() {
+ unsigned Variant = 0;
+ for (auto I : Lines)
+ if (I.consume_front("CPU variant"))
+ I.ltrim("\t :").getAsInteger(0, Variant);
+ return Variant;
+ };
+
+ return getHostCPUNameForARM(Implementer, Hardware, Part, Parts, GetVariant);
+}
+
+StringRef
+sys::detail::getHostCPUNameForARM(StringRef Implementer, StringRef Hardware,
+ StringRef Part, ArrayRef<StringRef> Parts,
+ function_ref<unsigned()> GetVariant) {
+
auto MatchBigLittle = [](auto const &Parts, StringRef Big, StringRef Little) {
if (Parts.size() == 2)
return (Parts[0] == Big && Parts[1] == Little) ||
@@ -343,21 +359,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
if (Implementer == "0x53") { // Samsung Electronics Co., Ltd.
// The Exynos chips have a convoluted ID scheme that doesn't seem to follow
// any predictive pattern across variants and parts.
- unsigned Variant = 0, Part = 0;
// Look for the CPU variant line, whose value is a 1 digit hexadecimal
// number, corresponding to the Variant bits in the CP15/C0 register.
- for (auto I : Lines)
- if (I.consume_front("CPU variant"))
- I.ltrim("\t :").getAsInteger(0, Variant);
+ unsigned Variant = GetVariant();
- // Look for the CPU part line, whose value is a 3 digit hexadecimal
- // number, corresponding to the PartNum bits in the CP15/C0 register.
- for (auto I : Lines)
- if (I.consume_front("CPU part"))
- I.ltrim("\t :").getAsInteger(0, Part);
+ // Convert the CPU part line, whose value is a 3 digit hexadecimal number,
+ // corresponding to the PartNum bits in the CP15/C0 register.
+ unsigned PartAsInt;
+ Part.getAsInteger(0, PartAsInt);
- unsigned Exynos = (Variant << 12) | Part;
+ unsigned Exynos = (Variant << 12) | PartAsInt;
switch (Exynos) {
default:
// Default by falling through to Exynos M3.
@@ -1450,6 +1462,48 @@ StringRef sys::getHostCPUName() {
return "generic";
}
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+union MIDR_EL1 {
+ uint64_t Raw;
+ struct _Parts {
+ uint64_t Revision : 4;
+ uint64_t Partnum : 12;
+ uint64_t Architecture : 4;
+ uint64_t Variant : 4;
+ uint64_t Implementer : 8;
+ uint64_t Reserved : 32;
+ } Parts;
+};
+
+StringRef sys::getHostCPUName() {
+ StringRef CPU = "generic";
+
+ // The "CP 4000" registry key contains a cached copy of the MIDR_EL1 register.
+ HKEY Key;
+ if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
+ "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0,
+ KEY_READ, &Key) == ERROR_SUCCESS) {
+ MIDR_EL1 RegValue;
+ DWORD ActualType;
+ DWORD RegValueSize = sizeof(RegValue);
+ if ((RegQueryValueExA(Key, "CP 4000", nullptr, &ActualType,
+ (PBYTE)&RegValue, &RegValueSize) == ERROR_SUCCESS) &&
+ (ActualType == REG_QWORD) && RegValueSize == sizeof(RegValue)) {
+ auto Part = "0x" + utohexstr(RegValue.Parts.Partnum, /*LowerCase*/ true,
+ /*Width*/ 3);
+ CPU = detail::getHostCPUNameForARM(
+ "0x" + utohexstr(RegValue.Parts.Implementer, /*LowerCase*/ true,
+ /*Width*/ 2),
+ /*Hardware*/ "", Part, ArrayRef<StringRef>{Part},
+ [=]() { return RegValue.Parts.Variant; });
+ }
+ RegCloseKey(Key);
+ }
+
+ return CPU;
+}
+
#elif defined(__APPLE__) && defined(__powerpc__)
StringRef sys::getHostCPUName() {
host_basic_info_data_t hostInfo;
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI, see #130509 (which might also impact Windows with this patch).
Can we add any tests for this? I guess we can't really unittest registry queries, but we could at least unittest the conversion of the 32-bit integers to getHostCPUNameForARM strings.
llvm/lib/TargetParser/Host.cpp
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to iterate from CentralProcessor\\0 to CentralProcessor\\N to find all the part numbers? The cpuinfo parsing puts all the unique part numbers into the Parts array, then sets the Part to the highest one. (I think some relevant devices have heterogeneous cores.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
On Linux, the "primary" part is selected as the last one added before sorting:
llvm-project/llvm/lib/TargetParser/Host.cpp
Lines 193 to 198 in 154354e
| // Last `Part' seen, in case we don't analyse all `Parts' parsed. | |
| StringRef Part = Parts.empty() ? StringRef() : Parts.back(); | |
| // Remove duplicate `Parts'. | |
| llvm::sort(Parts); | |
| Parts.erase(llvm::unique(Parts), Parts.end()); |
For Windows, I'm selecting the "first" (lowest numbered) part as that corresponds with the BIG cores on my local machine.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately I don't think the lowest or biggest part-number is always the biggest CPU. It will often be the little CPU in a system (which might not be the worst thing to pick, at least for scheduling). Otherwise we would probably have used it to get a more reliable order than the one that happens to be last in the list.
I think on Linux it just picks the last one to match the old behaviour of looking through them all. But AFAIU, from the few devices I have checked, the last ordered CPUs are often the bigger ones, so it makes sense to pick the last one if the big CPU is the preference. (I'm not sure how reliable that is on all devices though.) The intent is to maybe use MatchBigLittle() in more places to be explicit about which CPU we are opting to optimize for, but that was added fairly recently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems way out-of-scope for this change: as I said, the lowest numbered registry key reliably picks the big cores on both of my Surface Pro 11 devices. I don't have any of the older Pro X or 850/8c/7c devices to verify on them.
Somebody might want to follow up after this change to eliminate the notion of a "primary" part and always use the parts array, but I don't want to bundle that with this change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On an 8cx Gen 3, CPU 0-3 have "CP 4000" = 0x410fd4b0, CPU 4-7 have "CP 4000" = 0x410fd4c0.
I think Windows enumerates cores essentially the same way as Linux. So I think we should pick the "primary" part the same way as Linux, which would be the highest numbered part, not the lowest-numbered part. Doing something different on Linux vs. Windows is more confusing than helpful.
That's likely not 100% reliable, but we can leave OS-independent improvements for a followup.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh yes I didn't mean to imply you had to do this, just that this has come up elsewhere recently and it is known not to be ideal which node is picked by default.
So long as you make it mostly like linux (preferably in the same order, picking the same cpu), then that sounds OK to me. If you want to be more explicit about which is picked on a specific system then it is probably best to add a MatchBigLittle or equivalent for it, but for any specific device that is a separate issue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, switched to picking the "largest" reg key - still reports oryon-1 for my Pro 11
4fdc555 to
f62a90e
Compare
This change isn't vital to any work that I'm doing, so if we need to back it out, then I have no problems with that.
Refactored the code a bit to improve the testing. Only thing that isn't tested is registry access (in theory we could create and mount a new hive, but that seems like overkill). |
41301f2 to
ea89320
Compare
Probably all we need to do to address this on Windows is to add |
ea89320 to
ebab38f
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks - I'm not an expert on the windows APIs side of this, but it looks good to me comparing to an X86 machine I have.
llvm/lib/TargetParser/Host.cpp
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does PrimaryPartKeyNameSize need to be set inside the if too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, wow, good catch!
ebab38f to
d2ef88d
Compare
|
Looks like this is causing failures for big-endian machines (e.g., |
|
If it's specifically big-endian targets, I'd guess the issue is the MIDR_EL1 union? Maybe use |
Yeah, figured it was the union - I was going to use the endian helpers, but |
The new `sys::detail::getHostCPUNameForARM` for Windows (#151596) was implemented using a C++ bit-field, which caused the associated unit tests to fail on big-endian machines as it assumed a little-endian layout. This change switches from the C++ bit-field to LLVM's `BitField` type instead.
Uses the `CP 4000` registry key under `HKLM\HARDWARE\DESCRIPTION\System\CentralProcessor\0` to get the Implementer and Part, which is then provided to a modified form of `getHostCPUNameForARM` to map to a CPU.
On my local Surface Pro 11, `llc --version` reports: