Skip to content

Commit f93fcde

Browse files
authored
[offload-arch] Fix amdgpu-arch crash on Windows with ROCm 7.1 (#167695)
The tool was crashing on Windows with ROCm 7.1 due to two issues: a misuse of hipDeviceGet, which should not have been called at all (it worked before only by accident and was undefined behavior), and an ABI incompatibility caused by hipDeviceProp_t struct layout changes between HIP versions, where the gcnArchName offset changed from 396 to 1160 bytes. The fix removes hipDeviceGet and queries properties directly by device index. It defines separate struct layouts for R0600 (HIP 6.x+) and R0000 (legacy) to handle the different memory layouts correctly. An automatic API fallback mechanism tries R0600, then R0000, then the unversioned API until one succeeds, ensuring compatibility across different HIP runtime versions. A new --hip-api-version option allows manually selecting the API version when needed. Additional improvements include enhanced error handling with hipGetErrorString, verbose logging throughout the detection process, and runtime version detection using hipRuntimeGetVersion when available. The versioned API functions provide a stable ABI across HIP versions. Fixes: SWDEV-564272
1 parent d719876 commit f93fcde

File tree

2 files changed

+174
-25
lines changed

2 files changed

+174
-25
lines changed

clang/tools/offload-arch/AMDGPUArchByHIP.cpp

Lines changed: 171 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "llvm/ADT/STLExtras.h"
15+
#include "llvm/ADT/Sequence.h"
1516
#include "llvm/Support/CommandLine.h"
1617
#include "llvm/Support/ConvertUTF.h"
1718
#include "llvm/Support/DynamicLibrary.h"
@@ -32,22 +33,54 @@
3233

3334
using namespace llvm;
3435

35-
typedef struct {
36+
// R0600 struct layout (HIP 6.x+).
// Only gcnArchName is ever read by this tool; the remaining fields are kept
// as opaque padding so that gcnArchName lands at byte offset 1160.
struct alignas(8) hipDeviceProp_tR0600 {
  char padding[1160];
  char gcnArchName[256];
  char padding2[56];
};
// Reading the arch name depends entirely on this offset, so pin it at compile
// time rather than relying on the padding arithmetic staying correct.
static_assert(offsetof(hipDeviceProp_tR0600, gcnArchName) == 1160,
              "gcnArchName must be at offset 1160 in the R0600 layout");
42+
43+
// R0000 struct layout (legacy).
// Only gcnArchName is ever read by this tool; the remaining fields are kept
// as opaque padding so that gcnArchName lands at byte offset 396.
struct alignas(8) hipDeviceProp_tR0000 {
  char padding[396];
  char gcnArchName[256];
  char padding2[1024];
};
// Reading the arch name depends entirely on this offset, so pin it at compile
// time rather than relying on the padding arithmetic staying correct.
static_assert(offsetof(hipDeviceProp_tR0000, gcnArchName) == 396,
              "gcnArchName must be at offset 396 in the R0000 layout");
4049

4150
// Local stand-in for the HIP runtime's error type. Only hipSuccess is named;
// every other code is compared against it or printed numerically.
//
// The underlying type is fixed to int so that any non-zero code returned by
// the runtime is a valid value of this enumeration. Without a fixed underlying
// type, an enum whose only enumerator is 0 has a value range that does not
// include other codes.
// NOTE(review): assumes the runtime's hipError_t is int-sized on the
// supported ABIs - confirm against the HIP headers.
enum hipError_t : int {
  hipSuccess = 0,
};
4453

4554
typedef hipError_t (*hipGetDeviceCount_t)(int *);
46-
typedef hipError_t (*hipDeviceGet_t)(int *, int);
47-
typedef hipError_t (*hipGetDeviceProperties_t)(hipDeviceProp_t *, int);
55+
typedef hipError_t (*hipGetDevicePropertiesR0600_t)(hipDeviceProp_tR0600 *,
56+
int);
57+
typedef hipError_t (*hipGetDevicePropertiesR0000_t)(hipDeviceProp_tR0000 *,
58+
int);
59+
typedef hipError_t (*hipGetDeviceProperties_t)(hipDeviceProp_tR0000 *, int);
60+
typedef hipError_t (*hipRuntimeGetVersion_t)(int *);
61+
typedef const char *(*hipGetErrorString_t)(hipError_t);
4862

4963
extern cl::opt<bool> Verbose;
5064

65+
cl::OptionCategory AMDGPUArchByHIPCategory("amdgpu-arch (HIP) options");
66+
67+
/// Selects which device-properties entry point of the HIP runtime is used.
enum class HipApiVersion {
  /// Automatic fallback (R0600 -> R0000 -> unversioned).
  Auto,
  /// Force R0600 API (HIP 6.x+).
  R0600,
  /// Force R0000 API (legacy HIP).
  R0000,
  /// Force unversioned API (very old HIP).
  Unversioned
};
73+
74+
// --hip-api-version: lets the user pin the device-properties entry point
// instead of relying on the automatic fallback, which is the default.
static cl::opt<HipApiVersion>
    HipApi("hip-api-version",
           cl::desc("Select HIP API version for device properties"),
           cl::values(
               clEnumValN(HipApiVersion::Auto, "auto",
                          "Auto-detect (R0600 -> R0000 -> unversioned)"),
               clEnumValN(HipApiVersion::R0600, "r0600", "Force R0600 API"),
               clEnumValN(HipApiVersion::R0000, "r0000", "Force R0000 API"),
               clEnumValN(HipApiVersion::Unversioned, "unversioned",
                          "Force unversioned API")),
           cl::init(HipApiVersion::Auto), cl::cat(AMDGPUArchByHIPCategory));
83+
5184
#ifdef _WIN32
5285
static std::vector<std::string> getSearchPaths() {
5386
std::vector<std::string> Paths;
@@ -177,49 +210,163 @@ int printGPUsByHIP() {
177210
return 1;
178211
}
179212

213+
if (Verbose)
214+
outs() << "Successfully loaded HIP runtime library\n";
215+
180216
#define DYNAMIC_INIT_HIP(SYMBOL) \
181217
{ \
182218
void *SymbolPtr = DynlibHandle->getAddressOfSymbol(#SYMBOL); \
183219
if (!SymbolPtr) { \
184220
llvm::errs() << "Failed to find symbol " << #SYMBOL << '\n'; \
185221
return 1; \
186222
} \
223+
if (Verbose) \
224+
outs() << "Found symbol: " << #SYMBOL << '\n'; \
187225
SYMBOL = reinterpret_cast<decltype(SYMBOL)>(SymbolPtr); \
188226
}
189227

190228
hipGetDeviceCount_t hipGetDeviceCount;
191-
hipDeviceGet_t hipDeviceGet;
192-
hipGetDeviceProperties_t hipGetDeviceProperties;
229+
hipRuntimeGetVersion_t hipRuntimeGetVersion = nullptr;
230+
hipGetDevicePropertiesR0600_t hipGetDevicePropertiesR0600 = nullptr;
231+
hipGetDevicePropertiesR0000_t hipGetDevicePropertiesR0000 = nullptr;
232+
hipGetDeviceProperties_t hipGetDeviceProperties = nullptr;
233+
hipGetErrorString_t hipGetErrorString = nullptr;
193234

194235
DYNAMIC_INIT_HIP(hipGetDeviceCount);
195-
DYNAMIC_INIT_HIP(hipDeviceGet);
196-
DYNAMIC_INIT_HIP(hipGetDeviceProperties);
197236

198237
#undef DYNAMIC_INIT_HIP
199238

200-
int deviceCount;
201-
hipError_t err = hipGetDeviceCount(&deviceCount);
202-
if (err != hipSuccess) {
203-
llvm::errs() << "Failed to get device count\n";
239+
auto LoadSymbol = [&](const char *Name, auto &FuncPtr,
240+
const char *Desc = "") {
241+
void *Sym = DynlibHandle->getAddressOfSymbol(Name);
242+
if (Sym) {
243+
FuncPtr = reinterpret_cast<decltype(FuncPtr)>(Sym);
244+
if (Verbose)
245+
outs() << "Found symbol: " << Name << (Desc[0] ? " " : "") << Desc
246+
<< '\n';
247+
return true;
248+
}
249+
return false;
250+
};
251+
252+
LoadSymbol("hipGetErrorString", hipGetErrorString);
253+
254+
if (LoadSymbol("hipRuntimeGetVersion", hipRuntimeGetVersion)) {
255+
int RuntimeVersion = 0;
256+
if (hipRuntimeGetVersion(&RuntimeVersion) == hipSuccess) {
257+
int Major = RuntimeVersion / 10000000;
258+
int Minor = (RuntimeVersion / 100000) % 100;
259+
int Patch = RuntimeVersion % 100000;
260+
if (Verbose)
261+
outs() << "HIP Runtime Version: " << Major << "." << Minor << "."
262+
<< Patch << '\n';
263+
}
264+
}
265+
266+
LoadSymbol("hipGetDevicePropertiesR0600", hipGetDevicePropertiesR0600,
267+
"(HIP 6.x+ API)");
268+
LoadSymbol("hipGetDevicePropertiesR0000", hipGetDevicePropertiesR0000,
269+
"(legacy API)");
270+
if (!hipGetDevicePropertiesR0600 && !hipGetDevicePropertiesR0000)
271+
LoadSymbol("hipGetDeviceProperties", hipGetDeviceProperties,
272+
"(unversioned legacy API)");
273+
274+
int DeviceCount;
275+
if (Verbose)
276+
outs() << "Calling hipGetDeviceCount...\n";
277+
hipError_t Err = hipGetDeviceCount(&DeviceCount);
278+
if (Err != hipSuccess) {
279+
llvm::errs() << "Failed to get device count";
280+
if (hipGetErrorString) {
281+
llvm::errs() << ": " << hipGetErrorString(Err);
282+
}
283+
llvm::errs() << " (error code: " << Err << ")\n";
204284
return 1;
205285
}
206286

207-
for (int i = 0; i < deviceCount; ++i) {
208-
int deviceId;
209-
err = hipDeviceGet(&deviceId, i);
210-
if (err != hipSuccess) {
211-
llvm::errs() << "Failed to get device id for ordinal " << i << '\n';
212-
return 1;
287+
if (Verbose)
288+
outs() << "Found " << DeviceCount << " device(s)\n";
289+
290+
auto TryGetProperties = [&](auto *ApiFunc, auto *DummyProp, const char *Name,
291+
int DeviceId) -> std::string {
292+
if (!ApiFunc)
293+
return "";
294+
295+
if (Verbose)
296+
outs() << "Using " << Name << "...\n";
297+
298+
using PropType = std::remove_pointer_t<decltype(DummyProp)>;
299+
PropType Prop;
300+
hipError_t Err = ApiFunc(&Prop, DeviceId);
301+
302+
if (Err == hipSuccess) {
303+
if (Verbose) {
304+
outs() << Name << " struct: sizeof = " << sizeof(PropType)
305+
<< " bytes, offsetof(gcnArchName) = "
306+
<< offsetof(PropType, gcnArchName) << " bytes\n";
307+
}
308+
return Prop.gcnArchName;
213309
}
214310

215-
hipDeviceProp_t prop;
216-
err = hipGetDeviceProperties(&prop, deviceId);
217-
if (err != hipSuccess) {
218-
llvm::errs() << "Failed to get device properties for device " << deviceId
219-
<< '\n';
311+
if (Verbose)
312+
llvm::errs() << Name << " failed (error code: " << Err << ")\n";
313+
return "";
314+
};
315+
316+
for (auto I : llvm::seq(DeviceCount)) {
317+
if (Verbose)
318+
outs() << "Processing device " << I << "...\n";
319+
320+
std::string ArchName;
321+
auto TryR0600 = [&](int Dev) -> bool {
322+
if (!hipGetDevicePropertiesR0600)
323+
return false;
324+
ArchName = TryGetProperties(hipGetDevicePropertiesR0600,
325+
(hipDeviceProp_tR0600 *)nullptr,
326+
"R0600 API (HIP 6.x+)", Dev);
327+
return !ArchName.empty();
328+
};
329+
auto TryR0000 = [&](int Dev) -> bool {
330+
if (!hipGetDevicePropertiesR0000)
331+
return false;
332+
ArchName = TryGetProperties(hipGetDevicePropertiesR0000,
333+
(hipDeviceProp_tR0000 *)nullptr,
334+
"R0000 API (legacy HIP)", Dev);
335+
return !ArchName.empty();
336+
};
337+
auto TryUnversioned = [&](int Dev) -> bool {
338+
if (!hipGetDeviceProperties)
339+
return false;
340+
ArchName = TryGetProperties(hipGetDeviceProperties,
341+
(hipDeviceProp_tR0000 *)nullptr,
342+
"unversioned API (very old HIP)", Dev);
343+
return !ArchName.empty();
344+
};
345+
346+
[[maybe_unused]] bool OK;
347+
switch (HipApi) {
348+
case HipApiVersion::Auto:
349+
OK = TryR0600(I) || TryR0000(I) || TryUnversioned(I);
350+
break;
351+
case HipApiVersion::R0600:
352+
OK = TryR0600(I);
353+
break;
354+
case HipApiVersion::R0000:
355+
OK = TryR0000(I);
356+
break;
357+
case HipApiVersion::Unversioned:
358+
OK = TryUnversioned(I);
359+
}
360+
361+
if (ArchName.empty()) {
362+
llvm::errs() << "Failed to get device properties for device " << I
363+
<< " - no APIs available or all failed\n";
220364
return 1;
221365
}
222-
llvm::outs() << prop.gcnArchName << '\n';
366+
367+
if (Verbose)
368+
outs() << "Device " << I << " arch name: ";
369+
llvm::outs() << ArchName << '\n';
223370
}
224371

225372
return 0;

clang/tools/offload-arch/OffloadArch.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden);
1717
// Mark all our options with this category.
1818
static cl::OptionCategory OffloadArchCategory("offload-arch options");
1919

20+
extern cl::OptionCategory AMDGPUArchByHIPCategory;
21+
2022
enum VendorName {
2123
all,
2224
amdgpu,
@@ -62,7 +64,7 @@ const std::array<std::pair<VendorName, function_ref<int()>>, 3> VendorTable{
6264
{VendorName::intel, printIntel}}};
6365

6466
int main(int argc, char *argv[]) {
65-
cl::HideUnrelatedOptions(OffloadArchCategory);
67+
cl::HideUnrelatedOptions({&OffloadArchCategory, &AMDGPUArchByHIPCategory});
6668

6769
cl::SetVersionPrinter(PrintVersion);
6870
cl::ParseCommandLineOptions(

0 commit comments

Comments
 (0)