Skip to content

Commit 468296f

Browse files
committed
AMD: parse the architecture as supplied by gcnArchName
The value provided by minor is truncated for AMD; parse the value returned by gcnArchName instead to retrieve an accurate ID. We can also use the common value for GCN4, gfx800, to avoid missing compatible devices.
1 parent 3edfa7d commit 468296f

File tree

5 files changed

+795
-12
lines changed

5 files changed

+795
-12
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,20 @@
4646
#define GGML_CUDA_CC_VOLTA 700
4747
#define GGML_CUDA_CC_TURING 750
4848
#define GGML_CUDA_CC_AMPERE 800
49-
#define GGML_CUDA_CC_OFFSET_AMD 1000000
49+
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
5050

5151
// GCN/CDNA, wave size is 64
52-
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
53-
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
54-
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
55-
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
56-
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
57-
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
52+
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x800) // Tonga, Fiji, Polaris, minimum for fast fp16
53+
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
54+
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
55+
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
56+
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
57+
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
5858

5959
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
60-
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
61-
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
62-
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
60+
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
61+
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
62+
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
6363

6464
#define GGML_CUDA_CC_QY1 210
6565
#define GGML_CUDA_CC_QY2 220

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,59 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
119119
#endif
120120
}
121121

122+
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
123+
int ggml_cuda_parse_id(char devName[]) {
124+
// A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
125+
// these values are not stable so this is susceptible to breakage
126+
// https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
127+
int archMajor = 0x0;
128+
int archMinor = 0x0;
129+
int archNum = GGML_CUDA_CC_OFFSET_AMD;
130+
int archLen = strlen(devName);
131+
char archName[archLen + 1];
132+
133+
// strip leading 'gfx' while copying into our buffer
134+
if (archLen > 3) {
135+
strcpy(archName, &devName[3]);
136+
archLen -= 3;
137+
}
138+
139+
// trim trailing :xnack- or :sramecc- statuses
140+
archLen = strcspn(archName, ":");
141+
archName[archLen] = '\0';
142+
143+
// tease out the version information
144+
if (archLen > 8) {
145+
// versions labeled generic use '-' as delimiter
146+
// strip the trailing "-generic" then iterate through what remains
147+
if (strstr(archName, "-generic")) {
148+
archName[archLen - 8] = '\0';
149+
char * pch;
150+
if (pch = strtok(archName, "-")) {
151+
archMajor = (int)strtoul(pch, 0, 16);
152+
if (pch = strtok(NULL, "-")) {
153+
archMinor = 0x10 * (int)strtoul(pch, 0, 16);
154+
}
155+
}
156+
}
157+
} else if (archLen >= 3) {
158+
// last two digits should be the minor * 0x10 + stepping
159+
archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
160+
archName[archLen - 2] = '\0';
161+
162+
// only the major version remains
163+
archMajor = (int)strtoul(archName, 0, 16);
164+
}
165+
archNum += archMajor * 0x100;
166+
167+
// be inclusive of the full gfx8 line for backward compatibility (Carrizu APUs, etc.)
168+
if (archMajor != 8) {
169+
archNum += archMinor;
170+
}
171+
return archNum;
172+
}
173+
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
174+
122175
static ggml_cuda_device_info ggml_cuda_init() {
123176
#ifdef __HIP_PLATFORM_AMD__
124177
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -169,7 +222,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
169222

170223
cudaDeviceProp prop;
171224
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
172-
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
173225

174226
info.default_tensor_split[id] = total_vram;
175227
total_vram += prop.totalGlobalMem;
@@ -178,10 +230,29 @@ static ggml_cuda_device_info ggml_cuda_init() {
178230
info.devices[id].smpb = prop.sharedMemPerBlock;
179231
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
180232
info.devices[id].smpbo = prop.sharedMemPerBlock;
181-
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
233+
234+
info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
235+
if (info.devices[id].cc & 0xff00 == 0x0) {
236+
GGML_LOG_WARN("invalid architecture ID received for device %d %s: %d cc %d.%d\n",
237+
id, prop.name, prop.gcnArchName, prop.major, prop.minor);
238+
239+
// Fallback to prop.major and prop.minor
240+
if (prop.major > 0) {
241+
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
242+
243+
// be inclusive of the full gfx8 line for backward compatibility (Carrizu APUs, etc.)
244+
if (prop.minor != 8) {
245+
info.devices[id].cc += prop.minor * 0x10;
246+
}
247+
}
248+
}
249+
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
250+
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
182251
#else
183252
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
184253
info.devices[id].cc = 100*prop.major + 10*prop.minor;
254+
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
255+
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
185256
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
186257
}
187258

scripts/fetch-amd-ids.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/bin/env python3
2+
import _io
3+
import re
4+
import os
5+
import sys
6+
from pathlib import Path
7+
from urllib import request
8+
from urllib.request import urlopen
9+
10+
# Recognizes an http(s) URL: scheme, optional "www.", a host made of letters, digits,
# dots and hyphens, an optional ":port", then a path. Anything else is treated as a file path.
reUrl = re.compile(r'^(http(s|)://)(www\.|)[a-zA-Z0-9.-]*(:[0-9]+|)/.*$')

# Marks the line that opens the supportedIsas_ table.
reSupportedIsas = re.compile('.*static constexpr Isa supportedIsas_.*')

# One table row: exactly 15 comma-separated fields inside braces, followed by "},".
reTarget = re.compile('.*{' + ','.join(['([^,]*)'] * 15) + '},.*')

# Default input; may be replaced by the first command-line argument.
src = "https://raw.githubusercontent.com/ROCm/clr/refs/heads/amd-staging/rocclr/device/device.cpp"
srcType = 'url'

# Filled by parse() with one tuple per table row.
targets = []
18+
19+
def parse(items):
20+
assert(type(items) == list )
21+
22+
depth = 0
23+
i = 0
24+
for line in items:
25+
i += 1
26+
line = str(line.encode("utf-8"))
27+
28+
if re.match(reSupportedIsas, line):
29+
depth += 1
30+
continue
31+
32+
if depth:
33+
for char in line:
34+
if char == '}':
35+
depth -= 1
36+
if depth < 1:
37+
break
38+
elif char == '{':
39+
depth += 1
40+
41+
if depth < 1:
42+
break
43+
44+
if re.match(reTarget, line):
45+
itms = reTarget.split(line)
46+
targets.append((itms[1].strip(' "'),itms[5].strip(' '),itms[6].strip(' '),itms[7].strip(' ')))
47+
48+
49+
if __name__ == '__main__':
    # Usage: fetch-amd-ids.py [URL-or-path]
    # Reads device.cpp from the given URL or local file (default: the URL in `src`)
    # and prints C code describing every target found in its supportedIsas_ table.
    buffer = ""

    # An optional first argument replaces the default source.
    if len(sys.argv) > 1:
        src = sys.argv[1]
        if re.fullmatch(reUrl, src):
            srcType = 'url'
        else:
            srcType = 'file'
            if not Path(src).exists():
                raise FileNotFoundError(src)

    if srcType == "url":
        # The context manager closes the HTTP response once it has been read.
        with urlopen(request.Request(src)) as response:
            buffer = response.read().decode("utf-8")

        parse(buffer.splitlines())
    else:
        try:
            # `with` closes the file on every path, so no extra cleanup is needed.
            with open(src, 'r') as fileIn:
                buffer = fileIn.readlines()

            parse(buffer)

        except Exception as exception:
            # Best effort: report the problem and carry on. stdout is reserved for
            # the generated C code, so diagnostics go to stderr.
            print(exception, file=sys.stderr)

    if len(targets) == 0:
        print(f'No items found in {src}!', file=sys.stderr)
        sys.exit(1)

    # Emit the struct declaration followed by one initializer group per target.
    print('struct target {')
    print(' char id[256];')
    print(' char major;')
    print(' char minor;')
    print(' char step;')
    print("};")
    print('')
    print(f'struct target targets[{len(targets)}];')
    print('')
    for i, itm in enumerate(targets):
        assert(type(itm) == tuple)
        print(f'strcpy(targets[{i}].id, "{itm[0]}");')
        print(f'targets[{i}].major = {itm[1]};')
        print(f'targets[{i}].minor = {itm[2]};')
        print(f'targets[{i}].step = {itm[3]};')

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ if (NOT GGML_BACKEND_DL)
143143
llama_target_and_test(test-rope.cpp)
144144
endif()
145145

146+
# llama_target_and_test(test-parse-amd-ids.c)
146147

147148
# dummy executable - not installed
148149
get_filename_component(TEST_TARGET test-c.c NAME_WE)

0 commit comments

Comments
 (0)