Skip to content

Commit 28006d8

Browse files
committed
fix(metax): failed to deploy in k8s
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent 346441a commit 28006d8

File tree

3 files changed

+110
-88
lines changed

3 files changed

+110
-88
lines changed

gpustack_runtime/deployer/k8s/deviceplugin/plugin.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,9 @@ def _allocate(
455455
dev_mounts: list[Mount] = []
456456
dev_devices: list[DeviceSpec] = []
457457

458-
container_edits = cdi_cfg.container_edits or []
458+
container_edits = (
459+
[cdi_cfg.container_edits] if cdi_cfg.container_edits else []
460+
)
459461
for cdi_dev in cdi_cfg.devices:
460462
container_edits.append(cdi_dev.container_edits)
461463

gpustack_runtime/detector/metax.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -307,12 +307,16 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
307307
card_id = None
308308
renderd_id = None
309309

310-
drm_path = Path(f"/sys/module/metax/drivers/pci:metax/{dev_bdf}/drm")
311-
if drm_path.exists():
312-
for dir_path in drm_path.iterdir():
313-
if dir_path.name.startswith("card"):
314-
card_id = int(dir_path.name[4:])
315-
elif dir_path.name.startswith("renderD"):
316-
renderd_id = int(dir_path.name[7:])
310+
for drm_path in [
311+
Path(f"/sys/module/metax/drivers/pci:METAX/{dev_bdf}/drm"),
312+
Path(f"/sys/module/metax/drivers/pci:metax/{dev_bdf}/drm"),
313+
]:
314+
if drm_path.exists():
315+
for dir_path in drm_path.iterdir():
316+
if dir_path.name.startswith("card"):
317+
card_id = int(dir_path.name[4:])
318+
elif dir_path.name.startswith("renderD"):
319+
renderd_id = int(dir_path.name[7:])
320+
break
317321

318322
return card_id, renderd_id

tests/gpustack_runtime/detector/samples/detect_output_metax_c500.json

Lines changed: 96 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -3,192 +3,208 @@
33
"manufacturer": "metax",
44
"index": 0,
55
"name": "MXC500",
6-
"uuid": "GPU-d2a832de-fe77-cfdb-66b8-4612726169f5",
7-
"driver_version": "3.0.11",
8-
"runtime_version": "3.1",
9-
"runtime_version_original": "3.1.0.14",
6+
"uuid": "GPU-3fd4ee8f-169c-6d31-f9dc-c2a90a441f1b",
7+
"driver_version": "3.6.11",
8+
"runtime_version": "3.5",
9+
"runtime_version_original": "3.5.3.18",
1010
"compute_capability": null,
1111
"cores": null,
1212
"cores_utilization": 0,
1313
"memory": 65536,
14-
"memory_used": 59276,
15-
"memory_utilization": 90.45,
14+
"memory_used": 59963,
15+
"memory_utilization": 91.5,
1616
"memory_status": "healthy",
17-
"temperature": 35,
17+
"temperature": 36,
1818
"power": 350,
19-
"power_used": 72,
19+
"power_used": 71,
2020
"appendix": {
2121
"vgpu": false,
22-
"bdf": "0000:0c:00.0",
23-
"numa": "0"
22+
"bdf": "0000:0e:00.0",
23+
"numa": "0",
24+
"card_id": 1,
25+
"renderd_id": 128
2426
}
2527
},
2628
{
2729
"manufacturer": "metax",
2830
"index": 1,
2931
"name": "MXC500",
30-
"uuid": "GPU-f6917af6-c7fe-f9a8-2afb-de2a317203b8",
31-
"driver_version": "3.0.11",
32-
"runtime_version": "3.1",
33-
"runtime_version_original": "3.1.0.14",
32+
"uuid": "GPU-3c0d0f70-9f8d-8a2f-a4bf-2910193f4e49",
33+
"driver_version": "3.6.11",
34+
"runtime_version": "3.5",
35+
"runtime_version_original": "3.5.3.18",
3436
"compute_capability": null,
3537
"cores": null,
3638
"cores_utilization": 0,
3739
"memory": 65536,
38-
"memory_used": 58784,
39-
"memory_utilization": 89.7,
40+
"memory_used": 826,
41+
"memory_utilization": 1.26,
4042
"memory_status": "healthy",
41-
"temperature": 33,
43+
"temperature": 37,
4244
"power": 350,
43-
"power_used": 68,
45+
"power_used": 57,
4446
"appendix": {
4547
"vgpu": false,
46-
"bdf": "0000:32:00.0",
47-
"numa": "0"
48+
"bdf": "0000:0f:00.0",
49+
"numa": "0",
50+
"card_id": 2,
51+
"renderd_id": 129
4852
}
4953
},
5054
{
5155
"manufacturer": "metax",
5256
"index": 2,
5357
"name": "MXC500",
54-
"uuid": "GPU-aee00591-4809-ff6e-0220-e7fb9893b942",
55-
"driver_version": "3.0.11",
56-
"runtime_version": "3.1",
57-
"runtime_version_original": "3.1.0.14",
58+
"uuid": "GPU-83be6bc6-a0c4-12f9-ddb2-a5fae06c2603",
59+
"driver_version": "3.6.11",
60+
"runtime_version": "3.5",
61+
"runtime_version_original": "3.5.3.18",
5862
"compute_capability": null,
5963
"cores": null,
6064
"cores_utilization": 0,
6165
"memory": 65536,
62-
"memory_used": 58784,
63-
"memory_utilization": 89.7,
66+
"memory_used": 826,
67+
"memory_utilization": 1.26,
6468
"memory_status": "healthy",
65-
"temperature": 32,
69+
"temperature": 38,
6670
"power": 350,
67-
"power_used": 70,
71+
"power_used": 57,
6872
"appendix": {
6973
"vgpu": false,
70-
"bdf": "0000:45:00.0",
71-
"numa": "0"
74+
"bdf": "0000:10:00.0",
75+
"numa": "0",
76+
"card_id": 3,
77+
"renderd_id": 130
7278
}
7379
},
7480
{
7581
"manufacturer": "metax",
7682
"index": 3,
7783
"name": "MXC500",
78-
"uuid": "GPU-e58d310f-ee48-51d7-c73f-1b29c298bb81",
79-
"driver_version": "3.0.11",
80-
"runtime_version": "3.1",
81-
"runtime_version_original": "3.1.0.14",
84+
"uuid": "GPU-f16121f4-06a2-fdac-5b6b-abbcba762f94",
85+
"driver_version": "3.6.11",
86+
"runtime_version": "3.5",
87+
"runtime_version_original": "3.5.3.18",
8288
"compute_capability": null,
8389
"cores": null,
8490
"cores_utilization": 0,
8591
"memory": 65536,
86-
"memory_used": 58784,
87-
"memory_utilization": 89.7,
92+
"memory_used": 826,
93+
"memory_utilization": 1.26,
8894
"memory_status": "healthy",
89-
"temperature": 34,
95+
"temperature": 36,
9096
"power": 350,
91-
"power_used": 68,
97+
"power_used": 57,
9298
"appendix": {
9399
"vgpu": false,
94-
"bdf": "0000:58:00.0",
95-
"numa": "0"
100+
"bdf": "0000:12:00.0",
101+
"numa": "0",
102+
"card_id": 4,
103+
"renderd_id": 131
96104
}
97105
},
98106
{
99107
"manufacturer": "metax",
100108
"index": 4,
101109
"name": "MXC500",
102-
"uuid": "GPU-be3bc8c9-9f15-cb6e-b2d5-18d319574542",
103-
"driver_version": "3.0.11",
104-
"runtime_version": "3.1",
105-
"runtime_version_original": "3.1.0.14",
110+
"uuid": "GPU-96cfc8cd-156f-3794-8a69-18ff5789a11c",
111+
"driver_version": "3.6.11",
112+
"runtime_version": "3.5",
113+
"runtime_version_original": "3.5.3.18",
106114
"compute_capability": null,
107115
"cores": null,
108116
"cores_utilization": 0,
109117
"memory": 65536,
110-
"memory_used": 858,
111-
"memory_utilization": 1.31,
118+
"memory_used": 826,
119+
"memory_utilization": 1.26,
112120
"memory_status": "healthy",
113-
"temperature": 31,
121+
"temperature": 36,
114122
"power": 350,
115-
"power_used": 59,
123+
"power_used": 57,
116124
"appendix": {
117125
"vgpu": false,
118-
"bdf": "0000:84:00.0",
119-
"numa": "1"
126+
"bdf": "0000:35:00.0",
127+
"numa": "0",
128+
"card_id": 5,
129+
"renderd_id": 132
120130
}
121131
},
122132
{
123133
"manufacturer": "metax",
124134
"index": 5,
125135
"name": "MXC500",
126-
"uuid": "GPU-45d50e91-8ab1-38c3-2717-e2fb4e9b881d",
127-
"driver_version": "3.0.11",
128-
"runtime_version": "3.1",
129-
"runtime_version_original": "3.1.0.14",
136+
"uuid": "GPU-f8b4667b-fad3-331e-c87c-1a155ead8d52",
137+
"driver_version": "3.6.11",
138+
"runtime_version": "3.5",
139+
"runtime_version_original": "3.5.3.18",
130140
"compute_capability": null,
131141
"cores": null,
132142
"cores_utilization": 0,
133143
"memory": 65536,
134-
"memory_used": 858,
135-
"memory_utilization": 1.31,
144+
"memory_used": 826,
145+
"memory_utilization": 1.26,
136146
"memory_status": "healthy",
137-
"temperature": 32,
147+
"temperature": 35,
138148
"power": 350,
139-
"power_used": 56,
149+
"power_used": 58,
140150
"appendix": {
141151
"vgpu": false,
142-
"bdf": "0000:ac:00.0",
143-
"numa": "1"
152+
"bdf": "0000:36:00.0",
153+
"numa": "0",
154+
"card_id": 6,
155+
"renderd_id": 133
144156
}
145157
},
146158
{
147159
"manufacturer": "metax",
148160
"index": 6,
149161
"name": "MXC500",
150-
"uuid": "GPU-fddd2e4f-d191-2769-2f4f-82e93bfb1133",
151-
"driver_version": "3.0.11",
152-
"runtime_version": "3.1",
153-
"runtime_version_original": "3.1.0.14",
162+
"uuid": "GPU-70b0cd3b-1f3c-a439-1050-ffd599a4cc43",
163+
"driver_version": "3.6.11",
164+
"runtime_version": "3.5",
165+
"runtime_version_original": "3.5.3.18",
154166
"compute_capability": null,
155167
"cores": null,
156168
"cores_utilization": 0,
157169
"memory": 65536,
158-
"memory_used": 858,
159-
"memory_utilization": 1.31,
170+
"memory_used": 826,
171+
"memory_utilization": 1.26,
160172
"memory_status": "healthy",
161-
"temperature": 32,
173+
"temperature": 36,
162174
"power": 350,
163-
"power_used": 57,
175+
"power_used": 56,
164176
"appendix": {
165177
"vgpu": false,
166-
"bdf": "0000:c0:00.0",
167-
"numa": "1"
178+
"bdf": "0000:37:00.0",
179+
"numa": "0",
180+
"card_id": 7,
181+
"renderd_id": 134
168182
}
169183
},
170184
{
171185
"manufacturer": "metax",
172186
"index": 7,
173187
"name": "MXC500",
174-
"uuid": "GPU-0e7f2404-ccf5-aea8-e239-4c2c347702b8",
175-
"driver_version": "3.0.11",
176-
"runtime_version": "3.1",
177-
"runtime_version_original": "3.1.0.14",
188+
"uuid": "GPU-aa6c79e0-ae0e-45c4-2ed4-832002e2276c",
189+
"driver_version": "3.6.11",
190+
"runtime_version": "3.5",
191+
"runtime_version_original": "3.5.3.18",
178192
"compute_capability": null,
179193
"cores": null,
180194
"cores_utilization": 0,
181195
"memory": 65536,
182-
"memory_used": 4014,
183-
"memory_utilization": 6.12,
196+
"memory_used": 826,
197+
"memory_utilization": 1.26,
184198
"memory_status": "healthy",
185-
"temperature": 32,
199+
"temperature": 35,
186200
"power": 350,
187-
"power_used": 69,
201+
"power_used": 56,
188202
"appendix": {
189203
"vgpu": false,
190-
"bdf": "0000:d4:00.0",
191-
"numa": "1"
204+
"bdf": "0000:38:00.0",
205+
"numa": "0",
206+
"card_id": 8,
207+
"renderd_id": 135
192208
}
193209
}
194210
]

0 commit comments

Comments
 (0)