Skip to content

Commit c784efe

Browse files
yafshar authored and JeynmannZ committed
UCT/ZE: Fix reset path, DMA-BUF ownership, and descriptor init (openucx#11223)
* UCT/ZE: Add device topology registration

  Implement Level Zero device enumeration and topology registration to properly integrate Intel GPUs with UCX's topology subsystem.

  Key changes:
  - Enumerate Level Zero devices and sub-devices during initialization
  - Register each physical device once with topology using PCI bus ID
  - All sub-devices on same device share parent's sys_dev for IB affinity
  - Device naming: "GPU0" for single sub-device, "GPU0.0"/"GPU0.1" for multi
  - Use zeDevicePciGetPropertiesExt() for PCI properties (Level Zero 1.0+ compat)
  - Enable auxiliary paths for multi-path routing

  Architecture:
  - Static sub-device array populated at init, read-only after
  - Query functions return empty list on init failure (not error)
  - One MD resource, one TL device per sub-device

  API cleanup:
  - Removed unused functions from public header

* UCT/ZE: Fix code style in ze_base files

* UCT/ZE: Fix topology registration for flat device hierarchies

  Fix device enumeration on systems where Level Zero reports tiles as separate root devices (e.g., Ponte Vecchio Data Center Max) rather than hierarchical sub-devices.

  Changes:
  - Detect duplicate PCI addresses (BDF) to identify tiles on same GPU
  - Share sys_dev across root devices with identical PCI address
  - Support both hierarchical (zeDeviceGetSubDevices) and flat models
  - Preserve all 8 device handles (GPU0-GPU7) with correct 4-sys_dev mapping

  Fixes incorrect NUMA/IB affinity when flat hierarchy causes separate topology registration for tiles on same physical device.

* UCT/ZE: Refactor base initialization into helper functions

* UCT/ZE/COPY: always reset command list and propagate reset failures

* UCT/ZE/COPY: Close exported dmabuf fd after dup in mem_query

  zeMemGetAllocProperties returns an exported dmabuf fd that must be closed by UCX after duplicating it for the caller. Previously, each mem_query leaked one fd. Add a centralized cleanup path to always close the original fd and handle dup() failure.
* UCT/ZE/COPY: initialize stype in Level Zero alloc descriptors

  Set mandatory stype in ze_host_mem_alloc_desc_t and ze_device_mem_alloc_desc_t used by mem_alloc. Although the descriptors were zero-initialized, explicit stype is required by Level Zero and improves compatibility with stricter runtime validation and future extension chaining.

* UCT/ZE/COPY: remove redundant ep_create/ep_destroy ops entries

* UCT/ZE: style and whitespace cleanup

* UCT/ZE/COPY: preserve Level Zero DMA-BUF export fd ownership in mem_query

* UCT/ZE/COPY: clang-format cleanup in ZE copy files

* UCT/ZE/COPY: simplify dmabuf fd setup in mem_query
1 parent 2485f1b commit c784efe

File tree

3 files changed

+59
-23
lines changed

3 files changed

+59
-23
lines changed

src/uct/ze/copy/ze_copy_ep.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include <level_zero/ze_api.h>
2222

23+
2324
static UCS_CLASS_INIT_FUNC(uct_ze_copy_ep_t, const uct_ep_params_t *params)
2425
{
2526
uct_ze_copy_iface_t *iface = ucs_derived_of(params->iface,
@@ -45,6 +46,7 @@ ucs_status_t uct_ze_copy_ep_zcopy(uct_ep_h tl_ep, uint64_t remote_addr,
4546
size_t size = uct_iov_get_length(iov);
4647
uct_ze_copy_iface_t *iface = ucs_derived_of(tl_ep->iface,
4748
uct_ze_copy_iface_t);
49+
ucs_status_t status = UCS_OK;
4850
ze_result_t ret;
4951
void *src, *dst;
5052

@@ -62,33 +64,40 @@ ucs_status_t uct_ze_copy_ep_zcopy(uct_ep_h tl_ep, uint64_t remote_addr,
6264
ret = zeCommandListAppendMemoryCopy(iface->ze_cmdl, dst, src, size, NULL, 0,
6365
NULL);
6466
if (ret != ZE_RESULT_SUCCESS) {
65-
return UCS_ERR_IO_ERROR;
67+
status = UCS_ERR_IO_ERROR;
68+
goto out_reset;
6669
}
6770

6871
ret = zeCommandListClose(iface->ze_cmdl);
6972
if (ret != ZE_RESULT_SUCCESS) {
70-
return UCS_ERR_IO_ERROR;
73+
status = UCS_ERR_IO_ERROR;
74+
goto out_reset;
7175
}
7276

7377
ret = zeCommandQueueExecuteCommandLists(iface->ze_cmdq, 1, &iface->ze_cmdl,
7478
NULL);
7579
if (ret != ZE_RESULT_SUCCESS) {
76-
return UCS_ERR_IO_ERROR;
80+
status = UCS_ERR_IO_ERROR;
81+
goto out_reset;
7782
}
7883

7984
ret = zeCommandQueueSynchronize(iface->ze_cmdq, UINT64_MAX);
8085
if (ret != ZE_RESULT_SUCCESS) {
81-
return UCS_ERR_IO_ERROR;
86+
status = UCS_ERR_IO_ERROR;
8287
}
8388

89+
out_reset:
8490
ret = zeCommandListReset(iface->ze_cmdl);
8591
if (ret != ZE_RESULT_SUCCESS) {
86-
return UCS_ERR_IO_ERROR;
92+
ucs_error("zeCommandListReset failed: 0x%x", ret);
93+
if (status == UCS_OK) {
94+
status = UCS_ERR_IO_ERROR;
95+
}
8796
}
8897

8998
ucs_trace("ze memory copy from src %p to dst %p, len %ld", src, dst, size);
9099

91-
return UCS_OK;
100+
return status;
92101
}
93102

94103
ucs_status_t uct_ze_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov,

src/uct/ze/copy/ze_copy_iface.c

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -139,22 +139,26 @@ static uct_iface_ops_t uct_ze_copy_iface_ops = {
139139
.ep_put_short = uct_ze_copy_ep_put_short,
140140
.ep_get_zcopy = uct_ze_copy_ep_get_zcopy,
141141
.ep_put_zcopy = uct_ze_copy_ep_put_zcopy,
142-
.ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_busy,
143-
.ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function,
142+
.ep_pending_add = (uct_ep_pending_add_func_t)
143+
ucs_empty_function_return_busy,
144+
.ep_pending_purge = (uct_ep_pending_purge_func_t)
145+
ucs_empty_function,
144146
.ep_flush = uct_base_ep_flush,
145147
.ep_fence = uct_base_ep_fence,
146-
.ep_create = uct_ep_create,
147-
.ep_destroy = uct_ep_destroy,
148148
.ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_ze_copy_ep_t),
149149
.ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_ze_copy_ep_t),
150150
.iface_flush = uct_base_iface_flush,
151151
.iface_fence = uct_base_iface_fence,
152-
.iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function,
153-
.iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function,
154-
.iface_progress = (uct_iface_progress_func_t)ucs_empty_function_return_zero,
152+
.iface_progress_enable = (uct_iface_progress_enable_func_t)
153+
ucs_empty_function,
154+
.iface_progress_disable = (uct_iface_progress_disable_func_t)
155+
ucs_empty_function,
156+
.iface_progress = (uct_iface_progress_func_t)
157+
ucs_empty_function_return_zero,
155158
.iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_ze_copy_iface_t),
156159
.iface_query = uct_ze_copy_iface_query,
157-
.iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success,
160+
.iface_get_device_address = (uct_iface_get_device_address_func_t)
161+
ucs_empty_function_return_success,
158162
.iface_get_address = uct_ze_copy_iface_get_address,
159163
.iface_is_reachable = uct_base_iface_is_reachable,
160164
};

src/uct/ze/copy/ze_copy_md.c

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,13 @@ uct_ze_copy_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p,
6666
ucs_memory_type_t mem_type, ucs_sys_device_t sys_dev,
6767
unsigned flags, const char *alloc_name, uct_mem_h *memh_p)
6868
{
69-
uct_ze_copy_md_t *md = ucs_derived_of(tl_md, uct_ze_copy_md_t);
70-
ze_host_mem_alloc_desc_t host_desc = {};
71-
ze_device_mem_alloc_desc_t dev_desc = {};
69+
uct_ze_copy_md_t *md = ucs_derived_of(tl_md, uct_ze_copy_md_t);
70+
ze_host_mem_alloc_desc_t host_desc = {
71+
.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC
72+
};
73+
ze_device_mem_alloc_desc_t dev_desc = {
74+
.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC
75+
};
7276
size_t alignment = ucs_get_page_size();
7377
ucs_status_t status;
7478

@@ -141,10 +145,11 @@ static ucs_status_t
141145
uct_ze_copy_md_query_attributes(uct_md_h md, const void *addr, size_t length,
142146
ucs_memory_info_t *mem_info, int *dmabuf_fd)
143147
{
144-
uct_ze_copy_md_t *ze_md = ucs_derived_of(md, uct_ze_copy_md_t);
148+
uct_ze_copy_md_t *ze_md = ucs_derived_of(md, uct_ze_copy_md_t);
145149
ze_external_memory_export_fd_t export_fd = {
146150
.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_FD,
147-
.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF
151+
.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF,
152+
.fd = UCT_DMABUF_FD_INVALID
148153
};
149154
ze_memory_allocation_properties_t props = {
150155
.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES,
@@ -185,12 +190,18 @@ static ucs_status_t uct_ze_copy_md_mem_query(uct_md_h md, const void *addr,
185190
const size_t length,
186191
uct_md_mem_attr_t *mem_attr_p)
187192
{
188-
int dmabuf_fd = UCT_DMABUF_FD_INVALID;
189-
ucs_status_t status;
193+
int dmabuf_fd = UCT_DMABUF_FD_INVALID;
194+
int *dmabuf_fd_p = NULL;
190195
ucs_memory_info_t mem_info;
196+
ucs_status_t status;
197+
198+
if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_DMABUF_FD) {
199+
mem_attr_p->dmabuf_fd = UCT_DMABUF_FD_INVALID;
200+
dmabuf_fd_p = &dmabuf_fd;
201+
}
191202

192203
status = uct_ze_copy_md_query_attributes(md, addr, length, &mem_info,
193-
&dmabuf_fd);
204+
dmabuf_fd_p);
194205
if (status != UCS_OK) {
195206
return status;
196207
}
@@ -215,13 +226,26 @@ static ucs_status_t uct_ze_copy_md_mem_query(uct_md_h md, const void *addr,
215226
}
216227

217228
if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_DMABUF_FD) {
229+
if (dmabuf_fd == UCT_DMABUF_FD_INVALID) {
230+
return UCS_ERR_UNSUPPORTED;
231+
}
232+
218233
mem_attr_p->dmabuf_fd = dup(dmabuf_fd);
234+
if (mem_attr_p->dmabuf_fd < 0) {
235+
return UCS_ERR_IO_ERROR;
236+
}
237+
238+
/* NOTE: Do not close(dmabuf_fd) here. Level Zero caches this fd
239+
* internally per allocation. Closing it can invalidate the cache and
240+
* lead to fd reuse conflicts with other transports.
241+
*/
219242
}
220243

221244
if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_DMABUF_OFFSET) {
222245
mem_attr_p->dmabuf_offset = UCS_PTR_BYTE_DIFF(mem_info.base_address,
223246
addr);
224247
}
248+
225249
return UCS_OK;
226250
}
227251

@@ -330,4 +354,3 @@ uct_component_t uct_ze_copy_component = {
330354
.md_vfs_init = (uct_component_md_vfs_init_func_t)ucs_empty_function
331355
};
332356
UCT_COMPONENT_REGISTER(&uct_ze_copy_component);
333-

0 commit comments

Comments
 (0)