Skip to content

Commit 3350336

Browse files
sewangoh-aws authored and bwbarrett committed
scheduler: move scheduler to domain level
The scheduler was at the device level and used two locks: the freelist lock and scheduler->rr_lock. When a device is shared by multiple threads, this causes lock contention when posting sends. As a short-term solution, move the scheduler to the domain so that scheduler resources are not shared by multiple threads. Signed-off-by: Se Wang Oh <[email protected]>
1 parent 7a707a5 commit 3350336

File tree

2 files changed

+41
-45
lines changed

2 files changed

+41
-45
lines changed

include/nccl_ofi_rdma.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -802,9 +802,6 @@ typedef struct nccl_net_ofi_rdma_device {
802802
* and its base struct. */
803803
nccl_net_ofi_device_t base;
804804

805-
/* Message scheduler */
806-
nccl_net_ofi_scheduler_t *scheduler;
807-
808805
/* Number of rails */
809806
uint16_t num_rails;
810807

@@ -850,6 +847,9 @@ typedef struct nccl_net_ofi_rdma_domain {
850847

851848
/* List of endpoints and set of addresses they have connections to */
852849
nccl_ofi_ep_addr_list_t *ep_addr_list;
850+
851+
/* Message scheduler */
852+
nccl_net_ofi_scheduler_t *scheduler;
853853
} nccl_net_ofi_rdma_domain_t;
854854

855855

src/nccl_ofi_rdma.cpp

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -159,21 +159,6 @@ static nccl_net_ofi_rdma_plugin_t *rdma_device_get_plugin(nccl_net_ofi_rdma_devi
159159
return (nccl_net_ofi_rdma_plugin_t*)device->base.plugin;
160160
}
161161

162-
163-
static nccl_net_ofi_rdma_ep_t *rdma_req_get_ep(nccl_net_ofi_rdma_req_t *req)
164-
{
165-
/* TODO: this function doesn't work for rx buffers, which have no
166-
associated comm */
167-
assert(req->comm);
168-
return (nccl_net_ofi_rdma_ep_t *)req->comm->ep;
169-
}
170-
171-
172-
static nccl_net_ofi_rdma_device_t *rdma_req_get_device(nccl_net_ofi_rdma_req_t *req)
173-
{
174-
return (nccl_net_ofi_rdma_device_t *)rdma_req_get_ep(req)->base.domain->device;
175-
}
176-
177162
/*
178163
* @brief Get endpoint communicator with given ID
179164
*/
@@ -905,7 +890,9 @@ static inline int update_send_data_from_remote(nccl_net_ofi_rdma_send_comm_t *s_
905890
assert(ep != NULL);
906891

907892
nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep);
908-
nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
893+
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
894+
assert(domain != NULL);
895+
nccl_net_ofi_scheduler_t *scheduler = domain->scheduler;
909896

910897
rdma_req_send_data_t *send_data = get_send_data(req);
911898
rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(rx_buff_req);
@@ -2200,8 +2187,11 @@ static inline int free_send_req(nccl_net_ofi_rdma_req_t *req,
22002187
}
22012188

22022189
if (send_data->schedule) {
2203-
nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req);
2204-
nccl_net_ofi_release_schedule(device->scheduler, send_data->schedule);
2190+
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base.base.ep;
2191+
assert(ep != NULL);
2192+
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
2193+
assert(domain != NULL);
2194+
nccl_net_ofi_release_schedule(domain->scheduler, send_data->schedule);
22052195
send_data->schedule = NULL;
22062196
}
22072197

@@ -2278,8 +2268,11 @@ static inline int free_send_ctrl_req(nccl_net_ofi_rdma_req_t *req,
22782268
rdma_req_send_ctrl_data_t *send_ctrl_data = get_send_ctrl_data(req);
22792269

22802270
if (send_ctrl_data->ctrl_schedule != NULL) {
2281-
nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req);
2282-
nccl_net_ofi_release_schedule(device->scheduler, send_ctrl_data->ctrl_schedule);
2271+
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
2272+
assert(ep != NULL);
2273+
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
2274+
assert(domain != NULL);
2275+
nccl_net_ofi_release_schedule(domain->scheduler, send_ctrl_data->ctrl_schedule);
22832276
send_ctrl_data->ctrl_schedule = NULL;
22842277
}
22852278

@@ -2304,8 +2297,11 @@ static inline int free_send_close_req(nccl_net_ofi_rdma_req_t *req,
23042297
rdma_req_send_close_data_t *send_close_data = req_get_send_close_data(req);
23052298

23062299
if (send_close_data->ctrl_schedule) {
2307-
nccl_net_ofi_rdma_device_t *device = rdma_req_get_device(req);
2308-
nccl_net_ofi_release_schedule(device->scheduler, send_close_data->ctrl_schedule);
2300+
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
2301+
assert(ep != NULL);
2302+
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
2303+
assert(domain != NULL);
2304+
nccl_net_ofi_release_schedule(domain->scheduler, send_close_data->ctrl_schedule);
23092305
send_close_data->ctrl_schedule = NULL;
23102306
}
23112307

@@ -3269,8 +3265,10 @@ static inline int insert_send_ctrl_req(
32693265
nccl_net_ofi_rdma_req_t *recv_req,
32703266
bool recv_completion_optional)
32713267
{
3272-
nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
32733268
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
3269+
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
3270+
assert(domain != NULL);
3271+
nccl_net_ofi_scheduler_t *scheduler = domain->scheduler;
32743272
nccl_net_ofi_rdma_req_t *send_ctrl_req = allocate_req(r_comm->nccl_ofi_reqs_fl);
32753273
if (OFI_UNLIKELY(send_ctrl_req == NULL)) {
32763274
NCCL_OFI_WARN("Unable to get NCCL OFI send control request for device %d",
@@ -5311,7 +5309,9 @@ static int alloc_rdma_send_req(nccl_net_ofi_rdma_send_comm_t *s_comm,
53115309
{
53125310
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base.base.ep;
53135311
nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep);
5314-
nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
5312+
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
5313+
assert(domain != NULL);
5314+
nccl_net_ofi_scheduler_t *scheduler = domain->scheduler;
53155315
*ret_req = NULL;
53165316

53175317
/* Allocate NCCL OFI request */
@@ -7397,6 +7397,13 @@ nccl_net_ofi_rdma_domain_free(nccl_net_ofi_domain_t *base_domain)
73977397
}
73987398
free(domain->domain_rails);
73997399

7400+
if (domain->scheduler) {
7401+
ret = domain->scheduler->fini(domain->scheduler);
7402+
if (ret != 0) {
7403+
NCCL_OFI_WARN("Cleanup of device failed, scheduler_fini returned %s",
7404+
strerror(-ret));
7405+
}
7406+
}
74007407
if (domain->ep_addr_list) {
74017408
delete domain->ep_addr_list;
74027409
domain->ep_addr_list = NULL;
@@ -7497,6 +7504,13 @@ static nccl_net_ofi_domain_t *nccl_net_ofi_rdma_device_create_domain(nccl_net_of
74977504
goto error;
74987505
}
74997506

7507+
/* Create scheduler */
7508+
ret = nccl_net_ofi_threshold_scheduler_init(domain->num_rails, &domain->scheduler);
7509+
if (ret != 0) {
7510+
goto error;
7511+
}
7512+
assert(domain->scheduler);
7513+
75007514
error:
75017515
if (ret != 0) {
75027516
domain->base.release(&(domain->base), false, false);
@@ -7658,17 +7672,6 @@ nccl_net_ofi_rdma_device_release(nccl_net_ofi_device_t *base_device)
76587672
free(device->device_rails);
76597673
}
76607674

7661-
if (device->scheduler) {
7662-
ret = device->scheduler->fini(device->scheduler);
7663-
if (ret != 0) {
7664-
NCCL_OFI_WARN("Cleanup of device failed, scheduler_fini returned %s",
7665-
strerror(-ret));
7666-
if (first_error == 0) {
7667-
first_error = ret;
7668-
}
7669-
}
7670-
}
7671-
76727675
if (device->comms) {
76737676
free(device->comms);
76747677
device->comms = NULL;
@@ -7786,13 +7789,6 @@ static nccl_net_ofi_rdma_device_t *nccl_net_ofi_rdma_device_create(
77867789
NCCL_OFI_INFO(NCCL_NET, "Created device with %d rails", length);
77877790
}
77887791

7789-
/* Create scheduler */
7790-
ret = nccl_net_ofi_threshold_scheduler_init(length, &device->scheduler);
7791-
if (ret != 0) {
7792-
goto error;
7793-
}
7794-
assert(device->scheduler);
7795-
77967792
/* Set NIC information */
77977793
device->num_rails = length;
77987794
device->device_rails = create_device_rail_array(info_list, length);

0 commit comments

Comments
 (0)