@@ -159,21 +159,6 @@ static nccl_net_ofi_rdma_plugin_t *rdma_device_get_plugin(nccl_net_ofi_rdma_devi
159
159
return (nccl_net_ofi_rdma_plugin_t *)device->base .plugin ;
160
160
}
161
161
162
-
163
- static nccl_net_ofi_rdma_ep_t *rdma_req_get_ep (nccl_net_ofi_rdma_req_t *req)
164
- {
165
- /* TODO: this function doesn't work for rx buffers, which have no
166
- associated comm */
167
- assert (req->comm );
168
- return (nccl_net_ofi_rdma_ep_t *)req->comm ->ep ;
169
- }
170
-
171
-
172
- static nccl_net_ofi_rdma_device_t *rdma_req_get_device (nccl_net_ofi_rdma_req_t *req)
173
- {
174
- return (nccl_net_ofi_rdma_device_t *)rdma_req_get_ep (req)->base .domain ->device ;
175
- }
176
-
177
162
/*
178
163
* @brief Get endpoint communicator with given ID
179
164
*/
@@ -905,7 +890,9 @@ static inline int update_send_data_from_remote(nccl_net_ofi_rdma_send_comm_t *s_
905
890
assert (ep != NULL );
906
891
907
892
nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device (ep);
908
- nccl_net_ofi_scheduler_t *scheduler = device->scheduler ;
893
+ nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain (ep);
894
+ assert (domain != NULL );
895
+ nccl_net_ofi_scheduler_t *scheduler = domain->scheduler ;
909
896
910
897
rdma_req_send_data_t *send_data = get_send_data (req);
911
898
rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data (rx_buff_req);
@@ -2200,8 +2187,11 @@ static inline int free_send_req(nccl_net_ofi_rdma_req_t *req,
2200
2187
}
2201
2188
2202
2189
if (send_data->schedule ) {
2203
- nccl_net_ofi_rdma_device_t *device = rdma_req_get_device (req);
2204
- nccl_net_ofi_release_schedule (device->scheduler , send_data->schedule );
2190
+ nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base .base .ep ;
2191
+ assert (ep != NULL );
2192
+ nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain (ep);
2193
+ assert (domain != NULL );
2194
+ nccl_net_ofi_release_schedule (domain->scheduler , send_data->schedule );
2205
2195
send_data->schedule = NULL ;
2206
2196
}
2207
2197
@@ -2278,8 +2268,11 @@ static inline int free_send_ctrl_req(nccl_net_ofi_rdma_req_t *req,
2278
2268
rdma_req_send_ctrl_data_t *send_ctrl_data = get_send_ctrl_data (req);
2279
2269
2280
2270
if (send_ctrl_data->ctrl_schedule != NULL ) {
2281
- nccl_net_ofi_rdma_device_t *device = rdma_req_get_device (req);
2282
- nccl_net_ofi_release_schedule (device->scheduler , send_ctrl_data->ctrl_schedule );
2271
+ nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base .base .ep ;
2272
+ assert (ep != NULL );
2273
+ nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain (ep);
2274
+ assert (domain != NULL );
2275
+ nccl_net_ofi_release_schedule (domain->scheduler , send_ctrl_data->ctrl_schedule );
2283
2276
send_ctrl_data->ctrl_schedule = NULL ;
2284
2277
}
2285
2278
@@ -2304,8 +2297,11 @@ static inline int free_send_close_req(nccl_net_ofi_rdma_req_t *req,
2304
2297
rdma_req_send_close_data_t *send_close_data = req_get_send_close_data (req);
2305
2298
2306
2299
if (send_close_data->ctrl_schedule ) {
2307
- nccl_net_ofi_rdma_device_t *device = rdma_req_get_device (req);
2308
- nccl_net_ofi_release_schedule (device->scheduler , send_close_data->ctrl_schedule );
2300
+ nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base .base .ep ;
2301
+ assert (ep != NULL );
2302
+ nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain (ep);
2303
+ assert (domain != NULL );
2304
+ nccl_net_ofi_release_schedule (domain->scheduler , send_close_data->ctrl_schedule );
2309
2305
send_close_data->ctrl_schedule = NULL ;
2310
2306
}
2311
2307
@@ -3269,8 +3265,10 @@ static inline int insert_send_ctrl_req(
3269
3265
nccl_net_ofi_rdma_req_t *recv_req,
3270
3266
bool recv_completion_optional)
3271
3267
{
3272
- nccl_net_ofi_scheduler_t *scheduler = device->scheduler ;
3273
3268
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base .base .ep ;
3269
+ nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain (ep);
3270
+ assert (domain != NULL );
3271
+ nccl_net_ofi_scheduler_t *scheduler = domain->scheduler ;
3274
3272
nccl_net_ofi_rdma_req_t *send_ctrl_req = allocate_req (r_comm->nccl_ofi_reqs_fl );
3275
3273
if (OFI_UNLIKELY (send_ctrl_req == NULL )) {
3276
3274
NCCL_OFI_WARN (" Unable to get NCCL OFI send control request for device %d" ,
@@ -5311,7 +5309,9 @@ static int alloc_rdma_send_req(nccl_net_ofi_rdma_send_comm_t *s_comm,
5311
5309
{
5312
5310
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)s_comm->base .base .ep ;
5313
5311
nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device (ep);
5314
- nccl_net_ofi_scheduler_t *scheduler = device->scheduler ;
5312
+ nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain (ep);
5313
+ assert (domain != NULL );
5314
+ nccl_net_ofi_scheduler_t *scheduler = domain->scheduler ;
5315
5315
*ret_req = NULL ;
5316
5316
5317
5317
/* Allocate NCCL OFI request */
@@ -7397,6 +7397,13 @@ nccl_net_ofi_rdma_domain_free(nccl_net_ofi_domain_t *base_domain)
7397
7397
}
7398
7398
free (domain->domain_rails );
7399
7399
7400
+ if (domain->scheduler ) {
7401
+ ret = domain->scheduler ->fini (domain->scheduler );
7402
+ if (ret != 0 ) {
7403
+ NCCL_OFI_WARN (" Cleanup of device failed, scheduler_fini returned %s" ,
7404
+ strerror (-ret));
7405
+ }
7406
+ }
7400
7407
if (domain->ep_addr_list ) {
7401
7408
delete domain->ep_addr_list ;
7402
7409
domain->ep_addr_list = NULL ;
@@ -7497,6 +7504,13 @@ static nccl_net_ofi_domain_t *nccl_net_ofi_rdma_device_create_domain(nccl_net_of
7497
7504
goto error;
7498
7505
}
7499
7506
7507
+ /* Create scheduler */
7508
+ ret = nccl_net_ofi_threshold_scheduler_init (domain->num_rails , &domain->scheduler );
7509
+ if (ret != 0 ) {
7510
+ goto error;
7511
+ }
7512
+ assert (domain->scheduler );
7513
+
7500
7514
error:
7501
7515
if (ret != 0 ) {
7502
7516
domain->base .release (&(domain->base ), false , false );
@@ -7658,17 +7672,6 @@ nccl_net_ofi_rdma_device_release(nccl_net_ofi_device_t *base_device)
7658
7672
free (device->device_rails );
7659
7673
}
7660
7674
7661
- if (device->scheduler ) {
7662
- ret = device->scheduler ->fini (device->scheduler );
7663
- if (ret != 0 ) {
7664
- NCCL_OFI_WARN (" Cleanup of device failed, scheduler_fini returned %s" ,
7665
- strerror (-ret));
7666
- if (first_error == 0 ) {
7667
- first_error = ret;
7668
- }
7669
- }
7670
- }
7671
-
7672
7675
if (device->comms ) {
7673
7676
free (device->comms );
7674
7677
device->comms = NULL ;
@@ -7786,13 +7789,6 @@ static nccl_net_ofi_rdma_device_t *nccl_net_ofi_rdma_device_create(
7786
7789
NCCL_OFI_INFO (NCCL_NET, " Created device with %d rails" , length);
7787
7790
}
7788
7791
7789
- /* Create scheduler */
7790
- ret = nccl_net_ofi_threshold_scheduler_init (length, &device->scheduler );
7791
- if (ret != 0 ) {
7792
- goto error;
7793
- }
7794
- assert (device->scheduler );
7795
-
7796
7792
/* Set NIC information */
7797
7793
device->num_rails = length;
7798
7794
device->device_rails = create_device_rail_array (info_list, length);
0 commit comments