@@ -448,7 +448,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
448
448
449
449
platform::CUDADeviceGuard cuda_guard;
450
450
451
- if (FLAGS_use_stream_safe_cuda_allocator) {
451
+ {
452
+ platform::NCCLGroupGuard nccl_guard;
452
453
for (size_t i = 0 ; i < tensors.size (); ++i) {
453
454
cuda_guard.SetDevice (places[i]);
454
455
gpuStream_t nccl_stream;
@@ -460,12 +461,11 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
460
461
} else {
461
462
nccl_stream = places_to_ctx_[key][i]->stream ();
462
463
}
463
- memory::RecordStream (tensors[i]. Holder (), nccl_stream);
464
+ fn (tensors[i], nccl_comms[i]-> GetNcclComm (), nccl_stream, dst_rank );
464
465
}
465
466
}
466
467
467
- {
468
- platform::NCCLGroupGuard nccl_guard;
468
+ if (FLAGS_use_stream_safe_cuda_allocator) {
469
469
for (size_t i = 0 ; i < tensors.size (); ++i) {
470
470
cuda_guard.SetDevice (places[i]);
471
471
gpuStream_t nccl_stream;
@@ -477,7 +477,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
477
477
} else {
478
478
nccl_stream = places_to_ctx_[key][i]->stream ();
479
479
}
480
- fn (tensors[i], nccl_comms[i]-> GetNcclComm (), nccl_stream, dst_rank );
480
+ memory::RecordStream (tensors[i]. Holder (), nccl_stream);
481
481
}
482
482
}
483
483
@@ -516,20 +516,20 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
516
516
// construct an uninitialized guard for device
517
517
platform::CUDADeviceGuard cuda_guard;
518
518
519
- if (FLAGS_use_stream_safe_cuda_allocator) {
519
+ {
520
+ platform::NCCLGroupGuard nccl_guard;
520
521
for (size_t i = 0 ; i < tensors.size (); ++i) {
521
522
cuda_guard.SetDevice (places[i]);
522
- memory::RecordStream (tensors[i]. Holder (),
523
- places_to_ctx_[key] [i]->stream () );
523
+ const auto & nccl_stream = places_to_ctx_[key][i]-> stream ();
524
+ fn (tensors[i], nccl_comms [i]->GetNcclComm (), nccl_stream, dst_rank );
524
525
}
525
526
}
526
527
527
- {
528
- platform::NCCLGroupGuard nccl_guard;
528
+ if (FLAGS_use_stream_safe_cuda_allocator) {
529
529
for (size_t i = 0 ; i < tensors.size (); ++i) {
530
530
cuda_guard.SetDevice (places[i]);
531
- const auto & nccl_stream = places_to_ctx_[key][i]-> stream ();
532
- fn (tensors[i], nccl_comms [i]->GetNcclComm (), nccl_stream, dst_rank );
531
+ memory::RecordStream (tensors[i]. Holder (),
532
+ places_to_ctx_[key] [i]->stream () );
533
533
}
534
534
}
535
535
0 commit comments