@@ -523,6 +523,8 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const WorkNCCL& w)
   exception_ = w.exception_;
 }
 
+ProcessGroupNCCL::WorkNCCL::~WorkNCCL() = default;
+
 bool ProcessGroupNCCL::WorkNCCL::isCompleted() {
   if (!ncclComm_->isAborted()) {
     checkAndSetException();
@@ -1009,8 +1011,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   if (options_->global_ranks_in_group.empty()) {
     this->globalRankStart = 0;
   } else {
-    this->globalRankStart =
-        static_cast<int>(options_->global_ranks_in_group[0]);
+    this->globalRankStart = options_->global_ranks_in_group[0];
   }
 
   if (options_->global_ranks_in_group.empty()) {
@@ -1032,9 +1033,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   }
 
   if (ranksAreStrided) {
-    this->globalRankStride = static_cast<int>(
-        options_->global_ranks_in_group[1] -
-        options_->global_ranks_in_group[0]);
+    this->globalRankStride = options_->global_ranks_in_group[1] -
+        options_->global_ranks_in_group[0];
   } else {
     this->globalRankStride = -1;
   }
@@ -1087,7 +1087,7 @@ bool ProcessGroupNCCL::useNonblocking() {
   }
   // 2nd priority: Respect the environment variable
   else if (nbEnv.has_value()) {
-    useNonblocking_ = nbEnv;
+    useNonblocking_ = nbEnv.value();
   }
   // 3rd priority: automatically use nonblocking if we are in eager init mode
   else if (getBoundDeviceId()) {
@@ -1711,7 +1711,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
   }
 
   if (computeDeltaMS(lastTimeHeartBeatCheck, currentTime) >=
-      heartbeatTimeoutInSec_ * 1000l) {
+      heartbeatTimeoutInSec_ * 1000) {
     // Check the heart beat of watchdog thread.
     lastTimeHeartBeatCheck = currentTime;
     auto heartbeat = heartbeat_.load();
@@ -2122,24 +2122,21 @@ void ProcessGroupNCCL::watchdogHandler() {
       kWorkStatusUpdatePeriodMs) {
     ::c10d::C10dLoggingData data;
     // logging integers
-    data.integers["pg_id"] = static_cast<int64_t>(local_id_);
+    data.integers["pg_id"] = local_id_;
     data.integers["rank"] = rank_;
     data.integers["global_rank"] = globalRank();
     data.integers["last_enqueued_work"] = pgStatus_->lastEnqueuedSeq;
     data.integers["last_started_work"] = pgStatus_->lastStartedSeq;
     data.integers["last_completed_work"] = pgStatus_->lastCompletedSeq;
-    data.integers["last_enqueued_numel_in"] =
-        static_cast<int64_t>(pgStatus_->lastEnqueuedNumelIn);
+    data.integers["last_enqueued_numel_in"] = pgStatus_->lastEnqueuedNumelIn;
     data.integers["last_enqueued_numel_out"] =
-        static_cast<int64_t>(pgStatus_->lastEnqueuedNumelOut);
+        pgStatus_->lastEnqueuedNumelOut;
     data.integers["last_completed_numel_in"] =
-        static_cast<int64_t>(pgStatus_->lastCompletedNumelIn);
+        pgStatus_->lastCompletedNumelIn;
     data.integers["last_completed_numel_out"] =
-        static_cast<int64_t>(pgStatus_->lastCompletedNumelOut);
-    data.integers["last_started_numel_in"] =
-        static_cast<int64_t>(pgStatus_->lastStartedNumelIn);
-    data.integers["last_started_numel_out"] =
-        static_cast<int64_t>(pgStatus_->lastStartedNumelOut);
+        pgStatus_->lastCompletedNumelOut;
+    data.integers["last_started_numel_in"] = pgStatus_->lastStartedNumelIn;
+    data.integers["last_started_numel_out"] = pgStatus_->lastStartedNumelOut;
     // logging strings
     data.strings["last_enqueued_work_name"] = pgStatus_->lastEnqueuedWorkName;
     data.strings["last_started_work_name"] = pgStatus_->lastStartedWorkName;
@@ -2689,7 +2686,6 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
       segmentInfo.device == device.index(),
       "Mismatch between CUDA memory segment device and current device");
   ncclComm->registerSegment(
-      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       reinterpret_cast<void*>(segmentInfo.address),
       segmentInfo.total_size);
 }
@@ -2915,7 +2911,7 @@ void ProcessGroupNCCL::workEnqueue(
   // get deadlock. Here we enqueue work without outputs_.
   workMetaList_.emplace_back(*work);
   // update the PG status related to the last enqueued work
-  pgStatus_->lastEnqueuedSeq = static_cast<int64_t>(work->seq_);
+  pgStatus_->lastEnqueuedSeq = work->seq_;
   pgStatus_->lastEnqueuedWorkName = opTypeToString(work->opType_);
   pgStatus_->lastEnqueuedNumelIn = work->numelIn_;
   pgStatus_->lastEnqueuedNumelOut = work->numelOut_;
@@ -3864,6 +3860,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce(
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3894,6 +3891,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       !isFloat8Type(tensors.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3948,6 +3946,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
   }
   check_gpu_single_tensor(tensor);
 
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3983,7 +3982,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
             input.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            static_cast<int>(root),
+            root,
             comm,
             stream.stream());
       },
@@ -4023,7 +4022,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_broadcast_oop(
             output.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            static_cast<int>(root),
+            root,
             comm,
             stream.stream());
       },
@@ -4037,6 +4036,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
     std::vector<at::Tensor>& tensors,
     const ReduceOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   if (tensor.is_complex()) {
     TORCH_CHECK(
@@ -4083,7 +4083,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
             input.numel(),
             ncclDataType,
             ncclReduceOp,
-            static_cast<int>(root),
+            root,
             comm,
             stream.stream());
       },
@@ -4137,8 +4137,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
     std::vector<at::Tensor>& inputTensors,
     const AllgatherOptions& opts) {
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
   check_gpu_single_tensor(inputTensor);
+  // @lint-ignore CLANGTIDY
   auto outputTensors_ = outputTensors.back();
 
   RECORD_PARAM_COMMS_DATA(
@@ -4207,19 +4209,19 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
               c10::cuda::CUDACachingAllocator::recordStream(
                   outputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            outputTensors_[j].copy_(
-                outputFlattened[static_cast<int64_t>(j)], true);
+            outputTensors_[j].copy_(outputFlattened[j], true);
           }
         },
         OpType::ALLGATHER,
         "nccl:all_gather");
   } else {
     const auto num_reduces = outputTensors_.size();
     startCoalescing();
-    for (const int64_t i : c10::irange(static_cast<int64_t>(num_reduces))) {
+    for (const int i : c10::irange(num_reduces)) {
       auto& output = outputTensors_[i];
       auto& input = (i == rank_) ? inputTensor : output;
-      auto broadcastOpts = BroadcastOptions{i, int64_t(0), opts.timeout};
+      auto broadcastOpts = BroadcastOptions{
+          static_cast<int64_t>(i), static_cast<int64_t>(0), opts.timeout};
       _broadcast_oop(output, input, broadcastOpts);
     }
     auto work = endCoalescing(OpType::ALLGATHER);
@@ -4240,6 +4242,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
     std::vector<at::Tensor>& outputs,
     std::vector<at::Tensor>& inputs,
     const AllgatherOptions& opts) {
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -4283,8 +4286,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
     std::vector<std::vector<at::Tensor>>& inputTensors,
     const ReduceScatterOptions& opts) {
   TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto outputTensor = outputTensors.back();
   check_gpu_single_tensor(outputTensor);
+  // @lint-ignore CLANGTIDY
   auto inputTensors_ = inputTensors.back();
   TORCH_CHECK(
       !isFloat8Type(outputTensor.scalar_type()),
@@ -4359,8 +4364,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
               c10::cuda::CUDACachingAllocator::recordStream(
                   inputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            inputFlattened[static_cast<int64_t>(j)].copy_(
-                inputTensors_[j], true);
+            inputFlattened[j].copy_(inputTensors_[j], true);
           }
         },
         [&](at::cuda::CUDAStream&,
@@ -4370,7 +4374,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
   } else {
     const auto num_reduces = inputTensors_.size();
     startCoalescing();
-    for (const int i : c10::irange(static_cast<int>(num_reduces))) {
+    for (const int i : c10::irange(num_reduces)) {
       auto& input = inputTensors_[i];
       auto& output = (i == rank_) ? outputTensor : input;
       auto reduceOpts = ReduceOptions{
@@ -4400,6 +4404,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
         "input tensor must be the same size as output size times world size");
   }
 
+  // @lint-ignore CLANGTIDY
   const auto& tensor = outputTensor;
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
@@ -4469,6 +4474,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       !isFloat8Type(inputs.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -4533,13 +4539,13 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       this->getSize()); // worldSize
 
   // Device to use for barrier
-  c10::DeviceIndex barDevIdx = -1;
+  int barDevIdx = -1;
 
   // Select device to use for barrier
   // 1st choice: Use user defined GPU device ids if provided
   if (!opts.device_ids.empty()) {
     // Use the first device id because PG NCCL is single-device now
-    barDevIdx = static_cast<c10::DeviceIndex>(opts.device_ids[0]);
+    barDevIdx = opts.device_ids[0];
   } else if (getBoundDeviceId()) {
     // 2nd choice: Use the bound GPU device id if available.
     // Bounded device id can be passed to `init_process_group`.
@@ -4556,12 +4562,12 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
   // Note: it is better to use global rank because the group-local rank can be
   // offset wrt the device id if intra-node GPUs are sharded into multiple
   // dimensions.
-  barDevIdx = static_cast<c10::DeviceIndex>(globalRank() % localDeviceCount_);
+  barDevIdx = static_cast<int16_t>(globalRank() % localDeviceCount_);
   LOG(WARNING)
       << logPrefix()
       << c10::str(
              "using GPU ",
-             static_cast<int>(barDevIdx),
+             barDevIdx,
              " to perform barrier as devices used by this process are currently unknown. ",
              "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
              "Specify device_ids in barrier() to force use of a particular device, ",
@@ -4572,7 +4578,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       ValueError,
       barDevIdx >= 0,
       "Failed to infer a GPU device id to perform barrier. ");
-  auto barDevice = at::Device(at::DeviceType::CUDA, barDevIdx);
+  auto barDevice = at::Device(
+      at::DeviceType::CUDA, static_cast<c10::DeviceIndex>(barDevIdx));
 
   // Create a dummy tensor on the device
   // Note: we use zeros() instead of empty() to prevent barrier from triggering
@@ -4769,6 +4776,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::send(
     int dstRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4817,6 +4825,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::recv(
     int srcRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4895,6 +4904,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
   assertRootRank(invalidArgument, opts.rootRank, size_);
 
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
 
   std::vector<at::Tensor> outputs;
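
One recurring change above swaps a direct assignment from a `std::optional` for an explicit `.value()` call (the `useNonblocking` hunk). Below is a minimal standalone sketch of that pattern in plain C++17; the names are illustrative, not taken from the patch, and it assumes the destination is a plain `bool` (in the real code the destination member may itself be an optional):

```cpp
#include <iostream>
#include <optional>

int main() {
  // Stand-in for an environment-variable override (hypothetical name).
  std::optional<bool> nbEnv = true;

  bool useNonblocking = false;
  if (nbEnv.has_value()) {
    // .value() extracts the contained bool; it throws
    // std::bad_optional_access if the optional were empty. Assigning the
    // optional itself to a plain bool would not compile, because
    // std::optional's operator bool is explicit.
    useNonblocking = nbEnv.value();
  }
  std::cout << std::boolalpha << useNonblocking << '\n';  // prints "true"
  return 0;
}
```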