@@ -523,8 +523,6 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const WorkNCCL& w)
   exception_ = w.exception_;
 }
 
-ProcessGroupNCCL::WorkNCCL::~WorkNCCL() = default;
-
 bool ProcessGroupNCCL::WorkNCCL::isCompleted() {
   if (!ncclComm_->isAborted()) {
     checkAndSetException();
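
Dropping the explicitly defaulted out-of-line destructor is behavior-preserving: with no user-declared destructor, the compiler generates an identical one implicitly. A minimal sketch of the pattern (class and member names hypothetical, not the actual `WorkNCCL` layout):

```cpp
#include <memory>

// Before: destructor declared in the class and defaulted out of line.
struct WorkBefore {
  ~WorkBefore();
  std::shared_ptr<int> state_;
};
WorkBefore::~WorkBefore() = default;

// After: no declaration at all; the implicitly generated destructor
// destroys state_ exactly the same way, with two fewer lines to maintain.
struct WorkAfter {
  std::shared_ptr<int> state_;
};
```

One caveat worth noting: an out-of-line destructor is still required when a class holds smart pointers to forward-declared types (the pimpl pattern); that does not apply here, so the removal is safe.
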
@@ -1011,7 +1009,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   if (options_->global_ranks_in_group.empty()) {
     this->globalRankStart = 0;
   } else {
-    this->globalRankStart = options_->global_ranks_in_group[0];
+    this->globalRankStart =
+        static_cast<int>(options_->global_ranks_in_group[0]);
   }
 
   if (options_->global_ranks_in_group.empty()) {
@@ -1033,8 +1032,9 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   }
 
   if (ranksAreStrided) {
-    this->globalRankStride = options_->global_ranks_in_group[1] -
-        options_->global_ranks_in_group[0];
+    this->globalRankStride = static_cast<int>(
+        options_->global_ranks_in_group[1] -
+        options_->global_ranks_in_group[0]);
   } else {
     this->globalRankStride = -1;
   }
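
Both the `globalRankStart` and `globalRankStride` hunks follow the same pattern: the subscripted values come from an unsigned 64-bit container (presumably `std::vector<uint64_t> global_ranks_in_group`), so assigning them to `int` members is a narrowing conversion that clang-tidy flags unless it is made explicit. A self-contained sketch of the before/after:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> global_ranks_in_group{8, 12, 16, 20};
  // An implicit uint64_t -> int assignment compiles but trips
  // bugprone-narrowing-conversions (and -Wconversion); the explicit
  // cast documents that the narrowing is intentional.
  int globalRankStart = static_cast<int>(global_ranks_in_group[0]);
  int globalRankStride = static_cast<int>(
      global_ranks_in_group[1] - global_ranks_in_group[0]);
  std::cout << globalRankStart << " " << globalRankStride << "\n";
  return 0;
}
```
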
@@ -1087,7 +1087,7 @@ bool ProcessGroupNCCL::useNonblocking() {
   }
   // 2nd priority: Respect the environment variable
   else if (nbEnv.has_value()) {
-    useNonblocking_ = nbEnv.value();
+    useNonblocking_ = nbEnv;
   }
   // 3rd priority: automatically use nonblocking if we are in eager init mode
   else if (getBoundDeviceId()) {
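
Replacing `nbEnv.value()` with a direct assignment of `nbEnv` reads as if `useNonblocking_` is itself a `std::optional<bool>`: optional-to-optional assignment copies the engaged value without an unwrap/rewrap round trip and cannot throw `std::bad_optional_access`. A sketch under that assumption (names mirror the diff, but the surrounding scaffolding is hypothetical):

```cpp
#include <optional>

std::optional<bool> useNonblocking_;  // unset until a priority rule decides

void applyEnvSetting(const std::optional<bool>& nbEnv) {
  if (nbEnv.has_value()) {
    // Equivalent to useNonblocking_ = nbEnv.value(), but assigns the
    // whole optional; no unwrap needed since has_value() already holds.
    useNonblocking_ = nbEnv;
  }
}

int main() {
  applyEnvSetting(std::optional<bool>{true});
  return useNonblocking_.value_or(false) ? 0 : 1;
}
```
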
@@ -1711,7 +1711,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
     }
 
     if (computeDeltaMS(lastTimeHeartBeatCheck, currentTime) >=
-        heartbeatTimeoutInSec_ * 1000) {
+        heartbeatTimeoutInSec_ * 1000l) {
       // Check the heart beat of watchdog thread.
       lastTimeHeartBeatCheck = currentTime;
       auto heartbeat = heartbeat_.load();
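
The `l` suffix matters because `heartbeatTimeoutInSec_ * 1000` is otherwise evaluated in `int` arithmetic (assuming the field is an `int`), so a large timeout could overflow before the comparison ever sees a wider type. With `1000l`, the usual arithmetic conversions promote the whole product to `long` first. A standalone illustration:

```cpp
#include <cstdio>

int main() {
  int heartbeatTimeoutInSec = 3000000;  // exaggerated to expose the issue
  // heartbeatTimeoutInSec * 1000 would be computed in int and overflow
  // (undefined behavior) before being widened:
  //   long bad = heartbeatTimeoutInSec * 1000;
  // One long operand forces the multiplication into long arithmetic
  // (64-bit on LP64 platforms), so the product survives intact:
  long good = heartbeatTimeoutInSec * 1000l;
  std::printf("%ld\n", good);  // 3000000000
  return 0;
}
```
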
@@ -2122,21 +2122,24 @@ void ProcessGroupNCCL::watchdogHandler() {
         kWorkStatusUpdatePeriodMs) {
       ::c10d::C10dLoggingData data;
       // logging integers
-      data.integers["pg_id"] = local_id_;
+      data.integers["pg_id"] = static_cast<int64_t>(local_id_);
       data.integers["rank"] = rank_;
       data.integers["global_rank"] = globalRank();
       data.integers["last_enqueued_work"] = pgStatus_->lastEnqueuedSeq;
       data.integers["last_started_work"] = pgStatus_->lastStartedSeq;
       data.integers["last_completed_work"] = pgStatus_->lastCompletedSeq;
-      data.integers["last_enqueued_numel_in"] = pgStatus_->lastEnqueuedNumelIn;
+      data.integers["last_enqueued_numel_in"] =
+          static_cast<int64_t>(pgStatus_->lastEnqueuedNumelIn);
       data.integers["last_enqueued_numel_out"] =
-          pgStatus_->lastEnqueuedNumelOut;
+          static_cast<int64_t>(pgStatus_->lastEnqueuedNumelOut);
       data.integers["last_completed_numel_in"] =
-          pgStatus_->lastCompletedNumelIn;
+          static_cast<int64_t>(pgStatus_->lastCompletedNumelIn);
       data.integers["last_completed_numel_out"] =
-          pgStatus_->lastCompletedNumelOut;
-      data.integers["last_started_numel_in"] = pgStatus_->lastStartedNumelIn;
-      data.integers["last_started_numel_out"] = pgStatus_->lastStartedNumelOut;
+          static_cast<int64_t>(pgStatus_->lastCompletedNumelOut);
+      data.integers["last_started_numel_in"] =
+          static_cast<int64_t>(pgStatus_->lastStartedNumelIn);
+      data.integers["last_started_numel_out"] =
+          static_cast<int64_t>(pgStatus_->lastStartedNumelOut);
       // logging strings
       data.strings["last_enqueued_work_name"] = pgStatus_->lastEnqueuedWorkName;
       data.strings["last_started_work_name"] = pgStatus_->lastStartedWorkName;
@@ -2686,6 +2689,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
             segmentInfo.device == device.index(),
             "Mismatch between CUDA memory segment device and current device");
         ncclComm->registerSegment(
+            // NOLINTNEXTLINE(performance-no-int-to-ptr)
             reinterpret_cast<void*>(segmentInfo.address),
             segmentInfo.total_size);
       }
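
clang-tidy's `performance-no-int-to-ptr` fires on every integer-to-pointer `reinterpret_cast`, since such casts can defeat alias analysis. Here the cast is unavoidable: the caching allocator's snapshot reports segment addresses numerically, so the warning is silenced for that single line with `NOLINTNEXTLINE` rather than disabled file-wide. A sketch of the pattern with a hypothetical stand-in for the snapshot entry:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for an allocator snapshot entry that reports a
// mapped segment by numeric address, as the diff's segmentInfo does.
struct SegmentInfo {
  std::uintptr_t address;
  std::size_t total_size;
};

void registerSegment(const SegmentInfo& seg) {
  // The narrowest possible suppression: next line only, one named check.
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  void* ptr = reinterpret_cast<void*>(seg.address);
  (void)ptr;  // would be handed to the registration API along with
              // seg.total_size
}

int main() {
  registerSegment(SegmentInfo{0x7f0000000000u, 1u << 20});
  return 0;
}
```
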
@@ -2911,7 +2915,7 @@ void ProcessGroupNCCL::workEnqueue(
     // get deadlock. Here we enqueue work without outputs_.
     workMetaList_.emplace_back(*work);
     // update the PG status related to the last enqueued work
-    pgStatus_->lastEnqueuedSeq = work->seq_;
+    pgStatus_->lastEnqueuedSeq = static_cast<int64_t>(work->seq_);
     pgStatus_->lastEnqueuedWorkName = opTypeToString(work->opType_);
     pgStatus_->lastEnqueuedNumelIn = work->numelIn_;
     pgStatus_->lastEnqueuedNumelOut = work->numelOut_;
@@ -3860,7 +3864,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce(
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
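
The deleted `// @lint-ignore CLANGTIDY` lines (here and in the hunks below) appear to be suppressions for Meta's internal lint wrapper rather than markers clang-tidy itself understands; once the flagged code is fixed or annotated properly, they are dead weight. For reference, the suppression forms clang-tidy does honor, in a runnable scrap:

```cpp
#include <vector>

int main() {
  std::vector<int> values{1, 2, 3};
  // Suppress every check on one statement:
  int last = values.back();  // NOLINT
  // Suppress one named check, next line only:
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  int scaled = last * 1000;
  return scaled > 0 ? 0 : 1;
}
```
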
@@ -3891,7 +3894,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       !isFloat8Type(tensors.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3946,7 +3948,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
   }
   check_gpu_single_tensor(tensor);
 
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3982,7 +3983,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
             input.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            root,
+            static_cast<int>(root),
             comm,
             stream.stream());
       },
@@ -4022,7 +4023,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_broadcast_oop(
             output.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            root,
+            static_cast<int>(root),
             comm,
             stream.stream());
       },
@@ -4036,7 +4037,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
     std::vector<at::Tensor>& tensors,
     const ReduceOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   if (tensor.is_complex()) {
     TORCH_CHECK(
@@ -4083,7 +4083,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
             input.numel(),
             ncclDataType,
             ncclReduceOp,
-            root,
+            static_cast<int>(root),
             comm,
             stream.stream());
       },
@@ -4137,10 +4137,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
     std::vector<at::Tensor>& inputTensors,
     const AllgatherOptions& opts) {
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
   check_gpu_single_tensor(inputTensor);
-  // @lint-ignore CLANGTIDY
   auto outputTensors_ = outputTensors.back();
 
   RECORD_PARAM_COMMS_DATA(
@@ -4209,19 +4207,19 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
               c10::cuda::CUDACachingAllocator::recordStream(
                   outputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            outputTensors_[j].copy_(outputFlattened[j], true);
+            outputTensors_[j].copy_(
+                outputFlattened[static_cast<int64_t>(j)], true);
           }
         },
         OpType::ALLGATHER,
         "nccl:all_gather");
   } else {
     const auto num_reduces = outputTensors_.size();
     startCoalescing();
-    for (const int i : c10::irange(num_reduces)) {
+    for (const int64_t i : c10::irange(static_cast<int64_t>(num_reduces))) {
       auto& output = outputTensors_[i];
       auto& input = (i == rank_) ? inputTensor : output;
-      auto broadcastOpts = BroadcastOptions{
-          static_cast<int64_t>(i), static_cast<int64_t>(0), opts.timeout};
+      auto broadcastOpts = BroadcastOptions{i, int64_t(0), opts.timeout};
       _broadcast_oop(output, input, broadcastOpts);
     }
     auto work = endCoalescing(OpType::ALLGATHER);
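
The loop rewrite makes the index type match its consumers: `num_reduces` is a `size_t`, and `i` initializes `BroadcastOptions::rootRank`, an `int64_t`, so binding `const int i` mixed three integer types in two lines. Casting the bound once and iterating in `int64_t` makes every use of `i` cast-free, which is why the `BroadcastOptions` construction collapses to `{i, int64_t(0), opts.timeout}`. A self-contained sketch (the struct is a simplified stand-in for c10d's real `BroadcastOptions`, which also carries a timeout):

```cpp
#include <cstdint>
#include <vector>

struct BroadcastOptions {  // simplified stand-in
  int64_t rootRank;
  int64_t rootTensor;
};

int main() {
  std::vector<int> outputs(4);
  const auto num_reduces = outputs.size();  // size_t
  // Sizing the index as int64_t at the loop means i can initialize
  // rootRank directly -- no per-iteration static_cast and no
  // signed/unsigned comparison warnings against rank checks.
  for (int64_t i = 0; i < static_cast<int64_t>(num_reduces); ++i) {
    BroadcastOptions broadcastOpts{i, int64_t(0)};
    (void)broadcastOpts;
  }
  return 0;
}
```
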
@@ -4242,7 +4240,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
     std::vector<at::Tensor>& outputs,
     std::vector<at::Tensor>& inputs,
     const AllgatherOptions& opts) {
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -4286,10 +4283,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
     std::vector<std::vector<at::Tensor>>& inputTensors,
     const ReduceScatterOptions& opts) {
   TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto outputTensor = outputTensors.back();
   check_gpu_single_tensor(outputTensor);
-  // @lint-ignore CLANGTIDY
   auto inputTensors_ = inputTensors.back();
   TORCH_CHECK(
       !isFloat8Type(outputTensor.scalar_type()),
@@ -4364,7 +4359,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
               c10::cuda::CUDACachingAllocator::recordStream(
                   inputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            inputFlattened[j].copy_(inputTensors_[j], true);
+            inputFlattened[static_cast<int64_t>(j)].copy_(
+                inputTensors_[j], true);
           }
         },
         [&](at::cuda::CUDAStream&,
@@ -4374,7 +4370,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
   } else {
     const auto num_reduces = inputTensors_.size();
     startCoalescing();
-    for (const int i : c10::irange(num_reduces)) {
+    for (const int i : c10::irange(static_cast<int>(num_reduces))) {
       auto& input = inputTensors_[i];
       auto& output = (i == rank_) ? outputTensor : input;
       auto reduceOpts = ReduceOptions{
@@ -4404,7 +4400,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
         "input tensor must be the same size as output size times world size");
   }
 
-  // @lint-ignore CLANGTIDY
   const auto& tensor = outputTensor;
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
@@ -4474,7 +4469,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       !isFloat8Type(inputs.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -4539,13 +4533,13 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       this->getSize()); // worldSize
 
   // Device to use for barrier
-  int barDevIdx = -1;
+  c10::DeviceIndex barDevIdx = -1;
 
   // Select device to use for barrier
   // 1st choice: Use user defined GPU device ids if provided
   if (!opts.device_ids.empty()) {
     // Use the first device id because PG NCCL is single-device now
-    barDevIdx = opts.device_ids[0];
+    barDevIdx = static_cast<c10::DeviceIndex>(opts.device_ids[0]);
   } else if (getBoundDeviceId()) {
     // 2nd choice: Use the bound GPU device id if available.
     // Bounded device id can be passed to `init_process_group`.
@@ -4562,12 +4556,12 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
   // Note: it is better to use global rank because the group-local rank can be
   // offset wrt the device id if intra-node GPUs are sharded into multiple
   // dimensions.
-  barDevIdx = static_cast<int16_t>(globalRank() % localDeviceCount_);
+  barDevIdx = static_cast<c10::DeviceIndex>(globalRank() % localDeviceCount_);
   LOG(WARNING)
       << logPrefix()
       << c10::str(
              "using GPU ",
-             barDevIdx,
+             static_cast<int>(barDevIdx),
              " to perform barrier as devices used by this process are currently unknown. ",
              "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
              "Specify device_ids in barrier() to force use of a particular device, ",
@@ -4578,8 +4572,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       ValueError,
       barDevIdx >= 0,
       "Failed to infer a GPU device id to perform barrier. ");
-  auto barDevice = at::Device(
-      at::DeviceType::CUDA, static_cast<c10::DeviceIndex>(barDevIdx));
+  auto barDevice = at::Device(at::DeviceType::CUDA, barDevIdx);
 
   // Create a dummy tensor on the device
   // Note: we use zeros() instead of empty() to prevent barrier from triggering
@@ -4776,7 +4769,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::send(
     int dstRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4825,7 +4817,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::recv(
     int srcRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4904,7 +4895,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
   assertRootRank(invalidArgument, opts.rootRank, size_);
 
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
 
   std::vector<at::Tensor> outputs;