
Commit 42f4fda

cyyever authored and pytorchmergebot committed
Enable clang-tidy on torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (pytorch#143806)
Fixes #ISSUE_NUMBER
Pull Request resolved: pytorch#143806
Approved by: https://github.com/kwen2501
1 parent 6f07847 commit 42f4fda

10 files changed, +45 -54 lines changed


.lintrunner.toml

Lines changed: 0 additions & 1 deletion
@@ -261,7 +261,6 @@ exclude_patterns = [
     'torch/csrc/api/include/torch/linalg.h',
     'torch/csrc/autograd/generated/**',
     'torch/csrc/distributed/**/*.cu',
-    'torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp',
     'torch/csrc/distributed/c10d/WinSockUtils.hpp',
     'torch/csrc/distributed/c10d/quantization/quantization_gpu.h',
     'torch/csrc/dynamo/eval_frame.h',
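
Removing this path from exclude_patterns is what actually turns the check on: paths listed there are skipped by the clang-tidy linter configured in this file, so once the entry is gone ProcessGroupNCCL.cpp is linted like the rest of torch/csrc/distributed/c10d. The source changes below are the fixes (plus one targeted NOLINT suppression) needed to keep it clean.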

torch/csrc/distributed/c10d/FileStore.cpp

Lines changed: 5 additions & 0 deletions
@@ -98,6 +98,7 @@ class Lock {
   }
 
   Lock(const Lock& that) = delete;
+  Lock& operator=(const Lock& other) = delete;
 
   Lock& operator=(Lock&& other) noexcept {
@@ -170,6 +171,10 @@ class File {
     }
     SYSASSERT(fd_, "open(" + path + ")");
   }
+  File(const File&) = delete;
+  File& operator=(const File&) = delete;
+  File(File&&) noexcept = delete;
+  File& operator=(File&&) noexcept = delete;
 
   ~File() {
     ::close(fd_);
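
The additions in this file (and the defaulted ~WorkNCCL() and ~ReduceOp() destructors further down) follow clang-tidy's special-member-function guidance: a class that defines a destructor should also explicitly declare, default, or delete its copy and move operations. A minimal sketch of the pattern, using a hypothetical FdGuard wrapper rather than the actual File/Lock classes from FileStore.cpp:

#include <unistd.h>

// Hypothetical RAII wrapper illustrating the pattern applied above: because
// the destructor releases a resource, copy and move are explicitly deleted
// so the file descriptor can never be double-closed.
class FdGuard {
 public:
  explicit FdGuard(int fd) : fd_(fd) {}
  ~FdGuard() {
    if (fd_ >= 0) {
      ::close(fd_);
    }
  }

  FdGuard(const FdGuard&) = delete;
  FdGuard& operator=(const FdGuard&) = delete;
  FdGuard(FdGuard&&) noexcept = delete;
  FdGuard& operator=(FdGuard&&) noexcept = delete;

 private:
  int fd_;
};

Deleting all four operations makes copying or moving the guard a compile error, so ownership of the descriptor stays unambiguous.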

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 34 additions & 44 deletions
@@ -523,8 +523,6 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const WorkNCCL& w)
   exception_ = w.exception_;
 }
 
-ProcessGroupNCCL::WorkNCCL::~WorkNCCL() = default;
-
 bool ProcessGroupNCCL::WorkNCCL::isCompleted() {
   if (!ncclComm_->isAborted()) {
     checkAndSetException();
@@ -1011,7 +1009,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   if (options_->global_ranks_in_group.empty()) {
     this->globalRankStart = 0;
   } else {
-    this->globalRankStart = options_->global_ranks_in_group[0];
+    this->globalRankStart =
+        static_cast<int>(options_->global_ranks_in_group[0]);
   }
 
   if (options_->global_ranks_in_group.empty()) {
@@ -1033,8 +1032,9 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   }
 
   if (ranksAreStrided) {
-    this->globalRankStride = options_->global_ranks_in_group[1] -
-        options_->global_ranks_in_group[0];
+    this->globalRankStride = static_cast<int>(
+        options_->global_ranks_in_group[1] -
+        options_->global_ranks_in_group[0]);
   } else {
     this->globalRankStride = -1;
   }
@@ -1087,7 +1087,7 @@ bool ProcessGroupNCCL::useNonblocking() {
   }
   // 2nd priority: Respect the environment variable
   else if (nbEnv.has_value()) {
-    useNonblocking_ = nbEnv.value();
+    useNonblocking_ = nbEnv;
   }
   // 3rd priority: automatically use nonblocking if we are in eager init mode
   else if (getBoundDeviceId()) {
@@ -1711,7 +1711,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
     }
 
     if (computeDeltaMS(lastTimeHeartBeatCheck, currentTime) >=
-        heartbeatTimeoutInSec_ * 1000) {
+        heartbeatTimeoutInSec_ * 1000l) {
       // Check the heart beat of watchdog thread.
       lastTimeHeartBeatCheck = currentTime;
       auto heartbeat = heartbeat_.load();
@@ -2122,21 +2122,24 @@ void ProcessGroupNCCL::watchdogHandler() {
         kWorkStatusUpdatePeriodMs) {
       ::c10d::C10dLoggingData data;
       // logging integers
-      data.integers["pg_id"] = local_id_;
+      data.integers["pg_id"] = static_cast<int64_t>(local_id_);
       data.integers["rank"] = rank_;
       data.integers["global_rank"] = globalRank();
       data.integers["last_enqueued_work"] = pgStatus_->lastEnqueuedSeq;
       data.integers["last_started_work"] = pgStatus_->lastStartedSeq;
       data.integers["last_completed_work"] = pgStatus_->lastCompletedSeq;
-      data.integers["last_enqueued_numel_in"] = pgStatus_->lastEnqueuedNumelIn;
+      data.integers["last_enqueued_numel_in"] =
+          static_cast<int64_t>(pgStatus_->lastEnqueuedNumelIn);
       data.integers["last_enqueued_numel_out"] =
-          pgStatus_->lastEnqueuedNumelOut;
+          static_cast<int64_t>(pgStatus_->lastEnqueuedNumelOut);
       data.integers["last_completed_numel_in"] =
-          pgStatus_->lastCompletedNumelIn;
+          static_cast<int64_t>(pgStatus_->lastCompletedNumelIn);
       data.integers["last_completed_numel_out"] =
-          pgStatus_->lastCompletedNumelOut;
-      data.integers["last_started_numel_in"] = pgStatus_->lastStartedNumelIn;
-      data.integers["last_started_numel_out"] = pgStatus_->lastStartedNumelOut;
+          static_cast<int64_t>(pgStatus_->lastCompletedNumelOut);
+      data.integers["last_started_numel_in"] =
+          static_cast<int64_t>(pgStatus_->lastStartedNumelIn);
+      data.integers["last_started_numel_out"] =
+          static_cast<int64_t>(pgStatus_->lastStartedNumelOut);
       // logging strings
       data.strings["last_enqueued_work_name"] = pgStatus_->lastEnqueuedWorkName;
      data.strings["last_started_work_name"] = pgStatus_->lastStartedWorkName;
@@ -2686,6 +2689,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
             segmentInfo.device == device.index(),
             "Mismatch between CUDA memory segment device and current device");
         ncclComm->registerSegment(
+            // NOLINTNEXTLINE(performance-no-int-to-ptr)
             reinterpret_cast<void*>(segmentInfo.address),
             segmentInfo.total_size);
       }
@@ -2911,7 +2915,7 @@ void ProcessGroupNCCL::workEnqueue(
     // get deadlock. Here we enqueue work without outputs_.
     workMetaList_.emplace_back(*work);
     // update the PG status related to the last enqueued work
-    pgStatus_->lastEnqueuedSeq = work->seq_;
+    pgStatus_->lastEnqueuedSeq = static_cast<int64_t>(work->seq_);
     pgStatus_->lastEnqueuedWorkName = opTypeToString(work->opType_);
     pgStatus_->lastEnqueuedNumelIn = work->numelIn_;
     pgStatus_->lastEnqueuedNumelOut = work->numelOut_;
@@ -3860,7 +3864,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce(
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3891,7 +3894,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       !isFloat8Type(tensors.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
          static_cast<int64_t>(seqCollective_) + 1,
@@ -3946,7 +3948,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
   }
   check_gpu_single_tensor(tensor);
 
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
          static_cast<int64_t>(seqCollective_) + 1,
@@ -3982,7 +3983,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
             input.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            root,
+            static_cast<int>(root),
             comm,
             stream.stream());
       },
@@ -4022,7 +4023,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_broadcast_oop(
             output.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            root,
+            static_cast<int>(root),
             comm,
             stream.stream());
       },
@@ -4036,7 +4037,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
     std::vector<at::Tensor>& tensors,
     const ReduceOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   if (tensor.is_complex()) {
     TORCH_CHECK(
@@ -4083,7 +4083,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
             input.numel(),
             ncclDataType,
             ncclReduceOp,
-            root,
+            static_cast<int>(root),
             comm,
             stream.stream());
       },
@@ -4137,10 +4137,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
     std::vector<at::Tensor>& inputTensors,
     const AllgatherOptions& opts) {
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
   check_gpu_single_tensor(inputTensor);
-  // @lint-ignore CLANGTIDY
   auto outputTensors_ = outputTensors.back();
 
   RECORD_PARAM_COMMS_DATA(
@@ -4209,19 +4207,19 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
               c10::cuda::CUDACachingAllocator::recordStream(
                   outputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            outputTensors_[j].copy_(outputFlattened[j], true);
+            outputTensors_[j].copy_(
+                outputFlattened[static_cast<int64_t>(j)], true);
           }
         },
         OpType::ALLGATHER,
         "nccl:all_gather");
   } else {
     const auto num_reduces = outputTensors_.size();
     startCoalescing();
-    for (const int i : c10::irange(num_reduces)) {
+    for (const int64_t i : c10::irange(static_cast<int64_t>(num_reduces))) {
       auto& output = outputTensors_[i];
       auto& input = (i == rank_) ? inputTensor : output;
-      auto broadcastOpts = BroadcastOptions{
-          static_cast<int64_t>(i), static_cast<int64_t>(0), opts.timeout};
+      auto broadcastOpts = BroadcastOptions{i, int64_t(0), opts.timeout};
       _broadcast_oop(output, input, broadcastOpts);
     }
     auto work = endCoalescing(OpType::ALLGATHER);
@@ -4242,7 +4240,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
     std::vector<at::Tensor>& outputs,
     std::vector<at::Tensor>& inputs,
     const AllgatherOptions& opts) {
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
          static_cast<int64_t>(seqCollective_) + 1,
@@ -4286,10 +4283,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
     std::vector<std::vector<at::Tensor>>& inputTensors,
     const ReduceScatterOptions& opts) {
   TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto outputTensor = outputTensors.back();
   check_gpu_single_tensor(outputTensor);
-  // @lint-ignore CLANGTIDY
   auto inputTensors_ = inputTensors.back();
   TORCH_CHECK(
       !isFloat8Type(outputTensor.scalar_type()),
@@ -4364,7 +4359,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
               c10::cuda::CUDACachingAllocator::recordStream(
                   inputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            inputFlattened[j].copy_(inputTensors_[j], true);
+            inputFlattened[static_cast<int64_t>(j)].copy_(
+                inputTensors_[j], true);
           }
         },
         [&](at::cuda::CUDAStream&,
@@ -4374,7 +4370,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
   } else {
     const auto num_reduces = inputTensors_.size();
     startCoalescing();
-    for (const int i : c10::irange(num_reduces)) {
+    for (const int i : c10::irange(static_cast<int>(num_reduces))) {
       auto& input = inputTensors_[i];
       auto& output = (i == rank_) ? outputTensor : input;
       auto reduceOpts = ReduceOptions{
@@ -4404,7 +4400,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
         "input tensor must be the same size as output size times world size");
   }
 
-  // @lint-ignore CLANGTIDY
   const auto& tensor = outputTensor;
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
@@ -4474,7 +4469,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       !isFloat8Type(inputs.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
-  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
          static_cast<int64_t>(seqCollective_) + 1,
@@ -4539,13 +4533,13 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       this->getSize()); // worldSize
 
   // Device to use for barrier
-  int barDevIdx = -1;
+  c10::DeviceIndex barDevIdx = -1;
 
   // Select device to use for barrier
   // 1st choice: Use user defined GPU device ids if provided
   if (!opts.device_ids.empty()) {
     // Use the first device id because PG NCCL is single-device now
-    barDevIdx = opts.device_ids[0];
+    barDevIdx = static_cast<c10::DeviceIndex>(opts.device_ids[0]);
   } else if (getBoundDeviceId()) {
     // 2nd choice: Use the bound GPU device id if available.
     // Bounded device id can be passed to `init_process_group`.
@@ -4562,12 +4556,12 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
     // Note: it is better to use global rank because the group-local rank can be
     // offset wrt the device id if intra-node GPUs are sharded into multiple
     // dimensions.
-    barDevIdx = static_cast<int16_t>(globalRank() % localDeviceCount_);
+    barDevIdx = static_cast<c10::DeviceIndex>(globalRank() % localDeviceCount_);
     LOG(WARNING)
         << logPrefix()
         << c10::str(
               " using GPU ",
-              barDevIdx,
+              static_cast<int>(barDevIdx),
              " to perform barrier as devices used by this process are currently unknown. ",
              "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
              "Specify device_ids in barrier() to force use of a particular device, ",
@@ -4578,8 +4572,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       ValueError,
       barDevIdx >= 0,
       "Failed to infer a GPU device id to perform barrier. ");
-  auto barDevice = at::Device(
-      at::DeviceType::CUDA, static_cast<c10::DeviceIndex>(barDevIdx));
+  auto barDevice = at::Device(at::DeviceType::CUDA, barDevIdx);
 
   // Create a dummy tensor on the device
   // Note: we use zeros() instead of empty() to prevent barrier from triggering
@@ -4776,7 +4769,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::send(
     int dstRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4825,7 +4817,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::recv(
     int srcRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4904,7 +4895,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
   assertRootRank(invalidArgument, opts.rootRank, size_);
 
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
-  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
 
   std::vector<at::Tensor> outputs;
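
Most of the ProcessGroupNCCL.cpp edits above make previously implicit integer conversions explicit: size_t counters stored into int64_t logging fields, root ranks passed to NCCL as int, and device indices narrowed to c10::DeviceIndex. An explicit static_cast is the usual way to satisfy clang-tidy's narrowing- and sign-conversion checks. A minimal, self-contained sketch of that pattern follows; LoggingData and recordProgress are hypothetical stand-ins, not the real C10dLoggingData or watchdogHandler code:

#include <cstdint>
#include <map>
#include <string>

// Hypothetical stand-in for an integer-valued logging sink.
struct LoggingData {
  std::map<std::string, int64_t> integers;
};

void recordProgress(LoggingData& data, size_t lastEnqueuedSeq, size_t numelIn) {
  // Without the explicit casts, clang-tidy narrowing/sign-conversion checks
  // flag the implicit size_t -> int64_t conversions; the cast documents that
  // the conversion is intentional, mirroring the hunks above.
  data.integers["last_enqueued_work"] = static_cast<int64_t>(lastEnqueuedSeq);
  data.integers["last_enqueued_numel_in"] = static_cast<int64_t>(numelIn);
}

The cast does not change behavior when the value fits in the destination type; it only records that the conversion is intentional, which is what most of the hunks above do.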

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 2 additions & 2 deletions
@@ -286,7 +286,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // destructs outputs_ tensors who are view tensors in autograd graph.
     WorkNCCL(const WorkNCCL& w);
 
-    ~WorkNCCL() override;
+    ~WorkNCCL() override = default;
 
     // Checks if the NCCL kernel has started to execute.
     bool isStarted();
@@ -1161,7 +1161,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::unordered_map<std::string, at::cuda::CUDAEvent> ncclEvents_;
 
   // Device Indexes used for all collectives in this group
-  std::set<int> usedDeviceIdxs_;
+  std::set<c10::DeviceIndex> usedDeviceIdxs_;
 
   // Flag to denote if a coalescing groupStart/groupEnd block is active
   int coalescing_state_ = 0;

torch/csrc/distributed/c10d/Types.hpp

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {
 
   ReduceOp(ReduceOp&& other) = default;
   ReduceOp& operator=(ReduceOp&& other) = default;
+  ~ReduceOp() override = default;
 
   operator RedOpType() const {
     return op_;

torch/csrc/distributed/c10d/exception.h

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@
 
 #pragma once
 
-#include <stdexcept>
-
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
torch/csrc/distributed/c10d/init.cpp

Lines changed: 2 additions & 3 deletions
@@ -548,7 +548,7 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
           py::init(
               [](std::vector<at::Tensor> params,
                  std::vector<std::vector<size_t>> bucket_indices,
-                 std::vector<size_t> per_bucket_size_limits,
+                 const std::vector<size_t>& per_bucket_size_limits,
                  c10::intrusive_ptr<::c10d::ProcessGroup> process_group,
                  std::vector<bool> expect_sparse_gradients,
                  int64_t bucket_bytes_cap,
@@ -563,7 +563,6 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
                 return std::make_unique<::c10d::Reducer>(
                     std::move(params),
                     std::move(bucket_indices),
-                    std::move(per_bucket_size_limits),
                     std::move(process_group),
                     std::move(expect_sparse_gradients),
                     bucket_bytes_cap,
@@ -2939,7 +2938,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
             py::gil_scoped_release nogil{};
 
             return c10::make_intrusive<::c10d::ProcessGroupNCCL>(
-                store, rank, size, options);
+                store, rank, size, std::move(options));
           }),
          py::arg("store"),
          py::arg("rank"),
