
Commit 6a2b4db

Revert "Enable clang-tidy on torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (pytorch#143806)"
This reverts commit 42f4fda. Reverted pytorch#143806 on behalf of https://github.com/huydhn due to Lots of builds fail after this land, so maybe a landrace ([comment](pytorch#143806 (comment)))
1 parent 6f60c65 commit 6a2b4db

10 files changed: +54 −45 lines

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ exclude_patterns = [
     'torch/csrc/api/include/torch/linalg.h',
     'torch/csrc/autograd/generated/**',
     'torch/csrc/distributed/**/*.cu',
+    'torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp',
     'torch/csrc/distributed/c10d/WinSockUtils.hpp',
     'torch/csrc/distributed/c10d/quantization/quantization_gpu.h',
     'torch/csrc/dynamo/eval_frame.h',

torch/csrc/distributed/c10d/FileStore.cpp

Lines changed: 0 additions & 5 deletions
@@ -98,7 +98,6 @@ class Lock {
   }
 
   Lock(const Lock& that) = delete;
-  Lock& operator=(const Lock&) = delete;
 
   Lock& operator=(const Lock& other) = delete;
   Lock& operator=(Lock&& other) noexcept {
@@ -171,10 +170,6 @@ class File {
     }
     SYSASSERT(fd_, "open(" + path + ")");
   }
-  File(const File&) = delete;
-  File& operator=(const File&) = delete;
-  File(File&&) noexcept = delete;
-  File& operator=(File&&) noexcept = delete;
 
   ~File() {
     ::close(fd_);
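The deletions above undo the explicitly deleted copy/move members that the clang-tidy change had added to Lock and File (cppcoreguidelines-special-member-functions wants all special members spelled out once any of them is user-declared). A minimal standalone sketch of that pattern, not the actual FileStore code:

#include <unistd.h>

// RAII wrapper around a file descriptor. Because the destructor is
// user-declared, the special-member-functions check asks that copy and move
// operations be declared explicitly as well (here: deleted).
class FdGuard {
 public:
  explicit FdGuard(int fd) : fd_(fd) {}
  ~FdGuard() {
    if (fd_ >= 0) {
      ::close(fd_);
    }
  }

  FdGuard(const FdGuard&) = delete;
  FdGuard& operator=(const FdGuard&) = delete;
  FdGuard(FdGuard&&) noexcept = delete;
  FdGuard& operator=(FdGuard&&) noexcept = delete;

 private:
  int fd_;
};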

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 44 additions & 34 deletions
@@ -523,6 +523,8 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const WorkNCCL& w)
   exception_ = w.exception_;
 }
 
+ProcessGroupNCCL::WorkNCCL::~WorkNCCL() = default;
+
 bool ProcessGroupNCCL::WorkNCCL::isCompleted() {
   if (!ncclComm_->isAborted()) {
     checkAndSetException();
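This pairs with the header change further down (ProcessGroupNCCL.hpp): the destructor goes back to being only declared in the header and defaulted out of line here in the .cpp. A generic sketch of that declare-in-header / default-in-source split, with illustrative names rather than the real classes:

// Widget.h
#include <memory>

class Widget {
 public:
  Widget();
  ~Widget();  // declared here; the definition lives in Widget.cpp

 private:
  struct Impl;              // incomplete type in the header
  std::unique_ptr<Impl> impl_;
};

// Widget.cpp
struct Widget::Impl {
  int value = 0;
};

Widget::Widget() : impl_(std::make_unique<Impl>()) {}
Widget::~Widget() = default;  // Impl is complete here, so this compiles

The incomplete-member-type motivation shown above is the usual reason for this split; in this revert it may simply be restoring the file to its previous state.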
@@ -1009,8 +1011,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   if (options_->global_ranks_in_group.empty()) {
     this->globalRankStart = 0;
   } else {
-    this->globalRankStart =
-        static_cast<int>(options_->global_ranks_in_group[0]);
+    this->globalRankStart = options_->global_ranks_in_group[0];
   }
 
   if (options_->global_ranks_in_group.empty()) {
@@ -1032,9 +1033,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   }
 
   if (ranksAreStrided) {
-    this->globalRankStride = static_cast<int>(
-        options_->global_ranks_in_group[1] -
-        options_->global_ranks_in_group[0]);
+    this->globalRankStride = options_->global_ranks_in_group[1] -
+        options_->global_ranks_in_group[0];
   } else {
     this->globalRankStride = -1;
   }
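Most of the removed casts in this file were added by the original change to satisfy clang-tidy's narrowing-conversion checks (cppcoreguidelines-narrowing-conversions and friends) when a wider unsigned value is stored into an int. A small standalone sketch of the pattern, with made-up names:

#include <cstdint>
#include <vector>

struct GroupInfo {
  int globalRankStart = 0;
};

void configure(GroupInfo& info, const std::vector<uint64_t>& ranks) {
  // info.globalRankStart = ranks[0];  // implicit narrowing: clang-tidy flags it
  info.globalRankStart = static_cast<int>(ranks[0]);  // explicit, assumed-safe narrowing
}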
@@ -1087,7 +1087,7 @@ bool ProcessGroupNCCL::useNonblocking() {
   }
   // 2nd priority: Respect the environment variable
   else if (nbEnv.has_value()) {
-    useNonblocking_ = nbEnv;
+    useNonblocking_ = nbEnv.value();
   }
   // 3rd priority: automatically use nonblocking if we are in eager init mode
   else if (getBoundDeviceId()) {
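The one-line change here is about how the value is pulled out of a std::optional: assigning the optional wholesale versus extracting its payload with .value(). A generic sketch; the member types below are assumptions, not the real fields:

#include <optional>

std::optional<bool> parseEnvFlag() {
  return true;  // stand-in for reading an environment variable
}

void pickMode() {
  std::optional<bool> useNonblocking;
  const auto nbEnv = parseEnvFlag();
  if (nbEnv.has_value()) {
    useNonblocking = nbEnv.value();  // extract the bool explicitly
    // useNonblocking = nbEnv;       // also compiles: copies the optional itself
  }
}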
@@ -1711,7 +1711,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
     }
 
     if (computeDeltaMS(lastTimeHeartBeatCheck, currentTime) >=
-        heartbeatTimeoutInSec_ * 1000l) {
+        heartbeatTimeoutInSec_ * 1000) {
       // Check the heart beat of watchdog thread.
       lastTimeHeartBeatCheck = currentTime;
       auto heartbeat = heartbeat_.load();
@@ -2122,24 +2122,21 @@ void ProcessGroupNCCL::watchdogHandler() {
         kWorkStatusUpdatePeriodMs) {
       ::c10d::C10dLoggingData data;
       // logging integers
-      data.integers["pg_id"] = static_cast<int64_t>(local_id_);
+      data.integers["pg_id"] = local_id_;
       data.integers["rank"] = rank_;
       data.integers["global_rank"] = globalRank();
       data.integers["last_enqueued_work"] = pgStatus_->lastEnqueuedSeq;
       data.integers["last_started_work"] = pgStatus_->lastStartedSeq;
       data.integers["last_completed_work"] = pgStatus_->lastCompletedSeq;
-      data.integers["last_enqueued_numel_in"] =
-          static_cast<int64_t>(pgStatus_->lastEnqueuedNumelIn);
+      data.integers["last_enqueued_numel_in"] = pgStatus_->lastEnqueuedNumelIn;
       data.integers["last_enqueued_numel_out"] =
-          static_cast<int64_t>(pgStatus_->lastEnqueuedNumelOut);
+          pgStatus_->lastEnqueuedNumelOut;
       data.integers["last_completed_numel_in"] =
-          static_cast<int64_t>(pgStatus_->lastCompletedNumelIn);
+          pgStatus_->lastCompletedNumelIn;
       data.integers["last_completed_numel_out"] =
-          static_cast<int64_t>(pgStatus_->lastCompletedNumelOut);
-      data.integers["last_started_numel_in"] =
-          static_cast<int64_t>(pgStatus_->lastStartedNumelIn);
-      data.integers["last_started_numel_out"] =
-          static_cast<int64_t>(pgStatus_->lastStartedNumelOut);
+          pgStatus_->lastCompletedNumelOut;
+      data.integers["last_started_numel_in"] = pgStatus_->lastStartedNumelIn;
+      data.integers["last_started_numel_out"] = pgStatus_->lastStartedNumelOut;
       // logging strings
       data.strings["last_enqueued_work_name"] = pgStatus_->lastEnqueuedWorkName;
       data.strings["last_started_work_name"] = pgStatus_->lastStartedWorkName;
@@ -2689,7 +2686,6 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
           segmentInfo.device == device.index(),
           "Mismatch between CUDA memory segment device and current device");
       ncclComm->registerSegment(
-          // NOLINTNEXTLINE(performance-no-int-to-ptr)
           reinterpret_cast<void*>(segmentInfo.address),
           segmentInfo.total_size);
     }
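The deleted comment is clang-tidy's standard per-line suppression: NOLINTNEXTLINE(<check>) silences the named check on the following line only. With the file excluded from clang-tidy again, the annotation has no effect, which is presumably why the revert drops it. A toy example of the mechanism:

#include <cstdint>

void* toPointer(std::uintptr_t address) {
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  return reinterpret_cast<void*>(address);
}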
@@ -2915,7 +2911,7 @@ void ProcessGroupNCCL::workEnqueue(
     // get deadlock. Here we enqueue work without outputs_.
     workMetaList_.emplace_back(*work);
     // update the PG status related to the last enqueued work
-    pgStatus_->lastEnqueuedSeq = static_cast<int64_t>(work->seq_);
+    pgStatus_->lastEnqueuedSeq = work->seq_;
     pgStatus_->lastEnqueuedWorkName = opTypeToString(work->opType_);
     pgStatus_->lastEnqueuedNumelIn = work->numelIn_;
     pgStatus_->lastEnqueuedNumelOut = work->numelOut_;
@@ -3864,6 +3860,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce(
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3894,6 +3891,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       !isFloat8Type(tensors.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -3948,6 +3946,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
   }
   check_gpu_single_tensor(tensor);
 
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
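The re-added // @lint-ignore CLANGTIDY comments are lintrunner-level suppressions: they sit on the line directly above the statement that the CLANGTIDY linter would otherwise report, and they only mean something to lintrunner, not to the compiler. A self-contained toy example of the placement:

#include <vector>

int lastElement(const std::vector<int>& values) {
  // @lint-ignore CLANGTIDY
  auto value = values.back();  // the suppression applies to this line only
  return value;
}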
@@ -3983,7 +3982,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
             input.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            static_cast<int>(root),
+            root,
             comm,
             stream.stream());
       },
@@ -4023,7 +4022,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_broadcast_oop(
             output.data_ptr(),
             input.numel(),
             getNcclDataType(input.scalar_type()),
-            static_cast<int>(root),
+            root,
             comm,
             stream.stream());
       },
@@ -4037,6 +4036,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
     std::vector<at::Tensor>& tensors,
     const ReduceOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   if (tensor.is_complex()) {
     TORCH_CHECK(
@@ -4083,7 +4083,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
             input.numel(),
             ncclDataType,
             ncclReduceOp,
-            static_cast<int>(root),
+            root,
             comm,
             stream.stream());
       },
@@ -4137,8 +4137,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
     std::vector<at::Tensor>& inputTensors,
     const AllgatherOptions& opts) {
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
   check_gpu_single_tensor(inputTensor);
+  // @lint-ignore CLANGTIDY
   auto outputTensors_ = outputTensors.back();
 
   RECORD_PARAM_COMMS_DATA(
@@ -4207,19 +4209,19 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
               c10::cuda::CUDACachingAllocator::recordStream(
                   outputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            outputTensors_[j].copy_(
-                outputFlattened[static_cast<int64_t>(j)], true);
+            outputTensors_[j].copy_(outputFlattened[j], true);
           }
         },
         OpType::ALLGATHER,
         "nccl:all_gather");
   } else {
     const auto num_reduces = outputTensors_.size();
     startCoalescing();
-    for (const int64_t i : c10::irange(static_cast<int64_t>(num_reduces))) {
+    for (const int i : c10::irange(num_reduces)) {
       auto& output = outputTensors_[i];
       auto& input = (i == rank_) ? inputTensor : output;
-      auto broadcastOpts = BroadcastOptions{i, int64_t(0), opts.timeout};
+      auto broadcastOpts = BroadcastOptions{
+          static_cast<int64_t>(i), static_cast<int64_t>(0), opts.timeout};
       _broadcast_oop(output, input, broadcastOpts);
     }
     auto work = endCoalescing(OpType::ALLGATHER);
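The two loop changes in this hunk come down to the index type that c10::irange yields: it iterates over the type of its bound, so c10::irange(num_reduces) with a size_t bound and a declared const int index mixes signedness and width, which clang-tidy flags; the original change avoided that by casting the bound to int64_t and using an int64_t index. A standalone sketch of both spellings, assuming the c10 headers are available:

#include <cstdint>
#include <vector>

#include <c10/util/irange.h>

void visit(const std::vector<int>& items) {
  const auto n = items.size();  // size_t

  // Index type deduced from the bound (size_t): no narrowing involved.
  for (const auto i : c10::irange(n)) {
    (void)items[i];
  }

  // Explicitly 64-bit signed index, convenient for int64_t-based APIs.
  for (const int64_t i : c10::irange(static_cast<int64_t>(n))) {
    (void)items[static_cast<size_t>(i)];
  }
}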
@@ -4240,6 +4242,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
     std::vector<at::Tensor>& outputs,
     std::vector<at::Tensor>& inputs,
     const AllgatherOptions& opts) {
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -4283,8 +4286,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
     std::vector<std::vector<at::Tensor>>& inputTensors,
     const ReduceScatterOptions& opts) {
   TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto outputTensor = outputTensors.back();
   check_gpu_single_tensor(outputTensor);
+  // @lint-ignore CLANGTIDY
   auto inputTensors_ = inputTensors.back();
   TORCH_CHECK(
       !isFloat8Type(outputTensor.scalar_type()),
@@ -4359,8 +4364,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
               c10::cuda::CUDACachingAllocator::recordStream(
                   inputTensors_[j].storage().data_ptr(), ncclStream);
             }
-            inputFlattened[static_cast<int64_t>(j)].copy_(
-                inputTensors_[j], true);
+            inputFlattened[j].copy_(inputTensors_[j], true);
           }
         },
         [&](at::cuda::CUDAStream&,
@@ -4370,7 +4374,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
   } else {
     const auto num_reduces = inputTensors_.size();
     startCoalescing();
-    for (const int i : c10::irange(static_cast<int>(num_reduces))) {
+    for (const int i : c10::irange(num_reduces)) {
       auto& input = inputTensors_[i];
       auto& output = (i == rank_) ? outputTensor : input;
       auto reduceOpts = ReduceOptions{
@@ -4400,6 +4404,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
         "input tensor must be the same size as output size times world size");
   }
 
+  // @lint-ignore CLANGTIDY
   const auto& tensor = outputTensor;
   TORCH_CHECK(
       !isFloat8Type(tensor.scalar_type()),
@@ -4469,6 +4474,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       !isFloat8Type(inputs.back().scalar_type()),
       "Float8 dtypes are not currenlty supported for NCCL reductions");
 
+  // @lint-ignore CLANGTIDY
   RECORD_PARAM_COMMS_DATA(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
@@ -4533,13 +4539,13 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       this->getSize()); // worldSize
 
   // Device to use for barrier
-  c10::DeviceIndex barDevIdx = -1;
+  int barDevIdx = -1;
 
   // Select device to use for barrier
   // 1st choice: Use user defined GPU device ids if provided
   if (!opts.device_ids.empty()) {
     // Use the first device id because PG NCCL is single-device now
-    barDevIdx = static_cast<c10::DeviceIndex>(opts.device_ids[0]);
+    barDevIdx = opts.device_ids[0];
   } else if (getBoundDeviceId()) {
     // 2nd choice: Use the bound GPU device id if available.
     // Bounded device id can be passed to `init_process_group`.
@@ -4556,12 +4562,12 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
     // Note: it is better to use global rank because the group-local rank can be
     // offset wrt the device id if intra-node GPUs are sharded into multiple
     // dimensions.
-    barDevIdx = static_cast<c10::DeviceIndex>(globalRank() % localDeviceCount_);
+    barDevIdx = static_cast<int16_t>(globalRank() % localDeviceCount_);
     LOG(WARNING)
         << logPrefix()
         << c10::str(
               " using GPU ",
-              static_cast<int>(barDevIdx),
+              barDevIdx,
              " to perform barrier as devices used by this process are currently unknown. ",
              "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
              "Specify device_ids in barrier() to force use of a particular device, ",
@@ -4572,7 +4578,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
       ValueError,
       barDevIdx >= 0,
       "Failed to infer a GPU device id to perform barrier. ");
-  auto barDevice = at::Device(at::DeviceType::CUDA, barDevIdx);
+  auto barDevice = at::Device(
+      at::DeviceType::CUDA, static_cast<c10::DeviceIndex>(barDevIdx));
 
   // Create a dummy tensor on the device
   // Note: we use zeros() instead of empty() to prevent barrier from triggering
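The barrier hunks swap the local variable back from c10::DeviceIndex to int and move the narrowing cast to where the at::Device is constructed. c10::DeviceIndex is a signed integer type narrower than int, so when the device id is computed with int arithmetic an explicit cast has to appear somewhere; the two versions only differ in where it lands. A sketch of the reverted arrangement, with the surrounding selection logic elided:

#include <c10/core/Device.h>

c10::Device pickBarrierDevice(int globalRank, int localDeviceCount) {
  int barDevIdx = globalRank % localDeviceCount;  // arithmetic stays in int
  return c10::Device(
      c10::DeviceType::CUDA,
      static_cast<c10::DeviceIndex>(barDevIdx));  // narrow only at the boundary
}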
@@ -4769,6 +4776,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::send(
     int dstRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4817,6 +4825,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::recv(
     int srcRank,
     int /* unused */) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto tensor = tensors.back();
   check_gpu_single_tensor(tensor, true);
 
@@ -4895,6 +4904,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
   assertRootRank(invalidArgument, opts.rootRank, size_);
 
   TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
+  // @lint-ignore CLANGTIDY
   auto inputTensor = inputTensors.back();
 
   std::vector<at::Tensor> outputs;

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 2 additions & 2 deletions
@@ -286,7 +286,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // destructs outputs_ tensors who are view tensors in autograd graph.
   WorkNCCL(const WorkNCCL& w);
 
-  ~WorkNCCL() override = default;
+  ~WorkNCCL() override;
 
   // Checks if the NCCL kernel has started to execute.
   bool isStarted();
@@ -1161,7 +1161,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::unordered_map<std::string, at::cuda::CUDAEvent> ncclEvents_;
 
   // Device Indexes used for all collectives in this group
-  std::set<c10::DeviceIndex> usedDeviceIdxs_;
+  std::set<int> usedDeviceIdxs_;
 
   // Flag to denote if a coalescing groupStart/groupEnd block is active
   int coalescing_state_ = 0;

torch/csrc/distributed/c10d/Types.hpp

Lines changed: 0 additions & 1 deletion
@@ -71,7 +71,6 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {
 
   ReduceOp(ReduceOp&& other) = default;
   ReduceOp& operator=(ReduceOp&& other) = default;
-  ~ReduceOp() override = default;
 
   operator RedOpType() const {
     return op_;

torch/csrc/distributed/c10d/exception.h

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@
 
 #pragma once
 
+#include <stdexcept>
+
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
 
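Restoring the <stdexcept> include keeps the header self-sufficient for the standard exception types it builds on, instead of relying on them arriving transitively through the c10 headers. A hypothetical header in the same spirit; the class name is illustrative, not the actual contents of exception.h:

#pragma once

#include <stdexcept>  // std::runtime_error

namespace c10d {

// Illustrative only: an error type layered on std::runtime_error.
class ExampleError : public std::runtime_error {
 public:
  using std::runtime_error::runtime_error;
};

} // namespace c10d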

torch/csrc/distributed/c10d/init.cpp

Lines changed: 3 additions & 2 deletions
@@ -548,7 +548,7 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
           py::init(
               [](std::vector<at::Tensor> params,
                  std::vector<std::vector<size_t>> bucket_indices,
-                 const std::vector<size_t>& per_bucket_size_limits,
+                 std::vector<size_t> per_bucket_size_limits,
                  c10::intrusive_ptr<::c10d::ProcessGroup> process_group,
                  std::vector<bool> expect_sparse_gradients,
                  int64_t bucket_bytes_cap,
@@ -563,6 +563,7 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
                 return std::make_unique<::c10d::Reducer>(
                     std::move(params),
                     std::move(bucket_indices),
+                    std::move(per_bucket_size_limits),
                     std::move(process_group),
                     std::move(expect_sparse_gradients),
                     bucket_bytes_cap,
@@ -2938,7 +2939,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
             py::gil_scoped_release nogil{};
 
             return c10::make_intrusive<::c10d::ProcessGroupNCCL>(
-                store, rank, size, std::move(options));
+                store, rank, size, options);
           }),
           py::arg("store"),
          py::arg("rank"),
