Skip to content

Commit 28acdc4

Browse files
Chao1Han and mengfei25 authored
Add reduction op log (#1843)
**_Reduce op_**

[rank1]:[I713 19:28:16.889824026 ProcessGroupXCCL.cpp:1130] collective_name: allreduce, inNelems: 1, outNelems: 1, dType: Long, root/src rank: 1, worldSize: 4, async_op: true, reduction op: MIN
[rank0]:[I713 19:31:32.562227129 ProcessGroupXCCL.cpp:1315] collective_name: reduce, inNelems: 0, outNelems: 0, dType: Float, root/src rank: 0, worldSize: 4, async_op: true, reduction op: SUM
[rank2]:[I713 19:31:32.565373680 ProcessGroupXCCL.cpp:1616] collective_name: reduce_scatter, inNelems: 0, outNelems: 0, dType: Float, root/src rank: 2, worldSize: 4, async_op: true, reduction op: SUM

**_Non reduction op_**

[rank0]:[I713 19:32:41.897706337 ProcessGroupXCCL.cpp:743] collective_name: send, inNelems: 100, outNelems: 100, dType: Float, root/src rank: 1, worldSize: 4, async_op: N/A, reduction op: N/A

---------

Co-authored-by: mengfei25 <[email protected]>
1 parent d194d5c commit 28acdc4

File tree

2 files changed

+72
-53
lines changed

2 files changed

+72
-53
lines changed

src/xccl/ProcessGroupXCCL.cpp

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::send(
756756
-1, // globalRankStart
757757
-1, // globalRankStride
758758
this->getSize(), // worldSize
759-
"N/A"); // async_op
759+
"N/A", // async_op
760+
"N/A"); // reductionOp
760761

761762
auto ret = pointToPoint(
762763
tensor,
@@ -806,7 +807,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::recv(
806807
-1, // globalRankStart
807808
-1, // globalRankStride
808809
this->getSize(), // worldSize
809-
"N/A"); // async_op
810+
"N/A", // async_op
811+
"N/A"); // reductionOp
810812

811813
auto ret = pointToPoint(
812814
tensor,
@@ -892,7 +894,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::gather(
892894
-1, // globalRankStart
893895
-1, // globalRankStride
894896
this->getSize(), // worldSize
895-
opts.asyncOp); // async_op
897+
opts.asyncOp, // async_op
898+
"N/A"); // reductionOp
896899

897900
auto inputs = std::vector<at::Tensor>{inputTensor};
898901
return collective(
@@ -1007,7 +1010,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::scatter(
10071010
-1, // globalRankStart
10081011
-1, // globalRankStride
10091012
this->getSize(), // worldSize
1010-
opts.asyncOp); // async_op
1013+
opts.asyncOp, // async_op
1014+
"N/A"); // reductionOp
10111015

10121016
const auto root = opts.rootRank;
10131017

@@ -1136,7 +1140,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allreduce(
11361140
-1, // globalRankStart
11371141
-1, // globalRankStride
11381142
size_, // worldSize
1139-
opts.asyncOp); // async_op
1143+
opts.asyncOp, // async_op
1144+
reduceOpToString(opts.reduceOp)); // reductionOp
11401145

11411146
return allreduce_impl(tensor, "xccl:all_reduce", opts);
11421147
}
@@ -1163,7 +1168,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allreduce_coalesced(
11631168
-1, // globalRankStart
11641169
-1, // globalRankStride
11651170
this->getSize(), // worldSize
1166-
opts.asyncOp); // async_op
1171+
opts.asyncOp, // async_op
1172+
reduceOpToString(opts.reduceOp)); // reductionOp
11671173

11681174
return collectiveCoalesced(
11691175
tensors,
@@ -1226,7 +1232,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::broadcast(
12261232
-1, // globalRankStart
12271233
-1, // globalRankStride
12281234
this->getSize(), // worldSize
1229-
opts.asyncOp); // async_op
1235+
opts.asyncOp, // async_op
1236+
"N/A"); // reductionOp
12301237

12311238
const auto root = opts.rootRank + opts.rootTensor;
12321239

@@ -1318,7 +1325,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce(
13181325
-1, // globalRankStart
13191326
-1, // globalRankStride
13201327
this->getSize(), // worldSize
1321-
opts.asyncOp); // async_op
1328+
opts.asyncOp, // async_op
1329+
reduceOpToString(opts.reduceOp)); // reductionOp
13221330

13231331
return collective(
13241332
tensor,
@@ -1428,7 +1436,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allgather(
14281436
-1, // globalRankStart
14291437
-1, // globalRankStride
14301438
this->getSize(), // worldSize
1431-
opts.asyncOp); // async_op
1439+
opts.asyncOp, // async_op
1440+
"N/A"); // reductionOp
14321441

14331442
bool same_size = checkSameSize(outputTensors_);
14341443
if (same_size) {
@@ -1516,7 +1525,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::_allgather_base(
15161525
-1, // globalRankStart
15171526
-1, // globalRankStride
15181527
this->getSize(), // worldSize
1519-
opts.asyncOp); // async_op
1528+
opts.asyncOp, // async_op
1529+
"N/A"); // reductionOp
15201530

15211531
return collective(
15221532
input_tensor,
@@ -1563,7 +1573,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allgather_into_tensor_coalesced(
15631573
-1, // globalRankStart
15641574
-1, // globalRankStride
15651575
this->getSize(), // worldSize
1566-
opts.asyncOp); // async_op
1576+
opts.asyncOp, // async_op
1577+
"N/A"); // reductionOp
15671578

15681579
return collectiveCoalesced(
15691580
inputs,
@@ -1615,7 +1626,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce_scatter(
16151626
-1, // globalRankStart
16161627
-1, // globalRankStride
16171628
this->getSize(), // worldSize
1618-
opts.asyncOp); // async_op
1629+
opts.asyncOp, // async_op
1630+
reduceOpToString(opts.reduceOp)); // reductionOp
16191631

16201632
bool same_size = checkSameSize(inputTensors_);
16211633
if (same_size) {
@@ -1713,7 +1725,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::_reduce_scatter_base(
17131725
-1, // globalRankStart
17141726
-1, // globalRankStride
17151727
this->getSize(), // worldSize
1716-
opts.asyncOp); // async_op
1728+
opts.asyncOp, // async_op
1729+
reduceOpToString(opts.reduceOp)); // reductionOp
17171730

17181731
return collective(
17191732
inputTensor,
@@ -1771,7 +1784,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce_scatter_tensor_coalesced(
17711784
-1, // globalRankStart
17721785
-1, // globalRankStride
17731786
this->getSize(), // worldSize
1774-
opts.asyncOp); // async_op
1787+
opts.asyncOp, // async_op
1788+
reduceOpToString(opts.reduceOp)); // reductionOp
17751789

17761790
return collectiveCoalesced(
17771791
inputs,
@@ -1902,7 +1916,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall_base(
19021916
-1, // globalRankStart
19031917
-1, // globalRankStride
19041918
this->getSize(), // worldSize
1905-
opts.asyncOp); // async_op
1919+
opts.asyncOp, // async_op
1920+
"N/A"); // reductionOp
19061921

19071922
TORCH_CHECK(
19081923
outputTensor.numel() == inputTensor.numel() &&
@@ -1953,7 +1968,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall_base(
19531968
-1, // globalRankStart
19541969
-1, // globalRankStride
19551970
this->getSize(), // worldSize
1956-
opts.asyncOp); // async_op
1971+
opts.asyncOp, // async_op
1972+
"N/A"); // reductionOp
19571973

19581974
return collective(
19591975
inputTensor,
@@ -2030,7 +2046,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall(
20302046
-1, // globalRankStart
20312047
-1, // globalRankStride
20322048
this->getSize(), // worldSize
2033-
opts.asyncOp); // async_op
2049+
opts.asyncOp, // async_op
2050+
"N/A"); // reductionOp
20342051

20352052
return collective(
20362053
inputTensors,

src/xccl/ProcessGroupXCCL.hpp

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -467,42 +467,44 @@ inline std::string reduceOpToString(c10d::ReduceOp op) {
467467
// Since the current profiler trace support for XCCL is unclear, wrap
468468
// `RECORD_PARAM_COMMS_DATA` and output parameters as debug logs.
469469
// export TORCH_CPP_LOG_LEVEL=INFO
470-
#define RECORD_PARAM_COMMS_DATA_WITH_LOG( \
471-
seq, \
472-
pg_name_tuple, \
473-
inputTensors, \
474-
outputTensors, \
475-
rank, \
476-
collective_name, \
477-
inNelems, \
478-
outNelems, \
479-
dType, \
480-
inSplitSizes, \
481-
outSplitSizes, \
482-
globalRankStart, \
483-
globalRankStride, \
484-
worldSize, \
485-
async_op) \
486-
do { \
487-
LOG(INFO) << std::boolalpha << "collective_name: " << collective_name \
488-
<< ", inNelems: " << inNelems << ", outNelems: " << outNelems \
489-
<< ", dType: " << dType << ", root/src rank: " << rank \
490-
<< ", worldSize: " << worldSize << ", async_op: " << async_op; \
491-
RECORD_PARAM_COMMS_DATA( \
492-
seq, \
493-
pg_name_tuple, \
494-
inputTensors, \
495-
outputTensors, \
496-
rank, \
497-
collective_name, \
498-
inNelems, \
499-
outNelems, \
500-
dType, \
501-
inSplitSizes, \
502-
outSplitSizes, \
503-
globalRankStart, \
504-
globalRankStride, \
505-
worldSize); \
470+
#define RECORD_PARAM_COMMS_DATA_WITH_LOG( \
471+
seq, \
472+
pg_name_tuple, \
473+
inputTensors, \
474+
outputTensors, \
475+
rank, \
476+
collective_name, \
477+
inNelems, \
478+
outNelems, \
479+
dType, \
480+
inSplitSizes, \
481+
outSplitSizes, \
482+
globalRankStart, \
483+
globalRankStride, \
484+
worldSize, \
485+
async_op, \
486+
reduce_op) \
487+
do { \
488+
LOG(INFO) << std::boolalpha << "collective_name: " << collective_name \
489+
<< ", inNelems: " << inNelems << ", outNelems: " << outNelems \
490+
<< ", dType: " << dType << ", root/src rank: " << rank \
491+
<< ", worldSize: " << worldSize << ", async_op: " << async_op \
492+
<< ", reduction op: " << reduce_op; \
493+
RECORD_PARAM_COMMS_DATA( \
494+
seq, \
495+
pg_name_tuple, \
496+
inputTensors, \
497+
outputTensors, \
498+
rank, \
499+
collective_name, \
500+
inNelems, \
501+
outNelems, \
502+
dType, \
503+
inSplitSizes, \
504+
outSplitSizes, \
505+
globalRankStart, \
506+
globalRankStride, \
507+
worldSize); \
506508
} while (0)
507509
} // namespace
508510
#endif // USE_C10D_XCCL

0 commit comments

Comments
 (0)