Skip to content

Commit 9085532

Browse files
prasanna-amd (Prasannakumar Murugesan)
authored and committed
AICOMRCCL-656 fix memory leak in ncclCommInitRankFunc
Co-authored-by: Prasannakumar Murugesan <prmuruge@amd.com> [rocm-systems] ROCm/rocm-systems#3628 (commit 8cc5955)
1 parent 3bb11f0 commit 9085532

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

src/init.cc

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2146,7 +2146,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
21462146
double sum_timers = 0;
21472147
uint64_t timers[TIMERS_INIT_COUNT] = {0};
21482148
unsigned long long commIdHash;
2149-
char* archName;
2149+
char* archName = NULL;
21502150
int cuCount;
21512151
hipDeviceProp_t devProp;
21522152

@@ -2200,7 +2200,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
22002200
} else {
22012201
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
22022202
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
2203-
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
2203+
if (job->color == NCCL_SPLIT_NOCOLOR) {
2204+
// archName was allocated but won't be assigned to comm, so free it here
2205+
free(archName);
2206+
archName = NULL;
2207+
goto exit;
2208+
}
22042209
}
22052210
// child hash obtained from (parent hash, split count, color)
22062211
uint64_t hacc[2] = {1, 1};
@@ -2390,6 +2395,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
23902395
free(parentRanks);
23912396
return res;
23922397
fail:
2398+
// archName was allocated but won't be assigned to comm on failure, so free it
2399+
free(archName);
23932400
comm->initState = res;
23942401
goto exit;
23952402
}

0 commit comments

Comments
 (0)