Skip to content

Commit 482f069

Browse files
wconstab authored and
pytorchmergebot committed
[C10D] fix slow init due to repeated dns resolution failure (pytorch#159596)
It can be very slow to repeatedly hit DNS resolution failures, but it's very helpful to have DNS names in logs by default. So we try to use DNS, but if we hit a transient failure we disable it for the remainder of the job and log IP addresses instead. Fixes pytorch#159007. Pull Request resolved: pytorch#159596. Approved by: https://github.com/d4l3k
1 parent 85d931f commit 482f069

File tree

1 file changed

+37
-25
lines changed

1 file changed

+37
-25
lines changed

torch/csrc/distributed/c10d/socket.cpp

Lines changed: 37 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -193,38 +193,50 @@ class SocketImpl {
193193
};
194194

195195
std::string formatSockAddr(const struct ::sockaddr* addr, socklen_t len) {
196-
char host[NI_MAXHOST], port[NI_MAXSERV]; // NOLINT
196+
// It can be very slow to repeatedly hit DNS resolution failure, but it's
197+
// very helpful to have DNS names in logs by default. So we try to use DNS but
198+
// if we hit a transient failure we just disable it for the remainder of the
199+
// job, logging IP addresses instead. See
200+
// https://github.com/pytorch/pytorch/issues/159007
201+
static bool disable_getnameinfo = false;
197202

198-
if (int err = ::getnameinfo(
199-
addr, len, host, NI_MAXHOST, port, NI_MAXSERV, NI_NUMERICSERV)) {
200-
C10D_WARNING(
201-
"The hostname of the client socket cannot be retrieved. err={}", err);
203+
char host[NI_MAXHOST], port[NI_MAXSERV]; // NOLINT
202204

203-
// if we can't resolve the hostname, display the IP address
205+
if (!disable_getnameinfo) {
206+
int err = ::getnameinfo(
207+
addr, len, host, NI_MAXHOST, port, NI_MAXSERV, NI_NUMERICSERV);
208+
if (err != 0) {
209+
C10D_WARNING(
210+
"The hostname of the client socket cannot be retrieved. err={}", err);
211+
disable_getnameinfo = true;
212+
}
213+
}
214+
// if getnameinfo failed, disable would be set
215+
if (!disable_getnameinfo) {
204216
if (addr->sa_family == AF_INET) {
205-
struct sockaddr_in* psai = (struct sockaddr_in*)&addr;
206-
// NOLINTNEXTLINE(*array*)
207-
char ip[INET_ADDRSTRLEN];
208-
if (inet_ntop(addr->sa_family, &(psai->sin_addr), ip, INET_ADDRSTRLEN) !=
209-
nullptr) {
210-
return fmt::format("{}:{}", ip, psai->sin_port);
211-
}
212-
} else if (addr->sa_family == AF_INET6) {
213-
struct sockaddr_in6* psai = (struct sockaddr_in6*)&addr;
214-
// NOLINTNEXTLINE(*array*)
215-
char ip[INET6_ADDRSTRLEN];
216-
if (inet_ntop(
217-
addr->sa_family, &(psai->sin6_addr), ip, INET6_ADDRSTRLEN) !=
218-
nullptr) {
219-
return fmt::format("[{}]:{}", ip, psai->sin6_port);
220-
}
217+
return fmt::format("{}:{}", host, port);
221218
}
222-
return "?UNKNOWN?";
219+
return fmt::format("[{}]:{}", host, port);
223220
}
221+
// if we can't resolve the hostname, display the IP address
224222
if (addr->sa_family == AF_INET) {
225-
return fmt::format("{}:{}", host, port);
223+
struct sockaddr_in* psai = (struct sockaddr_in*)&addr;
224+
// NOLINTNEXTLINE(*array*)
225+
char ip[INET_ADDRSTRLEN];
226+
if (inet_ntop(addr->sa_family, &(psai->sin_addr), ip, INET_ADDRSTRLEN) !=
227+
nullptr) {
228+
return fmt::format("{}:{}", ip, psai->sin_port);
229+
}
230+
} else if (addr->sa_family == AF_INET6) {
231+
struct sockaddr_in6* psai = (struct sockaddr_in6*)&addr;
232+
// NOLINTNEXTLINE(*array*)
233+
char ip[INET6_ADDRSTRLEN];
234+
if (inet_ntop(addr->sa_family, &(psai->sin6_addr), ip, INET6_ADDRSTRLEN) !=
235+
nullptr) {
236+
return fmt::format("[{}]:{}", ip, psai->sin6_port);
237+
}
226238
}
227-
return fmt::format("[{}]:{}", host, port);
239+
return "?UNKNOWN?";
228240
}
229241
} // namespace c10d::detail
230242

0 commit comments

Comments
 (0)