@@ -193,38 +193,50 @@ class SocketImpl {
193193};
194194
195195std::string formatSockAddr (const struct ::sockaddr* addr, socklen_t len) {
196- char host[NI_MAXHOST], port[NI_MAXSERV]; // NOLINT
196+ // It can be be very slow to repeatedly hit DNS resolution failure, but its
197+ // very helpful to have DNS names in logs by default. So we try to use DNS but
198+ // if we hit a transient failure we just disable it for the remainder of the
199+ // job, logging IP addresses instead. See
200+ // https://github.com/pytorch/pytorch/issues/159007
201+ static bool disable_getnameinfo = false ;
197202
198- if (int err = ::getnameinfo (
199- addr, len, host, NI_MAXHOST, port, NI_MAXSERV, NI_NUMERICSERV)) {
200- C10D_WARNING (
201- " The hostname of the client socket cannot be retrieved. err={}" , err);
203+ char host[NI_MAXHOST], port[NI_MAXSERV]; // NOLINT
202204
203- // if we can't resolve the hostname, display the IP address
205+ if (!disable_getnameinfo) {
206+ int err = ::getnameinfo (
207+ addr, len, host, NI_MAXHOST, port, NI_MAXSERV, NI_NUMERICSERV);
208+ if (err != 0 ) {
209+ C10D_WARNING (
210+ " The hostname of the client socket cannot be retrieved. err={}" , err);
211+ disable_getnameinfo = true ;
212+ }
213+ }
214+ // if getnameinfo failed, disable would be set
215+ if (!disable_getnameinfo) {
204216 if (addr->sa_family == AF_INET) {
205- struct sockaddr_in * psai = (struct sockaddr_in *)&addr;
206- // NOLINTNEXTLINE(*array*)
207- char ip[INET_ADDRSTRLEN];
208- if (inet_ntop (addr->sa_family , &(psai->sin_addr ), ip, INET_ADDRSTRLEN) !=
209- nullptr ) {
210- return fmt::format (" {}:{}" , ip, psai->sin_port );
211- }
212- } else if (addr->sa_family == AF_INET6) {
213- struct sockaddr_in6 * psai = (struct sockaddr_in6 *)&addr;
214- // NOLINTNEXTLINE(*array*)
215- char ip[INET6_ADDRSTRLEN];
216- if (inet_ntop (
217- addr->sa_family , &(psai->sin6_addr ), ip, INET6_ADDRSTRLEN) !=
218- nullptr ) {
219- return fmt::format (" [{}]:{}" , ip, psai->sin6_port );
220- }
217+ return fmt::format (" {}:{}" , host, port);
221218 }
222- return " ?UNKNOWN? " ;
219+ return fmt::format ( " [{}]:{} " , host, port) ;
223220 }
221+ // if we can't resolve the hostname, display the IP address
224222 if (addr->sa_family == AF_INET) {
225- return fmt::format (" {}:{}" , host, port);
223+ struct sockaddr_in * psai = (struct sockaddr_in *)&addr;
224+ // NOLINTNEXTLINE(*array*)
225+ char ip[INET_ADDRSTRLEN];
226+ if (inet_ntop (addr->sa_family , &(psai->sin_addr ), ip, INET_ADDRSTRLEN) !=
227+ nullptr ) {
228+ return fmt::format (" {}:{}" , ip, psai->sin_port );
229+ }
230+ } else if (addr->sa_family == AF_INET6) {
231+ struct sockaddr_in6 * psai = (struct sockaddr_in6 *)&addr;
232+ // NOLINTNEXTLINE(*array*)
233+ char ip[INET6_ADDRSTRLEN];
234+ if (inet_ntop (addr->sa_family , &(psai->sin6_addr ), ip, INET6_ADDRSTRLEN) !=
235+ nullptr ) {
236+ return fmt::format (" [{}]:{}" , ip, psai->sin6_port );
237+ }
226238 }
227- return fmt::format ( " [{}]:{} " , host, port) ;
239+ return " ?UNKNOWN? " ;
228240}
229241} // namespace c10d::detail
230242
0 commit comments