Commit b015422

fix cpp extension distributed warning spew (pytorch#164785)
fix cpp extension distributed warning spew (pytorch#162764)

With this change we only log the warning if we're running non-distributed code or if we're on rank 0. Unit testing that certain messages get printed only on certain ranks feels fairly brittle, so the manual test plan below is used instead.

Test plan:

```python
# torchrun --nproc_per_node=2 demo_fix.py
import os
import logging

logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)

import torch

if 'RANK' in os.environ:
    torch.distributed.init_process_group('nccl')

from torch.utils.cpp_extension import _get_cuda_arch_flags

_get_cuda_arch_flags()

print(f"Rank {os.environ.get('RANK', '0')} done")
```

The logs below show that the `TORCH_CUDA_ARCH_LIST` message only shows up once when we explicitly set the logging level to `logging.DEBUG`. The change also improves the debug message to explain what the actual behavior will be.

```
(source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814]
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
[rank0]:V0911 18:30:18.921000 1316753 pytorch/torch/utils/cpp_extension.py:2444] TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='10.0+PTX' for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.
Rank 0 done
Rank 1 done
```

But if we use the default and comment out `logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)`, then we get:

```
(source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814]
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
Rank 0 done
Rank 1 done
(source) [marksaroufim@devgpu005]~%
```

Pull Request resolved: pytorch#162764
Approved by: https://github.com/ezyang, https://github.com/zou3519

(cherry picked from commit f7e8321)

Co-authored-by: Mark Saroufim <[email protected]>
1 parent d4c4307 commit b015422
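
For context, here is a minimal sketch of the rank-0 gating described in the commit message, assuming only the `torch.distributed` APIs it names (`is_available`, `is_initialized`, `get_rank`); the `_should_log` helper name is illustrative and not part of the patch.

```python
import torch

def _should_log() -> bool:
    # Log when running non-distributed code, or only on rank 0 once a
    # process group has been initialized, so each message appears once.
    if not torch.distributed.is_available():
        return True
    if not torch.distributed.is_initialized():
        return True
    return torch.distributed.get_rank() == 0
```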

File tree

1 file changed: +9 −4 lines changed


torch/utils/cpp_extension.py

Lines changed: 9 additions & 4 deletions
```diff
@@ -2418,10 +2418,6 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
 
     # If not given or set as native, determine what's best for the GPU / CUDA version that can be found
     if not _arch_list or _arch_list == "native":
-        if not _arch_list:
-            logger.warning(
-                "TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. \n"
-                "If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.")
         arch_list = []
         # the assumption is that the extension should run on any of the currently visible cards,
         # which could be of different types - therefore all archs for visible cards should be included
@@ -2440,6 +2436,15 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
                 arch_list.append(arch)
         arch_list = sorted(arch_list)
         arch_list[-1] += '+PTX'
+
+        if not _arch_list:
+            # Only log on rank 0 in distributed settings to avoid spam
+            if not torch.distributed.is_available() or not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+                arch_list_str = ';'.join(arch_list)
+                logger.debug(
+                    "TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='%s' "
+                    "for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.",
+                    arch_list_str)
     else:
         # Deal with lists that are ' ' separated (only deal with ';' after)
         _arch_list = _arch_list.replace(' ', ';')
```
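
As the new debug message suggests, setting `TORCH_CUDA_ARCH_LIST` explicitly bypasses the auto-detection path (and the debug log) entirely. A hedged usage sketch, assuming a CUDA-enabled PyTorch build; the architecture values are placeholders, not a recommendation:

```python
import os

# Pin the target architectures before building an extension; pick values
# matching your GPUs and CUDA toolkit. These are example placeholders.
os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0;9.0+PTX'

from torch.utils.cpp_extension import _get_cuda_arch_flags

# With the env var set, _arch_list is non-empty, so the debug message is not
# emitted and the returned -gencode flags come from the explicit list.
print(_get_cuda_arch_flags())
```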
