@vfdev-5 commented on Oct 15, 2025

____________________________ test_auto_methods_hvd _____________________________

gloo_hvd_executor = <function _gloo_hvd_execute at 0x7fe6e574c0e0>

    @pytest.mark.distributed
    @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
    @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
    def test_auto_methods_hvd(gloo_hvd_executor):
        device = "cpu" if not torch.cuda.is_available() else "cuda"
        np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
    
        gloo_hvd_executor(_test_auto_dataloader, args=(np, np, 1), np=np, do_init=True)
        gloo_hvd_executor(_test_auto_dataloader, args=(np, np, 10, 10), np=np, do_init=True)
        gloo_hvd_executor(_test_auto_dataloader, args=(np, np, 1, 1, "WeightedRandomSampler"), np=np, do_init=True)
        gloo_hvd_executor(_test_auto_dataloader, args=(np, np, 1, 1, "DistributedSampler"), np=np, do_init=True)
    
>       gloo_hvd_executor(_test_auto_model_optimizer, args=(np, device), np=np, do_init=True)

command = ['/opt/conda/bin/python', '-m', 'horovod.runner.run_task', '127.0.0.1', '1133']
exec_command = <function _exec_command_fn.<locals>._exec_command at 0x7fe6e5fd14e0>
settings = <horovod.runner.common.util.settings.Settings object at 0x7fe6e60231d0>
nics = {'lo'}
env = {'CUDA_HOME': '/usr/local/cuda-12.8', 'CUDA_VERSION': '12.8', 'DESIRED_CUDA': '12.8', 'HOME': '/root', ...}
server_ip = '127.0.0.1'


----------------------------- Captured stdout call -----------------------------
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to lo
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Bootstrap: Using lo:127.0.0.1<0>
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO cudaDriverVersion 13000
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO cudaDriverVersion 13000
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to lo
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO NCCL version 2.27.5+cuda12.8
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Bootstrap: Using lo:127.0.0.1<0>
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO NCCL version 2.27.5+cuda12.8
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. 
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Failed to open libibverbs.so[.1]
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to lo
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO NET/Socket : Using [0]lo:127.0.0.1<0>
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Initialized NET plugin Socket
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Assigned NET plugin Socket to comm
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Using network Socket
[1]<stdout>:
[1]<stdout>:[2025-10-15 13:54:16] 5d98e8011d2a:3929:4013 [1] init.cc:426 NCCL WARN Cuda failure 'operation not supported'
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:1437 -> 1
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:1832 -> 1
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. 
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:1858 -> 1
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Failed to open libibverbs.so[.1]
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to lo
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO NET/Socket : Using [0]lo:127.0.0.1<0>
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Initialized NET plugin Socket
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Assigned NET plugin Socket to comm
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Using network Socket
[0]<stdout>:
[0]<stdout>:[2025-10-15 13:54:16] 5d98e8011d2a:3932:3998 [0] init.cc:426 NCCL WARN Cuda failure 'operation not supported'
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:1437 -> 1
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:1832 -> 1
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:1858 -> 1
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Assigned NET plugin Socket to comm
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO Using network Socket
[1]<stdout>:
[1]<stdout>:[2025-10-15 13:54:16] 5d98e8011d2a:3929:4013 [1] init.cc:426 NCCL WARN Cuda failure 'operation not supported'
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Assigned NET plugin Socket to comm
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO Using network Socket
[0]<stdout>:
[0]<stdout>:[2025-10-15 13:54:16] 5d98e8011d2a:3932:3998 [0] init.cc:426 NCCL WARN Cuda failure 'operation not supported'
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:1437 -> 1
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:1832 -> 1
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:1437 -> 1
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:1832 -> 1
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:1858 -> 1
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:1858 -> 1
[0]<stdout>:
[1]<stdout>:
[1]<stdout>:[2025-10-15 13:54:16] 5d98e8011d2a:3929:4013 [1] misc/argcheck.cc:30 NCCL WARN ncclGetAsyncError : comm argument is NULL
[0]<stdout>:[2025-10-15 13:54:16] 5d98e8011d2a:3932:3998 [0] misc/argcheck.cc:30 NCCL WARN ncclGetAsyncError : comm argument is NULL
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO misc/argcheck.cc:37 -> 4
[0]<stdout>:5d98e8011d2a:3932:3998 [0] NCCL INFO init.cc:2400 -> 4
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO misc/argcheck.cc:37 -> 4
[1]<stdout>:5d98e8011d2a:3929:4013 [1] NCCL INFO init.cc:2400 -> 4
Process 0 exit with status code 1.
Terminating remaining workers after failure of Process 0.
Process 1 exit with status code 1.
----------------------------- Captured stderr call -----------------------------
[0]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[0]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[1]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[1]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[0]<stderr>:2025-10-15 13:53:50,387 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'tensor([[[[0.3850, 0': 
[0]<stderr>:	{'batch_size': 1, 'num_workers': 1, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7fd1f9016090>, 'pin_memory': True}
[0]<stderr>:2025-10-15 13:53:50,387 ignite.distributed.auto.auto_dataloader INFO: Found iterable dataset, dataloader will be created without any distributed sampling. Please, make sure that the dataset itself produces different data on different ranks.
[0]<stderr>:2025-10-15 13:53:50,387 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<tests.ignite.distri': 
[0]<stderr>:	{'batch_size': 1, 'num_workers': 1, 'sampler': None, 'shuffle': False, 'pin_memory': True}
[0]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[0]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[1]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[1]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[0]<stderr>:2025-10-15 13:53:57,140 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'tensor([[[[0.9443, 0': 
[0]<stderr>:	{'batch_size': 5, 'num_workers': 5, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7fee6a28c750>, 'pin_memory': True}
[0]<stderr>:2025-10-15 13:53:57,140 ignite.distributed.auto.auto_dataloader INFO: Found iterable dataset, dataloader will be created without any distributed sampling. Please, make sure that the dataset itself produces different data on different ranks.
[0]<stderr>:2025-10-15 13:53:57,141 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<tests.ignite.distri': 
[0]<stderr>:	{'batch_size': 5, 'num_workers': 5, 'sampler': None, 'shuffle': False, 'pin_memory': True}
[0]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[0]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[1]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[1]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[0]<stderr>:2025-10-15 13:54:03,929 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'tensor([[[[3.5181e-0': 
[0]<stderr>:	{'batch_size': 1, 'num_workers': 1, 'sampler': <ignite.distributed.auto.DistributedProxySampler object at 0x7f348df7c510>, 'shuffle': False, 'pin_memory': True}
[0]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[0]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[1]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[1]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[0]<stderr>:2025-10-15 13:54:10,145 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'tensor([[[[3.0381e-0': 
[0]<stderr>:	{'batch_size': 1, 'num_workers': 1, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7f93e66394d0>, 'shuffle': False, 'pin_memory': True}
[0]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[0]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[1]<stderr>:/work/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
[1]<stderr>:  from torch.distributed.optim import ZeroRedundancyOptimizer
[0]<stderr>:2025-10-15 13:54:16,632 ignite.distributed.auto.auto_model INFO: Broadcast the initial variable states from rank 0 to all other processes
[1]<stderr>:User function raise error: ncclCommInitRank failed: unhandled cuda error (run with NCCL_DEBUG=INFO for details)Traceback (most recent call last):
[1]<stderr>:  File "/opt/conda/lib/python3.11/site-packages/horovod-0.28.1-py3.11-linux-x86_64.egg/horovod/torch/mpi_ops.py", line 1285, in synchronize
[1]<stderr>:    mpi_lib.horovod_torch_wait_and_clear(handle)
[1]<stderr>:RuntimeError: ncclCommInitRank failed: unhandled cuda error (run with NCCL_DEBUG=INFO for details)
[1]<stderr>:
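For context on where the failure surfaces: the last stderr line before the error comes from `ignite.distributed.auto.auto_model` ("Broadcast the initial variable states from rank 0 to all other processes"), so the NCCL communicator is being created during the initial parameter broadcast, and that is where the `Cuda failure 'operation not supported'` / `ncclCommInitRank failed` error is raised. Below is a minimal sketch of that code path using only the standard Horovod PyTorch API (`hvd.init`, `hvd.local_rank`, `hvd.broadcast_parameters`); the model and shapes are illustrative, not taken from the test.

```python
# Hedged reproduction sketch of the broadcast path that fails in this run.
import torch
import horovod.torch as hvd

hvd.init()
if torch.cuda.is_available():
    # Pin each worker to its local GPU, as Horovod examples do.
    torch.cuda.set_device(hvd.local_rank())
device = "cuda" if torch.cuda.is_available() else "cpu"

# Illustrative model; idist.auto_model would receive the test's model instead.
model = torch.nn.Linear(10, 1).to(device)

# The first collective: on CUDA builds of Horovod this triggers
# ncclCommInitRank, which is where the CI log reports the CUDA failure.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
```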

github-actions bot added the ci label on Oct 15, 2025