From b7d5d49efb4993e5de812aad1c3465a595dfe7bd Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 17 Jun 2025 09:59:32 +0200 Subject: [PATCH 1/4] Add flaky test marker for collective operations and implement changelog parser --- tests/tests_fabric/utilities/test_distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_fabric/utilities/test_distributed.py b/tests/tests_fabric/utilities/test_distributed.py index fa9cc0ed40e93..e91df8e58a096 100644 --- a/tests/tests_fabric/utilities/test_distributed.py +++ b/tests/tests_fabric/utilities/test_distributed.py @@ -123,6 +123,7 @@ def _test_all_reduce(strategy): [torch.device("cpu"), torch.device("cpu")], ], ) +@pytest.mark.flaky(reruns=3) # flaky with "process 0 terminated with signal SIGABRT" (GLOO) def test_collective_operations(devices, process): spawn_launch(process, devices) From 4658409b399e07516fdaac003c3efb8dd5bde861 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 17 Jun 2025 11:45:43 +0200 Subject: [PATCH 2/4] try GLOO_SOCKET_IFNAME: "eth0" --- .github/workflows/ci-tests-fabric.yml | 1 + tests/tests_fabric/utilities/test_distributed.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index b3204c71e00b7..3a2626ff66881 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -80,6 +80,7 @@ jobs: TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} + GLOO_SOCKET_IFNAME: "eth0" # trying to avoid "gloo" issue with SIGABRT steps: - uses: actions/checkout@v4 diff --git a/tests/tests_fabric/utilities/test_distributed.py b/tests/tests_fabric/utilities/test_distributed.py index e91df8e58a096..fa9cc0ed40e93 100644 --- a/tests/tests_fabric/utilities/test_distributed.py +++ b/tests/tests_fabric/utilities/test_distributed.py @@ -123,7 +123,6 @@ def _test_all_reduce(strategy): [torch.device("cpu"), torch.device("cpu")], ], ) -@pytest.mark.flaky(reruns=3) # flaky with "process 0 terminated with signal SIGABRT" (GLOO) def test_collective_operations(devices, process): spawn_launch(process, devices) From 8ac0a9e369051a406595f8714f5019e0cd82151f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 09:46:05 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/ci-tests-fabric.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 3a2626ff66881..ea7927ae56d03 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -80,7 +80,7 @@ jobs: TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} - GLOO_SOCKET_IFNAME: "eth0" # trying to avoid "gloo" issue with SIGABRT + GLOO_SOCKET_IFNAME: "eth0" # trying to avoid "gloo" issue with SIGABRT steps: - uses: actions/checkout@v4 From f84fd7f550c0d5c9b2ba0eaebbac0fedb08951a8 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 17 Jun 2025 12:02:16 +0200 Subject: [PATCH 4/4] more --- .github/workflows/ci-tests-fabric.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 3a2626ff66881..4e4d2c9eed3cb 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -80,7 +80,6 @@ jobs: TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} - GLOO_SOCKET_IFNAME: "eth0" # trying to avoid "gloo" issue with SIGABRT steps: - uses: actions/checkout@v4 @@ -117,7 +116,7 @@ jobs: mkdir -p $PYPI_CACHE_DIR ls -lh $PYPI_CACHE_DIR - - name: Env. variables + - name: Expand Env. variables run: | # Switch PyTorch URL between stable and test/future python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV @@ -125,8 +124,16 @@ jobs: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV + - name: Append Env. vars for MacOS + if: ${{ runner.os == 'macOS' }} + run: | + # trying to avoid "gloo" issue with SIGABRT + echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV + - name: Append Env. vars for Windows + if: ${{ runner.os == 'windows' }} + run: | # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support" - python -c "print('USE_LIBUV=0' if '${{matrix.os}}' == 'windows-2022' and '${{matrix.pytorch-version}}' == '2.4' else '')" >> $GITHUB_ENV + echo "USE_LIBUV=0" >> $GITHUB_ENV - name: Install package & dependencies timeout-minutes: 20