Skip to content

Commit 67f57bc

Browse files
authored
Improve CI stability (#5022)
* Drastically reduced CI failure rates * Extensive review of all timeouts * Statistical review of flaky tests * Generate and publish junit reports for later ingestion by your choice of aggregator * On Linux and MacOS, pytest-timeout will now kill the individual offending test instead of the whole suite * pytest-timeout should kick in a lot less frequently now * gen_test and gen_cluster no longer accept timeout=None (which meant letting pytest-timeout kill everything off) * Hopefully increased mamba resiliency to transitory network issues
1 parent 2c4efae commit 67f57bc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+400
-495
lines changed

.github/workflows/tests.yaml

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,26 @@ concurrency:
1111
jobs:
1212
test:
1313
runs-on: ${{ matrix.os }}
14+
timeout-minutes: 180
1415

1516
strategy:
1617
fail-fast: false
1718
matrix:
1819
os: [ubuntu-latest, windows-latest, macos-latest]
1920
python-version: ["3.7", "3.8", "3.9"]
2021

21-
# Uncomment to stress-test the test suite for random failures
22+
# Uncomment to stress-test the test suite for random failures.
23+
# Must also change env.TEST_ID below.
2224
# This will take a LONG time and delay all PRs across the whole github.com/dask!
25+
# To avoid hamstringing other people, change 'on: [push, pull_request]' above
26+
# to just 'on: [push]'; this way the stress test will run exclusively in your
27+
# branch (https://github.com/<your name>/distributed/actions).
2328
# run: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
2429

30+
env:
31+
TEST_ID: ${{ matrix.os }}-${{ matrix.python-version }}
32+
# TEST_ID: ${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.run }}
33+
2534
steps:
2635
- name: Checkout source
2736
uses: actions/checkout@v2
@@ -33,13 +42,15 @@ jobs:
3342
with:
3443
miniforge-variant: Mambaforge
3544
miniforge-version: latest
45+
condarc-file: continuous_integration/condarc
3646
use-mamba: true
37-
channels: conda-forge,defaults
38-
channel-priority: true
3947
python-version: ${{ matrix.python-version }}
4048
environment-file: continuous_integration/environment-${{ matrix.python-version }}.yaml
4149
activate-environment: dask-distributed
42-
auto-activate-base: false
50+
51+
- name: Show conda options
52+
shell: bash -l {0}
53+
run: conda config --show
4354

4455
- name: Install stacktrace
4556
shell: bash -l {0}
@@ -83,7 +94,14 @@ jobs:
8394
if: ${{ matrix.os != 'windows-latest' }}
8495
run: bash continuous_integration/scripts/setup_ssh.sh
8596

97+
- name: Reconfigure pytest-timeout
98+
shell: bash -l {0}
99+
# No SIGALRM available on Windows
100+
if: ${{ matrix.os != 'windows-latest' }}
101+
run: sed -i.bak 's/timeout_method = thread/timeout_method = signal/' setup.cfg
102+
86103
- name: Test
104+
id: run_tests
87105
shell: bash -l {0}
88106
env:
89107
PYTHONFAULTHANDLER: 1
@@ -93,10 +111,21 @@ jobs:
93111
# https://github.com/dask/distributed/issues/4514
94112
export DISABLE_IPV6=1
95113
fi
96-
97114
source continuous_integration/scripts/set_ulimit.sh
98-
pytest distributed -m "not avoid_ci" --runslow
115+
116+
pytest distributed -m "not avoid_ci" --runslow \
117+
--junitxml reports/pytest.xml -o junit_suite_name=$TEST_ID
99118
100119
# - name: Debug with tmate on failure
101120
# if: ${{ failure() }}
102121
# uses: mxschmitt/action-tmate@v3
122+
123+
- name: Upload test artifacts
124+
# ensure this runs even if pytest fails
125+
if: >
126+
always() &&
127+
(steps.run_tests.outcome == 'success' || steps.run_tests.outcome == 'failure')
128+
uses: actions/upload-artifact@v2
129+
with:
130+
name: ${{ env.TEST_ID }}
131+
path: reports

continuous_integration/condarc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
channels:
2+
- conda-forge
3+
- defaults
4+
channel_priority: true
5+
auto_activate_base: false
6+
remote_backoff_factor: 20
7+
remote_connect_timeout_secs: 20.0
8+
remote_max_retries: 10
9+
remote_read_timeout_secs: 60.0

distributed/cli/tests/test_dask_scheduler.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import psutil
12
import pytest
23

34
pytest.importorskip("requests")
@@ -14,6 +15,7 @@
1415
import distributed
1516
import distributed.cli.dask_scheduler
1617
from distributed import Client, Scheduler
18+
from distributed.compatibility import LINUX
1719
from distributed.metrics import time
1820
from distributed.utils import get_ip, get_ip_interface, tmpfile
1921
from distributed.utils_test import (
@@ -118,9 +120,7 @@ def test_dashboard_non_standard_ports(loop):
118120
requests.get("http://localhost:4832/status/")
119121

120122

121-
@pytest.mark.skipif(
122-
not sys.platform.startswith("linux"), reason="Need 127.0.0.2 to mean localhost"
123-
)
123+
@pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
124124
def test_dashboard_whitelist(loop):
125125
pytest.importorskip("bokeh")
126126
with pytest.raises(Exception):
@@ -144,7 +144,6 @@ def test_dashboard_whitelist(loop):
144144

145145

146146
def test_interface(loop):
147-
psutil = pytest.importorskip("psutil")
148147
if_names = sorted(psutil.net_if_addrs())
149148
for if_name in if_names:
150149
try:
@@ -168,25 +167,24 @@ def test_interface(loop):
168167
start = time()
169168
while not len(c.nthreads()):
170169
sleep(0.1)
171-
assert time() - start < 5
170+
assert time() - start < 30
172171
info = c.scheduler_info()
173172
assert "tcp://127.0.0.1" in info["address"]
174173
assert all("127.0.0.1" == d["host"] for d in info["workers"].values())
175174

176175

177-
@pytest.mark.flaky(reruns=10, reruns_delay=5)
178176
def test_pid_file(loop):
179177
def check_pidfile(proc, pidfile):
180178
start = time()
181179
while not os.path.exists(pidfile):
182180
sleep(0.01)
183-
assert time() < start + 5
181+
assert time() < start + 30
184182

185183
text = False
186184
start = time()
187185
while not text:
188186
sleep(0.01)
189-
assert time() < start + 5
187+
assert time() < start + 30
190188
with open(pidfile) as f:
191189
text = f.read()
192190
pid = int(text)

distributed/cli/tests/test_dask_worker.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
pytest.importorskip("requests")
77

88
import os
9-
import sys
109
from multiprocessing import cpu_count
1110
from time import sleep
1211

1312
import requests
1413

1514
import distributed.cli.dask_worker
1615
from distributed import Client, Scheduler
16+
from distributed.compatibility import LINUX
1717
from distributed.deploy.utils import nprocesses_nthreads
1818
from distributed.metrics import time
1919
from distributed.utils import parse_ports, sync, tmpfile
@@ -275,9 +275,7 @@ def test_nprocs_expands_name(loop):
275275
assert len(set(names)) == 4
276276

277277

278-
@pytest.mark.skipif(
279-
not sys.platform.startswith("linux"), reason="Need 127.0.0.2 to mean localhost"
280-
)
278+
@pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
281279
@pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
282280
@pytest.mark.parametrize(
283281
"listen_address", ["tcp://0.0.0.0:39837", "tcp://127.0.0.2:39837"]
@@ -311,9 +309,7 @@ def func(dask_worker):
311309
assert client.run(func) == {"tcp://127.0.0.2:39837": listen_address}
312310

313311

314-
@pytest.mark.skipif(
315-
not sys.platform.startswith("linux"), reason="Need 127.0.0.2 to mean localhost"
316-
)
312+
@pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
317313
@pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
318314
@pytest.mark.parametrize("host", ["127.0.0.2", "0.0.0.0"])
319315
def test_respect_host_listen_address(loop, nanny, host):

distributed/comm/tests/test_comms.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
)
3030
from distributed.comm.registry import backends, get_backend
3131
from distributed.comm.tcp import TCP, TCPBackend, TCPConnector
32-
from distributed.compatibility import WINDOWS
3332
from distributed.metrics import time
3433
from distributed.protocol import Serialized, deserialize, serialize, to_serialize
3534
from distributed.utils import get_ip, get_ipv6
@@ -110,7 +109,7 @@ async def debug_loop():
110109
while True:
111110
loop = ioloop.IOLoop.current()
112111
print(".", loop, loop._handlers)
113-
await asyncio.sleep(0.50)
112+
await asyncio.sleep(0.5)
114113

115114

116115
#
@@ -1107,7 +1106,6 @@ def check_out(deserialize_flag, out_value):
11071106
await check_connector_deserialize(addr, True, msg, partial(check_out, True))
11081107

11091108

1110-
@pytest.mark.flaky(reruns=10, reruns_delay=5, condition=WINDOWS)
11111109
@pytest.mark.asyncio
11121110
async def test_tcp_deserialize():
11131111
await check_deserialize("tcp://")

distributed/comm/tests/test_ucx.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ async def test_ucx_localcluster(processes, cleanup):
276276
) as cluster:
277277
async with Client(cluster, asynchronous=True) as client:
278278
x = client.submit(inc, 1)
279-
await x.result()
279+
await x
280280
assert x.key in cluster.scheduler.tasks
281281
if not processes:
282282
assert any(w.data == {x.key: 2} for w in cluster.workers.values())

distributed/comm/tests/test_ws.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ async def test_collections(cleanup):
122122
async def test_large_transfer(cleanup):
123123
np = pytest.importorskip("numpy")
124124
async with Scheduler(protocol="ws://") as s:
125-
async with Worker(s.address, protocol="ws://") as w:
125+
async with Worker(s.address, protocol="ws://"):
126126
async with Client(s.address, asynchronous=True) as c:
127-
future = await c.scatter(np.random.random(1000000))
127+
await c.scatter(np.random.random(1_000_000))
128128

129129

130130
@pytest.mark.asyncio

distributed/compatibility.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
logging_names.update(logging._nameToLevel)
99

1010
PYPY = platform.python_implementation().lower() == "pypy"
11+
LINUX = sys.platform == "linux"
1112
MACOS = sys.platform == "darwin"
1213
WINDOWS = sys.platform.startswith("win")
1314
TORNADO6 = tornado.version_info[0] >= 6

distributed/dashboard/tests/test_scheduler_bokeh.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from dask.utils import stringify
1818

1919
from distributed.client import wait
20-
from distributed.compatibility import MACOS
2120
from distributed.dashboard import scheduler
2221
from distributed.dashboard.components.scheduler import (
2322
AggregateAction,
@@ -93,10 +92,8 @@ async def test_counters(c, s, a, b):
9392
await asyncio.sleep(0.1)
9493
ss.update()
9594

96-
start = time()
9795
while not len(ss.digest_sources["tick-duration"][0].data["x"]):
98-
await asyncio.sleep(1)
99-
assert time() < start + 5
96+
await asyncio.sleep(0.01)
10097

10198

10299
@gen_cluster(client=True)
@@ -184,15 +181,15 @@ async def test_task_stream_clear_interval(c, s, a, b):
184181

185182
await wait(c.map(inc, range(10)))
186183
ts.update()
187-
await asyncio.sleep(0.010)
184+
await asyncio.sleep(0.01)
188185
await wait(c.map(dec, range(10)))
189186
ts.update()
190187

191188
assert len(set(map(len, ts.source.data.values()))) == 1
192189
assert ts.source.data["name"].count("inc") == 10
193190
assert ts.source.data["name"].count("dec") == 10
194191

195-
await asyncio.sleep(0.300)
192+
await asyncio.sleep(0.3)
196193
await wait(c.map(inc, range(10, 20)))
197194
ts.update()
198195

@@ -848,7 +845,6 @@ async def test_aggregate_action(c, s, a, b):
848845
assert ("compute") in mbk.action_source.data["names"]
849846

850847

851-
@pytest.mark.flaky(reruns=10, reruns_delay=5, condition=MACOS)
852848
@gen_cluster(client=True, scheduler_kwargs={"dashboard": True})
853849
async def test_compute_per_key(c, s, a, b):
854850
mbk = ComputePerKey(s)

distributed/deploy/local.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ def __init__(
136136

137137
if threads_per_worker == 0:
138138
warnings.warn(
139-
"Setting `threads_per_worker` to 0 is discouraged. "
140-
"Please set to None or to a specific int to get best behavior."
139+
"Setting `threads_per_worker` to 0 has been deprecated. "
140+
"Please set to None or to a specific int."
141141
)
142142
threads_per_worker = None
143143

0 commit comments

Comments
 (0)