Skip to content

Commit 46e4392

Browse files
aivanoufacebook-github-bot
authored andcommitted
Improve components tests (#208)
Summary: Pull Request resolved: #208 The diff introduces the following changes: * Improved final test report * Make sure that all resources allocated during the run are deleted * Fail the test if any of the components fail * Reduce timeout to 10 minutes. * Make sure that there is a single logic that governs the execution mode: `dryrun` or normal Reviewed By: d4l3k Differential Revision: D31247636 fbshipit-source-id: ce587c88a9309f83c72d9aaae839f7945fe3b5a2
1 parent cfaeded commit 46e4392

File tree

6 files changed

+137
-79
lines changed

6 files changed

+137
-79
lines changed

.github/workflows/components-integration-tests.yaml

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,6 @@ jobs:
4343
env:
4444
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
4545
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
46-
run: |
47-
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
48-
# only dryrun if no secrets
49-
ARGS="--dryrun"
50-
else
51-
ARGS=
52-
fi
53-
scripts/component_integration_tests.py $ARGS
46+
INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
47+
CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
48+
run: scripts/component_integration_tests.py

scripts/component_integration_tests.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
"""
99
Kubernetes integration tests.
1010
"""
11-
import argparse
1211
import os
1312

1413
# pyre-ignore-all-errors[21] # Cannot find module utils
@@ -51,31 +50,26 @@ def get_local_docker_sched_info(image: str) -> SchedulerInfo:
5150

5251

5352
def main() -> None:
54-
parser = argparse.ArgumentParser(description="kubernetes integration test runner")
55-
parser.add_argument(
56-
"--dryrun",
57-
action="store_true",
58-
help="Does not actually submit the app," " just prints the scheduler request",
59-
)
60-
args = parser.parse_args()
6153
print("Starting components integration tests")
6254
torchx_image = "dummy_image"
6355
examples_image = "dummy_image"
56+
dryrun: bool = False
6457
try:
6558
build = build_and_push_image()
6659
torchx_image = build.torchx_image
6760
examples_image = build.examples_image
6861
except MissingEnvError:
62+
dryrun = True
6963
print("Skip runnig tests, executed only docker buid step")
70-
test_suite = IntegComponentTest(timeout=900) # 15 minutes
64+
test_suite = IntegComponentTest(timeout=600) # 10 minutes
7165
test_suite.run_components(
7266
component_provider,
7367
scheduler_infos=[
7468
get_local_cwd_sched_info(os.getcwd()),
7569
get_local_docker_sched_info(torchx_image),
7670
get_k8s_sched_info(torchx_image),
7771
],
78-
dryrun=args.dryrun,
72+
dryrun=dryrun,
7973
)
8074

8175
test_suite.run_components(
@@ -84,7 +78,7 @@ def main() -> None:
8478
get_local_docker_sched_info(examples_image),
8579
get_k8s_sched_info(examples_image),
8680
],
87-
dryrun=args.dryrun,
81+
dryrun=dryrun,
8882
)
8983

9084

torchx/components/integration_tests/component_provider.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import os
8+
import tempfile
89
from abc import ABC, abstractmethod
910

1011
import torchx.components.dist as dist_components
@@ -35,7 +36,7 @@ def tearDown(self) -> None:
3536

3637
class DDPComponentProvider(ComponentProvider):
3738
def get_app_def(self) -> AppDef:
38-
args = ["torchx.apps.test.dummy_script"]
39+
args = ["torchx.components.integration_tests.test.dummy_app"]
3940
rdzv_backend: str = "c10d"
4041
rdzv_endpoint: str = "localhost:29400"
4142
return dist_components.ddp(
@@ -81,19 +82,20 @@ def __init__(self, image: str, scheduler: SchedulerBackend) -> None:
8182
self._file_path = "<None>"
8283

8384
def setUp(self) -> None:
84-
if self._scheduler == "local":
85-
self._file_path: str = os.path.join(os.getcwd(), "test.txt")
85+
fname = "torchx_touch_test.txt"
86+
if self._scheduler == "local_cwd":
87+
self._file_path: str = os.path.join(tempfile.gettempdir(), fname)
8688
else:
87-
self._file_path: str = "test.txt"
89+
self._file_path: str = fname
8890

8991
def get_app_def(self) -> AppDef:
9092
return utils_components.touch(
91-
file="test.txt",
93+
file=self._file_path,
9294
image=self._image,
9395
)
9496

9597
def tearDown(self) -> None:
96-
if self._scheduler == "local" and os.path.exists(self._file_path):
98+
if os.path.exists(self._file_path):
9799
os.remove(self._file_path)
98100

99101

@@ -104,9 +106,10 @@ def __init__(self, image: str, scheduler: SchedulerBackend) -> None:
104106
self._dst_path = "<None>"
105107

106108
def setUp(self) -> None:
107-
if self._scheduler == "local":
108-
self._src_path: str = os.path.join(os.getcwd(), "copy_test.txt")
109-
self._dst_path: str = os.path.join(os.getcwd(), "copy_test.txt.copy")
109+
if self._scheduler == "local_cwd":
110+
fname = "torchx_copy_test.txt"
111+
self._src_path: str = os.path.join(tempfile.gettempdir(), fname)
112+
self._dst_path: str = os.path.join(tempfile.gettempdir(), f"{fname}.copy")
110113
self._process_local_sched()
111114
else:
112115
self._src_path: str = "README.md"
@@ -118,7 +121,9 @@ def _process_local_sched(self) -> None:
118121
f.write("test data")
119122

120123
def tearDown(self) -> None:
121-
if self._scheduler == "local" and os.path.exists(self._dst_path):
124+
if os.path.exists(self._dst_path):
125+
os.remove(self._dst_path)
126+
if self._scheduler == "local_cwd" and os.path.exists(self._dst_path):
122127
os.remove(self._dst_path)
123128

124129
def get_app_def(self) -> AppDef:

torchx/components/integration_tests/integ_tests.py

Lines changed: 96 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99
import traceback
1010
from concurrent.futures import Future, ThreadPoolExecutor
1111
from dataclasses import dataclass
12+
from string import Template
1213
from types import ModuleType
13-
from typing import List, Dict, Optional, Tuple, Type, Callable, cast
14+
from typing import List, Optional, Tuple, Type, Callable, cast
1415

1516
from pyre_extensions import none_throws
1617
from torchx.components.component_test_base import ComponentUtils
1718
from torchx.components.integration_tests.component_provider import ComponentProvider
18-
from torchx.specs import RunConfig, SchedulerBackend
19+
from torchx.specs import RunConfig, SchedulerBackend, AppDef
1920

2021

2122
@dataclass
@@ -27,13 +28,34 @@ class SchedulerInfo:
2728

2829
@dataclass
2930
class AppDefRun:
30-
provider_cls: Type[ComponentProvider]
31+
provider: ComponentProvider
3132
scheduler_info: SchedulerInfo
33+
app_def: AppDef
3234
future: Future
3335

34-
def __str__(self) -> str:
35-
msg = f"`{self.scheduler_info.name}`:`{self.scheduler_info.image}`"
36-
return msg
36+
37+
_SUCCESS_APP_FORMAT_TEMPLATE = """
38+
Name: ${name}
39+
Scheduler: ${scheduler}"""
40+
41+
_FAIL_APP_FORMAT_TEMPLATE = """
42+
Name: ${name}
43+
Provider: ${provider}
44+
Scheduler: ${scheduler}
45+
Image: ${image}
46+
Error: ${error}"""
47+
48+
_REPORT_FORMAT_TEMPLATE = """
49+
${boarder}
50+
Status: Success
51+
${boarder}
52+
${success_report}
53+
\n
54+
${boarder}
55+
Status: Failed
56+
${boarder}
57+
${fail_report}
58+
"""
3759

3860

3961
class IntegComponentTest:
@@ -61,14 +83,14 @@ def run_components(
6183
dryrun: bool = False,
6284
) -> None:
6385
component_providers_cls = self._get_component_providers(module)
64-
futures: List[AppDefRun] = []
86+
app_runs: List[AppDefRun] = []
6587
executor = ThreadPoolExecutor()
6688
for scheduler_info in scheduler_infos:
6789
sched_futures = self._run_component_providers(
6890
executor, component_providers_cls, scheduler_info, dryrun
6991
)
70-
futures += sched_futures
71-
self._wait_and_print_report(futures)
92+
app_runs += sched_futures
93+
self._wait_and_print_report(app_runs)
7294

7395
def _get_component_providers(
7496
self, module: ModuleType
@@ -91,70 +113,94 @@ def _run_component_providers(
91113
scheduler_info: SchedulerInfo,
92114
dryrun: bool = False,
93115
) -> List[AppDefRun]:
94-
futures: List[AppDefRun] = []
116+
app_runs: List[AppDefRun] = []
95117
for provider_cls in component_providers_cls:
118+
provider = self._get_app_def_provider(provider_cls, scheduler_info)
96119
future = executor.submit(
97-
self._run_component_provider, provider_cls, scheduler_info, dryrun
120+
self._run_component_provider, provider, scheduler_info, dryrun
98121
)
99-
futures.append(
122+
app_runs.append(
100123
AppDefRun(
101-
provider_cls=provider_cls,
124+
provider=provider,
102125
scheduler_info=scheduler_info,
103126
future=future,
127+
app_def=provider.get_app_def(),
104128
)
105129
)
106-
return futures
130+
return app_runs
107131

108-
def _run_component_provider(
132+
def _get_app_def_provider(
109133
self,
110134
component_provider_cls: Type[ComponentProvider],
111135
scheduler_info: SchedulerInfo,
136+
) -> ComponentProvider:
137+
provider_cls = cast(Callable[..., ComponentProvider], component_provider_cls)
138+
return none_throws(provider_cls(scheduler_info.name, scheduler_info.image))
139+
140+
def _run_component_provider(
141+
self,
142+
provider: ComponentProvider,
143+
scheduler_info: SchedulerInfo,
112144
dryrun: bool = False,
113-
) -> str:
114-
provider: Optional[ComponentProvider] = None
145+
) -> None:
115146
try:
116-
provider_cls = cast(
117-
Callable[..., ComponentProvider], component_provider_cls
118-
)
119-
provider = none_throws(
120-
provider_cls(scheduler_info.name, scheduler_info.image)
121-
)
122147
provider.setUp()
123-
app_def = provider.get_app_def()
124148
ComponentUtils.run_appdef_on_scheduler(
125-
app_def, scheduler_info.name, scheduler_info.runconfig, dryrun
149+
provider.get_app_def(),
150+
scheduler_info.name,
151+
scheduler_info.runconfig,
152+
dryrun,
126153
)
127-
return app_def.name
128154
finally:
129-
if provider:
130-
provider.tearDown()
155+
provider.tearDown()
131156

132-
def _wait_and_print_report(self, futures: List[AppDefRun]) -> None:
133-
app_runs: Dict[str, List[str]] = {}
157+
def _wait_and_print_report(self, app_runs: List[AppDefRun]) -> None:
158+
succeeded_apps: List[AppDefRun] = []
159+
failed_apps: List[Tuple[AppDefRun, str]] = []
134160
deadline = time.monotonic() + self._timeout
135-
for future in futures:
161+
for app_run in app_runs:
136162
task_timeout = max(0, int(deadline - time.monotonic()))
137-
status, msg = self._get_app_def_run_status(future, task_timeout)
138-
if status not in app_runs:
139-
app_runs[status] = []
140-
app_runs[status].append(msg)
141-
for status, msgs in app_runs.items():
142-
print(f"\nStatus: `{status}`\n")
143-
print("\n".join(msgs))
144-
print("\n")
163+
error_msg = self._get_app_def_run_status(app_run, task_timeout)
164+
if not error_msg:
165+
succeeded_apps.append(app_run)
166+
else:
167+
failed_apps.append((app_run, error_msg))
168+
success_report = ""
169+
for app_run in succeeded_apps:
170+
success_report_run = Template(_SUCCESS_APP_FORMAT_TEMPLATE).substitute(
171+
name=app_run.app_def.name, scheduler=app_run.scheduler_info.name
172+
)
173+
success_report += f"{success_report_run}\n"
174+
fail_report = ""
175+
for app_run, error_msg in failed_apps:
176+
fail_report_run = Template(_FAIL_APP_FORMAT_TEMPLATE).substitute(
177+
name=app_run.app_def.name,
178+
provider=app_run.provider,
179+
scheduler=app_run.scheduler_info.name,
180+
image=app_run.scheduler_info.image,
181+
error=error_msg,
182+
)
183+
fail_report += f"{fail_report_run}\n"
184+
delim = "*"
185+
width = 80
186+
msg = Template(_REPORT_FORMAT_TEMPLATE).substitute(
187+
boarder=delim * width,
188+
success_report=success_report or "<NONE>",
189+
fail_report=fail_report or "<NONE>",
190+
)
191+
print(msg)
192+
if len(failed_apps) > 0:
193+
raise RuntimeError(
194+
"Component test failed, see report above for detailed issue"
195+
)
145196

146197
def _get_app_def_run_status(
147198
self, app_def_run: AppDefRun, timeout: int
148-
) -> Tuple[str, str]:
199+
) -> Optional[str]:
149200
try:
150-
print(f"Retrieving results from {app_def_run}: {app_def_run.provider_cls}")
151-
app_name = app_def_run.future.result(timeout=timeout)
152-
return "succeeded", f"`{app_name}`:`{app_def_run}`"
153-
except Exception as e:
201+
print(f"Retrieving {app_def_run.app_def.name}: {app_def_run.provider}")
202+
app_def_run.future.result(timeout=timeout)
203+
return None
204+
except Exception:
154205
stack_trace_msg = traceback.format_exc().replace("\n", "\n ")
155-
156-
msg = (
157-
f"Failure while running: {app_def_run}, provider: `{app_def_run.provider_cls}`: {e}\n"
158-
f"Stacktrace: {stack_trace_msg}\n"
159-
)
160-
return "failed", msg
206+
return stack_trace_msg
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
8+
def main() -> None:
9+
print("hi from main")
10+
11+
12+
if __name__ == "__main__":
13+
main()

0 commit comments

Comments
 (0)