Skip to content

Commit c9efb2b

Browse files
aivanoufacebook-github-bot
authored andcommitted
Add components integration test (#182)
Summary: Pull Request resolved: #182 The diff introduces an integ tests component framework. It consists of the following parts: * `ComponentProvider` - an abstract class that users will use to implement the `AppDef` instantiation. Since many components have specific arguments that cannot be autofilled, e.g. `utils.copy` must have proper files as inputs, it does not make sense to auto-generate data. * Classes that extend `ComponentProvider` and provide the ability to retrieve scheduler-dependent app defs. * Integration tests class. The class defines a `run_components` method that allows users to run all component providers given a module. * Components integration tests github workflow. The workflow uses the Integration tests library to run the defined components. Reviewed By: d4l3k Differential Revision: D30980471 fbshipit-source-id: b77b5139303ba8943a5ae91333f51217f47f8811
1 parent 53c8f50 commit c9efb2b

File tree

19 files changed

+573
-56
lines changed

19 files changed

+573
-56
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Kubernetes Integration Tests
1+
name: Components Integration Tests
22

33
on:
44
push:
@@ -7,7 +7,7 @@ on:
77
pull_request:
88

99
jobs:
10-
kubernetes-launch:
10+
components-launch:
1111
runs-on: ubuntu-18.04
1212
steps:
1313
- name: Setup Python
@@ -26,28 +26,28 @@ jobs:
2626
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
2727
aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
2828
fi
29+
- name: Configure Docker
30+
env:
31+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
32+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
33+
run: |
34+
set -eux
35+
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
36+
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
37+
fi
2938
- name: Install dependencies
3039
run: |
3140
set -eux
3241
pip install -e .[kubernetes]
33-
- name: Run Kubernetes Integration Tests
42+
- name: Run Components Integration Tests
3443
env:
3544
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
3645
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
3746
run: |
38-
set -eux
3947
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
4048
# only dryrun if no secrets
4149
ARGS="--dryrun"
4250
else
4351
ARGS=
4452
fi
45-
46-
torchx runopts kubernetes
47-
APP_ID="$(torchx run --wait $ARGS --scheduler kubernetes \
48-
--scheduler_args queue=default utils.echo \
49-
--image alpine:latest --num_replicas 3)"
50-
if [ "$ARGS" != "--dryrun" ]; then
51-
torchx status "$APP_ID"
52-
torchx describe "$APP_ID"
53-
fi
53+
scripts/component_integration_tests.py $ARGS

examples/apps/datapreproc/component.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def data_preproc(
3636
input_path: Url-like path to fetch the imagenet dataset
3737
env: Env variables to transfer to the user script
3838
resource: String representation of the resource
39+
dryrun: Starts the app, but does not actually perform any work.
3940
4041
Returns:
4142
specs.AppDef: Torchx AppDef

examples/apps/dist_cifar/component.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,16 @@ def trainer(
5454
specs.AppDef: Torchx AppDef
5555
"""
5656
return ddp(
57-
image,
58-
"examples/apps/dist_cifar/train.py",
59-
rdzv_backend,
60-
rdzv_endpoint,
61-
resource,
62-
nnodes,
63-
nproc_per_node,
64-
base_image,
65-
"cifar-trainer",
66-
"worker",
67-
env,
6857
*script_args,
58+
image=image,
59+
entrypoint="examples/apps/dist_cifar/train.py",
60+
rdzv_backend=rdzv_backend,
61+
rdzv_endpoint=rdzv_endpoint,
62+
resource=resource,
63+
nnodes=nnodes,
64+
nproc_per_node=nproc_per_node,
65+
base_image=base_image,
66+
name="cifar-trainer",
67+
role="worker",
68+
env=env,
6969
)

examples/apps/dist_cifar/train.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,11 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
151151
parser = argparse.ArgumentParser(
152152
description="Trainer that trains the last layer of pretrained resnet18 for cifar10 classification"
153153
)
154+
parser.add_argument(
155+
"--dryrun",
156+
help=argparse.SUPPRESS,
157+
action="store_true",
158+
)
154159
parser.add_argument(
155160
"--epochs", type=int, default=1, help="number of epochs to train"
156161
)
@@ -173,6 +178,9 @@ def get_gpu_devices() -> int:
173178

174179
def main() -> None:
175180
args = parse_args(sys.argv[1:])
181+
if args.dryrun:
182+
print("App dist_cifar.train started successfully")
183+
return
176184
gpus = get_gpu_devices()
177185
batch_size = args.batch_size
178186
num_nodes = int(os.environ["GROUP_WORLD_SIZE"])

examples/apps/lightning_classy_vision/component.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
def trainer(
2121
image: str,
2222
output_path: str,
23-
data_path: str,
23+
data_path: Optional[str] = None,
2424
entrypoint: str = "examples/apps/lightning_classy_vision/train.py",
2525
load_path: str = "",
2626
log_path: str = "/logs",
@@ -35,7 +35,8 @@ def trainer(
3535
image: image to run (e.g. foobar:latest)
3636
output_path: output path for model checkpoints (e.g. file:///foo/bar)
3737
load_path: path to load pretrained model from
38-
data_path: path to the data to load
38+
data_path: path to the data to load, if data_path is not provided,
39+
auto generated test data will be used
3940
entrypoint: user script to launch.
4041
log_path: path to save tensorboard logs to
4142
resource: the resources to use
@@ -51,15 +52,17 @@ def trainer(
5152
load_path,
5253
"--log_path",
5354
log_path,
54-
"--data_path",
55-
data_path,
5655
"--epochs",
5756
str(epochs),
5857
]
58+
if data_path:
59+
args += ["--data_path", data_path]
60+
else:
61+
args.append("--test")
5962
if skip_export:
6063
args.append("--skip_export")
6164
return torchx.AppDef(
62-
name="examples-lightning_classy_vision-trainer",
65+
name="cv-trainer",
6366
roles=[
6467
torchx.Role(
6568
name="worker",
@@ -81,6 +84,7 @@ def interpret(
8184
data_path: str,
8285
output_path: str,
8386
resource: Optional[str] = None,
87+
entrypoint: str = "examples/apps/lightning_classy_vision/interpret.py",
8488
) -> torchx.AppDef:
8589
"""Runs the model interpretability app on the model outputted by the training
8690
component.
@@ -91,13 +95,14 @@ def interpret(
9195
data_path: path to the data to load
9296
output_path: output path for model checkpoints (e.g. file:///foo/bar)
9397
resource: the resources to use
98+
entrypoint: user script to launch.
9499
"""
95100
return torchx.AppDef(
96-
name="examples-lightning_classy_vision-interpret",
101+
name="cv-interpret",
97102
roles=[
98103
torchx.Role(
99104
name="worker",
100-
entrypoint="examples/apps/lightning_classy_vision/interpret.py",
105+
entrypoint=entrypoint,
101106
args=[
102107
"--load_path",
103108
load_path,

examples/apps/lightning_classy_vision/train.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def main(argv: List[str]) -> None:
120120

121121
# Initialize a trainer
122122
trainer = pl.Trainer(
123+
accelerator="ddp2",
123124
logger=logger,
124125
max_epochs=args.epochs,
125126
callbacks=[checkpoint_callback],
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Facebook, Inc. and its affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
"""
9+
Components integration tests.
10+
"""
11+
import argparse
12+
import os
13+
14+
# pyre-ignore-all-errors[21] # Cannot find module utils
15+
# pyre-ignore-all-errors[11]
16+
17+
import example_app_defs as examples_app_defs_providers
18+
import torchx.components.integration_tests.component_provider as component_provider
19+
from integ_test_utils import (
20+
MissingEnvError,
21+
build_images,
22+
push_images,
23+
BuildInfo,
24+
)
25+
from torchx.components.integration_tests.integ_tests import (
26+
IntegComponentTest,
27+
SchedulerInfo,
28+
)
29+
from torchx.specs import RunConfig
30+
31+
32+
def build_and_push_image() -> BuildInfo:
    """Build the test images, push them, and hand back the build info.

    Returns:
        The ``BuildInfo`` produced by ``build_images`` after it has been
        passed to ``push_images``.
    """
    build_info = build_images()
    push_images(build_info)
    return build_info
36+
37+
38+
def get_k8s_sched_info(image: str) -> SchedulerInfo:
    """Describe the ``kubernetes`` scheduler target for the given image.

    Args:
        image: Image passed through to the returned ``SchedulerInfo``.

    Returns:
        A ``SchedulerInfo`` named ``kubernetes`` whose run config sets
        ``namespace`` to ``torchx-dev`` and ``queue`` to ``default``.
    """
    runconfig = RunConfig()
    # Options are applied in the same order as they are listed here.
    for option, value in (("namespace", "torchx-dev"), ("queue", "default")):
        runconfig.set(option, value)
    return SchedulerInfo(name="kubernetes", image=image, runconfig=runconfig)
43+
44+
45+
def get_local_cwd_sched_info(image: str) -> SchedulerInfo:
    """Describe the ``local_cwd`` scheduler target with an empty run config.

    Args:
        image: Value stored as the ``image`` of the returned ``SchedulerInfo``.

    Returns:
        A ``SchedulerInfo`` named ``local_cwd`` with a fresh ``RunConfig``.
    """
    empty_cfg = RunConfig()
    return SchedulerInfo(name="local_cwd", image=image, runconfig=empty_cfg)
47+
48+
49+
def get_local_docker_sched_info(image: str) -> SchedulerInfo:
    """Describe the ``local_docker`` scheduler target with an empty run config.

    Args:
        image: Value stored as the ``image`` of the returned ``SchedulerInfo``.

    Returns:
        A ``SchedulerInfo`` named ``local_docker`` with a fresh ``RunConfig``.
    """
    empty_cfg = RunConfig()
    return SchedulerInfo(name="local_docker", image=image, runconfig=empty_cfg)
51+
52+
53+
def main() -> None:
    """Run the components integration test suites.

    Parses the ``--dryrun`` flag, tries to build and push the test images,
    and then runs two suites through ``IntegComponentTest.run_components``:
    the providers in ``component_provider`` (on ``local_cwd``,
    ``local_docker`` and ``kubernetes``) and the example app providers in
    ``example_app_defs`` (on ``local_docker`` and ``kubernetes``). The
    ``--dryrun`` flag is forwarded unchanged to both suites.

    If building/pushing raises ``MissingEnvError`` the placeholder image
    name ``"dummy_image"`` is used for both the torchx and examples images.
    """
    parser = argparse.ArgumentParser(
        description="components integration test runner"
    )
    parser.add_argument(
        "--dryrun",
        action="store_true",
        help="Does not actually submit the app, just prints the scheduler request",
    )
    args = parser.parse_args()
    print("Starting components integration tests")

    # Placeholders that stay in effect when no image could be built and pushed.
    torchx_image = "dummy_image"
    examples_image = "dummy_image"
    try:
        # Keep the try body limited to the call that can raise MissingEnvError.
        build = build_and_push_image()
    except MissingEnvError:
        # NOTE(review): the suites below still run after this message, using
        # the placeholder images -- confirm that this is the intended flow.
        print("Skip running tests, executed only docker build step")
    else:
        torchx_image = build.torchx_image
        examples_image = build.examples_image

    test_suite = IntegComponentTest(timeout=900)  # 15 minutes

    # Built-in components: local_cwd receives the current working directory
    # as its "image"; the other schedulers receive the torchx image.
    test_suite.run_components(
        component_provider,
        scheduler_infos=[
            get_local_cwd_sched_info(os.getcwd()),
            get_local_docker_sched_info(torchx_image),
            get_k8s_sched_info(torchx_image),
        ],
        dryrun=args.dryrun,
    )

    # Example apps: run only on the schedulers given the examples image.
    test_suite.run_components(
        examples_app_defs_providers,
        scheduler_infos=[
            get_local_docker_sched_info(examples_image),
            get_k8s_sched_info(examples_image),
        ],
        dryrun=args.dryrun,
    )
89+
90+
91+
if __name__ == "__main__":
92+
main()

scripts/example_app_defs.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Facebook, Inc. and its affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
"""
9+
App Defs for integration tests.
10+
"""
11+
12+
13+
import examples.apps.datapreproc.component as dp_component
14+
import examples.apps.dist_cifar.component as dist_cifar_component
15+
import examples.apps.lightning_classy_vision.component as cv_component
16+
from torchx.components.integration_tests.component_provider import ComponentProvider
17+
from torchx.specs import AppDef
18+
19+
20+
class CvTrainerComponentProvider(ComponentProvider):
    """Provider for the ``lightning_classy_vision`` trainer component."""

    def get_app_def(self) -> AppDef:
        """Return the trainer ``AppDef`` built for this provider's image.

        Output and log paths both point at ``/tmp`` and export is skipped;
        no ``data_path`` is passed to the component.
        """
        trainer_kwargs = {
            "image": self._image,
            "output_path": "/tmp",
            "skip_export": True,
            "log_path": "/tmp",
        }
        return cv_component.trainer(**trainer_kwargs)
28+
29+
30+
class DistCifarComponentProvider(ComponentProvider):
    """Provider for the ``dist_cifar`` trainer component."""

    def get_app_def(self) -> AppDef:
        """Return the trainer ``AppDef`` built for this provider's image.

        The script arguments ``--output_path /tmp --dryrun`` are forwarded
        positionally to the component.
        """
        return dist_cifar_component.trainer(
            "--output_path",
            "/tmp",
            "--dryrun",
            image=self._image,
        )
37+
38+
39+
class DatapreprocComponentProvider(ComponentProvider):
    """Provider for the ``datapreproc`` component."""

    def get_app_def(self) -> AppDef:
        """Return the ``data_preproc`` ``AppDef`` writing to ``/tmp/test``."""
        preproc_kwargs = {"image": self._image, "output_path": "/tmp/test"}
        return dp_component.data_preproc(**preproc_kwargs)

torchx/apps/serve/serve.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import fsspec
2222
import requests
2323

24-
2524
TORCHSERVE_PARAMS = (
2625
"model_name",
2726
"handler",
@@ -56,6 +55,12 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
5655
help="timeout for requests to management api",
5756
default=60,
5857
)
58+
parser.add_argument(
59+
"--dryrun",
60+
action="store_true",
61+
help=argparse.SUPPRESS,
62+
)
63+
5964
parser.add_argument(
6065
"--port",
6166
type=int,
@@ -96,6 +101,9 @@ def rand_id() -> str:
96101

97102
def main(argv: List[str]) -> None:
98103
args = parse_args(argv)
104+
if args.dryrun:
105+
print("App serve started successfully")
106+
return
99107
with tempfile.TemporaryDirectory() as tmpdir:
100108
model_name = args.model_name or "model"
101109
model_file = f"{model_name}_{rand_id()}.mar"

0 commit comments

Comments
 (0)