Commit d3c5205

mvsusp authored and laurenyu committed
change: improving Chainer integ tests (#872)
1 parent 7b9ad5c commit d3c5205

File tree

3 files changed: +85 -102 lines changed

tests/data/chainer_mnist/distributed_mnist.py

Lines changed: 3 additions & 6 deletions
@@ -46,7 +46,7 @@ def __call__(self, x):
 
 
 def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
-    images = raw["x"]
+    images = raw["x"][-100:]
     if ndim == 2:
         images = images.reshape(-1, 28, 28)
     elif ndim == 3:
@@ -59,7 +59,7 @@ def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
     images *= scale / 255.0
 
     if withlabel:
-        labels = raw["y"].astype(label_dtype)
+        labels = raw["y"][-100:].astype(label_dtype)
         return tuple_dataset.TupleDataset(images, labels)
     return images
 
@@ -111,9 +111,6 @@ def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
     optimizer = chainermn.create_multi_node_optimizer(chainer.optimizers.Adam(), comm)
     optimizer.setup(model)
 
-    train_file = np.load(os.path.join(args.train, "train.npz"))
-    test_file = np.load(os.path.join(args.test, "test.npz"))
-
     preprocess_mnist_options = {
         "withlabel": True,
         "ndim": 1,
@@ -173,7 +170,7 @@ def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
     trainer.run()
 
     # only save the model in the master node
-    if args.host == "algo-1":
+    if args.host == env.hosts[0]:
         serializers.save_npz(os.path.join(env.model_dir, "model.npz"), model)
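
The [-100:] slicing above is what makes these scripts cheap enough for integration tests: only the last 100 MNIST samples are kept, so a single epoch finishes in seconds rather than minutes. A minimal sketch of the effect, using small illustrative NumPy arrays in place of the real train.npz / test.npz contents:

import numpy as np

# Illustrative stand-in for the raw MNIST dict the script loads from an .npz file
# (1,000 samples here instead of the real 60,000, purely for brevity).
raw = {
    "x": np.random.rand(1000, 784).astype(np.float32),  # flattened images
    "y": np.random.randint(0, 10, size=1000),           # digit labels
}

# Same idea as the edited _preprocess_mnist: keep only the last 100 samples.
images = raw["x"][-100:]
labels = raw["y"][-100:].astype(np.int32)

assert images.shape == (100, 784)
assert labels.shape == (100,)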

tests/data/chainer_mnist/mnist.py

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ def __call__(self, x):
 
 
 def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
-    images = raw["x"]
+    images = raw["x"][-100:]
     if ndim == 2:
         images = images.reshape(-1, 28, 28)
     elif ndim == 3:
@@ -55,7 +55,7 @@ def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
     images *= scale / 255.0
 
     if withlabel:
-        labels = raw["y"].astype(label_dtype)
+        labels = raw["y"][-100:].astype(label_dtype)
         return tuple_dataset.TupleDataset(images, labels)
     else:
         return images
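
When withlabel is true, the sliced arrays are wrapped in chainer.datasets.tuple_dataset.TupleDataset, which pairs them element-wise so that dataset[i] yields an (image, label) tuple for the trainer. A small sketch with illustrative arrays (assumes Chainer is installed):

import numpy as np
from chainer.datasets import tuple_dataset

# Illustrative arrays standing in for the sliced MNIST data above.
images = np.random.rand(100, 784).astype(np.float32)
labels = np.random.randint(0, 10, size=100).astype(np.int32)

# dataset[i] == (images[i], labels[i])
dataset = tuple_dataset.TupleDataset(images, labels)
image, label = dataset[0]
print(image.shape, label)  # (784,) and an integer label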

tests/integ/test_chainer_train.py

Lines changed: 80 additions & 94 deletions
@@ -13,152 +13,138 @@
 from __future__ import absolute_import
 
 import os
-import time
 
-import pytest
 import numpy
+import pytest
 
-from sagemaker.chainer.defaults import CHAINER_VERSION
 from sagemaker.chainer.estimator import Chainer
 from sagemaker.chainer.model import ChainerModel
 from sagemaker.utils import unique_name_from_base
-import tests.integ
 from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
 @pytest.fixture(scope="module")
-def chainer_training_job(sagemaker_session, chainer_full_version):
-    return _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 1, chainer_full_version)
+def chainer_local_training_job(sagemaker_local_session, chainer_full_version):
+    return _run_mnist_training_job(sagemaker_local_session, "local", 1, chainer_full_version)
+
+
+@pytest.mark.local_mode
+def test_distributed_cpu_training(sagemaker_local_session, chainer_full_version):
+    _run_mnist_training_job(sagemaker_local_session, "local", 2, chainer_full_version)
 
 
-def test_distributed_cpu_training(sagemaker_session, chainer_full_version):
-    _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 2, chainer_full_version)
+@pytest.mark.local_mode
+def test_training_with_additional_hyperparameters(sagemaker_local_session, chainer_full_version):
+    script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
+    data_path = os.path.join(DATA_DIR, "chainer_mnist")
 
+    chainer = Chainer(
+        entry_point=script_path,
+        role="SageMakerRole",
+        train_instance_count=1,
+        train_instance_type="local",
+        framework_version=chainer_full_version,
+        py_version=PYTHON_VERSION,
+        sagemaker_session=sagemaker_local_session,
+        hyperparameters={"epochs": 1},
+        use_mpi=True,
+        num_processes=2,
+        process_slots_per_host=2,
+        additional_mpi_options="-x NCCL_DEBUG=INFO",
+    )
 
-@pytest.mark.skipif(
-    tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS
-    or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS,
-    reason="no ml.p2 instances in these regions",
-)
-def test_distributed_gpu_training(sagemaker_session, chainer_full_version):
-    _run_mnist_training_job(sagemaker_session, "ml.p2.xlarge", 2, chainer_full_version)
+    train_input = "file://" + os.path.join(data_path, "train")
+    test_input = "file://" + os.path.join(data_path, "test")
 
+    chainer.fit({"train": train_input, "test": test_input})
 
-def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
+
+@pytest.mark.canary_quick
+@pytest.mark.regional_testing
+def test_attach_deploy(sagemaker_session, chainer_full_version):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
         data_path = os.path.join(DATA_DIR, "chainer_mnist")
 
         chainer = Chainer(
             entry_point=script_path,
             role="SageMakerRole",
-            train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
             framework_version=chainer_full_version,
             py_version=PYTHON_VERSION,
+            train_instance_count=1,
+            train_instance_type="ml.c4.xlarge",
             sagemaker_session=sagemaker_session,
             hyperparameters={"epochs": 1},
-            use_mpi=True,
-            num_processes=2,
-            process_slots_per_host=2,
-            additional_mpi_options="-x NCCL_DEBUG=INFO",
         )
 
-        train_input = chainer.sagemaker_session.upload_data(
+        train_input = sagemaker_session.upload_data(
             path=os.path.join(data_path, "train"), key_prefix="integ-test-data/chainer_mnist/train"
         )
-        test_input = chainer.sagemaker_session.upload_data(
+
+        test_input = sagemaker_session.upload_data(
             path=os.path.join(data_path, "test"), key_prefix="integ-test-data/chainer_mnist/test"
         )
 
         job_name = unique_name_from_base("test-chainer-training")
-        chainer.fit({"train": train_input, "test": test_input}, job_name=job_name)
-        return chainer.latest_training_job.name
-
+        chainer.fit({"train": train_input, "test": test_input}, wait=False, job_name=job_name)
 
-@pytest.mark.canary_quick
-@pytest.mark.regional_testing
-def test_attach_deploy(chainer_training_job, sagemaker_session):
     endpoint_name = unique_name_from_base("test-chainer-attach-deploy")
 
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        estimator = Chainer.attach(chainer_training_job, sagemaker_session=sagemaker_session)
+        estimator = Chainer.attach(
+            chainer.latest_training_job.name, sagemaker_session=sagemaker_session
+        )
         predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
         _predict_and_assert(predictor)
 
 
-def test_deploy_model(chainer_training_job, sagemaker_session):
-    endpoint_name = unique_name_from_base("test-chainer-deploy-model")
-    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        desc = sagemaker_session.sagemaker_client.describe_training_job(
-            TrainingJobName=chainer_training_job
-        )
-        model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
-        script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
-        model = ChainerModel(
-            model_data,
-            "SageMakerRole",
-            entry_point=script_path,
-            sagemaker_session=sagemaker_session,
-        )
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
-        _predict_and_assert(predictor)
-
+@pytest.mark.local_mode
+def test_deploy_model(chainer_local_training_job, sagemaker_local_session):
+    script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
 
-def test_async_fit(sagemaker_session):
-    with timeout(minutes=5):
-        training_job_name = _run_mnist_training_job(
-            sagemaker_session, "ml.c4.xlarge", 1, chainer_full_version=CHAINER_VERSION, wait=False
-        )
+    model = ChainerModel(
+        chainer_local_training_job.model_data,
+        "SageMakerRole",
+        entry_point=script_path,
+        sagemaker_session=sagemaker_local_session,
+    )
 
-        print("Waiting to re-attach to the training job: %s" % training_job_name)
-        time.sleep(20)
-
-    endpoint_name = unique_name_from_base("test-chainer-async-fit")
-    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        print("Re-attaching now to: %s" % training_job_name)
-        estimator = Chainer.attach(
-            training_job_name=training_job_name, sagemaker_session=sagemaker_session
-        )
-        predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
+    predictor = model.deploy(1, "local")
+    try:
         _predict_and_assert(predictor)
+    finally:
+        predictor.delete_endpoint()
 
 
 def _run_mnist_training_job(
     sagemaker_session, instance_type, instance_count, chainer_full_version, wait=True
 ):
-    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-
-        script_path = (
-            os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
-            if instance_type == 1
-            else os.path.join(DATA_DIR, "chainer_mnist", "distributed_mnist.py")
-        )
-
-        data_path = os.path.join(DATA_DIR, "chainer_mnist")
-
-        chainer = Chainer(
-            entry_point=script_path,
-            role="SageMakerRole",
-            framework_version=chainer_full_version,
-            py_version=PYTHON_VERSION,
-            train_instance_count=instance_count,
-            train_instance_type=instance_type,
-            sagemaker_session=sagemaker_session,
-            hyperparameters={"epochs": 1},
-        )
-
-        train_input = chainer.sagemaker_session.upload_data(
-            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/chainer_mnist/train"
-        )
-        test_input = chainer.sagemaker_session.upload_data(
-            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/chainer_mnist/test"
-        )
-
-        job_name = unique_name_from_base("test-chainer-training")
-        chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
-        return chainer.latest_training_job.name
+    script_path = (
+        os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
+        if instance_type == 1
+        else os.path.join(DATA_DIR, "chainer_mnist", "distributed_mnist.py")
+    )
+
+    data_path = os.path.join(DATA_DIR, "chainer_mnist")
+
+    chainer = Chainer(
+        entry_point=script_path,
+        role="SageMakerRole",
+        framework_version=chainer_full_version,
+        py_version=PYTHON_VERSION,
+        train_instance_count=instance_count,
+        train_instance_type=instance_type,
+        sagemaker_session=sagemaker_session,
+        hyperparameters={"epochs": 1},
+    )
+
+    train_input = "file://" + os.path.join(data_path, "train")
+    test_input = "file://" + os.path.join(data_path, "test")
+
+    job_name = unique_name_from_base("test-chainer-training")
+    chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
+    return chainer
 
 
 def _predict_and_assert(predictor):
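
The rewritten tests lean on SageMaker local mode: train_instance_type="local" runs the training container on the test machine via Docker, and channel inputs are plain file:// URIs instead of S3 uploads. A rough standalone sketch of that pattern; the integ tests obtain sagemaker_local_session and chainer_full_version from pytest fixtures, so the LocalSession construction, the data paths, and the framework version below are illustrative assumptions:

import os

from sagemaker.chainer.estimator import Chainer
from sagemaker.local import LocalSession

# Hypothetical checkout-relative paths; the tests build these from tests.integ.DATA_DIR.
data_path = os.path.join("tests", "data", "chainer_mnist")
script_path = os.path.join(data_path, "mnist.py")

sagemaker_local_session = LocalSession()  # requires a working Docker setup

chainer = Chainer(
    entry_point=script_path,
    role="SageMakerRole",
    train_instance_count=1,
    train_instance_type="local",  # run the training container locally, not on ml.* instances
    framework_version="5.0.0",    # illustrative Chainer version
    py_version="py3",
    sagemaker_session=sagemaker_local_session,
    hyperparameters={"epochs": 1},
)

# Local mode reads channel data straight from the filesystem; no S3 upload is needed.
train_input = "file://" + os.path.join(data_path, "train")
test_input = "file://" + os.path.join(data_path, "test")

chainer.fit({"train": train_input, "test": test_input})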
