|
13 | 13 | from __future__ import absolute_import
|
14 | 14 |
|
15 | 15 | import os
|
16 |
| -import time |
17 | 16 |
|
18 |
| -import pytest |
19 | 17 | import numpy
|
| 18 | +import pytest |
20 | 19 |
|
21 |
| -from sagemaker.chainer.defaults import CHAINER_VERSION |
22 | 20 | from sagemaker.chainer.estimator import Chainer
|
23 | 21 | from sagemaker.chainer.model import ChainerModel
|
24 | 22 | from sagemaker.utils import unique_name_from_base
|
25 |
| -import tests.integ |
26 | 23 | from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES
|
27 | 24 | from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
|
28 | 25 |
|
29 | 26 |
|
@pytest.fixture(scope="module")
def chainer_local_training_job(sagemaker_local_session, chainer_full_version):
    """Run one single-instance MNIST training job with instance type ``"local"``.

    The fixture is module-scoped, so the job is run once and its result is
    shared by every test in this module that requests the fixture.

    Returns:
        Whatever ``_run_mnist_training_job`` returns for that job.
    """
    return _run_mnist_training_job(
        sagemaker_session=sagemaker_local_session,
        instance_type="local",
        instance_count=1,
        chainer_full_version=chainer_full_version,
    )
| 30 | + |
| 31 | + |
@pytest.mark.local_mode
def test_distributed_cpu_training(sagemaker_local_session, chainer_full_version):
    """Run the MNIST training job with instance type ``"local"`` and two instances.

    The test has no explicit assertions; it passes when the training helper
    completes without raising.
    """
    _run_mnist_training_job(
        sagemaker_session=sagemaker_local_session,
        instance_type="local",
        instance_count=2,
        chainer_full_version=chainer_full_version,
    )
33 | 35 |
|
34 | 36 |
|
35 |
| -def test_distributed_cpu_training(sagemaker_session, chainer_full_version): |
36 |
| - _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 2, chainer_full_version) |
@pytest.mark.local_mode
def test_training_with_additional_hyperparameters(sagemaker_local_session, chainer_full_version):
    """Fit a Chainer estimator configured with the MPI-related constructor options.

    The estimator is created with ``use_mpi``, ``num_processes``,
    ``process_slots_per_host`` and ``additional_mpi_options`` set, then fitted
    on the MNIST data under ``DATA_DIR`` using ``file://`` inputs. There are no
    explicit assertions; the test passes when ``fit`` returns without raising.
    """
    mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")

    estimator = Chainer(
        entry_point=os.path.join(DATA_DIR, "chainer_mnist", "mnist.py"),
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="local",
        framework_version=chainer_full_version,
        py_version=PYTHON_VERSION,
        sagemaker_session=sagemaker_local_session,
        hyperparameters={"epochs": 1},
        use_mpi=True,
        num_processes=2,
        process_slots_per_host=2,
        additional_mpi_options="-x NCCL_DEBUG=INFO",
    )

    # One file:// URI per channel, pointing at the matching sub-directory.
    channels = {
        channel: "file://" + os.path.join(mnist_dir, channel) for channel in ("train", "test")
    }

    estimator.fit(channels)
47 | 61 |
|
48 |
| -def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version): |
| 62 | + |
@pytest.mark.canary_quick
@pytest.mark.regional_testing
def test_attach_deploy(sagemaker_session, chainer_full_version):
    """Start a training job, re-attach to it by name, deploy it and predict.

    Training is started with ``wait=False``, so ``fit`` is not asked to block
    until the job finishes. A second estimator is then obtained through
    ``Chainer.attach`` using the job name, deployed to an endpoint, and checked
    with ``_predict_and_assert``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")

        trainer = Chainer(
            entry_point=os.path.join(DATA_DIR, "chainer_mnist", "mnist.py"),
            role="SageMakerRole",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        # Upload the "train" channel first, then "test", each under its own key prefix.
        uploads = (
            ("train", "integ-test-data/chainer_mnist/train"),
            ("test", "integ-test-data/chainer_mnist/test"),
        )
        channels = {}
        for channel, key_prefix in uploads:
            channels[channel] = sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel), key_prefix=key_prefix
            )

        training_job_name = unique_name_from_base("test-chainer-training")

        trainer.fit(channels, wait=False, job_name=training_job_name)

    endpoint_name = unique_name_from_base("test-chainer-attach-deploy")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = Chainer.attach(
            trainer.latest_training_job.name, sagemaker_session=sagemaker_session
        )
        predictor = attached.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
|
89 | 100 |
|
90 | 101 |
|
91 |
| -def test_deploy_model(chainer_training_job, sagemaker_session): |
92 |
| - endpoint_name = unique_name_from_base("test-chainer-deploy-model") |
93 |
| - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): |
94 |
| - desc = sagemaker_session.sagemaker_client.describe_training_job( |
95 |
| - TrainingJobName=chainer_training_job |
96 |
| - ) |
97 |
| - model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] |
98 |
| - script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
99 |
| - model = ChainerModel( |
100 |
| - model_data, |
101 |
| - "SageMakerRole", |
102 |
| - entry_point=script_path, |
103 |
| - sagemaker_session=sagemaker_session, |
104 |
| - ) |
105 |
| - predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) |
106 |
| - _predict_and_assert(predictor) |
107 |
| - |
@pytest.mark.local_mode
def test_deploy_model(chainer_local_training_job, sagemaker_local_session):
    """Deploy a ``ChainerModel`` built from the fixture's model data and predict.

    The model is constructed from ``chainer_local_training_job.model_data`` and
    deployed with instance type ``"local"``; the resulting predictor is checked
    with ``_predict_and_assert``.
    """
    inference_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")

    chainer_model = ChainerModel(
        chainer_local_training_job.model_data,
        "SageMakerRole",
        entry_point=inference_script,
        sagemaker_session=sagemaker_local_session,
    )

    endpoint = chainer_model.deploy(1, "local")
    try:
        _predict_and_assert(endpoint)
    finally:
        # Delete the endpoint whether or not the prediction check passed.
        endpoint.delete_endpoint()
126 | 118 |
|
127 | 119 |
|
def _run_mnist_training_job(
    sagemaker_session, instance_type, instance_count, chainer_full_version, wait=True
):
    """Fit a Chainer estimator on the MNIST data under ``DATA_DIR`` and return it.

    Args:
        sagemaker_session: Session object handed to the ``Chainer`` estimator.
        instance_type: Value passed as ``train_instance_type`` (callers in this
            module pass ``"local"``).
        instance_count: Value passed as ``train_instance_count``. It also
            selects the entry point: ``mnist.py`` for a single instance,
            ``distributed_mnist.py`` otherwise.
        chainer_full_version: Value passed as ``framework_version``.
        wait: Forwarded to ``Chainer.fit``.

    Returns:
        The ``Chainer`` estimator after ``fit`` has been called on it.
    """
    # Choose the script from the instance *count*. The previous check compared
    # the instance type (a string such as "local") against 1, which is never
    # true, so the single-instance script could not be selected.
    script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
    script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_full_version,
        py_version=PYTHON_VERSION,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
    )

    # The data is passed as file:// URIs rather than being uploaded first.
    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
    return chainer
162 | 148 |
|
163 | 149 |
|
164 | 150 | def _predict_and_assert(predictor):
|
|
0 commit comments