|
13 | 13 | from __future__ import absolute_import |
14 | 14 |
|
15 | 15 | import os |
16 | | -import time |
17 | 16 |
|
18 | | -import pytest |
19 | 17 | import numpy |
| 18 | +import pytest |
20 | 19 |
|
21 | | -from sagemaker.chainer.defaults import CHAINER_VERSION |
22 | 20 | from sagemaker.chainer.estimator import Chainer |
23 | 21 | from sagemaker.chainer.model import ChainerModel |
24 | 22 | from sagemaker.utils import unique_name_from_base |
25 | | -import tests.integ |
26 | 23 | from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES |
27 | 24 | from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name |
28 | 25 |
|
29 | 26 |
|
30 | 27 | @pytest.fixture(scope="module") |
31 | | -def chainer_training_job(sagemaker_session, chainer_full_version): |
32 | | - return _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 1, chainer_full_version) |
| 28 | +def chainer_local_training_job(sagemaker_local_session, chainer_full_version): |
| 29 | + return _run_mnist_training_job(sagemaker_local_session, "local", 1, chainer_full_version) |
| 30 | + |
| 31 | + |
| 32 | +@pytest.mark.local_mode |
| 33 | +def test_distributed_cpu_training(sagemaker_local_session, chainer_full_version): |
| 34 | + _run_mnist_training_job(sagemaker_local_session, "local", 2, chainer_full_version) |
33 | 35 |
|
34 | 36 |
|
35 | | -def test_distributed_cpu_training(sagemaker_session, chainer_full_version): |
36 | | - _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 2, chainer_full_version) |
| 37 | +@pytest.mark.local_mode |
| 38 | +def test_training_with_additional_hyperparameters(sagemaker_local_session, chainer_full_version): |
| 39 | + script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
| 40 | + data_path = os.path.join(DATA_DIR, "chainer_mnist") |
37 | 41 |
|
| 42 | + chainer = Chainer( |
| 43 | + entry_point=script_path, |
| 44 | + role="SageMakerRole", |
| 45 | + train_instance_count=1, |
| 46 | + train_instance_type="local", |
| 47 | + framework_version=chainer_full_version, |
| 48 | + py_version=PYTHON_VERSION, |
| 49 | + sagemaker_session=sagemaker_local_session, |
| 50 | + hyperparameters={"epochs": 1}, |
| 51 | + use_mpi=True, |
| 52 | + num_processes=2, |
| 53 | + process_slots_per_host=2, |
| 54 | + additional_mpi_options="-x NCCL_DEBUG=INFO", |
| 55 | + ) |
38 | 56 |
|
39 | | -@pytest.mark.skipif( |
40 | | - tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS |
41 | | - or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS, |
42 | | - reason="no ml.p2 instances in these regions", |
43 | | -) |
44 | | -def test_distributed_gpu_training(sagemaker_session, chainer_full_version): |
45 | | - _run_mnist_training_job(sagemaker_session, "ml.p2.xlarge", 2, chainer_full_version) |
| 57 | + train_input = "file://" + os.path.join(data_path, "train") |
| 58 | + test_input = "file://" + os.path.join(data_path, "test") |
46 | 59 |
|
| 60 | + chainer.fit({"train": train_input, "test": test_input}) |
47 | 61 |
|
48 | | -def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version): |
| 62 | + |
| 63 | +@pytest.mark.canary_quick |
| 64 | +@pytest.mark.regional_testing |
| 65 | +def test_attach_deploy(sagemaker_session, chainer_full_version): |
49 | 66 | with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): |
50 | 67 | script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
51 | 68 | data_path = os.path.join(DATA_DIR, "chainer_mnist") |
52 | 69 |
|
53 | 70 | chainer = Chainer( |
54 | 71 | entry_point=script_path, |
55 | 72 | role="SageMakerRole", |
56 | | - train_instance_count=1, |
57 | | - train_instance_type="ml.c4.xlarge", |
58 | 73 | framework_version=chainer_full_version, |
59 | 74 | py_version=PYTHON_VERSION, |
| 75 | + train_instance_count=1, |
| 76 | + train_instance_type="ml.c4.xlarge", |
60 | 77 | sagemaker_session=sagemaker_session, |
61 | 78 | hyperparameters={"epochs": 1}, |
62 | | - use_mpi=True, |
63 | | - num_processes=2, |
64 | | - process_slots_per_host=2, |
65 | | - additional_mpi_options="-x NCCL_DEBUG=INFO", |
66 | 79 | ) |
67 | 80 |
|
68 | | - train_input = chainer.sagemaker_session.upload_data( |
| 81 | + train_input = sagemaker_session.upload_data( |
69 | 82 | path=os.path.join(data_path, "train"), key_prefix="integ-test-data/chainer_mnist/train" |
70 | 83 | ) |
71 | | - test_input = chainer.sagemaker_session.upload_data( |
| 84 | + |
| 85 | + test_input = sagemaker_session.upload_data( |
72 | 86 | path=os.path.join(data_path, "test"), key_prefix="integ-test-data/chainer_mnist/test" |
73 | 87 | ) |
74 | 88 |
|
75 | 89 | job_name = unique_name_from_base("test-chainer-training") |
76 | | - chainer.fit({"train": train_input, "test": test_input}, job_name=job_name) |
77 | | - return chainer.latest_training_job.name |
78 | | - |
| 90 | + chainer.fit({"train": train_input, "test": test_input}, wait=False, job_name=job_name) |
79 | 91 |
|
80 | | -@pytest.mark.canary_quick |
81 | | -@pytest.mark.regional_testing |
82 | | -def test_attach_deploy(chainer_training_job, sagemaker_session): |
83 | 92 | endpoint_name = unique_name_from_base("test-chainer-attach-deploy") |
84 | 93 |
|
85 | 94 | with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): |
86 | | - estimator = Chainer.attach(chainer_training_job, sagemaker_session=sagemaker_session) |
| 95 | + estimator = Chainer.attach( |
| 96 | + chainer.latest_training_job.name, sagemaker_session=sagemaker_session |
| 97 | + ) |
87 | 98 | predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) |
88 | 99 | _predict_and_assert(predictor) |
89 | 100 |
|
90 | 101 |
|
91 | | -def test_deploy_model(chainer_training_job, sagemaker_session): |
92 | | - endpoint_name = unique_name_from_base("test-chainer-deploy-model") |
93 | | - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): |
94 | | - desc = sagemaker_session.sagemaker_client.describe_training_job( |
95 | | - TrainingJobName=chainer_training_job |
96 | | - ) |
97 | | - model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] |
98 | | - script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
99 | | - model = ChainerModel( |
100 | | - model_data, |
101 | | - "SageMakerRole", |
102 | | - entry_point=script_path, |
103 | | - sagemaker_session=sagemaker_session, |
104 | | - ) |
105 | | - predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) |
106 | | - _predict_and_assert(predictor) |
107 | | - |
| 102 | +@pytest.mark.local_mode |
| 103 | +def test_deploy_model(chainer_local_training_job, sagemaker_local_session): |
| 104 | + script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
108 | 105 |
|
109 | | -def test_async_fit(sagemaker_session): |
110 | | - with timeout(minutes=5): |
111 | | - training_job_name = _run_mnist_training_job( |
112 | | - sagemaker_session, "ml.c4.xlarge", 1, chainer_full_version=CHAINER_VERSION, wait=False |
113 | | - ) |
| 106 | + model = ChainerModel( |
| 107 | + chainer_local_training_job.model_data, |
| 108 | + "SageMakerRole", |
| 109 | + entry_point=script_path, |
| 110 | + sagemaker_session=sagemaker_local_session, |
| 111 | + ) |
114 | 112 |
|
115 | | - print("Waiting to re-attach to the training job: %s" % training_job_name) |
116 | | - time.sleep(20) |
117 | | - |
118 | | - endpoint_name = unique_name_from_base("test-chainer-async-fit") |
119 | | - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): |
120 | | - print("Re-attaching now to: %s" % training_job_name) |
121 | | - estimator = Chainer.attach( |
122 | | - training_job_name=training_job_name, sagemaker_session=sagemaker_session |
123 | | - ) |
124 | | - predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name) |
| 113 | + predictor = model.deploy(1, "local") |
| 114 | + try: |
125 | 115 | _predict_and_assert(predictor) |
| 116 | + finally: |
| 117 | + predictor.delete_endpoint() |
126 | 118 |
|
127 | 119 |
|
128 | 120 | def _run_mnist_training_job( |
129 | 121 | sagemaker_session, instance_type, instance_count, chainer_full_version, wait=True |
130 | 122 | ): |
131 | | - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): |
132 | | - |
133 | | - script_path = ( |
134 | | - os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
135 | | - if instance_type == 1 |
136 | | - else os.path.join(DATA_DIR, "chainer_mnist", "distributed_mnist.py") |
137 | | - ) |
138 | | - |
139 | | - data_path = os.path.join(DATA_DIR, "chainer_mnist") |
140 | | - |
141 | | - chainer = Chainer( |
142 | | - entry_point=script_path, |
143 | | - role="SageMakerRole", |
144 | | - framework_version=chainer_full_version, |
145 | | - py_version=PYTHON_VERSION, |
146 | | - train_instance_count=instance_count, |
147 | | - train_instance_type=instance_type, |
148 | | - sagemaker_session=sagemaker_session, |
149 | | - hyperparameters={"epochs": 1}, |
150 | | - ) |
151 | | - |
152 | | - train_input = chainer.sagemaker_session.upload_data( |
153 | | - path=os.path.join(data_path, "train"), key_prefix="integ-test-data/chainer_mnist/train" |
154 | | - ) |
155 | | - test_input = chainer.sagemaker_session.upload_data( |
156 | | - path=os.path.join(data_path, "test"), key_prefix="integ-test-data/chainer_mnist/test" |
157 | | - ) |
158 | | - |
159 | | - job_name = unique_name_from_base("test-chainer-training") |
160 | | - chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name) |
161 | | - return chainer.latest_training_job.name |
| 123 | + script_path = ( |
| 124 | + os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") |
| 125 | + if instance_type == 1 |
| 126 | + else os.path.join(DATA_DIR, "chainer_mnist", "distributed_mnist.py") |
| 127 | + ) |
| 128 | + |
| 129 | + data_path = os.path.join(DATA_DIR, "chainer_mnist") |
| 130 | + |
| 131 | + chainer = Chainer( |
| 132 | + entry_point=script_path, |
| 133 | + role="SageMakerRole", |
| 134 | + framework_version=chainer_full_version, |
| 135 | + py_version=PYTHON_VERSION, |
| 136 | + train_instance_count=instance_count, |
| 137 | + train_instance_type=instance_type, |
| 138 | + sagemaker_session=sagemaker_session, |
| 139 | + hyperparameters={"epochs": 1}, |
| 140 | + ) |
| 141 | + |
| 142 | + train_input = "file://" + os.path.join(data_path, "train") |
| 143 | + test_input = "file://" + os.path.join(data_path, "test") |
| 144 | + |
| 145 | + job_name = unique_name_from_base("test-chainer-training") |
| 146 | + chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name) |
| 147 | + return chainer |
162 | 148 |
|
163 | 149 |
|
164 | 150 | def _predict_and_assert(predictor): |
|
0 commit comments