Skip to content

Commit c8f24a4

Browse files
caxiaohuchuyang-deng
authored andcommitted
fix: clean up resources created by file system set up when setup fails (#1010)
* fix: clean up resources created by file system set up whenever setting up is failed * fix: change file system resoures to dict and add condition check for mounting efs and fsx in shell scripts * fix: remove initialization of fs_resources and change name * update: remove creating efs exception statement * update: reomve check in name for creating efs and fsx
1 parent 9759b85 commit c8f24a4

File tree

5 files changed

+178
-148
lines changed

5 files changed

+178
-148
lines changed

tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,11 @@ def cpu_instance_type(sagemaker_session, request):
264264
return "ml.m4.xlarge"
265265

266266

267+
@pytest.fixture(scope="session")
268+
def ec2_instance_type(cpu_instance_type):
269+
return cpu_instance_type[3:]
270+
271+
267272
@pytest.fixture(scope="session")
268273
def alternative_cpu_instance_type(sagemaker_session, request):
269274
region = sagemaker_session.boto_session.region_name

tests/integ/file_system_input_utils.py

Lines changed: 64 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
# language governing permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

15-
import collections
16-
import logging
1715
from operator import itemgetter
1816
import os
1917
from os import path
@@ -33,10 +31,12 @@
3331
PREFIX = "ec2_fs_key_"
3432
KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8])
3533
ROLE_NAME = "SageMakerRole"
36-
EC2_INSTANCE_TYPE = "t2.micro"
3734
MIN_COUNT = 1
3835
MAX_COUNT = 1
3936

37+
EFS_MOUNT_DIRECTORY = "efs"
38+
FSX_MOUNT_DIRECTORY = "/mnt/fsx"
39+
4040
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "data")
4141
MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist")
4242
MNIST_LOCAL_DATA = os.path.join(MNIST_RESOURCE_PATH, "data")
@@ -49,70 +49,45 @@
4949
KEY_PATH = os.path.join(tempfile.gettempdir(), FILE_NAME)
5050
STORAGE_CAPACITY_IN_BYTES = 3600
5151

52-
FsResources = collections.namedtuple(
53-
"FsResources",
54-
[
55-
"key_name",
56-
"key_path",
57-
"role_name",
58-
"subnet_id",
59-
"security_group_ids",
60-
"file_system_efs_id",
61-
"file_system_fsx_id",
62-
"ec2_instance_id",
63-
"mount_efs_target_id",
64-
],
65-
)
66-
67-
68-
def set_up_efs_fsx(sagemaker_session):
69-
_check_or_create_key_pair(sagemaker_session)
70-
_check_or_create_iam_profile_and_attach_role(sagemaker_session)
71-
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
72-
sagemaker_session, VPC_NAME
73-
)
74-
75-
ami_id = _ami_id_for_region(sagemaker_session)
76-
ec2_instance = _create_ec2_instance(
77-
sagemaker_session,
78-
ami_id,
79-
EC2_INSTANCE_TYPE,
80-
KEY_NAME,
81-
MIN_COUNT,
82-
MAX_COUNT,
83-
security_group_ids,
84-
subnet_ids[0],
85-
)
52+
fs_resources = {"key_name": KEY_NAME, "key_path": KEY_PATH, "role_name": ROLE_NAME}
8653

87-
file_system_efs_id = _check_or_create_efs(sagemaker_session)
88-
mount_efs_target_id = _create_efs_mount(sagemaker_session, file_system_efs_id)
89-
90-
file_system_fsx_id = _check_or_create_fsx(sagemaker_session)
91-
92-
fs_resources = FsResources(
93-
KEY_NAME,
94-
KEY_PATH,
95-
ROLE_NAME,
96-
subnet_ids[0],
97-
security_group_ids,
98-
file_system_efs_id,
99-
file_system_fsx_id,
100-
ec2_instance.id,
101-
mount_efs_target_id,
102-
)
10354

104-
region = sagemaker_session.boto_region_name
55+
def set_up_efs_fsx(sagemaker_session, ec2_instance_type):
10556
try:
57+
_check_or_create_key_pair(sagemaker_session)
58+
_check_or_create_iam_profile_and_attach_role(sagemaker_session)
59+
60+
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
61+
sagemaker_session, VPC_NAME
62+
)
63+
fs_resources["subnet_id"] = subnet_ids[0]
64+
fs_resources["security_group_ids"] = security_group_ids
65+
66+
ami_id = _ami_id_for_region(sagemaker_session)
67+
ec2_instance = _create_ec2_instance(
68+
sagemaker_session,
69+
ami_id,
70+
ec2_instance_type,
71+
KEY_NAME,
72+
MIN_COUNT,
73+
MAX_COUNT,
74+
security_group_ids,
75+
subnet_ids[0],
76+
)
77+
78+
file_system_efs_id, mount_efs_target_id = _create_efs(sagemaker_session)
79+
file_system_fsx_id = _create_fsx(sagemaker_session)
80+
10681
connected_instance = _connect_ec2_instance(ec2_instance)
82+
region = sagemaker_session.boto_region_name
10783
_upload_data_and_mount_fs(
10884
connected_instance, file_system_efs_id, file_system_fsx_id, region
10985
)
86+
return fs_resources
11087
except Exception:
11188
tear_down(sagemaker_session, fs_resources)
11289
raise
11390

114-
return fs_resources
115-
11691

11792
def _ami_id_for_region(sagemaker_session):
11893
ec2_client = sagemaker_session.boto_session.client("ec2")
@@ -146,43 +121,26 @@ def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_syste
146121
connected_instance.put(local_file, "temp_tf/")
147122
connected_instance.put(ONE_P_LOCAL_DATA, "temp_one_p/")
148123
connected_instance.run(
149-
"sudo sh fs_mount_setup.sh {} {} {}".format(file_system_efs_id, file_system_fsx_id, region),
124+
"sudo sh fs_mount_setup.sh {} {} {} {} {}".format(
125+
file_system_efs_id, file_system_fsx_id, region, EFS_MOUNT_DIRECTORY, FSX_MOUNT_DIRECTORY
126+
),
150127
in_stream=False,
151128
)
152129

153130

154-
def _check_or_create_efs(sagemaker_session):
131+
def _create_efs(sagemaker_session):
155132
efs_client = sagemaker_session.boto_session.client("efs")
156-
file_system_exists = False
157-
efs_id = ""
158-
try:
159-
create_response = efs_client.create_file_system(CreationToken=EFS_CREATION_TOKEN)
160-
efs_id = create_response["FileSystemId"]
161-
except ClientError as e:
162-
error_code = e.response["Error"]["Code"]
163-
if error_code == "FileSystemAlreadyExists":
164-
file_system_exists = True
165-
logging.warning(
166-
"File system with given creation token %s already exists", EFS_CREATION_TOKEN
167-
)
168-
else:
169-
raise
170-
171-
if file_system_exists:
172-
desc = efs_client.describe_file_systems(CreationToken=EFS_CREATION_TOKEN)
173-
efs_id = desc["FileSystems"][0]["FileSystemId"]
174-
mount_target_id = efs_client.describe_mount_targets(FileSystemId=efs_id)["MountTargets"][0][
175-
"MountTargetId"
176-
]
177-
return efs_id, mount_target_id
178-
133+
create_response = efs_client.create_file_system(CreationToken=EFS_CREATION_TOKEN)
134+
efs_id = create_response["FileSystemId"]
135+
fs_resources["file_system_efs_id"] = efs_id
179136
for _ in retries(50, "Checking EFS creating status"):
180137
desc = efs_client.describe_file_systems(CreationToken=EFS_CREATION_TOKEN)
181138
status = desc["FileSystems"][0]["LifeCycleState"]
182139
if status == "available":
183140
break
141+
mount_target_id = _create_efs_mount(sagemaker_session, efs_id)
184142

185-
return efs_id
143+
return efs_id, mount_target_id
186144

187145

188146
def _create_efs_mount(sagemaker_session, file_system_id):
@@ -194,6 +152,7 @@ def _create_efs_mount(sagemaker_session, file_system_id):
194152
FileSystemId=file_system_id, SubnetId=subnet_ids[0], SecurityGroups=security_group_ids
195153
)
196154
mount_target_id = mount_response["MountTargetId"]
155+
fs_resources["mount_efs_target_id"] = mount_target_id
197156

198157
for _ in retries(50, "Checking EFS mounting target status"):
199158
desc = efs_client.describe_mount_targets(MountTargetId=mount_target_id)
@@ -204,7 +163,7 @@ def _create_efs_mount(sagemaker_session, file_system_id):
204163
return mount_target_id
205164

206165

207-
def _check_or_create_fsx(sagemaker_session):
166+
def _create_fsx(sagemaker_session):
208167
fsx_client = sagemaker_session.boto_session.client("fsx")
209168
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
210169
sagemaker_session, VPC_NAME
@@ -216,6 +175,7 @@ def _check_or_create_fsx(sagemaker_session):
216175
SecurityGroupIds=security_group_ids,
217176
)
218177
fsx_id = create_response["FileSystem"]["FileSystemId"]
178+
fs_resources["file_system_fsx_id"] = fsx_id
219179

220180
for _ in retries(50, "Checking FSX creating status"):
221181
desc = fsx_client.describe_file_systems(FileSystemIds=[fsx_id])
@@ -257,8 +217,8 @@ def _create_ec2_instance(
257217

258218
ec2_instances[0].wait_until_running()
259219
ec2_instances[0].reload()
220+
fs_resources["ec2_instance_id"] = ec2_instances[0].id
260221
ec2_client = sagemaker_session.boto_session.client("ec2")
261-
262222
for _ in retries(30, "Checking EC2 creation status"):
263223
statuses = ec2_client.describe_instance_status(InstanceIds=[ec2_instances[0].id])
264224
status = statuses["InstanceStatuses"][0]
@@ -326,28 +286,30 @@ def _instance_profile_exists(sagemaker_session):
326286

327287

328288
def tear_down(sagemaker_session, fs_resources):
329-
fsx_client = sagemaker_session.boto_session.client("fsx")
330-
file_system_fsx_id = fs_resources.file_system_fsx_id
331-
fsx_client.delete_file_system(FileSystemId=file_system_fsx_id)
289+
if "file_system_fsx_id" in fs_resources:
290+
fsx_client = sagemaker_session.boto_session.client("fsx")
291+
fsx_client.delete_file_system(FileSystemId=fs_resources["file_system_fsx_id"])
332292

333293
efs_client = sagemaker_session.boto_session.client("efs")
334-
mount_efs_target_id = fs_resources.mount_efs_target_id
335-
efs_client.delete_mount_target(MountTargetId=mount_efs_target_id)
336-
337-
file_system_efs_id = fs_resources.file_system_efs_id
338-
for _ in retries(30, "Checking mount target deleting status"):
339-
desc = efs_client.describe_mount_targets(FileSystemId=file_system_efs_id)
340-
if len(desc["MountTargets"]) > 0:
341-
status = desc["MountTargets"][0]["LifeCycleState"]
342-
if status == "deleted":
294+
if "mount_efs_target_id" in fs_resources:
295+
efs_client.delete_mount_target(MountTargetId=fs_resources["mount_efs_target_id"])
296+
297+
if "file_system_efs_id" in fs_resources:
298+
for _ in retries(30, "Checking mount target deleting status"):
299+
desc = efs_client.describe_mount_targets(
300+
FileSystemId=fs_resources["file_system_efs_id"]
301+
)
302+
if len(desc["MountTargets"]) > 0:
303+
status = desc["MountTargets"][0]["LifeCycleState"]
304+
if status == "deleted":
305+
break
306+
else:
343307
break
344-
else:
345-
break
346308

347-
efs_client.delete_file_system(FileSystemId=file_system_efs_id)
309+
efs_client.delete_file_system(FileSystemId=fs_resources["file_system_efs_id"])
348310

349-
ec2_resource = sagemaker_session.boto_session.resource("ec2")
350-
instance_id = fs_resources.ec2_instance_id
351-
_terminate_instance(ec2_resource, [instance_id])
311+
if "ec2_instance_id" in fs_resources:
312+
ec2_resource = sagemaker_session.boto_session.resource("ec2")
313+
_terminate_instance(ec2_resource, [fs_resources["ec2_instance_id"]])
352314

353315
_delete_key_pair(sagemaker_session)

tests/integ/test_kmeans_efs_fsx.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,14 @@
3737

3838

3939
@pytest.fixture(scope="module")
40-
def efs_fsx_setup(sagemaker_session):
41-
fs_resources = set_up_efs_fsx(sagemaker_session)
40+
def efs_fsx_setup(sagemaker_session, ec2_instance_type):
41+
fs_resources = None
4242
try:
43+
fs_resources = set_up_efs_fsx(sagemaker_session, ec2_instance_type)
4344
yield fs_resources
4445
finally:
45-
tear_down(sagemaker_session, fs_resources)
46+
if fs_resources:
47+
tear_down(sagemaker_session, fs_resources)
4648

4749

4850
@pytest.mark.skipif(
@@ -51,9 +53,10 @@ def efs_fsx_setup(sagemaker_session):
5153
)
5254
def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
5355
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
54-
subnets = [efs_fsx_setup.subnet_id]
55-
security_group_ids = efs_fsx_setup.security_group_ids
56-
role = efs_fsx_setup.role_name
56+
role = efs_fsx_setup["role_name"]
57+
subnets = [efs_fsx_setup["subnet_id"]]
58+
security_group_ids = efs_fsx_setup["security_group_ids"]
59+
5760
kmeans = KMeans(
5861
role=role,
5962
train_instance_count=TRAIN_INSTANCE_COUNT,
@@ -64,7 +67,7 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
6467
security_group_ids=security_group_ids,
6568
)
6669

67-
file_system_efs_id = efs_fsx_setup.file_system_efs_id
70+
file_system_efs_id = efs_fsx_setup["file_system_efs_id"]
6871
records = FileSystemRecordSet(
6972
file_system_id=file_system_efs_id,
7073
file_system_type="EFS",
@@ -85,9 +88,9 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
8588
)
8689
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
8790
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
88-
subnets = [efs_fsx_setup.subnet_id]
89-
security_group_ids = efs_fsx_setup.security_group_ids
90-
role = efs_fsx_setup.role_name
91+
role = efs_fsx_setup["role_name"]
92+
subnets = [efs_fsx_setup["subnet_id"]]
93+
security_group_ids = efs_fsx_setup["security_group_ids"]
9194
kmeans = KMeans(
9295
role=role,
9396
train_instance_count=TRAIN_INSTANCE_COUNT,
@@ -98,7 +101,7 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
98101
security_group_ids=security_group_ids,
99102
)
100103

101-
file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
104+
file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
102105
records = FileSystemRecordSet(
103106
file_system_id=file_system_fsx_id,
104107
file_system_type="FSxLustre",
@@ -118,9 +121,9 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
118121
reason="EFS integration tests need to be fixed before running in all regions.",
119122
)
120123
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
121-
subnets = [efs_fsx_setup.subnet_id]
122-
security_group_ids = efs_fsx_setup.security_group_ids
123-
role = efs_fsx_setup.role_name
124+
role = efs_fsx_setup["role_name"]
125+
subnets = [efs_fsx_setup["subnet_id"]]
126+
security_group_ids = efs_fsx_setup["security_group_ids"]
124127
kmeans = KMeans(
125128
role=role,
126129
train_instance_count=TRAIN_INSTANCE_COUNT,
@@ -148,7 +151,7 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
148151
max_parallel_jobs=MAX_PARALLEL_JOBS,
149152
)
150153

151-
file_system_efs_id = efs_fsx_setup.file_system_efs_id
154+
file_system_efs_id = efs_fsx_setup["file_system_efs_id"]
152155
train_records = FileSystemRecordSet(
153156
file_system_id=file_system_efs_id,
154157
file_system_type="EFS",
@@ -178,9 +181,9 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
178181
reason="EFS integration tests need to be fixed before running in all regions.",
179182
)
180183
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
181-
subnets = [efs_fsx_setup.subnet_id]
182-
security_group_ids = efs_fsx_setup.security_group_ids
183-
role = efs_fsx_setup.role_name
184+
role = efs_fsx_setup["role_name"]
185+
subnets = [efs_fsx_setup["subnet_id"]]
186+
security_group_ids = efs_fsx_setup["security_group_ids"]
184187
kmeans = KMeans(
185188
role=role,
186189
train_instance_count=TRAIN_INSTANCE_COUNT,
@@ -208,7 +211,7 @@ def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
208211
max_parallel_jobs=MAX_PARALLEL_JOBS,
209212
)
210213

211-
file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
214+
file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
212215
train_records = FileSystemRecordSet(
213216
file_system_id=file_system_fsx_id,
214217
file_system_type="FSxLustre",

0 commit comments

Comments
 (0)