"""
Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"). You
may not use this file except in compliance with the License. A copy of
the License is located at
http://aws.amazon.com/apache2.0/
or in the "license" file accompanying this file. This file is
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License.
"""
import concurrent.futures
import datetime
import os
import re
from copy import deepcopy
import constants
import utils
import boto3
import itertools
from codebuild_environment import get_codebuild_project_name, get_cloned_folder_path
from config import parse_dlc_developer_configs, is_build_enabled
from context import Context
from metrics import Metrics
from image import DockerImage
from common_stage_image import CommonStageImage
from buildspec import Buildspec
from output import OutputFormatter
FORMATTER = OutputFormatter(constants.PADDING)
build_context = os.getenv("BUILD_CONTEXT")
def is_nightly_build_context():
"""
Returns True if image builder is running in a nightly context or nightly PR test mode. Otherwise returns False
:return: <bool> True or False
"""
return (
build_context == "NIGHTLY" or os.getenv("NIGHTLY_PR_TEST_MODE", "false").lower() == "true"
)
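
# Illustrative behavior (values depend on the environment at import time):
#   BUILD_CONTEXT="NIGHTLY"                          -> is_nightly_build_context() is True
#   BUILD_CONTEXT="PR", NIGHTLY_PR_TEST_MODE="true"  -> is_nightly_build_context() is True
#   BUILD_CONTEXT="PR", NIGHTLY_PR_TEST_MODE unset   -> is_nightly_build_context() is False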
def _find_image_object(images_list, image_name):
"""
Find and return an image object from images_list with a name that matches image_name
:param images_list: <list> List of <DockerImage> objects
:param image_name: <str> Name of image as per buildspec
:return: <DockerImage> Object with image_name as "name" attribute
"""
ret_image_object = None
for image in images_list:
if image.name == image_name:
ret_image_object = image
break
return ret_image_object
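
# Illustrative usage (hypothetical image name): given a list of DockerImage objects,
# _find_image_object(images, "pytorch-training-gpu") returns the object whose .name is
# "pytorch-training-gpu", or None if there is no match.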
# TODO: Abstract away to ImageBuilder class
def image_builder(buildspec, image_types=[], device_types=[]):
"""
    Builds images using the build specification, filtered by the specified image and device
    types, and exports them to the ECR image repository.
    An empty image types list indicates all image types.
    Similarly, an empty device types list indicates all device types.
    :param buildspec: build specification defining the images to be built
    :param image_types: <list> list of image types
    :param device_types: <list> list of image device types
"""
BUILDSPEC = Buildspec()
BUILDSPEC.load(buildspec)
PRE_PUSH_STAGE_IMAGES = []
COMMON_STAGE_IMAGES = []
if (
"huggingface" in str(BUILDSPEC["framework"])
or "autogluon" in str(BUILDSPEC["framework"])
or "trcomp" in str(BUILDSPEC["framework"])
):
os.system("echo login into public ECR")
os.system(
"aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com"
)
for image_name, image_config in BUILDSPEC["images"].items():
# filter by image type if type is specified
        if image_types and image_config["image_type"] not in image_types:
            continue
        # filter by device type if type is specified
        if device_types and image_config["device_type"] not in device_types:
            continue
ARTIFACTS = deepcopy(BUILDSPEC["context"]) if BUILDSPEC.get("context") else {}
extra_build_args = {}
labels = {}
enable_datetime_tag = parse_dlc_developer_configs("build", "datetime_tag")
if image_config.get("version") is not None:
if BUILDSPEC["version"] != image_config.get("version"):
continue
if image_config.get("context") is not None:
ARTIFACTS.update(image_config["context"])
image_tag = (
tag_image_with_pr_number(image_config["tag"])
if build_context == "PR"
else image_config["tag"]
)
additional_image_tags = []
if is_nightly_build_context():
additional_image_tags.append(tag_image_with_date(image_tag))
if enable_datetime_tag or build_context != "PR":
image_tag = tag_image_with_datetime(image_tag)
additional_image_tags.append(image_tag)
image_repo_uri = (
image_config["repository"]
if build_context == "PR"
else modify_repository_name_for_context(str(image_config["repository"]), build_context)
)
base_image_uri = None
if image_config.get("base_image_name") is not None:
base_image_object = _find_image_object(
PRE_PUSH_STAGE_IMAGES, image_config["base_image_name"]
)
base_image_uri = base_image_object.ecr_url
if image_config.get("download_artifacts") is not None:
for artifact_name, artifact in image_config.get("download_artifacts").items():
type = artifact["type"]
uri = artifact["URI"]
var = artifact["VAR_IN_DOCKERFILE"]
try:
file_name = utils.download_file(uri, type).strip()
except ValueError:
FORMATTER.print(f"Artifact download failed: {uri} of type {type}.")
ARTIFACTS.update(
{
f"{artifact_name}": {
"source": f"{os.path.join(os.sep, os.path.abspath(os.getcwd()), file_name)}",
"target": file_name,
}
}
)
extra_build_args[var] = file_name
labels[var] = file_name
labels[f"{var}_URI"] = uri
transformers_version = image_config.get("transformers_version")
if str(BUILDSPEC["framework"]).startswith("huggingface"):
if transformers_version:
extra_build_args["TRANSFORMERS_VERSION"] = transformers_version
else:
raise KeyError(
f"HuggingFace buildspec.yml must contain 'transformers_version' field for each image"
)
if "datasets_version" in image_config:
extra_build_args["DATASETS_VERSION"] = image_config.get("datasets_version")
elif str(image_config["image_type"]) == "training":
raise KeyError(
f"HuggingFace buildspec.yml must contain 'datasets_version' field for each image"
)
if str(BUILDSPEC["framework"]).startswith("stabilityai"):
model_framework = image_config.get("model_framework")
if model_framework:
model_framework_version = image_config.get("model_framework_version")
if not model_framework_version:
raise KeyError(
f"Stability AI buildspec.yml must contain 'model_framework_version' if 'model_framework' is set for the image"
)
extra_build_args["MODEL_FRAMEWORK"] = model_framework
extra_build_args["MODEL_FRAMEWORK_VERSION"] = model_framework_version
stability_sdk_version = image_config.get("stability_sdk_version")
if stability_sdk_version:
extra_build_args["STABILITY_SDK_VERSION"] = stability_sdk_version
if transformers_version:
extra_build_args["TRANSFORMERS_VERSION"] = transformers_version
accelerate_version = image_config.get("accelerate_version")
if accelerate_version:
extra_build_args["ACCELERATE_VERSION"] = accelerate_version
model_optimizer = image_config.get("model_optimizer")
if model_optimizer:
model_optimizer_version = image_config.get("model_optimizer_version")
model_optimizer_url = image_config.get("model_optimizer_url")
extra_build_args["MODEL_OPTIMIZER"] = model_optimizer
if not model_optimizer_version and not model_optimizer_url:
raise KeyError(
f"Stability AI buildspec.yml must contain 'model_optimizer_version' and/or 'model_optimizer_url' when 'model_optimizer' is set for the image"
)
if model_optimizer_version:
extra_build_args["MODEL_OPTIMIZER_VERSION"] = model_optimizer_version
if model_optimizer_url:
extra_build_args["MODEL_OPTIMIZER_URL"] = model_optimizer_url
ARTIFACTS.update(
{
"dockerfile": {
"source": image_config["docker_file"],
"target": "Dockerfile",
}
}
)
context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"])
if "labels" in image_config:
labels.update(image_config.get("labels"))
for label, value in labels.items():
if isinstance(value, bool):
labels[label] = str(value)
cx_type = utils.get_label_prefix_customer_type(image_tag)
# Define label variables
label_framework = str(BUILDSPEC["framework"]).replace("_", "-")
if image_config.get("framework_version"):
label_framework_version = str(image_config["framework_version"]).replace(".", "-")
else:
label_framework_version = str(BUILDSPEC["version"]).replace(".", "-")
label_device_type = str(image_config["device_type"])
if label_device_type == "gpu":
label_device_type = f"{label_device_type}.{str(image_config['cuda_version'])}"
label_arch = str(BUILDSPEC["arch_type"])
label_python_version = str(image_config["tag_python_version"])
label_os_version = str(image_config.get("os_version")).replace(".", "-")
label_contributor = str(BUILDSPEC.get("contributor"))
label_transformers_version = str(transformers_version).replace(".", "-")
# job_type will be either inference or training, based on the repo URI
if "training" in image_repo_uri:
label_job_type = "training"
elif "inference" in image_repo_uri:
label_job_type = "inference"
else:
raise RuntimeError(
f"Cannot find inference or training job type in {image_repo_uri}. "
f"This is required to set job_type label."
)
if cx_type == "sagemaker":
# Adding standard labels to all images
labels[
f"com.amazonaws.ml.engines.{cx_type}.dlc.framework.{label_framework}.{label_framework_version}"
] = "true"
labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.device.{label_device_type}"] = "true"
labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.arch.{label_arch}"] = "true"
# python version label will look like py_version.py36, for example
labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.python.{label_python_version}"] = "true"
labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.os.{label_os_version}"] = "true"
labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.job.{label_job_type}"] = "true"
if label_contributor:
labels[
f"com.amazonaws.ml.engines.{cx_type}.dlc.contributor.{label_contributor}"
] = "true"
if transformers_version:
labels[
f"com.amazonaws.ml.engines.{cx_type}.dlc.lib.transformers.{label_transformers_version}"
] = "true"
"""
Override parameters from parent in child.
"""
info = {
"account_id": str(BUILDSPEC["account_id"]),
"region": str(BUILDSPEC["region"]),
"framework": str(BUILDSPEC["framework"]),
"version": str(BUILDSPEC["version"]),
"root": str(image_config["root"]),
"name": str(image_name),
"device_type": str(image_config["device_type"]),
"python_version": str(image_config["python_version"]),
"image_type": str(image_config["image_type"]),
"image_size_baseline": int(image_config["image_size_baseline"]),
"base_image_uri": base_image_uri,
"enable_test_promotion": image_config.get("enable_test_promotion", True),
"labels": labels,
"extra_build_args": extra_build_args,
}
# Create pre_push stage docker object
pre_push_stage_image_object = DockerImage(
info=info,
dockerfile=image_config["docker_file"],
repository=image_repo_uri,
tag=append_tag(image_tag, "pre-push"),
to_build=image_config["build"],
stage=constants.PRE_PUSH_STAGE,
context=context,
additional_tags=additional_image_tags,
target=image_config.get("target"),
)
##### Create Common stage docker object #####
# If for a pre_push stage image we create a common stage image, then we do not push the pre_push stage image
# to the repository. Instead, we just push its common stage image to the repository. Therefore,
# inside function get_common_stage_image_object we make pre_push_stage_image_object non pushable.
common_stage_image_object = generate_common_stage_image_object(
pre_push_stage_image_object, image_tag
)
COMMON_STAGE_IMAGES.append(common_stage_image_object)
PRE_PUSH_STAGE_IMAGES.append(pre_push_stage_image_object)
FORMATTER.separator()
FORMATTER.banner("DLC")
# Parent images do not inherit from any containers built in this job
# Child images use one of the parent images as their base image
parent_images = [image for image in PRE_PUSH_STAGE_IMAGES if not image.is_child_image]
child_images = [image for image in PRE_PUSH_STAGE_IMAGES if image.is_child_image]
ALL_IMAGES = PRE_PUSH_STAGE_IMAGES + COMMON_STAGE_IMAGES
IMAGES_TO_PUSH = [image for image in ALL_IMAGES if image.to_push and image.to_build]
pushed_images = []
pushed_images += process_images(parent_images, "Parent/Independent")
pushed_images += process_images(child_images, "Child/Dependent")
assert all(
image in pushed_images for image in IMAGES_TO_PUSH
), "Few images could not be pushed."
# After the build, display logs/summary for all the images.
FORMATTER.banner("Summary")
show_build_info(ALL_IMAGES)
FORMATTER.banner("Errors")
is_any_build_failed, is_any_build_failed_size_limit = show_build_errors(ALL_IMAGES)
# From all images, filter the images that were supposed to be built and upload their metrics
BUILT_IMAGES = [image for image in ALL_IMAGES if image.to_build]
if BUILT_IMAGES:
FORMATTER.banner("Upload Metrics")
upload_metrics(BUILT_IMAGES, BUILDSPEC, is_any_build_failed, is_any_build_failed_size_limit)
# Set environment variables to be consumed by test jobs
test_trigger_job = get_codebuild_project_name()
# Tests should only run on images that were pushed to the repository
images_to_test = IMAGES_TO_PUSH
if not is_build_enabled():
# Ensure we have images populated if do_build is false, so that tests can proceed if needed
images_to_test = [image for image in ALL_IMAGES if image.to_push]
if images_to_test:
FORMATTER.banner("Test Env")
utils.set_test_env(
images_to_test,
use_latest_additional_tag=True,
BUILD_CONTEXT=os.getenv("BUILD_CONTEXT"),
TEST_TRIGGER=test_trigger_job,
)
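
# Illustrative invocation (hypothetical buildspec path; an empty list means "all"):
#   image_builder("pytorch/training/buildspec.yml", image_types=["training"], device_types=["gpu"])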
def process_images(pre_push_image_list, pre_push_image_type="Pre-push"):
"""
    Handles all tasks related to a particular type of pre-push image. It takes in the list of
    pre-push images and builds them. After the pre-push images have been built, it extracts the
    corresponding common stage images for the pre-push images and builds those common stage images.
    After the common stage images have been built, it finds the docker images that need to be
    pushed and pushes them accordingly.
    Note that the common stage images should always be built after the pre-push images of a
    particular kind, because each common stage image is built on top of its respective
    standard or example image.
:param pre_push_image_list: list[DockerImage], list of pre-push images
:param pre_push_image_type: str, used to display the message on the logs
:return: list[DockerImage], images that were supposed to be pushed.
"""
FORMATTER.banner(f"{pre_push_image_type} Build")
build_images(pre_push_image_list)
FORMATTER.banner(f"{pre_push_image_type} Common Build")
common_stage_image_list = [
image.corresponding_common_stage_image
for image in pre_push_image_list
if image.corresponding_common_stage_image is not None
]
build_images(common_stage_image_list, make_dummy_boto_client=True)
FORMATTER.banner(f"{pre_push_image_type} Push Images")
all_images = pre_push_image_list + common_stage_image_list
images_to_push = [image for image in all_images if image.to_push and image.to_build]
push_images(images_to_push)
FORMATTER.banner(f"{pre_push_image_type} Retagging")
retag_and_push_images(images_to_push)
return images_to_push
def generate_common_stage_image_object(pre_push_stage_image_object, image_tag):
"""
    Creates a common stage image object for a pre_push stage image. When a common stage image is
    created for a pre_push stage image, the pre_push stage image is not pushed to the repository;
    only its common stage image is pushed. Therefore, this function marks
    pre_push_stage_image_object as NON-PUSHABLE.
    :param pre_push_stage_image_object: DockerImage, an object of class DockerImage
    :param image_tag: str, image tag to which the "multistage-common" suffix is appended
    :return: CommonStageImage, an object of class CommonStageImage. CommonStageImage inherits DockerImage.
"""
common_stage_info = deepcopy(pre_push_stage_image_object.info)
common_stage_info["extra_build_args"].update(
{"PRE_PUSH_IMAGE": pre_push_stage_image_object.ecr_url}
)
common_stage_image_object = CommonStageImage(
info=common_stage_info,
dockerfile=os.path.join(
os.sep, get_cloned_folder_path(), "miscellaneous_dockerfiles", "Dockerfile.common"
),
repository=pre_push_stage_image_object.repository,
tag=append_tag(image_tag, "multistage-common"),
to_build=pre_push_stage_image_object.to_build,
stage=constants.COMMON_STAGE,
additional_tags=pre_push_stage_image_object.additional_tags,
)
pre_push_stage_image_object.to_push = False
pre_push_stage_image_object.corresponding_common_stage_image = common_stage_image_object
return common_stage_image_object
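
# Illustrative tag relationship (derived from the append_tag calls above): for an image tag
# "2.0-gpu-py310", the pre-push stage image is tagged "2.0-gpu-py310-pre-push" and its common
# stage image "2.0-gpu-py310-multistage-common"; only the common stage image is pushed.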
def show_build_info(images):
"""
Displays the build info for a list of input images.
:param images: list[DockerImage]
"""
if not os.path.isdir("logs"):
os.makedirs("logs")
for image in images:
image_description = f"{image.name}-{image.stage}"
FORMATTER.title(image_description)
FORMATTER.table(image.info.items())
flattened_logs = list(itertools.chain(*image.log))
with open(f"logs/{image_description}", "w") as fp:
fp.write("/n".join(flattened_logs))
image.summary["log"] = f"logs/{image_description}"
FORMATTER.table(image.summary.items())
FORMATTER.title(f"Ending Logs for {image_description}")
FORMATTER.print_lines(image.log[-1][-2:])
def show_build_errors(images):
"""
    Iterates through the images and checks whether any has a failed build status. If any image
    failed to build outright, an exception is raised.
    :param images: list[DockerImage]
    :return: (bool, bool) whether any build failed, and whether any build breached the size limit
"""
is_any_build_failed = False
is_any_build_failed_size_limit = False
for image in images:
if image.build_status == constants.FAIL:
FORMATTER.title(image.name)
FORMATTER.print_lines(image.log[-1][-10:])
is_any_build_failed = True
        elif image.build_status == constants.FAIL_IMAGE_SIZE_LIMIT:
            is_any_build_failed_size_limit = True
if is_any_build_failed:
raise Exception("Build failed")
    elif is_any_build_failed_size_limit:
        FORMATTER.print("Build failed. Image size limit breached.")
    else:
        FORMATTER.print("No errors")
return is_any_build_failed, is_any_build_failed_size_limit
def upload_metrics(images, BUILDSPEC, is_any_build_failed, is_any_build_failed_size_limit):
"""
Uploads Metrics for a list of images.
:param images: list[DockerImage]
:param BUILDSPEC: Buildspec
:param is_any_build_failed: bool
:param is_any_build_failed_size_limit: bool
"""
metrics = Metrics(
context=constants.BUILD_CONTEXT,
region=BUILDSPEC["region"],
namespace=constants.METRICS_NAMESPACE,
)
for image in images:
try:
metrics.push_image_metrics(image)
except Exception as e:
if is_any_build_failed or is_any_build_failed_size_limit:
raise Exception(f"Build failed.{e}")
else:
raise Exception(f"Build passed. {e}")
if is_any_build_failed_size_limit:
raise Exception("Build failed because of file limit")
FORMATTER.print("Metrics Uploaded")
def build_images(images, make_dummy_boto_client=False):
"""
Takes a list of images and executes their build process concurrently.
:param images: list[DockerImage]
:param make_dummy_boto_client: bool, specifies if a dummy client should be declared or not.
TODO: The parameter make_dummy_boto_client should be removed when get_dummy_boto_client method is removed.
"""
THREADS = {}
# In the context of the ThreadPoolExecutor each instance of image.build submitted
# to it is executed concurrently in a separate thread.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#### TODO: Remove this entire if block when get_dummy_boto_client is removed ####
if make_dummy_boto_client:
get_dummy_boto_client()
for image in images:
THREADS[image.name] = executor.submit(image.build)
# the FORMATTER.progress(THREADS) function call also waits until all threads have completed
FORMATTER.progress(THREADS)
#### TODO: Remove this entire method when https://github.com/boto/boto3/issues/1592 is resolved ####
def get_dummy_boto_client():
"""
Makes a dummy boto3 client to ensure that boto3 clients behave in a thread safe manner.
In absence of this method, the behaviour documented in https://github.com/boto/boto3/issues/1592 is observed.
Once https://github.com/boto/boto3/issues/1592 is resolved, this method can be removed.
:return: BotocoreClientSTS
"""
return boto3.client("sts", region_name=os.getenv("REGION"))
def push_images(images):
"""
Takes a list of images and PUSHES them to ECR concurrently.
:param images: list[DockerImage]
"""
THREADS = {}
with concurrent.futures.ThreadPoolExecutor(
max_workers=constants.MAX_WORKER_COUNT_FOR_PUSHING_IMAGES
) as executor:
for image in images:
THREADS[image.name] = executor.submit(image.push_image)
FORMATTER.progress(THREADS)
def retag_and_push_images(images):
"""
    Takes a list of images, retags them, and pushes them to the repository concurrently.
:param images: list[DockerImage]
"""
THREADS = {}
with concurrent.futures.ThreadPoolExecutor(
max_workers=constants.MAX_WORKER_COUNT_FOR_PUSHING_IMAGES
) as executor:
for image in images:
THREADS[image.name] = executor.submit(image.push_image_with_additional_tags)
FORMATTER.progress(THREADS)
def tag_image_with_pr_number(image_tag):
    """
    Appends the PR number, read from the PR_NUMBER environment variable, to image_tag.
    :param image_tag: str, original image tag
    :return: str, image tag suffixed with "-pr-<PR_NUMBER>"
    """
    pr_number = os.getenv("PR_NUMBER")
    return f"{image_tag}-pr-{pr_number}"
def tag_image_with_date(image_tag):
    """
    Appends the current date (YYYY-MM-DD) to image_tag.
    :param image_tag: str, original image tag
    :return: str, image tag suffixed with the current date
    """
    date_suffix = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{image_tag}-{date_suffix}"
def tag_image_with_datetime(image_tag):
    """
    Appends the current datetime (YYYY-MM-DD-HH-MM-SS) to image_tag.
    :param image_tag: str, original image tag
    :return: str, image tag suffixed with the current datetime
    """
    datetime_suffix = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    return f"{image_tag}-{datetime_suffix}"
def append_tag(image_tag, append_str):
"""
    Appends append_str to image_tag, separated by a hyphen.
    :param image_tag: str, original image tag
    :param append_str: str, string to be appended
    :return: str, "<image_tag>-<append_str>"
"""
return f"{image_tag}-{append_str}"
def modify_repository_name_for_context(image_repo_uri, build_context):
    """
    Replaces the PR repository prefix in the repository name with the prefix that matches the
    given build context (MAINLINE or NIGHTLY); other contexts leave the URI unchanged.
    :param image_repo_uri: str, ECR repository URI
    :param build_context: str, build context, e.g. "MAINLINE" or "NIGHTLY"
    :return: str, repository URI with the context-appropriate repository name prefix
    """
repo_uri_values = image_repo_uri.split("/")
repo_name = repo_uri_values[-1]
if build_context == "MAINLINE":
repo_uri_values[-1] = repo_name.replace(
constants.PR_REPO_PREFIX, constants.MAINLINE_REPO_PREFIX
)
elif build_context == "NIGHTLY":
repo_uri_values[-1] = repo_name.replace(
constants.PR_REPO_PREFIX, constants.NIGHTLY_REPO_PREFIX
)
return "/".join(repo_uri_values)