Skip to content

Commit 7d78c28

Browse files
authored
Merge branch 'master' into processing-job-codeartifact-support
2 parents c8f2852 + 3676d3e commit 7d78c28

38 files changed

+1707
-216
lines changed

CHANGELOG.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,42 @@
11
# Changelog
22

3+
## v2.221.1 (2024-05-22)
4+
5+
### Bug Fixes and Other Changes
6+
7+
* Convert pytorchddp distribution to smdistributed distribution
8+
* Add tei cpu image
9+
10+
## v2.221.0 (2024-05-20)
11+
12+
### Features
13+
14+
* onboard tei image config to pysdk
15+
16+
### Bug Fixes and Other Changes
17+
18+
* JS Model with non-TGI/non-DJL deployment failure
19+
* cover tei with image_uris.retrieve API
20+
* Add more debugging
21+
* model builder limited container support for endpoint mode.
22+
* Image URI should take precedence for HF models
23+
24+
## v2.220.0 (2024-05-15)
25+
26+
### Features
27+
28+
* AutoGluon 1.1.0 image_uris update
29+
* add new images for HF TGI release
30+
* Add telemetry support for mlflow models
31+
32+
### Bug Fixes and Other Changes
33+
34+
* add debug logs to workflow container dist creation
35+
* model builder race condition on sagemaker session
36+
* Add tensorflow_serving support for mlflow models and enable lineage tracking for mlflow models
37+
* update image_uri_configs 05-09-2024 07:17:41 PST
38+
* skip flakey tests pending investigation
39+
340
## v2.219.0 (2024-05-08)
441

542
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.219.1.dev0
1+
2.221.2.dev0

src/sagemaker/fw_utils.py

Lines changed: 2 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -145,22 +145,6 @@
145145
],
146146
}
147147

148-
PYTORCHDDP_SUPPORTED_FRAMEWORK_VERSIONS = [
149-
"1.10",
150-
"1.10.0",
151-
"1.10.2",
152-
"1.11",
153-
"1.11.0",
154-
"1.12",
155-
"1.12.0",
156-
"1.12.1",
157-
"1.13.1",
158-
"2.0.0",
159-
"2.0.1",
160-
"2.1.0",
161-
"2.2.0",
162-
]
163-
164148
TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = [
165149
"1.13.1",
166150
"2.0.0",
@@ -795,7 +779,6 @@ def _validate_smdataparallel_args(
795779
796780
Raises:
797781
ValueError: if
798-
(`instance_type` is not in SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES or
799782
`py_version` is not python3 or
800783
`framework_version` is not in SM_DATAPARALLEL_SUPPORTED_FRAMEWORK_VERSION
801784
"""
@@ -806,17 +789,10 @@ def _validate_smdataparallel_args(
806789
if not smdataparallel_enabled:
807790
return
808791

809-
is_instance_type_supported = instance_type in SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES
810-
811792
err_msg = ""
812793

813-
if not is_instance_type_supported:
814-
# instance_type is required
815-
err_msg += (
816-
f"Provided instance_type {instance_type} is not supported by smdataparallel.\n"
817-
"Please specify one of the supported instance types:"
818-
f"{SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES}\n"
819-
)
794+
if not instance_type:
795+
err_msg += "Please specify an instance_type for smdataparallel.\n"
820796

821797
if not image_uri:
822798
# ignore framework_version & py_version if image_uri is set
@@ -928,13 +904,6 @@ def validate_distribution(
928904
)
929905
if framework_name and framework_name == "pytorch":
930906
# We need to validate only for PyTorch framework
931-
validate_pytorch_distribution(
932-
distribution=validated_distribution,
933-
framework_name=framework_name,
934-
framework_version=framework_version,
935-
py_version=py_version,
936-
image_uri=image_uri,
937-
)
938907
validate_torch_distributed_distribution(
939908
instance_type=instance_type,
940909
distribution=validated_distribution,
@@ -968,13 +937,6 @@ def validate_distribution(
968937
)
969938
if framework_name and framework_name == "pytorch":
970939
# We need to validate only for PyTorch framework
971-
validate_pytorch_distribution(
972-
distribution=validated_distribution,
973-
framework_name=framework_name,
974-
framework_version=framework_version,
975-
py_version=py_version,
976-
image_uri=image_uri,
977-
)
978940
validate_torch_distributed_distribution(
979941
instance_type=instance_type,
980942
distribution=validated_distribution,
@@ -1023,63 +985,6 @@ def validate_distribution_for_instance_type(instance_type, distribution):
1023985
raise ValueError(err_msg)
1024986

1025987

1026-
def validate_pytorch_distribution(
1027-
distribution, framework_name, framework_version, py_version, image_uri
1028-
):
1029-
"""Check if pytorch distribution strategy is correctly invoked by the user.
1030-
1031-
Args:
1032-
distribution (dict): A dictionary with information to enable distributed training.
1033-
(Defaults to None if distributed training is not enabled.) For example:
1034-
1035-
.. code:: python
1036-
1037-
{
1038-
"pytorchddp": {
1039-
"enabled": True
1040-
}
1041-
}
1042-
framework_name (str): A string representing the name of framework selected.
1043-
framework_version (str): A string representing the framework version selected.
1044-
py_version (str): A string representing the python version selected.
1045-
image_uri (str): A string representing a Docker image URI.
1046-
1047-
Raises:
1048-
ValueError: if
1049-
`py_version` is not python3 or
1050-
`framework_version` is not in PYTORCHDDP_SUPPORTED_FRAMEWORK_VERSIONS
1051-
"""
1052-
if framework_name and framework_name != "pytorch":
1053-
# We need to validate only for PyTorch framework
1054-
return
1055-
1056-
pytorch_ddp_enabled = False
1057-
if "pytorchddp" in distribution:
1058-
pytorch_ddp_enabled = distribution.get("pytorchddp").get("enabled", False)
1059-
if not pytorch_ddp_enabled:
1060-
# Distribution strategy other than pytorchddp is selected
1061-
return
1062-
1063-
err_msg = ""
1064-
if not image_uri:
1065-
# ignore framework_version and py_version if image_uri is set
1066-
# in case image_uri is not set, then both are mandatory
1067-
if framework_version not in PYTORCHDDP_SUPPORTED_FRAMEWORK_VERSIONS:
1068-
err_msg += (
1069-
f"Provided framework_version {framework_version} is not supported by"
1070-
" pytorchddp.\n"
1071-
"Please specify one of the supported framework versions:"
1072-
f" {PYTORCHDDP_SUPPORTED_FRAMEWORK_VERSIONS} \n"
1073-
)
1074-
if "py3" not in py_version:
1075-
err_msg += (
1076-
f"Provided py_version {py_version} is not supported by pytorchddp.\n"
1077-
"Please specify py_version>=py3"
1078-
)
1079-
if err_msg:
1080-
raise ValueError(err_msg)
1081-
1082-
1083988
def validate_torch_distributed_distribution(
1084989
instance_type,
1085990
distribution,

src/sagemaker/huggingface/llm_utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,20 @@ def get_huggingface_llm_image_uri(
6565
image_scope="inference",
6666
inference_tool="neuronx",
6767
)
68+
if backend == "huggingface-tei":
69+
return image_uris.retrieve(
70+
"huggingface-tei",
71+
region=region,
72+
version=version,
73+
image_scope="inference",
74+
)
75+
if backend == "huggingface-tei-cpu":
76+
return image_uris.retrieve(
77+
"huggingface-tei-cpu",
78+
region=region,
79+
version=version,
80+
image_scope="inference",
81+
)
6882
if backend == "lmi":
6983
version = version or "0.24.0"
7084
return image_uris.retrieve(framework="djl-deepspeed", region=region, version=version)

src/sagemaker/image_uri_config/autogluon.json

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
"0.6": "0.6.2",
1212
"0.7": "0.7.0",
1313
"0.8": "0.8.2",
14-
"1.0": "1.0.0"
14+
"1.0": "1.0.0",
15+
"1.1": "1.1.0"
1516
},
1617
"versions": {
1718
"0.3.1": {
@@ -480,6 +481,47 @@
480481
"py_versions": [
481482
"py310"
482483
]
484+
},
485+
"1.1.0": {
486+
"registries": {
487+
"af-south-1": "626614931356",
488+
"il-central-1": "780543022126",
489+
"ap-east-1": "871362719292",
490+
"ap-northeast-1": "763104351884",
491+
"ap-northeast-2": "763104351884",
492+
"ap-northeast-3": "364406365360",
493+
"ap-south-1": "763104351884",
494+
"ap-southeast-1": "763104351884",
495+
"ap-southeast-2": "763104351884",
496+
"ap-southeast-3": "907027046896",
497+
"ap-southeast-4": "457447274322",
498+
"ca-central-1": "763104351884",
499+
"eu-central-1": "763104351884",
500+
"eu-north-1": "763104351884",
501+
"eu-west-1": "763104351884",
502+
"eu-west-2": "763104351884",
503+
"eu-west-3": "763104351884",
504+
"eu-south-1": "692866216735",
505+
"me-south-1": "217643126080",
506+
"sa-east-1": "763104351884",
507+
"us-east-1": "763104351884",
508+
"us-east-2": "763104351884",
509+
"us-gov-east-1": "446045086412",
510+
"us-gov-west-1": "442386744353",
511+
"us-iso-east-1": "886529160074",
512+
"us-isob-east-1": "094389454867",
513+
"us-west-1": "763104351884",
514+
"us-west-2": "763104351884",
515+
"ca-west-1": "204538143572"
516+
},
517+
"repository": "autogluon-training",
518+
"processors": [
519+
"cpu",
520+
"gpu"
521+
],
522+
"py_versions": [
523+
"py310"
524+
]
483525
}
484526
}
485527
},
@@ -491,7 +533,8 @@
491533
"0.6": "0.6.2",
492534
"0.7": "0.7.0",
493535
"0.8": "0.8.2",
494-
"1.0": "1.0.0"
536+
"1.0": "1.0.0",
537+
"1.1": "1.1.0"
495538
},
496539
"versions": {
497540
"0.3.1": {
@@ -987,6 +1030,49 @@
9871030
"py_versions": [
9881031
"py310"
9891032
]
1033+
},
1034+
"1.1.0": {
1035+
"registries": {
1036+
"af-south-1": "626614931356",
1037+
"il-central-1": "780543022126",
1038+
"ap-east-1": "871362719292",
1039+
"ap-northeast-1": "763104351884",
1040+
"ap-northeast-2": "763104351884",
1041+
"ap-northeast-3": "364406365360",
1042+
"ap-south-1": "763104351884",
1043+
"ap-southeast-1": "763104351884",
1044+
"ap-southeast-2": "763104351884",
1045+
"ap-southeast-3": "907027046896",
1046+
"ap-southeast-4": "457447274322",
1047+
"ca-central-1": "763104351884",
1048+
"cn-north-1": "727897471807",
1049+
"cn-northwest-1": "727897471807",
1050+
"eu-central-1": "763104351884",
1051+
"eu-north-1": "763104351884",
1052+
"eu-west-1": "763104351884",
1053+
"eu-west-2": "763104351884",
1054+
"eu-west-3": "763104351884",
1055+
"eu-south-1": "692866216735",
1056+
"me-south-1": "217643126080",
1057+
"sa-east-1": "763104351884",
1058+
"us-east-1": "763104351884",
1059+
"us-east-2": "763104351884",
1060+
"us-gov-east-1": "446045086412",
1061+
"us-gov-west-1": "442386744353",
1062+
"us-iso-east-1": "886529160074",
1063+
"us-isob-east-1": "094389454867",
1064+
"us-west-1": "763104351884",
1065+
"us-west-2": "763104351884",
1066+
"ca-west-1": "204538143572"
1067+
},
1068+
"repository": "autogluon-inference",
1069+
"processors": [
1070+
"cpu",
1071+
"gpu"
1072+
],
1073+
"py_versions": [
1074+
"py310"
1075+
]
9901076
}
9911077
}
9921078
}

src/sagemaker/image_uri_config/huggingface-llm-neuronx.json

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"inf2"
55
],
66
"version_aliases": {
7-
"0.0": "0.0.16"
7+
"0.0": "0.0.22"
88
},
99
"versions": {
1010
"0.0.16": {
@@ -180,6 +180,35 @@
180180
"container_version": {
181181
"inf2": "ubuntu22.04"
182182
}
183+
},
184+
"0.0.22": {
185+
"py_versions": [
186+
"py310"
187+
],
188+
"registries": {
189+
"ap-northeast-1": "763104351884",
190+
"ap-south-1": "763104351884",
191+
"ap-south-2": "772153158452",
192+
"ap-southeast-1": "763104351884",
193+
"ap-southeast-2": "763104351884",
194+
"ap-southeast-4": "457447274322",
195+
"eu-central-1": "763104351884",
196+
"eu-central-2": "380420809688",
197+
"eu-south-2": "503227376785",
198+
"eu-west-1": "763104351884",
199+
"eu-west-3": "763104351884",
200+
"il-central-1": "780543022126",
201+
"sa-east-1": "763104351884",
202+
"us-east-1": "763104351884",
203+
"us-east-2": "763104351884",
204+
"us-west-2": "763104351884",
205+
"ca-west-1": "204538143572"
206+
},
207+
"tag_prefix": "2.1.2-optimum0.0.22",
208+
"repository": "huggingface-pytorch-tgi-inference",
209+
"container_version": {
210+
"inf2": "ubuntu22.04"
211+
}
183212
}
184213
}
185214
}

0 commit comments

Comments
 (0)