Skip to content

Commit dabcdc2

Browse files
authored
Merge pull request #68 from aws/release
Sagemaker Hyperpod Recipes Release v2.0.2
2 parents 18f6556 + bd464c0 commit dabcdc2

File tree

497 files changed

+9959
-11936
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

497 files changed

+9959
-11936
lines changed

.github/scripts/trigger_lambda.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,8 @@ def handle_lambda_response(response, recipe_metadata):
111111
parser.add_argument("--lambda-name", required=True, help="Lambda function name")
112112
args = parser.parse_args()
113113

114-
# Configure the inbuilt retry behavior
115-
config = Config(retries=dict(max_attempts=3, mode="standard"), read_timeout=960) # 16 minutes
116-
114+
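# Intended to disable retries. NOTE: in botocore's Config, `max_attempts` counts retries made after the initial request, so a value of 1 still allows one retry; use `max_attempts=0` (or `total_max_attempts=1`) to disable retries completely.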
115+
config = Config(retries={"max_attempts": 1, "mode": "standard"}, read_timeout=960) # 16 minutes
117116
lambda_client = boto3.client("lambda", region_name=region, config=config)
118117
recipe_metadata = args.metadata
119118

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,13 @@ coverage_html_report/
3131
mypg/
3232
.idea/
3333
recipes_collection/recipes/fine-tuning/sft/
34+
.kiro/
3435

3536
# Test artifacts
3637
launch_json_test_report.json
3738
launch_json_test.log
3839
templatized*
3940
failed*
41+
throughput_results*
42+
43+
debug_serverless/

hyperpod_recipes/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,19 @@ def list_recipes() -> List[Recipe]:
1111
if not os.path.exists(RECIPES_DIR):
1212
raise FileNotFoundError(f"Recipes directory not found: {RECIPES_DIR}")
1313

14+
# Skip __pycache__ and hydra_config directories
15+
# hydra_config contains Hydra composition components, not standalone recipes
16+
skip_dirs = {"__pycache__", "hydra_config"}
17+
1418
recipes = []
15-
for root, _, files in os.walk(RECIPES_DIR):
19+
for root, dirs, files in os.walk(RECIPES_DIR):
20+
# Prune directories we don't want to traverse
21+
dirs[:] = [d for d in dirs if d not in skip_dirs]
22+
1623
for f in files:
24+
# Only include .yaml files
25+
if not f.endswith(".yaml"):
26+
continue
1727
abs_path = os.path.join(root, f)
1828
recipes.append(Recipe(abs_path))
1929
return recipes

launcher/recipe_templatization/evaluation/evaluation_regional_parameters.json

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3,76 +3,76 @@
33
"open_source_deterministic_eval": {
44
"container_image": {
55
"prod": {
6-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
7-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
8-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
9-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
10-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
11-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
6+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
7+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
8+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
9+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
10+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
11+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
1212
},
1313
"gamma": {
14-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
15-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
16-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
17-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
18-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
19-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
14+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
15+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
16+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
17+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
18+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
19+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
2020
}
2121
},
2222
"smtj_container_image": {
2323
"prod": {
24-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
25-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
26-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
27-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
28-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
29-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
24+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
25+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
26+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
27+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
28+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
29+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
3030
},
3131
"gamma": {
32-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
33-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
34-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
35-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
36-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
37-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
32+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
33+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
34+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
35+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
36+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
37+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
3838
}
3939
}
4040
},
4141
"open_source_llmaj_eval": {
4242
"container_image": {
4343
"prod": {
44-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
45-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
46-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
47-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
48-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
49-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
44+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
45+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
46+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
47+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
48+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
49+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
5050
},
5151
"gamma": {
52-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
53-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
54-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
55-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
56-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
57-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
52+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
53+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
54+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
55+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
56+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
57+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
5858
}
5959
},
6060
"smtj_container_image": {
6161
"prod": {
62-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
63-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
64-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
65-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
66-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
67-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
62+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
63+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
64+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
65+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
66+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
67+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
6868
},
6969
"gamma":{
70-
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
71-
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
72-
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
73-
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
74-
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04",
75-
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:pytorch2.6.0-sagemaker-evaluation0.10.0-gpu-py312-cu126-ubuntu22.04"
70+
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-evaluation:latest",
71+
"us-west-2": "763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest",
72+
"eu-west-1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-evaluation:latest",
73+
"eu-central-1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-evaluation:latest",
74+
"ap-southeast-2": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-evaluation:latest",
75+
"ap-northeast-1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-evaluation:latest"
7676
}
7777
}
7878
}

launcher/recipe_templatization/llmft/llmft_recipe_template_processor.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#!/usr/bin/env python3
22
import json
3-
import os
43
from collections import OrderedDict
4+
from pathlib import Path
55
from typing import Optional
66

7+
from hydra import compose, initialize_config_dir
8+
from hydra.core.global_hydra import GlobalHydra
79
from omegaconf import OmegaConf
810

911
from ..base_recipe_template_processor import (
@@ -107,7 +109,7 @@ def get_recipe_metadata(self, recipe_file_path: str) -> OrderedDict:
107109
}],
108110
"""
109111
metadata = OrderedDict()
110-
recipe_cfg = OmegaConf.load(os.path.join("./recipes_collection/recipes", recipe_file_path + ".yaml"))
112+
recipe_cfg = self._load_recipe_config(recipe_file_path)
111113
recipe_metadata_helpers = self.matched_template_group["recipe_metadata_helpers"]
112114

113115
# Get Name
@@ -207,6 +209,21 @@ def get_recipe_metadata(self, recipe_file_path: str) -> OrderedDict:
207209

208210
return metadata
209211

212+
def _load_recipe_config(self, recipe_file_path: str):
    """Compose a recipe through Hydra and return its configuration.

    The Hydra config directory is ``./recipes_collection`` resolved to an
    absolute path against the current working directory, and
    ``recipes/fine-tuning`` beneath it is added to ``hydra.searchpath``.

    Args:
        recipe_file_path: Value passed to Hydra as the ``recipes=...``
            override when composing the ``config`` root.

    Returns:
        The ``recipes`` node of the composed config when it is present and
        not ``None``; otherwise the whole composed config.
    """
    collection_root = Path("./recipes_collection").absolute()
    fine_tuning_dir = collection_root / "recipes" / "fine-tuning"
    overrides = [
        f"hydra.searchpath=[file://{fine_tuning_dir}]",
        f"recipes={recipe_file_path}",
    ]

    # Drop any previously initialized global Hydra state before
    # initializing Hydra again for this composition.
    GlobalHydra.instance().clear()
    with initialize_config_dir(version_base=None, config_dir=str(collection_root)):
        composed = compose(config_name="config", overrides=overrides)

    if "recipes" not in composed or composed.recipes is None:
        return composed
    return composed.recipes
226+
210227
def _extract_peft_type_from_config(self, recipe_cfg) -> Optional[str]:
211228
"""Extract peft type name from recipe configuration."""
212229
training_config = recipe_cfg.get("training_config") or {}

0 commit comments

Comments
 (0)