Skip to content

Commit fa19961

Browse files
authored
Merge pull request #65 from aws/release-2.0.0
Sagemaker Hyperpod Recipes Release 2.0.0
2 parents 5c0c1a0 + 0700d22 commit fa19961

File tree

573 files changed

+78696
-452
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

573 files changed

+78696
-452
lines changed

README.md

Lines changed: 280 additions & 159 deletions
Large diffs are not rendered by default.

docs/RECIPES.md

Lines changed: 150 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from dataclasses import dataclass, field
2+
from typing import Optional
3+
4+
from hydra.core.config_store import ConfigStore
5+
6+
### LLMFT hydra schema
7+
8+
9+
@dataclass
class LLMFTTrainerConfig:
    """Hydra schema for the LLMFT `trainer` section (cluster topology)."""

    devices: int = 1  # accelerators per node
    num_nodes: int = 1  # number of nodes in the job
13+
14+
15+
@dataclass
class LLMFTTrainingArgsConfig:
    """Hydra schema for LLMFT fine-tuning hyper-parameters (`training_args`)."""

    micro_train_batch_size: int = 1  # per-device batch size
    train_batch_size: int = 16  # global (effective) batch size
    learning_rate: float = 0.0001
    lr_warmup_ratio: float = 0.1  # fraction of training used for LR warmup
    gradient_clipping: bool = True
    gradient_clipping_threshold: float = 1.0  # max norm applied when clipping
    max_epochs: int = 3
    logging_steps: int = 1
    save_steps: int = 0  # 0 presumably disables intermediate checkpoints — TODO confirm
    eval_steps: int = -1  # -1 presumably disables evaluation — TODO confirm
    beta: float = 0.01  # presumably a DPO/KL beta coefficient — TODO confirm
    nll_loss_coef: float = 0.0
    label_smoothing: float = 0.0
30+
31+
32+
@dataclass
class LLMFTrainingConfig:
    """Hydra schema for the LLMFT `training_config` section.

    NOTE(review): the name has a single "T" (LLMFTraining...) unlike the
    other LLMFT* classes; renaming would break external references, so it
    is kept as-is.
    """

    training_args: LLMFTTrainingArgsConfig = field(default_factory=LLMFTTrainingArgsConfig)
35+
36+
37+
@dataclass
class LLMFTRecipeConfig:
    """Top-level Hydra schema for an LLMFT recipe."""

    trainer: LLMFTTrainerConfig = field(default_factory=LLMFTTrainerConfig)
    # Optional so a recipe may omit the whole training_config section.
    training_config: Optional[LLMFTrainingConfig] = field(default=None)
41+
42+
43+
### NOVA hydra schema
44+
@dataclass
class NovaModelConfig:
    """Hydra schema for Nova model dropout/regularization settings."""

    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0
    ffn_dropout: float = 0.0
49+
50+
51+
@dataclass
class NovaTrainerConfig:
    """Hydra schema for the Nova `trainer` section."""

    max_epochs: int = 1
54+
55+
56+
@dataclass
class NovaTrainingConfig:
    """Hydra schema for the Nova `training_config` section."""

    max_length: int = 8192  # presumably max sequence length in tokens — TODO confirm
    global_batch_size: int = 256
    trainer: NovaTrainerConfig = field(default_factory=NovaTrainerConfig)
    model: NovaModelConfig = field(default_factory=NovaModelConfig)
62+
63+
64+
@dataclass
class NovaRecipeConfig:
    """Top-level Hydra schema for a Nova recipe."""

    training_config: NovaTrainingConfig = field(default_factory=NovaTrainingConfig)
67+
68+
69+
### Verl hydra schema
70+
@dataclass
class VerlModelOptimConfig:
    """Model optimizer configuration for VERL."""

    lr: float = 1e-5  # learning rate
75+
76+
77+
@dataclass
class VerlModelConfig:
    """Model configuration for VERL."""

    # Presumably a path to model weights/checkpoint; None means unset — TODO confirm
    path: Optional[str] = None
82+
83+
84+
@dataclass
class VerlCriticConfig:
    """Critic configuration for VERL."""

    optim: VerlModelOptimConfig = field(default_factory=VerlModelOptimConfig)
    model: VerlModelConfig = field(default_factory=VerlModelConfig)
    ppo_micro_batch_size_per_gpu: int = 4
91+
92+
93+
@dataclass
class VerlKlCtrlConfig:
    """KL control configuration for VERL."""

    kl_coef: float = 0.001  # KL penalty coefficient
    target_kl: float = 0.1  # target KL value the controller aims for
99+
100+
101+
@dataclass
class VerlAlgorithmConfig:
    """Algorithm configuration for VERL."""

    kl_ctrl: VerlKlCtrlConfig = field(default_factory=VerlKlCtrlConfig)
    adv_estimator: Optional[str] = None  # "gae" or "grpo"
107+
108+
109+
@dataclass
class VerlRayInitConfig:
    """Ray initialization configuration for VERL."""

    num_cpus: Optional[int] = None  # None presumably lets Ray auto-detect — TODO confirm
    timeline_json_file: Optional[str] = None  # optional path for a Ray timeline dump
115+
116+
117+
@dataclass
class VerlRecipeConfig:
    """Top-level configuration for VERL recipes."""

    # All sections are Optional but default to populated configs, so an empty
    # recipe still gets fully-defaulted sections.
    critic: Optional[VerlCriticConfig] = field(default_factory=VerlCriticConfig)
    algorithm: Optional[VerlAlgorithmConfig] = field(default_factory=VerlAlgorithmConfig)
    ray_init: Optional[VerlRayInitConfig] = field(default_factory=VerlRayInitConfig)
124+
125+
126+
# Register every recipe schema with Hydra's ConfigStore so YAML recipes can be
# merged and type-checked against these structured configs.
cs = ConfigStore.instance()

# (group, store name, node) for each registration; a group of None registers
# the node at the top level. Order matches the original registration order.
_REGISTRATIONS = (
    (None, "recipe_schema", LLMFTRecipeConfig()),
    ("trainer", "base_trainer", LLMFTTrainerConfig()),
    ("training_config", "base_training", LLMFTrainingConfig()),
    # Nova schemas
    (None, "nova_recipe_schema", NovaRecipeConfig()),
    ("training_config", "nova_training", NovaTrainingConfig()),
    # Verl schemas
    (None, "verl_recipe_schema", VerlRecipeConfig()),
    ("algorithm", "verl_algorithm", VerlAlgorithmConfig()),
    ("critic", "verl_critic", VerlCriticConfig()),
)

for _group, _name, _node in _REGISTRATIONS:
    if _group is None:
        cs.store(name=_name, node=_node)
    else:
        cs.store(group=_group, name=_name, node=_node)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from pydantic import BaseModel, ConfigDict, Field, model_validator
2+
3+
4+
### LLMFT Validators
5+
class LLMFTTrainerValidator(BaseModel):
    """Validates the `trainer` section of an LLMFT recipe.

    Extra keys are allowed so unknown fields pass through untouched.
    """

    model_config = ConfigDict(extra="allow")

    devices: int = Field(gt=0)  # required, must be positive
    num_nodes: int = Field(gt=0)  # required, must be positive
10+
11+
12+
class LLMFTTrainingArgsValidator(BaseModel):
    """Validates LLMFT `training_args`; every field is optional.

    Extra keys are allowed so unknown hyper-parameters pass through.
    """

    model_config = ConfigDict(extra="allow")

    micro_train_batch_size: int | None = Field(default=None, gt=0)
    train_batch_size: int | None = Field(default=None, gt=0)
    learning_rate: float | None = Field(default=None, gt=0)
    lr_warmup_ratio: float | None = Field(default=None, ge=0, le=1)
    gradient_clipping: bool | None = None
    gradient_clipping_threshold: float | None = Field(default=None, gt=0)
    max_epochs: int | None = Field(default=None, gt=0)
    logging_steps: int | None = Field(default=None, gt=0)
    save_steps: int | None = Field(default=None, ge=0)
    eval_steps: int | None = Field(default=None, ge=-1)  # -1 presumably disables eval — TODO confirm
    beta: float | None = Field(default=None, gt=0)
    nll_loss_coef: float | None = Field(default=None, ge=0)
    label_smoothing: float | None = Field(default=None, ge=0, le=1)

    @model_validator(mode="after")
    def check_batch_sizes(self):
        """Ensure the global batch size is at least the micro batch size.

        Skipped when either value is unset; both fields are constrained
        gt=0, so truthiness is equivalent to an is-not-None check here.
        """
        if self.micro_train_batch_size and self.train_batch_size:
            if self.train_batch_size < self.micro_train_batch_size:
                raise ValueError("train_batch_size must be >= micro_train_batch_size")
        return self
35+
36+
37+
class LLMFTRecipeValidator(BaseModel):
    """Top-level validator for LLMFT recipes.

    Extra keys are allowed at every level so recipes may carry fields this
    validator does not know about.
    """

    model_config = ConfigDict(extra="allow")

    trainer: LLMFTTrainerValidator | None = None
    # Kept as a raw dict: only the nested `training_args` key is validated
    # below; everything else passes through untouched.
    training_config: dict | None = None

    @model_validator(mode="after")
    def validate_nested_fields(self):
        """Validate `training_config.training_args` when present.

        Raises:
            ValueError: if `training_args` fails LLMFTTrainingArgsValidator;
                the underlying validation error is chained as the cause.
        """
        if self.training_config and "training_args" in self.training_config:
            try:
                LLMFTTrainingArgsValidator(**self.training_config["training_args"])
            except Exception as e:
                # Chain the cause (`from e`) so the original pydantic error
                # and its traceback are preserved for debugging.
                raise ValueError(f"Error validating training_args: {str(e)}") from e

        return self
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from pydantic import BaseModel, ConfigDict, Field, model_validator
2+
3+
4+
### Verl Validators
5+
class VerlModelOptimValidator(BaseModel):
    """Validates the optimizer sub-section of a VERL critic/model config."""

    model_config = ConfigDict(extra="allow")

    # Bug fix: `Field(gt=0)` with no default made this Optional-annotated
    # field *required*; default=None lets recipes omit `lr` while still
    # enforcing positivity when a value is supplied.
    lr: float | None = Field(default=None, gt=0)
9+
10+
11+
class VerlModelConfigValidator(BaseModel):
    """Validates the model sub-section of a VERL config.

    Extra keys are allowed so unknown fields pass through untouched.
    """

    model_config = ConfigDict(extra="allow")

    # Presumably a checkpoint/weights path; None means unset — TODO confirm
    path: str | None = None
15+
16+
17+
class VerlCriticValidator(BaseModel):
    """Validates the `critic` section of a VERL recipe."""

    model_config = ConfigDict(extra="allow")

    optim: VerlModelOptimValidator | dict | None = None
    model: VerlModelConfigValidator | dict | None = None
    # Bug fix: `Field(gt=0)` with no default made this Optional-annotated
    # field *required*; default=None lets recipes omit it while still
    # enforcing positivity when a value is supplied.
    ppo_micro_batch_size_per_gpu: int | None = Field(default=None, gt=0)
23+
24+
25+
class VerlKlCtrlValidator(BaseModel):
    """Validates the KL-control sub-section of a VERL algorithm config."""

    model_config = ConfigDict(extra="allow")

    kl_coef: float | None = Field(default=0.001, gt=0)  # KL penalty coefficient
    target_kl: float | None = Field(default=0.1, gt=0)  # target KL value
30+
31+
32+
class VerlAlgorithmValidator(BaseModel):
    """Validates the `algorithm` section of a VERL recipe.

    Extra keys are allowed so unknown fields pass through untouched.
    """

    model_config = ConfigDict(extra="allow")

    kl_ctrl: VerlKlCtrlValidator | None = None
    adv_estimator: str | None = None  # gae or grpo

    @model_validator(mode="after")
    def validate_adv_estimator(self):
        """Reject any advantage estimator other than 'gae' or 'grpo'."""
        estimator = self.adv_estimator
        # Unset (or empty) means the caller did not choose an estimator.
        if not estimator:
            return self
        if estimator in ("gae", "grpo"):
            return self
        raise ValueError("adv_estimator must be either 'gae' or 'grpo'")
43+
44+
45+
class VerlRecipeValidator(BaseModel):
    """Top-level validator for VERL recipes.

    Extra keys are allowed so recipes may carry fields this validator does
    not know about.
    """

    model_config = ConfigDict(extra="allow")

    critic: VerlCriticValidator | dict | None = None
    algorithm: VerlAlgorithmValidator | dict | None = None
    ray_init: dict | None = None

    @model_validator(mode="after")
    def validate_nested_fields(self):
        """Re-validate sections that pydantic left as plain dicts.

        The union types above mean a dict payload may not have been coerced
        into its validator model; run the validator explicitly in that case
        (results are discarded — this is a check, not a coercion).

        Raises:
            ValueError: if `algorithm` or `critic` fails validation; the
                underlying error is chained as the cause.
        """
        # Validate algorithm if present and still a raw dict.
        if self.algorithm and isinstance(self.algorithm, dict):
            try:
                VerlAlgorithmValidator(**self.algorithm)
            except Exception as e:
                # Chain the cause so the original pydantic error survives.
                raise ValueError(f"Error validating algorithm configuration: {str(e)}") from e
        if self.critic and isinstance(self.critic, dict):
            try:
                VerlCriticValidator(**self.critic)
            except Exception as e:
                raise ValueError(f"Error validating critic configuration: {str(e)}") from e

        return self

launcher/efa.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@
170170
]
171171
)
172172

173+
# Mapping of instance types to their GPU/device counts (8 is the default)
173174
INSTANCE_TO_DEVICE_COUNT = {
174175
"g4dn.xlarge": 1,
175176
"g4dn.2xlarge": 1,

launcher/evaluation/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.

launcher/evaluation/constants.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
14+
# Evaluation container constants.
# URI template for the evaluation image; account_id and region are filled in
# at launch time via str.format.
EVAL_CONTAINER_IMAGE = "{account_id}.dkr.ecr.{region}.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121"

# Regions where the evaluation container is published. Every supported region
# currently pulls from the same ECR account, so the mapping is built with
# dict.fromkeys; switch back to an explicit literal if accounts ever diverge.
_EVAL_SUPPORTED_REGIONS = (
    "us-east-1",
    "us-west-2",
    "eu-west-1",
    "ap-southeast-1",
    "ap-northeast-1",
)

# Region to account mapping for evaluation containers
EVAL_REGION_ACCOUNT_MAP = dict.fromkeys(_EVAL_SUPPORTED_REGIONS, "658645717510")
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Helm chart metadata for the SageMaker HyperPod evaluation-job chart.
apiVersion: v2
name: evaluation-job
description: A Helm chart for SageMaker HyperPod evaluation jobs
type: application
# version is the chart version; appVersion tracks the packaged application.
version: 0.1.0
appVersion: "1.0"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# ConfigMap that delivers the rendered recipe file to evaluation-job pods.
apiVersion: v1
kind: ConfigMap
metadata:
  name: evaluation-config-{{ .Values.evaluationConfig.jobName }}
  {{- if .Values.evaluationConfig.namespace }}
  namespace: {{ .Values.evaluationConfig.namespace }}
  {{- end }}
  labels:
    app: {{ .Values.evaluationConfig.jobName }}
    {{- if .Values.evaluationConfig.customLabels }}
    {{- toYaml .Values.evaluationConfig.customLabels | nindent 4 }}
    {{- end }}
# Bug fix: Go templates do not expand {{ ... }} nested inside a string
# literal, so .Files.Get was previously handed the literal text
# 'config/{{ .Values... }}_hydra.yaml' and always returned empty content.
# Build the file path with printf instead.
data:
  recipe.yaml: |
{{ .Files.Get (printf "config/%s_hydra.yaml" .Values.evaluationConfig.jobName) | indent 4 }}

0 commit comments

Comments
 (0)