Skip to content

Commit ffe4661

Browse files
Updating Nova validation (#338)
* Updating recipe validation models for Nova * Skip job validation for nova-2 jobs due to RFT requirements
1 parent 0ebf0da commit ffe4661

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed

src/hyperpod_cli/validators/job_validator.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,17 @@ def validate_scheduler_related_fields(
292292
return False
293293
return True
294294

295+
def skippable_recipe(model_type: str, recipe_path: str) -> bool:
    '''
    Decide whether schema validation should be skipped for a recipe.

    A recipe is skippable when its model type mentions "nova-2" and its path
    contains one of the markers "sft", "rft" or "cpt". Both checks are
    case-insensitive substring matches. A path containing none of these
    markers (for example an eval-only recipe) is not skippable.

    :param model_type: Model type read from the recipe, e.g. "nova-1" or "nova-2" for Nova models, or an open source model type otherwise.
    :param recipe_path: Path of the recipe file being validated.
    :return: True if skippable, False if non-skippable
    '''
    # A missing model type or path can never identify a skippable recipe;
    # answer False instead of raising on None.
    if not model_type or not recipe_path:
        return False

    # Normalise case on both inputs so the two checks behave consistently.
    lowered_path = recipe_path.lower()
    return "nova-2" in model_type.lower() and any(
        marker in lowered_path for marker in ("sft", "rft", "cpt")
    )
303+
304+
305+
295306
def validate_recipe_file(recipe: str):
296307
recipe_path = os.path.join(RECIPES_DIR, f"{recipe}.yaml")
297308

@@ -314,6 +325,9 @@ def validate_recipe_file(recipe: str):
314325
elif "nova" in model_type and "evaluation" in recipe_data:
315326
NovaEvaluationRecipeSchema(**recipe_data)
316327
elif "nova" in model_type:
328+
# Skip recipe validation for nova-2 models for beta
329+
if skippable_recipe(model_type, recipe_path):
330+
return True
317331
NovaRecipeSchema(**recipe_data)
318332
else:
319333
raise Exception("Unsupported model_type {model_type}. Make sure the recipe exists in src/hyperpod_cli/sagemaker_hyperpod_recipes/recipes_collection/recipes")

src/hyperpod_cli/validators/recipe_models/nova/model.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class RunConfig(BaseModel):
1111
replicas: Optional[int|str] = None
1212
data_s3_path: Optional[str] = None
1313
output_s3_path: Optional[str] = None
14+
validation_data_s3_path: Optional[str] = None
1415

1516
# PPO-specific replica configurations
1617
actor_train_replicas: Optional[int|str] = None
@@ -20,11 +21,20 @@ class RunConfig(BaseModel):
2021
am_replicas: Optional[int|str] = None
2122

2223

24+
# MLFlow optional parameters
25+
mlflow_tracking_uri: Optional[str] = None
26+
mlflow_experiment_name: Optional[str] = None
27+
mlflow_run_name: Optional[str] = None
28+
29+
2330
class TrainerConfig(BaseModel):
2431
model_config = ConfigDict(extra="forbid")
2532

2633
max_epochs: Optional[int|str] = None
2734
num_nodes: Optional[int|str] = None
35+
max_steps: Optional[int|str] = None
36+
val_check_interval: Optional[int|float|str] = None
37+
limit_val_batches: Optional[int|float|str] = None
2838

2939

3040
class SchedulerConfig(BaseModel):
@@ -36,7 +46,7 @@ class SchedulerConfig(BaseModel):
3646

3747

3848
class OptimizerConfig(BaseModel):
39-
model_config = ConfigDict(extra="forbid")
49+
model_config = ConfigDict(extra="allow")
4050

4151
name: Optional[str] = None
4252
lr: Optional[float] = None
@@ -45,6 +55,8 @@ class OptimizerConfig(BaseModel):
4555
weight_decay: Optional[float] = None
4656
betas: Optional[List[float]] = None
4757
sched: Optional[SchedulerConfig] = None
58+
adam_beta1: Optional[float] = None
59+
adam_beta2: Optional[float] = None
4860

4961

5062
class DpoConfig(BaseModel):
@@ -59,6 +71,7 @@ class LoraTuningConfig(BaseModel):
5971
loraplus_lr_ratio: Optional[float] = None
6072
alpha: Optional[float] = None
6173
adapter_dropout: Optional[float] = None
74+
lora_plus_lr_ratio: Optional[float] = None
6275

6376

6477
class PeftConfig(BaseModel):
@@ -84,13 +97,22 @@ class ModelConfig(BaseModel):
8497
kl_reward_penalty_coeff: Optional[float] = None
8598

8699

100+
class ModelImportanceScore(BaseModel):
101+
fine_tuned_model: Optional[float] = None
102+
103+
87104
class TrainingConfig(BaseModel):
88-
model_config = ConfigDict(extra="forbid")
105+
model_config = ConfigDict(extra="allow")
89106

90107
max_length: Optional[int|str] = None
91108
global_batch_size: Optional[int|str] = None
92109
trainer: Optional[TrainerConfig] = None
93110
model: Optional[ModelConfig] = None
111+
max_steps: Optional[int|str] = None
112+
save_steps: Optional[int | str] = None
113+
save_top_k: Optional[int | str] = None
114+
reasoning_enabled: Optional[int | str] = None
115+
lr_scheduler: Optional[SchedulerConfig] = None
94116

95117
# Distillation-specific fields
96118
distillation_data: Optional[str] = None
@@ -105,6 +127,14 @@ class TrainingConfig(BaseModel):
105127
top_p: Optional[str] = None
106128
customer_bucket: Optional[str] = None
107129
kms_key: Optional[str] = None
130+
task_type: Optional[str] = None
131+
optim: Optional[OptimizerConfig] = None
132+
133+
optim_config: Optional[OptimizerConfig] = None
134+
peft: Optional[PeftConfig] = None
135+
136+
# RAI vector merge
137+
model_importance_score: Optional[ModelImportanceScore] = None
108138

109139

110140
class PpoRewardConfig(BaseModel):
@@ -153,12 +183,20 @@ class PpoActorTrainConfig(BaseModel):
153183
class NovaRecipeSchema(BaseModel):
154184
model_config = ConfigDict(extra="forbid")
155185

186+
display_name: Optional[str] = None
187+
versions: Optional[list] = None
188+
instance_types: Optional[list] = None
189+
156190
# Common configurations
157191
run: RunConfig
158192

159193
# Training and fine-tuning specific configurations
160194
training_config: Optional[TrainingConfig] = None
161195

196+
# Enable skipping recipe validation in the container
197+
# This is controlled by an allowlist in the container
198+
skip_recipe_validation: Optional[bool] = None
199+
162200
# PPO-specific configurations
163201
ppo_reward: Optional[PpoRewardConfig] = None
164202
ppo_critic: Optional[PpoCriticConfig] = None

0 commit comments

Comments
 (0)