You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: src/sagemaker/hyperpod/training/config/hyperpod_pytorch_job_unified_config.py
+165Lines changed: 165 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -1610,6 +1610,23 @@ class LabelSelector(BaseModel):
1610
1610
)
1611
1611
1612
1612
1613
+
classNamespaceSelector(BaseModel):
1614
+
"""A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces."""
description="matchExpressions is a list of label selector requirements. The requirements are ANDed.",
1622
+
)
1623
+
matchLabels: Optional[Dict[str, str]] =Field(
1624
+
default=None,
1625
+
alias="match_labels",
1626
+
description='matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed.',
1627
+
)
1628
+
1629
+
1613
1630
classTopologySpreadConstraints(BaseModel):
1614
1631
"""TopologySpreadConstraint specifies how to spread matching pods among the given topology."""
1615
1632
@@ -2955,6 +2972,134 @@ class Template(BaseModel):
2955
2972
)
2956
2973
2957
2974
2975
+
class ReplicaSpec(BaseModel):
    """ReplicaSpec is a description of the replica"""

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # surfaces a model's docstring as its JSON-schema description, so
    # rewording it would change generated output.
    #
    # Field summary:
    #   name      required replica-set name
    #   replicas  desired replica count, defaults to 1
    #   spares    spare resources requested from Kueue, defaults to 0
    #   template  pod template for this replica, defaults to None

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Required: `...` marks a field with no default.
    name: str = Field(..., description="The name for the replica set")

    # Defaults are passed positionally (first argument of Field).
    replicas: Optional[int] = Field(
        1,
        description="Replicas is the desired number of replicas of the given template.",
    )
    spares: Optional[int] = Field(
        0,
        description="Spares requests spare resources from Kueue. E.g. If a job is configured with 4 replicas and 2 spares, job requests resources required to run 6 pods such as cpu, gpu",
    )
    template: Optional[Template] = Field(
        None,
        description="Template is the object that describes the pod that will be created for this replica.",
    )
2993
+
2994
+
2995
+
classLogMonitoringConfiguration(BaseModel):
2996
+
"""LogMonitoringRule defines the criteria used to detect a SLOW or HANGING job"""
description="Time interval between two subsequent matches for LogPattern beyond which, the rule evaluates to HANGING. When not specified, there is no constraint on duration between two subsequent matches for LogPattern.",
description="Time to first match for LogPattern beyond which, the rule evaluates to HANGING. When not specified, there is no constraint on time to first match for LogPattern.",
3009
+
)
3010
+
logPattern: str=Field(
3011
+
alias="log_pattern",
3012
+
description="Regex to identify log lines to apply the rule to when the rule is active. This regex can optionally include one capturing group to extract a numeric metric value.",
3013
+
)
3014
+
metricEvaluationDataPoints: Optional[int] =Field(
3015
+
default=None,
3016
+
alias="metric_evaluation_data_points",
3017
+
description="The number of consecutive times that a rule must evaluate to SLOW in order to mark a job as SLOW. When not specified, the default value is 1.",
3018
+
)
3019
+
metricThreshold: Optional[int] =Field(
3020
+
default=None,
3021
+
alias="metric_threshold",
3022
+
description="Threshold for value extracted by LogPattern if it has a capturing group. When not specified, Metric evaluation will not be performed.",
3023
+
)
3024
+
name: str=Field(description="Name of the rule")
3025
+
operator: Optional[str] =Field(
3026
+
default=None,
3027
+
description="Operator to compare the value extracted by LogPattern to MetricThreshold. Rule evaluates to SLOW if value extracted by LogPattern compared to MetricThreshold using Operator evaluates to true. When not specified, Metric evaluation will not be performed. Following operator values are supported: gt (greater than) lt (lesser than) eq (equal to) gteq (greater than or equal to) lteq (less than or equal to)",
3028
+
)
3029
+
stopPattern: Optional[str] =Field(
3030
+
default=None,
3031
+
alias="stop_pattern",
3032
+
description="Regex to identify the log line at which to deactivate the rule. When not specified, the rule will always be active.",
3033
+
)
3034
+
3035
+
3036
+
classRestartPolicy(BaseModel):
3037
+
"""Additional restart limiting configurations"""
3038
+
3039
+
model_config=ConfigDict(extra="forbid")
3040
+
3041
+
evalPeriodSeconds: int=Field(
3042
+
alias="eval_period_seconds",
3043
+
description="The period of evaluating the restart limit in seconds",
3044
+
)
3045
+
maxFullJobRestarts: Optional[int] =Field(
3046
+
default=None,
3047
+
alias="max_full_job_restarts",
3048
+
description="The max allowed number of full job restarts before failing the job",
description="The number of standard restarts before a full job restart",
3054
+
)
3055
+
3056
+
3057
+
classRunPolicy(BaseModel):
3058
+
"""RunPolicy"""
3059
+
3060
+
model_config=ConfigDict(extra="forbid")
3061
+
3062
+
activeDeadlineSeconds: Optional[int] =Field(
3063
+
default=None,
3064
+
alias="active_deadline_seconds",
3065
+
description="Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer.",
3066
+
)
3067
+
cleanPodPolicy: Optional[str] =Field(
3068
+
default="All",
3069
+
alias="clean_pod_policy",
3070
+
description="CleanPodPolicy defines the policy to kill pods after the job completes.",
3071
+
)
3072
+
faultDeadlineSeconds: Optional[int] =Field(
3073
+
default=None,
3074
+
alias="fault_deadline_seconds",
3075
+
description="The limit on the fault time for the job (Status of Fault) before failing",
description="The limit on the startup time for the job (Status of Starting) before failing",
3092
+
)
3093
+
suspend: Optional[bool] =Field(
3094
+
default=None, description="Suspend suspends HyperPodPytorchJob when set to true"
3095
+
)
3096
+
ttlSecondsAfterFinished: Optional[int] =Field(
3097
+
default=0,
3098
+
alias="ttl_seconds_after_finished",
3099
+
description="TTLSecondsAfterFinished is the TTL to clean up jobs. Set to -1 for infinite",
3100
+
)
3101
+
3102
+
2958
3103
classPodSets(BaseModel):
2959
3104
model_config=ConfigDict(extra="forbid")
2960
3105
@@ -3081,3 +3226,23 @@ class HyperPodPytorchJobStatus(BaseModel):
3081
3226
alias="start_time",
3082
3227
description="The time when job is first acknowledged by the controller. When using kueue, the job is also admitted It is represented in RFC3339 form and is in UTC.",
3083
3228
)
3229
+
3230
+
3231
+
class_HyperPodPytorchJob(BaseModel):
3232
+
"""Config defines the desired state of HyperPodPytorchJob"""
3233
+
3234
+
model_config=ConfigDict(extra="ignore")
3235
+
3236
+
nprocPerNode: str=Field(
3237
+
default="auto",
3238
+
alias="nproc_per_node",
3239
+
description="Number of workers per node; supported values: [auto, cpu, gpu, int].",
3240
+
)
3241
+
replicaSpecs: Optional[List[ReplicaSpec]] =Field(
3242
+
default=None,
3243
+
alias="replica_specs",
3244
+
description="The replicas to include as part of the job",
0 commit comments