Skip to content

Commit 36fac66

Browse files
Add schema pattern check to pytorch-job template (#178)
* Update readme for volume flag * Add schema pattern check to pytorch-job template; unit tests added, all tests pass locally
1 parent 9b7220c commit 36fac66

File tree

4 files changed

+622
-19
lines changed

4 files changed

+622
-19
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py

Lines changed: 119 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,27 @@
1515

1616

1717
class VolumeConfig(BaseModel):
18-
name: str = Field(..., description="Volume name")
18+
name: str = Field(
19+
...,
20+
description="Volume name",
21+
min_length=1
22+
)
1923
type: Literal['hostPath', 'pvc'] = Field(..., description="Volume type")
20-
mount_path: str = Field(..., description="Mount path in container")
21-
path: Optional[str] = Field(None, description="Host path (required for hostPath volumes)")
22-
claim_name: Optional[str] = Field(None, description="PVC claim name (required for pvc volumes)")
24+
mount_path: str = Field(
25+
...,
26+
description="Mount path in container",
27+
min_length=1
28+
)
29+
path: Optional[str] = Field(
30+
None,
31+
description="Host path (required for hostPath volumes)",
32+
min_length=1
33+
)
34+
claim_name: Optional[str] = Field(
35+
None,
36+
description="PVC claim name (required for pvc volumes)",
37+
min_length=1
38+
)
2339
read_only: Optional[Literal['true', 'false']] = Field(None, description="Read-only flag for pvc volumes")
2440

2541
@field_validator('mount_path', 'path')
@@ -47,9 +63,22 @@ def validate_type_specific_fields(self):
4763
class PyTorchJobConfig(BaseModel):
4864
model_config = ConfigDict(extra="forbid")
4965

50-
job_name: str = Field(alias="job_name", description="Job name")
51-
image: str = Field(description="Docker image for training")
52-
namespace: Optional[str] = Field(default=None, description="Kubernetes namespace")
66+
job_name: str = Field(
67+
alias="job_name",
68+
description="Job name",
69+
min_length=1,
70+
max_length=63,
71+
pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
72+
)
73+
image: str = Field(
74+
description="Docker image for training",
75+
min_length=1
76+
)
77+
namespace: Optional[str] = Field(
78+
default=None,
79+
description="Kubernetes namespace",
80+
min_length=1
81+
)
5382
command: Optional[List[str]] = Field(
5483
default=None, description="Command to run in the container"
5584
)
@@ -60,16 +89,28 @@ class PyTorchJobConfig(BaseModel):
6089
default=None, description="Environment variables as key_value pairs"
6190
)
6291
pull_policy: Optional[str] = Field(
63-
default=None, alias="pull_policy", description="Image pull policy"
92+
default=None,
93+
alias="pull_policy",
94+
description="Image pull policy",
95+
min_length=1
6496
)
6597
instance_type: Optional[str] = Field(
66-
default=None, alias="instance_type", description="Instance type for training"
98+
default=None,
99+
alias="instance_type",
100+
description="Instance type for training",
101+
min_length=1
67102
)
68103
node_count: Optional[int] = Field(
69-
default=None, alias="node_count", description="Number of nodes"
104+
default=None,
105+
alias="node_count",
106+
description="Number of nodes",
107+
ge=1
70108
)
71109
tasks_per_node: Optional[int] = Field(
72-
default=None, alias="tasks_per_node", description="Number of tasks per node"
110+
default=None,
111+
alias="tasks_per_node",
112+
description="Number of tasks per node",
113+
ge=1
73114
)
74115
label_selector: Optional[Dict[str, str]] = Field(
75116
default=None,
@@ -82,16 +123,29 @@ class PyTorchJobConfig(BaseModel):
82123
description="Schedule pods only on nodes that passed deep health check",
83124
)
84125
scheduler_type: Optional[str] = Field(
85-
default=None, alias="scheduler_type", description="Scheduler type"
126+
default=None,
127+
alias="scheduler_type",
128+
description="Scheduler type",
129+
min_length=1
86130
)
87131
queue_name: Optional[str] = Field(
88-
default=None, alias="queue_name", description="Queue name for job scheduling"
132+
default=None,
133+
alias="queue_name",
134+
description="Queue name for job scheduling",
135+
min_length=1,
136+
max_length=63,
137+
pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
89138
)
90139
priority: Optional[str] = Field(
91-
default=None, description="Priority class for job scheduling"
140+
default=None,
141+
description="Priority class for job scheduling",
142+
min_length=1
92143
)
93144
max_retry: Optional[int] = Field(
94-
default=None, alias="max_retry", description="Maximum number of job retries"
145+
default=None,
146+
alias="max_retry",
147+
description="Maximum number of job retries",
148+
ge=0
95149
)
96150
volume: Optional[List[VolumeConfig]] = Field(
97151
default=None, description="List of volume configurations. \
@@ -102,7 +156,10 @@ class PyTorchJobConfig(BaseModel):
102156
"
103157
)
104158
service_account_name: Optional[str] = Field(
105-
default=None, alias="service_account_name", description="Service account name"
159+
default=None,
160+
alias="service_account_name",
161+
description="Service account name",
162+
min_length=1
106163
)
107164

108165
@field_validator('volume')
@@ -123,6 +180,52 @@ def validate_no_duplicates(cls, v):
123180

124181
return v
125182

183+
@field_validator('command', 'args')
184+
def validate_string_lists(cls, v):
185+
"""Validate that command and args contain non-empty strings."""
186+
if not v:
187+
return v
188+
189+
for i, item in enumerate(v):
190+
if not isinstance(item, str) or not item.strip():
191+
field_name = cls.model_fields.get('command', {}).get('alias', 'command') if 'command' in str(v) else 'args'
192+
raise ValueError(f"{field_name}[{i}] must be a non-empty string")
193+
194+
return v
195+
196+
@field_validator('environment')
197+
def validate_environment_variable_names(cls, v):
198+
"""Validate environment variable names follow C_IDENTIFIER pattern."""
199+
if not v:
200+
return v
201+
202+
import re
203+
c_identifier_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
204+
205+
for key in v.keys():
206+
if not c_identifier_pattern.match(key):
207+
raise ValueError(f"Environment variable name '{key}' must be a valid C_IDENTIFIER")
208+
209+
return v
210+
211+
@field_validator('label_selector')
212+
def validate_label_selector_keys(cls, v):
213+
"""Validate label selector keys follow Kubernetes label naming conventions."""
214+
if not v:
215+
return v
216+
217+
import re
218+
# Kubernetes label key pattern - allows namespaced labels like kubernetes.io/arch
219+
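# Pattern: [prefix/]name, where prefix and name each start and end with an alphanumeric character and may contain '-', '_' or '.' in between (looser than strict DNS subdomain rules)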
220+
# Also reject double dots
221+
label_key_pattern = re.compile(r'^([a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?/)?[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$')
222+
223+
for key in v.keys():
224+
if not key or not label_key_pattern.match(key) or '..' in key:
225+
raise ValueError(f"Label selector key '{key}' must follow Kubernetes label naming conventions")
226+
227+
return v
228+
126229
def to_domain(self) -> Dict:
127230
"""
128231
Convert flat config to domain model (HyperPodPytorchJobSpec)

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"properties": {
55
"name": {
66
"description": "Volume name",
7+
"minLength": 1,
78
"title": "Name",
89
"type": "string"
910
},
@@ -18,12 +19,14 @@
1819
},
1920
"mount_path": {
2021
"description": "Mount path in container",
22+
"minLength": 1,
2123
"title": "Mount Path",
2224
"type": "string"
2325
},
2426
"path": {
2527
"anyOf": [
2628
{
29+
"minLength": 1,
2730
"type": "string"
2831
},
2932
{
@@ -37,6 +40,7 @@
3740
"claim_name": {
3841
"anyOf": [
3942
{
43+
"minLength": 1,
4044
"type": "string"
4145
},
4246
{
@@ -78,17 +82,22 @@
7882
"properties": {
7983
"job_name": {
8084
"description": "Job name",
85+
"maxLength": 63,
86+
"minLength": 1,
87+
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$",
8188
"title": "Job Name",
8289
"type": "string"
8390
},
8491
"image": {
8592
"description": "Docker image for training",
93+
"minLength": 1,
8694
"title": "Image",
8795
"type": "string"
8896
},
8997
"namespace": {
9098
"anyOf": [
9199
{
100+
"minLength": 1,
92101
"type": "string"
93102
},
94103
{
@@ -150,6 +159,7 @@
150159
"pull_policy": {
151160
"anyOf": [
152161
{
162+
"minLength": 1,
153163
"type": "string"
154164
},
155165
{
@@ -163,6 +173,7 @@
163173
"instance_type": {
164174
"anyOf": [
165175
{
176+
"minLength": 1,
166177
"type": "string"
167178
},
168179
{
@@ -176,6 +187,7 @@
176187
"node_count": {
177188
"anyOf": [
178189
{
190+
"minimum": 1,
179191
"type": "integer"
180192
},
181193
{
@@ -189,6 +201,7 @@
189201
"tasks_per_node": {
190202
"anyOf": [
191203
{
204+
"minimum": 1,
192205
"type": "integer"
193206
},
194207
{
@@ -231,6 +244,7 @@
231244
"scheduler_type": {
232245
"anyOf": [
233246
{
247+
"minLength": 1,
234248
"type": "string"
235249
},
236250
{
@@ -244,6 +258,9 @@
244258
"queue_name": {
245259
"anyOf": [
246260
{
261+
"maxLength": 63,
262+
"minLength": 1,
263+
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$",
247264
"type": "string"
248265
},
249266
{
@@ -257,6 +274,7 @@
257274
"priority": {
258275
"anyOf": [
259276
{
277+
"minLength": 1,
260278
"type": "string"
261279
},
262280
{
@@ -270,6 +288,7 @@
270288
"max_retry": {
271289
"anyOf": [
272290
{
291+
"minimum": 0,
273292
"type": "integer"
274293
},
275294
{
@@ -299,6 +318,7 @@
299318
"service_account_name": {
300319
"anyOf": [
301320
{
321+
"minLength": 1,
302322
"type": "string"
303323
},
304324
{

src/sagemaker/hyperpod/common/config/metadata.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ class Metadata(BaseModel):
66
"""Metadata class"""
77

88
name: str = Field(
9-
description="Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container."
9+
description="The name of the Kubernetes resource. Must follow RFC1123 naming conventions: lowercase alphanumeric characters or hyphens, start and end with alphanumeric character, 1-63 characters long (e.g., 'my-pytorch-job-123')."
1010
)
1111
namespace: Optional[str] = Field(
1212
default=None,
13-
description="Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
13+
description="The Kubernetes namespace where the resource will be created. If not specified, uses the default namespace or the namespace configured in your cluster context.",
1414
)
1515
labels: Optional[Dict[str, str]] = Field(
1616
default=None,
17-
description="Labels are key value pairs that are attached to objects, such as Pod. Labels are intended to be used to specify identifying attributes of objects. The system ignores labels that are not in the service's selector. Labels can only be added to objects during creation. More info: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
17+
description="Labels are key value pairs that are attached to objects, such as Pod. Labels are intended to be used to specify identifying attributes of objects. The system ignores labels that are not in the service's selector. Labels can only be added to objects during creation.",
1818
)

0 commit comments

Comments
 (0)