Skip to content

Commit 76dc772

Browse files
Add TrainJob progression tracking with real-time status updates
Signed-off-by: Abhijeet Dhumal <[email protected]>
1 parent f2e7e0d commit 76dc772

29 files changed

+2506
-18
lines changed

api/openapi-spec/swagger.json

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14036,6 +14036,69 @@
1403614036
}
1403714037
}
1403814038
},
14039+
"trainer.v1alpha1.ProgressionStatus": {
14040+
"description": "ProgressionStatus represents the training progression status read from rank 0 node.",
14041+
"type": "object",
14042+
"properties": {
14043+
"currentEpoch": {
14044+
"description": "CurrentEpoch is the current training epoch.",
14045+
"type": "integer",
14046+
"format": "int64"
14047+
},
14048+
"currentStep": {
14049+
"description": "CurrentStep is the current training step/iteration.",
14050+
"type": "integer",
14051+
"format": "int64"
14052+
},
14053+
"estimatedTimeRemaining": {
14054+
"description": "EstimatedTimeRemaining is the estimated time remaining in seconds.",
14055+
"type": "integer",
14056+
"format": "int64"
14057+
},
14058+
"lastUpdateTime": {
14059+
"description": "LastUpdateTime is the timestamp when the progression was last updated.",
14060+
"allOf": [
14061+
{
14062+
"$ref": "#/components/schemas/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
14063+
}
14064+
]
14065+
},
14066+
"message": {
14067+
"description": "Message provides additional information about the training progression.",
14068+
"type": "string"
14069+
},
14070+
"metrics": {
14071+
"description": "Metrics contains additional training metrics as key-value pairs.",
14072+
"type": "object",
14073+
"additionalProperties": {
14074+
"type": "string",
14075+
"default": ""
14076+
}
14077+
},
14078+
"percentageComplete": {
14079+
"description": "PercentageComplete represents the completion percentage (0-100) as a string.",
14080+
"type": "string"
14081+
},
14082+
"totalEpochs": {
14083+
"description": "TotalEpochs is the total number of training epochs.",
14084+
"type": "integer",
14085+
"format": "int64"
14086+
},
14087+
"totalSteps": {
14088+
"description": "TotalSteps is the total number of training steps/iterations.",
14089+
"type": "integer",
14090+
"format": "int64"
14091+
},
14092+
"trainingMetrics": {
14093+
"description": "TrainingMetrics contains structured training metrics.",
14094+
"allOf": [
14095+
{
14096+
"$ref": "#/components/schemas/trainer.v1alpha1.TrainingMetrics"
14097+
}
14098+
]
14099+
}
14100+
}
14101+
},
1403914102
"trainer.v1alpha1.RuntimeRef": {
1404014103
"description": "RuntimeRef represents the reference to the existing training runtime.",
1404114104
"type": "object",
@@ -14301,6 +14364,14 @@
1430114364
"name"
1430214365
],
1430314366
"x-kubernetes-list-type": "map"
14367+
},
14368+
"progressionStatus": {
14369+
"description": "ProgressionStatus tracks the training progression from rank 0 node.",
14370+
"allOf": [
14371+
{
14372+
"$ref": "#/components/schemas/trainer.v1alpha1.ProgressionStatus"
14373+
}
14374+
]
1430414375
}
1430514376
}
1430614377
},
@@ -14369,6 +14440,33 @@
1436914440
}
1437014441
}
1437114442
},
14443+
"trainer.v1alpha1.TrainingMetrics": {
14444+
"description": "TrainingMetrics represents structured training metrics.",
14445+
"type": "object",
14446+
"properties": {
14447+
"accuracy": {
14448+
"description": "Accuracy represents the current model accuracy.",
14449+
"type": "string"
14450+
},
14451+
"checkpointsStored": {
14452+
"description": "CheckpointsStored represents the number of checkpoints stored.",
14453+
"type": "integer",
14454+
"format": "int64"
14455+
},
14456+
"latestCheckpointPath": {
14457+
"description": "LatestCheckpointPath represents the path to the latest checkpoint file.",
14458+
"type": "string"
14459+
},
14460+
"learningRate": {
14461+
"description": "LearningRate represents the current learning rate.",
14462+
"type": "string"
14463+
},
14464+
"loss": {
14465+
"description": "Loss represents the current training loss.",
14466+
"type": "string"
14467+
}
14468+
}
14469+
},
1437214470
"trainer.v1alpha1.TrainingRuntime": {
1437314471
"description": "TrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime.",
1437414472
"type": "object",

api/python_api/kubeflow_trainer_api/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,7 @@
368368
from kubeflow_trainer_api.models.trainer_v1alpha1_pod_group_policy_source import TrainerV1alpha1PodGroupPolicySource
369369
from kubeflow_trainer_api.models.trainer_v1alpha1_pod_spec_override import TrainerV1alpha1PodSpecOverride
370370
from kubeflow_trainer_api.models.trainer_v1alpha1_pod_spec_override_target_job import TrainerV1alpha1PodSpecOverrideTargetJob
371+
from kubeflow_trainer_api.models.trainer_v1alpha1_progression_status import TrainerV1alpha1ProgressionStatus
371372
from kubeflow_trainer_api.models.trainer_v1alpha1_runtime_ref import TrainerV1alpha1RuntimeRef
372373
from kubeflow_trainer_api.models.trainer_v1alpha1_torch_elastic_policy import TrainerV1alpha1TorchElasticPolicy
373374
from kubeflow_trainer_api.models.trainer_v1alpha1_torch_ml_policy_source import TrainerV1alpha1TorchMLPolicySource
@@ -376,6 +377,7 @@
376377
from kubeflow_trainer_api.models.trainer_v1alpha1_train_job_spec import TrainerV1alpha1TrainJobSpec
377378
from kubeflow_trainer_api.models.trainer_v1alpha1_train_job_status import TrainerV1alpha1TrainJobStatus
378379
from kubeflow_trainer_api.models.trainer_v1alpha1_trainer import TrainerV1alpha1Trainer
380+
from kubeflow_trainer_api.models.trainer_v1alpha1_training_metrics import TrainerV1alpha1TrainingMetrics
379381
from kubeflow_trainer_api.models.trainer_v1alpha1_training_runtime import TrainerV1alpha1TrainingRuntime
380382
from kubeflow_trainer_api.models.trainer_v1alpha1_training_runtime_list import TrainerV1alpha1TrainingRuntimeList
381383
from kubeflow_trainer_api.models.trainer_v1alpha1_training_runtime_spec import TrainerV1alpha1TrainingRuntimeSpec
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# coding: utf-8
2+
3+
"""
4+
Kubeflow Trainer OpenAPI Spec
5+
6+
No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
7+
8+
The version of the OpenAPI document: unversioned
9+
Generated by OpenAPI Generator (https://openapi-generator.tech)
10+
11+
Do not edit the class manually.
12+
""" # noqa: E501
13+
14+
15+
from __future__ import annotations
16+
import pprint
17+
import re # noqa: F401
18+
import json
19+
20+
from datetime import datetime
21+
from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr
22+
from typing import Any, ClassVar, Dict, List, Optional
23+
from kubeflow_trainer_api.models.trainer_v1alpha1_training_metrics import TrainerV1alpha1TrainingMetrics
24+
from typing import Optional, Set
25+
from typing_extensions import Self
26+
27+
class TrainerV1alpha1ProgressionStatus(BaseModel):
28+
"""
29+
ProgressionStatus represents the training progression status read from rank 0 node.
30+
""" # noqa: E501
31+
current_epoch: Optional[StrictInt] = Field(default=None, description="CurrentEpoch is the current training epoch.", alias="currentEpoch")
32+
current_step: Optional[StrictInt] = Field(default=None, description="CurrentStep is the current training step/iteration.", alias="currentStep")
33+
estimated_time_remaining: Optional[StrictInt] = Field(default=None, description="EstimatedTimeRemaining is the estimated time remaining in seconds.", alias="estimatedTimeRemaining")
34+
last_update_time: Optional[datetime] = Field(default=None, description="LastUpdateTime is the timestamp when the progression was last updated.", alias="lastUpdateTime")
35+
message: Optional[StrictStr] = Field(default=None, description="Message provides additional information about the training progression.")
36+
metrics: Optional[Dict[str, StrictStr]] = Field(default=None, description="Metrics contains additional training metrics as key-value pairs.")
37+
percentage_complete: Optional[StrictStr] = Field(default=None, description="PercentageComplete represents the completion percentage (0-100) as a string.", alias="percentageComplete")
38+
total_epochs: Optional[StrictInt] = Field(default=None, description="TotalEpochs is the total number of training epochs.", alias="totalEpochs")
39+
total_steps: Optional[StrictInt] = Field(default=None, description="TotalSteps is the total number of training steps/iterations.", alias="totalSteps")
40+
training_metrics: Optional[TrainerV1alpha1TrainingMetrics] = Field(default=None, description="TrainingMetrics contains structured training metrics.", alias="trainingMetrics")
41+
__properties: ClassVar[List[str]] = ["currentEpoch", "currentStep", "estimatedTimeRemaining", "lastUpdateTime", "message", "metrics", "percentageComplete", "totalEpochs", "totalSteps", "trainingMetrics"]
42+
43+
model_config = ConfigDict(
44+
populate_by_name=True,
45+
validate_assignment=True,
46+
protected_namespaces=(),
47+
)
48+
49+
50+
def to_str(self) -> str:
51+
"""Returns the string representation of the model using alias"""
52+
return pprint.pformat(self.model_dump(by_alias=True))
53+
54+
def to_json(self) -> str:
55+
"""Returns the JSON representation of the model using alias"""
56+
# TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead
57+
return json.dumps(self.to_dict())
58+
59+
@classmethod
60+
def from_json(cls, json_str: str) -> Optional[Self]:
61+
"""Create an instance of TrainerV1alpha1ProgressionStatus from a JSON string"""
62+
return cls.from_dict(json.loads(json_str))
63+
64+
def to_dict(self) -> Dict[str, Any]:
65+
"""Return the dictionary representation of the model using alias.
66+
67+
This has the following differences from calling pydantic's
68+
`self.model_dump(by_alias=True)`:
69+
70+
* `None` is only added to the output dict for nullable fields that
71+
were set at model initialization. Other fields with value `None`
72+
are ignored.
73+
"""
74+
excluded_fields: Set[str] = set([
75+
])
76+
77+
_dict = self.model_dump(
78+
by_alias=True,
79+
exclude=excluded_fields,
80+
exclude_none=True,
81+
)
82+
# override the default output from pydantic by calling `to_dict()` of training_metrics
83+
if self.training_metrics:
84+
_dict['trainingMetrics'] = self.training_metrics.to_dict()
85+
return _dict
86+
87+
@classmethod
88+
def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
89+
"""Create an instance of TrainerV1alpha1ProgressionStatus from a dict"""
90+
if obj is None:
91+
return None
92+
93+
if not isinstance(obj, dict):
94+
return cls.model_validate(obj)
95+
96+
_obj = cls.model_validate({
97+
"currentEpoch": obj.get("currentEpoch"),
98+
"currentStep": obj.get("currentStep"),
99+
"estimatedTimeRemaining": obj.get("estimatedTimeRemaining"),
100+
"lastUpdateTime": obj.get("lastUpdateTime"),
101+
"message": obj.get("message"),
102+
"metrics": obj.get("metrics"),
103+
"percentageComplete": obj.get("percentageComplete"),
104+
"totalEpochs": obj.get("totalEpochs"),
105+
"totalSteps": obj.get("totalSteps"),
106+
"trainingMetrics": TrainerV1alpha1TrainingMetrics.from_dict(obj["trainingMetrics"]) if obj.get("trainingMetrics") is not None else None
107+
})
108+
return _obj
109+
110+

api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_status.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from typing import Any, ClassVar, Dict, List, Optional
2222
from kubeflow_trainer_api.models.io_k8s_apimachinery_pkg_apis_meta_v1_condition import IoK8sApimachineryPkgApisMetaV1Condition
2323
from kubeflow_trainer_api.models.trainer_v1alpha1_job_status import TrainerV1alpha1JobStatus
24+
from kubeflow_trainer_api.models.trainer_v1alpha1_progression_status import TrainerV1alpha1ProgressionStatus
2425
from typing import Optional, Set
2526
from typing_extensions import Self
2627

@@ -30,7 +31,8 @@ class TrainerV1alpha1TrainJobStatus(BaseModel):
3031
""" # noqa: E501
3132
conditions: Optional[List[IoK8sApimachineryPkgApisMetaV1Condition]] = Field(default=None, description="Conditions for the TrainJob.")
3233
jobs_status: Optional[List[TrainerV1alpha1JobStatus]] = Field(default=None, description="JobsStatus tracks the child Jobs in TrainJob.", alias="jobsStatus")
33-
__properties: ClassVar[List[str]] = ["conditions", "jobsStatus"]
34+
progression_status: Optional[TrainerV1alpha1ProgressionStatus] = Field(default=None, description="ProgressionStatus tracks the training progression from rank 0 node.", alias="progressionStatus")
35+
__properties: ClassVar[List[str]] = ["conditions", "jobsStatus", "progressionStatus"]
3436

3537
model_config = ConfigDict(
3638
populate_by_name=True,
@@ -85,6 +87,9 @@ def to_dict(self) -> Dict[str, Any]:
8587
if _item_jobs_status:
8688
_items.append(_item_jobs_status.to_dict())
8789
_dict['jobsStatus'] = _items
90+
# override the default output from pydantic by calling `to_dict()` of progression_status
91+
if self.progression_status:
92+
_dict['progressionStatus'] = self.progression_status.to_dict()
8893
return _dict
8994

9095
@classmethod
@@ -98,7 +103,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
98103

99104
_obj = cls.model_validate({
100105
"conditions": [IoK8sApimachineryPkgApisMetaV1Condition.from_dict(_item) for _item in obj["conditions"]] if obj.get("conditions") is not None else None,
101-
"jobsStatus": [TrainerV1alpha1JobStatus.from_dict(_item) for _item in obj["jobsStatus"]] if obj.get("jobsStatus") is not None else None
106+
"jobsStatus": [TrainerV1alpha1JobStatus.from_dict(_item) for _item in obj["jobsStatus"]] if obj.get("jobsStatus") is not None else None,
107+
"progressionStatus": TrainerV1alpha1ProgressionStatus.from_dict(obj["progressionStatus"]) if obj.get("progressionStatus") is not None else None
102108
})
103109
return _obj
104110

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# coding: utf-8
2+
3+
"""
4+
Kubeflow Trainer OpenAPI Spec
5+
6+
No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
7+
8+
The version of the OpenAPI document: unversioned
9+
Generated by OpenAPI Generator (https://openapi-generator.tech)
10+
11+
Do not edit the class manually.
12+
""" # noqa: E501
13+
14+
15+
from __future__ import annotations
16+
import pprint
17+
import re # noqa: F401
18+
import json
19+
20+
from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr
21+
from typing import Any, ClassVar, Dict, List, Optional
22+
from typing import Optional, Set
23+
from typing_extensions import Self
24+
25+
class TrainerV1alpha1TrainingMetrics(BaseModel):
26+
"""
27+
TrainingMetrics represents structured training metrics.
28+
""" # noqa: E501
29+
accuracy: Optional[StrictStr] = Field(default=None, description="Accuracy represents the current model accuracy.")
30+
checkpoints_stored: Optional[StrictInt] = Field(default=None, description="CheckpointsStored represents the number of checkpoints stored.", alias="checkpointsStored")
31+
latest_checkpoint_path: Optional[StrictStr] = Field(default=None, description="LatestCheckpointPath represents the path to the latest checkpoint file.", alias="latestCheckpointPath")
32+
learning_rate: Optional[StrictStr] = Field(default=None, description="LearningRate represents the current learning rate.", alias="learningRate")
33+
loss: Optional[StrictStr] = Field(default=None, description="Loss represents the current training loss.")
34+
__properties: ClassVar[List[str]] = ["accuracy", "checkpointsStored", "latestCheckpointPath", "learningRate", "loss"]
35+
36+
model_config = ConfigDict(
37+
populate_by_name=True,
38+
validate_assignment=True,
39+
protected_namespaces=(),
40+
)
41+
42+
43+
def to_str(self) -> str:
44+
"""Returns the string representation of the model using alias"""
45+
return pprint.pformat(self.model_dump(by_alias=True))
46+
47+
def to_json(self) -> str:
48+
"""Returns the JSON representation of the model using alias"""
49+
# TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead
50+
return json.dumps(self.to_dict())
51+
52+
@classmethod
53+
def from_json(cls, json_str: str) -> Optional[Self]:
54+
"""Create an instance of TrainerV1alpha1TrainingMetrics from a JSON string"""
55+
return cls.from_dict(json.loads(json_str))
56+
57+
def to_dict(self) -> Dict[str, Any]:
58+
"""Return the dictionary representation of the model using alias.
59+
60+
This has the following differences from calling pydantic's
61+
`self.model_dump(by_alias=True)`:
62+
63+
* `None` is only added to the output dict for nullable fields that
64+
were set at model initialization. Other fields with value `None`
65+
are ignored.
66+
"""
67+
excluded_fields: Set[str] = set([
68+
])
69+
70+
_dict = self.model_dump(
71+
by_alias=True,
72+
exclude=excluded_fields,
73+
exclude_none=True,
74+
)
75+
return _dict
76+
77+
@classmethod
78+
def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
79+
"""Create an instance of TrainerV1alpha1TrainingMetrics from a dict"""
80+
if obj is None:
81+
return None
82+
83+
if not isinstance(obj, dict):
84+
return cls.model_validate(obj)
85+
86+
_obj = cls.model_validate({
87+
"accuracy": obj.get("accuracy"),
88+
"checkpointsStored": obj.get("checkpointsStored"),
89+
"latestCheckpointPath": obj.get("latestCheckpointPath"),
90+
"learningRate": obj.get("learningRate"),
91+
"loss": obj.get("loss")
92+
})
93+
return _obj
94+
95+

0 commit comments

Comments
 (0)