kubeflow · abhijeet-dhumal · Sep 8, 2025 · Oct 4, 2025 · Oct 4, 2025
diff --git a/api/openapi-spec/swagger.json b/api/openapi-spec/swagger.json
@@ -14036,6 +14036,69 @@
           }
         }
       },
+      "trainer.v1alpha1.ProgressionStatus": {
+        "description": "ProgressionStatus represents the training progression status read from rank 0 node.",
+        "type": "object",
+        "properties": {
+          "currentEpoch": {
+            "description": "CurrentEpoch is the current training epoch.",
+            "type": "integer",
+            "format": "int64"
+          },
+          "currentStep": {
+            "description": "CurrentStep is the current training step/iteration.",
+            "type": "integer",
+            "format": "int64"
+          },
+          "estimatedTimeRemaining": {
+            "description": "EstimatedTimeRemaining is the estimated time remaining in seconds.",
+            "type": "integer",
+            "format": "int64"
+          },
+          "lastUpdateTime": {
+            "description": "LastUpdateTime is the timestamp when the progression was last updated.",
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
+              }
+            ]
+          },
+          "message": {
+            "description": "Message provides additional information about the training progression.",
+            "type": "string"
+          },
+          "metrics": {
+            "description": "Metrics contains additional training metrics as key-value pairs.",
+            "type": "object",
+            "additionalProperties": {
+              "type": "string",
+              "default": ""
+            }
+          },
+          "percentageComplete": {
+            "description": "PercentageComplete represents the completion percentage (0-100) as a string.",
+            "type": "string"
+          },
+          "totalEpochs": {
+            "description": "TotalEpochs is the total number of training epochs.",
+            "type": "integer",
+            "format": "int64"
+          },
+          "totalSteps": {
+            "description": "TotalSteps is the total number of training steps/iterations.",
+            "type": "integer",
+            "format": "int64"
+          },
+          "trainingMetrics": {
+            "description": "TrainingMetrics contains structured training metrics.",
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/trainer.v1alpha1.TrainingMetrics"
+              }
+            ]
+          }
+        }
+      },
       "trainer.v1alpha1.RuntimeRef": {
         "description": "RuntimeRef represents the reference to the existing training runtime.",
         "type": "object",
@@ -14301,6 +14364,14 @@
               "name"
             ],
             "x-kubernetes-list-type": "map"
+          },
+          "progressionStatus": {
+            "description": "ProgressionStatus tracks the training progression from rank 0 node.",
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/trainer.v1alpha1.ProgressionStatus"
+              }
+            ]
           }
         }
       },
@@ -14369,6 +14440,33 @@
           }
         }
       },
+      "trainer.v1alpha1.TrainingMetrics": {
+        "description": "TrainingMetrics represents structured training metrics.",
+        "type": "object",
+        "properties": {
+          "accuracy": {
+            "description": "Accuracy represents the current model accuracy.",
+            "type": "string"
+          },
+          "checkpointsStored": {
+            "description": "CheckpointsStored represents the number of checkpoints stored.",
+            "type": "integer",
+            "format": "int64"
+          },
+          "latestCheckpointPath": {
+            "description": "LatestCheckpointPath represents the path to the latest checkpoint file.",
+            "type": "string"
+          },
+          "learningRate": {
+            "description": "LearningRate represents the current learning rate.",
+            "type": "string"
+          },
+          "loss": {
+            "description": "Loss represents the current training loss.",
+            "type": "string"
+          }
+        }
+      },
       "trainer.v1alpha1.TrainingRuntime": {
         "description": "TrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime.",
         "type": "object",

diff --git a/api/python_api/kubeflow_trainer_api/models/__init__.py b/api/python_api/kubeflow_trainer_api/models/__init__.py
@@ -368,6 +368,7 @@
 from kubeflow_trainer_api.models.trainer_v1alpha1_pod_group_policy_source import TrainerV1alpha1PodGroupPolicySource
 from kubeflow_trainer_api.models.trainer_v1alpha1_pod_spec_override import TrainerV1alpha1PodSpecOverride
 from kubeflow_trainer_api.models.trainer_v1alpha1_pod_spec_override_target_job import TrainerV1alpha1PodSpecOverrideTargetJob
+from kubeflow_trainer_api.models.trainer_v1alpha1_progression_status import TrainerV1alpha1ProgressionStatus
 from kubeflow_trainer_api.models.trainer_v1alpha1_runtime_ref import TrainerV1alpha1RuntimeRef
 from kubeflow_trainer_api.models.trainer_v1alpha1_torch_elastic_policy import TrainerV1alpha1TorchElasticPolicy
 from kubeflow_trainer_api.models.trainer_v1alpha1_torch_ml_policy_source import TrainerV1alpha1TorchMLPolicySource
@@ -376,6 +377,7 @@
 from kubeflow_trainer_api.models.trainer_v1alpha1_train_job_spec import TrainerV1alpha1TrainJobSpec
 from kubeflow_trainer_api.models.trainer_v1alpha1_train_job_status import TrainerV1alpha1TrainJobStatus
 from kubeflow_trainer_api.models.trainer_v1alpha1_trainer import TrainerV1alpha1Trainer
+from kubeflow_trainer_api.models.trainer_v1alpha1_training_metrics import TrainerV1alpha1TrainingMetrics
 from kubeflow_trainer_api.models.trainer_v1alpha1_training_runtime import TrainerV1alpha1TrainingRuntime
 from kubeflow_trainer_api.models.trainer_v1alpha1_training_runtime_list import TrainerV1alpha1TrainingRuntimeList
 from kubeflow_trainer_api.models.trainer_v1alpha1_training_runtime_spec import TrainerV1alpha1TrainingRuntimeSpec
diff --git a/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_progression_status.py b/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_progression_status.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+
+"""
+    Kubeflow Trainer OpenAPI Spec
+
+    No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
+
+    The version of the OpenAPI document: unversioned
+    Generated by OpenAPI Generator (https://openapi-generator.tech)
+
+    Do not edit the class manually.
+"""  # noqa: E501
+
+
+from __future__ import annotations
+import pprint
+import re  # noqa: F401
+import json
+
+from datetime import datetime
+from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr
+from typing import Any, ClassVar, Dict, List, Optional
+from kubeflow_trainer_api.models.trainer_v1alpha1_training_metrics import TrainerV1alpha1TrainingMetrics
+from typing import Optional, Set
+from typing_extensions import Self
+
+class TrainerV1alpha1ProgressionStatus(BaseModel):
+    """
+    ProgressionStatus represents the training progression status read from rank 0 node.
+    """ # noqa: E501
+    current_epoch: Optional[StrictInt] = Field(default=None, description="CurrentEpoch is the current training epoch.", alias="currentEpoch")
+    current_step: Optional[StrictInt] = Field(default=None, description="CurrentStep is the current training step/iteration.", alias="currentStep")
+    estimated_time_remaining: Optional[StrictInt] = Field(default=None, description="EstimatedTimeRemaining is the estimated time remaining in seconds.", alias="estimatedTimeRemaining")
+    last_update_time: Optional[datetime] = Field(default=None, description="LastUpdateTime is the timestamp when the progression was last updated.", alias="lastUpdateTime")
+    message: Optional[StrictStr] = Field(default=None, description="Message provides additional information about the training progression.")
+    metrics: Optional[Dict[str, StrictStr]] = Field(default=None, description="Metrics contains additional training metrics as key-value pairs.")
+    percentage_complete: Optional[StrictStr] = Field(default=None, description="PercentageComplete represents the completion percentage (0-100) as a string.", alias="percentageComplete")
+    total_epochs: Optional[StrictInt] = Field(default=None, description="TotalEpochs is the total number of training epochs.", alias="totalEpochs")
+    total_steps: Optional[StrictInt] = Field(default=None, description="TotalSteps is the total number of training steps/iterations.", alias="totalSteps")
+    training_metrics: Optional[TrainerV1alpha1TrainingMetrics] = Field(default=None, description="TrainingMetrics contains structured training metrics.", alias="trainingMetrics")
+    __properties: ClassVar[List[str]] = ["currentEpoch", "currentStep", "estimatedTimeRemaining", "lastUpdateTime", "message", "metrics", "percentageComplete", "totalEpochs", "totalSteps", "trainingMetrics"]
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+        validate_assignment=True,
+        protected_namespaces=(),
+    )
+
+
+    def to_str(self) -> str:
+        """Returns the string representation of the model using alias"""
+        return pprint.pformat(self.model_dump(by_alias=True))
+
+    def to_json(self) -> str:
+        """Returns the JSON representation of the model using alias (fields that are `None` are omitted)"""
+        # Dump in pydantic's JSON mode: `lastUpdateTime` is a datetime, which json.dumps() cannot encode from to_dict().
+        return json.dumps(self.model_dump(mode="json", by_alias=True, exclude_none=True))
+
+    @classmethod
+    def from_json(cls, json_str: str) -> Optional[Self]:
+        """Create an instance of TrainerV1alpha1ProgressionStatus from a JSON string"""
+        return cls.from_dict(json.loads(json_str))
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the dictionary representation of the model using alias.
+
+        Equivalent to pydantic's `self.model_dump(by_alias=True)` except
+        that fields whose value is `None` are omitted (`exclude_none=True`),
+        whether they were never set or were set to `None` explicitly.
+
+        Values keep their Python types: `lastUpdateTime` stays a `datetime`,
+        so the result is not guaranteed to be `json.dumps()`-serializable.
+        """
+        excluded_fields: Set[str] = set([
+        ])
+
+        _dict = self.model_dump(
+            by_alias=True,
+            exclude=excluded_fields,
+            exclude_none=True,
+        )
+        # override the default output from pydantic by calling `to_dict()` of training_metrics
+        if self.training_metrics:
+            _dict['trainingMetrics'] = self.training_metrics.to_dict()
+        return _dict
+
+    @classmethod
+    def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
+        """Create an instance of TrainerV1alpha1ProgressionStatus from a dict keyed by alias; `None` yields `None`"""
+        if obj is None:
+            return None
+
+        if not isinstance(obj, dict):
+            return cls.model_validate(obj)
+
+        _obj = cls.model_validate({
+            "currentEpoch": obj.get("currentEpoch"),
+            "currentStep": obj.get("currentStep"),
+            "estimatedTimeRemaining": obj.get("estimatedTimeRemaining"),
+            "lastUpdateTime": obj.get("lastUpdateTime"),
+            "message": obj.get("message"),
+            "metrics": obj.get("metrics"),
+            "percentageComplete": obj.get("percentageComplete"),
+            "totalEpochs": obj.get("totalEpochs"),
+            "totalSteps": obj.get("totalSteps"),
+            "trainingMetrics": TrainerV1alpha1TrainingMetrics.from_dict(obj["trainingMetrics"]) if obj.get("trainingMetrics") is not None else None
+        })
+        return _obj
+
+
diff --git a/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_status.py b/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_status.py
@@ -21,6 +21,7 @@
 from typing import Any, ClassVar, Dict, List, Optional
 from kubeflow_trainer_api.models.io_k8s_apimachinery_pkg_apis_meta_v1_condition import IoK8sApimachineryPkgApisMetaV1Condition
 from kubeflow_trainer_api.models.trainer_v1alpha1_job_status import TrainerV1alpha1JobStatus
+from kubeflow_trainer_api.models.trainer_v1alpha1_progression_status import TrainerV1alpha1ProgressionStatus
 from typing import Optional, Set
 from typing_extensions import Self
 
@@ -30,7 +31,8 @@ class TrainerV1alpha1TrainJobStatus(BaseModel):
     """ # noqa: E501
     conditions: Optional[List[IoK8sApimachineryPkgApisMetaV1Condition]] = Field(default=None, description="Conditions for the TrainJob.")
     jobs_status: Optional[List[TrainerV1alpha1JobStatus]] = Field(default=None, description="JobsStatus tracks the child Jobs in TrainJob.", alias="jobsStatus")
-    __properties: ClassVar[List[str]] = ["conditions", "jobsStatus"]
+    progression_status: Optional[TrainerV1alpha1ProgressionStatus] = Field(default=None, description="ProgressionStatus tracks the training progression from rank 0 node.", alias="progressionStatus")
+    __properties: ClassVar[List[str]] = ["conditions", "jobsStatus", "progressionStatus"]
 
     model_config = ConfigDict(
         populate_by_name=True,
@@ -85,6 +87,9 @@ def to_dict(self) -> Dict[str, Any]:
                 if _item_jobs_status:
                     _items.append(_item_jobs_status.to_dict())
             _dict['jobsStatus'] = _items
+        # override the default output from pydantic by calling `to_dict()` of progression_status
+        if self.progression_status:
+            _dict['progressionStatus'] = self.progression_status.to_dict()
         return _dict
 
     @classmethod
@@ -98,7 +103,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
 
         _obj = cls.model_validate({
             "conditions": [IoK8sApimachineryPkgApisMetaV1Condition.from_dict(_item) for _item in obj["conditions"]] if obj.get("conditions") is not None else None,
-            "jobsStatus": [TrainerV1alpha1JobStatus.from_dict(_item) for _item in obj["jobsStatus"]] if obj.get("jobsStatus") is not None else None
+            "jobsStatus": [TrainerV1alpha1JobStatus.from_dict(_item) for _item in obj["jobsStatus"]] if obj.get("jobsStatus") is not None else None,
+            "progressionStatus": TrainerV1alpha1ProgressionStatus.from_dict(obj["progressionStatus"]) if obj.get("progressionStatus") is not None else None
         })
         return _obj
 

diff --git a/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_training_metrics.py b/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_training_metrics.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+
+"""
+    Kubeflow Trainer OpenAPI Spec
+
+    No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
+
+    The version of the OpenAPI document: unversioned
+    Generated by OpenAPI Generator (https://openapi-generator.tech)
+
+    Do not edit the class manually.
+"""  # noqa: E501
+
+
+from __future__ import annotations
+import pprint
+import re  # noqa: F401
+import json
+
+from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr
+from typing import Any, ClassVar, Dict, List, Optional
+from typing import Optional, Set
+from typing_extensions import Self
+
+class TrainerV1alpha1TrainingMetrics(BaseModel):
+    """
+    TrainingMetrics represents structured training metrics.
+    """ # noqa: E501
+    accuracy: Optional[StrictStr] = Field(default=None, description="Accuracy represents the current model accuracy.")
+    checkpoints_stored: Optional[StrictInt] = Field(default=None, description="CheckpointsStored represents the number of checkpoints stored.", alias="checkpointsStored")
+    latest_checkpoint_path: Optional[StrictStr] = Field(default=None, description="LatestCheckpointPath represents the path to the latest checkpoint file.", alias="latestCheckpointPath")
+    learning_rate: Optional[StrictStr] = Field(default=None, description="LearningRate represents the current learning rate.", alias="learningRate")
+    loss: Optional[StrictStr] = Field(default=None, description="Loss represents the current training loss.")
+    __properties: ClassVar[List[str]] = ["accuracy", "checkpointsStored", "latestCheckpointPath", "learningRate", "loss"]
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+        validate_assignment=True,
+        protected_namespaces=(),
+    )
+
+
+    def to_str(self) -> str:
+        """Returns the string representation of the model using alias"""
+        return pprint.pformat(self.model_dump(by_alias=True))
+
+    def to_json(self) -> str:
+        """Returns the JSON representation of the model using alias (serializes the output of `to_dict()`)"""
+        # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead
+        return json.dumps(self.to_dict())
+
+    @classmethod
+    def from_json(cls, json_str: str) -> Optional[Self]:
+        """Create an instance of TrainerV1alpha1TrainingMetrics from a JSON string"""
+        return cls.from_dict(json.loads(json_str))
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the dictionary representation of the model using alias.
+
+        This has the following differences from calling pydantic's
+        `self.model_dump(by_alias=True)`:
+
+        * Fields whose value is `None` are omitted from the output dict
+          (`exclude_none=True`), whether they were never set or were
+          explicitly set to `None`.
+        """
+        excluded_fields: Set[str] = set([
+        ])
+
+        _dict = self.model_dump(
+            by_alias=True,
+            exclude=excluded_fields,
+            exclude_none=True,
+        )
+        return _dict
+
+    @classmethod
+    def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
+        """Create an instance of TrainerV1alpha1TrainingMetrics from a dict keyed by alias; `None` yields `None`"""
+        if obj is None:
+            return None
+
+        if not isinstance(obj, dict):
+            return cls.model_validate(obj)
+        # Only the aliases listed below are forwarded; any other key in `obj` is dropped before validation.
+        _obj = cls.model_validate({
+            "accuracy": obj.get("accuracy"),
+            "checkpointsStored": obj.get("checkpointsStored"),
+            "latestCheckpointPath": obj.get("latestCheckpointPath"),
+            "learningRate": obj.get("learningRate"),
+            "loss": obj.get("loss")
+        })
+        return _obj
+
+