feat(components): Introduce max_wait_duration to custom job to v1 GCPC custom job components/utils

zzmao · Google Cloud Pipeline Components maintainers · commit 6cb7cf71fa81 · 2025-02-05T11:03:08.000-08:00
Signed-off-by: Ze Mao <zemao@google.com>
PiperOrigin-RevId: 723577407
diff --git a/components/google-cloud/RELEASE.md b/components/google-cloud/RELEASE.md
@@ -1,5 +1,9 @@
 ## Upcoming release
 
+## Release 2.19.0
+
+* Add max_wait_duration to v1 GCPC custom job components/utils
+
 ## Release 2.18.0
 * Remove default prediction column names in `v1.model_evaluation.regression_component` component to fix pipeline errors when using bigquery data source.
 * Add reservation_affinition support in `v1.create_custom_training_job_from_component`.
diff --git a/components/google-cloud/google_cloud_pipeline_components/container/v1/custom_job/remote_runner.py b/components/google-cloud/google_cloud_pipeline_components/container/v1/custom_job/remote_runner.py
@@ -13,12 +13,12 @@
 # limitations under the License.
 """GCP launcher for custom jobs based on the AI Platform SDK."""
 
+import json
+
 from google.api_core import retry
 from google_cloud_pipeline_components.container.v1.gcp_launcher import job_remote_runner
-from google_cloud_pipeline_components.container.v1.gcp_launcher.utils import gcp_labels_util
 from google_cloud_pipeline_components.container.v1.gcp_launcher.utils import error_util
-
-import json
+from google_cloud_pipeline_components.container.v1.gcp_launcher.utils import gcp_labels_util
 
 _CUSTOM_JOB_RETRY_DEADLINE_SECONDS = 10.0 * 60.0
 LABELS_PAYLOAD_KEY = 'labels'
@@ -34,6 +34,15 @@ def insert_system_labels_into_payload(payload):
 
 def create_custom_job_with_client(job_client, parent, job_spec):
   create_custom_job_fn = None
+  # The CustomJob API accepts max_wait_duration only when the scheduling
+  # strategy is FLEX_START; drop it for any other explicitly set strategy.
+  if (
+      'scheduling' in job_spec
+      and 'strategy' in job_spec['scheduling']
+      and job_spec['scheduling']['strategy'] != 'FLEX_START'
+      and 'max_wait_duration' in job_spec['scheduling']
+  ):
+    del job_spec['scheduling']['max_wait_duration']
   try:
     create_custom_job_fn = job_client.create_custom_job(
         parent=parent, custom_job=job_spec
diff --git a/components/google-cloud/google_cloud_pipeline_components/v1/custom_job/component.py b/components/google-cloud/google_cloud_pipeline_components/v1/custom_job/component.py
@@ -39,6 +39,7 @@ def custom_training_job(
     persistent_resource_id: str = _placeholders.PERSISTENT_RESOURCE_ID_PLACEHOLDER,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
     strategy: str = 'STANDARD',
+    max_wait_duration: str = '86400s',
 ):
   # fmt: off
   """Launch a Vertex AI [custom training job](https://cloud.google.com/vertex-ai/docs/training/create-custom-job) using the [CustomJob](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.customJobs) API. See [Create custom training jobs ](https://cloud.google.com/vertex-ai/docs/training/create-custom-job) for more information.
@@ -60,6 +61,7 @@ def custom_training_job(
     persistent_resource_id: The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.
     project: Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.
     strategy: The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
+    max_wait_duration: The maximum time to wait for the custom training job to be scheduled, given as a duration string in seconds (for example, '86400s'). Only applies when the scheduling strategy is FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours ('86400s'). See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
 
   Returns:
     gcp_resources: Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.
@@ -78,6 +80,7 @@ def custom_training_job(
                       restart_job_on_worker_restart
                   ),
                   'strategy': strategy,
+                  'max_wait_duration': max_wait_duration,
               },
               'service_account': service_account,
               'tensorboard': tensorboard,
diff --git a/components/google-cloud/google_cloud_pipeline_components/v1/custom_job/utils.py b/components/google-cloud/google_cloud_pipeline_components/v1/custom_job/utils.py
@@ -72,6 +72,7 @@ def create_custom_training_job_from_component(
     persistent_resource_id: str = _placeholders.PERSISTENT_RESOURCE_ID_PLACEHOLDER,
     env: Optional[List[Dict[str, str]]] = None,
     strategy: str = 'STANDARD',
+    max_wait_duration: str = '86400s',
     reservation_affinity_type: Optional[str] = None,
     reservation_affinity_key: Optional[str] = None,
     reservation_affinity_values: Optional[List[str]] = None,
@@ -104,6 +105,7 @@ def create_custom_training_job_from_component(
     persistent_resource_id: The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.
     env: Environment variables to be passed to the container. Takes the form `[{'name': '...', 'value': '...'}]`. Maximum limit is 100.
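-    startegy: The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
+    strategy: The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).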
+    max_wait_duration: The maximum time to wait for the custom training job to be scheduled, given as a duration string in seconds (for example, '86400s'). Only applies when the scheduling strategy is FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours ('86400s'). See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
     reservation_affinity_type: The type of [reservation affinity](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#reservationaffinity). Valid values are "NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION".
     reservation_affinity_key: Corresponds to the label key of a reservation resource. To target a SPECIFIC_RESERVATION by name, use compute.googleapis.com/reservation-name as the key and specify the name of your reservation as its value.
     reservation_affinity_values: Corresponds to the label values of a reservation resource. This must be the full resource name of the reservation.
@@ -217,6 +219,7 @@ def create_custom_training_job_from_component(
       'timeout': timeout,
       'restart_job_on_worker_restart': restart_job_on_worker_restart,
       'strategy': strategy,
+      'max_wait_duration': max_wait_duration,
       'service_account': service_account,
       'tensorboard': tensorboard,
       'enable_web_access': enable_web_access,