Skip to content

Commit 6cb7cf7

Browse files
zzmao and Google Cloud Pipeline Components maintainers
authored and committed
feat(components): Introduce max_wait_duration to v1 GCPC custom job components/utils
Signed-off-by: Ze Mao <[email protected]> PiperOrigin-RevId: 723577407
1 parent 32bdbe8 commit 6cb7cf7

File tree

4 files changed

+22
-3
lines changed

4 files changed

+22
-3
lines changed

components/google-cloud/RELEASE.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
## Upcoming release
22

3+
## Release 2.19.0
4+
5+
* Add max_wait_duration to v1 GCPC custom job components/utils
6+
37
## Release 2.18.0
48
* Remove default prediction column names in `v1.model_evaluation.regression_component` component to fix pipeline errors when using bigquery data source.
59
* Add reservation_affinity support in `v1.create_custom_training_job_from_component`.

components/google-cloud/google_cloud_pipeline_components/container/v1/custom_job/remote_runner.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@
1313
# limitations under the License.
1414
"""GCP launcher for custom jobs based on the AI Platform SDK."""
1515

16+
import json
17+
1618
from google.api_core import retry
1719
from google_cloud_pipeline_components.container.v1.gcp_launcher import job_remote_runner
18-
from google_cloud_pipeline_components.container.v1.gcp_launcher.utils import gcp_labels_util
1920
from google_cloud_pipeline_components.container.v1.gcp_launcher.utils import error_util
20-
21-
import json
21+
from google_cloud_pipeline_components.container.v1.gcp_launcher.utils import gcp_labels_util
2222

2323
_CUSTOM_JOB_RETRY_DEADLINE_SECONDS = 10.0 * 60.0
2424
LABELS_PAYLOAD_KEY = 'labels'
@@ -34,6 +34,15 @@ def insert_system_labels_into_payload(payload):
3434

3535
def create_custom_job_with_client(job_client, parent, job_spec):
3636
create_custom_job_fn = None
37+
# max_wait_duration is acceptable only when strategy is FLEX_START in
38+
# CustomJob API. Clear max_wait_duration if strategy is not FLEX_START.
39+
if (
40+
'scheduling' in job_spec
41+
and 'strategy' in job_spec['scheduling']
42+
and job_spec['scheduling']['strategy'] != 'FLEX_START'
43+
and 'max_wait_duration' in job_spec['scheduling']
44+
):
45+
del job_spec['scheduling']['max_wait_duration']
3746
try:
3847
create_custom_job_fn = job_client.create_custom_job(
3948
parent=parent, custom_job=job_spec

components/google-cloud/google_cloud_pipeline_components/v1/custom_job/component.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def custom_training_job(
3939
persistent_resource_id: str = _placeholders.PERSISTENT_RESOURCE_ID_PLACEHOLDER,
4040
project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
4141
strategy: str = 'STANDARD',
42+
max_wait_duration: str = '86400s',
4243
):
4344
# fmt: off
4445
"""Launch a Vertex AI [custom training job](https://cloud.google.com/vertex-ai/docs/training/create-custom-job) using the [CustomJob](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.customJobs) API. See [Create custom training jobs ](https://cloud.google.com/vertex-ai/docs/training/create-custom-job) for more information.
@@ -60,6 +61,7 @@ def custom_training_job(
6061
persistent_resource_id: The ID of the PersistentResource in the same Project and Location in which to run the job. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placeholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will run in the specified persistent resource; in this case, please note that the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise the job will be rejected.
6162
project: Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.
6263
strategy: The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
64+
max_wait_duration: The maximum time to wait for the custom training job to be scheduled. Only applies when the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
6365
6466
Returns:
6567
gcp_resources: Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.
@@ -78,6 +80,7 @@ def custom_training_job(
7880
restart_job_on_worker_restart
7981
),
8082
'strategy': strategy,
83+
'max_wait_duration': max_wait_duration,
8184
},
8285
'service_account': service_account,
8386
'tensorboard': tensorboard,

components/google-cloud/google_cloud_pipeline_components/v1/custom_job/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def create_custom_training_job_from_component(
7272
persistent_resource_id: str = _placeholders.PERSISTENT_RESOURCE_ID_PLACEHOLDER,
7373
env: Optional[List[Dict[str, str]]] = None,
7474
strategy: str = 'STANDARD',
75+
max_wait_duration: str = '86400s',
7576
reservation_affinity_type: Optional[str] = None,
7677
reservation_affinity_key: Optional[str] = None,
7778
reservation_affinity_values: Optional[List[str]] = None,
@@ -104,6 +105,7 @@ def create_custom_training_job_from_component(
104105
persistent_resource_id: The ID of the PersistentResource in the same Project and Location in which to run the job. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placeholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will run in the specified persistent resource; in this case, please note that the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise the job will be rejected.
105106
env: Environment variables to be passed to the container. Takes the form `[{'name': '...', 'value': '...'}]`. Maximum limit is 100.
106107
strategy: The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
108+
max_wait_duration: The maximum time to wait for the custom training job to be scheduled. Only applies when the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).
107109
reservation_affinity_type: The type of [reservation affinity](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#reservationaffinity). Valid values are "NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION".
108110
reservation_affinity_key: Corresponds to the label key of a reservation resource. To target a SPECIFIC_RESERVATION by name, use compute.googleapis.com/reservation-name as the key and specify the name of your reservation as its value.
109111
reservation_affinity_values: Corresponds to the label values of a reservation resource. This must be the full resource name of the reservation.
@@ -217,6 +219,7 @@ def create_custom_training_job_from_component(
217219
'timeout': timeout,
218220
'restart_job_on_worker_restart': restart_job_on_worker_restart,
219221
'strategy': strategy,
222+
'max_wait_duration': max_wait_duration,
220223
'service_account': service_account,
221224
'tensorboard': tensorboard,
222225
'enable_web_access': enable_web_access,

0 commit comments

Comments
 (0)