
Commit 84abe32

vertex-sdk-bot authored and copybara-github committed
feat: Add autoscaling_target_dcgm_fi_dev_gpu_util, autoscaling_target_vllm_gpu_cache_usage_perc, autoscaling_target_vllm_num_requests_waiting options in model deployment on Endpoint & Model classes.
PiperOrigin-RevId: 857352519
1 parent 9d32dd5 commit 84abe32

File tree

4 files changed (+523, −6 lines)

google/cloud/aiplatform/models.py

Lines changed: 129 additions & 0 deletions
@@ -1367,6 +1367,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         enable_access_logging=False,
@@ -1467,6 +1470,13 @@ def deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -1555,6 +1565,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
@@ -1591,6 +1604,9 @@ def _deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         spot: bool = False,
@@ -1694,6 +1710,13 @@ def _deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -1759,6 +1782,9 @@ def _deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
@@ -1802,6 +1828,9 @@ def _deploy_call(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         spot: bool = False,
@@ -1911,6 +1940,13 @@ def _deploy_call(
                 A default value of 60 will be used if not specified.
             autoscaling_target_request_count_per_minute (int):
                 Optional. Target request count per minute per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. Target pubsub queue size per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]):
@@ -2006,6 +2042,9 @@ def _deploy_call(
             or autoscaling_target_accelerator_duty_cycle
             or autoscaling_target_cpu_utilization
             or autoscaling_target_request_count_per_minute
+            or autoscaling_target_dcgm_fi_dev_gpu_util
+            or autoscaling_target_vllm_gpu_cache_usage_perc
+            or autoscaling_target_vllm_num_requests_waiting
             or autoscaling_target_pubsub_num_undelivered_messages
             or autoscaling_pubsub_subscription_labels
         )
@@ -2017,6 +2056,9 @@ def _deploy_call(
                 "autoscaling_target_accelerator_duty_cycle, "
                 "autoscaling_target_cpu_utilization, "
                 "autoscaling_target_request_count_per_minute, "
+                "autoscaling_target_dcgm_fi_dev_gpu_util, "
+                "autoscaling_target_vllm_gpu_cache_usage_perc, "
+                "autoscaling_target_vllm_num_requests_waiting, "
                 "autoscaling_target_pubsub_num_undelivered_messages, "
                 "autoscaling_pubsub_subscription_labels parameters "
                 "may not be set when `deployment_resource_pool` is "
@@ -2078,6 +2120,9 @@ def _deploy_call(
             or autoscaling_target_accelerator_duty_cycle
             or autoscaling_target_cpu_utilization
             or autoscaling_target_request_count_per_minute
+            or autoscaling_target_dcgm_fi_dev_gpu_util
+            or autoscaling_target_vllm_gpu_cache_usage_perc
+            or autoscaling_target_vllm_num_requests_waiting
             or autoscaling_target_pubsub_num_undelivered_messages
             or autoscaling_pubsub_subscription_labels
         )
@@ -2095,6 +2140,9 @@ def _deploy_call(
                 "autoscaling_target_accelerator_duty_cycle, "
                 "autoscaling_target_cpu_utilization, "
                 "autoscaling_target_request_count_per_minute, "
+                "autoscaling_target_dcgm_fi_dev_gpu_util, "
+                "autoscaling_target_vllm_gpu_cache_usage_perc, "
+                "autoscaling_target_vllm_num_requests_waiting, "
                 "autoscaling_target_pubsub_num_undelivered_messages, "
                 "autoscaling_pubsub_subscription_labels parameters "
                 "are ignored."
@@ -2156,6 +2204,48 @@ def _deploy_call(
                 [autoscaling_metric_spec]
             )
 
+        if autoscaling_target_dcgm_fi_dev_gpu_util:
+            autoscaling_metric_spec = (
+                gca_machine_resources_compat.AutoscalingMetricSpec(
+                    metric_name=(
+                        "prometheus.googleapis.com/"
+                        "vertex_dcgm_fi_dev_gpu_util"
+                    ),
+                    target=autoscaling_target_dcgm_fi_dev_gpu_util,
+                )
+            )
+            dedicated_resources.autoscaling_metric_specs.extend(
+                [autoscaling_metric_spec]
+            )
+
+        if autoscaling_target_vllm_gpu_cache_usage_perc:
+            autoscaling_metric_spec = (
+                gca_machine_resources_compat.AutoscalingMetricSpec(
+                    metric_name=(
+                        "prometheus.googleapis.com/"
+                        "vertex_vllm_gpu_cache_usage_perc"
+                    ),
+                    target=autoscaling_target_vllm_gpu_cache_usage_perc,
+                )
+            )
+            dedicated_resources.autoscaling_metric_specs.extend(
+                [autoscaling_metric_spec]
+            )
+
+        if autoscaling_target_vllm_num_requests_waiting:
+            autoscaling_metric_spec = (
+                gca_machine_resources_compat.AutoscalingMetricSpec(
+                    metric_name=(
+                        "prometheus.googleapis.com/"
+                        "vertex_vllm_num_requests_waiting"
+                    ),
+                    target=autoscaling_target_vllm_num_requests_waiting,
+                )
+            )
+            dedicated_resources.autoscaling_metric_specs.extend(
+                [autoscaling_metric_spec]
+            )
+
         if autoscaling_target_pubsub_num_undelivered_messages:
             autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                 metric_name=(
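
Each new branch above appends one AutoscalingMetricSpec (a Prometheus-backed metric name plus an integer target) to the deployment's dedicated resources. As a rough sketch, setting all three new targets produces specs equivalent to the following, built here with the public aiplatform_v1 types; the target values are illustrative, not defaults from this commit:

    from google.cloud.aiplatform_v1.types import AutoscalingMetricSpec

    # Metric names are the ones hard-coded in the diff above;
    # targets are example values chosen for illustration.
    specs = [
        AutoscalingMetricSpec(
            metric_name="prometheus.googleapis.com/vertex_dcgm_fi_dev_gpu_util",
            target=70,  # scale out when DCGM GPU utilization exceeds ~70%
        ),
        AutoscalingMetricSpec(
            metric_name="prometheus.googleapis.com/vertex_vllm_gpu_cache_usage_perc",
            target=80,  # scale out when vLLM KV-cache usage exceeds ~80%
        ),
        AutoscalingMetricSpec(
            metric_name="prometheus.googleapis.com/vertex_vllm_num_requests_waiting",
            target=10,  # scale out when ~10+ requests are waiting per replica
        ),
    ]
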
@@ -4492,6 +4582,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
     ) -> None:
@@ -4673,6 +4766,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
         )
@@ -5748,6 +5844,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         enable_access_logging=False,
@@ -5870,6 +5969,13 @@ def deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -5929,6 +6035,13 @@ def deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -6001,6 +6114,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
@@ -6047,6 +6163,9 @@ def _deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         spot: bool = False,
@@ -6171,6 +6290,13 @@ def _deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -6267,6 +6393,9 @@ def _deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
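
For reference, a minimal usage sketch of the new options (assuming an SDK build that includes this commit; the project, model resource name, and machine configuration below are hypothetical, and the target values are illustrative):

    from google.cloud import aiplatform

    aiplatform.init(project="my-project", location="us-central1")

    model = aiplatform.Model(
        "projects/my-project/locations/us-central1/models/1234567890"
    )

    # Deploy with GPU- and vLLM-aware autoscaling targets alongside
    # the existing replica bounds.
    model.deploy(
        machine_type="a2-highgpu-1g",
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=1,
        min_replica_count=1,
        max_replica_count=4,
        autoscaling_target_dcgm_fi_dev_gpu_util=70,
        autoscaling_target_vllm_gpu_cache_usage_perc=80,
        autoscaling_target_vllm_num_requests_waiting=10,
    )

Any of the three targets can be set independently; each one adds a corresponding AutoscalingMetricSpec to the deployed model's dedicated resources, as shown in the _deploy_call hunk above.
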

Comments (0)