@@ -1367,6 +1367,9 @@ def deploy(
13671367 autoscaling_target_cpu_utilization : Optional [int ] = None ,
13681368 autoscaling_target_accelerator_duty_cycle : Optional [int ] = None ,
13691369 autoscaling_target_request_count_per_minute : Optional [int ] = None ,
1370+ autoscaling_target_dcgm_fi_dev_gpu_util : Optional [int ] = None ,
1371+ autoscaling_target_vllm_gpu_cache_usage_perc : Optional [int ] = None ,
1372+ autoscaling_target_vllm_num_requests_waiting : Optional [int ] = None ,
13701373 autoscaling_target_pubsub_num_undelivered_messages : Optional [int ] = None ,
13711374 autoscaling_pubsub_subscription_labels : Optional [Dict [str , str ]] = None ,
13721375 enable_access_logging = False ,
@@ -1467,6 +1470,13 @@ def deploy(
14671470 autoscaling_target_request_count_per_minute (int):
14681471 Optional. The target number of requests per minute for autoscaling.
14691472 If set, the model will be scaled based on the number of requests it receives.
1473+ autoscaling_target_dcgm_fi_dev_gpu_util (int):
1474+ Optional. Target DCGM metrics for GPU utilization.
1475+ autoscaling_target_vllm_gpu_cache_usage_perc (int):
1476+ Optional. Target vLLM metrics for GPU KV cache usage percentage.
1477+ autoscaling_target_vllm_num_requests_waiting (int):
1478+ Optional. Target vLLM metrics for number of inference requests
1479+ currently waiting in the queue.
14701480 autoscaling_target_pubsub_num_undelivered_messages (int):
14711481 Optional. The target number of pubsub undelivered messages for autoscaling.
14721482 If set, the model will be scaled based on the pubsub queue size.
@@ -1555,6 +1565,9 @@ def deploy(
15551565 autoscaling_target_cpu_utilization = autoscaling_target_cpu_utilization ,
15561566 autoscaling_target_accelerator_duty_cycle = autoscaling_target_accelerator_duty_cycle ,
15571567 autoscaling_target_request_count_per_minute = autoscaling_target_request_count_per_minute ,
1568+ autoscaling_target_dcgm_fi_dev_gpu_util = autoscaling_target_dcgm_fi_dev_gpu_util ,
1569+ autoscaling_target_vllm_gpu_cache_usage_perc = autoscaling_target_vllm_gpu_cache_usage_perc ,
1570+ autoscaling_target_vllm_num_requests_waiting = autoscaling_target_vllm_num_requests_waiting ,
15581571 autoscaling_target_pubsub_num_undelivered_messages = autoscaling_target_pubsub_num_undelivered_messages ,
15591572 autoscaling_pubsub_subscription_labels = autoscaling_pubsub_subscription_labels ,
15601573 spot = spot ,
@@ -1591,6 +1604,9 @@ def _deploy(
15911604 autoscaling_target_cpu_utilization : Optional [int ] = None ,
15921605 autoscaling_target_accelerator_duty_cycle : Optional [int ] = None ,
15931606 autoscaling_target_request_count_per_minute : Optional [int ] = None ,
1607+ autoscaling_target_dcgm_fi_dev_gpu_util : Optional [int ] = None ,
1608+ autoscaling_target_vllm_gpu_cache_usage_perc : Optional [int ] = None ,
1609+ autoscaling_target_vllm_num_requests_waiting : Optional [int ] = None ,
15941610 autoscaling_target_pubsub_num_undelivered_messages : Optional [int ] = None ,
15951611 autoscaling_pubsub_subscription_labels : Optional [Dict [str , str ]] = None ,
15961612 spot : bool = False ,
@@ -1694,6 +1710,13 @@ def _deploy(
16941710 autoscaling_target_request_count_per_minute (int):
16951711 Optional. The target number of requests per minute for autoscaling.
16961712 If set, the model will be scaled based on the number of requests it receives.
1713+ autoscaling_target_dcgm_fi_dev_gpu_util (int):
1714+ Optional. Target DCGM metrics for GPU utilization.
1715+ autoscaling_target_vllm_gpu_cache_usage_perc (int):
1716+ Optional. Target vLLM metrics for GPU KV cache usage percentage.
1717+ autoscaling_target_vllm_num_requests_waiting (int):
1718+ Optional. Target vLLM metrics for number of inference requests
1719+ currently waiting in the queue.
16971720 autoscaling_target_pubsub_num_undelivered_messages (int):
16981721 Optional. The target number of pubsub undelivered messages for autoscaling.
16991722 If set, the model will be scaled based on the pubsub queue size.
@@ -1759,6 +1782,9 @@ def _deploy(
17591782 autoscaling_target_cpu_utilization = autoscaling_target_cpu_utilization ,
17601783 autoscaling_target_accelerator_duty_cycle = autoscaling_target_accelerator_duty_cycle ,
17611784 autoscaling_target_request_count_per_minute = autoscaling_target_request_count_per_minute ,
1785+ autoscaling_target_dcgm_fi_dev_gpu_util = autoscaling_target_dcgm_fi_dev_gpu_util ,
1786+ autoscaling_target_vllm_gpu_cache_usage_perc = autoscaling_target_vllm_gpu_cache_usage_perc ,
1787+ autoscaling_target_vllm_num_requests_waiting = autoscaling_target_vllm_num_requests_waiting ,
17621788 autoscaling_target_pubsub_num_undelivered_messages = autoscaling_target_pubsub_num_undelivered_messages ,
17631789 autoscaling_pubsub_subscription_labels = autoscaling_pubsub_subscription_labels ,
17641790 spot = spot ,
@@ -1802,6 +1828,9 @@ def _deploy_call(
18021828 autoscaling_target_cpu_utilization : Optional [int ] = None ,
18031829 autoscaling_target_accelerator_duty_cycle : Optional [int ] = None ,
18041830 autoscaling_target_request_count_per_minute : Optional [int ] = None ,
1831+ autoscaling_target_dcgm_fi_dev_gpu_util : Optional [int ] = None ,
1832+ autoscaling_target_vllm_gpu_cache_usage_perc : Optional [int ] = None ,
1833+ autoscaling_target_vllm_num_requests_waiting : Optional [int ] = None ,
18051834 autoscaling_target_pubsub_num_undelivered_messages : Optional [int ] = None ,
18061835 autoscaling_pubsub_subscription_labels : Optional [Dict [str , str ]] = None ,
18071836 spot : bool = False ,
@@ -1911,6 +1940,13 @@ def _deploy_call(
19111940 A default value of 60 will be used if not specified.
19121941 autoscaling_target_request_count_per_minute (int):
19131942 Optional. Target request count per minute per instance.
1943+ autoscaling_target_dcgm_fi_dev_gpu_util (int):
1944+ Optional. Target DCGM metrics for GPU utilization.
1945+ autoscaling_target_vllm_gpu_cache_usage_perc (int):
1946+ Optional. Target vLLM metrics for GPU KV cache usage percentage.
1947+ autoscaling_target_vllm_num_requests_waiting (int):
1948+ Optional. Target vLLM metrics for number of inference requests
1949+ currently waiting in the queue.
19141950 autoscaling_target_pubsub_num_undelivered_messages (int):
19151951 Optional. Target pubsub queue size per instance.
19161952 autoscaling_pubsub_subscription_labels (Dict[str, str]):
@@ -2006,6 +2042,9 @@ def _deploy_call(
20062042 or autoscaling_target_accelerator_duty_cycle
20072043 or autoscaling_target_cpu_utilization
20082044 or autoscaling_target_request_count_per_minute
2045+ or autoscaling_target_dcgm_fi_dev_gpu_util
2046+ or autoscaling_target_vllm_gpu_cache_usage_perc
2047+ or autoscaling_target_vllm_num_requests_waiting
20092048 or autoscaling_target_pubsub_num_undelivered_messages
20102049 or autoscaling_pubsub_subscription_labels
20112050 )
@@ -2017,6 +2056,9 @@ def _deploy_call(
20172056 "autoscaling_target_accelerator_duty_cycle, "
20182057 "autoscaling_target_cpu_utilization, "
20192058 "autoscaling_target_request_count_per_minute, "
2059+ "autoscaling_target_dcgm_fi_dev_gpu_util, "
2060+ "autoscaling_target_vllm_gpu_cache_usage_perc, "
2061+ "autoscaling_target_vllm_num_requests_waiting, "
20202062 "autoscaling_target_pubsub_num_undelivered_messages, "
20212063 "autoscaling_pubsub_subscription_labels parameters "
20222064 "may not be set when `deployment_resource_pool` is "
@@ -2078,6 +2120,9 @@ def _deploy_call(
20782120 or autoscaling_target_accelerator_duty_cycle
20792121 or autoscaling_target_cpu_utilization
20802122 or autoscaling_target_request_count_per_minute
2123+ or autoscaling_target_dcgm_fi_dev_gpu_util
2124+ or autoscaling_target_vllm_gpu_cache_usage_perc
2125+ or autoscaling_target_vllm_num_requests_waiting
20812126 or autoscaling_target_pubsub_num_undelivered_messages
20822127 or autoscaling_pubsub_subscription_labels
20832128 )
@@ -2095,6 +2140,9 @@ def _deploy_call(
20952140 "autoscaling_target_accelerator_duty_cycle, "
20962141 "autoscaling_target_cpu_utilization, "
20972142 "autoscaling_target_request_count_per_minute, "
2143+ "autoscaling_target_dcgm_fi_dev_gpu_util, "
2144+ "autoscaling_target_vllm_gpu_cache_usage_perc, "
2145+ "autoscaling_target_vllm_num_requests_waiting, "
20982146 "autoscaling_target_pubsub_num_undelivered_messages, "
20992147 "autoscaling_pubsub_subscription_labels parameters "
21002148 "are ignored."
@@ -2156,6 +2204,48 @@ def _deploy_call(
21562204 [autoscaling_metric_spec ]
21572205 )
21582206
2207+ if autoscaling_target_dcgm_fi_dev_gpu_util :
2208+ autoscaling_metric_spec = (
2209+ gca_machine_resources_compat .AutoscalingMetricSpec (
2210+ metric_name = (
2211+ "prometheus.googleapis.com/"
2212+ "vertex_dcgm_fi_dev_gpu_util"
2213+ ),
2214+ target = autoscaling_target_dcgm_fi_dev_gpu_util ,
2215+ )
2216+ )
2217+ dedicated_resources .autoscaling_metric_specs .extend (
2218+ [autoscaling_metric_spec ]
2219+ )
2220+
2221+ if autoscaling_target_vllm_gpu_cache_usage_perc :
2222+ autoscaling_metric_spec = (
2223+ gca_machine_resources_compat .AutoscalingMetricSpec (
2224+ metric_name = (
2225+ "prometheus.googleapis.com/"
2226+ "vertex_vllm_gpu_cache_usage_perc"
2227+ ),
2228+ target = autoscaling_target_vllm_gpu_cache_usage_perc ,
2229+ )
2230+ )
2231+ dedicated_resources .autoscaling_metric_specs .extend (
2232+ [autoscaling_metric_spec ]
2233+ )
2234+
2235+ if autoscaling_target_vllm_num_requests_waiting :
2236+ autoscaling_metric_spec = (
2237+ gca_machine_resources_compat .AutoscalingMetricSpec (
2238+ metric_name = (
2239+ "prometheus.googleapis.com/"
2240+ "vertex_vllm_num_requests_waiting"
2241+ ),
2242+ target = autoscaling_target_vllm_num_requests_waiting ,
2243+ )
2244+ )
2245+ dedicated_resources .autoscaling_metric_specs .extend (
2246+ [autoscaling_metric_spec ]
2247+ )
2248+
21592249 if autoscaling_target_pubsub_num_undelivered_messages :
21602250 autoscaling_metric_spec = gca_machine_resources .AutoscalingMetricSpec (
21612251 metric_name = (
@@ -4492,6 +4582,9 @@ def deploy(
44924582 autoscaling_target_cpu_utilization : Optional [int ] = None ,
44934583 autoscaling_target_accelerator_duty_cycle : Optional [int ] = None ,
44944584 autoscaling_target_request_count_per_minute : Optional [int ] = None ,
4585+ autoscaling_target_dcgm_fi_dev_gpu_util : Optional [int ] = None ,
4586+ autoscaling_target_vllm_gpu_cache_usage_perc : Optional [int ] = None ,
4587+ autoscaling_target_vllm_num_requests_waiting : Optional [int ] = None ,
44954588 autoscaling_target_pubsub_num_undelivered_messages : Optional [int ] = None ,
44964589 autoscaling_pubsub_subscription_labels : Optional [Dict [str , str ]] = None ,
44974590 ) -> None :
@@ -4673,6 +4766,9 @@ def deploy(
46734766 autoscaling_target_cpu_utilization = autoscaling_target_cpu_utilization ,
46744767 autoscaling_target_accelerator_duty_cycle = autoscaling_target_accelerator_duty_cycle ,
46754768 autoscaling_target_request_count_per_minute = autoscaling_target_request_count_per_minute ,
4769+ autoscaling_target_dcgm_fi_dev_gpu_util = autoscaling_target_dcgm_fi_dev_gpu_util ,
4770+ autoscaling_target_vllm_gpu_cache_usage_perc = autoscaling_target_vllm_gpu_cache_usage_perc ,
4771+ autoscaling_target_vllm_num_requests_waiting = autoscaling_target_vllm_num_requests_waiting ,
46764772 autoscaling_target_pubsub_num_undelivered_messages = autoscaling_target_pubsub_num_undelivered_messages ,
46774773 autoscaling_pubsub_subscription_labels = autoscaling_pubsub_subscription_labels ,
46784774 )
@@ -5748,6 +5844,9 @@ def deploy(
57485844 autoscaling_target_cpu_utilization : Optional [int ] = None ,
57495845 autoscaling_target_accelerator_duty_cycle : Optional [int ] = None ,
57505846 autoscaling_target_request_count_per_minute : Optional [int ] = None ,
5847+ autoscaling_target_dcgm_fi_dev_gpu_util : Optional [int ] = None ,
5848+ autoscaling_target_vllm_gpu_cache_usage_perc : Optional [int ] = None ,
5849+ autoscaling_target_vllm_num_requests_waiting : Optional [int ] = None ,
57515850 autoscaling_target_pubsub_num_undelivered_messages : Optional [int ] = None ,
57525851 autoscaling_pubsub_subscription_labels : Optional [Dict [str , str ]] = None ,
57535852 enable_access_logging = False ,
@@ -5870,6 +5969,13 @@ def deploy(
58705969 autoscaling_target_request_count_per_minute (int):
58715970 Optional. The target number of requests per minute for autoscaling.
58725971 If set, the model will be scaled based on the number of requests it receives.
5972+ autoscaling_target_dcgm_fi_dev_gpu_util (int):
5973+ Optional. Target DCGM metrics for GPU utilization.
5974+ autoscaling_target_vllm_gpu_cache_usage_perc (int):
5975+ Optional. Target vLLM metrics for GPU KV cache usage percentage.
5976+ autoscaling_target_vllm_num_requests_waiting (int):
5977+ Optional. Target vLLM metrics for number of inference requests
5978+ currently waiting in the queue.
58735979 autoscaling_target_pubsub_num_undelivered_messages (int):
58745980 Optional. The target number of pubsub undelivered messages for autoscaling.
58755981 If set, the model will be scaled based on the pubsub queue size.
@@ -5929,6 +6035,13 @@ def deploy(
59296035 autoscaling_target_request_count_per_minute (int):
59306036 Optional. The target number of requests per minute for autoscaling.
59316037 If set, the model will be scaled based on the number of requests it receives.
6038+ autoscaling_target_dcgm_fi_dev_gpu_util (int):
6039+ Optional. Target DCGM metrics for GPU utilization.
6040+ autoscaling_target_vllm_gpu_cache_usage_perc (int):
6041+ Optional. Target vLLM metrics for GPU KV cache usage percentage.
6042+ autoscaling_target_vllm_num_requests_waiting (int):
6043+ Optional. Target vLLM metrics for number of inference requests
6044+ currently waiting in the queue.
59326045 autoscaling_target_pubsub_num_undelivered_messages (int):
59336046 Optional. The target number of pubsub undelivered messages for autoscaling.
59346047 If set, the model will be scaled based on the pubsub queue size.
@@ -6001,6 +6114,9 @@ def deploy(
60016114 autoscaling_target_cpu_utilization = autoscaling_target_cpu_utilization ,
60026115 autoscaling_target_accelerator_duty_cycle = autoscaling_target_accelerator_duty_cycle ,
60036116 autoscaling_target_request_count_per_minute = autoscaling_target_request_count_per_minute ,
6117+ autoscaling_target_dcgm_fi_dev_gpu_util = autoscaling_target_dcgm_fi_dev_gpu_util ,
6118+ autoscaling_target_vllm_gpu_cache_usage_perc = autoscaling_target_vllm_gpu_cache_usage_perc ,
6119+ autoscaling_target_vllm_num_requests_waiting = autoscaling_target_vllm_num_requests_waiting ,
60046120 autoscaling_target_pubsub_num_undelivered_messages = autoscaling_target_pubsub_num_undelivered_messages ,
60056121 autoscaling_pubsub_subscription_labels = autoscaling_pubsub_subscription_labels ,
60066122 spot = spot ,
@@ -6047,6 +6163,9 @@ def _deploy(
60476163 autoscaling_target_cpu_utilization : Optional [int ] = None ,
60486164 autoscaling_target_accelerator_duty_cycle : Optional [int ] = None ,
60496165 autoscaling_target_request_count_per_minute : Optional [int ] = None ,
6166+ autoscaling_target_dcgm_fi_dev_gpu_util : Optional [int ] = None ,
6167+ autoscaling_target_vllm_gpu_cache_usage_perc : Optional [int ] = None ,
6168+ autoscaling_target_vllm_num_requests_waiting : Optional [int ] = None ,
60506169 autoscaling_target_pubsub_num_undelivered_messages : Optional [int ] = None ,
60516170 autoscaling_pubsub_subscription_labels : Optional [Dict [str , str ]] = None ,
60526171 spot : bool = False ,
@@ -6171,6 +6290,13 @@ def _deploy(
61716290 autoscaling_target_request_count_per_minute (int):
61726291 Optional. The target number of requests per minute for autoscaling.
61736292 If set, the model will be scaled based on the number of requests it receives.
6293+ autoscaling_target_dcgm_fi_dev_gpu_util (int):
6294+ Optional. Target DCGM metrics for GPU utilization.
6295+ autoscaling_target_vllm_gpu_cache_usage_perc (int):
6296+ Optional. Target vLLM metrics for GPU KV cache usage percentage.
6297+ autoscaling_target_vllm_num_requests_waiting (int):
6298+ Optional. Target vLLM metrics for number of inference requests
6299+ currently waiting in the queue.
61746300 autoscaling_target_pubsub_num_undelivered_messages (int):
61756301 Optional. The target number of pubsub undelivered messages for autoscaling.
61766302 If set, the model will be scaled based on the pubsub queue size.
@@ -6267,6 +6393,9 @@ def _deploy(
62676393 autoscaling_target_cpu_utilization = autoscaling_target_cpu_utilization ,
62686394 autoscaling_target_accelerator_duty_cycle = autoscaling_target_accelerator_duty_cycle ,
62696395 autoscaling_target_request_count_per_minute = autoscaling_target_request_count_per_minute ,
6396+ autoscaling_target_dcgm_fi_dev_gpu_util = autoscaling_target_dcgm_fi_dev_gpu_util ,
6397+ autoscaling_target_vllm_gpu_cache_usage_perc = autoscaling_target_vllm_gpu_cache_usage_perc ,
6398+ autoscaling_target_vllm_num_requests_waiting = autoscaling_target_vllm_num_requests_waiting ,
62706399 autoscaling_target_pubsub_num_undelivered_messages = autoscaling_target_pubsub_num_undelivered_messages ,
62716400 autoscaling_pubsub_subscription_labels = autoscaling_pubsub_subscription_labels ,
62726401 spot = spot ,
0 commit comments