@@ -509,6 +509,7 @@ def __init__(self, cluster_spec: container_spec_lib.ContainerClusterSpec):
509509 self .services : dict [str , KubernetesContainerService ] = {}
510510 self ._extra_samples : list [sample .Sample ] = []
511511 self .container_registry : BaseContainerRegistry | None = None
512+ self .enable_vpa : bool = cluster_spec .enable_vpa
512513
513514 @property
514515 def num_nodes (self ) -> int :
@@ -1320,6 +1321,214 @@ def _Sample(count: int, state: str) -> Sample:
13201321 _Sample (unknown , 'unknown' ),
13211322 ]
13221323
1324+ @staticmethod
1325+ def _GetPodNamesForResource (
1326+ resource_name : str , namespace : str = ''
1327+ ) -> list [str ]:
1328+ """Gets the names of pods managed by a resource (e.g., deployment).
1329+
1330+ The resource must have a .spec.selector.matchLabels defined, and non-empty.
1331+
1332+ Args:
1333+ resource_name: the resource type and name, e.g. 'deployment/my_deploy'.
1334+ namespace: The namespace of the resource.
1335+
1336+ Raises:
1337+ ValueError: if the resource does not have a (non-empty)
1338+ `.spec.selector.matchLabels` field or if the field cannot be parsed.
1339+ """
1340+ # NB: If this approach is insufficient, then we could convert this to use
1341+ # ownerReferences instead. This would mean going over several hops (i.e.
1342+ # deployment->replicaset->pods) so probably is not worth it unless the
1343+ # current approach prooves inadequate.
1344+
1345+ # 1. Get the selector from the resource.
1346+ get_selector_cmd = [
1347+ 'get' ,
1348+ resource_name ,
1349+ '-n' ,
1350+ namespace ,
1351+ "-o=jsonpath='{.spec.selector.matchLabels}'" ,
1352+ ]
1353+ selector_stdout , stderr , retcode = RunKubectlCommand (
1354+ get_selector_cmd , raise_on_failure = False
1355+ )
1356+ if retcode != 0 :
1357+ if 'NotFound' in stderr :
1358+ return []
1359+ raise errors .VmUtil .IssueCommandError (
1360+ f'Failed to get selector for resource { resource_name } : { stderr } '
1361+ )
1362+
1363+ try :
1364+ # The output can be empty if there are no labels (which violates the
1365+ # pre-condition).
1366+ selector_str = selector_stdout .strip ("'" )
1367+ if not selector_str :
1368+ raise ValueError (
1369+ 'A resource without a .spec.selector.matchLabels was passed to'
1370+ ' _GetPodNamesForResource.'
1371+ )
1372+ selector_dict = json .loads (selector_str )
1373+ except json .JSONDecodeError as e :
1374+ raise ValueError (
1375+ f'Could not decode selector for resource { resource_name } :'
1376+ f' { selector_stdout } '
1377+ ) from e
1378+
1379+ if not selector_dict :
1380+ raise ValueError (
1381+ 'A resource without a (non-empty) .spec.selector.matchLabels was'
1382+ ' passed to _GetPodNamesForResource.'
1383+ )
1384+
1385+ # 2. Construct the label selector string.
1386+ selector_parts = [f'{ key } ={ value } ' for key , value in selector_dict .items ()]
1387+ label_selector = ',' .join (selector_parts )
1388+
1389+ # 3. Get pods using the selector.
1390+ get_pods_cmd = [
1391+ 'get' ,
1392+ 'pods' ,
1393+ '-l' ,
1394+ label_selector ,
1395+ '-n' ,
1396+ namespace ,
1397+ '-o=jsonpath={.items[*].metadata.name}' ,
1398+ ]
1399+ pods_stdout , _ , _ = RunKubectlCommand (get_pods_cmd )
1400+
1401+ return pods_stdout .strip ().split ()
1402+
1403+ @staticmethod
1404+ def GetCPURequestSamples (
1405+ resource_name : str , namespace : str = ''
1406+ ) -> list [Sample ]:
1407+ """Returns the CPU requests for all pods within the specified resource.
1408+
1409+ Args:
1410+ resource_name: The deployment/statefulset/etc's name, e.g.
1411+ 'deployment/my_deployment'.
1412+ namespace: The namespace of the resource. If omitted, the 'default'
1413+ namespace will be used.
1414+
1415+ Returns:
1416+ A list of Samples, each representing the CPU request of a pod.
1417+ """
1418+ now = int (time .time ())
1419+
1420+ pod_names = KubernetesClusterCommands ._GetPodNamesForResource (
1421+ resource_name , namespace
1422+ )
1423+ samples = []
1424+
1425+ for pod_name in pod_names :
1426+ # Get CPU requests for each pod
1427+ cpu_request_stdout , _ , _ = RunKubectlCommand (
1428+ [
1429+ 'get' ,
1430+ 'pod' ,
1431+ pod_name ,
1432+ '-n' ,
1433+ namespace ,
1434+ '-o=jsonpath={.spec.containers[*].resources.requests.cpu}' ,
1435+ ],
1436+ )
1437+
1438+ # Convert CPU string (e.g., "100m", "1") to float (cores)
1439+ cpu_request_str = cpu_request_stdout .strip ()
1440+ if not cpu_request_str :
1441+ continue
1442+ if cpu_request_str .endswith ('m' ):
1443+ cpu_request = float (cpu_request_str [:- 1 ]) / 1000
1444+ else :
1445+ cpu_request = float (cpu_request_str )
1446+
1447+ samples .append (
1448+ Sample (
1449+ metric = 'kubernetes_cpu_request' ,
1450+ value = cpu_request ,
1451+ unit = 'cores' ,
1452+ metadata = {
1453+ 'namespace' : namespace ,
1454+ 'resource_name' : resource_name ,
1455+ 'pod' : pod_name ,
1456+ },
1457+ timestamp = now ,
1458+ )
1459+ )
1460+ return samples
1461+
1462+ @staticmethod
1463+ def GetCPUUsageSamples (
1464+ resource_name : str , namespace : str = ''
1465+ ) -> list [Sample ]:
1466+ """Returns the CPU usage for all pods within the specified resource.
1467+
1468+ Args:
1469+ resource_name: The deployment/statefulset/etc's name, e.g.
1470+ 'deployment/my_deployment'.
1471+ namespace: The namespace of the resource. If omitted, the 'default'
1472+ namespace will be used.
1473+
1474+ Returns:
1475+ A list of Samples, each representing the CPU usage of a pod.
1476+ """
1477+ now = int (time .time ())
1478+
1479+ pod_names = KubernetesClusterCommands ._GetPodNamesForResource (
1480+ resource_name , namespace
1481+ )
1482+ samples = []
1483+
1484+ for pod_name in pod_names :
1485+ # Get CPU usage for each pod using kubectl top
1486+ # kubectl top pod <pod-name> --namespace <namespace> --containers
1487+ # This returns output like:
1488+ # POD NAME CPU(cores) MEMORY(bytes)
1489+ # fib-xyz fib 10m 20Mi
1490+ top_output , stderr , retcode = RunKubectlCommand (
1491+ ['top' , 'pod' , pod_name , '--namespace' , namespace , '--containers' ],
1492+ raise_on_failure = False ,
1493+ )
1494+ if retcode != 0 :
1495+ logging .warning (
1496+ 'Could not get CPU usage for pod %s: %s' , pod_name , stderr
1497+ )
1498+ continue
1499+
1500+ # Parse the output to get CPU usage
1501+ # Skip header and split lines
1502+ lines = top_output .strip ().split ('\n ' )[1 :]
1503+ for line in lines :
1504+ parts = line .split ()
1505+ if len (parts ) < 4 :
1506+ raise errors .VmUtil .IssueCommandError (
1507+ f'Unexpected output line from kubectl top: { line } '
1508+ )
1509+
1510+ cpu_usage_str = parts [2 ]
1511+ if cpu_usage_str .endswith ('m' ):
1512+ cpu_usage = float (cpu_usage_str [:- 1 ]) / 1000
1513+ else :
1514+ cpu_usage = float (cpu_usage_str )
1515+
1516+ samples .append (
1517+ Sample (
1518+ metric = 'kubernetes_cpu_usage' ,
1519+ value = cpu_usage ,
1520+ unit = 'cores' ,
1521+ metadata = {
1522+ 'namespace' : namespace ,
1523+ 'resource_name' : resource_name ,
1524+ 'pod' : pod_name ,
1525+ 'container' : parts [1 ], # Container name
1526+ },
1527+ timestamp = now ,
1528+ )
1529+ )
1530+ return samples
1531+
13231532 @staticmethod
13241533 def CreateConfigMap (name : str , from_file_dir : str ):
13251534 """Creates a Kubernetes ConfigMap.
0 commit comments