Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 45 additions & 48 deletions nutanix/datadog_checks/nutanix/infrastructure_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING

from datadog_checks.base import is_affirmative
from datadog_checks.nutanix.metrics import CLUSTER_STATS_METRICS, HOST_STATS_METRICS, VM_STATS_METRICS
Expand All @@ -16,9 +16,6 @@
if TYPE_CHECKING:
from datadog_checks.nutanix.check import NutanixCheck

# Entity types for metrics counting
EntityType = Literal['cluster', 'host', 'vm']


@dataclass
class ClusterCapacity:
Expand Down Expand Up @@ -61,17 +58,15 @@ def __init__(self, check: NutanixCheck):
self.host_names = {} # host_id -> host_name
self.categories = {} # category_id -> category object
self.collection_time_window = None
# Metrics counters
self.cluster_metrics_count = 0
self.host_metrics_count = 0
self.vm_metrics_count = 0
# Entity counters
self.cluster_count = 0
self.host_count = 0
self.vm_count = 0
# Cluster capacity accumulator
self._cluster_capacity = ClusterCapacity()
self._vms_by_host: dict[str, list[dict]] = {}
self._hostless_vm_vcpus = 0
self._hostless_vm_memory_bytes = 0

def reset_state(self) -> None:
"""Reset all caches and counters for a new collection run."""
Expand All @@ -80,14 +75,13 @@ def reset_state(self) -> None:
self.categories = {}
self.external_tags = []
self.collection_time_window = None
self.cluster_metrics_count = 0
self.host_metrics_count = 0
self.vm_metrics_count = 0
self.cluster_count = 0
self.host_count = 0
self.vm_count = 0
self._cluster_capacity.reset()
self._vms_by_host = {}
self._hostless_vm_vcpus = 0
self._hostless_vm_memory_bytes = 0

def collect_cluster_metrics(self) -> None:
"""Collect metrics from all Nutanix clusters."""
Expand Down Expand Up @@ -167,6 +161,9 @@ def collect_cluster_metrics(self) -> None:

self._process_hosts(cluster, vm_stats, cluster_name, pc_label)

# Add capacity from VMs without a host assignment
self._cluster_capacity.add_vm(self._hostless_vm_vcpus, self._hostless_vm_memory_bytes)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Scope hostless VM capacity to current cluster

The new hostless accumulation is global across all VMs (_build_vms_by_host_cache sums into _hostless_vm_* once) but then those totals are added inside each cluster loop, so every processed cluster receives the same hostless VM capacity. In environments with multiple clusters (or cluster filters), this overcounts capacity and can attribute hostless VMs from excluded/other clusters to the current cluster. Please bucket hostless capacity by vm.cluster.extId (or compute per-cluster) before adding it.

Useful? React with 👍 / 👎.


# Report cluster capacity metrics (aggregated from hosts and VMs)
cluster_tags = self.check.base_tags + self._extract_cluster_tags(cluster)
self._report_cluster_capacity_metrics(cluster_tags)
Expand Down Expand Up @@ -200,20 +197,22 @@ def _process_cluster(self, cluster: dict, pc_label: str) -> None:
self._report_cluster_basic_metrics(cluster, cluster_tags)
self._report_cluster_stats(cluster_name, cluster_id, cluster_tags, pc_label)

def _process_vm(self, vm: dict, vm_stats_dict: dict, cluster_name: str, pc_label: str) -> None:
"""Process and report metrics for a single VM."""
def _process_vm(self, vm: dict, vm_stats_dict: dict, cluster_name: str, pc_label: str) -> bool:
"""Process and report metrics for a single VM. Returns True if the VM was collected."""
vm_id = vm.get("extId")
hostname = vm.get("name")
if not vm_id or not hostname:
self.check.log.debug("[%s][%s] Skipping VM missing extId or name: %r", pc_label, cluster_name, vm)
return
return False
if not self._should_collect_vm(vm):
return
self._accumulate_vm_capacity(vm)
return False
vm_tags = self.check.base_tags + self._extract_vm_tags(vm)

self._set_external_tags_for_host(hostname, vm_tags)
self._report_vm_basic_metrics(vm, hostname, vm_tags)
self._report_vm_stats(vm_id, hostname, vm_tags, vm_stats_dict, cluster_name, pc_label)
return True

def _report_vm_basic_metrics(self, vm: dict, hostname: str, vm_tags: list[str]) -> None:
"""Report basic VM metrics (counts and status)."""
Expand All @@ -232,10 +231,8 @@ def _report_vm_capacity_metrics(self, vm: dict, hostname: str, vm_tags: list[str
num_sockets = int(vm.get("numSockets") or 0)
num_cores_per_socket = int(vm.get("numCoresPerSocket") or 0)
num_threads_per_core = int(vm.get("numThreadsPerCore") or 0)
memory_bytes = int(vm.get("memorySizeBytes") or 0)

# Total vCPUs = sockets * cores_per_socket
vcpus_allocated = num_sockets * num_cores_per_socket
memory_bytes = int(vm.get("memorySizeBytes") or 0)

self.check.gauge("vm.cpu.sockets", num_sockets, hostname=hostname, tags=vm_tags)
self.check.gauge("vm.cpu.cores_per_socket", num_cores_per_socket, hostname=hostname, tags=vm_tags)
Expand All @@ -246,6 +243,13 @@ def _report_vm_capacity_metrics(self, vm: dict, hostname: str, vm_tags: list[str
# Accumulate for cluster totals
self._cluster_capacity.add_vm(vcpus_allocated, memory_bytes)

def _accumulate_vm_capacity(self, vm: dict) -> None:
    """Fold this VM's allocated vCPUs and memory into the cluster capacity totals.

    Unlike _report_vm_capacity_metrics, nothing is submitted per-VM here —
    the values only feed the cluster-level aggregation via
    self._cluster_capacity.add_vm().
    """
    # Missing/None fields count as zero; total vCPUs = sockets * cores per socket.
    sockets = int(vm.get("numSockets") or 0)
    cores_per_socket = int(vm.get("numCoresPerSocket") or 0)
    mem_bytes = int(vm.get("memorySizeBytes") or 0)
    self._cluster_capacity.add_vm(sockets * cores_per_socket, mem_bytes)

def _report_cluster_basic_metrics(self, cluster: dict, cluster_tags: list[str]) -> None:
"""Report basic cluster metrics (counts)."""
nbr_nodes = int(get_nested(cluster, "nodes/numberOfNodes") or 0)
Expand Down Expand Up @@ -273,15 +277,13 @@ def _report_stats(
metrics_map: dict[str, str],
tags: list[str],
hostname: str | None = None,
entity_type: EntityType | None = None,
) -> None:
"""Submit stats metrics for any entity type."""
if not stats:
self.check.log.warning("No stats returned for %s", entity_name)
return

is_list = isinstance(stats, list)
metrics_submitted = 0

for key, metric_name in metrics_map.items():
entries = stats if is_list else stats.get(key, [])
Expand All @@ -290,15 +292,6 @@ def _report_stats(
value = get_nested(entry, value_field)
if value is not None:
self.check.gauge(metric_name, value, hostname=hostname, tags=tags)
metrics_submitted += 1

# Track metrics by type
if entity_type == 'cluster':
self.cluster_metrics_count += metrics_submitted
elif entity_type == 'host':
self.host_metrics_count += metrics_submitted
elif entity_type == 'vm':
self.vm_metrics_count += metrics_submitted

def _report_cluster_stats(self, cluster_name: str, cluster_id: str, cluster_tags: list[str], pc_label: str) -> None:
"""Report time-series stats for a cluster."""
Expand All @@ -308,7 +301,6 @@ def _report_cluster_stats(self, cluster_name: str, cluster_id: str, cluster_tags
stats,
CLUSTER_STATS_METRICS,
cluster_tags,
entity_type="cluster",
)

def _report_vm_stats(
Expand All @@ -323,7 +315,6 @@ def _report_vm_stats(
VM_STATS_METRICS,
vm_tags,
hostname=hostname,
entity_type="vm",
)

def _process_hosts(self, cluster: dict, cluster_vm_stats_dict: dict, cluster_name: str, pc_label: str) -> None:
Expand All @@ -334,14 +325,15 @@ def _process_hosts(self, cluster: dict, cluster_vm_stats_dict: dict, cluster_nam
hosts = self._list_hosts_by_cluster(cluster_id)
self.check.log.info("[%s][%s] Processing %d hosts", pc_label, cluster_name, len(hosts))

total_vms = sum(
hosts_before, vms_before = self.host_count, self.vm_count
for host in hosts:
self._process_single_host(host, cluster_id, cluster_tags, cluster_vm_stats_dict, cluster_name, pc_label)
for host in hosts
)

self.host_count += len(hosts)
self.vm_count += total_vms
self.check.log.info("[%s][%s] Processed %d hosts and %d VMs", pc_label, cluster_name, len(hosts), total_vms)
hosts_collected = self.host_count - hosts_before
vms_collected = self.vm_count - vms_before
self.check.log.info(
"[%s][%s] Processed %d hosts and %d VMs", pc_label, cluster_name, hosts_collected, vms_collected
)

def _process_single_host(
self,
Expand All @@ -351,17 +343,19 @@ def _process_single_host(
cluster_vm_stats_dict: dict,
cluster_name: str,
pc_label: str,
) -> int:
"""Process a single host and its VMs, returning number of VMs processed."""
) -> None:
"""Process a single host and its VMs."""
host_id = host.get("extId")
host_name = host.get("hostName")

if not host_id:
self.check.log.warning("[%s][%s] Host %s has no extId, skipping", pc_label, cluster_name, host_name)
return 0
return

if not should_collect_resource("host", host, self.check.resource_filters, self.check.log):
return 0
return

self.host_count += 1

# Cache host name for VM tagging
if host_name:
Expand All @@ -385,7 +379,6 @@ def _process_single_host(
HOST_STATS_METRICS,
host_tags,
hostname=host_name,
entity_type="host",
)
except Exception:
self.check.log.exception("[%s][%s] Failed to fetch stats for host %s", pc_label, cluster_name, host_name)
Expand All @@ -398,14 +391,13 @@ def _process_single_host(
vms = self._list_vms(host_id)
except Exception:
self.check.log.exception("[%s][%s] Failed to list VMs for host %s", pc_label, cluster_name, host_name)
return 0
return

self.check.log.debug("[%s][%s] Host %s has %d VMs", pc_label, cluster_name, host_name, len(vms))

for vm in vms:
self._process_vm(vm, cluster_vm_stats_dict, cluster_name, pc_label)

return len(vms)
if self._process_vm(vm, cluster_vm_stats_dict, cluster_name, pc_label):
self.vm_count += 1

def _report_host_capacity_metrics(self, host: dict, hostname: str, host_tags: list[str]) -> None:
"""Report host capacity metrics (CPU sockets, cores, threads, memory)."""
Expand Down Expand Up @@ -522,11 +514,16 @@ def _should_collect_vm(self, vm: dict) -> bool:
return True

def _build_vms_by_host_cache(self) -> None:
"""Fetch all VMs and group them by host, applying filters."""
"""Fetch all VMs and group them by host."""
for vm in self._list_vms():
host_id = get_nested(vm, "host/extId")
if host_id and self._should_collect_vm(vm):
if host_id:
self._vms_by_host.setdefault(host_id, []).append(vm)
else:
num_sockets = int(vm.get("numSockets") or 0)
num_cores_per_socket = int(vm.get("numCoresPerSocket") or 0)
self._hostless_vm_vcpus += num_sockets * num_cores_per_socket
self._hostless_vm_memory_bytes += int(vm.get("memorySizeBytes") or 0)

def _list_clusters(self) -> list[dict]:
"""Fetch all clusters from Prism Central."""
Expand Down
82 changes: 82 additions & 0 deletions nutanix/tests/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# (C) Datadog, Inc. 2026-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

PCVM_NAME = "NTNX-10-0-0-165-PCVM-1767014640"
UBUNTU_VM_NAME = "ubuntu-vm"
RANDOM_VM_NAME = "random-vm"
OFF_VM_NAME = "test-vm-that-should-remain-off"
HOST_NAME = "10-0-0-103-aws-us-east-1a"

BASE_TAGS = ['nutanix', 'prism_central:10.0.0.197']

CLUSTER_TAGS = [
'Team:agent-integrations',
'cluster_category:cluster_value1',
'cluster_category:cluster_value2',
'cluster_category:cluster_value3',
'ntnx_cluster_name:datadog-nutanix-dev',
'nutanix',
'prism_central:10.0.0.197',
]

HOST_TAGS = [
'Team:agent-integrations',
'cluster_category:cluster_value1',
'cluster_category:cluster_value2',
'cluster_category:cluster_value3',
'ntnx_cluster_name:datadog-nutanix-dev',
'ntnx_host_name:10-0-0-103-aws-us-east-1a',
'ntnx_host_type:HYPER_CONVERGED',
'ntnx_hypervisor_name:AHV 10.3',
'ntnx_hypervisor_type:AHV',
'ntnx_type:host',
'nutanix',
'prism_central:10.0.0.197',
]

PCVM_TAGS = [
'ntnx_cluster_name:datadog-nutanix-dev',
'ntnx_host_name:10-0-0-103-aws-us-east-1a',
'ntnx_is_agent_vm:False',
'ntnx_type:vm',
'ntnx_vm_name:NTNX-10-0-0-165-PCVM-1767014640',
'nutanix',
'prism_central:10.0.0.197',
]

UBUNTU_VM_TAGS = [
'Team:agent-integrations',
'ntnx_cluster_name:datadog-nutanix-dev',
'ntnx_host_name:10-0-0-103-aws-us-east-1a',
'ntnx_is_agent_vm:False',
'ntnx_type:vm',
'ntnx_vm_name:ubuntu-vm',
'nutanix',
'prism_central:10.0.0.197',
'vm_category:vm_value2',
]

RANDOM_VM_TAGS = [
'Team:agent-integrations',
'Team:platform-integrations',
'ntnx_cluster_name:datadog-nutanix-dev',
'ntnx_host_name:10-0-0-103-aws-us-east-1a',
'ntnx_is_agent_vm:False',
'ntnx_type:vm',
'ntnx_vm_name:random-vm',
'nutanix',
'prism_central:10.0.0.197',
'vm_category:vm_value1',
]

OFF_VM_TAGS = [
'ntnx_cluster_name:datadog-nutanix-dev',
# 'ntnx_host_name:None',
'ntnx_is_agent_vm:False',
'ntnx_type:vm',
'ntnx_vm_name:test-vm-that-should-remain-off',
'nutanix',
'prism_central:10.0.0.197',
'vm_category:vm_value3',
]
Loading
Loading