kernel · hiroTamada · Jan 23, 2026 · Jan 22, 2026 · Jan 22, 2026 · Jan 22, 2026
diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go
@@ -274,6 +274,83 @@ func (s *ApiService) GetInstance(ctx context.Context, request oapi.GetInstanceRe
 	return oapi.GetInstance200JSONResponse(instanceToOAPI(*inst)), nil
 }
 
+// GetInstanceStats returns allocated and measured resource usage for one instance.
+// The id parameter can be an instance ID, name, or ID prefix; resolution is done by
+// the ResolveResource middleware, and the resolved instance is read back from ctx.
+// Failed /proc or TAP reads are logged at debug level and do not fail the request.
+func (s *ApiService) GetInstanceStats(ctx context.Context, request oapi.GetInstanceStatsRequestObject) (oapi.GetInstanceStatsResponseObject, error) {
+	logr := logger.FromContext(ctx)
+	vm := mw.GetResolvedInstance[instances.Instance](ctx)
+	if vm == nil {
+		return oapi.GetInstanceStats500JSONResponse{
+			Code:    "internal_error",
+			Message: "resource not resolved",
+		}, nil
+	}
+
+	// Start from the allocation figures on the instance record; everything measured
+	// below is optional and simply stays at its zero value when it cannot be read.
+	var out oapi.InstanceStats
+	out.InstanceId = vm.Id
+	out.InstanceName = vm.Name
+	out.AllocatedVcpus = vm.Vcpus
+	out.AllocatedMemoryBytes = vm.Size + vm.HotplugSize
+
+	// Per-process figures need a hypervisor PID; without one the CPU and memory
+	// fields are left untouched.
+	if vm.HypervisorPID != nil {
+		pid := *vm.HypervisorPID
+
+		// CPU time, read from /proc/<pid>/stat and converted from microseconds to seconds.
+		if usec, err := resources.ReadProcStat(pid); err == nil {
+			out.CpuSeconds = float64(usec) / 1_000_000.0
+		} else {
+			logr.DebugContext(ctx, "failed to read proc stat", "pid", pid, "error", err)
+		}
+
+		// Resident and virtual memory sizes, read from /proc/<pid>/statm.
+		if rss, vms, err := resources.ReadProcStatm(pid); err == nil {
+			out.MemoryRssBytes = int64(rss)
+			out.MemoryVmsBytes = int64(vms)
+
+			// Ratio of RSS to allocated memory; skipped when nothing is allocated.
+			if alloc := out.AllocatedMemoryBytes; alloc > 0 {
+				ratio := float64(rss) / float64(alloc)
+				out.MemoryUtilizationRatio = &ratio
+			}
+		} else {
+			logr.DebugContext(ctx, "failed to read proc statm", "pid", pid, "error", err)
+		}
+	}
+
+	// Network counters are read from the instance's TAP device when networking is enabled.
+	if vm.NetworkEnabled {
+		tapName := generateTAPName(vm.Id)
+		rx, tx, err := resources.ReadTAPStats(tapName)
+		if err != nil {
+			logr.DebugContext(ctx, "failed to read TAP stats", "tap", tapName, "error", err)
+		} else {
+			out.NetworkRxBytes = int64(rx)
+			out.NetworkTxBytes = int64(tx)
+		}
+	}
+
+	return oapi.GetInstanceStats200JSONResponse(out), nil
+}
+
+// generateTAPName returns "hype-" + the first 8 bytes of instanceID with ASCII A-Z lower-cased (same scheme as lib/instances).
+func generateTAPName(instanceID string) string {
+	name := []byte("hype-") // 5 + 8 bytes stays within the 15-char (IFNAMSIZ - 1) name limit
+	for i := 0; i < len(instanceID) && i < 8; i++ {
+		c := instanceID[i]
+		if 'A' <= c && c <= 'Z' {
+			c += 'a' - 'A' // ASCII-only fold; instance IDs are assumed to be ASCII — TODO confirm
+		}
+		name = append(name, c)
+	}
+	return string(name)
+}
+
 // DeleteInstance stops and deletes an instance
 // The id parameter can be an instance ID, name, or ID prefix
 // Note: Resolution is handled by ResolveResource middleware

diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json
@@ -417,6 +417,112 @@
       ],
       "title": "Exec Sessions & Duration",
       "type": "timeseries"
+    },
+    {
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 },
+      "id": 18,
+      "title": "VM Resource Utilization",
+      "type": "row"
+    },
+    {
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 },
+      "id": 19,
+      "options": {
+        "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "desc" }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "rate(hypeman_vm_cpu_seconds_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[1m])",
+          "legendFormat": "{{instance_name}}",
+          "refId": "A"
+        }
+      ],
+      "title": "VM CPU Usage (cores)",
+      "type": "timeseries",
+      "fieldConfig": {
+        "defaults": { "unit": "short", "min": 0 },
+        "overrides": []
+      }
+    },
+    {
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 },
+      "id": 20,
+      "options": {
+        "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "desc" }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "hypeman_vm_memory_rss_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}",
+          "legendFormat": "{{instance_name}} RSS",
+          "refId": "A"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "hypeman_vm_memory_vms_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}",
+          "legendFormat": "{{instance_name}} VMS",
+          "refId": "B"
+        }
+      ],
+      "title": "VM Memory Usage (RSS & VMS)",
+      "type": "timeseries",
+      "fieldConfig": {
+        "defaults": { "unit": "bytes", "min": 0 },
+        "overrides": []
+      }
+    },
+    {
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 52 },
+      "id": 21,
+      "options": {
+        "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "desc" }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "rate(hypeman_vm_network_rx_bytes_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[1m])",
+          "legendFormat": "{{instance_name}} RX",
+          "refId": "A"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "rate(hypeman_vm_network_tx_bytes_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[1m])",
+          "legendFormat": "{{instance_name}} TX",
+          "refId": "B"
+        }
+      ],
+      "title": "VM Network I/O",
+      "type": "timeseries",
+      "fieldConfig": {
+        "defaults": { "unit": "Bps", "min": 0 },
+        "overrides": []
+      }
+    },
+    {
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 60 },
+      "id": 23,
+      "options": {
+        "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right" },
+        "tooltip": { "mode": "multi", "sort": "desc" }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "hypeman_vm_memory_utilization_ratio{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}",
+          "legendFormat": "{{instance_name}}",
+          "refId": "A"
+        }
+      ],
+      "title": "VM Memory Utilization (% of allocated)",
+      "type": "timeseries",
+      "fieldConfig": {
+        "defaults": { "unit": "percentunit", "min": 0, "max": 1 },
+        "overrides": []
+      }
     }
   ],
   "refresh": "10s",

diff --git a/lib/builds/manager_test.go b/lib/builds/manager_test.go
@@ -122,6 +122,10 @@ func (m *mockInstanceManager) ListInstanceAllocations(ctx context.Context) ([]re
 	return nil, nil
 }
 
+func (m *mockInstanceManager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) {
+	return nil, nil // stub: reports no running instances and never returns an error
+}
+
 // mockVolumeManager implements volumes.Manager for testing
 type mockVolumeManager struct {
 	volumes               map[string]*volumes.Volume

diff --git a/lib/instances/manager.go b/lib/instances/manager.go
@@ -3,6 +3,7 @@ package instances
 import (
 	"context"
 	"fmt"
+	"strings"
 	"sync"
 
 	"github.com/kernel/hypeman/lib/devices"
@@ -38,6 +39,9 @@ type Manager interface {
 	// ListInstanceAllocations returns resource allocations for all instances.
 	// Used by the resource manager for capacity tracking.
 	ListInstanceAllocations(ctx context.Context) ([]resources.InstanceAllocation, error)
+	// ListRunningInstancesInfo returns info needed for utilization metrics collection.
+	// Used by the resource manager for VM utilization tracking.
+	ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error)
 }
 
 // ResourceLimits contains configurable resource limits for instances
@@ -328,3 +332,50 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst
 
 	return allocations, nil
 }
+
+// ListRunningInstancesInfo gathers, for each instance in StateRunning, the identifiers and
+// allocation figures that the resource manager uses for VM utilization tracking.
+func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) {
+	all, err := m.listInstances(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	// Capacity covers the worst case where every listed instance is running.
+	running := make([]resources.InstanceUtilizationInfo, 0, len(all))
+	for _, inst := range all {
+		// Instances in any other state are skipped.
+		if inst.State != StateRunning {
+			continue
+		}
+
+		var entry resources.InstanceUtilizationInfo
+		entry.ID = inst.Id
+		entry.Name = inst.Name
+		entry.HypervisorPID = inst.HypervisorPID
+
+		// Allocated resources feed the utilization ratio calculations.
+		entry.AllocatedVcpus = inst.Vcpus
+		entry.AllocatedMemoryBytes = inst.Size + inst.HotplugSize
+
+		// The TAP device name is only filled in when networking is enabled.
+		if inst.NetworkEnabled {
+			entry.TAPDevice = generateTAPName(inst.Id)
+		}
+		running = append(running, entry)
+	}
+
+	return running, nil
+}
+
+// generateTAPName derives the TAP device name for an instance: "hype-" followed by
+// the first 8 bytes of the instance ID, lower-cased. The result is at most 13
+// characters, which fits the 15-char Linux interface name limit.
+// This is intended to match the logic in network/allocate.go.
+func generateTAPName(instanceID string) string {
+	const idLen = 8
+	if len(instanceID) <= idLen {
+		return "hype-" + strings.ToLower(instanceID)
+	}
+	return "hype-" + strings.ToLower(instanceID[:idLen])
+}
diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go
@@ -1001,3 +1001,39 @@ func (r *testInstanceResolver) ResolveInstance(ctx context.Context, nameOrID str
 	// For tests, just return nameOrID as both name and id
 	return nameOrID, nameOrID, nil
 }
+
+func TestGenerateTAPName(t *testing.T) {
+	// Each case maps an instance ID to the TAP device name expected for it.
+	cases := []struct{ label, id, want string }{
+		{
+			// Longer than 8 characters: truncated and lower-cased.
+			label: "standard ID",
+			id:    "01HQVX7ABC123DEF456",
+			want:  "hype-01hqvx7a",
+		},
+		{
+			// Shorter than 8 characters: used in full.
+			label: "short ID",
+			id:    "ABC123",
+			want:  "hype-abc123",
+		},
+		{
+			// Exactly 8 characters: nothing is cut off.
+			label: "exact 8 chars",
+			id:    "ABCDEFGH",
+			want:  "hype-abcdefgh",
+		},
+		{
+			// Mixed case input: truncated, and every letter lower-cased.
+			label: "mixed case",
+			id:    "AbCdEfGhIjKl",
+			want:  "hype-abcdefgh",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.label, func(t *testing.T) {
+			assert.Equal(t, tc.want, generateTAPName(tc.id))
+		})
+	}
+}