Skip to content
Merged
77 changes: 77 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,83 @@ func (s *ApiService) GetInstance(ctx context.Context, request oapi.GetInstanceRe
return oapi.GetInstance200JSONResponse(instanceToOAPI(*inst)), nil
}

// GetInstanceStats returns resource utilization statistics for an instance
// The id parameter can be an instance ID, name, or ID prefix
// Note: Resolution is handled by ResolveResource middleware
func (s *ApiService) GetInstanceStats(ctx context.Context, request oapi.GetInstanceStatsRequestObject) (oapi.GetInstanceStatsResponseObject, error) {
log := logger.FromContext(ctx)
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.GetInstanceStats500JSONResponse{
Code: "internal_error",
Message: "resource not resolved",
}, nil
}

// Build stats response
stats := oapi.InstanceStats{
InstanceId: inst.Id,
InstanceName: inst.Name,
AllocatedVcpus: inst.Vcpus,
AllocatedMemoryBytes: inst.Size + inst.HotplugSize,
}

// Read /proc stats if we have a hypervisor PID
if inst.HypervisorPID != nil {
pid := *inst.HypervisorPID

// Read CPU from /proc/<pid>/stat
cpuUsec, err := resources.ReadProcStat(pid)
if err != nil {
log.DebugContext(ctx, "failed to read proc stat", "pid", pid, "error", err)
} else {
stats.CpuSeconds = float64(cpuUsec) / 1_000_000.0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems like too much logic happening in the API handler, api handler ought to just translate from domain types (e.g. lib/utilization/types.go) into API types and other handler-level concerns like error mapping.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also it would be nice if moved to new lib/ directory to get a README explaining the feature, similar to other features in the repo

}

// Read memory from /proc/<pid>/statm
rssBytes, vmsBytes, err := resources.ReadProcStatm(pid)
if err != nil {
log.DebugContext(ctx, "failed to read proc statm", "pid", pid, "error", err)
} else {
stats.MemoryRssBytes = int64(rssBytes)
stats.MemoryVmsBytes = int64(vmsBytes)

// Compute utilization ratio
if stats.AllocatedMemoryBytes > 0 {
ratio := float64(rssBytes) / float64(stats.AllocatedMemoryBytes)
stats.MemoryUtilizationRatio = &ratio
}
}
}

// Read TAP stats if network is enabled
if inst.NetworkEnabled {
tapName := generateTAPName(inst.Id)
rxBytes, txBytes, err := resources.ReadTAPStats(tapName)
if err != nil {
log.DebugContext(ctx, "failed to read TAP stats", "tap", tapName, "error", err)
} else {
stats.NetworkRxBytes = int64(rxBytes)
stats.NetworkTxBytes = int64(txBytes)
}
}

return oapi.GetInstanceStats200JSONResponse(stats), nil
}

// generateTAPName derives the TAP device name for an instance: "hype-"
// followed by the first 8 bytes of the instance ID, lowercased. The result is
// at most 13 bytes, inside the 15-character Linux interface name limit
// (IFNAMSIZ - 1).
//
// This has to produce the same name as the TAP-name derivation in
// lib/instances (which states that it matches network/allocate.go); a
// different truncation length or letter case makes the stats lookup target a
// device that does not exist.
//
// Lowercasing is byte-wise and covers ASCII letters only.
// NOTE(review): assumes instance IDs are ASCII — confirm against the ID generator.
//
// TODO(review): reuse the existing code that determines the TAP name instead
// of keeping a private copy in the API package.
func generateTAPName(instanceID string) string {
	const (
		prefix   = "hype-"
		maxIDLen = 8 // bytes of the instance ID kept in the device name
	)

	idPart := instanceID
	if len(idPart) > maxIDLen {
		idPart = idPart[:maxIDLen]
	}

	name := make([]byte, 0, len(prefix)+len(idPart))
	name = append(name, prefix...)
	for i := 0; i < len(idPart); i++ {
		c := idPart[i]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		name = append(name, c)
	}
	return string(name)
}

// DeleteInstance stops and deletes an instance
// The id parameter can be an instance ID, name, or ID prefix
// Note: Resolution is handled by ResolveResource middleware
Expand Down
106 changes: 106 additions & 0 deletions dashboards/hypeman.json
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,112 @@
],
"title": "Exec Sessions & Duration",
"type": "timeseries"
},
{
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 },
"id": 18,
"title": "VM Resource Utilization",
"type": "row"
},
{
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 },
"id": 19,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "rate(hypeman_vm_cpu_seconds_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[1m])",
"legendFormat": "{{instance_name}}",
"refId": "A"
}
],
"title": "VM CPU Usage (cores)",
"type": "timeseries",
"fieldConfig": {
"defaults": { "unit": "short", "min": 0 },
"overrides": []
}
},
{
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 },
"id": 20,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "hypeman_vm_memory_rss_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}",
"legendFormat": "{{instance_name}} RSS",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "hypeman_vm_memory_vms_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}",
"legendFormat": "{{instance_name}} VMS",
"refId": "B"
}
],
"title": "VM Memory Usage (RSS & VMS)",
"type": "timeseries",
"fieldConfig": {
"defaults": { "unit": "bytes", "min": 0 },
"overrides": []
}
},
{
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 52 },
"id": 21,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "rate(hypeman_vm_network_rx_bytes_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[1m])",
"legendFormat": "{{instance_name}} RX",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "rate(hypeman_vm_network_tx_bytes_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[1m])",
"legendFormat": "{{instance_name}} TX",
"refId": "B"
}
],
"title": "VM Network I/O",
"type": "timeseries",
"fieldConfig": {
"defaults": { "unit": "Bps", "min": 0 },
"overrides": []
}
},
{
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 60 },
"id": 23,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "hypeman_vm_memory_utilization_ratio{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}",
"legendFormat": "{{instance_name}}",
"refId": "A"
}
],
"title": "VM Memory Utilization (% of allocated)",
"type": "timeseries",
"fieldConfig": {
"defaults": { "unit": "percentunit", "min": 0, "max": 1 },
"overrides": []
}
}
],
"refresh": "10s",
Expand Down
4 changes: 4 additions & 0 deletions lib/builds/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ func (m *mockInstanceManager) ListInstanceAllocations(ctx context.Context) ([]re
return nil, nil
}

// ListRunningInstancesInfo is a test stub: it reports no running instances
// and never returns an error.
func (m *mockInstanceManager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) {
	var none []resources.InstanceUtilizationInfo
	return none, nil
}

// mockVolumeManager implements volumes.Manager for testing
type mockVolumeManager struct {
volumes map[string]*volumes.Volume
Expand Down
51 changes: 51 additions & 0 deletions lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package instances
import (
"context"
"fmt"
"strings"
"sync"

"github.com/kernel/hypeman/lib/devices"
Expand Down Expand Up @@ -38,6 +39,9 @@ type Manager interface {
// ListInstanceAllocations returns resource allocations for all instances.
// Used by the resource manager for capacity tracking.
ListInstanceAllocations(ctx context.Context) ([]resources.InstanceAllocation, error)
// ListRunningInstancesInfo returns info needed for utilization metrics collection.
// Used by the resource manager for VM utilization tracking.
ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error)
}

// ResourceLimits contains configurable resource limits for instances
Expand Down Expand Up @@ -328,3 +332,50 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst

return allocations, nil
}

// ListRunningInstancesInfo builds the per-instance records used for
// utilization metrics collection. Only instances in StateRunning are
// returned; each record carries the instance ID and name, the hypervisor PID,
// the allocated vCPUs and memory (Size + HotplugSize), and, when networking
// is enabled, the derived TAP device name.
//
// An error from listing instances is returned unchanged with a nil slice.
func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) {
	all, err := m.listInstances(ctx)
	if err != nil {
		return nil, err
	}

	running := make([]resources.InstanceUtilizationInfo, 0, len(all))
	for _, inst := range all {
		// Anything that is not running is left out of the result.
		if inst.State != StateRunning {
			continue
		}

		running = append(running, resources.InstanceUtilizationInfo{
			ID:            inst.Id,
			Name:          inst.Name,
			HypervisorPID: inst.HypervisorPID,
			// Allocation figures let the consumer compute utilization ratios.
			AllocatedVcpus:       inst.Vcpus,
			AllocatedMemoryBytes: inst.Size + inst.HotplugSize,
		})

		// The TAP device name is filled in only for networked instances.
		if inst.NetworkEnabled {
			running[len(running)-1].TAPDevice = generateTAPName(inst.Id)
		}
	}

	return running, nil
}

// generateTAPName derives the TAP device name for an instance: "hype-"
// followed by the first 8 bytes of the instance ID, lowercased. The result is
// at most 13 bytes, which fits the 15-character Linux interface name limit.
// This is intended to match the logic in network/allocate.go.
//
// TODO(review): reuse the existing code that determines the TAP name instead
// of duplicating it here.
func generateTAPName(instanceID string) string {
	const (
		tapPrefix = "hype-"
		idChars   = 8
	)

	if len(instanceID) <= idChars {
		return tapPrefix + strings.ToLower(instanceID)
	}
	return tapPrefix + strings.ToLower(instanceID[:idChars])
}
36 changes: 36 additions & 0 deletions lib/instances/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1001,3 +1001,39 @@ func (r *testInstanceResolver) ResolveInstance(ctx context.Context, nameOrID str
// For tests, just return nameOrID as both name and id
return nameOrID, nameOrID, nil
}

func TestGenerateTAPName(t *testing.T) {
tests := []struct {
name string
instanceID string
expected string
}{
{
name: "standard ID",
instanceID: "01HQVX7ABC123DEF456",
expected: "hype-01hqvx7a",
},
{
name: "short ID",
instanceID: "ABC123",
expected: "hype-abc123",
},
{
name: "exact 8 chars",
instanceID: "ABCDEFGH",
expected: "hype-abcdefgh",
},
{
name: "mixed case",
instanceID: "AbCdEfGhIjKl",
expected: "hype-abcdefgh",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := generateTAPName(tt.instanceID)
assert.Equal(t, tt.expected, result)
})
}
}
Loading