-
Notifications
You must be signed in to change notification settings - Fork 0
feat(metrics): add per-VM resource utilization metrics #67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+1,888
−212
Merged
Changes from 4 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
44ecfae
feat(metrics): add per-VM resource utilization metrics
hiroTamada 157b8fc
refactor: use OpenAPI spec for stats endpoint
hiroTamada 58fc604
fix: add ListRunningInstancesInfo to mock in builds tests
hiroTamada cafebe8
fix: use os.Getpagesize() for correct memory metrics on ARM
hiroTamada 125d603
refactor: move VM metrics to dedicated lib/vm_metrics package
hiroTamada 6b3d6fc
Merge branch 'main' into feat/vm-utilization-metrics
hiroTamada 20dc8ee
fix: swap TAP rx/tx for correct VM perspective
hiroTamada 01f8240
docs: clarify USER_HZ vs CONFIG_HZ for clock tick rate
hiroTamada File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -274,6 +274,83 @@ func (s *ApiService) GetInstance(ctx context.Context, request oapi.GetInstanceRe | |
| return oapi.GetInstance200JSONResponse(instanceToOAPI(*inst)), nil | ||
| } | ||
|
|
||
| // GetInstanceStats returns resource utilization statistics for an instance | ||
| // The id parameter can be an instance ID, name, or ID prefix | ||
| // Note: Resolution is handled by ResolveResource middleware | ||
| func (s *ApiService) GetInstanceStats(ctx context.Context, request oapi.GetInstanceStatsRequestObject) (oapi.GetInstanceStatsResponseObject, error) { | ||
| log := logger.FromContext(ctx) | ||
| inst := mw.GetResolvedInstance[instances.Instance](ctx) | ||
| if inst == nil { | ||
| return oapi.GetInstanceStats500JSONResponse{ | ||
| Code: "internal_error", | ||
| Message: "resource not resolved", | ||
| }, nil | ||
| } | ||
|
|
||
| // Build stats response | ||
| stats := oapi.InstanceStats{ | ||
| InstanceId: inst.Id, | ||
| InstanceName: inst.Name, | ||
| AllocatedVcpus: inst.Vcpus, | ||
| AllocatedMemoryBytes: inst.Size + inst.HotplugSize, | ||
| } | ||
|
|
||
| // Read /proc stats if we have a hypervisor PID | ||
| if inst.HypervisorPID != nil { | ||
| pid := *inst.HypervisorPID | ||
|
|
||
| // Read CPU from /proc/<pid>/stat | ||
| cpuUsec, err := resources.ReadProcStat(pid) | ||
| if err != nil { | ||
| log.DebugContext(ctx, "failed to read proc stat", "pid", pid, "error", err) | ||
| } else { | ||
| stats.CpuSeconds = float64(cpuUsec) / 1_000_000.0 | ||
| } | ||
|
|
||
| // Read memory from /proc/<pid>/statm | ||
| rssBytes, vmsBytes, err := resources.ReadProcStatm(pid) | ||
| if err != nil { | ||
| log.DebugContext(ctx, "failed to read proc statm", "pid", pid, "error", err) | ||
| } else { | ||
| stats.MemoryRssBytes = int64(rssBytes) | ||
| stats.MemoryVmsBytes = int64(vmsBytes) | ||
|
|
||
| // Compute utilization ratio | ||
| if stats.AllocatedMemoryBytes > 0 { | ||
| ratio := float64(rssBytes) / float64(stats.AllocatedMemoryBytes) | ||
| stats.MemoryUtilizationRatio = &ratio | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Read TAP stats if network is enabled | ||
| if inst.NetworkEnabled { | ||
| tapName := generateTAPName(inst.Id) | ||
| rxBytes, txBytes, err := resources.ReadTAPStats(tapName) | ||
| if err != nil { | ||
| log.DebugContext(ctx, "failed to read TAP stats", "tap", tapName, "error", err) | ||
| } else { | ||
| stats.NetworkRxBytes = int64(rxBytes) | ||
| stats.NetworkTxBytes = int64(txBytes) | ||
| } | ||
| } | ||
|
|
||
| return oapi.GetInstanceStats200JSONResponse(stats), nil | ||
| } | ||
|
|
||
// generateTAPName derives the TAP device name for an instance ID.
//
// The name is "hype-" followed by at most the first 8 bytes of the instance
// ID, with ASCII letters lowercased. This is the same format produced by the
// instances package's generateTAPName, which documents itself as matching
// network/allocate.go; using a different length or case here would make the
// stats handler look up a device name the rest of the system never created.
// The result is at most 13 bytes, within the 15-char limit (IFNAMSIZ - 1).
func generateTAPName(instanceID string) string {
	const (
		prefix     = "hype-"
		shortIDLen = 8
	)

	shortID := instanceID
	if len(shortID) > shortIDLen {
		shortID = shortID[:shortIDLen]
	}

	// Lowercase by hand (ASCII only) so this block does not depend on an
	// import that may not be present in this file. Instance IDs are
	// presumably ASCII — verify against the ID generator.
	name := make([]byte, 0, len(prefix)+len(shortID))
	name = append(name, prefix...)
	for i := 0; i < len(shortID); i++ {
		c := shortID[i]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		name = append(name, c)
	}
	return string(name)
}
sjmiller609 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // DeleteInstance stops and deletes an instance | ||
| // The id parameter can be an instance ID, name, or ID prefix | ||
| // Note: Resolution is handled by ResolveResource middleware | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ package instances | |
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strings" | ||
| "sync" | ||
|
|
||
| "github.com/kernel/hypeman/lib/devices" | ||
|
|
@@ -38,6 +39,9 @@ type Manager interface { | |
| // ListInstanceAllocations returns resource allocations for all instances. | ||
| // Used by the resource manager for capacity tracking. | ||
| ListInstanceAllocations(ctx context.Context) ([]resources.InstanceAllocation, error) | ||
| // ListRunningInstancesInfo returns info needed for utilization metrics collection. | ||
| // Used by the resource manager for VM utilization tracking. | ||
| ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) | ||
sjmiller609 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // ResourceLimits contains configurable resource limits for instances | ||
|
|
@@ -328,3 +332,50 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst | |
|
|
||
| return allocations, nil | ||
| } | ||
|
|
||
| // ListRunningInstancesInfo returns info needed for utilization metrics collection. | ||
| // Used by the resource manager for VM utilization tracking. | ||
| func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) { | ||
| instances, err := m.listInstances(ctx) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| infos := make([]resources.InstanceUtilizationInfo, 0, len(instances)) | ||
| for _, inst := range instances { | ||
| // Only include running instances (they have a hypervisor process) | ||
| if inst.State != StateRunning { | ||
| continue | ||
| } | ||
|
|
||
| info := resources.InstanceUtilizationInfo{ | ||
| ID: inst.Id, | ||
| Name: inst.Name, | ||
| HypervisorPID: inst.HypervisorPID, | ||
| // Include allocated resources for utilization ratio calculations | ||
| AllocatedVcpus: inst.Vcpus, | ||
| AllocatedMemoryBytes: inst.Size + inst.HotplugSize, | ||
| } | ||
|
|
||
| // Derive TAP device name if networking is enabled | ||
| if inst.NetworkEnabled { | ||
| info.TAPDevice = generateTAPName(inst.Id) | ||
| } | ||
|
|
||
| infos = append(infos, info) | ||
| } | ||
|
|
||
| return infos, nil | ||
| } | ||
|
|
||
// generateTAPName builds the TAP device name for the given instance ID.
// This matches the logic in network/allocate.go.
//
// The result is "hype-" plus at most the first 8 bytes of the ID, lowercased;
// hype-{8chars} fits within the 15-char Linux interface name limit.
func generateTAPName(instanceID string) string {
	const idChars = 8

	if len(instanceID) <= idChars {
		return "hype-" + strings.ToLower(instanceID)
	}
	return "hype-" + strings.ToLower(instanceID[:idChars])
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems like too much logic is happening in the API handler; the API handler ought to just translate from domain types (e.g. lib/utilization/types.go) into API types and deal with other handler-level concerns like error mapping.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, it would be nice if this were moved to a new lib/ directory so that it gets a README explaining the feature, similar to other features in the repo.