Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cmd/api/api/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {

// Audit log: exec session started
log.InfoContext(ctx, "exec session started",
"instance_id", instanceID,
"id", inst.Id,
"subject", subject,
"command", execReq.Command,
"tty", execReq.TTY,
Expand Down Expand Up @@ -133,7 +133,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
if err != nil {
log.ErrorContext(ctx, "exec failed",
"error", err,
"instance_id", instanceID,
"id", inst.Id,
"subject", subject,
"duration_ms", duration.Milliseconds(),
)
Expand All @@ -148,7 +148,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {

// Audit log: exec session ended
log.InfoContext(ctx, "exec session ended",
"instance_id", instanceID,
"id", inst.Id,
"subject", subject,
"exit_code", exit.Code,
"duration_ms", duration.Milliseconds(),
Expand Down
5 changes: 3 additions & 2 deletions cmd/api/api/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/onkernel/hypeman/lib/exec"
"github.com/onkernel/hypeman/lib/instances"
"github.com/onkernel/hypeman/lib/oapi"
"github.com/onkernel/hypeman/lib/paths"
"github.com/onkernel/hypeman/lib/system"
Expand Down Expand Up @@ -91,7 +92,7 @@ func TestExecInstanceNonTTY(t *testing.T) {
// Capture console log on failure with exec-agent filtering
t.Cleanup(func() {
if t.Failed() {
consolePath := paths.New(svc.Config.DataDir).InstanceConsoleLog(inst.Id)
consolePath := paths.New(svc.Config.DataDir).InstanceAppLog(inst.Id)
if consoleData, err := os.ReadFile(consolePath); err == nil {
lines := strings.Split(string(consoleData), "\n")

Expand Down Expand Up @@ -280,7 +281,7 @@ func TestExecWithDebianMinimal(t *testing.T) {

// collectTestLogs collects logs from an instance (non-streaming)
func collectTestLogs(t *testing.T, svc *ApiService, instanceID string, n int) string {
logChan, err := svc.InstanceManager.StreamInstanceLogs(ctx(), instanceID, n, false)
logChan, err := svc.InstanceManager.StreamInstanceLogs(ctx(), instanceID, n, false, instances.LogSourceApp)
if err != nil {
return ""
}
Expand Down
20 changes: 19 additions & 1 deletion cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,19 @@ func (s *ApiService) GetInstanceLogs(ctx context.Context, request oapi.GetInstan
follow = *request.Params.Follow
}

// Map source parameter to LogSource type (default to app)
source := instances.LogSourceApp
if request.Params.Source != nil {
switch *request.Params.Source {
case oapi.App:
source = instances.LogSourceApp
case oapi.Vmm:
source = instances.LogSourceVMM
case oapi.Hypeman:
source = instances.LogSourceHypeman
}
}

// Resolve to get the actual instance ID
resolved, err := s.InstanceManager.GetInstance(ctx, request.Id)
if err != nil {
Expand All @@ -488,14 +501,19 @@ func (s *ApiService) GetInstanceLogs(ctx context.Context, request oapi.GetInstan
}
}

logChan, err := s.InstanceManager.StreamInstanceLogs(ctx, resolved.Id, tail, follow)
logChan, err := s.InstanceManager.StreamInstanceLogs(ctx, resolved.Id, tail, follow, source)
if err != nil {
switch {
case errors.Is(err, instances.ErrTailNotFound):
return oapi.GetInstanceLogs500JSONResponse{
Code: "dependency_missing",
Message: "tail command not found on server - required for log streaming",
}, nil
case errors.Is(err, instances.ErrLogNotFound):
return oapi.GetInstanceLogs404JSONResponse{
Code: "log_not_found",
Message: "requested log file does not exist yet",
}, nil
default:
return oapi.GetInstanceLogs500JSONResponse{
Code: "internal_error",
Expand Down
33 changes: 14 additions & 19 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,18 @@ func run() error {
"kernel", kernelVer)

// Initialize network manager (creates default network if needed)
// Get running instance IDs for TAP cleanup
runningIDs := getRunningInstanceIDs(app)
// Get instance IDs that might have a running VMM for TAP cleanup safety.
// Include Unknown state: we couldn't confirm their state, but they might still
// have a running VMM. Better to leave a stale TAP than crash a running VM.
allInstances, _ := app.InstanceManager.ListInstances(app.Ctx)
var preserveTAPs []string
for _, inst := range allInstances {
if inst.State == instances.StateRunning || inst.State == instances.StateUnknown {
preserveTAPs = append(preserveTAPs, inst.Id)
}
}
Comment on lines 149 to 167
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is the fix for stopping a possibly running instance, the rest of the change is VMM and Hypeman logs feature, which was inspired by how this problem was debugged (find issue in CH logs)

logger.Info("Initializing network manager...")
if err := app.NetworkManager.Initialize(app.Ctx, runningIDs); err != nil {
if err := app.NetworkManager.Initialize(app.Ctx, preserveTAPs); err != nil {
logger.Error("failed to initialize network manager", "error", err)
return fmt.Errorf("initialize network manager: %w", err)
}
Expand Down Expand Up @@ -199,6 +207,7 @@ func run() error {
middleware.RequestID,
middleware.RealIP,
middleware.Recoverer,
mw.InjectLogger(logger),
mw.AccessLogger(accessLogger),
mw.JwtAuth(app.Config.JwtSecret),
).Get("/instances/{id}/exec", app.ApiService.ExecHandler)
Expand Down Expand Up @@ -226,7 +235,8 @@ func run() error {
}

// Inject logger into request context for handlers to use
r.Use(mw.InjectLogger(accessLogger))
// Use app logger (not accessLogger) so the instance log handler is included
r.Use(mw.InjectLogger(logger))

// Access logger AFTER otelchi so trace context is available
r.Use(mw.AccessLogger(accessLogger))
Expand Down Expand Up @@ -354,21 +364,6 @@ func run() error {
return err
}

// getRunningInstanceIDs returns IDs of instances currently in Running state
func getRunningInstanceIDs(app *application) []string {
allInstances, err := app.InstanceManager.ListInstances(app.Ctx)
if err != nil {
return nil
}
var running []string
for _, inst := range allInstances {
if inst.State == instances.StateRunning {
running = append(running, inst.Id)
}
}
return running
}

// checkKVMAccess verifies KVM is available and the user has permission to use it
func checkKVMAccess() error {
f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0)
Expand Down
4 changes: 2 additions & 2 deletions cmd/api/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lib/instances/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf
// Serial console configuration
serial := vmm.ConsoleConfig{
Mode: vmm.ConsoleConfigMode("File"),
File: ptr(m.paths.InstanceConsoleLog(inst.Id)),
File: ptr(m.paths.InstanceAppLog(inst.Id)),
}

// Console off (we use serial)
Expand Down
40 changes: 36 additions & 4 deletions lib/instances/logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,29 @@ import (
"github.com/onkernel/hypeman/lib/logger"
)

// LogSource represents a log source type
type LogSource string

const (
// LogSourceApp is the guest application log (serial console)
LogSourceApp LogSource = "app"
// LogSourceVMM is the Cloud Hypervisor VMM log
LogSourceVMM LogSource = "vmm"
// LogSourceHypeman is the hypeman operations log
LogSourceHypeman LogSource = "hypeman"
)

// ErrTailNotFound is returned when the tail command is not available
var ErrTailNotFound = fmt.Errorf("tail command not found: required for log streaming")

// StreamInstanceLogs streams instance console logs
// ErrLogNotFound is returned when the requested log file doesn't exist
var ErrLogNotFound = fmt.Errorf("log file not found")

// streamInstanceLogs streams instance logs from the specified source
// Returns last N lines, then continues following if follow=true
func (m *manager) streamInstanceLogs(ctx context.Context, id string, tail int, follow bool) (<-chan string, error) {
func (m *manager) streamInstanceLogs(ctx context.Context, id string, tail int, follow bool, source LogSource) (<-chan string, error) {
log := logger.FromContext(ctx)
log.DebugContext(ctx, "starting log stream", "id", id, "tail", tail, "follow", follow)
log.DebugContext(ctx, "starting log stream", "id", id, "tail", tail, "follow", follow, "source", source)

// Verify tail command is available
if _, err := exec.LookPath("tail"); err != nil {
Expand All @@ -30,7 +45,24 @@ func (m *manager) streamInstanceLogs(ctx context.Context, id string, tail int, f
return nil, err
}

logPath := m.paths.InstanceConsoleLog(id)
// Determine log path based on source
var logPath string
switch source {
case LogSourceApp:
logPath = m.paths.InstanceAppLog(id)
case LogSourceVMM:
logPath = m.paths.InstanceVMMLog(id)
case LogSourceHypeman:
logPath = m.paths.InstanceHypemanLog(id)
default:
// Default to app log for backwards compatibility
logPath = m.paths.InstanceAppLog(id)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

enjoy the freedom of not having to care about backwards compatability right now! :)

}

// Check if log file exists before starting tail
if _, err := os.Stat(logPath); os.IsNotExist(err) {
return nil, ErrLogNotFound
}

// Build tail command
args := []string{"-n", strconv.Itoa(tail)}
Expand Down
23 changes: 15 additions & 8 deletions lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ type Manager interface {
RestoreInstance(ctx context.Context, id string) (*Instance, error)
StopInstance(ctx context.Context, id string) (*Instance, error)
StartInstance(ctx context.Context, id string) (*Instance, error)
StreamInstanceLogs(ctx context.Context, id string, tail int, follow bool) (<-chan string, error)
StreamInstanceLogs(ctx context.Context, id string, tail int, follow bool, source LogSource) (<-chan string, error)
RotateLogs(ctx context.Context, maxBytes int64, maxFiles int) error
AttachVolume(ctx context.Context, id string, volumeId string, req AttachVolumeRequest) (*Instance, error)
DetachVolume(ctx context.Context, id string, volumeId string) (*Instance, error)
Expand Down Expand Up @@ -197,15 +197,15 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance,
return nil, ErrNotFound
}

// StreamInstanceLogs streams instance console logs
// StreamInstanceLogs streams instance logs from the specified source
// Returns last N lines, then continues following if follow=true
func (m *manager) StreamInstanceLogs(ctx context.Context, id string, tail int, follow bool) (<-chan string, error) {
func (m *manager) StreamInstanceLogs(ctx context.Context, id string, tail int, follow bool, source LogSource) (<-chan string, error) {
// Note: No lock held during streaming - we read from the file continuously
// and the file is append-only, so this is safe
return m.streamInstanceLogs(ctx, id, tail, follow)
return m.streamInstanceLogs(ctx, id, tail, follow, source)
}

// RotateLogs rotates console logs for all instances that exceed maxBytes
// RotateLogs rotates all instance logs (app, vmm, hypeman) that exceed maxBytes
func (m *manager) RotateLogs(ctx context.Context, maxBytes int64, maxFiles int) error {
instances, err := m.listInstances(ctx)
if err != nil {
Expand All @@ -214,9 +214,16 @@ func (m *manager) RotateLogs(ctx context.Context, maxBytes int64, maxFiles int)

var lastErr error
for _, inst := range instances {
logPath := m.paths.InstanceConsoleLog(inst.Id)
if err := rotateLogIfNeeded(logPath, maxBytes, maxFiles); err != nil {
lastErr = err // Continue with other instances, but track error
// Rotate all three log types
logPaths := []string{
m.paths.InstanceAppLog(inst.Id),
m.paths.InstanceVMMLog(inst.Id),
m.paths.InstanceHypemanLog(inst.Id),
}
for _, logPath := range logPaths {
if err := rotateLogIfNeeded(logPath, maxBytes, maxFiles); err != nil {
lastErr = err // Continue with other logs, but track error
}
}
}
return lastErr
Expand Down
18 changes: 9 additions & 9 deletions lib/instances/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message st

// collectLogs gets the last N lines of logs (non-streaming)
func collectLogs(ctx context.Context, mgr *manager, instanceID string, n int) (string, error) {
logChan, err := mgr.StreamInstanceLogs(ctx, instanceID, n, false)
logChan, err := mgr.StreamInstanceLogs(ctx, instanceID, n, false, LogSourceApp)
if err != nil {
return "", err
}
Expand Down Expand Up @@ -672,7 +672,7 @@ func TestBasicEndToEnd(t *testing.T) {
streamCtx, streamCancel := context.WithCancel(ctx)
defer streamCancel()

logChan, err := manager.StreamInstanceLogs(streamCtx, inst.Id, 10, true)
logChan, err := manager.StreamInstanceLogs(streamCtx, inst.Id, 10, true, LogSourceApp)
require.NoError(t, err)

// Create unique marker
Expand All @@ -692,7 +692,7 @@ func TestBasicEndToEnd(t *testing.T) {
}()

// Append marker to console log file
consoleLogPath := p.InstanceConsoleLog(inst.Id)
consoleLogPath := p.InstanceAppLog(inst.Id)
f, err := os.OpenFile(consoleLogPath, os.O_APPEND|os.O_WRONLY, 0644)
require.NoError(t, err)
_, err = fmt.Fprintln(f, marker)
Expand Down Expand Up @@ -899,12 +899,12 @@ func TestStandbyAndRestore(t *testing.T) {
t.Logf(" - %s (size: %d bytes)", entry.Name(), info.Size())
}

// DEBUG: Check console.log file size before restore
consoleLogPath := filepath.Join(tmpDir, "guests", inst.Id, "logs", "console.log")
// DEBUG: Check app.log file size before restore
consoleLogPath := filepath.Join(tmpDir, "guests", inst.Id, "logs", "app.log")
var consoleLogSizeBefore int64
if info, err := os.Stat(consoleLogPath); err == nil {
consoleLogSizeBefore = info.Size()
t.Logf("DEBUG: console.log size before restore: %d bytes", consoleLogSizeBefore)
t.Logf("DEBUG: app.log size before restore: %d bytes", consoleLogSizeBefore)
}

// Restore instance
Expand All @@ -914,13 +914,13 @@ func TestStandbyAndRestore(t *testing.T) {
assert.Equal(t, StateRunning, inst.State)
t.Log("Instance restored and running")

// DEBUG: Check console.log file size after restore
// DEBUG: Check app.log file size after restore
if info, err := os.Stat(consoleLogPath); err == nil {
consoleLogSizeAfter := info.Size()
t.Logf("DEBUG: console.log size after restore: %d bytes", consoleLogSizeAfter)
t.Logf("DEBUG: app.log size after restore: %d bytes", consoleLogSizeAfter)
t.Logf("DEBUG: File size diff: %d bytes", consoleLogSizeAfter-consoleLogSizeBefore)
if consoleLogSizeAfter < consoleLogSizeBefore {
t.Logf("DEBUG: WARNING! console.log was TRUNCATED (lost %d bytes)", consoleLogSizeBefore-consoleLogSizeAfter)
t.Logf("DEBUG: WARNING! app.log was TRUNCATED (lost %d bytes)", consoleLogSizeBefore-consoleLogSizeAfter)
}
}

Expand Down
8 changes: 4 additions & 4 deletions lib/instances/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state
// Failed to create client - this is unexpected if socket exists
errMsg := fmt.Sprintf("failed to create VMM client: %v", err)
log.WarnContext(ctx, "failed to determine instance state",
"instance_id", stored.Id,
"id", stored.Id,
"socket", stored.SocketPath,
"error", err,
)
Expand All @@ -48,7 +48,7 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state
// Socket exists but VMM is unreachable - this is unexpected
errMsg := fmt.Sprintf("failed to query VMM: %v", err)
log.WarnContext(ctx, "failed to query VMM state",
"instance_id", stored.Id,
"id", stored.Id,
"socket", stored.SocketPath,
"error", err,
)
Expand All @@ -60,7 +60,7 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state
body := string(resp.Body)
errMsg := fmt.Sprintf("VMM returned error (status %d): %s", resp.StatusCode(), body)
log.WarnContext(ctx, "VMM returned error response",
"instance_id", stored.Id,
"id", stored.Id,
"socket", stored.SocketPath,
"status_code", resp.StatusCode(),
"body", body,
Expand All @@ -82,7 +82,7 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state
// Unknown CH state - log and return Unknown
errMsg := fmt.Sprintf("unexpected VMM state: %s", resp.JSON200.State)
log.WarnContext(ctx, "VMM returned unexpected state",
"instance_id", stored.Id,
"id", stored.Id,
"vmm_state", resp.JSON200.State,
)
return stateResult{State: StateUnknown, Error: &errMsg}
Expand Down
6 changes: 3 additions & 3 deletions lib/instances/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ import (
// overlay.raw # Configurable sparse overlay disk (default 10GB)
// config.ext4 # Read-only config disk (generated)
// ch.sock # Cloud Hypervisor API socket
// ch-stdout.log # CH process output
// logs/
// console.log # Serial console output
// app.log # Guest application log (serial console output)
// vmm.log # Cloud Hypervisor VMM log (stdout+stderr combined)
// hypeman.log # Hypeman operations log (actions taken on this instance)
// snapshots/
// snapshot-latest/ # Snapshot directory
// config.json
Expand Down Expand Up @@ -141,4 +142,3 @@ func (m *manager) listMetadataFiles() ([]string, error) {

return metaFiles, nil
}

Loading