diff --git a/cmd/api/wire_gen.go b/cmd/api/wire_gen.go
index 68f0531..5ac1bda 100644
--- a/cmd/api/wire_gen.go
+++ b/cmd/api/wire_gen.go
@@ -55,7 +55,7 @@ func initializeApp() (*application, func(), error) {
 	if err != nil {
 		return nil, nil, err
 	}
-	buildsManager, err := providers.ProvideBuildManager(paths, config, instancesManager, volumesManager, logger)
+	buildsManager, err := providers.ProvideBuildManager(paths, config, instancesManager, volumesManager, manager, logger)
 	if err != nil {
 		return nil, nil, err
 	}
diff --git a/lib/builds/manager.go b/lib/builds/manager.go
index 266cacb..72e1b0a 100644
--- a/lib/builds/manager.go
+++ b/lib/builds/manager.go
@@ -88,6 +88,7 @@ type manager struct {
 	queue           *BuildQueue
 	instanceManager instances.Manager
 	volumeManager   volumes.Manager
+	imageManager    images.Manager
 	secretProvider  SecretProvider
 	tokenGenerator  *RegistryTokenGenerator
 	logger          *slog.Logger
@@ -105,6 +106,7 @@ func NewManager(
 	config Config,
 	instanceMgr instances.Manager,
 	volumeMgr volumes.Manager,
+	imageMgr images.Manager,
 	secretProvider SecretProvider,
 	logger *slog.Logger,
 	meter metric.Meter,
@@ -119,6 +121,7 @@ func NewManager(
 		queue:           NewBuildQueue(config.MaxConcurrentBuilds),
 		instanceManager: instanceMgr,
 		volumeManager:   volumeMgr,
+		imageManager:    imageMgr,
 		secretProvider:  secretProvider,
 		tokenGenerator:  NewRegistryTokenGenerator(config.RegistrySecret),
 		logger:          logger,
@@ -294,6 +297,28 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
 	m.logger.Info("build succeeded", "id", id, "digest", result.ImageDigest, "duration", duration)
 
 	imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)
+
+	// Wait for image to be ready before reporting build as complete.
+	// This fixes the race condition (KERNEL-863) where build reports "ready"
+	// but image conversion hasn't finished yet.
+	// Use buildCtx to respect the build timeout during image wait.
+	if err := m.waitForImageReady(buildCtx, id); err != nil {
+		// Recalculate duration to include image wait time
+		duration = time.Since(start)
+		durationMS = duration.Milliseconds()
+		m.logger.Error("image conversion failed after build", "id", id, "error", err, "duration", duration)
+		errMsg := fmt.Sprintf("image conversion failed: %v", err)
+		m.updateBuildComplete(id, StatusFailed, nil, &errMsg, &result.Provenance, &durationMS)
+		if m.metrics != nil {
+			m.metrics.RecordBuild(buildCtx, "failed", duration)
+		}
+		return
+	}
+
+	// Recalculate duration to include image wait time for accurate reporting
+	duration = time.Since(start)
+	durationMS = duration.Milliseconds()
+	m.updateBuildComplete(id, StatusReady, &result.ImageDigest, nil, &result.Provenance, &durationMS)
 
 	// Update with image ref
 
@@ -303,7 +328,7 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
 	}
 
 	if m.metrics != nil {
-		m.metrics.RecordBuild(ctx, "success", duration)
+		m.metrics.RecordBuild(buildCtx, "success", duration)
 	}
 }
 
@@ -640,6 +665,44 @@ func (m *manager) updateBuildComplete(id string, status string, digest *string,
 	m.notifyStatusChange(id, status)
 }
 
+// waitForImageReady polls the image manager until the build's image is ready.
+// This ensures that when a build reports "ready", the image is actually usable
+// for instance creation (fixes KERNEL-863 race condition).
+func (m *manager) waitForImageReady(ctx context.Context, id string) error {
+	imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)
+
+	// Poll for up to 60 seconds (image conversion is typically fast)
+	const maxAttempts = 120
+	const pollInterval = 500 * time.Millisecond
+
+	m.logger.Debug("waiting for image to be ready", "id", id, "image_ref", imageRef)
+
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		img, err := m.imageManager.GetImage(ctx, imageRef)
+		if err == nil {
+			switch img.Status {
+			case images.StatusReady:
+				m.logger.Debug("image is ready", "id", id, "image_ref", imageRef, "attempts", attempt+1)
+				return nil
+			case images.StatusFailed:
+				return fmt.Errorf("image conversion failed")
+			case images.StatusPending, images.StatusPulling, images.StatusConverting:
+				// Still processing, continue polling
+			}
+		}
+		// Image not found or still processing, wait and retry
+		time.Sleep(pollInterval)
+	}
+
+	return fmt.Errorf("timeout waiting for image to be ready after %v", time.Duration(maxAttempts)*pollInterval)
+}
+
 // subscribeToStatus adds a subscriber channel for status updates on a build
 func (m *manager) subscribeToStatus(buildID string, ch chan BuildEvent) {
 	m.subscriberMu.Lock()
diff --git a/lib/builds/manager_test.go b/lib/builds/manager_test.go
index 3d19af1..d47bfa3 100644
--- a/lib/builds/manager_test.go
+++ b/lib/builds/manager_test.go
@@ -10,6 +10,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/kernel/hypeman/lib/images"
 	"github.com/kernel/hypeman/lib/instances"
 	"github.com/kernel/hypeman/lib/paths"
 	"github.com/kernel/hypeman/lib/resources"
@@ -219,8 +220,85 @@ func (m *mockSecretProvider) GetSecrets(ctx context.Context, secretIDs []string)
 	return make(map[string]string), nil
 }
 
+// mockImageManager implements images.Manager for testing
+type mockImageManager struct {
+	images      map[string]*images.Image
+	getImageErr error
+}
+
+func newMockImageManager() *mockImageManager {
+	return &mockImageManager{
+		images: make(map[string]*images.Image),
+	}
+}
+
+func (m *mockImageManager) ListImages(ctx context.Context) ([]images.Image, error) {
+	var result []images.Image
+	for _, img := range m.images {
+		result = append(result, *img)
+	}
+	return result, nil
+}
+
+func (m *mockImageManager) CreateImage(ctx context.Context, req images.CreateImageRequest) (*images.Image, error) {
+	img := &images.Image{
+		Name:   req.Name,
+		Status: images.StatusPending,
+	}
+	m.images[req.Name] = img
+	return img, nil
+}
+
+func (m *mockImageManager) ImportLocalImage(ctx context.Context, repo, reference, digest string) (*images.Image, error) {
+	img := &images.Image{
+		Name:   repo + ":" + reference,
+		Status: images.StatusReady,
+	}
+	m.images[img.Name] = img
+	return img, nil
+}
+
+func (m *mockImageManager) GetImage(ctx context.Context, name string) (*images.Image, error) {
+	if m.getImageErr != nil {
+		return nil, m.getImageErr
+	}
+	if img, ok := m.images[name]; ok {
+		return img, nil
+	}
+	return nil, images.ErrNotFound
+}
+
+func (m *mockImageManager) DeleteImage(ctx context.Context, name string) error {
+	delete(m.images, name)
+	return nil
+}
+
+func (m *mockImageManager) RecoverInterruptedBuilds() {}
+
+func (m *mockImageManager) TotalImageBytes(ctx context.Context) (int64, error) {
+	return 0, nil
+}
+
+func (m *mockImageManager) TotalOCICacheBytes(ctx context.Context) (int64, error) {
+	return 0, nil
+}
+
+// SetImageReady sets an image to ready status for testing
+func (m *mockImageManager) SetImageReady(name string) {
+	m.images[name] = &images.Image{
+		Name:   name,
+		Status: images.StatusReady,
+	}
+}
+
 // Test helper to create a manager with test paths and mocks
 func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, string) {
+	mgr, instanceMgr, volumeMgr, _, tempDir := setupTestManagerWithImageMgr(t)
+	return mgr, instanceMgr, volumeMgr, tempDir
+}
+
+// setupTestManagerWithImageMgr returns the image manager for tests that need it
+func setupTestManagerWithImageMgr(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, *mockImageManager, string) {
 	t.Helper()
 
 	// Create temp directory for test data
@@ -236,6 +314,7 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
 	// Create mocks
 	instanceMgr := newMockInstanceManager()
 	volumeMgr := newMockVolumeManager()
+	imageMgr := newMockImageManager()
 	secretProvider := &mockSecretProvider{}
 
 	// Create config
@@ -257,13 +336,14 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
 		queue:             NewBuildQueue(config.MaxConcurrentBuilds),
 		instanceManager:   instanceMgr,
 		volumeManager:     volumeMgr,
+		imageManager:      imageMgr,
 		secretProvider:    secretProvider,
 		tokenGenerator:    NewRegistryTokenGenerator(config.RegistrySecret),
 		logger:            logger,
 		statusSubscribers: make(map[string][]chan BuildEvent),
 	}
 
-	return mgr, instanceMgr, volumeMgr, tempDir
+	return mgr, instanceMgr, volumeMgr, imageMgr, tempDir
 }
 
 func TestCreateBuild_Success(t *testing.T) {
diff --git a/lib/builds/race_test.go b/lib/builds/race_test.go
new file mode 100644
index 0000000..9d31243
--- /dev/null
+++ b/lib/builds/race_test.go
@@ -0,0 +1,255 @@
+package builds
+
+import (
+	"context"
+	"os"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/kernel/hypeman/lib/images"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestBuildImageAvailabilityRace demonstrates the race condition described in KERNEL-863:
+// After WaitForBuild() returns with BuildStatusReady, the image may not be immediately
+// available for instance creation because:
+// 1. Registry returns 201 to builder
+// 2. Registry calls triggerConversion() asynchronously
+// 3. Builder reports success, build status becomes "ready"
+// 4. But image conversion may still be in progress
+//
+// This test simulates the scenario where a build completes but the image
+// is not yet ready when the client tries to use it.
+func TestBuildImageAvailabilityRace(t *testing.T) {
+	// This test demonstrates the conceptual race condition.
+	// The actual fix requires one of the following changes:
+	// 1. Wait for image conversion before reporting build success
+	// 2. Add an image availability check endpoint
+	// 3. Have the builder verify the image is pullable before reporting success
+
+	t.Run("demonstrates async conversion race", func(t *testing.T) {
+		// Simulate the race: build reports ready but image conversion is async
+		var (
+			buildReady      = make(chan struct{})
+			imageConverted  = make(chan struct{})
+			conversionDelay = 100 * time.Millisecond
+		)
+
+		// Simulate registry receiving image and starting async conversion
+		go func() {
+			// Registry returns 201 immediately
+			close(buildReady)
+			// But conversion happens asynchronously with some delay
+			time.Sleep(conversionDelay)
+			close(imageConverted)
+		}()
+
+		// Simulate client waiting for build to be ready
+		<-buildReady
+
+		// Build is "ready" but image might not be converted yet
+		select {
+		case <-imageConverted:
+			// Image already converted - no race in this run
+			t.Log("Image was converted before we checked (no race this time)")
+		default:
+			// This demonstrates the race condition:
+			// Build is ready but image is not yet available
+			t.Log("RACE CONDITION: Build ready but image not yet converted")
+
+			// In the real system, instance creation would fail here
+			// because imageManager.GetImage() would return not found or pending status
+		}
+
+		// Wait for conversion to complete
+		<-imageConverted
+		t.Log("Image conversion completed")
+	})
+}
+
+// TestWaitForImageReady_Success tests that waitForImageReady succeeds when image becomes ready
+func TestWaitForImageReady_Success(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+	buildID := "test-build-123"
+
+	// Set the image to ready in the mock
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+	imageMgr.SetImageReady(imageRef)
+
+	// waitForImageReady should succeed immediately
+	err := mgr.waitForImageReady(ctx, buildID)
+	require.NoError(t, err)
+}
+
+// TestWaitForImageReady_WaitsForConversion tests that waitForImageReady polls until ready
+func TestWaitForImageReady_WaitsForConversion(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+	buildID := "test-build-456"
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+
+	// Start with image in pending status
+	imageMgr.images[imageRef] = &images.Image{
+		Name:   imageRef,
+		Status: images.StatusPending,
+	}
+
+	// Simulate conversion completing after a short delay
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		imageMgr.images[imageRef].Status = images.StatusConverting
+		time.Sleep(100 * time.Millisecond)
+		imageMgr.images[imageRef].Status = images.StatusReady
+	}()
+
+	// waitForImageReady should poll and eventually succeed
+	start := time.Now()
+	err := mgr.waitForImageReady(ctx, buildID)
+	elapsed := time.Since(start)
+
+	require.NoError(t, err)
+	assert.GreaterOrEqual(t, elapsed, 200*time.Millisecond, "should have waited for conversion")
+}
+
+// TestWaitForImageReady_ContextCancelled tests that waitForImageReady returns the context's
+// error if the context expires before the image becomes ready
+func TestWaitForImageReady_ContextCancelled(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+	defer cancel()
+
+	buildID := "test-build-789"
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+
+	// Image stays in pending status forever
+	imageMgr.images[imageRef] = &images.Image{
+		Name:   imageRef,
+		Status: images.StatusPending,
+	}
+
+	// waitForImageReady should return context error
+	err := mgr.waitForImageReady(ctx, buildID)
+	require.Error(t, err)
+	assert.ErrorIs(t, err, context.DeadlineExceeded)
+}
+
+// TestWaitForImageReady_Failed tests that waitForImageReady returns an error if image conversion fails
+func TestWaitForImageReady_Failed(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+	buildID := "test-build-failed"
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+
+	// Image is in failed status
+	imageMgr.images[imageRef] = &images.Image{
+		Name:   imageRef,
+		Status: images.StatusFailed,
+	}
+
+	// waitForImageReady should return error immediately
+	err := mgr.waitForImageReady(ctx, buildID)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "image conversion failed")
+}
+
+// TestImageAvailabilityAfterBuildComplete tests the proposed fix:
+// Build should only report "ready" after verifying the image is available.
+func TestImageAvailabilityAfterBuildComplete(t *testing.T) {
+	t.Skip("This test is for the proposed fix - not yet implemented")
+
+	// The fix would involve one of:
+	//
+	// Option 1: Synchronous conversion in registry
+	// - Change `go r.triggerConversion()` to synchronous call
+	// - Pros: Simple fix
+	// - Cons: Increases latency for builder push response
+	//
+	// Option 2: Builder verifies image availability
+	// - After pushing, builder pulls/verifies the image
+	// - Only then reports success via vsock
+	// - Pros: End-to-end verification
+	// - Cons: Adds complexity to builder agent
+	//
+	// Option 3: Build manager waits for image
+	// - After receiving success from builder, poll image status
+	// - Only set build to "ready" when image is "ready"
+	// - Pros: Clean separation of concerns
+	// - Cons: Adds polling overhead
+	//
+	// Option 4: Expose image availability endpoint
+	// - Callers check image availability before creating instances
+	// - Pros: Flexible for callers
+	// - Cons: Pushes complexity to callers (current workaround)
+}
+
+// Concurrent access test to verify thread safety of status updates
+func TestConcurrentStatusUpdates(t *testing.T) {
+	mgr, _, _, tempDir := setupTestManager(t)
+	defer removeAll(tempDir)
+
+	ctx := context.Background()
+
+	// Create a build
+	req := CreateBuildRequest{
+		Dockerfile: "FROM alpine",
+	}
+	build, err := mgr.CreateBuild(ctx, req, []byte("source"))
+	require.NoError(t, err)
+
+	// Concurrently subscribe and update status
+	var wg sync.WaitGroup
+	const numGoroutines = 10
+
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+
+			// Subscribe
+			ch := make(chan BuildEvent, 10)
+			mgr.subscribeToStatus(build.ID, ch)
+			defer mgr.unsubscribeFromStatus(build.ID, ch)
+
+			// Small delay to interleave
+			time.Sleep(time.Duration(id) * time.Millisecond)
+
+			// Read any events
+			for {
+				select {
+				case <-ch:
+					// Got event
+				case <-time.After(50 * time.Millisecond):
+					return
+				}
+			}
+		}(i)
+	}
+
+	// Trigger status updates concurrently
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			mgr.updateStatus(build.ID, StatusBuilding, nil)
+		}()
+	}
+
+	wg.Wait()
+
+	// Should not panic or deadlock
+	t.Log("Concurrent status updates completed without deadlock")
+}
+
+func removeAll(path string) {
+	os.RemoveAll(path)
+}
diff --git a/lib/providers/providers.go b/lib/providers/providers.go
index 09d571c..9f0bcd0 100644
--- a/lib/providers/providers.go
+++ b/lib/providers/providers.go
@@ -230,7 +230,7 @@ func ProvideIngressManager(p *paths.Paths, cfg *config.Config, instanceManager i
 }
 
 // ProvideBuildManager provides the build manager
-func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager instances.Manager, volumeManager volumes.Manager, log *slog.Logger) (builds.Manager, error) {
+func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager instances.Manager, volumeManager volumes.Manager, imageManager images.Manager, log *slog.Logger) (builds.Manager, error) {
 	buildConfig := builds.Config{
 		MaxConcurrentBuilds: cfg.MaxConcurrentSourceBuilds,
 		BuilderImage:        cfg.BuilderImage,
@@ -263,5 +263,5 @@ func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager ins
 	}
 
 	meter := otel.GetMeterProvider().Meter("hypeman")
-	return builds.NewManager(p, buildConfig, instanceManager, volumeManager, secretProvider, log, meter)
+	return builds.NewManager(p, buildConfig, instanceManager, volumeManager, imageManager, secretProvider, log, meter)
 }
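Note on the polling loop in waitForImageReady: the bare time.Sleep(pollInterval) means a cancelled or expired buildCtx is only observed at the top of the next iteration, so cancellation can lag by up to one interval. Below is a minimal sketch of a cancellation-aware variant of the same loop; the pollUntilReady name and the injected getter are illustrative only, not part of this patch, and it reuses only the images.Image type and status constants already used above.

package builds

import (
	"context"
	"fmt"
	"time"

	"github.com/kernel/hypeman/lib/images"
)

// pollUntilReady re-checks an image every interval until it is ready, its
// conversion fails, or ctx is cancelled. Unlike a bare time.Sleep loop, the
// select also notices cancellation while waiting out the interval.
func pollUntilReady(ctx context.Context, interval time.Duration, maxAttempts int,
	get func(context.Context) (*images.Image, error)) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for attempt := 0; attempt < maxAttempts; attempt++ {
		img, err := get(ctx)
		if err == nil {
			switch img.Status {
			case images.StatusReady:
				return nil
			case images.StatusFailed:
				return fmt.Errorf("image conversion failed")
			}
		}
		// Not found or still converting: wait for the next tick or for cancellation.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
	return fmt.Errorf("image not ready after %d attempts", maxAttempts)
}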
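A related caveat: TestWaitForImageReady_WaitsForConversion flips imageMgr.images[imageRef].Status from a goroutine while waitForImageReady reads the same *images.Image through GetImage, so the race detector (go test -race) would likely flag the mock itself. A sketch of a lock-guarded mock follows; the mu field, the setImage helper, and returning a copy from GetImage are suggested additions (assuming "sync" is added to the test imports), not part of this patch, and the remaining mock methods would take the same lock.

// mockImageManager guarded by a mutex so test goroutines can advance an
// image's status while waitForImageReady polls it.
type mockImageManager struct {
	mu          sync.Mutex
	images      map[string]*images.Image
	getImageErr error
}

// GetImage returns a copy under the lock so readers never share memory with a
// test goroutine that later advances the stored image's status.
func (m *mockImageManager) GetImage(ctx context.Context, name string) (*images.Image, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.getImageErr != nil {
		return nil, m.getImageErr
	}
	if img, ok := m.images[name]; ok {
		out := *img
		return &out, nil
	}
	return nil, images.ErrNotFound
}

// setImage replaces the stored image under the lock; tests call it instead of
// mutating img.Status directly.
func (m *mockImageManager) setImage(name string, img images.Image) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.images[name] = &img
}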