2 changes: 1 addition & 1 deletion cmd/api/wire_gen.go

Some generated files are not rendered by default.

65 changes: 64 additions & 1 deletion lib/builds/manager.go
@@ -88,6 +88,7 @@ type manager struct {
queue *BuildQueue
instanceManager instances.Manager
volumeManager volumes.Manager
imageManager images.Manager
secretProvider SecretProvider
tokenGenerator *RegistryTokenGenerator
logger *slog.Logger
@@ -105,6 +106,7 @@ func NewManager(
config Config,
instanceMgr instances.Manager,
volumeMgr volumes.Manager,
imageMgr images.Manager,
secretProvider SecretProvider,
logger *slog.Logger,
meter metric.Meter,
@@ -119,6 +121,7 @@ func NewManager(
queue: NewBuildQueue(config.MaxConcurrentBuilds),
instanceManager: instanceMgr,
volumeManager: volumeMgr,
imageManager: imageMgr,
secretProvider: secretProvider,
tokenGenerator: NewRegistryTokenGenerator(config.RegistrySecret),
logger: logger,
@@ -294,6 +297,28 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques

m.logger.Info("build succeeded", "id", id, "digest", result.ImageDigest, "duration", duration)
imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)

// Wait for image to be ready before reporting build as complete.
// This fixes the race condition (KERNEL-863) where build reports "ready"
// but image conversion hasn't finished yet.
// Use buildCtx to respect the build timeout during image wait.
if err := m.waitForImageReady(buildCtx, id); err != nil {
// Recalculate duration to include image wait time
duration = time.Since(start)
durationMS = duration.Milliseconds()
m.logger.Error("image conversion failed after build", "id", id, "error", err, "duration", duration)
errMsg := fmt.Sprintf("image conversion failed: %v", err)
m.updateBuildComplete(id, StatusFailed, nil, &errMsg, &result.Provenance, &durationMS)
if m.metrics != nil {
m.metrics.RecordBuild(buildCtx, "failed", duration)
}
return
}

// Recalculate duration to include image wait time for accurate reporting
duration = time.Since(start)
durationMS = duration.Milliseconds()

m.updateBuildComplete(id, StatusReady, &result.ImageDigest, nil, &result.Provenance, &durationMS)

// Update with image ref
@@ -303,7 +328,7 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
}

if m.metrics != nil {
m.metrics.RecordBuild(ctx, "success", duration)
m.metrics.RecordBuild(buildCtx, "success", duration)
}
}

@@ -640,6 +665,44 @@ func (m *manager) updateBuildComplete(id string, status string, digest *string,
m.notifyStatusChange(id, status)
}

// waitForImageReady polls the image manager until the build's image is ready.
// This ensures that when a build reports "ready", the image is actually usable
// for instance creation (fixes KERNEL-863 race condition).
func (m *manager) waitForImageReady(ctx context.Context, id string) error {
imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)

// Poll for up to 60 seconds (image conversion is typically fast)
const maxAttempts = 120
const pollInterval = 500 * time.Millisecond

m.logger.Debug("waiting for image to be ready", "id", id, "image_ref", imageRef)

for attempt := 0; attempt < maxAttempts; attempt++ {
select {
case <-ctx.Done():
return ctx.Err()
default:
}

img, err := m.imageManager.GetImage(ctx, imageRef)
if err == nil {
switch img.Status {
case images.StatusReady:
m.logger.Debug("image is ready", "id", id, "image_ref", imageRef, "attempts", attempt+1)
return nil
case images.StatusFailed:
return fmt.Errorf("image conversion failed")
case images.StatusPending, images.StatusPulling, images.StatusConverting:
// Still processing, continue polling
}
}
// Image not found or still processing, wait and retry
time.Sleep(pollInterval)
}

return fmt.Errorf("timeout waiting for image to be ready after %v", time.Duration(maxAttempts)*pollInterval)
}

// subscribeToStatus adds a subscriber channel for status updates on a build
func (m *manager) subscribeToStatus(buildID string, ch chan BuildEvent) {
m.subscriberMu.Lock()
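The loop in waitForImageReady observes cancellation only at the top of each iteration, so a cancelled context is noticed at most one poll interval (500ms) late; the time.Sleep between attempts is not itself interruptible. For reference, the usual context-aware alternative to a bare time.Sleep in a cancellable polling loop looks like the sketch below. This is a generic illustration, not code from this change, and the package name is arbitrary.

package buildutil

import (
	"context"
	"time"
)

// sleepCtx blocks for d or until ctx is cancelled, whichever happens first.
func sleepCtx(ctx context.Context, d time.Duration) error {
	t := time.NewTimer(d)
	defer t.Stop()
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-t.C:
		return nil
	}
}

Swapping this in would only tighten cancellation latency by at most one interval, so the plain time.Sleep used in the diff is a reasonable choice at a 500ms granularity.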
82 changes: 81 additions & 1 deletion lib/builds/manager_test.go
@@ -10,6 +10,7 @@ import (
"testing"
"time"

"github.com/kernel/hypeman/lib/images"
"github.com/kernel/hypeman/lib/instances"
"github.com/kernel/hypeman/lib/paths"
"github.com/kernel/hypeman/lib/resources"
@@ -219,8 +220,85 @@ func (m *mockSecretProvider) GetSecrets(ctx context.Context, secretIDs []string)
return make(map[string]string), nil
}

// mockImageManager implements images.Manager for testing
type mockImageManager struct {
images map[string]*images.Image
getImageErr error
}

func newMockImageManager() *mockImageManager {
return &mockImageManager{
images: make(map[string]*images.Image),
}
}

func (m *mockImageManager) ListImages(ctx context.Context) ([]images.Image, error) {
var result []images.Image
for _, img := range m.images {
result = append(result, *img)
}
return result, nil
}

func (m *mockImageManager) CreateImage(ctx context.Context, req images.CreateImageRequest) (*images.Image, error) {
img := &images.Image{
Name: req.Name,
Status: images.StatusPending,
}
m.images[req.Name] = img
return img, nil
}

func (m *mockImageManager) ImportLocalImage(ctx context.Context, repo, reference, digest string) (*images.Image, error) {
img := &images.Image{
Name: repo + ":" + reference,
Status: images.StatusReady,
}
m.images[img.Name] = img
return img, nil
}

func (m *mockImageManager) GetImage(ctx context.Context, name string) (*images.Image, error) {
if m.getImageErr != nil {
return nil, m.getImageErr
}
if img, ok := m.images[name]; ok {
return img, nil
}
return nil, images.ErrNotFound
}

func (m *mockImageManager) DeleteImage(ctx context.Context, name string) error {
delete(m.images, name)
return nil
}

func (m *mockImageManager) RecoverInterruptedBuilds() {}

func (m *mockImageManager) TotalImageBytes(ctx context.Context) (int64, error) {
return 0, nil
}

func (m *mockImageManager) TotalOCICacheBytes(ctx context.Context) (int64, error) {
return 0, nil
}

// SetImageReady sets an image to ready status for testing
func (m *mockImageManager) SetImageReady(name string) {
m.images[name] = &images.Image{
Name: name,
Status: images.StatusReady,
}
}

// Test helper to create a manager with test paths and mocks
func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, string) {
mgr, instanceMgr, volumeMgr, _, tempDir := setupTestManagerWithImageMgr(t)
return mgr, instanceMgr, volumeMgr, tempDir
}

// setupTestManagerWithImageMgr returns the image manager for tests that need it
func setupTestManagerWithImageMgr(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, *mockImageManager, string) {
t.Helper()

// Create temp directory for test data
@@ -236,6 +314,7 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
// Create mocks
instanceMgr := newMockInstanceManager()
volumeMgr := newMockVolumeManager()
imageMgr := newMockImageManager()
secretProvider := &mockSecretProvider{}

// Create config
@@ -257,13 +336,14 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
queue: NewBuildQueue(config.MaxConcurrentBuilds),
instanceManager: instanceMgr,
volumeManager: volumeMgr,
imageManager: imageMgr,
secretProvider: secretProvider,
tokenGenerator: NewRegistryTokenGenerator(config.RegistrySecret),
logger: logger,
statusSubscribers: make(map[string][]chan BuildEvent),
}

return mgr, instanceMgr, volumeMgr, tempDir
return mgr, instanceMgr, volumeMgr, imageMgr, tempDir
}

func TestCreateBuild_Success(t *testing.T) {
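The new mockImageManager and the setupTestManagerWithImageMgr helper make it possible to drive waitForImageReady directly in a test. A rough sketch of one such test follows, assuming the setup helper populates the manager's config as the production constructor does; the build ID and timeout used here are illustrative and not taken from this PR.

func TestWaitForImageReady_ImageAlreadyReady(t *testing.T) {
	mgr, _, _, imageMgr, _ := setupTestManagerWithImageMgr(t)

	// Mark the image ready under the same "<registry>/builds/<id>" ref
	// convention that runBuild and waitForImageReady use.
	id := "test-build"
	imageRef := mgr.config.RegistryURL + "/builds/" + id
	imageMgr.SetImageReady(imageRef)

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	if err := mgr.waitForImageReady(ctx, id); err != nil {
		t.Fatalf("waitForImageReady returned an error for a ready image: %v", err)
	}
}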