Skip to content

Commit 81c03b9

Browse files
authored
fix: wait for image conversion before reporting build ready (KERNEL-863) (#65)
* fix: wait for image conversion before reporting build ready (KERNEL-863) Fixes a race condition where build status would transition to "ready" before the image conversion completed, causing instance creation to fail. The registry's triggerConversion() runs asynchronously after returning 201 to the builder. This meant the builder could report success and the build manager would set status="ready" while image conversion was still in progress. Changes: - Add imageManager dependency to build manager - Add waitForImageReady() that polls until image status is "ready" - Call waitForImageReady() before setting build to "ready" status - If image conversion fails/times out, mark build as failed * fix: use buildCtx for image wait and recalculate duration Addresses PR review feedback: 1. Use buildCtx instead of ctx for waitForImageReady to respect build timeout during image conversion wait 2. Recalculate duration after waitForImageReady completes to accurately report total build time including image conversion
1 parent 57b83c1 commit 81c03b9

File tree

5 files changed

+403
-5
lines changed

5 files changed

+403
-5
lines changed

cmd/api/wire_gen.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/builds/manager.go

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ type manager struct {
8888
queue *BuildQueue
8989
instanceManager instances.Manager
9090
volumeManager volumes.Manager
91+
imageManager images.Manager
9192
secretProvider SecretProvider
9293
tokenGenerator *RegistryTokenGenerator
9394
logger *slog.Logger
@@ -105,6 +106,7 @@ func NewManager(
105106
config Config,
106107
instanceMgr instances.Manager,
107108
volumeMgr volumes.Manager,
109+
imageMgr images.Manager,
108110
secretProvider SecretProvider,
109111
logger *slog.Logger,
110112
meter metric.Meter,
@@ -119,6 +121,7 @@ func NewManager(
119121
queue: NewBuildQueue(config.MaxConcurrentBuilds),
120122
instanceManager: instanceMgr,
121123
volumeManager: volumeMgr,
124+
imageManager: imageMgr,
122125
secretProvider: secretProvider,
123126
tokenGenerator: NewRegistryTokenGenerator(config.RegistrySecret),
124127
logger: logger,
@@ -294,6 +297,28 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
294297

295298
m.logger.Info("build succeeded", "id", id, "digest", result.ImageDigest, "duration", duration)
296299
imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)
300+
301+
// Wait for image to be ready before reporting build as complete.
302+
// This fixes the race condition (KERNEL-863) where build reports "ready"
303+
// but image conversion hasn't finished yet.
304+
// Use buildCtx to respect the build timeout during image wait.
305+
if err := m.waitForImageReady(buildCtx, id); err != nil {
306+
// Recalculate duration to include image wait time
307+
duration = time.Since(start)
308+
durationMS = duration.Milliseconds()
309+
m.logger.Error("image conversion failed after build", "id", id, "error", err, "duration", duration)
310+
errMsg := fmt.Sprintf("image conversion failed: %v", err)
311+
m.updateBuildComplete(id, StatusFailed, nil, &errMsg, &result.Provenance, &durationMS)
312+
if m.metrics != nil {
313+
m.metrics.RecordBuild(buildCtx, "failed", duration)
314+
}
315+
return
316+
}
317+
318+
// Recalculate duration to include image wait time for accurate reporting
319+
duration = time.Since(start)
320+
durationMS = duration.Milliseconds()
321+
297322
m.updateBuildComplete(id, StatusReady, &result.ImageDigest, nil, &result.Provenance, &durationMS)
298323

299324
// Update with image ref
@@ -303,7 +328,7 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
303328
}
304329

305330
if m.metrics != nil {
306-
m.metrics.RecordBuild(ctx, "success", duration)
331+
m.metrics.RecordBuild(buildCtx, "success", duration)
307332
}
308333
}
309334

@@ -640,6 +665,44 @@ func (m *manager) updateBuildComplete(id string, status string, digest *string,
640665
m.notifyStatusChange(id, status)
641666
}
642667

668+
// waitForImageReady polls the image manager until the build's image is ready.
669+
// This ensures that when a build reports "ready", the image is actually usable
670+
// for instance creation (fixes KERNEL-863 race condition).
671+
func (m *manager) waitForImageReady(ctx context.Context, id string) error {
672+
imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)
673+
674+
// Poll for up to 60 seconds (image conversion is typically fast)
675+
const maxAttempts = 120
676+
const pollInterval = 500 * time.Millisecond
677+
678+
m.logger.Debug("waiting for image to be ready", "id", id, "image_ref", imageRef)
679+
680+
for attempt := 0; attempt < maxAttempts; attempt++ {
681+
select {
682+
case <-ctx.Done():
683+
return ctx.Err()
684+
default:
685+
}
686+
687+
img, err := m.imageManager.GetImage(ctx, imageRef)
688+
if err == nil {
689+
switch img.Status {
690+
case images.StatusReady:
691+
m.logger.Debug("image is ready", "id", id, "image_ref", imageRef, "attempts", attempt+1)
692+
return nil
693+
case images.StatusFailed:
694+
return fmt.Errorf("image conversion failed")
695+
case images.StatusPending, images.StatusPulling, images.StatusConverting:
696+
// Still processing, continue polling
697+
}
698+
}
699+
// Image not found or still processing, wait and retry
700+
time.Sleep(pollInterval)
701+
}
702+
703+
return fmt.Errorf("timeout waiting for image to be ready after %v", time.Duration(maxAttempts)*pollInterval)
704+
}
705+
643706
// subscribeToStatus adds a subscriber channel for status updates on a build
644707
func (m *manager) subscribeToStatus(buildID string, ch chan BuildEvent) {
645708
m.subscriberMu.Lock()

lib/builds/manager_test.go

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"testing"
1111
"time"
1212

13+
"github.com/kernel/hypeman/lib/images"
1314
"github.com/kernel/hypeman/lib/instances"
1415
"github.com/kernel/hypeman/lib/paths"
1516
"github.com/kernel/hypeman/lib/resources"
@@ -219,8 +220,85 @@ func (m *mockSecretProvider) GetSecrets(ctx context.Context, secretIDs []string)
219220
return make(map[string]string), nil
220221
}
221222

223+
// mockImageManager implements images.Manager for testing
224+
type mockImageManager struct {
225+
images map[string]*images.Image
226+
getImageErr error
227+
}
228+
229+
func newMockImageManager() *mockImageManager {
230+
return &mockImageManager{
231+
images: make(map[string]*images.Image),
232+
}
233+
}
234+
235+
func (m *mockImageManager) ListImages(ctx context.Context) ([]images.Image, error) {
236+
var result []images.Image
237+
for _, img := range m.images {
238+
result = append(result, *img)
239+
}
240+
return result, nil
241+
}
242+
243+
func (m *mockImageManager) CreateImage(ctx context.Context, req images.CreateImageRequest) (*images.Image, error) {
244+
img := &images.Image{
245+
Name: req.Name,
246+
Status: images.StatusPending,
247+
}
248+
m.images[req.Name] = img
249+
return img, nil
250+
}
251+
252+
func (m *mockImageManager) ImportLocalImage(ctx context.Context, repo, reference, digest string) (*images.Image, error) {
253+
img := &images.Image{
254+
Name: repo + ":" + reference,
255+
Status: images.StatusReady,
256+
}
257+
m.images[img.Name] = img
258+
return img, nil
259+
}
260+
261+
func (m *mockImageManager) GetImage(ctx context.Context, name string) (*images.Image, error) {
262+
if m.getImageErr != nil {
263+
return nil, m.getImageErr
264+
}
265+
if img, ok := m.images[name]; ok {
266+
return img, nil
267+
}
268+
return nil, images.ErrNotFound
269+
}
270+
271+
func (m *mockImageManager) DeleteImage(ctx context.Context, name string) error {
272+
delete(m.images, name)
273+
return nil
274+
}
275+
276+
func (m *mockImageManager) RecoverInterruptedBuilds() {}
277+
278+
func (m *mockImageManager) TotalImageBytes(ctx context.Context) (int64, error) {
279+
return 0, nil
280+
}
281+
282+
func (m *mockImageManager) TotalOCICacheBytes(ctx context.Context) (int64, error) {
283+
return 0, nil
284+
}
285+
286+
// SetImageReady sets an image to ready status for testing
287+
func (m *mockImageManager) SetImageReady(name string) {
288+
m.images[name] = &images.Image{
289+
Name: name,
290+
Status: images.StatusReady,
291+
}
292+
}
293+
222294
// Test helper to create a manager with test paths and mocks
223295
func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, string) {
296+
mgr, instanceMgr, volumeMgr, _, tempDir := setupTestManagerWithImageMgr(t)
297+
return mgr, instanceMgr, volumeMgr, tempDir
298+
}
299+
300+
// setupTestManagerWithImageMgr returns the image manager for tests that need it
301+
func setupTestManagerWithImageMgr(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, *mockImageManager, string) {
224302
t.Helper()
225303

226304
// Create temp directory for test data
@@ -236,6 +314,7 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
236314
// Create mocks
237315
instanceMgr := newMockInstanceManager()
238316
volumeMgr := newMockVolumeManager()
317+
imageMgr := newMockImageManager()
239318
secretProvider := &mockSecretProvider{}
240319

241320
// Create config
@@ -257,13 +336,14 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
257336
queue: NewBuildQueue(config.MaxConcurrentBuilds),
258337
instanceManager: instanceMgr,
259338
volumeManager: volumeMgr,
339+
imageManager: imageMgr,
260340
secretProvider: secretProvider,
261341
tokenGenerator: NewRegistryTokenGenerator(config.RegistrySecret),
262342
logger: logger,
263343
statusSubscribers: make(map[string][]chan BuildEvent),
264344
}
265345

266-
return mgr, instanceMgr, volumeMgr, tempDir
346+
return mgr, instanceMgr, volumeMgr, imageMgr, tempDir
267347
}
268348

269349
func TestCreateBuild_Success(t *testing.T) {

0 commit comments

Comments
 (0)