Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
efa9dec
poc: add build system for source-to-image builds
hiroTamada Dec 18, 2025
bd81a39
fix: complete build system E2E functionality
hiroTamada Dec 19, 2025
55d96ab
docs: add builder images guide
hiroTamada Dec 19, 2025
8ff9678
docs: add build system roadmap and security model
hiroTamada Dec 19, 2025
b93e3ba
modify plans
hiroTamada Dec 22, 2025
a86ea65
fix(builds): correct vsock communication pattern for Cloud Hypervisor
hiroTamada Jan 5, 2026
b1bc4ac
fix(e2e): update e2e-build-test.sh for current config
hiroTamada Jan 5, 2026
5663636
chore(e2e): update test script for generic builder system
hiroTamada Jan 6, 2026
f0dd924
feat(builds): implement generic builder with registry token auth
hiroTamada Jan 7, 2026
3492001
docs(builds): update README with registry token auth and generic builder
hiroTamada Jan 7, 2026
de373f7
chore: remove trailing newlines
hiroTamada Jan 7, 2026
53a8f08
Merge main into feat/build-system
hiroTamada Jan 7, 2026
bc517e1
fix(images): support both Docker v2 and OCI v1 manifest formats
hiroTamada Jan 7, 2026
95b0437
fix: ensure volume cleanup succeeds after build timeout
hiroTamada Jan 7, 2026
a1b5750
cursor comment
hiroTamada Jan 7, 2026
e7b6a7f
fix: security and reliability improvements for build system
hiroTamada Jan 7, 2026
8bafdbe
Remove deprecated runtime code and add security fixes
hiroTamada Jan 8, 2026
31bdde6
Fix E2E test to use /events endpoint instead of /logs
hiroTamada Jan 8, 2026
57c4442
Fix E2E test and add OCI media types to builder output
hiroTamada Jan 8, 2026
f03c3ca
Fix E2E script output handling for image import
hiroTamada Jan 8, 2026
5e40a64
feat(builds): implement SSE streaming for build events
hiroTamada Jan 8, 2026
efbe9dd
feat(builds): implement build secrets via vsock
hiroTamada Jan 8, 2026
21e7e94
feat(config): add BUILD_SECRETS_DIR configuration
hiroTamada Jan 8, 2026
b9c5567
fix(builds): fix vsock protocol deadlock and add secrets API support
hiroTamada Jan 8, 2026
1789dcb
docs: update TODO with vsock protocol fix details
hiroTamada Jan 8, 2026
f95b1eb
docs: document cgroup requirement for BuildKit secrets
hiroTamada Jan 8, 2026
cf94ff0
docs: add detailed cgroup analysis for BuildKit secrets
hiroTamada Jan 8, 2026
0a7e047
feat(builds): add guest-agent to builder VMs for exec debugging
hiroTamada Jan 8, 2026
3a89b1b
fix(e2e): fix state comparison and image name matching in E2E test
hiroTamada Jan 8, 2026
1db9fec
fix(registry): preserve registry host in image names when triggering …
hiroTamada Jan 8, 2026
2ab35a2
docs: clean up TODO.md - remove completed tasks
hiroTamada Jan 8, 2026
39e2e5b
docs: remove 'Keep Failed Builders' from TODO and delete PLAN.md
hiroTamada Jan 8, 2026
99a38c4
fix(tests): update registry tests to use full host in image names
hiroTamada Jan 8, 2026
983de98
feat(init): add cgroup2 mount for BuildKit/runc support
hiroTamada Jan 9, 2026
21f82d4
docs: update TODO.md with verified cgroup2 implementation
hiroTamada Jan 9, 2026
61f5051
chore: clean up TODO after cgroup2 implementation verified
hiroTamada Jan 9, 2026
6456b2b
fix(builds): restore missing BuildEvent type definition
hiroTamada Jan 9, 2026
f09e53d
test: add SKIP_DOCKER_HUB_TESTS env var to skip rate-limited tests
hiroTamada Jan 9, 2026
1967d49
Revert "test: add SKIP_DOCKER_HUB_TESTS env var to skip rate-limited …
hiroTamada Jan 9, 2026
520cd0d
fix: address cursor bot review comments from PR #53
hiroTamada Jan 9, 2026
2bded30
fix: make TestCreateImage_Idempotent resilient to timing variations
hiroTamada Jan 9, 2026
8c803c1
moar
hiroTamada Jan 9, 2026
528cf8d
moar
hiroTamada Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,17 @@ test: ensure-ch-binaries ensure-caddy-binaries build-embedded
gen-jwt: $(GODOTENV)
@$(GODOTENV) -f .env go run ./cmd/gen-jwt -user-id $${USER_ID:-test-user}

# Build the generic builder image for builds
build-builder:
docker build -t hypeman/builder:latest -f lib/builds/images/generic/Dockerfile .

# Alias for backwards compatibility
build-builders: build-builder

# Run E2E build system test (requires server running: make dev)
e2e-build-test:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The build-builders alias seems premature if there's only one builder image—consider removing it unless there's a concrete plan for multiple builders soon.

@./scripts/e2e-build-test.sh

# Clean generated files and binaries
clean:
rm -rf $(BIN_DIR)
Expand Down
4 changes: 4 additions & 0 deletions cmd/api/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package api

import (
"github.com/onkernel/hypeman/cmd/api/config"
"github.com/onkernel/hypeman/lib/builds"
"github.com/onkernel/hypeman/lib/devices"
"github.com/onkernel/hypeman/lib/images"
"github.com/onkernel/hypeman/lib/ingress"
Expand All @@ -21,6 +22,7 @@ type ApiService struct {
NetworkManager network.Manager
DeviceManager devices.Manager
IngressManager ingress.Manager
BuildManager builds.Manager
ResourceManager *resources.Manager
}

Expand All @@ -35,6 +37,7 @@ func New(
networkManager network.Manager,
deviceManager devices.Manager,
ingressManager ingress.Manager,
buildManager builds.Manager,
resourceManager *resources.Manager,
) *ApiService {
return &ApiService{
Expand All @@ -45,6 +48,7 @@ func New(
NetworkManager: networkManager,
DeviceManager: deviceManager,
IngressManager: ingressManager,
BuildManager: buildManager,
ResourceManager: resourceManager,
}
}
313 changes: 313 additions & 0 deletions cmd/api/api/builds.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
package api

import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"

"github.com/onkernel/hypeman/lib/builds"
"github.com/onkernel/hypeman/lib/logger"
"github.com/onkernel/hypeman/lib/oapi"
)

// ListBuilds returns every build reported by the build manager, converted
// to its API representation.
//
// A manager failure is logged and surfaced as a 500 JSON response; the Go
// error return is always nil.
func (s *ApiService) ListBuilds(ctx context.Context, request oapi.ListBuildsRequestObject) (oapi.ListBuildsResponseObject, error) {
	log := logger.FromContext(ctx)

	all, err := s.BuildManager.ListBuilds(ctx)
	if err != nil {
		log.ErrorContext(ctx, "failed to list builds", "error", err)
		resp := oapi.ListBuilds500JSONResponse{
			Code:    "internal_error",
			Message: "failed to list builds",
		}
		return resp, nil
	}

	// Pre-sized and non-nil, so an empty result still encodes as a JSON array.
	converted := make([]oapi.Build, 0, len(all))
	for _, domainBuild := range all {
		converted = append(converted, buildToOAPI(domainBuild))
	}

	return oapi.ListBuilds200JSONResponse(converted), nil
}

// CreateBuild creates a new build job
func (s *ApiService) CreateBuild(ctx context.Context, request oapi.CreateBuildRequestObject) (oapi.CreateBuildResponseObject, error) {
log := logger.FromContext(ctx)

// Parse multipart form fields
var sourceData []byte
var baseImageDigest, cacheScope, dockerfile string
var timeoutSeconds int
var secrets []builds.SecretRef

for {
part, err := request.Body.NextPart()
if err == io.EOF {
break
}
if err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "failed to parse multipart form",
}, nil
}

switch part.FormName() {
case "source":
sourceData, err = io.ReadAll(part)
if err != nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The multipart parsing loop has no size limit on sourceData—could be a DoS vector if someone uploads a huge file. Should there be a max size check?

return oapi.CreateBuild400JSONResponse{
Code: "invalid_source",
Message: "failed to read source data",
}, nil
}
case "base_image_digest":
data, err := io.ReadAll(part)
if err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "failed to read base_image_digest field",
}, nil
}
baseImageDigest = string(data)
case "cache_scope":
data, err := io.ReadAll(part)
if err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "failed to read cache_scope field",
}, nil
}
cacheScope = string(data)
case "dockerfile":
data, err := io.ReadAll(part)
if err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "failed to read dockerfile field",
}, nil
}
dockerfile = string(data)
case "timeout_seconds":
data, err := io.ReadAll(part)
if err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "failed to read timeout_seconds field",
}, nil
}
if v, err := strconv.Atoi(string(data)); err == nil {
timeoutSeconds = v
}
case "secrets":
data, err := io.ReadAll(part)
if err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "failed to read secrets field",
}, nil
}
if err := json.Unmarshal(data, &secrets); err != nil {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "secrets must be a JSON array of {\"id\": \"...\", \"env_var\": \"...\"} objects",
}, nil
}
}
part.Close()
}

if len(sourceData) == 0 {
return oapi.CreateBuild400JSONResponse{
Code: "invalid_request",
Message: "source is required",
}, nil
}

// Note: Dockerfile validation happens in the builder agent.
// It will check if Dockerfile is in the source tarball or provided via dockerfile parameter.

// Build domain request
domainReq := builds.CreateBuildRequest{
BaseImageDigest: baseImageDigest,
CacheScope: cacheScope,
Dockerfile: dockerfile,
Secrets: secrets,
}

// Apply timeout if provided
if timeoutSeconds > 0 {
domainReq.BuildPolicy = &builds.BuildPolicy{
TimeoutSeconds: timeoutSeconds,
}
}

build, err := s.BuildManager.CreateBuild(ctx, domainReq, sourceData)
if err != nil {
switch {
case errors.Is(err, builds.ErrDockerfileRequired):
return oapi.CreateBuild400JSONResponse{
Code: "dockerfile_required",
Message: err.Error(),
}, nil
case errors.Is(err, builds.ErrInvalidSource):
return oapi.CreateBuild400JSONResponse{
Code: "invalid_source",
Message: err.Error(),
}, nil
default:
log.ErrorContext(ctx, "failed to create build", "error", err)
return oapi.CreateBuild500JSONResponse{
Code: "internal_error",
Message: "failed to create build",
}, nil
}
}

return oapi.CreateBuild202JSONResponse(buildToOAPI(build)), nil
}

// GetBuild looks up a single build by ID and returns it in API form.
//
// An ErrNotFound from the manager becomes a 404; any other failure is
// logged and reported as a 500. The Go error return is always nil.
func (s *ApiService) GetBuild(ctx context.Context, request oapi.GetBuildRequestObject) (oapi.GetBuildResponseObject, error) {
	log := logger.FromContext(ctx)

	found, err := s.BuildManager.GetBuild(ctx, request.Id)
	switch {
	case err == nil:
		return oapi.GetBuild200JSONResponse(buildToOAPI(found)), nil

	case errors.Is(err, builds.ErrNotFound):
		return oapi.GetBuild404JSONResponse{
			Code:    "not_found",
			Message: "build not found",
		}, nil

	default:
		log.ErrorContext(ctx, "failed to get build", "error", err, "id", request.Id)
		return oapi.GetBuild500JSONResponse{
			Code:    "internal_error",
			Message: "failed to get build",
		}, nil
	}
}

// CancelBuild cancels a build
func (s *ApiService) CancelBuild(ctx context.Context, request oapi.CancelBuildRequestObject) (oapi.CancelBuildResponseObject, error) {
log := logger.FromContext(ctx)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This returns 409 for ErrBuildInProgress with message "already in progress"—but usually you'd cancel a build because it's in progress. Is this the right semantic? Should this case allow cancellation, or is the error name/message misleading?

err := s.BuildManager.CancelBuild(ctx, request.Id)
if err != nil {
switch {
case errors.Is(err, builds.ErrNotFound):
return oapi.CancelBuild404JSONResponse{
Code: "not_found",
Message: "build not found",
}, nil
case errors.Is(err, builds.ErrBuildInProgress):
return oapi.CancelBuild409JSONResponse{
Code: "conflict",
Message: "build already in progress",
}, nil
default:
log.ErrorContext(ctx, "failed to cancel build", "error", err, "id", request.Id)
return oapi.CancelBuild500JSONResponse{
Code: "internal_error",
Message: "failed to cancel build",
}, nil
}
}

return oapi.CancelBuild204Response{}, nil
}

// GetBuildEvents streams build events via SSE
// With follow=false (default), streams existing logs then closes
// With follow=true, continues streaming until build completes
func (s *ApiService) GetBuildEvents(ctx context.Context, request oapi.GetBuildEventsRequestObject) (oapi.GetBuildEventsResponseObject, error) {
log := logger.FromContext(ctx)

// Parse follow parameter (default false)
follow := false
if request.Params.Follow != nil {
follow = *request.Params.Follow
}

eventChan, err := s.BuildManager.StreamBuildEvents(ctx, request.Id, follow)
if err != nil {
if errors.Is(err, builds.ErrNotFound) {
return oapi.GetBuildEvents404JSONResponse{
Code: "not_found",
Message: "build not found",
}, nil
}
log.ErrorContext(ctx, "failed to stream build events", "error", err, "id", request.Id)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If json.Marshal(event) fails here, it silently continues. Might be worth logging this error.

return oapi.GetBuildEvents500JSONResponse{
Code: "internal_error",
Message: "failed to stream build events",
}, nil
}

return buildEventsStreamResponse{eventChan: eventChan}, nil
}

// buildEventsStreamResponse implements oapi.GetBuildEventsResponseObject with proper SSE streaming
type buildEventsStreamResponse struct {
eventChan <-chan builds.BuildEvent
}

func (r buildEventsStreamResponse) VisitGetBuildEventsResponse(w http.ResponseWriter) error {
w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache")
w.Header().Set("Connection", "keep-alive")
w.Header().Set("X-Accel-Buffering", "no") // Disable nginx buffering
w.WriteHeader(200)

flusher, ok := w.(http.Flusher)
if !ok {
return fmt.Errorf("streaming not supported")
}

for event := range r.eventChan {
jsonEvent, err := json.Marshal(event)
if err != nil {
continue
}
fmt.Fprintf(w, "data: %s\n\n", jsonEvent)
flusher.Flush()
}
return nil
}

// buildToOAPI maps a domain Build onto the generated oapi.Build type.
//
// Scalar fields are copied one-to-one. Provenance is attached only when the
// domain build has it, and its pointer fields refer to the domain
// provenance's own fields rather than to copies. LockfileHashes is left
// unset when the domain value is empty.
func buildToOAPI(b *builds.Build) oapi.Build {
	out := oapi.Build{
		Id:            b.ID,
		Status:        oapi.BuildStatus(b.Status),
		QueuePosition: b.QueuePosition,
		ImageDigest:   b.ImageDigest,
		ImageRef:      b.ImageRef,
		Error:         b.Error,
		CreatedAt:     b.CreatedAt,
		StartedAt:     b.StartedAt,
		CompletedAt:   b.CompletedAt,
		DurationMs:    b.DurationMS,
	}

	prov := b.Provenance
	if prov == nil {
		return out
	}

	apiProv := &oapi.BuildProvenance{
		BaseImageDigest: &prov.BaseImageDigest,
		SourceHash:      &prov.SourceHash,
		BuildkitVersion: &prov.BuildkitVersion,
		Timestamp:       &prov.Timestamp,
	}
	if len(prov.LockfileHashes) != 0 {
		apiProv.LockfileHashes = &prov.LockfileHashes
	}
	out.Provenance = apiProv

	return out
}

14 changes: 11 additions & 3 deletions cmd/api/api/images_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,17 @@ func TestCreateImage_Idempotent(t *testing.T) {
t.Fatal("Build failed - this is the root cause of test failures")
}

require.Equal(t, oapi.ImageStatus(images.StatusPending), img2.Status)
require.NotNil(t, img2.QueuePosition, "should have queue position")
require.Equal(t, 1, *img2.QueuePosition, "should still be at position 1")
// Status can be "pending" (still processing) or "ready" (already completed in fast CI)
// The key idempotency invariant is that the digest is the same (verified above)
require.Contains(t, []oapi.ImageStatus{
oapi.ImageStatus(images.StatusPending),
oapi.ImageStatus(images.StatusReady),
}, img2.Status, "status should be pending or ready")

// If still pending, should have queue position
if img2.Status == oapi.ImageStatus(images.StatusPending) {
require.NotNil(t, img2.QueuePosition, "should have queue position when pending")
}

// Construct digest reference: repository@digest
// Extract repository from imageName (strip tag part)
Expand Down
Loading