Skip to content

Commit 4ab27d7

Browse files
authored
Merge pull request #26 from diggerhq/feat/checkpointing
Add manual checkpoint system for sandboxes
2 parents e32e503 + 9dd6749 commit 4ab27d7

34 files changed

+4721
-258
lines changed

cmd/server/main.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -157,7 +157,7 @@ func main() {
157157
WorkerID: workerID,
158158
OnHibernate: func(sandboxID string, result *sandbox.HibernateResult) {
159159
log.Printf("opensandbox: sandbox %s auto-hibernated (key=%s, size=%d bytes)",
160-
sandboxID, result.CheckpointKey, result.SizeBytes)
160+
sandboxID, result.HibernationKey, result.SizeBytes)
161161
if opts.Store != nil {
162162
_ = opts.Store.UpdateSandboxSessionStatus(context.Background(), sandboxID, "hibernated", nil)
163163
}

cmd/worker/main.go

Lines changed: 22 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,17 @@ func main() {
5252
defer fcMgr.Close()
5353
log.Println("opensandbox-worker: Firecracker VM manager initialized")
5454

55+
// Clean up orphaned Firecracker processes + TAP devices BEFORE starting golden snapshot.
56+
// Must run first to avoid killing the golden snapshot VM (race condition).
57+
fcMgr.CleanupOrphanedProcesses()
58+
59+
// Prepare golden snapshot for fast default VM creation (~500ms vs ~2s cold boot)
60+
go func() {
61+
if err := fcMgr.PrepareGoldenSnapshot(); err != nil {
62+
log.Printf("opensandbox-worker: golden snapshot preparation failed: %v (cold boot fallback active)", err)
63+
}
64+
}()
65+
5566
// The Firecracker manager implements sandbox.Manager
5667
var mgr sandbox.Manager = fcMgr
5768

@@ -149,9 +160,6 @@ func main() {
149160
defer store.Close()
150161
log.Println("opensandbox-worker: PostgreSQL store connected (auto-wake enabled)")
151162

152-
// Kill orphaned Firecracker processes + TAP devices from previous run
153-
fcMgr.CleanupOrphanedProcesses()
154-
155163
// Local NVMe recovery: scan for sandbox data left from a previous run
156164
recoveries := fcMgr.RecoverLocalSandboxes()
157165
if len(recoveries) > 0 {
@@ -163,14 +171,14 @@ func main() {
163171
continue
164172
}
165173
if r.HasSnapshot {
166-
// Full snapshot on NVMe — create checkpoint record so doWake finds local files
167-
_, _ = store.CreateCheckpoint(ctx, r.SandboxID, session.OrgID,
174+
// Full snapshot on NVMe — create hibernation record so doWake finds local files
175+
_, _ = store.CreateHibernation(ctx, r.SandboxID, session.OrgID,
168176
"local://"+r.SandboxID, 0, session.Region, session.Template, session.Config)
169177
_ = store.UpdateSandboxSessionStatus(ctx, r.SandboxID, "hibernated", nil)
170178
snapshotCount++
171179
} else {
172-
// Workspace only — create local sentinel checkpoint for cold boot
173-
_, _ = store.CreateCheckpoint(ctx, r.SandboxID, session.OrgID,
180+
// Workspace only — create local sentinel hibernation for cold boot
181+
_, _ = store.CreateHibernation(ctx, r.SandboxID, session.OrgID,
174182
"local://"+r.SandboxID, 0, session.Region, session.Template, session.Config)
175183
_ = store.UpdateSandboxSessionStatus(ctx, r.SandboxID, "hibernated", nil)
176184
workspaceCount++
@@ -199,13 +207,13 @@ func main() {
199207
WorkerID: cfg.WorkerID,
200208
OnHibernate: func(sandboxID string, result *sandbox.HibernateResult) {
201209
log.Printf("opensandbox-worker: sandbox %s auto-hibernated (key=%s, size=%d bytes)",
202-
sandboxID, result.CheckpointKey, result.SizeBytes)
210+
sandboxID, result.HibernationKey, result.SizeBytes)
203211
if store != nil {
204-
// Create checkpoint record so wake-on-request can find it
212+
// Create hibernation record so wake-on-request can find it
205213
session, err := store.GetSandboxSession(context.Background(), sandboxID)
206214
if err == nil {
207-
_, _ = store.CreateCheckpoint(context.Background(), sandboxID, session.OrgID,
208-
result.CheckpointKey, result.SizeBytes, session.Region, session.Template, session.Config)
215+
_, _ = store.CreateHibernation(context.Background(), sandboxID, session.OrgID,
216+
result.HibernationKey, result.SizeBytes, session.Region, session.Template, session.Config)
209217
}
210218
_ = store.UpdateSandboxSessionStatus(context.Background(), sandboxID, "hibernated", nil)
211219
}
@@ -346,12 +354,12 @@ func main() {
346354
}
347355
continue
348356
}
349-
log.Printf("opensandbox-worker: hibernated %s (key=%s)", r.SandboxID, r.CheckpointKey)
357+
log.Printf("opensandbox-worker: hibernated %s (key=%s)", r.SandboxID, r.HibernationKey)
350358
if store != nil {
351359
session, err := store.GetSandboxSession(context.Background(), r.SandboxID)
352360
if err == nil {
353-
_, _ = store.CreateCheckpoint(context.Background(), r.SandboxID, session.OrgID,
354-
r.CheckpointKey, 0, session.Region, session.Template, session.Config)
361+
_, _ = store.CreateHibernation(context.Background(), r.SandboxID, session.OrgID,
362+
r.HibernationKey, 0, session.Region, session.Template, session.Config)
355363
_ = store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "hibernated", nil)
356364
}
357365
}

internal/agent/stats.go

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,10 @@ package agent
33
import (
44
"bufio"
55
"context"
6-
"fmt"
76
"os"
87
"strconv"
98
"strings"
9+
"syscall"
1010
"time"
1111

1212
pb "github.com/opensandbox/opensandbox/proto/agent"
@@ -182,14 +182,10 @@ func (s *Server) SyncFS(ctx context.Context, req *pb.SyncFSRequest) (*pb.SyncFSR
182182
return &pb.SyncFSResponse{}, nil
183183
}
184184

185-
// syncFS calls sync(2) to flush all filesystems.
185+
// syncFS calls sync(2) to flush all filesystem buffers.
186+
// This syncs ALL mounted filesystems (rootfs + workspace), ensuring dirty pages
187+
// are written to their backing ext4 images before snapshot/checkpoint.
186188
func syncFS() error {
187-
f, err := os.Open("/")
188-
if err != nil {
189-
return fmt.Errorf("open /: %w", err)
190-
}
191-
defer f.Close()
192-
// sync(2) — we use SyncFileRange isn't available on all filesystems,
193-
// just use Sync on the root fd which triggers global sync
194-
return f.Sync()
189+
syscall.Sync()
190+
return nil
195191
}

internal/api/commands.go

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,24 @@ func (s *Server) runCommand(c echo.Context) error {
7777
}
7878

7979
func (s *Server) runCommandRemote(c echo.Context, sandboxID string) error {
80+
// Wait for sandbox if it's being created asynchronously
81+
if v, ok := s.pendingCreates.Load(sandboxID); ok {
82+
pending := v.(*pendingCreate)
83+
select {
84+
case <-pending.ready:
85+
if pending.err != nil {
86+
return c.JSON(http.StatusInternalServerError, map[string]string{
87+
"error": "sandbox creation failed: " + pending.err.Error(),
88+
})
89+
}
90+
s.pendingCreates.Delete(sandboxID)
91+
case <-c.Request().Context().Done():
92+
return c.JSON(http.StatusGatewayTimeout, map[string]string{
93+
"error": "timed out waiting for sandbox creation",
94+
})
95+
}
96+
}
97+
8098
if s.store == nil {
8199
return c.JSON(http.StatusServiceUnavailable, map[string]string{
82100
"error": "database not configured",

internal/api/dashboard.go

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -708,14 +708,14 @@ func (s *Server) dashboardGetSession(c echo.Context) error {
708708
}
709709
}
710710

711-
// If hibernated, include checkpoint info
711+
// If hibernated, include hibernation info
712712
if session.Status == "hibernated" {
713-
checkpoint, err := s.store.GetActiveCheckpoint(c.Request().Context(), sandboxID)
713+
hibernation, err := s.store.GetActiveHibernation(c.Request().Context(), sandboxID)
714714
if err == nil {
715-
resp["checkpoint"] = map[string]interface{}{
716-
"checkpointKey": checkpoint.CheckpointKey,
717-
"sizeBytes": checkpoint.SizeBytes,
718-
"hibernatedAt": checkpoint.HibernatedAt,
715+
resp["hibernation"] = map[string]interface{}{
716+
"hibernationKey": hibernation.HibernationKey,
717+
"sizeBytes": hibernation.SizeBytes,
718+
"hibernatedAt": hibernation.HibernatedAt,
719719
}
720720
}
721721
}

internal/api/router.go

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ import (
77
"os"
88
"path/filepath"
99
"strings"
10+
"sync"
1011

1112
"github.com/labstack/echo/v4"
1213
"github.com/labstack/echo/v4/middleware"
@@ -43,6 +44,13 @@ type Server struct {
4344
sandboxDomain string // base domain for sandbox subdomains
4445
ecrConfig *ecr.Config // nil if ECR not configured
4546
cfClient *cloudflare.Client // nil if Cloudflare not configured
47+
pendingCreates sync.Map // map[sandboxID]*pendingCreate — async sandbox creation tracking
48+
}
49+
50+
// pendingCreate tracks an async sandbox creation.
51+
type pendingCreate struct {
52+
ready chan struct{} // closed when creation completes
53+
err error // set before closing ready
4654
}
4755

4856
// ServerOpts holds optional dependencies for the API server.
@@ -127,6 +135,13 @@ func NewServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, apiKey string, o
127135
api.POST("/sandboxes/:id/hibernate", s.hibernateSandbox)
128136
api.POST("/sandboxes/:id/wake", s.wakeSandbox)
129137

138+
// Checkpoints
139+
api.POST("/sandboxes/:id/checkpoints", s.createCheckpoint)
140+
api.GET("/sandboxes/:id/checkpoints", s.listCheckpoints)
141+
api.POST("/sandboxes/:id/checkpoints/:checkpointId/restore", s.restoreCheckpoint)
142+
api.POST("/sandboxes/from-checkpoint/:checkpointId", s.createFromCheckpoint)
143+
api.DELETE("/sandboxes/:id/checkpoints/:checkpointId", s.deleteCheckpoint)
144+
130145
// Preview URLs (on-demand port-based)
131146
api.POST("/sandboxes/:id/preview", s.createPreviewURL)
132147
api.GET("/sandboxes/:id/preview", s.listPreviewURLs)

0 commit comments

Comments
 (0)