Skip to content
This repository was archived by the owner on Jan 2, 2026. It is now read-only.

Commit bd0d67c

Browse files
alexisbouchez and claude committed
feat: add start-from-snapshot API for fast cold starts
Add the ability to start an instance by restoring from a snapshot instead of cold booting. This is the foundation for sub-100ms cold starts for AI sandbox workloads.

New API endpoint:
- POST /instances/{id}/start-from-snapshot

The implementation:
- Copies the snapshot from global storage to the jail
- Calls CloudHypervisor restore instead of create+boot
- Resumes the VM

Known limitation: CloudHypervisor snapshots contain hardcoded device paths (rootfs, etc.), so restoring to a different instance requires the same device paths. This will be addressed in a follow-up.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent fc27880 commit bd0d67c

File tree

11 files changed

+279
-0
lines changed

11 files changed

+279
-0
lines changed

core/daemon/daemon.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ type Daemon interface {
4646
ListInstances(ctx context.Context) ([]instance.Instance, error)
4747
DestroyInstance(ctx context.Context, id string) error
4848
StartInstance(ctx context.Context, id string) error
49+
// StartInstanceFromSnapshot starts an instance by restoring from a snapshot (fast cold start)
50+
StartInstanceFromSnapshot(ctx context.Context, id string, globalSnapshotPath, jailSnapshotPath string) error
4951
StopInstance(ctx context.Context, id string, opt *api.StopConfig) error
5052
InstanceExec(ctx context.Context, id string, cmd []string, timeout time.Duration) (*api.ExecResult, error)
5153
GetInstanceLogs(ctx context.Context, id string) ([]*api.LogEntry, error)

core/daemon/images.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ type Runtime interface {
2222
GetInstance(id string) (*instance.Instance, error)
2323
ListInstances() []instance.Instance
2424
StartInstance(ctx context.Context, id string) error
25+
// StartInstanceFromSnapshot starts an instance by restoring from a snapshot (fast cold start)
26+
StartInstanceFromSnapshot(ctx context.Context, id string, globalSnapshotPath, jailSnapshotPath string) error
2527
StopInstance(ctx context.Context, id string, opt *api.StopConfig) error
2628
GetInstanceLogs(id string) ([]*api.LogEntry, error)
2729
SubscribeToInstanceLogs(ctx context.Context, id string) ([]*api.LogEntry, <-chan *api.LogEntry, error)

raveld/client/daemon.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"net"
88
"net/http"
99
"net/url"
10+
"strings"
1011
"time"
1112

1213
"github.com/valyentdev/ravel/api"
@@ -94,6 +95,19 @@ func (a *DaemonClient) StartInstance(ctx context.Context, id string) error {
9495
return nil
9596
}
9697

98+
// StartInstanceFromSnapshot starts an instance by restoring from a snapshot.
99+
// This enables sub-100ms cold starts for AI sandbox workloads.
100+
// Note: The API only needs snapshotId, the server calculates the paths
101+
func (a *DaemonClient) StartInstanceFromSnapshot(ctx context.Context, id string, globalSnapshotPath, jailSnapshotPath string) error {
102+
// Extract snapshot ID from the path (last component)
103+
parts := strings.Split(globalSnapshotPath, "/")
104+
snapshotId := parts[len(parts)-1]
105+
body := struct {
106+
SnapshotId string `json:"snapshot_id"`
107+
}{SnapshotId: snapshotId}
108+
return a.client.Post(ctx, "/instances/"+id+"/start-from-snapshot", nil, httpclient.WithJSONBody(&body))
109+
}
110+
97111
func (a *DaemonClient) StopInstance(ctx context.Context, id string, opt *api.StopConfig) error {
98112
err := a.client.Post(ctx, "/instances/"+id+"/stop", nil, httpclient.WithJSONBody(opt))
99113
if err != nil {

raveld/runtime.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ func (s *Daemon) StartInstance(ctx context.Context, id string) error {
4747
return s.runtime.StartInstance(ctx, id)
4848
}
4949

50+
// StartInstanceFromSnapshot starts an instance by restoring from a snapshot.
51+
// This enables sub-100ms cold starts for AI sandbox workloads.
52+
func (s *Daemon) StartInstanceFromSnapshot(ctx context.Context, id string, globalSnapshotPath, jailSnapshotPath string) error {
53+
return s.runtime.StartInstanceFromSnapshot(ctx, id, globalSnapshotPath, jailSnapshotPath)
54+
}
55+
5056
func (s *Daemon) DestroyInstance(ctx context.Context, id string) error {
5157
instance, err := s.runtime.GetInstance(id)
5258
if err != nil {

raveld/server/instances.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,29 @@ func (s *DaemonServer) startInstance(ctx context.Context, req *StartInstanceRequ
9393
return &StartInstanceResponse{}, nil
9494
}
9595

96+
// StartInstanceFromSnapshotRequest is the request for starting an instance from a snapshot
97+
type StartInstanceFromSnapshotRequest struct {
98+
Id string `path:"id"`
99+
Body struct {
100+
SnapshotId string `json:"snapshot_id" required:"true" doc:"Snapshot ID to restore from"`
101+
}
102+
}
103+
104+
type StartInstanceFromSnapshotResponse struct {
105+
}
106+
107+
func (s *DaemonServer) startInstanceFromSnapshot(ctx context.Context, req *StartInstanceFromSnapshotRequest) (*StartInstanceFromSnapshotResponse, error) {
108+
// Global snapshot path and jail-relative path
109+
globalSnapshotPath := "/var/lib/ravel/global-snapshots/" + req.Id + "/" + req.Body.SnapshotId
110+
jailSnapshotPath := "/snapshots/" + req.Body.SnapshotId
111+
err := s.daemon.StartInstanceFromSnapshot(ctx, req.Id, globalSnapshotPath, jailSnapshotPath)
112+
if err != nil {
113+
s.log("Failed to start instance from snapshot", err)
114+
return nil, err
115+
}
116+
return &StartInstanceFromSnapshotResponse{}, nil
117+
}
118+
96119
type StopInstanceRequest struct {
97120
Id string `path:"id"`
98121
Body *api.StopConfig `required:"false"`

raveld/server/routes.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ func (s DaemonServer) registerEndpoints(mux humago.Mux) {
6161
Method: http.MethodPost,
6262
}, s.startInstance)
6363

64+
huma.Register(api, huma.Operation{
65+
OperationID: "startInstanceFromSnapshot",
66+
Path: "/instances/{id}/start-from-snapshot",
67+
Method: http.MethodPost,
68+
Summary: "Start an instance by restoring from a snapshot (fast cold start)",
69+
Tags: []string{"sandbox"},
70+
}, s.startInstanceFromSnapshot)
71+
6472
huma.Register(api, huma.Operation{
6573
OperationID: "stopInstance",
6674
Path: "/instances/{id}/stop",

runtime/drivers/driver.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ import (
1111

1212
type InstanceTask interface {
1313
Start(ctx context.Context) error
14+
// StartFromSnapshot starts the VM by restoring from a snapshot instead of cold booting
15+
// globalSnapshotPath is the source on the host, jailSnapshotPath is the jail-relative destination
16+
StartFromSnapshot(ctx context.Context, globalSnapshotPath, jailSnapshotPath string) error
1417
Exec(ctx context.Context, cmd []string, timeout time.Duration) (*api.ExecResult, error)
1518
Run() instance.ExitResult
1619
WaitExit(ctx context.Context) bool

runtime/drivers/vm/vm.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"log/slog"
77
"os"
88
"os/exec"
9+
"path/filepath"
910
"sync/atomic"
1011
"time"
1112

@@ -93,6 +94,134 @@ func (vm *vm) Start(ctx context.Context) error {
9394

9495
}
9596

97+
// StartFromSnapshot starts the VM by restoring from a snapshot instead of cold booting.
98+
// This enables sub-100ms cold starts for AI sandbox workloads.
99+
// globalSnapshotPath is the source path on the host (e.g., /var/lib/ravel/global-snapshots/instance-id/snap-1)
100+
// jailSnapshotPath is the jail-relative path (e.g., /snapshots/snap-1)
101+
func (vm *vm) StartFromSnapshot(ctx context.Context, globalSnapshotPath, jailSnapshotPath string) error {
102+
bootStart := time.Now()
103+
104+
// Copy snapshot from global storage into the jail
105+
jailHostPath := getInstanceDir(vm.id) + jailSnapshotPath
106+
if err := copyDir(globalSnapshotPath, jailHostPath); err != nil {
107+
return fmt.Errorf("failed to copy snapshot to jail: %w", err)
108+
}
109+
110+
// Chown to ravel-jailer user so CloudHypervisor can read
111+
jailerUid, jailerGid, err := setupRavelJailerUser()
112+
if err != nil {
113+
return fmt.Errorf("failed to get jailer user: %w", err)
114+
}
115+
if err := chownRecursive(jailHostPath, jailerUid, jailerGid); err != nil {
116+
return fmt.Errorf("failed to chown snapshot directory: %w", err)
117+
}
118+
119+
slog.Debug("snapshot copied to jail", "from", globalSnapshotPath, "to", jailHostPath)
120+
121+
err = vm.cmd.Start()
122+
if err != nil {
123+
metrics.VMBootsTotal.WithLabelValues("failure").Inc()
124+
return fmt.Errorf("failed to start vmm for machine %q: %w", vm.Id(), err)
125+
}
126+
defer func() {
127+
if err != nil {
128+
vm.vmm.ShutdownVMM(ctx)
129+
}
130+
}()
131+
132+
err = vm.vmm.WaitReady(ctx)
133+
if err != nil {
134+
metrics.VMBootsTotal.WithLabelValues("failure").Inc()
135+
return fmt.Errorf("failed to wait for vmm to be ready for machine %q: %w", vm.Id(), err)
136+
}
137+
138+
// Restore from snapshot instead of create+boot
139+
snapshotUrl := "file://" + jailSnapshotPath
140+
prefault := true
141+
_, err = vm.vmm.PutVmRestore(ctx, cloudhypervisor.RestoreConfig{
142+
SourceUrl: snapshotUrl,
143+
Prefault: &prefault,
144+
})
145+
if err != nil {
146+
metrics.VMBootsTotal.WithLabelValues("failure").Inc()
147+
return fmt.Errorf("failed to restore vm from snapshot for machine %q: %w", vm.Id(), err)
148+
}
149+
150+
// Resume the restored VM
151+
_, err = vm.vmm.ResumeVM(ctx)
152+
if err != nil {
153+
metrics.VMBootsTotal.WithLabelValues("failure").Inc()
154+
return fmt.Errorf("failed to resume vm after restore for machine %q: %w", vm.Id(), err)
155+
}
156+
157+
// Record successful boot metrics
158+
bootDuration := time.Since(bootStart).Seconds()
159+
metrics.VMBootDuration.Observe(bootDuration)
160+
metrics.VMBootsTotal.WithLabelValues("success").Inc()
161+
162+
slog.Info("VM restored from snapshot", "id", vm.id, "duration_ms", bootDuration*1000)
163+
164+
go vm.run()
165+
166+
return nil
167+
}
168+
169+
// copyDir copies a directory from src to dst
170+
func copyDir(src, dst string) error {
171+
if err := os.MkdirAll(dst, 0755); err != nil {
172+
return err
173+
}
174+
175+
entries, err := os.ReadDir(src)
176+
if err != nil {
177+
return err
178+
}
179+
180+
for _, entry := range entries {
181+
srcPath := src + "/" + entry.Name()
182+
dstPath := dst + "/" + entry.Name()
183+
184+
if entry.IsDir() {
185+
if err := copyDir(srcPath, dstPath); err != nil {
186+
return err
187+
}
188+
} else {
189+
if err := copyFile(srcPath, dstPath); err != nil {
190+
return err
191+
}
192+
}
193+
}
194+
return nil
195+
}
196+
197+
// copyFile copies the contents of the file at src into dst, creating dst or
// truncating it if it already exists.
//
// The returned error is the first failure among opening src, creating dst,
// copying the data, and closing dst. The close error matters: a write that is
// only reported at close time would otherwise be mistaken for a successful
// copy.
func copyFile(src, dst string) (err error) {
	srcFile, err := os.Open(src)
	if err != nil {
		return err
	}
	defer srcFile.Close()

	dstFile, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the copy error if there is one; otherwise surface the close
		// error instead of dropping it.
		if closeErr := dstFile.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	_, err = dstFile.ReadFrom(srcFile)
	return err
}
214+
215+
// chownRecursive sets the owner of path and of everything beneath it to
// uid:gid.
//
// Ownership is changed with Lchown, so a symbolic link found in the tree is
// re-owned itself and never followed: a link pointing outside the tree cannot
// be used to change ownership of an unrelated file, and a dangling link does
// not abort the walk. The first error reported by the walk or by Lchown stops
// the traversal and is returned.
func chownRecursive(path string, uid, gid int) error {
	return filepath.Walk(path, func(name string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		return os.Lchown(name, uid, gid)
	})
}
224+
96225
func (vm *vm) Signal(ctx context.Context, signal string) error {
97226
sig := syscallSignal(signal)
98227

runtime/instancerunner/start.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,44 @@ func (ir *InstanceRunner) Start(ctx context.Context) error {
4545

4646
return err
4747
}
48+
49+
// StartFromSnapshot starts the instance by restoring from a snapshot instead of cold booting.
50+
// This enables sub-100ms cold starts for AI sandbox workloads.
51+
// globalSnapshotPath is the path to the global snapshot storage
52+
// jailSnapshotPath is the jail-relative path where the snapshot will be placed
53+
func (ir *InstanceRunner) StartFromSnapshot(ctx context.Context, globalSnapshotPath, jailSnapshotPath string) error {
54+
ir.lock()
55+
defer ir.unlock()
56+
slog.Debug("starting instance from snapshot", "id", ir.Instance().Id, "globalPath", globalSnapshotPath, "jailPath", jailSnapshotPath)
57+
58+
if ir.Status() != instance.InstanceStatusStopped && ir.Status() != instance.InstanceStatusCreated {
59+
return errdefs.NewFailedPrecondition(fmt.Sprintf("instance is in %s status", ir.Status()))
60+
}
61+
62+
err := ir.updateInstanceState(instance.State{
63+
Status: instance.InstanceStatusStarting,
64+
})
65+
if err != nil {
66+
return err
67+
}
68+
69+
defer func() {
70+
if err != nil {
71+
ir.updateInstanceState(instance.State{Status: instance.InstanceStatusStopped})
72+
}
73+
}()
74+
75+
runner := ir.newVMRunner()
76+
ir.setVMRunner(runner)
77+
78+
err = runner.StartFromSnapshot(globalSnapshotPath, jailSnapshotPath)
79+
if err != nil {
80+
return err
81+
}
82+
83+
ir.updateInstanceState(instance.State{Status: instance.InstanceStatusRunning})
84+
85+
go ir.run()
86+
87+
return err
88+
}

runtime/instancerunner/vmrunner.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,39 @@ func (r *vmRunner) Start() error {
129129
return nil
130130
}
131131

132+
// StartFromSnapshot starts the VM by restoring from a snapshot instead of cold booting.
133+
// globalSnapshotPath is the path to the global snapshot storage (e.g., /var/lib/ravel/global-snapshots/instance-id/snap-1)
134+
// jailSnapshotPath is the jail-relative path where the snapshot will be placed (e.g., /snapshots/snap-1)
135+
func (r *vmRunner) StartFromSnapshot(globalSnapshotPath, jailSnapshotPath string) error {
136+
ctx := context.Background()
137+
vm, err := r.driver.BuildInstanceTask(ctx, &r.i, r.disks)
138+
if err != nil {
139+
slog.Error("failed to build vm", "error", err)
140+
return err
141+
}
142+
defer func() {
143+
if err != nil {
144+
err := r.driver.CleanupInstanceTask(ctx, &r.i)
145+
if err != nil {
146+
slog.Error("failed to cleanup vm", "error", err)
147+
}
148+
}
149+
}()
150+
151+
r.vm = vm
152+
153+
slog.Debug("starting vm from snapshot", "globalPath", globalSnapshotPath, "jailPath", jailSnapshotPath)
154+
err = vm.StartFromSnapshot(ctx, globalSnapshotPath, jailSnapshotPath)
155+
if err != nil {
156+
return err
157+
}
158+
159+
r.hasStarted.Store(true)
160+
161+
go r.run()
162+
return nil
163+
}
164+
132165
func getLogFile(id string) string {
133166
return fmt.Sprintf("/var/lib/ravel/instances/%s/vm.logs", id)
134167
}

0 commit comments

Comments
 (0)