Skip to content
This repository was archived by the owner on Jan 2, 2026. It is now read-only.

Commit fc27880

Browse files
alexisbouchez and claude committed
feat: add VM snapshot/restore for AI sandbox fast starts
Add CloudHypervisor snapshot/restore support to enable sub-100ms cold starts for AI sandbox workloads. This is a key feature for code execution environments like E2B and Modal.

Changes:
- Wire up CloudHypervisor snapshot/restore API through all layers
- Add PauseVM, ResumeVM, PutVmSnapshot, PutVmRestore to VMM client
- Handle jailer chroot paths correctly for snapshot storage
- Set proper permissions for snapshot directories (ravel-jailer user)
- Add streaming exec output support for long-running commands
- Add sandbox pool manager for pre-warmed VM instances

API endpoints:
- POST /instances/{id}/snapshot - Create VM snapshot
- POST /instances/{id}/restore - Restore VM from snapshot

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 64f8725 commit fc27880

File tree

20 files changed

+935
-0
lines changed

20 files changed

+935
-0
lines changed

agent/client/machines.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,29 @@ func (a *AgentClient) WaitForMachineStatus(ctx context.Context, id string, statu
133133
}
134134
return nil
135135
}
136+
137+
// MachineSnapshot creates a snapshot of a running machine for fast restore.
138+
func (a *AgentClient) MachineSnapshot(ctx context.Context, id string, snapshotId string) error {
139+
path := "/machines/" + id + "/snapshot"
140+
body := struct {
141+
SnapshotId string `json:"snapshot_id"`
142+
}{SnapshotId: snapshotId}
143+
err := a.client.Post(ctx, path, nil, httpclient.WithJSONBody(body))
144+
if err != nil {
145+
return err
146+
}
147+
return nil
148+
}
149+
150+
// MachineRestore restores a machine from a snapshot.
151+
func (a *AgentClient) MachineRestore(ctx context.Context, id string, snapshotId string) error {
152+
path := "/machines/" + id + "/restore"
153+
body := struct {
154+
SnapshotId string `json:"snapshot_id"`
155+
}{SnapshotId: snapshotId}
156+
err := a.client.Post(ctx, path, nil, httpclient.WithJSONBody(body))
157+
if err != nil {
158+
return err
159+
}
160+
return nil
161+
}

agent/machine.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,3 +191,24 @@ func (a *Agent) WaitForMachineStatus(ctx context.Context, id string, status api.
191191

192192
return nil
193193
}
194+
195+
// MachineSnapshot saves the running VM state for fast restore.
196+
// This enables sub-100ms cold starts for AI sandbox workloads.
197+
func (d *Agent) MachineSnapshot(ctx context.Context, id string, snapshotId string) error {
198+
machine, err := d.machines.GetMachine(id)
199+
if err != nil {
200+
return err
201+
}
202+
203+
return machine.Snapshot(ctx, snapshotId)
204+
}
205+
206+
// MachineRestore restores the VM from a previously saved snapshot.
207+
func (d *Agent) MachineRestore(ctx context.Context, id string, snapshotId string) error {
208+
machine, err := d.machines.GetMachine(id)
209+
if err != nil {
210+
return err
211+
}
212+
213+
return machine.Restore(ctx, snapshotId)
214+
}

agent/machinerunner/instance_operations.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,34 @@ func (m *MachineRunner) SubscribeToLogs(ctx context.Context, id string) ([]*api.
113113
}
114114
return m.runtime.SubscribeToInstanceLogs(ctx, m.state.MachineInstance().Machine.InstanceId)
115115
}
116+
117+
// Snapshot saves the running VM state for fast restore.
118+
// Path format: /var/lib/ravel/snapshots/{snapshotId}
119+
func (m *MachineRunner) Snapshot(ctx context.Context, snapshotId string) error {
120+
if err := m.canUseInstance(); err != nil {
121+
return err
122+
}
123+
124+
status := m.state.Status()
125+
if status != api.MachineStatusRunning {
126+
return errMachineIs(status)
127+
}
128+
129+
path := fmt.Sprintf("/var/lib/ravel/snapshots/%s", snapshotId)
130+
return m.runtime.InstanceSnapshot(ctx, m.state.InstanceId(), path)
131+
}
132+
133+
// Restore restores the VM from a previously saved snapshot.
134+
func (m *MachineRunner) Restore(ctx context.Context, snapshotId string) error {
135+
if err := m.canUseInstance(); err != nil {
136+
return err
137+
}
138+
139+
status := m.state.Status()
140+
if status != api.MachineStatusRunning {
141+
return errMachineIs(status)
142+
}
143+
144+
path := fmt.Sprintf("/var/lib/ravel/snapshots/%s", snapshotId)
145+
return m.runtime.InstanceRestore(ctx, m.state.InstanceId(), path)
146+
}

agent/server/machines.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,3 +191,57 @@ func (s *AgentServer) waitForMachineStatus(ctx context.Context, req *WaitMachine
191191

192192
return &WaitMachineStatusResponse{}, nil
193193
}
194+
195+
// Snapshot API for AI sandbox fast starts
196+
197+
// MachineSnapshotRequest is the input of the machineSnapshot operation: the
// machine id comes from the URL path and the snapshot id from the JSON body.
type MachineSnapshotRequest struct {
	// Id is bound from the {id} path parameter.
	Id string `path:"id"`

	// Body is the JSON request payload.
	Body struct {
		SnapshotId string `json:"snapshot_id" required:"true" doc:"Unique identifier for the snapshot"`
	}
}
203+
204+
// MachineSnapshotResponse is the output of the machineSnapshot operation.
type MachineSnapshotResponse struct {
	// Body is the JSON response payload: the snapshot id and its path.
	Body struct {
		SnapshotId string `json:"snapshot_id"`
		Path       string `json:"path"`
	}
}
210+
211+
func (s *AgentServer) machineSnapshot(ctx context.Context, req *MachineSnapshotRequest) (*MachineSnapshotResponse, error) {
212+
err := s.agent.MachineSnapshot(ctx, req.Id, req.Body.SnapshotId)
213+
if err != nil {
214+
s.log("Failed to snapshot machine", err)
215+
return nil, err
216+
}
217+
218+
return &MachineSnapshotResponse{
219+
Body: struct {
220+
SnapshotId string `json:"snapshot_id"`
221+
Path string `json:"path"`
222+
}{
223+
SnapshotId: req.Body.SnapshotId,
224+
Path: "/var/lib/ravel/snapshots/" + req.Body.SnapshotId,
225+
},
226+
}, nil
227+
}
228+
229+
// MachineRestoreRequest is the input of the machineRestore operation: the
// machine id comes from the URL path and the snapshot id from the JSON body.
type MachineRestoreRequest struct {
	// Id is bound from the {id} path parameter.
	Id string `path:"id"`

	// Body is the JSON request payload.
	Body struct {
		SnapshotId string `json:"snapshot_id" required:"true" doc:"Snapshot ID to restore from"`
	}
}
235+
236+
// MachineRestoreResponse is the (empty) output of the machineRestore
// operation; a successful restore carries no payload.
type MachineRestoreResponse struct{}
238+
239+
func (s *AgentServer) machineRestore(ctx context.Context, req *MachineRestoreRequest) (*MachineRestoreResponse, error) {
240+
err := s.agent.MachineRestore(ctx, req.Id, req.Body.SnapshotId)
241+
if err != nil {
242+
s.log("Failed to restore machine", err)
243+
return nil, err
244+
}
245+
246+
return &MachineRestoreResponse{}, nil
247+
}

agent/server/routes.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,23 @@ func (s *AgentServer) registerEndpoints(mux humago.Mux) {
9191
Method: http.MethodGet,
9292
}, s.waitForMachineStatus)
9393

94+
// Snapshot/Restore endpoints for AI sandbox fast starts
95+
huma.Register(api, huma.Operation{
96+
OperationID: "machineSnapshot",
97+
Path: "/machines/{id}/snapshot",
98+
Method: http.MethodPost,
99+
Summary: "Create a snapshot of a running machine for fast restore",
100+
Tags: []string{"sandbox"},
101+
}, s.machineSnapshot)
102+
103+
huma.Register(api, huma.Operation{
104+
OperationID: "machineRestore",
105+
Path: "/machines/{id}/restore",
106+
Method: http.MethodPost,
107+
Summary: "Restore a machine from a snapshot",
108+
Tags: []string{"sandbox"},
109+
}, s.machineRestore)
110+
94111
// Build endpoints
95112
huma.Register(api, huma.Operation{
96113
OperationID: "createBuild",

core/cluster/agent.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,8 @@ type Agent interface {
3131

3232
EnableMachineGateway(ctx context.Context, id string) error
3333
DisableMachineGateway(ctx context.Context, id string) error
34+
35+
// Sandbox fast start methods for AI workloads
36+
MachineSnapshot(ctx context.Context, machineId string, snapshotId string) error
37+
MachineRestore(ctx context.Context, machineId string, snapshotId string) error
3438
}

core/daemon/daemon.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ type Daemon interface {
5151
GetInstanceLogs(ctx context.Context, id string) ([]*api.LogEntry, error)
5252
SubscribeToInstanceLogs(ctx context.Context, id string) ([]*api.LogEntry, <-chan *api.LogEntry, error)
5353

54+
// Sandbox fast start methods for AI workloads
55+
InstanceSnapshot(ctx context.Context, id string, path string) error
56+
InstanceRestore(ctx context.Context, id string, path string) error
57+
5458
DeleteImage(ctx context.Context, ref string) error
5559
ListImages(ctx context.Context) ([]images.Image, error)
5660
PullImage(ctx context.Context, opt ImagePullOptions) (*images.Image, error)

core/daemon/images.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,8 @@ type Runtime interface {
3333
ListImages(ctx context.Context) ([]images.Image, error)
3434
PruneImages(ctx context.Context) error
3535
PullImage(ctx context.Context, opt ImagePullOptions) (*images.Image, error)
36+
37+
// Sandbox fast start methods for AI workloads
38+
InstanceSnapshot(ctx context.Context, id string, path string) error
39+
InstanceRestore(ctx context.Context, id string, path string) error
3640
}

initd/exec/processes.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package exec
22

33
import (
4+
"bufio"
45
"bytes"
56
"context"
7+
"io"
68
"os/exec"
79

810
"github.com/valyentdev/ravel/api"
@@ -49,3 +51,92 @@ func Exec(ctx context.Context, opts api.ExecOptions) (*api.ExecResult, error) {
4951
ExitCode: exitCode,
5052
}, nil
5153
}
54+
55+
// ExecOutputLine is one line of output produced by a streaming exec.
type ExecOutputLine struct {
	// Stream names the source of the line: "stdout" or "stderr".
	Stream string `json:"stream"`
	// Data is the line's text.
	Data string `json:"data"`
}
60+
61+
// ExecStreamResult is the final result of a streaming exec, reported once
// the command has completed.
type ExecStreamResult struct {
	// ExitCode is the exit code of the command.
	ExitCode int `json:"exit_code"`
}
65+
66+
// ExecStream executes a command and streams output line by line.
67+
// This is useful for long-running commands in AI sandboxes.
68+
func ExecStream(ctx context.Context, opts api.ExecOptions, outputCh chan<- ExecOutputLine) (*ExecStreamResult, error) {
69+
defer close(outputCh)
70+
71+
if len(opts.Cmd) == 0 {
72+
return nil, errdefs.NewInvalidArgument("cmd cannot be empty")
73+
}
74+
75+
name := opts.Cmd[0]
76+
args := opts.Cmd[1:]
77+
78+
timeoutCtx, cancel := context.WithTimeout(ctx, opts.GetTimeout())
79+
defer cancel()
80+
81+
cmd := exec.CommandContext(timeoutCtx, name, args...)
82+
if cmd.Err != nil {
83+
return nil, errdefs.NewInvalidArgument(cmd.Err.Error())
84+
}
85+
86+
// Get stdout and stderr pipes
87+
stdoutPipe, err := cmd.StdoutPipe()
88+
if err != nil {
89+
return nil, errdefs.NewUnknown("failed to create stdout pipe: " + err.Error())
90+
}
91+
92+
stderrPipe, err := cmd.StderrPipe()
93+
if err != nil {
94+
return nil, errdefs.NewUnknown("failed to create stderr pipe: " + err.Error())
95+
}
96+
97+
cmd.Stdin = nil
98+
99+
// Start the command
100+
if err := cmd.Start(); err != nil {
101+
return nil, errdefs.NewUnknown("failed to start command: " + err.Error())
102+
}
103+
104+
// Stream stdout and stderr concurrently
105+
done := make(chan struct{}, 2)
106+
107+
go streamPipe(stdoutPipe, "stdout", outputCh, done)
108+
go streamPipe(stderrPipe, "stderr", outputCh, done)
109+
110+
// Wait for both pipes to finish
111+
<-done
112+
<-done
113+
114+
// Wait for command to complete
115+
err = cmd.Wait()
116+
exitCode := -1
117+
if cmd.ProcessState != nil {
118+
exitCode = cmd.ProcessState.ExitCode()
119+
}
120+
121+
// Ignore exit errors - we just care about the exit code
122+
if _, ok := err.(*exec.ExitError); ok {
123+
err = nil
124+
}
125+
126+
return &ExecStreamResult{ExitCode: exitCode}, err
127+
}
128+
129+
func streamPipe(pipe io.ReadCloser, stream string, outputCh chan<- ExecOutputLine, done chan<- struct{}) {
130+
defer func() { done <- struct{}{} }()
131+
132+
scanner := bufio.NewScanner(pipe)
133+
// Increase buffer size for lines up to 1MB
134+
scanner.Buffer(make([]byte, 64*1024), 1024*1024)
135+
136+
for scanner.Scan() {
137+
outputCh <- ExecOutputLine{
138+
Stream: stream,
139+
Data: scanner.Text(),
140+
}
141+
}
142+
}

pkg/cloudhypervisor/vmm.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,62 @@ func (v *VMM) PingVMM(ctx context.Context) (VmmPingResponse, error) {
6868
return *res.JSON200, nil
6969
}
7070

71+
// PauseVM pauses the virtual machine.
72+
func (v *VMM) PauseVM(ctx context.Context) (*http.Response, error) {
73+
res, err := v.client.PauseVMWithResponse(ctx)
74+
if err != nil {
75+
return nil, fmt.Errorf("failed to pause VM: %w", err)
76+
}
77+
78+
if res.StatusCode() != http.StatusOK && res.StatusCode() != http.StatusNoContent {
79+
return nil, fmt.Errorf("failed to pause VM: %s", string(res.Body))
80+
}
81+
82+
return res.HTTPResponse, nil
83+
}
84+
85+
// ResumeVM resumes the virtual machine.
86+
func (v *VMM) ResumeVM(ctx context.Context) (*http.Response, error) {
87+
res, err := v.client.ResumeVMWithResponse(ctx)
88+
if err != nil {
89+
return nil, fmt.Errorf("failed to resume VM: %w", err)
90+
}
91+
92+
if res.StatusCode() != http.StatusOK && res.StatusCode() != http.StatusNoContent {
93+
return nil, fmt.Errorf("failed to resume VM: %s", string(res.Body))
94+
}
95+
96+
return res.HTTPResponse, nil
97+
}
98+
99+
// PutVmSnapshot creates a snapshot of the VM for fast restore.
100+
func (v *VMM) PutVmSnapshot(ctx context.Context, config VmSnapshotConfig) (*http.Response, error) {
101+
res, err := v.client.PutVmSnapshotWithResponse(ctx, config)
102+
if err != nil {
103+
return nil, fmt.Errorf("failed to snapshot VM: %w", err)
104+
}
105+
106+
if res.StatusCode() != http.StatusOK && res.StatusCode() != http.StatusNoContent {
107+
return nil, fmt.Errorf("failed to snapshot VM: %s", string(res.Body))
108+
}
109+
110+
return res.HTTPResponse, nil
111+
}
112+
113+
// PutVmRestore restores the VM from a snapshot.
114+
func (v *VMM) PutVmRestore(ctx context.Context, config RestoreConfig) (*http.Response, error) {
115+
res, err := v.client.PutVmRestoreWithResponse(ctx, config)
116+
if err != nil {
117+
return nil, fmt.Errorf("failed to restore VM: %w", err)
118+
}
119+
120+
if res.StatusCode() != http.StatusOK && res.StatusCode() != http.StatusNoContent {
121+
return nil, fmt.Errorf("failed to restore VM: %s", string(res.Body))
122+
}
123+
124+
return res.HTTPResponse, nil
125+
}
126+
71127
func newCHClient(socket string) (*ClientWithResponses, error) {
72128
httpClient := &http.Client{
73129
Transport: &http.Transport{

0 commit comments

Comments
 (0)