valyentdev
diff --git a/‎agent/client/machines.go‎
Lines changed: 26 additions & 0 deletions b/‎agent/client/machines.go‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎agent/machine.go‎
Lines changed: 21 additions & 0 deletions b/‎agent/machine.go‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎agent/machinerunner/instance_operations.go‎
Lines changed: 31 additions & 0 deletions b/‎agent/machinerunner/instance_operations.go‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎agent/server/machines.go‎
Lines changed: 54 additions & 0 deletions b/‎agent/server/machines.go‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎agent/server/routes.go‎
Lines changed: 17 additions & 0 deletions b/‎agent/server/routes.go‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎core/cluster/agent.go‎
Lines changed: 4 additions & 0 deletions b/‎core/cluster/agent.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎core/daemon/daemon.go‎
Lines changed: 4 additions & 0 deletions b/‎core/daemon/daemon.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎core/daemon/images.go‎
Lines changed: 4 additions & 0 deletions b/‎core/daemon/images.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎initd/exec/processes.go‎
Lines changed: 91 additions & 0 deletions b/‎initd/exec/processes.go‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎pkg/cloudhypervisor/vmm.go‎
Lines changed: 56 additions & 0 deletions b/‎pkg/cloudhypervisor/vmm.go‎
Lines changed: 56 additions & 0 deletions
@@ -133,3 +133,29 @@ func (a *AgentClient) WaitForMachineStatus(ctx context.Context, id string, statu
 	}
 	return nil
 }
+
+// MachineSnapshot creates a snapshot of a running machine for fast restore.
+func (a *AgentClient) MachineSnapshot(ctx context.Context, id string, snapshotId string) error {
+	path := "/machines/" + id + "/snapshot"
+	body := struct {
+		SnapshotId string `json:"snapshot_id"`
+	}{SnapshotId: snapshotId}
+	err := a.client.Post(ctx, path, nil, httpclient.WithJSONBody(body))
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// MachineRestore restores a machine from a snapshot.
+func (a *AgentClient) MachineRestore(ctx context.Context, id string, snapshotId string) error {
+	path := "/machines/" + id + "/restore"
+	body := struct {
+		SnapshotId string `json:"snapshot_id"`
+	}{SnapshotId: snapshotId}
+	err := a.client.Post(ctx, path, nil, httpclient.WithJSONBody(body))
+	if err != nil {
+		return err
+	}
+	return nil
+}
@@ -191,3 +191,24 @@ func (a *Agent) WaitForMachineStatus(ctx context.Context, id string, status api.
 
 	return nil
 }
+
+// MachineSnapshot looks up the machine identified by id and asks it to save
+// a snapshot under snapshotId. It returns the lookup error when the machine
+// cannot be retrieved, otherwise the result of the machine's Snapshot call.
+func (a *Agent) MachineSnapshot(ctx context.Context, id string, snapshotId string) error {
+	machine, err := a.machines.GetMachine(id)
+	if err != nil {
+		return err
+	}
+
+	return machine.Snapshot(ctx, snapshotId)
+}
+
+// MachineRestore looks up the machine identified by id and asks it to restore
+// from the snapshot named snapshotId. It returns the lookup error when the
+// machine cannot be retrieved, otherwise the result of the machine's Restore
+// call.
+func (a *Agent) MachineRestore(ctx context.Context, id string, snapshotId string) error {
+	machine, err := a.machines.GetMachine(id)
+	if err != nil {
+		return err
+	}
+
+	return machine.Restore(ctx, snapshotId)
+}
@@ -113,3 +113,34 @@ func (m *MachineRunner) SubscribeToLogs(ctx context.Context, id string) ([]*api.
 	}
 	return m.runtime.SubscribeToInstanceLogs(ctx, m.state.MachineInstance().Machine.InstanceId)
 }
+
+// snapshotPath maps a caller-supplied snapshot ID to its on-disk location,
+// /var/lib/ravel/snapshots/{snapshotId}.
+//
+// The ID becomes a single path component, so it is validated first: it must be
+// non-empty, must not be "." or "..", and may only contain ASCII letters,
+// digits, '-', '_' and '.'. This keeps an ID such as "../../etc" from escaping
+// the snapshots directory.
+func snapshotPath(snapshotId string) (string, error) {
+	if snapshotId == "" || snapshotId == "." || snapshotId == ".." {
+		return "", fmt.Errorf("invalid snapshot id %q", snapshotId)
+	}
+
+	for i := 0; i < len(snapshotId); i++ {
+		c := snapshotId[i]
+		switch {
+		case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
+		case c == '-', c == '_', c == '.':
+		default:
+			return "", fmt.Errorf("invalid snapshot id %q: only letters, digits, '-', '_' and '.' are allowed", snapshotId)
+		}
+	}
+
+	return "/var/lib/ravel/snapshots/" + snapshotId, nil
+}
+
+// Snapshot saves the running VM state for fast restore.
+// Path format: /var/lib/ravel/snapshots/{snapshotId}
+//
+// It fails if the instance cannot be used, if the machine is not in the
+// Running status, or if snapshotId is not a valid single path component.
+// Otherwise it returns the result of the runtime's InstanceSnapshot call.
+func (m *MachineRunner) Snapshot(ctx context.Context, snapshotId string) error {
+	if err := m.canUseInstance(); err != nil {
+		return err
+	}
+
+	status := m.state.Status()
+	if status != api.MachineStatusRunning {
+		return errMachineIs(status)
+	}
+
+	path, err := snapshotPath(snapshotId)
+	if err != nil {
+		return err
+	}
+	return m.runtime.InstanceSnapshot(ctx, m.state.InstanceId(), path)
+}
+
+// Restore restores the VM from a previously saved snapshot located at
+// /var/lib/ravel/snapshots/{snapshotId}.
+//
+// It fails if the instance cannot be used, if the machine is not in the
+// Running status, or if snapshotId is not a valid single path component.
+// Otherwise it returns the result of the runtime's InstanceRestore call.
+//
+// NOTE(review): Restore has the same "must be Running" precondition as
+// Snapshot — confirm this is the intended state for restoring a VM.
+func (m *MachineRunner) Restore(ctx context.Context, snapshotId string) error {
+	if err := m.canUseInstance(); err != nil {
+		return err
+	}
+
+	status := m.state.Status()
+	if status != api.MachineStatusRunning {
+		return errMachineIs(status)
+	}
+
+	path, err := snapshotPath(snapshotId)
+	if err != nil {
+		return err
+	}
+	return m.runtime.InstanceRestore(ctx, m.state.InstanceId(), path)
+}
@@ -191,3 +191,57 @@ func (s *AgentServer) waitForMachineStatus(ctx context.Context, req *WaitMachine
 
 	return &WaitMachineStatusResponse{}, nil
 }
+
+// Snapshot API for AI sandbox fast starts
+
+// MachineSnapshotRequest is the input of the machineSnapshot handler.
+type MachineSnapshotRequest struct {
+	// Id is bound from the "id" path parameter.
+	Id   string `path:"id"`
+	// Body is the JSON request body; snapshot_id is marked as required.
+	Body struct {
+		SnapshotId string `json:"snapshot_id" required:"true" doc:"Unique identifier for the snapshot"`
+	}
+}
+
+// MachineSnapshotResponse is the output of the machineSnapshot handler.
+type MachineSnapshotResponse struct {
+	// Body is serialized as JSON with the snapshot ID and the path reported
+	// for it.
+	Body struct {
+		SnapshotId string `json:"snapshot_id"`
+		Path       string `json:"path"`
+	}
+}
+
+// machineSnapshot forwards a snapshot request to the agent. On failure the
+// error is logged and returned unchanged. On success the response echoes the
+// requested snapshot ID together with the path
+// /var/lib/ravel/snapshots/{snapshot_id}.
+func (s *AgentServer) machineSnapshot(ctx context.Context, req *MachineSnapshotRequest) (*MachineSnapshotResponse, error) {
+	snapshotId := req.Body.SnapshotId
+
+	if err := s.agent.MachineSnapshot(ctx, req.Id, snapshotId); err != nil {
+		s.log("Failed to snapshot machine", err)
+		return nil, err
+	}
+
+	resp := &MachineSnapshotResponse{}
+	resp.Body.SnapshotId = snapshotId
+	resp.Body.Path = "/var/lib/ravel/snapshots/" + snapshotId
+	return resp, nil
+}
+
+// MachineRestoreRequest is the input of the machineRestore handler.
+type MachineRestoreRequest struct {
+	// Id is bound from the "id" path parameter.
+	Id   string `path:"id"`
+	// Body is the JSON request body; snapshot_id is marked as required.
+	Body struct {
+		SnapshotId string `json:"snapshot_id" required:"true" doc:"Snapshot ID to restore from"`
+	}
+}
+
+// MachineRestoreResponse is the output of the machineRestore handler. It
+// carries no fields.
+type MachineRestoreResponse struct {
+}
+
+// machineRestore forwards a restore request to the agent. On failure the
+// error is logged and returned unchanged; on success an empty response is
+// returned.
+func (s *AgentServer) machineRestore(ctx context.Context, req *MachineRestoreRequest) (*MachineRestoreResponse, error) {
+	if err := s.agent.MachineRestore(ctx, req.Id, req.Body.SnapshotId); err != nil {
+		s.log("Failed to restore machine", err)
+		return nil, err
+	}
+
+	resp := new(MachineRestoreResponse)
+	return resp, nil
+}
@@ -91,6 +91,23 @@ func (s *AgentServer) registerEndpoints(mux humago.Mux) {
 		Method:      http.MethodGet,
 	}, s.waitForMachineStatus)
 
+	// Snapshot/Restore endpoints for AI sandbox fast starts
+	huma.Register(api, huma.Operation{
+		OperationID: "machineSnapshot",
+		Path:        "/machines/{id}/snapshot",
+		Method:      http.MethodPost,
+		Summary:     "Create a snapshot of a running machine for fast restore",
+		Tags:        []string{"sandbox"},
+	}, s.machineSnapshot)
+
+	huma.Register(api, huma.Operation{
+		OperationID: "machineRestore",
+		Path:        "/machines/{id}/restore",
+		Method:      http.MethodPost,
+		Summary:     "Restore a machine from a snapshot",
+		Tags:        []string{"sandbox"},
+	}, s.machineRestore)
+
 	// Build endpoints
 	huma.Register(api, huma.Operation{
 		OperationID: "createBuild",
 
@@ -31,4 +31,8 @@ type Agent interface {
 
 	EnableMachineGateway(ctx context.Context, id string) error
 	DisableMachineGateway(ctx context.Context, id string) error
+
+	// Sandbox fast start methods for AI workloads
+	MachineSnapshot(ctx context.Context, machineId string, snapshotId string) error
+	MachineRestore(ctx context.Context, machineId string, snapshotId string) error
 }
@@ -51,6 +51,10 @@ type Daemon interface {
 	GetInstanceLogs(ctx context.Context, id string) ([]*api.LogEntry, error)
 	SubscribeToInstanceLogs(ctx context.Context, id string) ([]*api.LogEntry, <-chan *api.LogEntry, error)
 
+	// Sandbox fast start methods for AI workloads
+	InstanceSnapshot(ctx context.Context, id string, path string) error
+	InstanceRestore(ctx context.Context, id string, path string) error
+
 	DeleteImage(ctx context.Context, ref string) error
 	ListImages(ctx context.Context) ([]images.Image, error)
 	PullImage(ctx context.Context, opt ImagePullOptions) (*images.Image, error)
 
@@ -33,4 +33,8 @@ type Runtime interface {
 	ListImages(ctx context.Context) ([]images.Image, error)
 	PruneImages(ctx context.Context) error
 	PullImage(ctx context.Context, opt ImagePullOptions) (*images.Image, error)
+
+	// Sandbox fast start methods for AI workloads
+	InstanceSnapshot(ctx context.Context, id string, path string) error
+	InstanceRestore(ctx context.Context, id string, path string) error
 }
@@ -1,8 +1,10 @@
 package exec
 
 import (
+	"bufio"
 	"bytes"
 	"context"
+	"io"
 	"os/exec"
 
 	"github.com/valyentdev/ravel/api"
@@ -49,3 +51,92 @@ func Exec(ctx context.Context, opts api.ExecOptions) (*api.ExecResult, error) {
 		ExitCode: exitCode,
 	}, nil
 }
+
+// ExecOutputLine represents a single line of output from a streaming exec.
+// ExecStream emits one value per line scanned from the command's stdout or
+// stderr.
+type ExecOutputLine struct {
+	Stream string `json:"stream"` // "stdout" or "stderr"
+	// Data is the text of the line as returned by the line scanner.
+	Data   string `json:"data"`
+}
+
+// ExecStreamResult is returned by ExecStream once the command has completed.
+type ExecStreamResult struct {
+	// ExitCode is the process exit code, or -1 when no process state is
+	// available.
+	ExitCode int `json:"exit_code"`
+}
+
+// ExecStream executes a command and streams its output line by line.
+//
+// opts.Cmd[0] is the program and the remaining elements are its arguments; an
+// empty Cmd is rejected as an invalid argument. The command runs under a
+// context derived from ctx with the timeout returned by opts.GetTimeout().
+// Stdin is left nil, so the child reads from the null device.
+//
+// Each line of stdout and stderr is sent on outputCh as an ExecOutputLine.
+// outputCh is always closed before ExecStream returns, including on early
+// errors, so the caller must not close it or send on it.
+//
+// The returned result carries the process exit code (-1 when no process state
+// is available). A non-zero exit is not reported as an error; only other
+// failures from starting or waiting on the command are.
+func ExecStream(ctx context.Context, opts api.ExecOptions, outputCh chan<- ExecOutputLine) (*ExecStreamResult, error) {
+	defer close(outputCh)
+
+	if len(opts.Cmd) == 0 {
+		return nil, errdefs.NewInvalidArgument("cmd cannot be empty")
+	}
+
+	name := opts.Cmd[0]
+	args := opts.Cmd[1:]
+
+	timeoutCtx, cancel := context.WithTimeout(ctx, opts.GetTimeout())
+	defer cancel()
+
+	cmd := exec.CommandContext(timeoutCtx, name, args...)
+	if cmd.Err != nil {
+		return nil, errdefs.NewInvalidArgument(cmd.Err.Error())
+	}
+
+	stdoutPipe, err := cmd.StdoutPipe()
+	if err != nil {
+		return nil, errdefs.NewUnknown("failed to create stdout pipe: " + err.Error())
+	}
+
+	stderrPipe, err := cmd.StderrPipe()
+	if err != nil {
+		return nil, errdefs.NewUnknown("failed to create stderr pipe: " + err.Error())
+	}
+
+	if err := cmd.Start(); err != nil {
+		return nil, errdefs.NewUnknown("failed to start command: " + err.Error())
+	}
+
+	// Stream stdout and stderr concurrently. The readers share timeoutCtx so
+	// that a consumer that stops receiving cannot block them past the
+	// deadline or the caller's cancellation.
+	done := make(chan struct{}, 2)
+
+	go streamPipe(timeoutCtx, stdoutPipe, "stdout", outputCh, done)
+	go streamPipe(timeoutCtx, stderrPipe, "stderr", outputCh, done)
+
+	// All reads from the pipes must finish before Wait is called, and both
+	// goroutines must have stopped sending before outputCh is closed.
+	<-done
+	<-done
+
+	err = cmd.Wait()
+	exitCode := -1
+	if cmd.ProcessState != nil {
+		exitCode = cmd.ProcessState.ExitCode()
+	}
+
+	// Ignore exit errors - we just care about the exit code
+	if _, ok := err.(*exec.ExitError); ok {
+		err = nil
+	}
+
+	return &ExecStreamResult{ExitCode: exitCode}, err
+}
+
+// streamPipe forwards pipe to outputCh one line at a time, tagging each line
+// with stream, and signals done exactly once when it stops.
+//
+// Lines of up to 1MB are supported. If ctx is done while a send is pending,
+// streamPipe gives up without draining: the command is bound to the same
+// context and is being killed. If scanning stops for any other reason while
+// the command may still be writing (for example an over-long line), the rest
+// of the pipe is discarded so the command cannot block on a full pipe.
+func streamPipe(ctx context.Context, pipe io.ReadCloser, stream string, outputCh chan<- ExecOutputLine, done chan<- struct{}) {
+	defer func() { done <- struct{}{} }()
+
+	scanner := bufio.NewScanner(pipe)
+	// Increase buffer size for lines up to 1MB
+	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
+
+	for scanner.Scan() {
+		line := ExecOutputLine{
+			Stream: stream,
+			Data:   scanner.Text(),
+		}
+		select {
+		case outputCh <- line:
+		case <-ctx.Done():
+			return
+		}
+	}
+
+	// After a clean EOF this returns immediately. After a scanner error it
+	// consumes the remaining output. The copy error is ignored on purpose:
+	// there is nowhere to report it and the data is being discarded anyway.
+	_, _ = io.Copy(io.Discard, pipe)
+}
@@ -68,6 +68,62 @@ func (v *VMM) PingVMM(ctx context.Context) (VmmPingResponse, error) {
 	return *res.JSON200, nil
 }
 
+// PauseVM pauses the virtual machine. A transport error is wrapped and
+// returned. Any status other than 200 OK or 204 No Content is turned into an
+// error carrying the response body; otherwise the raw HTTP response is
+// returned.
+func (v *VMM) PauseVM(ctx context.Context) (*http.Response, error) {
+	res, err := v.client.PauseVMWithResponse(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("failed to pause VM: %w", err)
+	}
+
+	switch res.StatusCode() {
+	case http.StatusOK, http.StatusNoContent:
+		return res.HTTPResponse, nil
+	default:
+		return nil, fmt.Errorf("failed to pause VM: %s", string(res.Body))
+	}
+}
+
+// ResumeVM resumes the virtual machine. A transport error is wrapped and
+// returned. Any status other than 200 OK or 204 No Content is turned into an
+// error carrying the response body; otherwise the raw HTTP response is
+// returned.
+func (v *VMM) ResumeVM(ctx context.Context) (*http.Response, error) {
+	res, err := v.client.ResumeVMWithResponse(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("failed to resume VM: %w", err)
+	}
+
+	switch res.StatusCode() {
+	case http.StatusOK, http.StatusNoContent:
+		return res.HTTPResponse, nil
+	default:
+		return nil, fmt.Errorf("failed to resume VM: %s", string(res.Body))
+	}
+}
+
+// PutVmSnapshot creates a snapshot of the VM using config. A transport error
+// is wrapped and returned. Any status other than 200 OK or 204 No Content is
+// turned into an error carrying the response body; otherwise the raw HTTP
+// response is returned.
+func (v *VMM) PutVmSnapshot(ctx context.Context, config VmSnapshotConfig) (*http.Response, error) {
+	res, err := v.client.PutVmSnapshotWithResponse(ctx, config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to snapshot VM: %w", err)
+	}
+
+	switch res.StatusCode() {
+	case http.StatusOK, http.StatusNoContent:
+		return res.HTTPResponse, nil
+	default:
+		return nil, fmt.Errorf("failed to snapshot VM: %s", string(res.Body))
+	}
+}
+
+// PutVmRestore restores the VM from a snapshot using config. A transport
+// error is wrapped and returned. Any status other than 200 OK or 204 No
+// Content is turned into an error carrying the response body; otherwise the
+// raw HTTP response is returned.
+func (v *VMM) PutVmRestore(ctx context.Context, config RestoreConfig) (*http.Response, error) {
+	res, err := v.client.PutVmRestoreWithResponse(ctx, config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to restore VM: %w", err)
+	}
+
+	switch res.StatusCode() {
+	case http.StatusOK, http.StatusNoContent:
+		return res.HTTPResponse, nil
+	default:
+		return nil, fmt.Errorf("failed to restore VM: %s", string(res.Body))
+	}
+}
+
 func newCHClient(socket string) (*ClientWithResponses, error) {
 	httpClient := &http.Client{
 		Transport: &http.Transport{
Original file line number	Diff line number	Diff line change
`@@ -31,4 +31,8 @@ type Agent interface {`
`31`	`31`
`32`	`32`	`EnableMachineGateway(ctx context.Context, id string) error`
`33`	`33`	`DisableMachineGateway(ctx context.Context, id string) error`
	`34`	`+`
	`35`	`+ // Sandbox fast start methods for AI workloads`
	`36`	`+ MachineSnapshot(ctx context.Context, machineId string, snapshotId string) error`
	`37`	`+ MachineRestore(ctx context.Context, machineId string, snapshotId string) error`
`34`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -33,4 +33,8 @@ type Runtime interface {`
`33`	`33`	`ListImages(ctx context.Context) ([]images.Image, error)`
`34`	`34`	`PruneImages(ctx context.Context) error`
`35`	`35`	`PullImage(ctx context.Context, opt ImagePullOptions) (*images.Image, error)`
	`36`	`+`
	`37`	`+ // Sandbox fast start methods for AI workloads`
	`38`	`+ InstanceSnapshot(ctx context.Context, id string, path string) error`
	`39`	`+ InstanceRestore(ctx context.Context, id string, path string) error`
`36`	`40`	`}`