Skip to content

Commit 0260b79

Browse files
authored
os level screenshots (#76)
<!-- CURSOR_SUMMARY -->
> [!NOTE]
> Adds `/computer/screenshot` (PNG via ffmpeg, optional region) and `/computer/type`, tightens mouse APIs with screen-bounds checks and STZ guards, makes resolution queries error-aware, updates OpenAPI/client, and adds e2e screenshot tests.
>
> - **API Endpoints**:
>   - `POST /computer/screenshot`: capture PNG (ffmpeg x11grab), optional `region` crop; streams image.
>   - `POST /computer/type`: type arbitrary text with optional per-keystroke `delay`.
> - **Mouse APIs**:
>   - `MoveMouse`/`ClickMouse`: add display bounds validation (via `getCurrentResolution`), STZ disable/enable guards, improved logging; return 400 on OOB coords.
> - **Display**:
>   - `getCurrentResolution` now returns `(w,h,rate,error)`; `PatchDisplay` handles errors.
> - **OpenAPI/Client**:
>   - New schemas: `ScreenshotRequest`/`ScreenshotRegion`, `TypeTextRequest`.
>   - Generated server routes, responses, and client helpers for screenshot/typing; swagger spec updated.
> - **Tests**:
>   - E2E: add headless/headful screenshot tests; PNG validation helper (`isPNG`).
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 4094aca. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
1 parent ca0e857 commit 0260b79

File tree

5 files changed

+1034
-99
lines changed

5 files changed

+1034
-99
lines changed

server/cmd/api/api/computer.go

Lines changed: 172 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@ package api
22

33
import (
44
"context"
5+
"encoding/base64"
56
"fmt"
7+
"io"
8+
"os"
9+
"os/exec"
610
"strconv"
711

812
"github.com/onkernel/kernel-images/server/lib/logger"
@@ -12,16 +16,29 @@ import (
1216
func (s *ApiService) MoveMouse(ctx context.Context, request oapi.MoveMouseRequestObject) (oapi.MoveMouseResponseObject, error) {
1317
log := logger.FromContext(ctx)
1418

19+
s.stz.Disable(ctx)
20+
defer s.stz.Enable(ctx)
21+
1522
// Validate request body
1623
if request.Body == nil {
1724
return oapi.MoveMouse400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "request body is required"}}, nil
1825
}
1926
body := *request.Body
2027

21-
// Ensure non-negative coordinates
28+
// Get current resolution for bounds validation
29+
screenWidth, screenHeight, _, err := s.getCurrentResolution(ctx)
30+
if err != nil {
31+
log.Error("failed to get current resolution", "error", err)
32+
return oapi.MoveMouse500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "failed to get current display resolution"}}, nil
33+
}
34+
35+
// Ensure non-negative coordinates and within screen bounds
2236
if body.X < 0 || body.Y < 0 {
2337
return oapi.MoveMouse400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "coordinates must be non-negative"}}, nil
2438
}
39+
if body.X >= screenWidth || body.Y >= screenHeight {
40+
return oapi.MoveMouse400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: fmt.Sprintf("coordinates exceed screen bounds (max: %dx%d)", screenWidth-1, screenHeight-1)}}, nil
41+
}
2542

2643
// Build xdotool arguments
2744
args := []string{}
@@ -57,16 +74,29 @@ func (s *ApiService) MoveMouse(ctx context.Context, request oapi.MoveMouseReques
5774
func (s *ApiService) ClickMouse(ctx context.Context, request oapi.ClickMouseRequestObject) (oapi.ClickMouseResponseObject, error) {
5875
log := logger.FromContext(ctx)
5976

77+
s.stz.Disable(ctx)
78+
defer s.stz.Enable(ctx)
79+
6080
// Validate request body
6181
if request.Body == nil {
6282
return oapi.ClickMouse400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "request body is required"}}, nil
6383
}
6484
body := *request.Body
6585

66-
// Ensure non-negative coordinates
86+
// Get current resolution for bounds validation
87+
screenWidth, screenHeight, _, err := s.getCurrentResolution(ctx)
88+
if err != nil {
89+
log.Error("failed to get current resolution", "error", err)
90+
return oapi.ClickMouse500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "failed to get current display resolution"}}, nil
91+
}
92+
93+
// Ensure non-negative coordinates and within screen bounds
6794
if body.X < 0 || body.Y < 0 {
6895
return oapi.ClickMouse400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "coordinates must be non-negative"}}, nil
6996
}
97+
if body.X >= screenWidth || body.Y >= screenHeight {
98+
return oapi.ClickMouse400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: fmt.Sprintf("coordinates exceed screen bounds (max: %dx%d)", screenWidth-1, screenHeight-1)}}, nil
99+
}
70100

71101
// Map button enum to xdotool button code. Default to left button.
72102
btn := "1"
@@ -143,3 +173,143 @@ func (s *ApiService) ClickMouse(ctx context.Context, request oapi.ClickMouseRequ
143173

144174
return oapi.ClickMouse200Response{}, nil
145175
}
176+
177+
func (s *ApiService) TakeScreenshot(ctx context.Context, request oapi.TakeScreenshotRequestObject) (oapi.TakeScreenshotResponseObject, error) {
178+
log := logger.FromContext(ctx)
179+
180+
s.stz.Disable(ctx)
181+
defer s.stz.Enable(ctx)
182+
183+
var body oapi.ScreenshotRequest
184+
if request.Body != nil {
185+
body = *request.Body
186+
}
187+
188+
// Get current resolution for bounds validation
189+
screenWidth, screenHeight, _, err := s.getCurrentResolution(ctx)
190+
if err != nil {
191+
log.Error("failed to get current resolution", "error", err)
192+
return oapi.TakeScreenshot500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "failed to get current display resolution"}}, nil
193+
}
194+
195+
// Determine display to use (align with other functions)
196+
display := s.resolveDisplayFromEnv()
197+
198+
// Validate region if provided
199+
if body.Region != nil {
200+
r := body.Region
201+
if r.X < 0 || r.Y < 0 || r.Width <= 0 || r.Height <= 0 {
202+
return oapi.TakeScreenshot400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "invalid region dimensions"}}, nil
203+
}
204+
if r.X+r.Width > screenWidth || r.Y+r.Height > screenHeight {
205+
return oapi.TakeScreenshot400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "region exceeds screen bounds"}}, nil
206+
}
207+
}
208+
209+
// Build ffmpeg command
210+
args := []string{
211+
"-f", "x11grab",
212+
"-video_size", fmt.Sprintf("%dx%d", screenWidth, screenHeight),
213+
"-i", display,
214+
"-vframes", "1",
215+
}
216+
217+
// Add crop filter if region is specified
218+
if body.Region != nil {
219+
r := body.Region
220+
cropFilter := fmt.Sprintf("crop=%d:%d:%d:%d", r.Width, r.Height, r.X, r.Y)
221+
args = append(args, "-vf", cropFilter)
222+
}
223+
224+
// Output as PNG to stdout
225+
args = append(args, "-f", "image2pipe", "-vcodec", "png", "-")
226+
227+
cmd := exec.CommandContext(ctx, "ffmpeg", args...)
228+
cmd.Env = append(os.Environ(), fmt.Sprintf("DISPLAY=%s", display))
229+
230+
log.Debug("executing ffmpeg command", "args", args, "display", display)
231+
232+
stdout, err := cmd.StdoutPipe()
233+
if err != nil {
234+
log.Error("failed to create stdout pipe", "err", err)
235+
return oapi.TakeScreenshot500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "internal error"}}, nil
236+
}
237+
238+
stderr, err := cmd.StderrPipe()
239+
if err != nil {
240+
log.Error("failed to create stderr pipe", "err", err)
241+
return oapi.TakeScreenshot500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "internal error"}}, nil
242+
}
243+
244+
if err := cmd.Start(); err != nil {
245+
log.Error("failed to start ffmpeg", "err", err)
246+
return oapi.TakeScreenshot500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "failed to start ffmpeg"}}, nil
247+
}
248+
249+
// Start a goroutine to drain stderr for logging to avoid blocking
250+
go func() {
251+
data, _ := io.ReadAll(stderr)
252+
if len(data) > 0 {
253+
// ffmpeg writes progress/info to stderr; include in debug logs
254+
enc := base64.StdEncoding.EncodeToString(data)
255+
log.Debug("ffmpeg stderr (base64)", "data_b64", enc)
256+
}
257+
}()
258+
259+
pr, pw := io.Pipe()
260+
go func() {
261+
_, copyErr := io.Copy(pw, stdout)
262+
waitErr := cmd.Wait()
263+
var closeErr error
264+
if copyErr != nil {
265+
closeErr = fmt.Errorf("streaming ffmpeg output: %w", copyErr)
266+
log.Error("failed streaming ffmpeg output", "err", copyErr)
267+
} else if waitErr != nil {
268+
closeErr = fmt.Errorf("ffmpeg exited with error: %w", waitErr)
269+
log.Error("ffmpeg exited with error", "err", waitErr)
270+
}
271+
if closeErr != nil {
272+
_ = pw.CloseWithError(closeErr)
273+
return
274+
}
275+
_ = pw.Close()
276+
}()
277+
278+
return oapi.TakeScreenshot200ImagepngResponse{Body: pr, ContentLength: 0}, nil
279+
}
280+
281+
func (s *ApiService) TypeText(ctx context.Context, request oapi.TypeTextRequestObject) (oapi.TypeTextResponseObject, error) {
282+
log := logger.FromContext(ctx)
283+
284+
s.stz.Disable(ctx)
285+
defer s.stz.Enable(ctx)
286+
287+
// Validate request body
288+
if request.Body == nil {
289+
return oapi.TypeText400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "request body is required"}}, nil
290+
}
291+
body := *request.Body
292+
293+
// Validate delay if provided
294+
if body.Delay != nil && *body.Delay < 0 {
295+
return oapi.TypeText400JSONResponse{BadRequestErrorJSONResponse: oapi.BadRequestErrorJSONResponse{Message: "delay must be >= 0 milliseconds"}}, nil
296+
}
297+
298+
// Build xdotool arguments
299+
args := []string{"type"}
300+
if body.Delay != nil {
301+
args = append(args, "--delay", strconv.Itoa(*body.Delay))
302+
}
303+
// Use "--" to terminate options and pass raw text
304+
args = append(args, "--", body.Text)
305+
306+
log.Info("executing xdotool", "args", args)
307+
308+
output, err := defaultXdoTool.Run(ctx, args...)
309+
if err != nil {
310+
log.Error("xdotool command failed", "err", err, "output", string(output))
311+
return oapi.TypeText500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "failed to type text"}}, nil
312+
}
313+
314+
return oapi.TypeText200Response{}, nil
315+
}

server/cmd/api/api/display.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,11 @@ func (s *ApiService) PatchDisplay(ctx context.Context, req oapi.PatchDisplayRequ
3434
}
3535

3636
// Get current resolution with refresh rate
37-
currentWidth, currentHeight, currentRefreshRate := s.getCurrentResolution(ctx)
37+
currentWidth, currentHeight, currentRefreshRate, err := s.getCurrentResolution(ctx)
38+
if err != nil {
39+
log.Error("failed to get current resolution", "error", err)
40+
return oapi.PatchDisplay500JSONResponse{InternalErrorJSONResponse: oapi.InternalErrorJSONResponse{Message: "failed to get current display resolution"}}, nil
41+
}
3842
width := currentWidth
3943
height := currentHeight
4044
refreshRate := currentRefreshRate
@@ -88,7 +92,6 @@ func (s *ApiService) PatchDisplay(ctx context.Context, req oapi.PatchDisplayRequ
8892
}
8993

9094
// Route to appropriate resolution change handler
91-
var err error
9295
if displayMode == "xorg" {
9396
if s.isNekoEnabled() {
9497
log.Info("using Neko API for Xorg resolution change")
@@ -312,7 +315,7 @@ func (s *ApiService) resolveDisplayFromEnv() string {
312315
}
313316

314317
// getCurrentResolution returns the current display resolution and refresh rate by querying xrandr
315-
func (s *ApiService) getCurrentResolution(ctx context.Context) (int, int, int) {
318+
func (s *ApiService) getCurrentResolution(ctx context.Context) (int, int, int, error) {
316319
log := logger.FromContext(ctx)
317320
display := s.resolveDisplayFromEnv()
318321

@@ -324,21 +327,20 @@ func (s *ApiService) getCurrentResolution(ctx context.Context) (int, int, int) {
324327
out, err := cmd.Output()
325328
if err != nil {
326329
log.Error("failed to get current resolution", "error", err)
327-
// Return default resolution on error
328-
return 1024, 768, 60
330+
return 0, 0, 0, fmt.Errorf("failed to execute xrandr command: %w", err)
329331
}
330332

331333
resStr := strings.TrimSpace(string(out))
332334
parts := strings.Split(resStr, "x")
333335
if len(parts) != 2 {
334336
log.Error("unexpected xrandr output format", "output", resStr)
335-
return 1024, 768, 60
337+
return 0, 0, 0, fmt.Errorf("unexpected xrandr output format: %s", resStr)
336338
}
337339

338340
width, err := strconv.Atoi(parts[0])
339341
if err != nil {
340342
log.Error("failed to parse width", "error", err, "value", parts[0])
341-
return 1024, 768, 60
343+
return 0, 0, 0, fmt.Errorf("failed to parse width '%s': %w", parts[0], err)
342344
}
343345

344346
// Parse height and refresh rate (e.g., "1080_60.00" -> height=1080, rate=60)
@@ -356,10 +358,10 @@ func (s *ApiService) getCurrentResolution(ctx context.Context) (int, int, int) {
356358
height, err := strconv.Atoi(heightStr)
357359
if err != nil {
358360
log.Error("failed to parse height", "error", err, "value", heightStr)
359-
return 1024, 768, 60
361+
return 0, 0, 0, fmt.Errorf("failed to parse height '%s': %w", heightStr, err)
360362
}
361363

362-
return width, height, refreshRate
364+
return width, height, refreshRate, nil
363365
}
364366

365367
// isNekoEnabled checks if Neko service is enabled

0 commit comments

Comments
 (0)