inference-gateway · edenreich · Jan 2, 2026 · Jan 3, 2026 · Jan 3, 2026 · Jan 3, 2026
diff --git a/.github/workflows/artifacts.yml b/.github/workflows/artifacts.yml
@@ -66,6 +66,13 @@ jobs:
             golang:1.25-alpine3.23 \
             sh -c "go build -ldflags '-w -s -X github.com/inference-gateway/cli/cmd.version=${{ steps.version.outputs.version }} -X github.com/inference-gateway/cli/cmd.commit=${{ steps.version.outputs.commit }} -X github.com/inference-gateway/cli/cmd.date=${{ steps.version.outputs.date }}' -o infer-${{ matrix.goos }}-${{ matrix.goarch }} ."
 
+      - name: Build Computer Use App (macOS only)
+        if: matrix.goos == 'darwin'
+        # Run build.sh from its own directory; every step starts back in the
+        # workspace root, so there is nothing to `cd` out of afterwards.
+        working-directory: internal/display/macos/ComputerUse
+        run: ./build.sh
+
       - name: Build binary (macOS with CGO for clipboard image support)
         if: matrix.goos == 'darwin'
         env:

diff --git a/.infer/config.yaml b/.infer/config.yaml
@@ -169,6 +169,41 @@ agent:
     - The system supports up to 5 concurrent tool executions by default
     - This reduces back-and-forth communication and significantly improves performance
 
+    COMPUTER USE TOOLS:
+    You have TWO ways to interact with the system:
+    1. Direct terminal tools (PRIMARY): Bash, Read, Write, Edit, Grep, etc.
+    2. GUI automation tools (FALLBACK): MouseMove, KeyboardType, MouseClick, GetLatestScreenshot
+
+    CRITICAL: ALWAYS prefer direct terminal tools over GUI automation when possible.
+
+    When to use DIRECT tools (preferred):
+    - Reading files: Use Read tool, NOT KeyboardType to open an editor
+    - Writing files: Use Write/Edit tools, NOT GUI text editor
+    - Running commands: Use Bash tool, NOT KeyboardType in a terminal window
+    - Searching code: Use Grep tool, NOT opening files via GUI
+    - File operations: Use Bash/Read/Write, NOT GUI file manager
+
+    When to use GUI tools (only when necessary):
+    - Interacting with graphical applications that have no CLI equivalent
+    - Testing UI behavior or visual elements
+    - Automating tasks that MUST be done through a GUI
+    - Taking screenshots to inspect visual state
+
+    Why prefer direct tools:
+    - 10-100x faster execution (no GUI rendering delays)
+    - More reliable (no window focus issues, no timing problems)
+    - Precise output (structured data, not visual interpretation)
+    - Parallel execution support (batch multiple operations)
+    - Lower resource usage (no display server overhead)
+
+    Example - WRONG approach:
+    <tool>MouseMove(x=100, y=200)</tool>
+    <tool>MouseClick(button="left")</tool>
+    <tool>KeyboardType(text="cat file.txt")</tool>
+
+    Example - CORRECT approach:
+    <tool>Read(file_path="/path/to/file.txt")</tool>
+
     WORKFLOW:
     When asked to implement features or fix issues:
     1. Plan with TodoWrite
@@ -255,6 +290,32 @@ agent:
     FOCUS: System operations, service management, monitoring, diagnostics, and infrastructure tasks.
 
     CONTEXT: This is a shared system environment, not a project workspace. Users may be managing servers, containers, services, or general infrastructure.
+
+    COMPUTER USE TOOLS:
+    You have TWO ways to interact with the system:
+    1. Direct terminal tools (PRIMARY): Bash, Read, Write, Edit, Grep, etc.
+    2. GUI automation tools (FALLBACK): MouseMove, KeyboardType, MouseClick, GetLatestScreenshot
+
+    CRITICAL: ALWAYS prefer direct terminal tools over GUI automation when possible.
+
+    When to use DIRECT tools (preferred):
+    - Reading files: Use Read tool, NOT KeyboardType to open an editor
+    - Writing files: Use Write/Edit tools, NOT GUI text editor
+    - Running commands: Use Bash tool, NOT KeyboardType in a terminal window
+    - Searching code: Use Grep tool, NOT opening files via GUI
+    - System operations: Use Bash for systemctl, journalctl, docker, etc.
+
+    When to use GUI tools (only when necessary):
+    - Interacting with graphical applications that have no CLI equivalent
+    - Testing UI behavior or visual elements
+    - Remote desktop administration tasks that MUST be done through a GUI
+
+    Why prefer direct tools:
+    - 10-100x faster execution (no GUI rendering delays)
+    - More reliable (no window focus issues, no timing problems)
+    - Works over SSH without X11 forwarding
+    - Precise output (structured data, not visual interpretation)
+    - Lower resource usage (critical for remote systems)
   system_reminders:
     enabled: true
     interval: 4
@@ -638,3 +699,49 @@ web:
     auto_install: true
     install_version: latest
   servers: []
+computer_use:  # settings for the GUI automation ("computer use") tools
+  enabled: true  # master switch: gates the screenshot server and the floating window at chat startup
+  floating_window:  # only built on darwin; other platforms get a no-op manager
+    enabled: true
+    respawn_on_close: true
+    position: top-right
+    always_on_top: true
+  screenshot:
+    enabled: true  # NOTE(review): logged at chat startup but not part of the server-start check — confirm intended
+    max_width: 1920
+    max_height: 1080
+    target_width: 1024
+    target_height: 768
+    format: jpeg
+    quality: 85  # presumably the JPEG encoder quality — confirm
+    require_approval: false
+    streaming_enabled: true  # together with computer_use.enabled, makes chat start the screenshot server
+    capture_interval: 3  # unit not stated here; presumably seconds — confirm
+    buffer_size: 5  # presumably the number of captures retained — confirm
+    temp_dir: ""
+    log_captures: false
+    show_overlay: true
+  mouse_move:
+    enabled: true
+    require_approval: false
+  mouse_click:
+    enabled: true
+    require_approval: true
+  mouse_scroll:
+    enabled: true
+    require_approval: false
+  keyboard_type:
+    enabled: true
+    max_text_length: 1000
+    typing_delay_ms: 100  # milliseconds, per the key name
+    require_approval: true
+  get_focused_app:
+    enabled: true
+    require_approval: false
+  activate_app:
+    enabled: true
+    require_approval: false
+  rate_limit:
+    enabled: true
+    max_actions_per_minute: 60
+    window_seconds: 60  # seconds, per the key name
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -269,7 +269,11 @@ tasks:
       - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/domain internal/domain TaskTracker
       - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/domain internal/domain A2AAgentService
       - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/domain internal/domain MCPClient
+      - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/domain internal/domain RateLimiter
       - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/domain internal/infra/storage ConversationStorage
+      - mkdir -p tests/mocks/display
+      - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/display internal/display DisplayController
+      - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/display internal/display Provider
       - mkdir -p tests/mocks/services
       - go run github.com/maxbrunsfeld/counterfeiter/v6 -o tests/mocks/services internal/services TitleGenerator
       - mkdir -p tests/mocks/shortcuts

diff --git a/cmd/agents.go b/cmd/agents.go
@@ -242,7 +242,7 @@ type ExternalAgent struct {
 }
 
 // getConfig loads the configuration from viper
-func getConfig(cmd *cobra.Command) (*config.Config, error) {
+func getConfig(_ *cobra.Command) (*config.Config, error) {
 	cfg, err := getConfigFromViper()
 	if err != nil {
 		return nil, fmt.Errorf("failed to load config: %w", err)
@@ -425,7 +425,7 @@ func listAgents(cmd *cobra.Command, args []string) error {
 	format, _ := cmd.Flags().GetString("format")
 
 	if format == "json" {
-		combinedOutput := map[string]interface{}{
+		combinedOutput := map[string]any{
 			"local":    localAgents,
 			"external": externalAgents,
 			"total":    totalAgents,

diff --git a/cmd/chat.go b/cmd/chat.go
@@ -13,6 +13,7 @@ import (
 	"time"
 
 	tea "github.com/charmbracelet/bubbletea"
+	uuid "github.com/google/uuid"
 	cobra "github.com/spf13/cobra"
 	viper "github.com/spf13/viper"
 
@@ -22,6 +23,8 @@ import (
 	container "github.com/inference-gateway/cli/internal/container"
 	domain "github.com/inference-gateway/cli/internal/domain"
 	logger "github.com/inference-gateway/cli/internal/logger"
+	screenshotsvc "github.com/inference-gateway/cli/internal/services"
+	tools "github.com/inference-gateway/cli/internal/services/tools"
 	web "github.com/inference-gateway/cli/internal/web"
 	sdk "github.com/inference-gateway/sdk"
 )
@@ -37,16 +40,23 @@ and have a conversational interface with the inference gateway.`,
 			return fmt.Errorf("failed to load config: %w", err)
 		}
 
+		if os.Getenv("INFER_WEB_MODE") == "true" {
+			cfg.Web.Enabled = true
+			V.Set("web.enabled", true)
+		}
+
 		webMode, _ := cmd.Flags().GetBool("web")
 		if webMode {
+			cfg.Web.Enabled = true
+			V.Set("web.enabled", true)
+
 			if cmd.Flags().Changed("port") {
 				cfg.Web.Port, _ = cmd.Flags().GetInt("port")
 			}
 			if cmd.Flags().Changed("host") {
 				cfg.Web.Host, _ = cmd.Flags().GetString("host")
 			}
 
-			// SSH remote mode flags
 			if cmd.Flags().Changed("ssh-host") {
 				cfg.Web.SSH.Enabled = true
 				sshHost, _ := cmd.Flags().GetString("ssh-host")
@@ -55,7 +65,6 @@ and have a conversational interface with the inference gateway.`,
 				sshCommand, _ := cmd.Flags().GetString("ssh-command")
 				noInstall, _ := cmd.Flags().GetBool("ssh-no-install")
 
-				// Create a single server config from CLI flags
 				cfg.Web.Servers = []config.SSHServerConfig{
 					{
 						Name:        "CLI Remote Server",
@@ -82,6 +91,8 @@ and have a conversational interface with the inference gateway.`,
 }
 
 // StartChatSession starts a chat session
+//
+//nolint:funlen // Chat session initialization requires multiple setup steps
 func StartChatSession(cfg *config.Config, v *viper.Viper) error {
 	_ = clipboard.Init()
 
@@ -140,6 +151,7 @@ func StartChatSession(cfg *config.Config, v *viper.Viper) error {
 	conversationRepo := services.GetConversationRepository()
 	modelService := services.GetModelService()
 	config := services.GetConfig()
+	configService := services.GetConfigService()
 	toolService := services.GetToolService()
 	fileService := services.GetFileService()
 	imageService := services.GetImageService()
@@ -155,6 +167,35 @@ func StartChatSession(cfg *config.Config, v *viper.Viper) error {
 	agentManager := services.GetAgentManager()
 	conversationOptimizer := services.GetConversationOptimizer()
 
+	var screenshotServer *screenshotsvc.ScreenshotServer
+	logger.Info("Checking screenshot streaming config",
+		"computer_use_enabled", config.ComputerUse.Enabled,
+		"screenshot_enabled", config.ComputerUse.Screenshot.Enabled,
+		"streaming_enabled", config.ComputerUse.Screenshot.StreamingEnabled)
+
+	if config.ComputerUse.Enabled && config.ComputerUse.Screenshot.StreamingEnabled {
+		screenshotServer = startScreenshotServer(config, imageService, toolRegistry)
+		if screenshotServer != nil {
+			defer func() {
+				if err := screenshotServer.Stop(); err != nil {
+					logger.Error("Failed to stop screenshot server", "error", err)
+				}
+			}()
+		}
+	}
+
+	floatingWindowMgr, err := initFloatingWindow(config, stateManager)
+	if err != nil {
+		return fmt.Errorf("failed to initialize floating window: %w", err)
+	}
+	if floatingWindowMgr != nil {
+		defer func() {
+			if err := floatingWindowMgr.Shutdown(); err != nil {
+				logger.Error("Failed to shutdown floating window", "error", err)
+			}
+		}()
+	}
+
 	versionInfo := GetVersionInfo()
 	application := app.NewChatApplication(
 		models,
@@ -163,7 +204,7 @@ func StartChatSession(cfg *config.Config, v *viper.Viper) error {
 		conversationRepo,
 		conversationOptimizer,
 		modelService,
-		config,
+		configService,
 		toolService,
 		fileService,
 		imageService,
@@ -369,6 +410,28 @@ func processStreamingOutput(events <-chan domain.ChatEvent) error {
 	return nil
 }
 
+// startScreenshotServer creates a screenshot streaming server for a fresh session ID,
+// starts it, and hands it to toolRegistry. A failed Start is logged and yields nil.
+func startScreenshotServer(config *config.Config, imageService domain.ImageService, toolRegistry *tools.Registry) *screenshotsvc.ScreenshotServer {
+	logger.Info("Screenshot streaming conditions met, starting server")
+	sessionID := fmt.Sprintf("%d-%s", time.Now().Unix(), uuid.New().String()[:8])
+	srv := screenshotsvc.NewScreenshotServer(config, imageService, sessionID)
+	if startErr := srv.Start(); startErr != nil {
+		logger.Warn("Failed to start screenshot server", "error", startErr)
+		return nil
+	}
+
+	fmt.Printf("• Screenshot API: http://localhost:%d\n", srv.Port())
+	toolRegistry.SetScreenshotServer(srv)
+	logger.Info("Registered GetLatestScreenshot tool with tool registry")
+
+	// Remote mode also writes the port to stdout inside an OSC escape sequence.
+	if mode := os.Getenv("INFER_GATEWAY_MODE"); mode == "remote" {
+		fmt.Printf("\x1b]5555;screenshot_port=%d\x07", srv.Port())
+	}
+	return srv
+}
+
 func init() {
 	rootCmd.AddCommand(chatCmd)
 	chatCmd.Flags().Bool("web", false, "Start web terminal interface")

diff --git a/cmd/export.go b/cmd/export.go
@@ -46,7 +46,8 @@ func runExport(sessionID string) error {
 		return fmt.Errorf("failed to initialize storage: %w", err)
 	}
 
-	toolRegistry := tools.NewRegistry(cfg, nil, nil, nil)
+	configService := services.NewConfigService(V, cfg)
+	toolRegistry := tools.NewRegistry(configService, nil, nil, nil, nil, nil)
 	toolFormatterService := services.NewToolFormatterService(toolRegistry)
 	pricingService := services.NewPricingService(&cfg.Pricing)
 	persistentRepo := services.NewPersistentConversationRepository(toolFormatterService, pricingService, storageBackend)

diff --git a/cmd/floating_window_darwin.go b/cmd/floating_window_darwin.go
@@ -0,0 +1,39 @@
+//go:build darwin
+
+package cmd
+
+import (
+	"fmt"
+
+	config "github.com/inference-gateway/cli/config"
+	macos "github.com/inference-gateway/cli/internal/display/macos"
+	domain "github.com/inference-gateway/cli/internal/domain"
+	logger "github.com/inference-gateway/cli/internal/logger"
+)
+
+// FloatingWindowManager is what initFloatingWindow returns; keep it identical to the copy in floating_window_stub.go
+type FloatingWindowManager interface {
+	Shutdown() error
+}
+
+// initFloatingWindow builds the macOS floating window manager, or returns (nil, nil)
+// when computer use or its floating window is disabled in config.
+func initFloatingWindow(config *config.Config, stateManager domain.StateManager) (FloatingWindowManager, error) {
+	enabled := config.ComputerUse.Enabled
+	windowEnabled := config.ComputerUse.FloatingWindow.Enabled
+	logger.Info("Checking floating window conditions",
+		"computer_use_enabled", enabled, "floating_window_enabled", windowEnabled)
+	if !(enabled && windowEnabled) {
+		return nil, nil
+	}
+
+	logger.Info("Initializing floating window manager")
+	bridge := macos.NewEventBridge()
+	stateManager.SetEventBridge(bridge)
+
+	mgr, err := macos.NewFloatingWindowManager(config, bridge, stateManager)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create floating window manager: %w", err)
+	}
+	return mgr, nil
+}
diff --git a/cmd/floating_window_stub.go b/cmd/floating_window_stub.go
@@ -0,0 +1,25 @@
+//go:build !darwin
+
+package cmd
+
+import (
+	config "github.com/inference-gateway/cli/config"
+	domain "github.com/inference-gateway/cli/internal/domain"
+)
+
+// FloatingWindowManager is what initFloatingWindow returns; keep it identical to the copy in floating_window_darwin.go
+type FloatingWindowManager interface {
+	Shutdown() error
+}
+
+// noopFloatingWindowManager is the placeholder manager used on non-darwin
+// platforms, where no floating window is created.
+type noopFloatingWindowManager struct{}
+
+// Shutdown has nothing to release and always reports success.
+func (*noopFloatingWindowManager) Shutdown() error { return nil }
+
+// initFloatingWindow ignores its arguments on non-darwin platforms and always returns a no-op manager
+func initFloatingWindow(_ *config.Config, _ domain.StateManager) (FloatingWindowManager, error) {
+	return &noopFloatingWindowManager{}, nil
+}