Skip to content

Commit 6b4e6cb

Browse files
authored
Merge pull request #81 from NakaokaRei/feat/keyboard-input-source-context
feat: add keyboard input source context to AI agent
2 parents c75c584 + cc5889b commit 6b4e6cb

3 files changed

Lines changed: 95 additions & 2 deletions

File tree

Sources/SwiftAutoGUI/OpenAIVisionBackend.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,13 +249,18 @@ extension OpenAIVisionBackend {
249249
250250
You will also receive structured screen context alongside the screenshot. This includes:
251251
- The frontmost application name and bundle identifier
252+
- The current keyboard input source / IME mode (e.g., "U.S.", "日本語ローマ字")
252253
- A list of visible windows with their titles, owning apps, and screen bounds
253254
- An accessibility tree of the focused window showing UI elements with their roles, labels, values, and positions
254255
255256
Use this context to precisely locate UI elements. The bounding boxes in the accessibility tree \
256257
give exact coordinates you can use with move/click actions. Prefer using accessibility tree \
257258
coordinates over guessing positions from the screenshot when available. \
258259
The accessibility tree may be truncated ([...]) for deeply nested elements.
260+
261+
When the keyboard input source indicates a non-ASCII input mode (e.g., Japanese), \
262+
consider switching to an ASCII-capable source before using the 'write' action for English text. \
263+
Common shortcuts to toggle input source include Control+Space or Caps Lock, depending on user settings.
259264
"""
260265
}
261266

Sources/SwiftAutoGUI/ScreenContext.swift

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import AppKit
77
import ApplicationServices
8+
import Carbon
89
import Foundation
910

1011
// MARK: - Data Types
@@ -24,10 +25,14 @@ public struct ScreenContext: Sendable, Codable {
2425
/// The accessibility tree of the focused window (nil if unavailable).
2526
public let focusedWindowAXTree: AXNode?
2627

27-
public init(frontmostApp: AppInfo?, visibleWindows: [WindowInfo], focusedWindowAXTree: AXNode?) {
28+
/// The current keyboard input source / IME mode (nil if unavailable).
29+
public let keyboardInputSource: InputSourceInfo?
30+
31+
/// Creates a screen context snapshot from already-gathered components.
///
/// - Parameters:
///   - frontmostApp: The frontmost application, if known.
///   - visibleWindows: The windows currently visible on screen.
///   - focusedWindowAXTree: The accessibility tree of the focused window, if available.
///   - keyboardInputSource: The current keyboard input source / IME mode.
///     Defaults to `nil` so call sites written before this parameter existed keep compiling.
public init(
    frontmostApp: AppInfo?,
    visibleWindows: [WindowInfo],
    focusedWindowAXTree: AXNode?,
    keyboardInputSource: InputSourceInfo? = nil
) {
    self.frontmostApp = frontmostApp
    self.visibleWindows = visibleWindows
    self.focusedWindowAXTree = focusedWindowAXTree
    self.keyboardInputSource = keyboardInputSource
}
3237
}
3338

@@ -44,6 +49,20 @@ public struct AppInfo: Sendable, Codable {
4449
}
4550
}
4651

52+
/// Describes the keyboard input source (keyboard layout or IME mode) that is currently selected.
public struct InputSourceInfo: Sendable, Codable {
    /// Identifier of the input source,
    /// for example `"com.apple.inputmethod.Japanese.RomajiTyping"`.
    public let id: String

    /// Localized, human-readable name of the input source,
    /// for example `"日本語ローマ字"` or `"U.S."`.
    public let localizedName: String

    /// Creates an input source description.
    ///
    /// - Parameters:
    ///   - id: The input source identifier.
    ///   - localizedName: The localized display name.
    public init(id: String, localizedName: String) {
        self.id = id
        self.localizedName = localizedName
    }
}
65+
4766
/// Information about a visible window on screen.
4867
public struct WindowInfo: Sendable, Codable {
4968
public let title: String?
@@ -157,6 +176,7 @@ public struct ScreenContextProvider: Sendable {
157176
public static func gather(options: Options = Options()) -> ScreenContext {
158177
let frontmostApp = gatherFrontmostApp()
159178
let visibleWindows = gatherVisibleWindows()
179+
let inputSource = gatherKeyboardInputSource()
160180

161181
var axTree: AXNode?
162182
if options.includeAXTree, let app = frontmostApp {
@@ -167,7 +187,8 @@ public struct ScreenContextProvider: Sendable {
167187
return ScreenContext(
168188
frontmostApp: frontmostApp,
169189
visibleWindows: visibleWindows,
170-
focusedWindowAXTree: axTree
190+
focusedWindowAXTree: axTree,
191+
keyboardInputSource: inputSource
171192
)
172193
}
173194
}
@@ -185,6 +206,26 @@ extension ScreenContextProvider {
185206
}
186207
}
187208

209+
// MARK: - Gathering: Keyboard Input Source
210+
211+
extension ScreenContextProvider {
    /// Reads the currently selected keyboard input source via Text Input Source Services.
    ///
    /// - Returns: The identifier and localized name of the current input source, or `nil`
    ///   when the source itself, its identifier, or its localized name cannot be obtained.
    ///
    /// NOTE(review): TIS calls are commonly reported to expect the main thread — confirm
    /// which thread the callers of `gather(options:)` run on.
    private static func gatherKeyboardInputSource() -> InputSourceInfo? {
        guard
            // "Copy" API: ownership of the returned object is transferred to us.
            let current = TISCopyCurrentKeyboardInputSource()?.takeRetainedValue(),
            let rawID = TISGetInputSourceProperty(current, kTISPropertyInputSourceID),
            let rawName = TISGetInputSourceProperty(current, kTISPropertyLocalizedName)
        else {
            return nil
        }

        // "Get" API: the property values are borrowed, so they are taken unretained.
        let identifier = Unmanaged<CFString>.fromOpaque(rawID).takeUnretainedValue() as String
        let displayName = Unmanaged<CFString>.fromOpaque(rawName).takeUnretainedValue() as String

        return InputSourceInfo(id: identifier, localizedName: displayName)
    }
}
228+
188229
// MARK: - Gathering: Visible Windows
189230

190231
extension ScreenContextProvider {
@@ -378,6 +419,11 @@ extension ScreenContext {
378419
lines.append("Frontmost app: \(app.name)\(bundle)")
379420
}
380421

422+
// Keyboard input source
423+
if let inputSource = keyboardInputSource {
424+
lines.append("Keyboard input source: \(inputSource.localizedName) (\(inputSource.id))")
425+
}
426+
381427
// Visible windows
382428
if !visibleWindows.isEmpty {
383429
lines.append("Visible windows:")

Tests/SwiftAutoGUITests/ScreenContextTests.swift

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@ import Testing
1010
@Suite("ScreenContext Tests")
1111
struct ScreenContextTests {
1212

13+
// MARK: - InputSourceInfo Tests
14+
15+
@Suite("InputSourceInfo")
struct InputSourceInfoTests {

    /// Encoding to JSON and decoding back must preserve both fields.
    @Test("round-trip encoding/decoding")
    func roundTrip() throws {
        let original = InputSourceInfo(id: "com.apple.keylayout.US", localizedName: "U.S.")

        let encoded = try JSONEncoder().encode(original)
        let restored = try JSONDecoder().decode(InputSourceInfo.self, from: encoded)

        #expect(restored.id == original.id)
        #expect(restored.localizedName == original.localizedName)
    }
}
27+
1328
// MARK: - CodableRect Tests
1429

1530
@Suite("CodableRect")
@@ -196,6 +211,33 @@ struct ScreenContextTests {
196211
#expect(output.contains("[...]"))
197212
}
198213

214+
/// A context that carries an input source must render it as "name (identifier)".
@Test("formats keyboard input source")
func keyboardInputSource() {
    let usLayout = InputSourceInfo(
        id: "com.apple.keylayout.US",
        localizedName: "U.S."
    )
    let context = ScreenContext(
        frontmostApp: nil,
        visibleWindows: [],
        focusedWindowAXTree: nil,
        keyboardInputSource: usLayout
    )

    let rendered = context.formatted()

    #expect(rendered.contains("Keyboard input source: U.S. (com.apple.keylayout.US)"))
}
228+
229+
/// Without an input source, the formatted output must not mention one at all.
@Test("omits keyboard input source when nil")
func noKeyboardInputSource() {
    let rendered = ScreenContext(
        frontmostApp: nil,
        visibleWindows: [],
        focusedWindowAXTree: nil,
        keyboardInputSource: nil
    ).formatted()

    #expect(rendered.contains("Keyboard input source") == false)
}
240+
199241
@Test("full context output combines all sections")
200242
func fullContext() {
201243
let context = ScreenContext(

0 commit comments

Comments
 (0)