Skip to content

Commit aeca2d4

Browse files
committed
feat: add get_visual_tree tool (#293)
1 parent 74931e5 commit aeca2d4

File tree

10 files changed

+306
-74
lines changed

10 files changed

+306
-74
lines changed

src/Everywhere.Core/Chat/Plugins/VisualContextPlugin.cs

Lines changed: 172 additions & 50 deletions
Large diffs are not rendered by default.

src/Everywhere.Core/Interop/IVisualElementContext.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ public interface IVisualElementContext : IObservable<TextSelectionData>
7979
/// <returns></returns>
8080
IVisualElement? ElementFromPointer(ScreenSelectionMode mode = ScreenSelectionMode.Element);
8181

82+
/// <summary>
83+
/// Get the element from a native window handle.
84+
/// </summary>
85+
/// <param name="windowHandle">The platform-specific native handle of the target window.</param>
86+
/// <returns>The element for the given window, or <c>null</c> if none could be resolved.</returns>
87+
IVisualElement? ElementFromWindowHandle(nint windowHandle);
88+
8289
/// <summary>
8390
/// Let the user pick an element from the screen.
8491
/// </summary>

src/Everywhere.Core/Views/Controls/VisualTreeDebugger.axaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,11 @@
8585

8686
<DockPanel Grid.Row="2">
8787
<Button
88-
DockPanel.Dock="Right" Click="HandleBuildXmlButtonClicked"
89-
Content="Build XML"/>
88+
DockPanel.Dock="Right" Click="HandleBuildButtonClicked"
89+
Content="Build"/>
9090
<TextBox
9191
x:Name="TokenLimitTextBox" Text="8000"
92-
Watermark="Token limit"/>
92+
Watermark="Limit"/>
9393
</DockPanel>
9494

9595
<Button

src/Everywhere.Core/Views/Controls/VisualTreeDebugger.axaml.cs

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -138,46 +138,60 @@ private async void HandleCaptureButtonClicked(object? sender, RoutedEventArgs e)
138138
}
139139
}
140140

141-
private async void HandleBuildXmlButtonClicked(object? sender, RoutedEventArgs e)
141+
private async void HandleBuildButtonClicked(object? sender, RoutedEventArgs e)
142142
{
143143
try
144144
{
145+
const VisualTreeDetailLevel level = VisualTreeDetailLevel.Compact;
145146
var tokenLimit = int.Parse(TokenLimitTextBox.Text ?? "8000");
146147
var builder = new VisualTreeBuilder(
147148
VisualTreeView.SelectedItems.AsValueEnumerable().OfType<IVisualElement>().ToList(),
148149
tokenLimit,
149150
0,
150-
VisualTreeDetailLevel.Compact);
151+
level);
151152
#if DEBUG
152-
// use profiler to measure xml building time in debug mode
153-
var xml = await Task.Run(() =>
153+
// use profiler to measure building time in debug mode
154+
var visualTree = await Task.Run(() =>
154155
{
155156
var originalThreadName = Thread.CurrentThread.Name;
156-
Thread.CurrentThread.Name = "XML Builder Thread";
157-
MeasureProfiler.StartCollectingData("BuildXml");
157+
Thread.CurrentThread.Name = "Visual Tree Builder Thread";
158+
MeasureProfiler.StartCollectingData("BuildVisualTree");
158159

159160
try
160161
{
161162
return builder.Build(CancellationToken.None);
162163
}
163164
finally
164165
{
165-
MeasureProfiler.SaveData("BuildXml");
166+
MeasureProfiler.SaveData("BuildVisualTree");
166167
Thread.CurrentThread.Name = originalThreadName;
167168
}
168169
});
169170
#else
170-
var xml = await Task.Run(() => builder.Build(CancellationToken.None));
171+
var visualTree = await Task.Run(() => builder.Build(CancellationToken.None));
171172
#endif
172173
var timestamp = DateTime.Now.ToString("yyyyMMdd_HHmmss");
173-
var filename = $"visual_tree_{timestamp}.xml";
174-
var xmlPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, filename);
175-
await File.WriteAllTextAsync(xmlPath, xml);
176-
await ServiceLocator.Resolve<ILauncher>().LaunchFileInfoAsync(new FileInfo(xmlPath));
174+
var extension = level switch
175+
{
176+
VisualTreeDetailLevel.Compact => "json",
177+
VisualTreeDetailLevel.Detailed => "xml",
178+
_ => "toon"
179+
};
180+
var filename = $"visual_tree_{timestamp}.{extension}";
181+
var filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, filename);
182+
await File.WriteAllTextAsync(filePath, visualTree);
183+
await ServiceLocator.Resolve<ILauncher>().LaunchFileInfoAsync(new FileInfo(filePath));
177184
}
185+
#if DEBUG
186+
catch (Exception ex)
187+
{
188+
_ = ex;
189+
Debugger.Break();
190+
#else
178191
catch
179192
{
180193
// ignored
194+
#endif
181195
}
182196
}
183197
}

src/Everywhere.I18N/Strings.resx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2132,4 +2132,10 @@ Use scroll wheel or number keys to switch selection mode</value>
21322132
<data name="ChatPluginConsentRequest_CommonNone_Header" xml:space="preserve">
21332133
<value>Do you allow '{0}' to execute?</value>
21342134
</data>
2135+
<data name="BuiltInChatPlugin_VisualContext_GetVisualTree_Header" xml:space="preserve">
2136+
<value>Read Visual Tree</value>
2137+
</data>
2138+
<data name="BuiltInChatPlugin_VisualContext_GetVisualTree_Description" xml:space="preserve">
2139+
<value>Read the visual tree of a window or element on screen.</value>
2140+
</data>
21352141
</root>

src/Everywhere.I18N/Strings.zh-hans.resx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2133,4 +2133,10 @@
21332133
<data name="ChatPluginConsentRequest_CommonNone_Header" xml:space="preserve">
21342134
<value>是否允许“{0}”执行?</value>
21352135
</data>
2136+
<data name="BuiltInChatPlugin_VisualContext_GetVisualTree_Header" xml:space="preserve">
2137+
<value>读取视觉树</value>
2138+
</data>
2139+
<data name="BuiltInChatPlugin_VisualContext_GetVisualTree_Description" xml:space="preserve">
2140+
<value>读取屏幕上窗口或元素的视觉树。</value>
2141+
</data>
21362142
</root>

src/Everywhere.Linux/Interop/VisualElementContext.cs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ public IVisualElement? FocusedElement
7878
return ElementFromPoint(point, mode);
7979
}
8080

81+
public IVisualElement? ElementFromWindowHandle(IntPtr windowHandle)
82+
{
83+
throw new NotImplementedException();
84+
}
85+
8186
public async Task<IVisualElement?> PickElementAsync(ScreenSelectionMode? initialMode)
8287
{
8388
if (Application.Current is not { ApplicationLifetime: ClassicDesktopStyleApplicationLifetime desktopLifetime })
@@ -92,18 +97,18 @@ public IVisualElement? FocusedElement
9297
return result;
9398
}
9499

95-
public async Task<Avalonia.Media.Imaging.Bitmap?> ScreenshotAsync(ScreenSelectionMode? initialMode)
100+
public async Task<Bitmap?> ScreenshotAsync(ScreenSelectionMode? initialMode)
96101
{
97-
if (Application.Current is not { ApplicationLifetime: ClassicDesktopStyleApplicationLifetime desktopLifetime })
102+
if (Application.Current is not { ApplicationLifetime: ClassicDesktopStyleApplicationLifetime desktopLifetime })
98103
{
99104
return null;
100105
}
101106

102107
var windows = desktopLifetime.Windows.AsValueEnumerable().Where(w => w.IsVisible).ToList();
103108
foreach (var window in windows) window.Hide();
104-
109+
105110
var result = await ScreenshotPicker.ScreenshotAsync(this, backend, initialMode);
106-
111+
107112
foreach (var window in windows) window.IsVisible = true;
108113
return result;
109114
}
@@ -112,4 +117,4 @@ public IDisposable Subscribe(IObserver<TextSelectionData> observer)
112117
{
113118
return backend.Subscribe(observer);
114119
}
115-
}
120+
}

src/Everywhere.Mac/Interop/AXUIElement.cs

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -277,9 +277,6 @@ public Task<Bitmap> CaptureAsync(CancellationToken cancellationToken)
277277
var screen = NSScreen.Screens.FirstOrDefault(s => s.Frame.IntersectsWith(rect));
278278
var scale = screen?.BackingScaleFactor ?? 1.0;
279279

280-
var targetWidth = rect.Width * scale;
281-
var targetHeight = rect.Height * scale;
282-
283280
// cgImage captures the window content starting at (0,0) in Window Local Coordinates.
284281
// rect contains Screen Coordinates (including Dock/Menu bar offsets).
285282
// To crop correctly, we must transform rect to Window-Relative coordinates.
@@ -303,7 +300,8 @@ public Task<Bitmap> CaptureAsync(CancellationToken cancellationToken)
303300

304301
// Check if captured image approximately matches target size (allowing for rounding/shadows).
305302
// If it matches, we assume full window capture and start at 0,0.
306-
bool isFullWindow = cgImage.Width >= targetWidth - 2 && cgImage.Width <= targetWidth + 100;
303+
var targetWidth = rect.Width * scale;
304+
var isFullWindow = cgImage.Width >= targetWidth - 2 && cgImage.Width <= targetWidth + 100;
307305

308306
// If full window, offset is 0.
309307
// If partial (element inside window), offset is (ElementScreenPos - WindowScreenPos).
@@ -406,6 +404,70 @@ private void PerformAction(NSString actionName)
406404
return handle != 0 ? new AXUIElement(handle) : null;
407405
}
408406

407+
/// <summary>
408+
/// Gets the AXUIElement corresponding to the specified CGWindowID.
409+
/// This is a reverse lookup using _AXUIElementGetWindow under the hood.
410+
/// </summary>
411+
/// <param name="cgWindowId">The target CGWindowID.</param>
412+
/// <returns>The matching AXUIElement, or null if not found.</returns>
413+
public static AXUIElement? ElementFromWindowId(uint cgWindowId)
414+
{
415+
if (cgWindowId == 0) return null;
416+
417+
// 1. Get the owner PID from the CGWindowID using CoreGraphics
418+
var ownerPid = 0;
419+
var windowInfoArrayPtr = CGInterop.CGWindowListCopyWindowInfo(CGWindowListOption.IncludingWindow, cgWindowId);
420+
421+
if (windowInfoArrayPtr != 0)
422+
{
423+
// Take ownership of the CFArray: it is returned under the Create/Copy rule, so it must be released here.
424+
using var windowInfoArray = Runtime.GetNSObject<NSArray>(windowInfoArrayPtr, owns: true);
425+
if (windowInfoArray is { Count: > 0 })
426+
{
427+
using var windowInfo = windowInfoArray.GetItem<NSDictionary>(0);
428+
using var pidKey = new NSString("kCGWindowOwnerPID");
429+
430+
if (windowInfo?.ObjectForKey(pidKey) is NSNumber pidNumber)
431+
{
432+
ownerPid = pidNumber.Int32Value;
433+
}
434+
}
435+
}
436+
437+
if (ownerPid == 0) return null;
438+
439+
// 2. Create the AXApplication element from the PID
440+
using var appElement = ElementFromPid(ownerPid);
441+
if (appElement is null) return null;
442+
443+
// 3. Get all windows of the application
444+
// TODO: use AXAttributeConstants.Windows instead of the string literal, if that constant is defined.
445+
using var windowsKey = new NSString("AXWindows");
446+
using var windows = appElement.GetAttribute<NSArray>(windowsKey);
447+
448+
if (windows is null) return null;
449+
450+
// 4. Iterate through the windows and find the matching CGWindowID
451+
for (nuint i = 0; i < windows.Count; i++)
452+
{
453+
var windowElement = FromCopyArray(windows, i);
454+
if (windowElement is null) continue;
455+
456+
// Compare via the existing NativeWindowHandle property, which handles the _AXUIElementGetWindow P/Invoke.
457+
if (windowElement.NativeWindowHandle == (nint)cgWindowId)
458+
{
459+
// Found it! Return the retained element.
460+
return windowElement;
461+
}
462+
463+
// Not a match: explicitly dispose to release the CFRetain applied in FromCopyArray,
464+
// avoiding memory leaks during traversal.
465+
windowElement.Dispose();
466+
}
467+
468+
return null;
469+
}
470+
409471
[LibraryImport(AppServices, EntryPoint = "AXUIElementCreateSystemWide")]
410472
private static partial nint CreateSystemWide();
411473

src/Everywhere.Mac/Interop/VisualElementContext.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ public partial class VisualElementContext(IWindowHelper windowHelper) : IVisualE
6666
return point is null ? null : ElementFromPoint(point.Value, mode);
6767
}
6868

69+
public IVisualElement? ElementFromWindowHandle(IntPtr windowHandle)
70+
{
71+
return AXUIElement.ElementFromWindowId((uint)windowHandle);
72+
}
73+
6974
public Task<IVisualElement?> PickElementAsync(ScreenSelectionMode? initialMode)
7075
{
7176
return PickerSession.PickAsync(windowHelper, initialMode);

src/Everywhere.Windows/Interop/VisualElementContext.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ public partial class VisualElementContext(IWindowHelper windowHelper) : IVisualE
6060
return !PInvoke.GetCursorPos(out var point) ? null : ElementFromPoint(new PixelPoint(point.X, point.Y), mode);
6161
}
6262

63+
public IVisualElement? ElementFromWindowHandle(IntPtr windowHandle)
64+
{
65+
return TryCreateVisualElement(() => Automation.FromHandle(windowHandle));
66+
}
67+
6368
public Task<IVisualElement?> PickElementAsync(ScreenSelectionMode? initialMode) => PickerSession.PickAsync(windowHelper, initialMode);
6469

6570
public Task<Bitmap?> ScreenshotAsync(ScreenSelectionMode? initialMode) => ScreenshotSession.ScreenshotAsync(windowHelper, initialMode);

0 commit comments

Comments
 (0)