From 1461114fe045cb27e94525a5f193ebfa43d572f9 Mon Sep 17 00:00:00 2001 From: Huanyu Luo Date: Sun, 28 Sep 2025 13:28:30 +0800 Subject: [PATCH] feat(core): add continuous screenshot capturing feature and enhance context handling --- .../src/components/detail-panel/index.tsx | 57 ++++-- .../components/global-hover-preview/index.tsx | 18 +- apps/report/src/components/timeline/index.tsx | 128 ++++++++++---- packages/cli/src/create-yaml-player.ts | 1 + .../midscene_scripts/online/video-player.yaml | 20 +++ packages/core/src/agent/agent.ts | 164 +++++++++++++++++- packages/core/src/agent/tasks.ts | 47 ++++- packages/core/src/agent/utils.ts | 46 ++++- packages/core/src/ai-model/inspect.ts | 113 +++++++++--- packages/core/src/types.ts | 11 ++ packages/core/src/yaml.ts | 8 +- packages/core/src/yaml/player.ts | 12 +- packages/core/tests/utils.ts | 4 +- .../visualizer/src/utils/replay-scripts.ts | 65 +++++-- .../src/puppeteer/agent-launcher.ts | 1 + 15 files changed, 595 insertions(+), 100 deletions(-) create mode 100644 packages/cli/tests/midscene_scripts/online/video-player.yaml diff --git a/apps/report/src/components/detail-panel/index.tsx b/apps/report/src/components/detail-panel/index.tsx index ceda8d650..7f9ad6190 100644 --- a/apps/report/src/components/detail-panel/index.tsx +++ b/apps/report/src/components/detail-panel/index.tsx @@ -49,6 +49,36 @@ const DetailPanel = (): JSX.Element => { ); const imageWidth = useExecutionDump((store) => store.insightWidth); const imageHeight = useExecutionDump((store) => store.insightHeight); + type RecorderScreenshotEntry = { + img: string; + ts: number; + order: number; + timing?: string; + }; + const screenshotEntries: RecorderScreenshotEntry[] = + activeTask?.recorder?.flatMap((item) => { + const entries: RecorderScreenshotEntry[] = []; + const seen = new Set(); + const addImage = (img: string | undefined, order: number) => { + if (!img || seen.has(img)) { + return; + } + seen.add(img); + entries.push({ + img, + ts: item.ts, + order, + timing: item.timing, + }); + }; + + addImage(item.screenshot, 0); + item.screenshots?.forEach((img, idx) => { + addImage(img, idx + 1); + }); + + return entries; + }) || []; // Check if page context is frozen const isPageContextFrozen = Boolean( @@ -118,20 +148,23 @@ const DetailPanel = (): JSX.Element => { content =
invalid view
; } } else if (viewType === VIEW_TYPE_SCREENSHOT) { - if (activeTask.recorder?.length) { + if (screenshotEntries.length) { content = (
- {activeTask.recorder - .filter((item) => item.screenshot) - .map((item, index) => { - const fullTime = timeStr(item.ts); - const str = item.timing - ? `${fullTime} / ${item.timing}` - : fullTime; - return ( - - ); - })} + {screenshotEntries.map((entry, index) => { + const baseTime = timeStr(entry.ts); + const label = entry.timing + ? `${baseTime} / ${entry.timing}` + : baseTime; + const suffix = entry.order > 0 ? ` (#${entry.order + 1})` : ''; + return ( + + ); + })}
); } else { diff --git a/apps/report/src/components/global-hover-preview/index.tsx b/apps/report/src/components/global-hover-preview/index.tsx index d01850952..4be1d849d 100644 --- a/apps/report/src/components/global-hover-preview/index.tsx +++ b/apps/report/src/components/global-hover-preview/index.tsx @@ -16,13 +16,27 @@ const GlobalHoverPreview = () => { const images = hoverTask?.recorder ?.filter((item) => { - let valid = Boolean(item.screenshot); + let valid = Boolean(item.screenshot || item.screenshots?.length); if (hoverTimestamp) { valid = valid && item.ts >= hoverTimestamp; } return valid; }) - .map((item) => item.screenshot); + .flatMap((item) => { + const list: string[] = []; + const seen = new Set(); + const addImage = (img?: string) => { + if (!img || seen.has(img)) { + return; + } + seen.add(img); + list.push(img); + }; + + addImage(item.screenshot); + item.screenshots?.forEach((img) => addImage(img)); + return list; + }); const { x, y } = hoverPreviewConfig || {}; let left = 0; diff --git a/apps/report/src/components/timeline/index.tsx b/apps/report/src/components/timeline/index.tsx index eb3aeb7e5..83d10aaa8 100644 --- a/apps/report/src/components/timeline/index.tsx +++ b/apps/report/src/components/timeline/index.tsx @@ -3,7 +3,7 @@ import * as PIXI from 'pixi.js'; import { useEffect, useMemo, useRef } from 'react'; import './index.less'; -import type { ExecutionRecorderItem, ExecutionTask } from '@midscene/core'; +import type { ExecutionTask } from '@midscene/core'; import { getTextureFromCache, loadTexture } from '../pixi-loader'; import { useAllCurrentTasks, useExecutionDump } from '../store'; @@ -11,6 +11,9 @@ interface TimelineItem { id: string; img: string; timeOffset: number; + order?: number; + overlapIndex?: number; + overlapCount?: number; x?: number; y?: number; width?: number; @@ -254,8 +257,21 @@ const TimelineWidget = (props: { (screenshotHeight / originalHeight) * originalWidth, ); - const screenshotX = leftForTimeOffset(screenshot.timeOffset); - allScreenshots[index].x = screenshotX; + const baseX = leftForTimeOffset(screenshot.timeOffset); + let overlapX = baseX; + + if ((screenshot.overlapCount ?? 1) > 1) { + const overlapCount = screenshot.overlapCount ?? 1; + const overlapIndex = screenshot.overlapIndex ?? 0; + const centeredIndex = overlapIndex - (overlapCount - 1) / 2; + const overlapGap = Math.min( + Math.max(Math.floor(screenshotWidth * 0.25), 10 * sizeRatio), + Math.max(screenshotWidth, 30 * sizeRatio), + ); + overlapX = baseX + centeredIndex * overlapGap; + } + + allScreenshots[index].x = overlapX; allScreenshots[index].y = screenshotTop; allScreenshots[index].width = screenshotWidth; allScreenshots[index].height = screenshotMaxHeight; @@ -263,7 +279,7 @@ const TimelineWidget = (props: { const border = new PIXI.Graphics(); border.lineStyle(sizeRatio, shotBorderColor, 1); border.drawRect( - screenshotX, + overlapX, screenshotTop, screenshotWidth, screenshotMaxHeight, @@ -271,7 +287,7 @@ const TimelineWidget = (props: { border.endFill(); container.addChild(border); - screenshotSprite.x = screenshotX; + screenshotSprite.x = overlapX; screenshotSprite.y = screenshotTop; screenshotSprite.width = screenshotWidth; screenshotSprite.height = screenshotMaxHeight; @@ -392,7 +408,9 @@ const TimelineWidget = (props: { indicatorContainer.addChild(indicator); // time string - const text = pixiTextForNumber(timeOffsetForLeft(x)); + const timeToDisplay = + closestScreenshot?.timeOffset ?? timeOffsetForLeft(x); + const text = pixiTextForNumber(timeToDisplay); text.x = x + 5; text.y = timeTextTop; const textBg = new PIXI.Graphics(); @@ -459,41 +477,79 @@ const Timeline = () => { let startingTime = -1; let idCount = 1; const idTaskMap: Record = {}; - const allScreenshots: TimelineItem[] = allTasks - .reduce<(ExecutionRecorderItem & { id: string })[]>((acc, current) => { - const recorders = current.recorder || []; - recorders.forEach((item) => { - if (startingTime === -1 || startingTime > item.ts) { - startingTime = item.ts; - } - }); - if ( - current.timing?.start && - (startingTime === -1 || startingTime > current.timing.start) - ) { - startingTime = current.timing.start; + type RecorderEntry = { + id: string; + img: string; + ts: number; + order: number; + overlapIndex: number; + overlapCount: number; + }; + + const recorderEntries = allTasks.reduce((acc, current) => { + const recorders = current.recorder || []; + recorders.forEach((item) => { + if (startingTime === -1 || startingTime > item.ts) { + startingTime = item.ts; } - const recorderItemWithId = recorders.map((item) => { + }); + if ( + current.timing?.start && + (startingTime === -1 || startingTime > current.timing.start) + ) { + startingTime = current.timing.start; + } + + recorders.forEach((item) => { + const imageCandidates = [ + item.screenshot, + ...(item.screenshots ?? []), + ].filter((img): img is string => Boolean(img)); + if (!imageCandidates.length) { + return; + } + const uniqueImages = Array.from(new Set(imageCandidates)); + const overlapCount = uniqueImages.length; + uniqueImages.forEach((img, idx) => { const idStr = `id_${idCount++}`; idTaskMap[idStr] = current; - return { - ...item, + acc.push({ id: idStr, - }; + img, + ts: item.ts, + order: idx, + overlapIndex: idx, + overlapCount, + }); }); - return acc.concat(recorderItemWithId || []); - }, []) - .filter((item) => { - return item.screenshot; - }) - .map((recorderItem) => { - return { - id: recorderItem.id, - img: recorderItem.screenshot!, - timeOffset: recorderItem.ts - startingTime, - }; - }) - .sort((a, b) => a.timeOffset - b.timeOffset); + }); + + return acc; + }, []); + + if (startingTime === -1 && recorderEntries.length) { + startingTime = recorderEntries[0]!.ts; + } + + if (startingTime === -1) { + startingTime = 0; + } + + const allScreenshots: TimelineItem[] = recorderEntries + .map((entry) => ({ + id: entry.id, + img: entry.img, + timeOffset: entry.ts - startingTime, + order: entry.order, + overlapIndex: entry.overlapIndex, + overlapCount: entry.overlapCount, + })) + .sort((a, b) => { + if (a.timeOffset === b.timeOffset) { + return (a.order ?? 0) - (b.order ?? 0); + } + return a.timeOffset - b.timeOffset; + }); const itemOnTap = (item: TimelineItem) => { const task = idTaskMap[item.id]; diff --git a/packages/cli/src/create-yaml-player.ts b/packages/cli/src/create-yaml-player.ts index dbf5b6ee0..963c9493c 100644 --- a/packages/cli/src/create-yaml-player.ts +++ b/packages/cli/src/create-yaml-player.ts @@ -149,6 +149,7 @@ export async function createYamlPlayer( const agent = new AgentOverChromeBridge({ closeNewTabsAfterDisconnect: webTarget.closeNewTabsAfterDisconnect, cacheId: fileName, + continuousScreenshot: webTarget.continuousScreenshot, }); if (webTarget.bridgeMode === 'newTabWithUrl') { diff --git a/packages/cli/tests/midscene_scripts/online/video-player.yaml b/packages/cli/tests/midscene_scripts/online/video-player.yaml new file mode 100644 index 000000000..32b22e3f0 --- /dev/null +++ b/packages/cli/tests/midscene_scripts/online/video-player.yaml @@ -0,0 +1,20 @@ +web: + url: https://www.bilibili.com + headless: false + continuousScreenshot: + enabled: true + intervalMs: 1000 + maxCount: 10 + + + +tasks: + - name: 随机播放视频 + flow: + - ai: 随机选择一个视频播放 + - sleep: 3000 + + - name: 检查结果 + flow: + - aiAssert: 视频播放区域有控制面板浮层 + screenshotListIncluded: true \ No newline at end of file diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 923eea762..3febb9a6b 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,6 +48,7 @@ import { globalConfigManager, globalModelConfigManager, } from '@midscene/shared/env'; +import { resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; // import type { AndroidDeviceInputOpt } from '../device'; @@ -63,6 +64,8 @@ import { import { trimContextByViewport } from './utils'; const debug = getDebug('agent'); +const DEFAULT_CONTINUOUS_SCREENSHOT_INTERVAL_MS = 5000; +const DEFAULT_CONTINUOUS_SCREENSHOT_MAX_COUNT = 10; const distanceOfTwoPoints = (p1: [number, number], p2: [number, number]) => { const [x1, y1] = p1; @@ -79,6 +82,7 @@ const includedInRect = (point: [number, number], rect: Rect) => { const defaultInsightExtractOption: InsightExtractOption = { domIncluded: false, screenshotIncluded: true, + screenshotListIncluded: false, }; export class Agent< @@ -123,6 +127,12 @@ export class Agent< */ private hasWarnedNonVLModel = false; + private continuousScreenshotTimer?: NodeJS.Timeout; + + private continuousScreenshots: string[] = []; + + private isCapturingContinuousScreenshot = false; + // @deprecated use .interface instead get page() { return this.interface; @@ -192,6 +202,141 @@ export class Agent< this.reportFileName = opts?.reportFileName || getReportFileName(opts?.testId || this.interface.interfaceType || 'web'); + + if (this.isContinuousScreenshotEnabled()) { + this.startContinuousScreenshot(); + } + } + + private isContinuousScreenshotEnabled(): boolean { + const config = this.opts.continuousScreenshot; + if (!config) { + return false; + } + if (config.enabled === false) { + return false; + } + return typeof this.interface.screenshotBase64 === 'function'; + } + + private getContinuousScreenshotConfig() { + const config = this.opts.continuousScreenshot ?? {}; + const intervalMs = + typeof config.intervalMs === 'number' && config.intervalMs > 0 + ? config.intervalMs + : DEFAULT_CONTINUOUS_SCREENSHOT_INTERVAL_MS; + const maxCount = + typeof config.maxCount === 'number' && config.maxCount > 0 + ? Math.floor(config.maxCount) + : DEFAULT_CONTINUOUS_SCREENSHOT_MAX_COUNT; + return { intervalMs, maxCount }; + } + + private startContinuousScreenshot() { + this.stopContinuousScreenshot(); + const { intervalMs } = this.getContinuousScreenshotConfig(); + this.continuousScreenshots = this.mergeScreenshotLists( + this.continuousScreenshots, + ); + void this.captureContinuousScreenshot(); + this.continuousScreenshotTimer = setInterval(() => { + void this.captureContinuousScreenshot(); + }, intervalMs); + // this.continuousScreenshotTimer.unref?.(); + } + + private stopContinuousScreenshot() { + if (this.continuousScreenshotTimer) { + clearInterval(this.continuousScreenshotTimer); + this.continuousScreenshotTimer = undefined; + } + } + + private mergeScreenshotLists(...lists: (string[] | undefined)[]): string[] { + const { maxCount } = this.getContinuousScreenshotConfig(); + const unique = new Set(); + for (const list of lists) { + if (!list) continue; + for (const item of list) { + if (!item) continue; + if (unique.has(item)) { + unique.delete(item); + } + unique.add(item); + } + } + if (!unique.size) { + return []; + } + const merged = Array.from(unique); + if (merged.length > maxCount) { + return merged.slice(merged.length - maxCount); + } + return merged; + } + + private async captureContinuousScreenshot() { + if ( + !this.isContinuousScreenshotEnabled() || + this.isCapturingContinuousScreenshot || + this.destroyed + ) { + return; + } + if (typeof this.interface.screenshotBase64 !== 'function') { + return; + } + this.isCapturingContinuousScreenshot = true; + try { + let screenshot = await this.interface.screenshotBase64(); + if (!screenshot) { + return; + } + if (typeof this.interface.size === 'function') { + try { + const size = await this.interface.size(); + if (size?.dpr && size.dpr !== 1) { + screenshot = await resizeImgBase64(screenshot, { + width: size.width, + height: size.height, + }); + } + } catch (error) { + debug('Failed to get size for continuous screenshot', error); + } + } + this.continuousScreenshots = this.mergeScreenshotLists( + this.continuousScreenshots, + [screenshot], + ); + } catch (error) { + debug('Failed to capture continuous screenshot', error); + } finally { + this.isCapturingContinuousScreenshot = false; + } + } + + private mergeContinuousScreenshotsIntoContext(context: UIContext): UIContext { + if (!context) { + return context; + } + + if (!this.isContinuousScreenshotEnabled()) { + if (!context.screenshotBase64List?.length && context.screenshotBase64) { + context.screenshotBase64List = [context.screenshotBase64]; + } + return context; + } + + const merged = this.mergeScreenshotLists( + this.continuousScreenshots, + context.screenshotBase64List, + context.screenshotBase64 ? [context.screenshotBase64] : undefined, + ); + + context.screenshotBase64List = merged; + this.continuousScreenshots = merged; + return context; } async getActionSpace(): Promise { @@ -210,12 +355,19 @@ export class Agent< if (this.interface.getContext) { debug('Using page.getContext for action:', action); - return await this.interface.getContext(); + const context = await this.interface.getContext(); + return this.mergeContinuousScreenshotsIntoContext(context); } else { - debug('Using commonContextParser for action:', action); - return await commonContextParser(this.interface, { + const context = await commonContextParser(this.interface, { uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(), + screenshotBase64List: this.isContinuousScreenshotEnabled() + ? this.continuousScreenshots + : undefined, + screenshotMaxCount: this.isContinuousScreenshotEnabled() + ? this.getContinuousScreenshotConfig().maxCount + : undefined, }); + return this.mergeContinuousScreenshotsIntoContext(context); } } @@ -838,6 +990,9 @@ export class Agent< screenshotIncluded: opt?.screenshotIncluded ?? defaultInsightExtractOption.screenshotIncluded, + screenshotListIncluded: + opt?.screenshotListIncluded ?? + defaultInsightExtractOption.screenshotListIncluded, isWaitForAssert: opt?.isWaitForAssert, doNotThrowError: opt?.doNotThrowError, }; @@ -949,6 +1104,9 @@ export class Agent< async destroy() { await this.interface.destroy?.(); + this.stopContinuousScreenshot(); + this.continuousScreenshots = []; + this.isCapturingContinuousScreenshot = false; this.resetDump(); // reset dump to release memory this.destroyed = true; } diff --git a/packages/core/src/agent/tasks.ts b/packages/core/src/agent/tasks.ts index e75fe4aff..6edfadef4 100644 --- a/packages/core/src/agent/tasks.ts +++ b/packages/core/src/agent/tasks.ts @@ -66,6 +66,41 @@ const debug = getDebug('device-task-executor'); const defaultReplanningCycleLimit = 10; const defaultVlmUiTarsReplanningCycleLimit = 40; +interface ScreenshotExtractionResult { + primary?: string; + extras: string[]; +} + +const extractScreenshotsFromContext = ( + uiContext: UIContext, +): ScreenshotExtractionResult => { + const uniqueScreenshots = new Set(); + const orderedScreenshots: string[] = []; + + const pushUnique = (img?: string) => { + if (!img || uniqueScreenshots.has(img)) { + return; + } + uniqueScreenshots.add(img); + orderedScreenshots.push(img); + }; + + pushUnique(uiContext.screenshotBase64); + (uiContext.screenshotBase64List || []).forEach((img) => { + pushUnique(img); + }); + + if (!orderedScreenshots.length && uiContext.screenshotBase64) { + orderedScreenshots.push(uiContext.screenshotBase64); + } + + const [primary, ...extras] = orderedScreenshots; + return { + primary, + extras, + }; +}; + export function locatePlanForLocate(param: string | DetailedLocateParam) { const locate = typeof param === 'string' ? { prompt: param } : param; const locatePlan: PlanningAction = { @@ -259,10 +294,12 @@ export class TaskExecutor { const uiContext = await this.insight.contextRetrieverFn('locate'); task.uiContext = uiContext; + const { primary, extras } = extractScreenshotsFromContext(uiContext); const recordItem: ExecutionRecorderItem = { type: 'screenshot', ts: shotTime, - screenshot: uiContext.screenshotBase64, + screenshot: primary ?? uiContext.screenshotBase64, + screenshots: extras.length ? extras : undefined, timing: 'before Insight', }; task.recorder = [recordItem]; @@ -605,10 +642,12 @@ export class TaskExecutor { private async setupPlanningContext(executorContext: ExecutorContext) { const shotTime = Date.now(); const uiContext = await this.insight.contextRetrieverFn('locate'); + const { primary, extras } = extractScreenshotsFromContext(uiContext); const recordItem: ExecutionRecorderItem = { type: 'screenshot', ts: shotTime, - screenshot: uiContext.screenshotBase64, + screenshot: primary ?? uiContext.screenshotBase64, + screenshots: extras.length ? extras : undefined, timing: 'before Planning', }; @@ -921,10 +960,12 @@ export class TaskExecutor { const uiContext = await this.insight.contextRetrieverFn('extract'); task.uiContext = uiContext; + const { primary, extras } = extractScreenshotsFromContext(uiContext); const recordItem: ExecutionRecorderItem = { type: 'screenshot', ts: shotTime, - screenshot: uiContext.screenshotBase64, + screenshot: primary ?? uiContext.screenshotBase64, + screenshots: extras.length ? extras : undefined, timing: 'before Extract', }; task.recorder = [recordItem]; diff --git a/packages/core/src/agent/utils.ts b/packages/core/src/agent/utils.ts index 690f9884d..2e63a708c 100644 --- a/packages/core/src/agent/utils.ts +++ b/packages/core/src/agent/utils.ts @@ -32,7 +32,11 @@ const debugProfile = getDebug('web:tool:profile'); export async function commonContextParser( interfaceInstance: AbstractInterface, - _opt: { uploadServerUrl?: string }, + _opt: { + uploadServerUrl?: string; + screenshotBase64List?: string[]; + screenshotMaxCount?: number; + } = {}, ): Promise { assert(interfaceInstance, 'interfaceInstance is required'); @@ -47,8 +51,24 @@ export async function commonContextParser( }); debugProfile('UploadTestInfoToServer end'); + const existingScreenshots = Array.isArray(_opt.screenshotBase64List) + ? _opt.screenshotBase64List.filter( + (item): item is string => typeof item === 'string' && item.length > 0, + ) + : []; + const maxCount = + typeof _opt.screenshotMaxCount === 'number' && + Number.isFinite(_opt.screenshotMaxCount) && + _opt.screenshotMaxCount > 0 + ? Math.floor(_opt.screenshotMaxCount) + : undefined; + let screenshotBase64 = await interfaceInstance.screenshotBase64(); assert(screenshotBase64!, 'screenshotBase64 is required'); + let screenshotBase64List = existingScreenshots.slice(); + if (screenshotBase64) { + screenshotBase64List.push(screenshotBase64); + } const size = await interfaceInstance.size(); debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`); @@ -60,6 +80,29 @@ export async function commonContextParser( height: size.height, }); debugProfile('ResizeImgBase64 end'); + if (screenshotBase64List.length) { + screenshotBase64List[screenshotBase64List.length - 1] = screenshotBase64!; + } else if (screenshotBase64) { + screenshotBase64List.push(screenshotBase64); + } + } + + if (screenshotBase64List.length) { + const uniqueScreenshots = new Set(); + for (const item of screenshotBase64List) { + if (!item) continue; + if (uniqueScreenshots.has(item)) { + uniqueScreenshots.delete(item); + } + uniqueScreenshots.add(item); + } + screenshotBase64List = Array.from(uniqueScreenshots); + } + + if (maxCount && screenshotBase64List.length > maxCount) { + screenshotBase64List = screenshotBase64List.slice( + screenshotBase64List.length - maxCount, + ); } return { @@ -69,6 +112,7 @@ export async function commonContextParser( }, size, screenshotBase64: screenshotBase64!, + screenshotBase64List, }; } diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts index 9173277f1..63a360e0a 100644 --- a/packages/core/src/ai-model/inspect.ts +++ b/packages/core/src/ai-model/inspect.ts @@ -137,6 +137,13 @@ export async function AiLocateElement< const { context, targetElementDescription, callAIFn, modelConfig } = options; const { vlMode } = modelConfig; const { screenshotBase64 } = context; + const additionalScreenshots = Array.from( + new Set( + (context.screenshotBase64List || []) + .filter((img): img is string => Boolean(img)) + .filter((img) => img !== screenshotBase64), + ), + ); const { description, elementById, insertElementByPosition } = await describeUserPage(context, { vlMode }); @@ -190,23 +197,36 @@ export async function AiLocateElement< ); } + const imageContents: ChatCompletionUserMessageParam['content'] = [ + { + type: 'image_url', + image_url: { + url: imagePayload, + detail: 'high', + }, + }, + ]; + + for (const extraScreenshot of additionalScreenshots) { + imageContents.push({ + type: 'image_url', + image_url: { + url: extraScreenshot, + detail: 'high', + }, + }); + } + + imageContents.push({ + type: 'text', + text: userInstructionPrompt, + }); + const msgs: AIArgs = [ { role: 'system', content: systemPrompt }, { role: 'user', - content: [ - { - type: 'image_url', - image_url: { - url: imagePayload, - detail: 'high', - }, - }, - { - type: 'text', - text: userInstructionPrompt, - }, - ], + content: imageContents, }, ]; @@ -305,28 +325,48 @@ export async function AiLocateSection(options: { const { context, sectionDescription, modelConfig } = options; const { vlMode } = modelConfig; const { screenshotBase64 } = context; + const additionalScreenshots = Array.from( + new Set( + (context.screenshotBase64List || []) + .filter((img): img is string => Boolean(img)) + .filter((img) => img !== screenshotBase64), + ), + ); const systemPrompt = systemPromptToLocateSection(vlMode); const sectionLocatorInstructionText = await sectionLocatorInstruction.format({ sectionDescription: extraTextFromUserPrompt(sectionDescription), }); + const sectionContents: ChatCompletionUserMessageParam['content'] = [ + { + type: 'image_url', + image_url: { + url: screenshotBase64, + detail: 'high', + }, + }, + ]; + + for (const extraScreenshot of additionalScreenshots) { + sectionContents.push({ + type: 'image_url', + image_url: { + url: extraScreenshot, + detail: 'high', + }, + }); + } + + sectionContents.push({ + type: 'text', + text: sectionLocatorInstructionText, + }); + const msgs: AIArgs = [ { role: 'system', content: systemPrompt }, { role: 'user', - content: [ - { - type: 'image_url', - image_url: { - url: screenshotBase64, - detail: 'high', - }, - }, - { - type: 'text', - text: sectionLocatorInstructionText, - }, - ], + content: sectionContents, }, ]; @@ -424,6 +464,17 @@ export async function AiExtractElementInfo< const systemPrompt = systemPromptToExtract(); const { screenshotBase64 } = context; + const allowAdditionalScreenshots = + extractOption?.screenshotListIncluded === true; + const additionalScreenshots = allowAdditionalScreenshots + ? Array.from( + new Set( + (context.screenshotBase64List || []) + .filter((img): img is string => Boolean(img)) + .filter((img) => img !== screenshotBase64), + ), + ) + : []; const { description, elementById } = await describeUserPage(context, { truncateTextLength: 200, @@ -450,6 +501,16 @@ export async function AiExtractElementInfo< }); } + for (const extraScreenshot of additionalScreenshots) { + userContent.push({ + type: 'image_url', + image_url: { + url: extraScreenshot, + detail: 'high', + }, + }); + } + userContent.push({ type: 'text', text: extractDataPromptText, diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 16290ef3a..c359f5f6f 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -127,6 +127,8 @@ export interface AgentDescribeElementAtPointResult { export abstract class UIContext { abstract screenshotBase64: string; + abstract screenshotBase64List?: string[]; + abstract tree: ElementTreeNode; abstract size: Size; @@ -320,6 +322,7 @@ export interface ExecutionRecorderItem { type: 'screenshot'; ts: number; screenshot?: string; + screenshots?: string[]; timing?: string; } @@ -594,4 +597,12 @@ export interface AgentOpt { modelConfig?: TModelConfigFn; useCache?: boolean; replanningCycleLimit?: number; + continuousScreenshot?: { + /** Whether to enable continuous screenshot capturing */ + enabled?: boolean; + /** Interval between screenshots in milliseconds */ + intervalMs?: number; + /** Maximum number of screenshots to keep */ + maxCount?: number; + }; } diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts index 118bae641..47322fce2 100644 --- a/packages/core/src/yaml.ts +++ b/packages/core/src/yaml.ts @@ -13,6 +13,7 @@ export interface LocateOption { export interface InsightExtractOption { domIncluded?: boolean | 'visible-only'; screenshotIncluded?: boolean; + screenshotListIncluded?: boolean; // To make the assert in the "waitfor" section display the warning icon in report isWaitForAssert?: boolean; doNotThrowError?: boolean; @@ -60,6 +61,11 @@ export type MidsceneYamlScriptAgentOpt = Pick; export interface MidsceneYamlScriptConfig { output?: string; unstableLogContent?: boolean | string; + continuousScreenshot?: { + enabled: boolean; + intervalMs: number; + maxCount?: number; + }; } export interface MidsceneYamlScriptEnvGeneralInterface { @@ -126,7 +132,7 @@ export interface MidsceneYamlFlowItemAIAction { cacheable?: boolean; } -export interface MidsceneYamlFlowItemAIAssert { +export interface MidsceneYamlFlowItemAIAssert extends InsightExtractOption { aiAssert: string; errorMessage?: string; name?: string; diff --git a/packages/core/src/yaml/player.ts b/packages/core/src/yaml/player.ts index 58044b4da..e5f72ea2f 100644 --- a/packages/core/src/yaml/player.ts +++ b/packages/core/src/yaml/player.ts @@ -228,6 +228,7 @@ export class ScriptPlayer { const { pass, thought, message } = (await agent.aiAssert(prompt, msg, { keepRawResponse: true, + screenshotListIncluded: assertTask.screenshotListIncluded, })) || {}; this.setResult(assertTask.name, { @@ -245,6 +246,7 @@ export class ScriptPlayer { const options = { domIncluded: queryTask.domIncluded, screenshotIncluded: queryTask.screenshotIncluded, + screenshotListIncluded: queryTask.screenshotListIncluded, }; assert(prompt, 'missing prompt for aiQuery'); const queryResult = await agent.aiQuery(prompt, options); @@ -255,6 +257,7 @@ export class ScriptPlayer { const options = { domIncluded: numberTask.domIncluded, screenshotIncluded: numberTask.screenshotIncluded, + screenshotListIncluded: numberTask.screenshotListIncluded, }; assert(prompt, 'missing prompt for aiNumber'); const numberResult = await agent.aiNumber(prompt, options); @@ -265,6 +268,7 @@ export class ScriptPlayer { const options = { domIncluded: stringTask.domIncluded, screenshotIncluded: stringTask.screenshotIncluded, + screenshotListIncluded: stringTask.screenshotListIncluded, }; assert(prompt, 'missing prompt for aiString'); const stringResult = await agent.aiString(prompt, options); @@ -275,6 +279,7 @@ export class ScriptPlayer { const options = { domIncluded: booleanTask.domIncluded, screenshotIncluded: booleanTask.screenshotIncluded, + screenshotListIncluded: booleanTask.screenshotListIncluded, }; assert(prompt, 'missing prompt for aiBoolean'); const booleanResult = await agent.aiBoolean(prompt, options); @@ -282,8 +287,13 @@ export class ScriptPlayer { } else if ('aiAsk' in (flowItem as MidsceneYamlFlowItemAIAsk)) { const askTask = flowItem as MidsceneYamlFlowItemAIAsk; const prompt = askTask.aiAsk; + const options = { + domIncluded: askTask.domIncluded, + screenshotIncluded: askTask.screenshotIncluded, + screenshotListIncluded: askTask.screenshotListIncluded, + }; assert(prompt, 'missing prompt for aiAsk'); - const askResult = await agent.aiAsk(prompt); + const askResult = await agent.aiAsk(prompt, options); this.setResult(askTask.name, askResult); } else if ('aiLocate' in (flowItem as MidsceneYamlFlowItemAILocate)) { const locateTask = flowItem as MidsceneYamlFlowItemAILocate; diff --git a/packages/core/tests/utils.ts b/packages/core/tests/utils.ts index 5a41e5e32..b2511e517 100644 --- a/packages/core/tests/utils.ts +++ b/packages/core/tests/utils.ts @@ -26,8 +26,10 @@ export function sleep(ms: number) { export function fakeInsight(content: string) { const screenshot = getFixture('baidu.png'); + const screenshotBase64 = localImg2Base64(screenshot); const basicContext = { - screenshotBase64: localImg2Base64(screenshot), + screenshotBase64, + screenshotBase64List: [screenshotBase64], size: { width: 1920, height: 1080 }, content: [ { diff --git a/packages/visualizer/src/utils/replay-scripts.ts b/packages/visualizer/src/utils/replay-scripts.ts index e01e72b14..ef408bc69 100644 --- a/packages/visualizer/src/utils/replay-scripts.ts +++ b/packages/visualizer/src/utils/replay-scripts.ts @@ -133,6 +133,28 @@ const capitalizeFirstLetter = (str: string) => { return str.charAt(0).toUpperCase() + str.slice(1); }; +const screenshotSequenceFromContext = (context?: UIContext): string[] => { + if (!context) { + return []; + } + const unique = new Set(); + const ordered: string[] = []; + const push = (img?: string) => { + if (!img || unique.has(img)) { + return; + } + unique.add(img); + ordered.push(img); + }; + + push(context.screenshotBase64); + (context.screenshotBase64List || []).forEach((img) => { + push(img); + }); + + return ordered; +}; + export const allScriptsFromDump = ( dump: GroupedActionDump, ): ReplayScriptsInfo | null => { @@ -338,24 +360,24 @@ export const generateAnimationScripts = ( }; } const context = insightTask.uiContext; - if (context?.screenshotBase64) { + const screenshots = screenshotSequenceFromContext(context); + if (context && screenshots.length) { + const [primaryScreenshot, ...extraScreenshots] = screenshots; const insightDump = insightTask.log?.dump; const insightContentLength = context.tree ? treeToList(context.tree).length : 0; - if (context.screenshotBase64) { - // show the original screenshot first - scripts.push({ - type: 'img', - img: context.screenshotBase64, - duration: stillAfterInsightDuration, - title, - subTitle, - imageWidth: context.size?.width || imageWidth, - imageHeight: context.size?.height || imageHeight, - }); - } + // show the original screenshot first + scripts.push({ + type: 'img', + img: primaryScreenshot, + duration: stillAfterInsightDuration, + title, + subTitle, + imageWidth: context.size?.width || imageWidth, + imageHeight: context.size?.height || imageHeight, + }); let cameraState: TargetCameraState | undefined = undefined; if (currentCameraState === fullPageCameraState) { @@ -371,7 +393,7 @@ export const generateAnimationScripts = ( scripts.push({ type: 'insight', - img: context.screenshotBase64, + img: primaryScreenshot, context: context, camera: cameraState, highlightElement: insightTask.output?.element || undefined, @@ -391,6 +413,21 @@ export const generateAnimationScripts = ( title, subTitle, }); + + extraScreenshots.forEach((img, idx) => { + scripts.push({ + type: 'img', + img, + duration: stillAfterInsightDuration, + title, + subTitle: + extraScreenshots.length > 1 + ? `${subTitle} (#${idx + 2})` + : `${subTitle} (#2)`, + imageWidth: context.size?.width || imageWidth, + imageHeight: context.size?.height || imageHeight, + }); + }); insightOnTop = true; } } else if ( diff --git a/packages/web-integration/src/puppeteer/agent-launcher.ts b/packages/web-integration/src/puppeteer/agent-launcher.ts index 8c3e7786f..a12c23e04 100644 --- a/packages/web-integration/src/puppeteer/agent-launcher.ts +++ b/packages/web-integration/src/puppeteer/agent-launcher.ts @@ -207,6 +207,7 @@ export async function puppeteerAgentForTarget( typeof target.forceSameTabNavigation !== 'undefined' ? target.forceSameTabNavigation : true, // true for default in yaml script + continuousScreenshot: target.continuousScreenshot, }); freeFn.push({