diff --git a/packages/core/lib/v3/cache/ActCache.ts b/packages/core/lib/v3/cache/ActCache.ts index 87b55ecac..2d3fb21e1 100644 --- a/packages/core/lib/v3/cache/ActCache.ts +++ b/packages/core/lib/v3/cache/ActCache.ts @@ -171,7 +171,7 @@ export class ActCache { const execute = async (): Promise => { const actionResults: ActResult[] = []; for (const action of entry.actions) { - const result = await handler.actFromObserveResult( + const result = await handler.takeDeterministicAction( action, page, this.domSettleTimeoutMs, diff --git a/packages/core/lib/v3/cache/AgentCache.ts b/packages/core/lib/v3/cache/AgentCache.ts index 637374d3e..6130bc8e9 100644 --- a/packages/core/lib/v3/cache/AgentCache.ts +++ b/packages/core/lib/v3/cache/AgentCache.ts @@ -557,7 +557,7 @@ export class AgentCache { if (actions.length > 0) { const page = await ctx.awaitActivePage(); for (const action of actions) { - await handler.actFromObserveResult( + await handler.takeDeterministicAction( action, page, this.domSettleTimeoutMs, @@ -581,7 +581,7 @@ export class AgentCache { if (!Array.isArray(actions) || actions.length === 0) return; const page = await ctx.awaitActivePage(); for (const action of actions) { - await handler.actFromObserveResult( + await handler.takeDeterministicAction( action, page, this.domSettleTimeoutMs, diff --git a/packages/core/lib/v3/handlers/actHandler.ts b/packages/core/lib/v3/handlers/actHandler.ts index 2527064bf..41f0afe9a 100644 --- a/packages/core/lib/v3/handlers/actHandler.ts +++ b/packages/core/lib/v3/handlers/actHandler.ts @@ -23,6 +23,15 @@ import { waitForDomNetworkQuiet, } from "./handlerUtils/actHandlerUtils"; +type ActInferenceElement = { + elementId?: string; + description: string; + method?: string; + arguments?: string[]; +}; + +type ActInferenceResponse = Awaited>; + export class ActHandler { private readonly llmClient: LLMClient; private readonly defaultModelName: AvailableModel; @@ -70,6 +79,67 @@ export class ActHandler { this.defaultDomSettleTimeoutMs = defaultDomSettleTimeoutMs; } + private recordActMetrics(response: ActInferenceResponse): void { + this.onMetrics?.( + V3FunctionName.ACT, + response.prompt_tokens ?? 0, + response.completion_tokens ?? 0, + response.reasoning_tokens ?? 0, + response.cached_input_tokens ?? 0, + response.inference_time_ms ?? 0, + ); + } + + private async getActionFromLLM({ + instruction, + domElements, + xpathMap, + llmClient, + variables, + requireMethodAndArguments = true, + }: { + instruction: string; + domElements: string; + xpathMap: Record; + llmClient: LLMClient; + variables?: Record; + requireMethodAndArguments?: boolean; + }): Promise<{ action?: Action; response: ActInferenceResponse }> { + const response = await actInference({ + instruction, + domElements, + llmClient, + userProvidedInstructions: this.systemPrompt, + logger: v3Logger, + logInferenceToFile: this.logInferenceToFile, + }); + + this.recordActMetrics(response); + + const normalized = normalizeActInferenceElement( + response.element as ActInferenceElement | undefined, + xpathMap, + requireMethodAndArguments, + ); + + if (!normalized) { + return { response }; + } + + const action: Action = { + ...normalized, + arguments: substituteVariablesInArguments( + normalized.arguments, + variables, + ), + } as Action; + + return { + action, + response, + }; + } + async act(params: ActHandlerParams): Promise { const { instruction, page, variables, timeout, model } = params; @@ -80,79 +150,27 @@ export class ActHandler { page.mainFrame(), this.defaultDomSettleTimeoutMs, ); - const snapshot = await captureHybridSnapshot(page as Page, { - experimental: true, - }); - const combinedTree = snapshot.combinedTree; - const combinedXpathMap = (snapshot.combinedXpathMap ?? {}) as Record< - EncodedId, - string - >; + const { combinedTree, combinedXpathMap } = await captureHybridSnapshot( + page, + { experimental: true }, + ); - const observeActInstruction = buildActPrompt( + const actInstruction = buildActPrompt( instruction, Object.values(SupportedPlaywrightAction), variables, ); - // Always ask for an action - const actInferenceResponse = await actInference({ - instruction: observeActInstruction, - domElements: combinedTree, - llmClient, - userProvidedInstructions: this.systemPrompt, - logger: v3Logger, - logInferenceToFile: this.logInferenceToFile, - }); - - // Update ACT metrics from the LLM observation call - const actPromptTokens = actInferenceResponse.prompt_tokens ?? 0; - const actCompletionTokens = actInferenceResponse.completion_tokens ?? 0; - const actReasoningTokens = actInferenceResponse.reasoning_tokens ?? 0; - const actCachedInputTokens = - actInferenceResponse.cached_input_tokens ?? 0; - const actInferenceTimeMs = actInferenceResponse.inference_time_ms ?? 0; - this.onMetrics?.( - V3FunctionName.ACT, - actPromptTokens, - actCompletionTokens, - actReasoningTokens, - actCachedInputTokens, - actInferenceTimeMs, - ); - - // Normalize single LLM element → Action - const raw = actInferenceResponse.element as - | { - elementId: string; - description: string; - method: string; - arguments: string[]; - } - | undefined; - - const result: Action | undefined = (() => { - if (!raw) return undefined; - const { elementId, description, method, arguments: args } = raw; - if (!method || method === "not-supported" || !Array.isArray(args)) { - return undefined; - } - if (typeof elementId === "string" && elementId.includes("-")) { - const xp = combinedXpathMap[elementId as EncodedId]; - const trimmed = trimTrailingTextNode(xp); - if (!trimmed) return undefined; - return { - description, - method, - arguments: args, - selector: `xpath=${trimmed}`, - } as Action; - } - // shadow-root path not supported here (match old behavior) - return undefined; - })(); + const { action: firstAction, response: actInferenceResponse } = + await this.getActionFromLLM({ + instruction: actInstruction, + domElements: combinedTree, + xpathMap: combinedXpathMap, + llmClient, + variables, + }); - if (!result) { + if (!firstAction) { v3Logger({ category: "action", message: "no actionable element returned by LLM", @@ -166,40 +184,26 @@ export class ActHandler { }; } - // Use the first observed element and substitute variables - const chosen: Action = { ...result } as Action; - if (variables && Array.isArray(chosen.arguments)) { - chosen.arguments = chosen.arguments.map((arg: string) => { - let out = arg; - for (const [k, v] of Object.entries(variables)) { - const token = `%${k}%`; - out = out.split(token).join(String(v)); - } - return out; - }); - } - // First action (self-heal aware path) - const firstResult = await this.actFromObserveResult( - chosen, - page as Page, + const firstResult = await this.takeDeterministicAction( + firstAction, + page, this.defaultDomSettleTimeoutMs, llmClient, ); // If not two-step, return the first action result - const twoStep = !!( - actInferenceResponse as unknown as { twoStep?: boolean } - ).twoStep; - if (!twoStep) { + if (actInferenceResponse?.twoStep !== true) { return firstResult; } // Take a new focused snapshot and observe again - const secondSnapshot = await captureHybridSnapshot(page as Page, { + const { + combinedTree: combinedTree2, + combinedXpathMap: combinedXpathMap2, + } = await captureHybridSnapshot(page, { experimental: true, }); - const combinedTree2 = secondSnapshot.combinedTree; let diffedTree = diffCombinedTrees(combinedTree, combinedTree2); if (!diffedTree.trim()) { @@ -207,10 +211,7 @@ export class ActHandler { diffedTree = combinedTree2; } - const combinedXpathMap2 = (secondSnapshot.combinedXpathMap ?? - {}) as Record; - - const previousAction = `method: ${chosen.method}, description: ${chosen.description}, arguments: ${chosen.arguments}`; + const previousAction = `method: ${firstAction.method}, description: ${firstAction.description}, arguments: ${firstAction.arguments}`; const stepTwoInstructions = buildStepTwoPrompt( instruction, @@ -226,74 +227,22 @@ export class ActHandler { variables, ); - const action2 = await actInference({ + const { action: secondAction } = await this.getActionFromLLM({ instruction: stepTwoInstructions, domElements: diffedTree, + xpathMap: combinedXpathMap2, llmClient, - userProvidedInstructions: this.systemPrompt, - logger: v3Logger, - logInferenceToFile: this.logInferenceToFile, + variables, }); - // Update ACT metrics for the second observation call - this.onMetrics?.( - V3FunctionName.ACT, - action2.prompt_tokens ?? 0, - action2.completion_tokens ?? 0, - action2.reasoning_tokens ?? 0, - action2.cached_input_tokens ?? 0, - action2.inference_time_ms ?? 0, - ); - const raw2 = action2.element as - | { - elementId: string; - description: string; - method?: string; - arguments?: string[]; - } - | undefined; - - const result2: Action | undefined = (() => { - if (!raw2) return undefined; - const { elementId, description, method, arguments: args } = raw2; - if (!method || method === "not-supported" || !Array.isArray(args)) { - return undefined; - } - if (typeof elementId === "string" && elementId.includes("-")) { - const xp = combinedXpathMap2[elementId as EncodedId]; - const trimmed = trimTrailingTextNode(xp); - if (!trimmed) return undefined; - return { - description, - method, - arguments: args, - selector: `xpath=${trimmed}`, - } as Action; - } - return undefined; - })(); - - if (!result2) { + if (!secondAction) { // No second action found — return first result as-is return firstResult; } - const chosen2: Action = { ...result2 } as Action; - // Carry forward variables substitution for step 2 as well - if (variables && Array.isArray(chosen2.arguments)) { - chosen2.arguments = chosen2.arguments.map((arg: string) => { - let out = arg; - for (const [k, v] of Object.entries(variables)) { - const token = `%${k}%`; - out = out.split(token).join(String(v)); - } - return out; - }); - } - - const secondResult = await this.actFromObserveResult( - chosen2, - page as Page, + const secondResult = await this.takeDeterministicAction( + secondAction, + page, this.defaultDomSettleTimeoutMs, llmClient, ); @@ -328,7 +277,7 @@ export class ActHandler { ]); } - async actFromObserveResult( + async takeDeterministicAction( action: Action, page: Page, domSettleTimeoutMs?: number, @@ -407,10 +356,10 @@ export class ActHandler { : method; // Take a fresh snapshot and ask for a new actionable element - const snapshot = await captureHybridSnapshot(page as Page, { - experimental: true, - }); - const combinedTree = snapshot.combinedTree; + const { combinedTree, combinedXpathMap } = + await captureHybridSnapshot(page, { + experimental: true, + }); const instruction = buildActPrompt( actCommand, @@ -418,27 +367,17 @@ export class ActHandler { {}, ); - const actInferenceResponse = await actInference({ - instruction, - domElements: combinedTree, - llmClient: effectiveClient, - userProvidedInstructions: this.systemPrompt, - logger: v3Logger, - logInferenceToFile: this.logInferenceToFile, - }); - - // Update ACT metrics with the retry observation - this.onMetrics?.( - V3FunctionName.ACT, - actInferenceResponse.prompt_tokens ?? 0, - actInferenceResponse.completion_tokens ?? 0, - actInferenceResponse.reasoning_tokens ?? 0, - actInferenceResponse.cached_input_tokens ?? 0, - actInferenceResponse.inference_time_ms ?? 0, - ); - - const fallback = actInferenceResponse.element; - if (!fallback) { + const { action: fallbackAction, response: fallbackResponse } = + await this.getActionFromLLM({ + instruction, + domElements: combinedTree, + xpathMap: combinedXpathMap, + llmClient: effectiveClient, + requireMethodAndArguments: false, + }); + + const fallbackElement = fallbackResponse.element; + if (!fallbackElement) { return { success: false, message: @@ -450,11 +389,8 @@ export class ActHandler { // Retry with original method/args but new selector from fallback let newSelector = action.selector; - if (typeof fallback.elementId === "string") { - const enc = fallback.elementId as EncodedId; - const rawXp = (snapshot.combinedXpathMap ?? {})[enc]; - const trimmed = trimTrailingTextNode(rawXp); - if (trimmed) newSelector = `xpath=${trimmed}`; + if (fallbackAction?.selector) { + newSelector = fallbackAction.selector; } await performUnderstudyMethod( @@ -500,3 +436,57 @@ export class ActHandler { } } } + +function normalizeActInferenceElement( + element: ActInferenceElement | undefined, + xpathMap: Record, + requireMethodAndArguments = true, +): Action | undefined { + if (!element) { + return undefined; + } + const { elementId, description, method, arguments: args } = element; + const hasArgs = Array.isArray(args); + + if ( + requireMethodAndArguments && + (!method || method === "not-supported" || !hasArgs) + ) { + return undefined; + } + + if (typeof elementId !== "string" || !elementId.includes("-")) { + return undefined; + } + + const xp = xpathMap[elementId as EncodedId]; + const trimmed = trimTrailingTextNode(xp); + if (!trimmed) { + return undefined; + } + + return { + description, + method, + arguments: hasArgs ? args : undefined, + selector: `xpath=${trimmed}`, + } as Action; +} + +function substituteVariablesInArguments( + args: string[] | undefined, + variables?: Record, +): string[] | undefined { + if (!variables || !Array.isArray(args)) { + return args; + } + + return args.map((arg: string) => { + let out = arg; + for (const [key, value] of Object.entries(variables)) { + const token = `%${key}%`; + out = out.split(token).join(String(value)); + } + return out; + }); +} diff --git a/packages/core/lib/v3/v3.ts b/packages/core/lib/v3/v3.ts index 71d84d17b..6e43e17ac 100644 --- a/packages/core/lib/v3/v3.ts +++ b/packages/core/lib/v3/v3.ts @@ -978,7 +978,7 @@ export class V3 { frameId: v3Page.mainFrameId(), }); } else { - actResult = await this.actHandler.actFromObserveResult( + actResult = await this.actHandler.takeDeterministicAction( { ...input, selector }, // ObserveResult v3Page, // V3 Page this.domSettleTimeoutMs,