Skip to content

Commit 453deeb

Browse files
Samiya Caur authored and Devtools-frontend LUCI CQ committed
[AiAssistance] Send multimodal input prompt as part of user query
Bug: 393036441
Change-Id: I19aff5affd7f45943b2d6981908d0c569ea16bb7
Reviewed-on: https://chromium-review.googlesource.com/c/devtools/devtools-frontend/+/6262874
Auto-Submit: Samiya Caur <[email protected]>
Commit-Queue: Samiya Caur <[email protected]>
Reviewed-by: Alex Rudenko <[email protected]>
1 parent b787b83 commit 453deeb

File tree

3 files changed

+92
-8
lines changed

3 files changed

+92
-8
lines changed

front_end/panels/ai_assistance/agents/AiAgent.ts

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -244,7 +244,7 @@ export abstract class AiAgent<T> {
244244
this.confirmSideEffect = opts.confirmSideEffectForTest ?? (() => Promise.withResolvers());
245245
}
246246

247-
async enhanceQuery(query: string, selected: ConversationContext<T>|null): Promise<string>;
247+
async enhanceQuery(query: string, selected: ConversationContext<T>|null, hasImageInput?: boolean): Promise<string>;
248248
async enhanceQuery(query: string): Promise<string> {
249249
return query;
250250
}
@@ -362,8 +362,7 @@ export abstract class AiAgent<T> {
362362
this.#context = options.selected;
363363
}
364364

365-
const enhancedQuery = await this.enhanceQuery(initialQuery, options.selected);
366-
365+
const enhancedQuery = await this.enhanceQuery(initialQuery, options.selected, Boolean(imageInput));
367366
Host.userMetrics.freestylerQueryLength(enhancedQuery.length);
368367

369368
let query: Host.AidaClient.Part|Host.AidaClient.Part[];

front_end/panels/ai_assistance/agents/StylingAgent.test.ts

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@ const {StylingAgent, ErrorType} = AiAssistance;
1717
describeWithEnvironment('StylingAgent', () => {
1818
function mockHostConfig(
1919
modelId?: string, temperature?: number, userTier?: string,
20-
executionMode?: Root.Runtime.HostConfigFreestylerExecutionMode) {
20+
executionMode?: Root.Runtime.HostConfigFreestylerExecutionMode, multimodal?: boolean) {
2121
updateHostConfig({
2222
devToolsFreestyler: {
2323
modelId,
2424
temperature,
2525
userTier,
2626
executionMode,
27+
multimodal,
2728
},
2829
});
2930
}
@@ -1225,6 +1226,67 @@ STOP
12251226
});
12261227
});
12271228

1229+
describe('enhanceQuery', () => {
1230+
const agent = new StylingAgent({
1231+
aidaClient: mockAidaClient(),
1232+
});
1233+
1234+
beforeEach(() => {
1235+
element.simpleSelector.returns('div#myElement');
1236+
element.getChildNodesPromise.resolves(null);
1237+
});
1238+
1239+
it('does not add multimodal input evaluation prompt when multimodal is disabled', async () => {
1240+
mockHostConfig('test model');
1241+
const enhancedQuery = await agent.enhanceQuery('test query', new AiAssistance.NodeContext(element), true);
1242+
1243+
assert.strictEqual(
1244+
enhancedQuery,
1245+
'# Inspected element\n\n* Its selector is `div#myElement`\n\n# User request\n\nQUERY: test query',
1246+
);
1247+
});
1248+
1249+
it('does not add multimodal input evaluation prompt when multimodal is enabled but hasImageInput is false',
1250+
async () => {
1251+
mockHostConfig('test model', 1, 'PUBLIC', Root.Runtime.HostConfigFreestylerExecutionMode.NO_SCRIPTS, true);
1252+
const enhancedQuery = await agent.enhanceQuery('test query', new AiAssistance.NodeContext(element), false);
1253+
1254+
assert.strictEqual(
1255+
enhancedQuery,
1256+
'# Inspected element\n\n* Its selector is `div#myElement`\n\n# User request\n\nQUERY: test query',
1257+
);
1258+
});
1259+
1260+
it('adds multimodal input evaluation prompt when multimodal is enabled and hasImageInput is true', async () => {
1261+
mockHostConfig('test model', 1, 'PUBLIC', Root.Runtime.HostConfigFreestylerExecutionMode.NO_SCRIPTS, true);
1262+
const enhancedQuery = await agent.enhanceQuery('test query', new AiAssistance.NodeContext(element), true);
1263+
1264+
assert.strictEqual(
1265+
enhancedQuery,
1266+
`The user has provided you a screenshot of the page (as visible in the viewport) in base64-encoded format. You SHOULD use it while answering user's queries.
1267+
1268+
# Considerations for evaluating image:
1269+
* Pay close attention to the spatial details as well as the visual appearance of the selected element in the image, particularly in relation to layout, spacing, and styling.
1270+
* Try to connect the screenshot to actual DOM elements in the page.
1271+
* Analyze the image to identify the layout structure surrounding the element, including the positioning of neighboring elements.
1272+
* Extract visual information from the image, such as colors, fonts, spacing, and sizes, that might be relevant to the user's query.
1273+
* If the image suggests responsiveness issues (e.g., cropped content, overlapping elements), consider those in your response.
1274+
* Consider the surrounding elements and overall layout in the image, but prioritize the selected element's styling and positioning.
1275+
1276+
* As part of THOUGHT, evaluate the image to gather data that might be needed to answer the question.
1277+
In case query is related to the image, ALWAYS first use image evaluation to get all details from the image. ONLY after you have all data needed from image, you should move to other steps.
1278+
1279+
# Inspected element
1280+
1281+
* Its selector is \`div#myElement\`
1282+
1283+
# User request
1284+
1285+
QUERY: test query`,
1286+
);
1287+
});
1288+
});
1289+
12281290
describe('HostConfigFreestylerExecutionMode', () => {
12291291
function getMockClient() {
12301292
return mockAidaClient([

front_end/panels/ai_assistance/agents/StylingAgent.ts

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,21 @@ OBSERVATION: {"elementStyles":{"display":"block","visibility":"visible","positio
140140
141141
ANSWER: Even though the popup itself has a z-index of 3, its parent container has position: relative and z-index: 1. This creates a new stacking context for the popup. Because the "background" div has a z-index of 2, which is higher than the stacking context of the popup, it is rendered on top, obscuring the popup.
142142
SUGGESTIONS: ["What is a stacking context?", "How can I change the stacking order?"]
143+
`;
144+
145+
const promptForMultimodalInputEvaluation = `The user has provided you a screenshot of the page (as visible in the viewport) in base64-encoded format. You SHOULD use it while answering user's queries.
146+
147+
# Considerations for evaluating image:
148+
* Pay close attention to the spatial details as well as the visual appearance of the selected element in the image, particularly in relation to layout, spacing, and styling.
149+
* Try to connect the screenshot to actual DOM elements in the page.
150+
* Analyze the image to identify the layout structure surrounding the element, including the positioning of neighboring elements.
151+
* Extract visual information from the image, such as colors, fonts, spacing, and sizes, that might be relevant to the user's query.
152+
* If the image suggests responsiveness issues (e.g., cropped content, overlapping elements), consider those in your response.
153+
* Consider the surrounding elements and overall layout in the image, but prioritize the selected element's styling and positioning.
154+
155+
* As part of THOUGHT, evaluate the image to gather data that might be needed to answer the question.
156+
In case query is related to the image, ALWAYS first use image evaluation to get all details from the image. ONLY after you have all data needed from image, you should move to other steps.
157+
143158
`;
144159
/* clang-format on */
145160

@@ -265,6 +280,11 @@ export class StylingAgent extends AiAgent<SDK.DOMModel.DOMNode> {
265280
};
266281
}
267282

283+
get multimodalInputEnabled(): boolean {
284+
const {hostConfig} = Root.Runtime;
285+
return Boolean(hostConfig.devToolsFreestyler?.multimodal);
286+
}
287+
268288
override parseResponse(response: Host.AidaClient.AidaResponse): ParsedResponse {
269289
if (response.functionCalls) {
270290
throw new Error('Function calling not supported yet');
@@ -727,13 +747,16 @@ export class StylingAgent extends AiAgent<SDK.DOMModel.DOMNode> {
727747
};
728748
}
729749

730-
override async enhanceQuery(query: string, selectedElement: ConversationContext<SDK.DOMModel.DOMNode>|null):
731-
Promise<string> {
732-
const elementEnchantmentQuery = selectedElement ?
750+
override async enhanceQuery(
751+
query: string, selectedElement: ConversationContext<SDK.DOMModel.DOMNode>|null,
752+
hasImageInput?: boolean): Promise<string> {
753+
const elementEnchancementQuery = selectedElement ?
733754
`# Inspected element\n\n${
734755
await StylingAgent.describeElement(selectedElement.getItem())}\n\n# User request\n\n` :
735756
'';
736-
return `${elementEnchantmentQuery}QUERY: ${query}`;
757+
const multimodalInputEnhancementQuery =
758+
this.multimodalInputEnabled && hasImageInput ? promptForMultimodalInputEvaluation : '';
759+
return `${multimodalInputEnhancementQuery}${elementEnchancementQuery}QUERY: ${query}`;
737760
}
738761

739762
override formatParsedAnswer({answer}: ParsedAnswer): string {

0 commit comments

Comments
 (0)