Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 225 additions & 52 deletions packages/core/src/ai-model/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import type {
InterfaceType,
PlanningAIResponse,
RawResponsePlanningAIResponse,
SubGoal,
UIContext,
} from '@/types';
import type { IModelConfig, TModelFamily } from '@midscene/shared/env';
Expand All @@ -18,11 +19,6 @@ import {
} from '../common';
import type { ConversationHistory } from './conversation-history';
import { systemPromptToTaskPlanning } from './prompt/llm-planning';
import {
extractXMLTag,
parseMarkFinishedIndexes,
parseSubGoalsFromXML,
} from './prompt/util';
import {
AIResponseParseError,
callAI,
Expand All @@ -33,60 +29,220 @@ const debug = getDebug('planning');
const warnLog = getDebug('planning', { console: true });

/**
 * Build the JSON Schema describing the planning response.
 *
 * The returned object is shaped for use as OpenAI's
 * `response_format: { type: "json_schema" }` option.
 *
 * Every property is listed in `required`; optional values are expressed as
 * nullable (`null` is an accepted type) rather than being omitted. The order
 * of keys in `properties` and of entries in `required` is the same:
 * `thought`, then (only when sub-goals are enabled) `update_sub_goals`,
 * `mark_finished_indexes`, `memory`, then `log`, `action_type`,
 * `action_param`, `complete`, `error`.
 *
 * @param includeSubGoals - When true, the sub-goal bookkeeping fields
 *   (`update_sub_goals`, `mark_finished_indexes`, `memory`) are added right
 *   after `thought`.
 * @returns A freshly built `json_schema` response-format descriptor named
 *   `planning_response` with `strict` set to `false`.
 */
export function buildPlanningResponseSchema(includeSubGoals: boolean): {
  type: 'json_schema';
  json_schema: {
    name: string;
    strict: boolean;
    schema: Record<string, unknown>;
  };
} {
  // Small local builders for the two nullable shapes used throughout.
  const nullableString = (description: string): Record<string, unknown> => ({
    type: ['string', 'null'],
    description,
  });
  const nullableOf = (
    description: string,
    variant: Record<string, unknown>,
  ): Record<string, unknown> => ({
    description,
    anyOf: [variant, { type: 'null' }],
  });

  // Always the first field.
  const leadingFields: Record<string, unknown> = {
    thought: {
      type: 'string',
      description:
        'Your thought process about the current state and next action',
    },
  };

  // Sub-goal fields sit between `thought` and `log` when enabled.
  const subGoalFields: Record<string, unknown> = includeSubGoals
    ? {
        update_sub_goals: nullableOf('Sub-goals to create or update', {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              index: {
                type: 'integer',
                description: 'Sub-goal index (1-based)',
              },
              status: {
                type: 'string',
                enum: ['pending', 'finished'],
                description: 'Status of the sub-goal',
              },
              description: {
                type: 'string',
                description: 'Description of the sub-goal',
              },
            },
            required: ['index', 'status', 'description'],
            additionalProperties: false,
          },
        }),
        mark_finished_indexes: nullableOf(
          'Indexes of sub-goals to mark as finished',
          {
            type: 'array',
            items: { type: 'integer' },
          },
        ),
        memory: nullableString(
          'Information to remember from the current screenshot for future steps',
        ),
      }
    : {};

  // Fields that always follow, in their fixed order.
  const trailingFields: Record<string, unknown> = {
    log: nullableString(
      'A brief preamble message to the user explaining what you are about to do',
    ),
    action_type: nullableString(
      'The action type to execute, must be one of the supporting actions',
    ),
    action_param: nullableOf('The parameters for the action', {
      type: 'object',
    }),
    complete: nullableOf('Set when the task is completed or failed', {
      type: 'object',
      properties: {
        success: {
          type: 'boolean',
          description: 'Whether the task was completed successfully',
        },
        message: {
          type: 'string',
          description: 'Message to provide to the user',
        },
      },
      required: ['success', 'message'],
      additionalProperties: false,
    }),
    error: nullableString('Error message if there is an error'),
  };

  // Spread preserves insertion order, so the final key order is
  // leading -> sub-goal (if any) -> trailing.
  const properties: Record<string, unknown> = {
    ...leadingFields,
    ...subGoalFields,
    ...trailingFields,
  };

  // Every declared property is required, in declaration order.
  const required: string[] = Object.keys(properties);

  return {
    type: 'json_schema',
    json_schema: {
      name: 'planning_response',
      strict: false,
      schema: {
        type: 'object',
        properties,
        required,
        additionalProperties: false,
      },
    },
  };
}

/**
* Parse JSON response from LLM and convert to RawResponsePlanningAIResponse
*/
export function parseXMLPlanningResponse(
xmlString: string,
export function parseJSONPlanningResponse(
jsonString: string,
modelFamily: TModelFamily | undefined,
): RawResponsePlanningAIResponse {
const thought = extractXMLTag(xmlString, 'thought');
const memory = extractXMLTag(xmlString, 'memory');
const log = extractXMLTag(xmlString, 'log') || '';
const error = extractXMLTag(xmlString, 'error');
const actionType = extractXMLTag(xmlString, 'action-type');
const actionParamStr = extractXMLTag(xmlString, 'action-param-json');

// Parse <complete> tag with success attribute
const completeGoalRegex =
/<complete\s+success="(true|false)">([\s\S]*?)<\/complete>/i;
const completeGoalMatch = xmlString.match(completeGoalRegex);
let finalizeMessage: string | undefined;
let finalizeSuccess: boolean | undefined;
let parsed: any;
try {
parsed = safeParseJson(jsonString, modelFamily);
} catch (e) {
throw new Error(`Failed to parse planning JSON response: ${e}`);
}

if (completeGoalMatch) {
finalizeSuccess = completeGoalMatch[1] === 'true';
finalizeMessage = completeGoalMatch[2]?.trim() || undefined;
if (!parsed || typeof parsed !== 'object') {
throw new Error('Planning response is not a valid JSON object');
}

// Parse sub-goal related tags
const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');
const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');
const thought = parsed.thought || undefined;
const memory = parsed.memory || undefined;
const log = parsed.log || '';
const error = parsed.error || undefined;

// Parse complete field
let finalizeMessage: string | undefined;
let finalizeSuccess: boolean | undefined;
if (parsed.complete && typeof parsed.complete === 'object') {
finalizeSuccess =
parsed.complete.success === true || parsed.complete.success === 'true'
? true
: parsed.complete.success === false ||
parsed.complete.success === 'false'
? false
: undefined;
finalizeMessage = parsed.complete.message?.trim() || undefined;
}

const updateSubGoals = updatePlanContent
? parseSubGoalsFromXML(updatePlanContent)
// Parse sub-goal related fields
const updateSubGoals: SubGoal[] | undefined = parsed.update_sub_goals?.length
? parsed.update_sub_goals.map((sg: any) => ({
index: sg.index,
status: sg.status as 'pending' | 'finished',
description: sg.description,
}))
: undefined;
const markFinishedIndexes = markSubGoalDone
? parseMarkFinishedIndexes(markSubGoalDone)

const markFinishedIndexes: number[] | undefined = parsed.mark_finished_indexes
?.length
? parsed.mark_finished_indexes
: undefined;

// Parse action
let action: any = null;
if (actionType && actionType.toLowerCase() !== 'null') {
const type = actionType.trim();
let param: any = undefined;

if (actionParamStr) {
try {
// Parse the JSON string in action-param-json
param = safeParseJson(actionParamStr, modelFamily);
} catch (e) {
throw new Error(`Failed to parse action-param-json: ${e}`);
}
}

if (parsed.action_type && parsed.action_type !== 'null') {
const type = String(parsed.action_type).trim();
action = {
type,
...(param !== undefined ? { param } : {}),
...(parsed.action_param != null ? { param: parsed.action_param } : {}),
};
}

Expand Down Expand Up @@ -229,34 +385,51 @@ export async function plan(
...historyLog,
];

// Build JSON schema for response format
// Some model families (doubao-seed, doubao-vision, qwen2.5-vl, glm-v, auto-glm) don't support json_schema response format
const modelFamiliesWithoutJsonSchema: (string | undefined)[] = [
'doubao-seed',
'doubao-vision',
'qwen2.5-vl',
'glm-v',
'auto-glm',
];
const supportsJsonSchema =
!modelFamiliesWithoutJsonSchema.includes(modelFamily);
const responseFormat = supportsJsonSchema
? buildPlanningResponseSchema(includeSubGoals)
: undefined;

let {
content: rawResponse,
usage,
reasoning_content,
} = await callAI(msgs, modelConfig, {
deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,
response_format: responseFormat,
abortSignal: opts.abortSignal,
});

// Parse XML response to JSON object, retry once on parse failure
// Parse JSON response, retry once on parse failure
let planFromAI: RawResponsePlanningAIResponse;
try {
try {
planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
planFromAI = parseJSONPlanningResponse(rawResponse, modelFamily);
} catch {
const retry = await callAI(msgs, modelConfig, {
deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,
response_format: responseFormat,
abortSignal: opts.abortSignal,
});
rawResponse = retry.content;
usage = retry.usage;
reasoning_content = retry.reasoning_content;
planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
planFromAI = parseJSONPlanningResponse(rawResponse, modelFamily);
}

if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {
warnLog(
'Planning response included both an action and <complete>; ignoring <complete> output.',
'Planning response included both an action and "complete"; ignoring "complete" output.',
);
planFromAI.finalizeMessage = undefined;
planFromAI.finalizeSuccess = undefined;
Expand All @@ -265,9 +438,9 @@ export async function plan(
const actions = planFromAI.action ? [planFromAI.action] : [];
let shouldContinuePlanning = true;

// Check if task is completed via <complete> tag
// Check if task is completed via "complete" field
if (planFromAI.finalizeSuccess !== undefined) {
debug('task completed via <complete> tag, stop planning');
debug('task completed via "complete" field, stop planning');
shouldContinuePlanning = false;
// Mark all sub-goals as finished when goal is completed (only when deepThink is enabled)
if (includeSubGoals) {
Expand Down Expand Up @@ -356,7 +529,7 @@ export async function plan(
const errorMessage =
parseError instanceof Error ? parseError.message : String(parseError);
throw new AIResponseParseError(
`XML parse error: ${errorMessage}`,
`JSON parse error: ${errorMessage}`,
rawResponse,
usage,
);
Expand Down
Loading
Loading