Skip to content

Commit af573ab

Browse files
committed
Update verify-distilled.ts
1 parent f27a777 commit af573ab

File tree

1 file changed

+61
-31
lines changed

1 file changed

+61
-31
lines changed

packages/mcp-server/scripts/verify-distilled.ts

Lines changed: 61 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -76,13 +76,43 @@ Guidelines for determining accuracy:
7676
- NOT_ACCURATE: Missing critical information that would mislead developers
7777
- NOT_ACCURATE: Incorrect code examples or API usage
7878
79-
You must respond in the following JSON format ONLY:
80-
{
81-
"status": "ACCURATE" or "NOT_ACCURATE",
82-
"reasoning": "Brief explanation of your decision (max 200 characters)"
83-
}
79+
You must respond in exactly this format:
80+
81+
STATUS: [ACCURATE or NOT_ACCURATE]
82+
REASONING: [Brief explanation of your decision in one sentence]
83+
84+
Do not include any other text, formatting, or markdown in your response.`;
85+
86+
function parse_verification_response(text: string): {
87+
status: 'ACCURATE' | 'NOT_ACCURATE';
88+
reasoning: string;
89+
} | null {
90+
// Try to extract STATUS and REASONING using regex
91+
const status_match = text.match(/STATUS:\s*(ACCURATE|NOT_ACCURATE)/i);
92+
const reasoning_match = text.match(/REASONING:\s*(.+?)(?:\n|$)/i);
93+
94+
if (status_match && reasoning_match) {
95+
return {
96+
status: status_match[1]!.toUpperCase() as 'ACCURATE' | 'NOT_ACCURATE',
97+
reasoning: reasoning_match[1]!.trim(),
98+
};
99+
}
100+
101+
// Fallback: try to find just "ACCURATE" or "NOT_ACCURATE" anywhere in the response
102+
const accurate_match = text.match(/\b(NOT_ACCURATE|ACCURATE)\b/i);
103+
if (accurate_match) {
104+
// Extract some context as reasoning
105+
const lines = text.split('\n').filter((line) => line.trim());
106+
const reasoning = lines.slice(0, 3).join(' ').slice(0, 200);
107+
108+
return {
109+
status: accurate_match[1]!.toUpperCase() as 'ACCURATE' | 'NOT_ACCURATE',
110+
reasoning: reasoning || 'Could not extract detailed reasoning',
111+
};
112+
}
84113

85-
Do not include any other text in your response, only the JSON object.`;
114+
return null;
115+
}
86116

87117
async function main() {
88118
program.parse();
@@ -115,10 +145,7 @@ async function main() {
115145
sections = sections_to_verify.slice(0, 2);
116146
}
117147

118-
console.log(`\n📋 Will verify ${sections.length} sections:`);
119-
for (const slug of sections) {
120-
console.log(` - ${slug}`);
121-
}
148+
console.log(`\n📋 Will verify ${sections.length} sections`);
122149

123150
// Dry run mode: exit before API calls
124151
if (options.dryRun) {
@@ -150,7 +177,7 @@ async function main() {
150177
custom_id: `verify-${index}`,
151178
params: {
152179
model: anthropic.get_model_identifier(),
153-
max_tokens: 1024,
180+
max_tokens: 4096, // Increased to allow full responses
154181
messages: [
155182
{
156183
role: 'user',
@@ -227,29 +254,28 @@ async function main() {
227254
continue;
228255
}
229256

230-
try {
231-
// Parse the JSON response
232-
const parsed = JSON.parse(output_content.trim());
233-
const status = parsed.status as 'ACCURATE' | 'NOT_ACCURATE';
234-
const reasoning = parsed.reasoning as string;
235-
236-
verification_results.push({
237-
slug,
238-
status,
239-
reasoning,
240-
});
257+
// Parse using regex instead of strict JSON parsing
258+
const parsed = parse_verification_response(output_content);
241259

242-
const emoji = status === 'ACCURATE' ? '✅' : '❌';
243-
console.log(` ${emoji} ${slug}: ${status}`);
244-
} catch (error) {
245-
console.error(` ❌ Failed to parse response for ${slug}:`, error);
246-
console.error(` Raw response: ${output_content}`);
260+
if (!parsed) {
261+
console.error(` ❌ Failed to parse response for ${slug}`);
262+
console.error(` Raw response: ${output_content.slice(0, 200)}...`);
247263
verification_results.push({
248264
slug,
249265
status: 'NOT_ACCURATE',
250266
reasoning: `Failed to parse verification response: ${output_content.slice(0, 100)}`,
251267
});
268+
continue;
252269
}
270+
271+
verification_results.push({
272+
slug,
273+
status: parsed.status,
274+
reasoning: parsed.reasoning,
275+
});
276+
277+
const emoji = parsed.status === 'ACCURATE' ? '✅' : '❌';
278+
console.log(` ${emoji} ${slug}: ${parsed.status}`);
253279
}
254280

255281
// Calculate statistics
@@ -274,9 +300,7 @@ async function main() {
274300
// Validate output before writing
275301
const validated = v.safeParse(verification_output_schema, output_data);
276302
if (!validated.success) {
277-
throw new Error(
278-
`Output validation failed: ${JSON.stringify(validated.issues, null, 2)}`,
279-
);
303+
throw new Error(`Output validation failed: ${JSON.stringify(validated.issues, null, 2)}`);
280304
}
281305

282306
await writeFile(output_path, JSON.stringify(output_data, null, 2), 'utf-8');
@@ -285,7 +309,9 @@ async function main() {
285309
console.log('\n📊 Verification Summary:');
286310
console.log(` Total sections: ${sections_to_verify.length}`);
287311
console.log(` Verified sections: ${sections.length}`);
288-
console.log(` ✅ Accurate: ${accurate_count} (${((accurate_count / sections.length) * 100).toFixed(1)}%)`);
312+
console.log(
313+
` ✅ Accurate: ${accurate_count} (${((accurate_count / sections.length) * 100).toFixed(1)}%)`,
314+
);
289315
console.log(
290316
` ❌ Not Accurate: ${not_accurate_count} (${((not_accurate_count / sections.length) * 100).toFixed(1)}%)`,
291317
);
@@ -294,9 +320,13 @@ async function main() {
294320
console.log('\n⚠️ Sections with issues:');
295321
verification_results
296322
.filter((r) => r.status === 'NOT_ACCURATE')
323+
.slice(0, 10) // Show first 10
297324
.forEach((r) => {
298325
console.log(` - ${r.slug}: ${r.reasoning}`);
299326
});
327+
if (not_accurate_count > 10) {
328+
console.log(` ... and ${not_accurate_count - 10} more`);
329+
}
300330
}
301331

302332
console.log(`\n✅ Results written to: ${output_path}`);

0 commit comments

Comments
 (0)