wip

khromov · khromov · commit b17f1e75ce4b · 2025-10-13T00:15:31.000+02:00
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -57,15 +57,18 @@ Generate condensed versions of the documentation to reduce context size:
 Verify the accuracy of distilled summaries against original documentation:
 
 - `pnpm verify-distilled` - Verify all distilled summaries for accuracy
-- `pnpm verify-distilled:dry-run` - Preview what would be verified without making API calls
-- `pnpm verify-distilled:debug` - Verify only 2 sections for debugging
-
-The verification script:
-1. Loads `distilled.json` containing summaries and original content
-2. Uses the Anthropic Batch API to send each summary and original content to Claude
-3. Claude evaluates whether the summary is accurate or contains errors/omissions
-4. Generates `distilled-verification.json` with results (ACCURATE/NOT_ACCURATE) and reasoning
-5. Outputs statistics about accuracy rates
+- `pnpm show-verification-errors` - Display all sections that failed verification
+
+The verification workflow:
+1. Run `pnpm verify-distilled` to verify all distilled summaries
+   - Loads `distilled.json` containing summaries and original content
+   - Uses the Anthropic Batch API to send each summary and original content to Claude
+   - Claude evaluates whether the summary is accurate or contains errors/omissions
+   - Generates `distilled-verification.json` with results (ACCURATE/NOT_ACCURATE) and reasoning
+   - Outputs statistics about accuracy rates
+2. Run `pnpm show-verification-errors` to see a detailed list of all sections marked NOT_ACCURATE
+   - Displays each problematic section with its reasoning
+   - Shows summary statistics
 
 **Note:** All documentation generation and verification commands require `ANTHROPIC_API_KEY` to be set in `packages/mcp-server/.env`
 
diff --git a/packages/mcp-server/package.json b/packages/mcp-server/package.json
@@ -17,8 +17,7 @@
 		"generate-distilled:dry-run": "node --import node-resolve-ts/register scripts/generate-summaries.ts --prompt-type distilled --dry-run",
 		"generate-distilled:debug": "DEBUG_MODE=1 node --import node-resolve-ts/register scripts/generate-summaries.ts --prompt-type distilled",
 		"verify-distilled": "node --import node-resolve-ts/register scripts/verify-distilled.ts",
-		"verify-distilled:dry-run": "node --import node-resolve-ts/register scripts/verify-distilled.ts --dry-run",
-		"verify-distilled:debug": "DEBUG_MODE=1 node --import node-resolve-ts/register scripts/verify-distilled.ts"
+		"show-verification-errors": "node --import node-resolve-ts/register scripts/show-verification-errors.ts"
 	},
 	"exports": {
 		".": "./src/index.ts",
diff --git a/packages/mcp-server/scripts/show-verification-errors.ts b/packages/mcp-server/scripts/show-verification-errors.ts
@@ -0,0 +1,109 @@
+#!/usr/bin/env node
+import { readFile } from 'node:fs/promises';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import * as v from 'valibot';
+
+// Directory containing this script; ES modules have no __dirname, so derive it from import.meta.url.
+const current_dirname = path.dirname(fileURLToPath(import.meta.url));
+
+type VerificationResult = {
+	slug: string; // identifies the verified section
+	status: 'ACCURATE' | 'NOT_ACCURATE'; // verdict for this section
+	reasoning: string; // explanation that accompanies the verdict
+};
+
+type VerificationOutput = {
+	generated_at: string; // timestamp string; passed to `new Date(...)` when displayed
+	model: string;
+	total_sections: number;
+	verified_sections: number; // denominator for the accuracy percentages
+	accurate_count: number;
+	not_accurate_count: number;
+	results: VerificationResult[]; // one entry per verified section
+};
+
+// Runtime schemas for distilled-verification.json: one `results` entry, then the whole file.
+const verification_result_schema = v.object({
+	slug: v.string(),
+	status: v.union([v.literal('ACCURATE'), v.literal('NOT_ACCURATE')]),
+	reasoning: v.string(),
+});
+const verification_output_schema = v.object({
+	generated_at: v.string(),
+	model: v.string(),
+	total_sections: v.number(),
+	verified_sections: v.number(),
+	accurate_count: v.number(),
+	not_accurate_count: v.number(),
+	results: v.array(verification_result_schema),
+});
+
+// Print the stored summary and every NOT_ACCURATE entry from ../src/distilled-verification.json.
+// Exits with code 1 when the file cannot be read, is not valid JSON, or fails schema validation.
+async function main() {
+	const verification_path = path.join(current_dirname, '../src/distilled-verification.json');
+	console.log('📂 Reading verification results...\n');
+	let content: string;
+	try {
+		content = await readFile(verification_path, 'utf-8');
+	} catch (error) {
+		// Surface the real cause: the read can fail for reasons other than a missing file.
+		console.error('❌ Error: Could not read distilled-verification.json');
+		console.error(`Reason: ${error instanceof Error ? error.message : String(error)}`);
+		console.error('Please run `pnpm verify-distilled` first to generate the file.');
+		process.exit(1);
+	}
+
+	let data: unknown;
+	try {
+		data = JSON.parse(content);
+	} catch (error) {
+		console.error('❌ Error: distilled-verification.json is not valid JSON');
+		console.error(error instanceof Error ? error.message : String(error));
+		process.exit(1);
+	}
+
+	const validated = v.safeParse(verification_output_schema, data);
+	if (!validated.success) {
+		console.error('❌ Error: Invalid verification file format');
+		console.error(JSON.stringify(validated.issues, null, 2));
+		process.exit(1);
+	}
+
+	const verification_data = validated.output;
+	const { accurate_count, not_accurate_count, verified_sections: verified } = verification_data;
+	const not_accurate = verification_data.results.filter((r) => r.status === 'NOT_ACCURATE');
+	// Guard the division: with 0 verified sections the percentages would print as "NaN%".
+	const percent = (n: number) => (verified > 0 ? ((n / verified) * 100).toFixed(1) : '0.0');
+
+	console.log('📊 Verification Results Summary');
+	console.log('═'.repeat(80));
+	console.log(`Generated: ${new Date(verification_data.generated_at).toLocaleString()}`);
+	console.log(`Model: ${verification_data.model}`);
+	console.log(`Total Sections: ${verification_data.total_sections}`);
+	console.log(`Verified: ${verified}`);
+	console.log(`✅ Accurate: ${accurate_count} (${percent(accurate_count)}%)`);
+	console.log(`❌ Not Accurate: ${not_accurate_count} (${percent(not_accurate_count)}%)`);
+	console.log('═'.repeat(80));
+
+	if (not_accurate.length === 0) {
+		console.log('\n🎉 All sections are accurate! No issues found.');
+		return;
+	}
+
+	console.log(`\n❌ NOT ACCURATE SECTIONS (${not_accurate.length}):\n`);
+	for (const [index, result] of not_accurate.entries()) {
+		console.log(`${index + 1}. ${result.slug}`);
+		console.log(`   Reasoning: ${result.reasoning}`);
+		console.log('');
+	}
+
+	console.log('═'.repeat(80));
+	console.log(`\nFound ${not_accurate.length} section(s) that need review or regeneration.`);
+}
+
+main().catch((failure: unknown) => {
+	console.error('❌ Fatal error:', failure); // anything main() rejects with is logged as-is
+	process.exit(1);
+});
diff --git a/packages/mcp-server/scripts/verify-distilled.ts b/packages/mcp-server/scripts/verify-distilled.ts
@@ -3,17 +3,11 @@ import 'dotenv/config';
 import { writeFile, mkdir } from 'node:fs/promises';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { Command } from 'commander';
 import { AnthropicProvider } from '../src/lib/anthropic.ts';
 import type { AnthropicBatchRequest } from '../src/lib/schemas.ts';
 import distilled_data from '../src/distilled.json' with { type: 'json' };
 import * as v from 'valibot';
 
-interface CliOptions {
-	dryRun: boolean;
-	debug: boolean;
-}
-
 interface VerificationResult {
 	slug: string;
 	status: 'ACCURATE' | 'NOT_ACCURATE';
@@ -49,17 +43,6 @@ const verification_output_schema = v.object({
 	),
 });
 
-const program = new Command();
-
-program
-	.name('verify-distilled')
-	.description(
-		'Verify the accuracy of distilled summaries by comparing them to original documentation',
-	)
-	.version('1.0.0')
-	.option('-d, --dry-run', 'Show what would be verified without making API calls', false)
-	.option('--debug', 'Debug mode: process only 2 sections', false);
-
 const VERIFICATION_PROMPT = `You are tasked with verifying the accuracy of a distilled/condensed version of documentation against the original content.
 
 Your task:
@@ -115,44 +98,16 @@ function parse_verification_response(text: string): {
 }
 
 async function main() {
-	program.parse();
-	const options = program.opts<CliOptions>();
-
-	const debug = options.debug || process.env.DEBUG_MODE === '1';
-
 	console.log('🔍 Starting distilled verification...\n');
 
-	if (options.dryRun) {
-		console.log('🔍 DRY RUN MODE - No API calls will be made\n');
-	}
-	if (debug) {
-		console.log('🐛 DEBUG MODE - Will process only 2 sections\n');
-	}
-
 	const output_path = path.join(current_dirname, '../src/distilled-verification.json');
 
 	// Load distilled data
 	console.log('📂 Loading distilled.json...');
 	const { summaries, content } = distilled_data;
 
-	const sections_to_verify = Object.keys(summaries);
-	console.log(`Found ${sections_to_verify.length} sections to verify`);
-
-	// Debug mode: limit to 2 sections
-	let sections = sections_to_verify;
-	if (debug) {
-		console.log('\n🐛 Processing only 2 sections for debugging');
-		sections = sections_to_verify.slice(0, 2);
-	}
-
-	console.log(`\n📋 Will verify ${sections.length} sections`);
-
-	// Dry run mode: exit before API calls
-	if (options.dryRun) {
-		console.log('\n🔍 DRY RUN complete - no changes were made');
-		console.log(`Would have verified ${sections.length} sections`);
-		return;
-	}
+	const sections = Object.keys(summaries);
+	console.log(`Found ${sections.length} sections to verify\n`);
 
 	// Check for API key
 	const api_key = process.env.ANTHROPIC_API_KEY;
@@ -164,7 +119,7 @@ async function main() {
 	}
 
 	// Initialize Anthropic API
-	console.log('\n🤖 Initializing Anthropic API...');
+	console.log('🤖 Initializing Anthropic API...');
 	const anthropic = new AnthropicProvider('claude-sonnet-4-5-20250929', api_key);
 
 	// Prepare batch requests
@@ -177,7 +132,7 @@ async function main() {
 			custom_id: `verify-${index}`,
 			params: {
 				model: anthropic.get_model_identifier(),
-				max_tokens: 4096, // Increased to allow full responses
+				max_tokens: 4096,
 				messages: [
 					{
 						role: 'user',
@@ -290,7 +245,7 @@ async function main() {
 	const output_data: VerificationOutput = {
 		generated_at: new Date().toISOString(),
 		model: 'claude-sonnet-4-5-20250929',
-		total_sections: sections_to_verify.length,
+		total_sections: sections.length,
 		verified_sections: sections.length,
 		accurate_count,
 		not_accurate_count,
@@ -307,7 +262,7 @@ async function main() {
 
 	// Print summary
 	console.log('\n📊 Verification Summary:');
-	console.log(`  Total sections: ${sections_to_verify.length}`);
+	console.log(`  Total sections: ${sections.length}`);
 	console.log(`  Verified sections: ${sections.length}`);
 	console.log(
 		`  ✅ Accurate: ${accurate_count} (${((accurate_count / sections.length) * 100).toFixed(1)}%)`,
@@ -317,16 +272,17 @@ async function main() {
 	);
 
 	if (not_accurate_count > 0) {
-		console.log('\n⚠️  Sections with issues:');
+		console.log('\n⚠️  Sections with issues (first 10):');
 		verification_results
 			.filter((r) => r.status === 'NOT_ACCURATE')
-			.slice(0, 10) // Show first 10
+			.slice(0, 10)
 			.forEach((r) => {
 				console.log(`  - ${r.slug}: ${r.reasoning}`);
 			});
 		if (not_accurate_count > 10) {
 			console.log(`  ... and ${not_accurate_count - 10} more`);
 		}
+		console.log('\n💡 Run `pnpm show-verification-errors` to see all issues');
 	}
 
 	console.log(`\n✅ Results written to: ${output_path}`);