diff --git a/package.json b/package.json index 9db7c2905..92c16d58b 100644 --- a/package.json +++ b/package.json @@ -95,11 +95,11 @@ "promptpex:test-st-def:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"out=evals/test-st-def\" --env .env.ollama", "promptpex:test-st-mingt": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.3\" --vars \"out=evals/test-st-mingt\"", "promptpex:test-st-mediumgt": "genaiscript run promptpex \"samples/speech-tag/speech-tag-multi.prompty\" --vars \"effort=medium\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"modelsUnderTest=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-mediumgt\"", - "promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama", - "promptpex:test-st-min-run": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\"", - "promptpex:test-st-min-eval": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars 
\"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\"", - "promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\"", - "promptpex:test-st-min-runeval": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"evalModel=ollama:llama3.3\" --vars \"out=evals/test-st-min-runeval\"", + "promptpex:test-st-min-gen:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama", + "promptpex:test-st-min-run:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\" --env .env.ollama", + "promptpex:test-st-min-eval:ollama": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\" --env .env.ollama", + "promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" 
--vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\" --env .env.ollama", + "promptpex:test-st-min-runeval:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"evalModel=ollama:llama3.3\" --vars \"out=evals/test-st-min-runeval\" --env .env.ollama", "promptpex:test-st-med": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=medium\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-st-med\"", "promptpex:test-headline": "genaiscript run promptpex \"samples/demo/rate-headline.prompty\" --vars \"effort=min\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-headline\"", "promptpex:test1": "genaiscript run promptpex \"samples/demo/rate-headline.prompty\" --vars \"cache=true\" --vars \"evals=false\" --vars \"effort=min\" --vars \"baselineTests=true\" --vars \"filterTestCount=3\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-test1\"", diff --git a/src/genaisrc/src/loaders.mts b/src/genaisrc/src/loaders.mts index ee9b7d037..dff011263 100644 --- a/src/genaisrc/src/loaders.mts +++ b/src/genaisrc/src/loaders.mts @@ -112,6 +112,7 @@ export async function loadPromptContext( let rateTests = path.join(dir, "test_collection_review.md") let testData = path.join(dir, "test_data.json") let testResults = path.join(dir, "test_results.json") + let groundtruthResults = path.join(dir, "test_groundtruth.json") let testEvals = path.join(dir, "test_evals.json") let baselineTestEvals = path.join(dir, "baseline_test_evals.json") let ruleEvals = path.join(dir, "rule_evals.json") @@ -193,6 +194,7 @@ 
export async function loadPromptContext( prompt: promptFile, originalPrompt: originalPromptFile, testOutputs: await workspace.readText(testResults), + groundtruthOutputs: await workspace.readText(groundtruthResults), intent: await workspace.readText(intent), inputSpec: await workspace.readText(inputSpec), rules: tidyRulesFile(await workspace.readText(rules)), @@ -372,6 +374,7 @@ async function loadPromptContextFromJSON( ctx.rateTests, ctx.testData, ctx.testOutputs, + ctx.groundtruthOutputs, ctx.testEvals, ctx.baselineTestEvals, ctx.ruleEvals, diff --git a/src/genaisrc/src/parsers.mts b/src/genaisrc/src/parsers.mts index 6853212cd..590b02140 100644 --- a/src/genaisrc/src/parsers.mts +++ b/src/genaisrc/src/parsers.mts @@ -139,6 +139,8 @@ export function parseTestResults( const rules = parseRules(files.rules.content) const res = (parsers.JSON5(files.testOutputs.content) || []) as PromptPexTestResult[] + const groundtruthRes = (parsers.JSON5(files.groundtruthOutputs.content) || + []) as PromptPexTestResult[] res.forEach((r) => { r.inverse = r.ruleid !== null && parseInt(r.ruleid as any) > rules.length @@ -151,6 +153,13 @@ export function parseTestResults( if (diagnostics) throw new Error(`missing 'model' for test result ${r.id}`) } + for (const r of groundtruthRes.filter((r) => !r.error && !r.model)) { + output.warn( + `missing 'model' for groundtruth test result ${r.id} in ${files.groundtruthOutputs.filename}` + ) + if (diagnostics) + throw new Error(`missing 'model' for groundtruth test result ${r.id}`) + } for (const r of res) if (isNaN(r.ruleid)) r.ruleid = null return res } diff --git a/src/genaisrc/src/promptpex.mts b/src/genaisrc/src/promptpex.mts index 5a5a66aff..3e6d8c910 100644 --- a/src/genaisrc/src/promptpex.mts +++ b/src/genaisrc/src/promptpex.mts @@ -254,10 +254,10 @@ export async function promptpexGenerate(files: PromptPexContext) { renderTestResults(results.filter((r) => r.isGroundtruth)), { maxRows: 12 } ) - files.testOutputs.content = 
JSON.stringify(results, null, 2) + files.groundtruthOutputs.content = JSON.stringify(results, null, 2) if (files.writeResults) await workspace.writeText( - files.testOutputs.filename, + files.groundtruthOutputs.filename, JSON.stringify(results, null, 2) ) } else { diff --git a/src/genaisrc/src/types.mts b/src/genaisrc/src/types.mts index 9a6441b3a..9ed3bcb3a 100644 --- a/src/genaisrc/src/types.mts +++ b/src/genaisrc/src/types.mts @@ -284,6 +284,11 @@ export interface PromptPexContext { */ testOutputs: WorkspaceFile + /** + * Groundtruth Output (TO) - Test results with metrics for groundtruth test outputs + */ + groundtruthOutputs: WorkspaceFile + /** * Coverage and validate test evals */