Skip to content

Commit 3537e2e

Browse files
bzorn, pelikhan, and Copilot
authored
✨ add groundtruth test output support to promptpex (#180)
* ✨ add groundtruth test output support to promptpex

  Introduce groundtruth test results file loading and parsing support.

* ✏️ fix typo in PromptPexContext groundtruth comment

  Corrected 'Groudtruth' to 'Groundtruth' in the documentation comment.

* Apply suggestions from code review

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Peli de Halleux <pelikhan@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 46d6571 commit 3537e2e

File tree

5 files changed

+24
-7
lines changed

5 files changed

+24
-7
lines changed

package.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@
9595
"promptpex:test-st-def:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"out=evals/test-st-def\" --env .env.ollama",
9696
"promptpex:test-st-mingt": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.3\" --vars \"out=evals/test-st-mingt\"",
9797
"promptpex:test-st-mediumgt": "genaiscript run promptpex \"samples/speech-tag/speech-tag-multi.prompty\" --vars \"effort=medium\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"modelsUnderTest=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-mediumgt\"",
98-
"promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama",
99-
"promptpex:test-st-min-run": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\"",
100-
"promptpex:test-st-min-eval": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\"",
101-
"promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\"",
102-
"promptpex:test-st-min-runeval": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"evalModel=ollama:llama3.3\" --vars \"out=evals/test-st-min-runeval\"",
98+
"promptpex:test-st-min-gen:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama",
99+
"promptpex:test-st-min-run:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\" --env .env.ollama",
100+
"promptpex:test-st-min-eval:ollama": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\" --env .env.ollama",
101+
"promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\" --env .env.ollama",
102+
"promptpex:test-st-min-runeval:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"evalModel=ollama:llama3.3\" --vars \"out=evals/test-st-min-runeval\" --env .env.ollama",
103103
"promptpex:test-st-med": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=medium\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-st-med\"",
104104
"promptpex:test-headline": "genaiscript run promptpex \"samples/demo/rate-headline.prompty\" --vars \"effort=min\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-headline\"",
105105
"promptpex:test1": "genaiscript run promptpex \"samples/demo/rate-headline.prompty\" --vars \"cache=true\" --vars \"evals=false\" --vars \"effort=min\" --vars \"baselineTests=true\" --vars \"filterTestCount=3\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-test1\"",

src/genaisrc/src/loaders.mts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ export async function loadPromptContext(
112112
let rateTests = path.join(dir, "test_collection_review.md")
113113
let testData = path.join(dir, "test_data.json")
114114
let testResults = path.join(dir, "test_results.json")
115+
let groundtruthResults = path.join(dir, "test_groundtruth.json")
115116
let testEvals = path.join(dir, "test_evals.json")
116117
let baselineTestEvals = path.join(dir, "baseline_test_evals.json")
117118
let ruleEvals = path.join(dir, "rule_evals.json")
@@ -193,6 +194,7 @@ export async function loadPromptContext(
193194
prompt: promptFile,
194195
originalPrompt: originalPromptFile,
195196
testOutputs: await workspace.readText(testResults),
197+
groundtruthOutputs: await workspace.readText(groundtruthResults),
196198
intent: await workspace.readText(intent),
197199
inputSpec: await workspace.readText(inputSpec),
198200
rules: tidyRulesFile(await workspace.readText(rules)),
@@ -372,6 +374,7 @@ async function loadPromptContextFromJSON(
372374
ctx.rateTests,
373375
ctx.testData,
374376
ctx.testOutputs,
377+
ctx.groundtruthOutputs,
375378
ctx.testEvals,
376379
ctx.baselineTestEvals,
377380
ctx.ruleEvals,

src/genaisrc/src/parsers.mts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ export function parseTestResults(
139139
const rules = parseRules(files.rules.content)
140140
const res = (parsers.JSON5(files.testOutputs.content) ||
141141
[]) as PromptPexTestResult[]
142+
const groundtruthRes = (parsers.JSON5(files.groundtruthOutputs.content) ||
143+
[]) as PromptPexTestResult[]
142144
res.forEach((r) => {
143145
r.inverse =
144146
r.ruleid !== null && parseInt(r.ruleid as any) > rules.length
@@ -151,6 +153,13 @@ export function parseTestResults(
151153
if (diagnostics)
152154
throw new Error(`missing 'model' for test result ${r.id}`)
153155
}
156+
for (const r of groundtruthRes.filter((r) => !r.error && !r.model)) {
157+
output.warn(
158+
`missing 'model' for groundtruth test result ${r.id} in ${files.groundtruthOutputs.filename}`
159+
)
160+
if (diagnostics)
161+
throw new Error(`missing 'model' for groundtruth test result ${r.id}`)
162+
}
154163
for (const r of res) if (isNaN(r.ruleid)) r.ruleid = null
155164
return res
156165
}

src/genaisrc/src/promptpex.mts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,10 +254,10 @@ export async function promptpexGenerate(files: PromptPexContext) {
254254
renderTestResults(results.filter((r) => r.isGroundtruth)),
255255
{ maxRows: 12 }
256256
)
257-
files.testOutputs.content = JSON.stringify(results, null, 2)
257+
files.groundtruthOutputs.content = JSON.stringify(results, null, 2)
258258
if (files.writeResults)
259259
await workspace.writeText(
260-
files.testOutputs.filename,
260+
files.groundtruthOutputs.filename,
261261
JSON.stringify(results, null, 2)
262262
)
263263
} else {

src/genaisrc/src/types.mts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ export interface PromptPexContext {
284284
*/
285285
testOutputs: WorkspaceFile
286286

287+
/**
288+
* Groundtruth Output (TO) - Test results with metrics for groundtruth test outputs
289+
*/
290+
groundtruthOutputs: WorkspaceFile
291+
287292
/**
288293
* Coverage and validate test evals
289294
*/

0 commit comments

Comments (0)