Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,11 @@
"promptpex:test-st-def:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"out=evals/test-st-def\" --env .env.ollama",
"promptpex:test-st-mingt": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.3\" --vars \"out=evals/test-st-mingt\"",
"promptpex:test-st-mediumgt": "genaiscript run promptpex \"samples/speech-tag/speech-tag-multi.prompty\" --vars \"effort=medium\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"modelsUnderTest=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-mediumgt\"",
"promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama",
"promptpex:test-st-min-run": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\"",
"promptpex:test-st-min-eval": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\"",
"promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\"",
"promptpex:test-st-min-runeval": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"evalModel=ollama:llama3.3\" --vars \"out=evals/test-st-min-runeval\"",
"promptpex:test-st-min-gen:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama",
"promptpex:test-st-min-run:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\" --env .env.ollama",
Copy link

Copilot AI Jun 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It appears that the --env argument is included inside the --vars quoted string. Adjust the script so that --env .env.ollama is placed outside the --vars value.

Suggested change
"promptpex:test-st-min-run:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run --env .env.ollama\"",
"promptpex:test-st-min-run:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\" --env .env.ollama",

Copilot uses AI. Check for mistakes.
"promptpex:test-st-min-eval:ollama": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\" --env .env.ollama",
"promptpex:test-st-min-eval1:ollama": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\" --env .env.ollama",
"promptpex:test-st-min-runeval:ollama": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"evalModel=ollama:llama3.3\" --vars \"out=evals/test-st-min-runeval\" --env .env.ollama",
"promptpex:test-st-med": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=medium\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-st-med\"",
"promptpex:test-headline": "genaiscript run promptpex \"samples/demo/rate-headline.prompty\" --vars \"effort=min\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-headline\"",
"promptpex:test1": "genaiscript run promptpex \"samples/demo/rate-headline.prompty\" --vars \"cache=true\" --vars \"evals=false\" --vars \"effort=min\" --vars \"baselineTests=true\" --vars \"filterTestCount=3\" --vars \"modelsUnderTest=ollama:llama3.2:1b\" --vars \"out=evals/test-test1\"",
Expand Down
3 changes: 3 additions & 0 deletions src/genaisrc/src/loaders.mts
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ export async function loadPromptContext(
let rateTests = path.join(dir, "test_collection_review.md")
let testData = path.join(dir, "test_data.json")
let testResults = path.join(dir, "test_results.json")
let groundtruthResults = path.join(dir, "test_groundtruth.json")
let testEvals = path.join(dir, "test_evals.json")
let baselineTestEvals = path.join(dir, "baseline_test_evals.json")
let ruleEvals = path.join(dir, "rule_evals.json")
Expand Down Expand Up @@ -193,6 +194,7 @@ export async function loadPromptContext(
prompt: promptFile,
originalPrompt: originalPromptFile,
testOutputs: await workspace.readText(testResults),
groundtruthOutputs: await workspace.readText(groundtruthResults),
intent: await workspace.readText(intent),
inputSpec: await workspace.readText(inputSpec),
rules: tidyRulesFile(await workspace.readText(rules)),
Expand Down Expand Up @@ -372,6 +374,7 @@ async function loadPromptContextFromJSON(
ctx.rateTests,
ctx.testData,
ctx.testOutputs,
ctx.groundtruthOutputs,
ctx.testEvals,
ctx.baselineTestEvals,
ctx.ruleEvals,
Expand Down
9 changes: 9 additions & 0 deletions src/genaisrc/src/parsers.mts
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ export function parseTestResults(
const rules = parseRules(files.rules.content)
const res = (parsers.JSON5(files.testOutputs.content) ||
[]) as PromptPexTestResult[]
const groundtruthRes = (parsers.JSON5(files.groundtruthOutputs.content) ||
[]) as PromptPexTestResult[]
res.forEach((r) => {
r.inverse =
r.ruleid !== null && parseInt(r.ruleid as any) > rules.length
Expand All @@ -151,6 +153,13 @@ export function parseTestResults(
if (diagnostics)
throw new Error(`missing 'model' for test result ${r.id}`)
}
for (const r of groundtruthRes.filter((r) => !r.error && !r.model)) {
output.warn(
`missing 'model' for groundtruth test result ${r.id} in ${files.groundtruthOutputs.filename}`
)
if (diagnostics)
throw new Error(`missing 'model' for groundtruth test result ${r.id}`)
}
for (const r of res) if (isNaN(r.ruleid)) r.ruleid = null
return res
}
Expand Down
4 changes: 2 additions & 2 deletions src/genaisrc/src/promptpex.mts
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,10 @@ export async function promptpexGenerate(files: PromptPexContext) {
renderTestResults(results.filter((r) => r.isGroundtruth)),
{ maxRows: 12 }
)
files.testOutputs.content = JSON.stringify(results, null, 2)
files.groundtruthOutputs.content = JSON.stringify(results, null, 2)
if (files.writeResults)
await workspace.writeText(
files.testOutputs.filename,
files.groundtruthOutputs.filename,
JSON.stringify(results, null, 2)
)
} else {
Expand Down
5 changes: 5 additions & 0 deletions src/genaisrc/src/types.mts
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ export interface PromptPexContext {
*/
testOutputs: WorkspaceFile

/**
* Groundtruth Output (TO) - Test results with metrics for groundtruth test outputs
*/
groundtruthOutputs: WorkspaceFile

/**
* Coverage and validate test evals
*/
Expand Down