Skip to content

Commit 9ab58a2

Browse files
bzorn and Copilot authored
✨ Label tests with unique IDs and propagate testuid (#175)
* ✨ Label tests with unique IDs and propagate testuid
  Added unique testuid to each test and test result; updated logic to use it.
* ✨ add testuid to test run output and update indexing logic
  Test run data now includes testuid; testuid index starts from 0.
* ✨: Unleash Unique IDs in PromptPex Tests with nanoid
  Integrated nanoid for generating unique, consistent test UIDs.
* ✨ Fix testuid template and ensure strict equality in search
  Corrected testuid generation format and used strict equality for lookup.
* Update src/genaisrc/src/testevalmetric.mts
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent e1f4af3 commit 9ab58a2

File tree

7 files changed

+74
-2
lines changed

7 files changed

+74
-2
lines changed

.env.ollama

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
GENAISCRIPT_MODEL_LARGE="ollama:llama3.3"
2+
GENAISCRIPT_MODEL_SMALL="ollama:qwen2.5:3b"
3+
GENAISCRIPT_MODEL_TINY="ollama:llama3.2:1b"
4+
GENAISCRIPT_MODEL_VISION="azure:gpt-4o_2024-11-20"
5+
GENAISCRIPT_MODEL_VISION_SMALL="azure:gpt-4o-mini_2024-11-20"
6+
GENAISCRIPT_MODEL_REASONING="azure:o1_2024-12-17"
7+
GENAISCRIPT_MODEL_REASONING_SMALL="azure:o3-mini_2025-01-31"
8+
GENAISCRIPT_MODEL_IMAGE="azure:dall-e-3_30"
9+
GENAISCRIPT_MODEL_TRANSCRIPTION="azure:whisper_001"
10+
GENAISCRIPT_MODEL_EMBEDDINGS="azure:text-embedding-ada-002_2"
11+
GENAISCRIPT_MODEL_EVAL1="azure:gpt-4o_2024-11-20"
12+
GENAISCRIPT_MODEL_EVAL="ollama:llama3.3"
13+
GENAISCRIPT_MODEL_RULES="ollama:llama3.3"
14+
GENAISCRIPT_MODEL_RULES1="ollama:llama3.3"
15+
GENAISCRIPT_MODEL_BASELINE="ollama:llama3.3"
16+
17+
# use this command to login daily
18+
# az login --scope api://trapi/.default

package-lock.json

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"description": "PromptPex is a test generator for prompts, that allows evaluating and comparing AI prompts across different models and configurations.",
77
"dependencies": {
88
"genaiscript": "^1.142.15",
9+
"nanoid": "^5.1.5",
910
"openai": "^5.5.1"
1011
},
1112
"devDependencies": {
@@ -91,9 +92,10 @@
9192
"promptpex:paper-m": "genaiscript run paper \"samples/speech-tag/speech-tag-multi.prompty\" \"samples/speech-tag/speech-tag.prompty\" \"samples/text-to-p/text-to-p.prompty\" \"samples/openai-examples/elements.prompty\" \"samples/big-prompt-lib/art-prompt.prompty\" \"samples/prompt-guide/extract-names.prompty\" \"samples/text-classification/classify-input-text.prompty\" \"samples/big-prompt-lib/sentence-rewrite.prompty\" \"samples/azure-ai-studio/shakespearean-writing-assistant.prompty\" --vars baselineTests=false --vars \"evals=true\" --vars \"modelsUnderTest=azure:gpt-4o-mini_2024-07-18;ollama:gemma2:9b;ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"out=evals/paper-m\"",
9293
"promptpex:paper-tplus": "genaiscript run paper \"samples/speech-tag/speech-tag.prompty\" \"samples/text-to-p/text-to-p.prompty\" \"samples/openai-examples/elements.prompty\" \"samples/big-prompt-lib/art-prompt.prompty\" \"samples/prompt-guide/extract-names.prompty\" \"samples/text-classification/classify-input-text.prompty\" \"samples/big-prompt-lib/sentence-rewrite.prompty\" \"samples/azure-ai-studio/shakespearean-writing-assistant.prompty\" --vars \"splitRules=true\" --vars \"maxRulesPerTestGeneration=5\" --vars \"testGenerations=1\" --vars \"evals=true\" --vars \"modelsUnderTest=azure:gpt-4o-mini_2024-07-18;ollama:gemma2:9b;ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"out=evals/paper-tplus\"",
9394
"promptpex:test-st-min": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=ollama:llama3.3\" --vars \"evalModel=ollama:llama3.3;ollama:qwen2.5:3b\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=true\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"out=evals/test-st-min\"",
95+
"promptpex:test-st-def:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"out=evals/test-st-def\" --env .env.ollama",
9496
"promptpex:test-st-mingt": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.3\" --vars \"out=evals/test-st-mingt\"",
9597
"promptpex:test-st-mediumgt": "genaiscript run promptpex \"samples/speech-tag/speech-tag-multi.prompty\" --vars \"effort=medium\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"modelsUnderTest=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-mediumgt\"",
96-
"promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\"",
98+
"promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\" --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama",
9799
"promptpex:test-st-min-run": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\"",
98100
"promptpex:test-st-min-eval": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\"",
99101
"promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\"",

src/genaisrc/src/promptpex.mts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import { evalTestCollection } from "./testcollectioneval.mts"
2828
import { githubModelsEvalsGenerate } from "./githubmodels.mts"
2929
import { parse, resolve } from "node:path"
3030
import { saveContextState } from "./loaders.mts"
31+
import { nanoid } from 'nanoid'
3132

3233
const { output } = env
3334
const dbg = host.logger("promptpex")
@@ -186,6 +187,16 @@ export async function promptpexGenerate(files: PromptPexContext) {
186187
await checkConfirm("expansion")
187188
}
188189

190+
// label tests with unique IDs
191+
output.heading(3, "Label Tests with Unique IDs")
192+
if (files.promptPexTests?.length) {
193+
for (const [index, test] of files.promptPexTests.entries()) {
194+
if (!test.testuid) {
195+
files.promptPexTests[index].testuid = `test-${nanoid(8)}`
196+
}
197+
}
198+
}
199+
189200
// After test expansion, before evals
190201
if (rateTests) {
191202
output.heading(3, "Test Set Quality Review")

src/genaisrc/src/testevalmetric.mts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,22 @@ async function evaluateTestMetric(
9191
outcome: "unknown",
9292
content: "test result output missing",
9393
} satisfies PromptPexEvaluation
94-
const test = files.promptPexTests.find(t => t.) // TODO
94+
if (testResult.testuid === undefined) {
95+
dbg(`testuid is undefined in testResult`)
96+
return {
97+
outcome: "unknown",
98+
content: "testuid is undefined in testResult",
99+
} satisfies PromptPexEvaluation
100+
}
101+
const test = files.promptPexTests.find(t => t.testuid === testResult.testuid)
102+
if (!test) {
103+
dbg(`test not found for testuid %s`, testResult.testuid)
104+
return {
105+
outcome: "unknown",
106+
content: `Test not found for testuid ${testResult.testuid}`,
107+
} satisfies PromptPexEvaluation
108+
}
109+
// if (testResult.model !== evalModel) {
95110
const parameters = {
96111
prompt: content.replace(/^(system|user):/gm, ""),
97112
intent: files.intent.content || "",

src/genaisrc/src/testrun.mts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ async function runTest(
261261
id,
262262
promptid,
263263
...rule,
264+
testuid: test.testuid,
264265
scenario: test.scenario,
265266
baseline: test.baseline,
266267
testinput: testInput,
@@ -307,6 +308,7 @@ async function runTest(
307308
id,
308309
promptid,
309310
...rule,
311+
testuid: test.testuid,
310312
scenario: test.scenario,
311313
baseline: test.baseline,
312314
testinput: testInput,

src/genaisrc/src/types.mts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,10 @@ export interface PromptPexTest {
346346
* Index of the generated test for the given rule. undefined for baseline tests
347347
*/
348348
testid?: number
349+
/**
350+
* Unique identifier for the test
351+
*/
352+
testuid?: string
349353
/**
350354
* Generated by the baseline prompt
351355
*/
@@ -404,6 +408,7 @@ export interface PromptPexTestResult {
404408
input: string
405409
output: string
406410
error?: string
411+
testuid?: string
407412
isGroundtruth?: boolean
408413

409414
compliance?: PromptPexEvalResultType

0 commit comments

Comments
 (0)