microsoft · bzorn · Jun 20, 2025 · Jun 20, 2025 · Jun 20, 2025 · Jun 20, 2025
diff --git a/.env.ollama b/.env.ollama
@@ -0,0 +1,18 @@
+GENAISCRIPT_MODEL_LARGE="ollama:llama3.3"
+GENAISCRIPT_MODEL_SMALL="ollama:qwen2.5:3b"
+GENAISCRIPT_MODEL_TINY="ollama:llama3.2:1b"
+GENAISCRIPT_MODEL_VISION="azure:gpt-4o_2024-11-20"
+GENAISCRIPT_MODEL_VISION_SMALL="azure:gpt-4o-mini_2024-11-20"
+GENAISCRIPT_MODEL_REASONING="azure:o1_2024-12-17"
+GENAISCRIPT_MODEL_REASONING_SMALL="azure:o3-mini_2025-01-31"
+GENAISCRIPT_MODEL_IMAGE="azure:dall-e-3_30"
+GENAISCRIPT_MODEL_TRANSCRIPTION="azure:whisper_001"
+GENAISCRIPT_MODEL_EMBEDDINGS="azure:text-embedding-ada-002_2"
+GENAISCRIPT_MODEL_EVAL1="azure:gpt-4o_2024-11-20"
+GENAISCRIPT_MODEL_EVAL="ollama:llama3.3"
+GENAISCRIPT_MODEL_RULES="ollama:llama3.3"
+GENAISCRIPT_MODEL_RULES1="ollama:llama3.3"
+GENAISCRIPT_MODEL_BASELINE="ollama:llama3.3"
+
+# Use this command to log in daily:
+# az login --scope api://trapi/.default
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -6,6 +6,7 @@
     "description": "PromptPex is a test generator for prompts, that allows evaluating and comparing AI prompts across different models and configurations.",
     "dependencies": {
         "genaiscript": "^1.142.15",
+        "nanoid": "^5.1.5",
         "openai": "^5.5.1"
     },
     "devDependencies": {
@@ -91,9 +92,10 @@
         "promptpex:paper-m": "genaiscript run paper \"samples/speech-tag/speech-tag-multi.prompty\" \"samples/speech-tag/speech-tag.prompty\" \"samples/text-to-p/text-to-p.prompty\" \"samples/openai-examples/elements.prompty\" \"samples/big-prompt-lib/art-prompt.prompty\" \"samples/prompt-guide/extract-names.prompty\" \"samples/text-classification/classify-input-text.prompty\" \"samples/big-prompt-lib/sentence-rewrite.prompty\" \"samples/azure-ai-studio/shakespearean-writing-assistant.prompty\" --vars baselineTests=false --vars \"evals=true\" --vars \"modelsUnderTest=azure:gpt-4o-mini_2024-07-18;ollama:gemma2:9b;ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"out=evals/paper-m\"",
         "promptpex:paper-tplus": "genaiscript run paper \"samples/speech-tag/speech-tag.prompty\" \"samples/text-to-p/text-to-p.prompty\" \"samples/openai-examples/elements.prompty\" \"samples/big-prompt-lib/art-prompt.prompty\" \"samples/prompt-guide/extract-names.prompty\" \"samples/text-classification/classify-input-text.prompty\" \"samples/big-prompt-lib/sentence-rewrite.prompty\" \"samples/azure-ai-studio/shakespearean-writing-assistant.prompty\"  --vars \"splitRules=true\" --vars \"maxRulesPerTestGeneration=5\" --vars \"testGenerations=1\" --vars \"evals=true\" --vars \"modelsUnderTest=azure:gpt-4o-mini_2024-07-18;ollama:gemma2:9b;ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"out=evals/paper-tplus\"",
         "promptpex:test-st-min": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\"  --vars \"effort=min\" --vars \"groundtruthModel=ollama:llama3.3\" --vars \"evalModel=ollama:llama3.3;ollama:qwen2.5:3b\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=true\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b\" --vars \"out=evals/test-st-min\"",
+        "promptpex:test-st-def:ollama": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\"  --vars \"effort=min\"  --vars \"out=evals/test-st-def\" --env .env.ollama",
         "promptpex:test-st-mingt": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\"  --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:llama3.3\" --vars \"out=evals/test-st-mingt\"",
         "promptpex:test-st-mediumgt": "genaiscript run promptpex \"samples/speech-tag/speech-tag-multi.prompty\"  --vars \"effort=medium\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evalModel=ollama:llama3.3\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"modelsUnderTest=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-mediumgt\"",
-        "promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\"  --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\"",
+        "promptpex:test-st-min-gen": "genaiscript run promptpex \"samples/speech-tag/speech-tag.prompty\"  --vars \"effort=min\" --vars \"groundtruthModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"evalModelGroundtruth=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-gen\" --env .env.ollama",
         "promptpex:test-st-min-run": "genaiscript run promptpex \"evals/test-st-min-gen/speech-tag/promptpex_context.json\" --vars \"evals=false\" --vars \"compliance=false\" --vars \"baselineTests=false\" --vars \"modelsUnderTest=ollama:qwen2.5:3b;ollama:llama3.2:1b;ollama:llama3.3\" --vars \"out=evals/test-st-min-run\"",
         "promptpex:test-st-min-eval": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14\" --vars \"out=evals/test-st-min-eval\"",
         "promptpex:test-st-min-eval1": "genaiscript run promptpex \"evals/test-st-min-run/speech-tag/promptpex_context.json\" --vars \"evals=true\" --vars \"compliance=true\" --vars \"baselineTests=false\" --vars \"evalModel=azure:gpt-4.1-mini_2025-04-14;ollama:llama3.3\" --vars \"out=evals/test-st-min-eval\"",

diff --git a/src/genaisrc/src/promptpex.mts b/src/genaisrc/src/promptpex.mts
@@ -28,6 +28,7 @@ import { evalTestCollection } from "./testcollectioneval.mts"
 import { githubModelsEvalsGenerate } from "./githubmodels.mts"
 import { parse, resolve } from "node:path"
 import { saveContextState } from "./loaders.mts"
+import { nanoid } from 'nanoid'
 
 const { output } = env
 const dbg = host.logger("promptpex")
@@ -186,6 +187,16 @@ export async function promptpexGenerate(files: PromptPexContext) {
         await checkConfirm("expansion")
     }
 
+    // Give every test that lacks a testuid a unique ID of the form "test-<8-char nanoid>"; existing IDs are kept
+    output.heading(3, "Label Tests with Unique IDs")
+    if (files.promptPexTests?.length) {
+        for (const [index, test] of files.promptPexTests.entries()) {
+            if (!test.testuid) {
+                files.promptPexTests[index].testuid = `test-${nanoid(8)}`
+            }
+        }
+    }
+
     // After test expansion, before evals
     if (rateTests) {
         output.heading(3, "Test Set Quality Review")

diff --git a/src/genaisrc/src/testevalmetric.mts b/src/genaisrc/src/testevalmetric.mts
@@ -91,7 +91,22 @@ async function evaluateTestMetric(
             outcome: "unknown",
             content: "test result output missing",
         } satisfies PromptPexEvaluation
-    const test = files.promptPexTests.find(t => t.) // TODO
+    if (testResult.testuid === undefined) {
+        dbg(`testuid is undefined in testResult`)
+        return {
+            outcome: "unknown",
+            content: "testuid is undefined in testResult",
+        } satisfies PromptPexEvaluation
+    }
+    const test = files.promptPexTests.find(t => t.testuid === testResult.testuid)
+    if (!test) {
+        dbg(`test not found for testuid %s`, testResult.testuid)
+        return {
+            outcome: "unknown",
+            content: `Test not found for testuid ${testResult.testuid}`,
+        } satisfies PromptPexEvaluation
+    }
+    // NOTE(review): unfinished, disabled check left in place; complete or remove it: if (testResult.model !== evalModel) {
     const parameters = {
         prompt: content.replace(/^(system|user):/gm, ""),
         intent: files.intent.content || "",

diff --git a/src/genaisrc/src/testrun.mts b/src/genaisrc/src/testrun.mts
@@ -261,6 +261,7 @@ async function runTest(
             id,
             promptid,
             ...rule,
+            testuid: test.testuid,
             scenario: test.scenario,
             baseline: test.baseline,
             testinput: testInput,
@@ -307,6 +308,7 @@ async function runTest(
         id,
         promptid,
         ...rule,
+        testuid: test.testuid,
         scenario: test.scenario,
         baseline: test.baseline,
         testinput: testInput,

diff --git a/src/genaisrc/src/types.mts b/src/genaisrc/src/types.mts
@@ -346,6 +346,10 @@ export interface PromptPexTest {
      * Index of the generated test for the given rule. undefined for baseline tests
      */
     testid?: number
+    /**
+     * Unique identifier for the test; copied onto its test results so they can be matched back to this test
+     */
+    testuid?: string
     /**
      * Generated by the baseline prompt
      */
@@ -404,6 +408,7 @@ export interface PromptPexTestResult {
     input: string
     output: string
     error?: string
+    testuid?: string
     isGroundtruth?: boolean
 
     compliance?: PromptPexEvalResultType