/**
 * File: example-prompts.test.ts
 * Purpose: Validate the WAVE7-K-EXAMPLE-PROMPTS-LIB catalogue
 * shape, BU-branding compliance, and per-variant rubric
 * score discrimination.
 */

import { describe, it, expect } from 'vitest'
import {
  EXAMPLE_PROMPTS,
  SCENARIO_LABELS,
  VARIANT_LABELS,
  findExamplePrompt,
  summarizeExamplePrompts,
} from '../example-prompts'
import { analyzePrompt } from '../rubric'

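// The prompt record shape these tests exercise — a local sketch
// inferred from the assertions below, NOT the canonical type exported
// by '../example-prompts'. Fields beyond the six asserted on here may
// exist and are deliberately omitted.
type ExamplePromptShape = {
  id: string
  title: string
  text: string
  target: string // expected to be one of FICTIONAL_TARGETS (EX-005)
  scenario: string // key into SCENARIO_LABELS (EX-006)
  variant: 'secure' | 'insecure' | 'edge-case'
}
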
const FICTIONAL_TARGETS: ReadonlySet<string> = new Set([
  'DojoLM', 'BonkLM', 'Basileak', 'PantheonLM', 'Marfaak',
])

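// Mean rubric score over a set of prompts. Defined locally — the
// library does not export an averaging helper — and shared by
// EX-008, EX-010 and EX-011 below.
const avgScore = (prompts: ReadonlyArray<Pick<ExamplePromptShape, 'text'>>): number =>
  prompts.reduce((acc, p) => acc + analyzePrompt(p.text).overallScore, 0) /
  prompts.length
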
describe('example-prompts library (WAVE7-K-EXAMPLE-PROMPTS-LIB / ADR-0056)', () => {
  it('EX-001 ships at least 60 prompts', () => {
    expect(EXAMPLE_PROMPTS.length).toBeGreaterThanOrEqual(60)
  })

  it('EX-002 every scenario has at least 8 prompts (4 secure + 2 insecure + 2 edge)', () => {
    const summary = summarizeExamplePrompts()
    for (const [scenarioId, count] of Object.entries(summary.byScenario)) {
      expect(count, `${scenarioId} prompt count`).toBeGreaterThanOrEqual(8)
    }
  })

  it('EX-003 every variant is well-represented (secure ≥ 20, insecure ≥ 10, edge-case ≥ 10)', () => {
    const summary = summarizeExamplePrompts()
    expect(summary.byVariant.secure).toBeGreaterThanOrEqual(20)
    expect(summary.byVariant.insecure).toBeGreaterThanOrEqual(10)
    expect(summary.byVariant['edge-case']).toBeGreaterThanOrEqual(10)
  })

  it('EX-004 prompt ids are unique', () => {
    const ids = EXAMPLE_PROMPTS.map((p) => p.id)
    expect(new Set(ids).size).toBe(ids.length)
  })

  it('EX-005 every prompt names a fictional BU LLM target', () => {
    for (const p of EXAMPLE_PROMPTS) {
      expect(FICTIONAL_TARGETS.has(p.target)).toBe(true)
      // Branding sanity: the target name must appear in the prompt
      // text or title so the example is recognisable to operators.
      expect(`${p.text} ${p.title}`).toContain(p.target)
    }
  })

  it('EX-006 SCENARIO_LABELS + VARIANT_LABELS cover every category referenced', () => {
    const scenarios = new Set(EXAMPLE_PROMPTS.map((p) => p.scenario))
    const variants = new Set(EXAMPLE_PROMPTS.map((p) => p.variant))
    for (const s of scenarios) expect(SCENARIO_LABELS[s]).toBeDefined()
    for (const v of variants) expect(VARIANT_LABELS[v]).toBeDefined()
  })

  it('EX-007 findExamplePrompt resolves an existing id and returns undefined for unknown', () => {
    const sample = EXAMPLE_PROMPTS[0]
    expect(findExamplePrompt(sample.id)).toEqual(sample)
    expect(findExamplePrompt('does-not-exist')).toBeUndefined()
  })

  it('EX-008 secure prompts score at least 10 points higher on average than insecure prompts in their scenario', () => {
    const summary = summarizeExamplePrompts()
    const scenarios = Object.keys(summary.byScenario) as Array<keyof typeof summary.byScenario>
    for (const scenario of scenarios) {
      const secure = EXAMPLE_PROMPTS.filter((p) => p.scenario === scenario && p.variant === 'secure')
      const insecure = EXAMPLE_PROMPTS.filter((p) => p.scenario === scenario && p.variant === 'insecure')
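      // EX-002 guarantees every scenario ships both secure and insecure
      // prompts, so neither group here is empty.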
      const avgSecure = avgScore(secure)
      const avgInsecure = avgScore(insecure)
      expect(avgSecure, `${scenario}: secure avg should beat insecure avg by more than 10`).toBeGreaterThan(avgInsecure + 10)
    }
  })

  it('EX-009 every prompt analyses cleanly through the rubric (no exceptions)', () => {
    for (const p of EXAMPLE_PROMPTS) {
      const result = analyzePrompt(p.text)
      // ADR-0057 expanded categories 6 → 14.
      expect(result.categories).toHaveLength(14)
      expect(result.overallScore).toBeGreaterThanOrEqual(0)
      expect(result.overallScore).toBeLessThanOrEqual(100)
      expect(['A', 'A-', 'B+', 'B', 'C', 'D', 'F']).toContain(result.grade)
    }
  })

  it('EX-010 secure prompts hit at least 45 on average across the 14-category rubric', () => {
    const secure = EXAMPLE_PROMPTS.filter((p) => p.variant === 'secure')
    const avg = avgScore(secure)
    // ADR-0057 added 8 new categories. The Wave 7.3 prompt library
    // pre-dates those categories, so most secure prompts now miss the
    // tool-use / RAG / cost / PII / memory / multi-modal / agentic /
    // alignment signals. The 45 floor still captures the meaningful
    // "well above insecure" line; tightening this threshold rides
    // with the future prompt-library expansion (Wave 7B fixtures).
    expect(avg).toBeGreaterThanOrEqual(45)
  })

  it('EX-011 insecure prompts stay below 35 on average (room for hardening)', () => {
    const insecure = EXAMPLE_PROMPTS.filter((p) => p.variant === 'insecure')
    const avg = avgScore(insecure)
    expect(avg).toBeLessThan(35) // post-K-CATEGORIES-MAX ceiling
  })
})
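
// To run just this suite (assuming a standard vitest setup):
//   npx vitest run example-prompts.test.ts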