Skip to content

Commit b1b7388

Browse files
committed
feat: add evaluation suite
1 parent 1e9cdf8 commit b1b7388

File tree

6 files changed

+230
-80
lines changed

6 files changed

+230
-80
lines changed

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,10 @@ Input → MultiDetector
6363

6464
**Risk calculation:**
6565

66-
- Start with highest detector score
67-
- Add +0.1 for each additional pattern detected (capped at 1.0)
68-
- Example: 0.9 (role injection) + 0.1 (obfuscation) = 1.0
66+
- Each detector that fires contributes `score × weight` to the total
67+
- Detector weights reflect reliability: semantic detectors (role injection, prompt leak, instruction override) carry full weight 1.0; statistical detectors (entropy, perplexity, token anomaly) are discounted to 0.45–0.55 so none can cross the threshold alone at a borderline score; other pattern detectors (e.g. obfuscation at 0.9) sit in between
68+
- Multiple detectors firing naturally combine: `final = min(Σ score_i × weight_i, 1.0)`
69+
- Example: role injection (0.9 × 1.0) + obfuscation (0.8 × 0.9) = 0.9 + 0.72 = 1.0 (capped)
6970

7071
**Performance:**
7172

benchmarks/eval_llm_test.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package benchmarks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"net/http"
7+
"strings"
8+
"testing"
9+
"time"
10+
11+
"github.com/mdombrov-33/go-promptguard/detector"
12+
)
13+
14+
// TestEvaluationWithLLM runs the same dataset as TestEvaluation but with an
15+
// Ollama LLM judge in LLMFallback mode (LLM only runs when pattern detectors
16+
// score below threshold). This shows how much recall improves with LLM coverage.
17+
//
18+
// Run with: go test -v -run TestEvaluationWithLLM -timeout 10m ./benchmarks/
19+
//
20+
// Skipped automatically if Ollama is not reachable at localhost:11434.
21+
func TestEvaluationWithLLM(t *testing.T) {
22+
if !ollamaReachable() {
23+
t.Skip("Ollama not reachable at localhost:11434 - skipping LLM eval")
24+
}
25+
26+
attacks := loadDataset(t, "testdata/attacks.json")
27+
benign := loadDataset(t, "testdata/benign.json")
28+
all := append(attacks, benign...)
29+
ctx := context.Background()
30+
31+
judge := detector.NewOllamaJudge("llama3.1:8b")
32+
33+
baseOverall, _, _, _ := evaluate(ctx, detector.New(), all)
34+
llmOverall, llmPerCategory, llmFP, llmFN := evaluate(ctx, detector.New(detector.WithLLM(judge, detector.LLMFallback)), all)
35+
36+
t.Logf("\n%s", strings.Repeat("=", 60))
37+
t.Logf("LLM EVAL (Ollama llama3.1:8b, mode=LLMFallback)")
38+
t.Logf("%s", strings.Repeat("=", 60))
39+
40+
t.Logf("\n--- Comparison: pattern-only vs pattern+LLM ---")
41+
t.Logf(" %-20s %-14s %-14s %-10s", "Metric", "Pattern only", "Pattern+LLM", "Delta")
42+
t.Logf(" %s", strings.Repeat("-", 62))
43+
t.Logf(" %-20s %-14s %-14s %+.1f%%", "Recall",
44+
pct(baseOverall.Recall()), pct(llmOverall.Recall()), llmOverall.Recall()-baseOverall.Recall())
45+
t.Logf(" %-20s %-14s %-14s %+.1f%%", "Precision",
46+
pct(baseOverall.Precision()), pct(llmOverall.Precision()), llmOverall.Precision()-baseOverall.Precision())
47+
t.Logf(" %-20s %-14s %-14s %+.1f%%", "F1",
48+
pct(baseOverall.F1()), pct(llmOverall.F1()), llmOverall.F1()-baseOverall.F1())
49+
t.Logf(" %-20s %-14s %-14s %+d", "False Positives",
50+
fmt.Sprintf("%d", baseOverall.FP), fmt.Sprintf("%d", llmOverall.FP), llmOverall.FP-baseOverall.FP)
51+
t.Logf(" %-20s %-14s %-14s %+d", "False Negatives",
52+
fmt.Sprintf("%d", baseOverall.FN), fmt.Sprintf("%d", llmOverall.FN), llmOverall.FN-baseOverall.FN)
53+
54+
t.Logf("\n--- Per-category recall with LLM ---")
55+
for _, cat := range attackCategories {
56+
c := llmPerCategory[cat]
57+
total := c.TP + c.FN
58+
if total == 0 {
59+
continue
60+
}
61+
bar := strings.Repeat("█", c.TP) + strings.Repeat("░", c.FN)
62+
t.Logf(" %-24s %d/%d (%.1f%%) %s", cat+":", c.TP, total, c.Recall(), bar)
63+
}
64+
65+
if len(llmFP) > 0 {
66+
t.Logf("\n--- False Positives (safe inputs wrongly flagged) ---")
67+
for _, er := range llmFP {
68+
t.Logf(" [%s] score=%.2f %q", er.Sample.ID, er.Result.RiskScore, truncate(er.Sample.Input, 70))
69+
}
70+
}
71+
72+
if len(llmFN) > 0 {
73+
t.Logf("\n--- Attacks still missed after LLM ---")
74+
for _, er := range llmFN {
75+
t.Logf(" [%s] cat=%-22s score=%.2f %q", er.Sample.ID, er.Sample.Category, er.Result.RiskScore, truncate(er.Sample.Input, 70))
76+
}
77+
}
78+
79+
t.Logf("\n%s", strings.Repeat("=", 60))
80+
81+
if llmOverall.Recall() <= baseOverall.Recall() {
82+
t.Errorf("LLM fallback did not improve recall: pattern=%.1f%% llm=%.1f%%", baseOverall.Recall(), llmOverall.Recall())
83+
}
84+
}
85+
86+
// ollamaReachable reports whether a local Ollama server answers on its
// default port (11434). The LLM evaluation test uses this to auto-skip
// when no server is running.
func ollamaReachable() bool {
	probe := http.Client{Timeout: 3 * time.Second}
	resp, err := probe.Get("http://localhost:11434/api/tags")
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode == http.StatusOK
}
95+
96+
// pct renders an already-percent-scaled value as a string like "87.5%".
func pct(f float64) string {
	return fmt.Sprintf("%.1f%%", f)
}

benchmarks/eval_test.go

Lines changed: 36 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package benchmarks
33
import (
44
"context"
55
"encoding/json"
6-
"fmt"
76
"os"
87
"sort"
98
"strings"
@@ -26,8 +25,12 @@ type Dataset struct {
2625
Samples []Sample `json:"samples"`
2726
}
2827

29-
// Metrics types
30-
//
28+
// EvalResult pairs a sample with the detector result so we never call Detect twice.
29+
type EvalResult struct {
30+
Sample Sample
31+
Result detector.Result
32+
}
33+
3134
// Confusion matrix counts and derived metrics
3235
type Counts struct {
3336
TP, FP, TN, FN int
@@ -48,8 +51,7 @@ func (c Counts) Recall() float64 {
4851
}
4952

5053
func (c Counts) F1() float64 {
51-
p := c.Precision()
52-
r := c.Recall()
54+
p, r := c.Precision(), c.Recall()
5355
if p+r == 0 {
5456
return 0
5557
}
@@ -64,6 +66,12 @@ func (c Counts) Accuracy() float64 {
6466
return float64(c.TP+c.TN) / float64(total) * 100
6567
}
6668

69+
// Shared category lists used by multiple tests.
70+
var (
71+
attackCategories = []string{"role_injection", "prompt_leak", "instruction_override", "obfuscation", "normalization", "delimiter", "multi_vector"}
72+
benignCategories = []string{"general_question", "coding", "technical", "writing", "creative", "summarization", "explanation", "translation", "edge_case"}
73+
)
74+
6775
// Helpers
6876
func loadDataset(t *testing.T, path string) []Sample {
6977
t.Helper()
@@ -78,16 +86,18 @@ func loadDataset(t *testing.T, path string) []Sample {
7886
return ds.Samples
7987
}
8088

81-
func evaluate(ctx context.Context, guard *detector.MultiDetector, samples []Sample) (Counts, map[string]Counts, []Sample, []Sample) {
89+
// evaluate runs all samples through the guard once and returns counts + per-category
90+
// breakdown + slices of false positives and false negatives with their cached results.
91+
func evaluate(ctx context.Context, guard *detector.MultiDetector, samples []Sample) (Counts, map[string]Counts, []EvalResult, []EvalResult) {
8292
var overall Counts
8393
perCategory := make(map[string]Counts)
84-
var falsePositives []Sample
85-
var falseNegatives []Sample
94+
var falsePositives, falseNegatives []EvalResult
8695

8796
for _, s := range samples {
8897
result := guard.Detect(ctx, s.Input)
8998
isAttack := !result.Safe
9099
shouldBeAttack := s.Label == "attack"
100+
er := EvalResult{Sample: s, Result: result}
91101

92102
c := perCategory[s.Category]
93103
switch {
@@ -97,14 +107,14 @@ func evaluate(ctx context.Context, guard *detector.MultiDetector, samples []Samp
97107
case isAttack && !shouldBeAttack:
98108
overall.FP++
99109
c.FP++
100-
falsePositives = append(falsePositives, s)
110+
falsePositives = append(falsePositives, er)
101111
case !isAttack && !shouldBeAttack:
102112
overall.TN++
103113
c.TN++
104114
case !isAttack && shouldBeAttack:
105115
overall.FN++
106116
c.FN++
107-
falseNegatives = append(falseNegatives, s)
117+
falseNegatives = append(falseNegatives, er)
108118
}
109119
perCategory[s.Category] = c
110120
}
@@ -113,15 +123,14 @@ func evaluate(ctx context.Context, guard *detector.MultiDetector, samples []Samp
113123
}
114124

115125
// Tests
116-
117126
// TestEvaluation is the main evaluation test.
118127
// Run with: go test -v -run TestEvaluation ./benchmarks/
119128
func TestEvaluation(t *testing.T) {
120129
attacks := loadDataset(t, "testdata/attacks.json")
121130
benign := loadDataset(t, "testdata/benign.json")
122131
all := append(attacks, benign...)
123132

124-
guard := detector.New() // default threshold 0.7
133+
guard := detector.New()
125134
ctx := context.Background()
126135

127136
overall, perCategory, falsePositives, falseNegatives := evaluate(ctx, guard, all)
@@ -139,9 +148,7 @@ func TestEvaluation(t *testing.T) {
139148
t.Logf(" False Positives: %d/%d benign flagged (%.1f%%)", overall.FP, len(benign), float64(overall.FP)/float64(len(benign))*100)
140149
t.Logf(" False Negatives: %d/%d attacks missed (%.1f%%)", overall.FN, len(attacks), float64(overall.FN)/float64(len(attacks))*100)
141150

142-
// Per-category for attack samples
143151
t.Logf("\n--- Per-category recall (attacks) ---")
144-
attackCategories := []string{"role_injection", "prompt_leak", "instruction_override", "obfuscation", "normalization", "delimiter", "multi_vector"}
145152
for _, cat := range attackCategories {
146153
c := perCategory[cat]
147154
total := c.TP + c.FN
@@ -152,49 +159,41 @@ func TestEvaluation(t *testing.T) {
152159
t.Logf(" %-24s %d/%d (%.1f%%) %s", cat+":", c.TP, total, c.Recall(), bar)
153160
}
154161

155-
// Per-category for benign samples
156162
t.Logf("\n--- Per-category false positive rate (benign) ---")
157-
benignCategories := []string{"general_question", "coding", "technical", "writing", "creative", "summarization", "explanation", "translation", "edge_case"}
158163
for _, cat := range benignCategories {
159164
c := perCategory[cat]
160165
total := c.TN + c.FP
161166
if total == 0 {
162167
continue
163168
}
164-
fpRate := float64(c.FP) / float64(total) * 100
165-
t.Logf(" %-24s %d FP / %d total (%.1f%% FP rate)", cat+":", c.FP, total, fpRate)
169+
t.Logf(" %-24s %d FP / %d total (%.1f%% FP rate)", cat+":", c.FP, total, float64(c.FP)/float64(total)*100)
166170
}
167171

168-
// Confusion matrix
169172
t.Logf("\n--- Confusion Matrix ---")
170173
t.Logf(" %26s ATTACK SAFE", "Predicted →")
171174
t.Logf(" Actual ATTACK %5d %5d", overall.TP, overall.FN)
172175
t.Logf(" Actual SAFE %5d %5d", overall.FP, overall.TN)
173176

174-
// False positives detail
175177
if len(falsePositives) > 0 {
176178
t.Logf("\n--- False Positives (safe inputs wrongly flagged) ---")
177-
for _, s := range falsePositives {
178-
result := guard.Detect(ctx, s.Input)
179-
t.Logf(" [%s] score=%.2f %q", s.ID, result.RiskScore, truncate(s.Input, 70))
180-
t.Logf(" note: %s", s.Notes)
179+
for _, er := range falsePositives {
180+
t.Logf(" [%s] score=%.2f %q", er.Sample.ID, er.Result.RiskScore, truncate(er.Sample.Input, 70))
181+
t.Logf(" note: %s", er.Sample.Notes)
181182
}
182183
}
183184

184-
// False negatives detail
185185
if len(falseNegatives) > 0 {
186186
t.Logf("\n--- False Negatives (attacks missed) ---")
187-
for _, s := range falseNegatives {
188-
result := guard.Detect(ctx, s.Input)
189-
t.Logf(" [%s] cat=%-22s score=%.2f %q", s.ID, s.Category, result.RiskScore, truncate(s.Input, 70))
187+
for _, er := range falseNegatives {
188+
t.Logf(" [%s] cat=%-22s score=%.2f %q", er.Sample.ID, er.Sample.Category, er.Result.RiskScore, truncate(er.Sample.Input, 70))
190189
}
191190
}
192191

193192
t.Logf("\n%s", strings.Repeat("=", 60))
194193

195-
// Sanity checks — these fail the test if something is very wrong
196-
if overall.Recall() < 70.0 {
197-
t.Errorf("Recall %.1f%% is too low — more than 30%% of attacks are being missed", overall.Recall())
194+
// Regression guard: recall below 40% means something is catastrophically broken.
195+
if overall.Recall() < 40.0 {
196+
t.Errorf("Recall %.1f%% is critically low — likely a regression", overall.Recall())
198197
}
199198
if overall.FP > len(benign)/3 {
200199
t.Errorf("False positive rate too high: %d/%d benign inputs wrongly flagged", overall.FP, len(benign))
@@ -217,8 +216,7 @@ func TestThresholdSweep(t *testing.T) {
217216
t.Logf(" %-10s %-10s %-10s %-10s %-5s %-5s", "Threshold", "Precision", "Recall", "F1", "FP", "FN")
218217
t.Logf(" %s", strings.Repeat("-", 58))
219218

220-
bestF1 := 0.0
221-
bestThreshold := 0.0
219+
bestF1, bestThreshold := 0.0, 0.0
222220

223221
for _, threshold := range thresholds {
224222
guard := detector.New(detector.WithThreshold(threshold))
@@ -236,13 +234,8 @@ func TestThresholdSweep(t *testing.T) {
236234
}
237235

238236
t.Logf(" %-10.1f %-10.1f %-10.1f %-10.1f %-5d %-5d%s",
239-
threshold,
240-
overall.Precision(),
241-
overall.Recall(),
242-
f1,
243-
overall.FP,
244-
overall.FN,
245-
marker,
237+
threshold, overall.Precision(), overall.Recall(), f1,
238+
overall.FP, overall.FN, marker,
246239
)
247240
}
248241

@@ -257,12 +250,8 @@ func TestPerCategoryPrecision(t *testing.T) {
257250
benign := loadDataset(t, "testdata/benign.json")
258251
all := append(attacks, benign...)
259252

260-
guard := detector.New()
261-
ctx := context.Background()
262-
263-
_, perCategory, _, _ := evaluate(ctx, guard, all)
253+
_, perCategory, _, _ := evaluate(context.Background(), detector.New(), all)
264254

265-
// Collect categories that appear in attacks
266255
seen := map[string]bool{}
267256
for _, s := range attacks {
268257
seen[s.Category] = true
@@ -281,30 +270,20 @@ func TestPerCategoryPrecision(t *testing.T) {
281270

282271
for _, cat := range categories {
283272
c := perCategory[cat]
284-
total := c.TP + c.FN
285-
if total == 0 {
273+
if c.TP+c.FN == 0 {
286274
continue
287275
}
288276
t.Logf(" %-24s %5.1f%% %5.1f%% %5.1f%% %4d %4d",
289-
cat,
290-
c.Recall(),
291-
c.Precision(),
292-
c.F1(),
293-
c.TP,
294-
c.FN,
277+
cat, c.Recall(), c.Precision(), c.F1(), c.TP, c.FN,
295278
)
296279
}
297280

298281
t.Logf("%s", strings.Repeat("=", 60))
299282
}
300283

301-
// Utilities
302284
func truncate(s string, n int) string {
303285
if len(s) <= n {
304286
return s
305287
}
306288
return s[:n] + "..."
307289
}
308-
309-
// Prevent unused import error for fmt if all t.Logf are used
310-
var _ = fmt.Sprintf

detector/multi_detector.go

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,7 @@ func New(opts ...Option) *MultiDetector {
6464
}
6565

6666
// Detect runs all enabled detectors and combines their results.
67-
// Risk scoring algorithm:
68-
// - Takes the highest individual risk score from any detector
69-
// - Adds a 0.1 bonus for each additional pattern detected (capped at 1.0)
70-
// - Confidence represents certainty of classification:
71-
// - When detectors find patterns: max confidence + 0.05 bonus if multiple detectors agree
72-
// - When no patterns found: high confidence (~0.85-0.90) it's safe
73-
//
67+
// Risk score is computed by computeWeightedScore (see scoring.go).
7468
// The input is considered unsafe if the final risk score >= threshold.
7569
func (md *MultiDetector) Detect(ctx context.Context, input string) Result {
7670
if md.config.MaxInputLength > 0 && len(input) > md.config.MaxInputLength {
@@ -117,13 +111,7 @@ func (md *MultiDetector) Detect(ctx context.Context, input string) Result {
117111
}
118112
}
119113

120-
// Calculate final risk score using our algorithm:
121-
// final_score = max(individual_scores) + 0.1 × (num_additional_patterns - 1)
122-
finalScore := maxScore
123-
if len(allPatterns) > 1 {
124-
bonus := 0.1 * float64(len(allPatterns)-1)
125-
finalScore = min(finalScore+bonus, 1.0)
126-
}
114+
finalScore := computeWeightedScore(allPatterns)
127115

128116
finalConfidence := 0.0
129117
if detectorsTriggered > 0 {
@@ -175,11 +163,7 @@ func (md *MultiDetector) Detect(ctx context.Context, input string) Result {
175163
maxScore = llmResult.RiskScore
176164
}
177165

178-
finalScore = maxScore
179-
if len(allPatterns) > 1 {
180-
bonus := 0.1 * float64(len(allPatterns)-1)
181-
finalScore = min(finalScore+bonus, 1.0)
182-
}
166+
finalScore = computeWeightedScore(allPatterns)
183167

184168
// Recalculate confidence including LLM result
185169
if llmResult.RiskScore > 0 {

0 commit comments

Comments
 (0)