-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmulti-judge.yaml
More file actions
82 lines (69 loc) · 2.87 KB
/
multi-judge.yaml
File metadata and controls
82 lines (69 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
---
# Multi-judge consensus workflow.
#
# Three runs of the "basic-judge" agent evaluate the same code snippet in
# parallel (correctness, style, security). Their scores are combined with the
# `weighted_average` strategy and the workflow ends in APPROVED or REJECTED.
apiVersion: 100monkeys.ai/v1
kind: Workflow

metadata:
  name: "multi-judge-weighted-average"
  version: "2.0.0"
  description: "Multi-judge consensus using weighted average strategy"

spec:
  # Shared context; templates read it as {{workflow.context.<key>}}.
  context:
    # Code under review, interpolated into every judge's input below.
    test_code: |
      def add(a, b):
          return a + b
      print(add(2, 3))

  initial_state: VALIDATE_WITH_PANEL

  states:
    # Fan out to the judge panel, then route on the consensus result.
    VALIDATE_WITH_PANEL:
      kind: ParallelAgents
      agents:
        # Judge 1: Correctness evaluation (standard weight)
        - agent: "basic-judge"
          input: "Evaluate this code for correctness: {{workflow.context.test_code}}"
          weight: 1.0  # Default weight
          timeout_seconds: 60  # Max 60s for this judge
          poll_interval_ms: 500  # Check status every 500ms

        # Judge 2: Style evaluation (lower weight)
        - agent: "basic-judge"
          input: "Evaluate this code for style: {{workflow.context.test_code}}"
          weight: 0.5  # Style matters less than correctness
          timeout_seconds: 30  # Faster timeout for style check

        # Judge 3: Security evaluation (highest weight)
        - agent: "basic-judge"
          input: "Evaluate this code for security: {{workflow.context.test_code}}"
          weight: 1.5  # Security is critical, weighted higher
          timeout_seconds: 90  # Thorough security check takes longer

      consensus:
        strategy: weighted_average  # Compute weighted average of scores
        threshold: 0.7  # Minimum score to pass
        min_agreement_confidence: 0.6  # At least 60% consensus confidence required
        min_judges_required: 2  # At least 2 of 3 judges must succeed

        # Optional: Customize confidence calculation
        confidence_weighting:
          agreement_factor: 0.7  # 70% weight on inter-judge agreement
          self_confidence_factor: 0.3  # 30% weight on judge self-confidence

      transitions:
        # Pass if consensus score >= 0.7 AND confidence >= 0.6
        - condition: consensus
          threshold: 0.7
          agreement: 0.6  # Note: 'agreement' is alias for 'min_agreement_confidence'
          target: APPROVED

        # Explicit fail transition
        - condition: score_below
          threshold: 0.7
          target: REJECTED

        # Fallback: a score >= 0.7 with agreement confidence < 0.6 matches
        # neither rule above, which would leave this state without an exit.
        # Route every remaining outcome to REJECTED.
        # NOTE(review): assumes transitions are evaluated in listed order and
        # that `always` is a supported condition - confirm against the engine.
        - condition: always
          target: REJECTED

    # Terminal state: panel approved the code.
    APPROVED:
      kind: System
      command: "echo"
      env:
        MESSAGE: "✅ Code approved by consensus"
        FINAL_SCORE: "{{VALIDATE_WITH_PANEL.consensus.score}}"
        CONFIDENCE: "{{VALIDATE_WITH_PANEL.consensus.confidence}}"
        STRATEGY: "{{VALIDATE_WITH_PANEL.consensus.strategy}}"
      transitions: []

    # Terminal state: panel rejected the code (or consensus was inconclusive).
    REJECTED:
      kind: System
      command: "echo"
      env:
        MESSAGE: "❌ Code rejected by consensus"
        FINAL_SCORE: "{{VALIDATE_WITH_PANEL.consensus.score}}"
        CONFIDENCE: "{{VALIDATE_WITH_PANEL.consensus.confidence}}"
        # Exported here as well so both terminal states expose the same fields.
        STRATEGY: "{{VALIDATE_WITH_PANEL.consensus.strategy}}"
      transitions: []