# Source: aegis-examples/agents/workflows/multi-judge.yaml
# (100monkeys-ai/aegis-examples on GitHub, branch main)
# Manifest envelope: API group/version and the resource kind of this document.
apiVersion: 100monkeys.ai/v1
kind: Workflow

# Identifying metadata for this workflow.
metadata:
  name: 'multi-judge-weighted-average'
  # Kept quoted so the version is an explicit string.
  version: '2.0.0'
  description: 'Multi-judge consensus using weighted average strategy'

spec:
  # Name of the state the workflow starts in.
  initial_state: VALIDATE_WITH_PANEL

  # Values shared with the states; the judge prompts reference
  # workflow.context.test_code.
  context:
    # Python sample handed to the judges. Literal block scalar: line breaks
    # are preserved and a single trailing newline is kept.
    test_code: |
      def add(a, b):
          return a + b

      print(add(2, 3))

  states:
    # Runs three judges in parallel over the same test code and reduces their
    # scores to one consensus result, which drives the transitions below.
    VALIDATE_WITH_PANEL:
      kind: ParallelAgents
      agents:
        # Judge 1: Correctness evaluation (standard weight)
        - agent: "basic-judge"
          input: "Evaluate this code for correctness: {{workflow.context.test_code}}"
          weight: 1.0  # Default weight
          timeout_seconds: 60  # Max 60s for this judge
          poll_interval_ms: 500  # Check status every 500ms

        # Judge 2: Style evaluation (lower weight)
        - agent: "basic-judge"
          input: "Evaluate this code for style: {{workflow.context.test_code}}"
          weight: 0.5  # Style matters less than correctness
          timeout_seconds: 30  # Faster timeout for style check

        # Judge 3: Security evaluation (highest weight)
        - agent: "basic-judge"
          input: "Evaluate this code for security: {{workflow.context.test_code}}"
          weight: 1.5  # Security is critical, weighted higher
          timeout_seconds: 90  # Thorough security check takes longer

      consensus:
        strategy: weighted_average  # Compute weighted average of scores
        threshold: 0.7  # Minimum score to pass
        min_agreement_confidence: 0.6  # At least 60% consensus confidence required
        min_judges_required: 2  # At least 2 of 3 judges must succeed

        # Optional: Customize confidence calculation
        confidence_weighting:
          agreement_factor: 0.7  # 70% weight on inter-judge agreement
          self_confidence_factor: 0.3  # 30% weight on judge self-confidence

      transitions:
        # Pass if consensus score >= 0.7 AND confidence >= 0.6
        - condition: consensus
          threshold: 0.7
          agreement: 0.6  # Note: 'agreement' is alias for 'min_agreement_confidence'
          target: APPROVED

        # Explicit fail transition
        - condition: score_below
          threshold: 0.7
          target: REJECTED

        # Fallback: a score >= 0.7 with confidence < 0.6 satisfies neither
        # rule above, which would leave this state with no matching
        # transition. Reject in that case.
        # NOTE(review): assumes rules are tried in listed order and that
        # `always` is the unconditional condition - confirm against the
        # workflow schema.
        - condition: always
          target: REJECTED

    # Target of the `consensus` transition in VALIDATE_WITH_PANEL.
    APPROVED:
      kind: System
      # NOTE(review): `echo` is given no arguments here; confirm how the
      # runner surfaces the env values below.
      command: 'echo'
      # Templated from the VALIDATE_WITH_PANEL consensus result.
      env:
        MESSAGE: '✅ Code approved by consensus'
        FINAL_SCORE: '{{VALIDATE_WITH_PANEL.consensus.score}}'
        CONFIDENCE: '{{VALIDATE_WITH_PANEL.consensus.confidence}}'
        STRATEGY: '{{VALIDATE_WITH_PANEL.consensus.strategy}}'
      # No outgoing transitions are defined for this state.
      transitions: []

    # Target of the failing transition(s) in VALIDATE_WITH_PANEL.
    REJECTED:
      kind: System
      command: "echo"
      # Templated from the VALIDATE_WITH_PANEL consensus result. STRATEGY is
      # included so this state reports the same fields as APPROVED.
      env:
        MESSAGE: "❌ Code rejected by consensus"
        FINAL_SCORE: "{{VALIDATE_WITH_PANEL.consensus.score}}"
        CONFIDENCE: "{{VALIDATE_WITH_PANEL.consensus.confidence}}"
        STRATEGY: "{{VALIDATE_WITH_PANEL.consensus.strategy}}"
      # No outgoing transitions are defined for this state.
      transitions: []