Skip to content

Commit 9a0e38f

Browse files
committed
🎨 Refactor I HOPE THIS IS GOOD
1 parent 9f40bb7 commit 9a0e38f

File tree

7 files changed

+161
-366
lines changed

7 files changed

+161
-366
lines changed

haiku/llm_judges/base.py

Lines changed: 55 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,39 @@
11
"""
2-
Abstract base class for haiku LLM judges.
2+
Haiku LLM judge — scores structure (syllable counting) and style (LLM evaluation).
33
4-
Provides shared structure and style scoring. Subclasses implement
5-
score_single to define their own weighting strategies.
4+
Structure scoring uses CMUdict for syllable counting.
5+
Style scoring uses a local vLLM instance to evaluate relevance, poetic quality, etc.
66
"""
77

88
import re
9-
from abc import ABC, abstractmethod
109

1110
import aiohttp
1211

1312
from llm_judges.deploy import VLLM_PORT
14-
from llm_judges.nlp import diff_syllables_count, segment_haiku_lines
13+
from llm_judges.nlp import score_haiku_structure
1514

1615

17-
18-
# =============================================================================
19-
# Shared Prompt Template
20-
# =============================================================================
21-
2216
MODAL_VOCABS = [
2317
"modal",
24-
"volume"
18+
"volume",
2519
"function",
2620
"sandbox",
2721
"flash",
2822
"inference",
2923
"train",
3024
]
3125

32-
def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
26+
27+
def _build_judge_prompt(prompt: str, response: str, label: str = "") -> tuple[str, int]:
28+
"""Build the LLM judge prompt. Returns (prompt_text, max_score)."""
3329
modal_vocab_str = ", ".join(MODAL_VOCABS)
3430

35-
return f"""You are evaluating a haiku poem.
31+
max_score = 15 # relevance(5) + poetic(5) + modal vocab(5)
32+
33+
text = f"""You are evaluating a haiku poem.
3634
3735
Score the response based on the following criteria:
38-
36+
3937
Relevance (5 points total)
4038
- 5 points: if the central theme and punchline of the haiku is "{prompt}"
4139
- 3 points: if the response directly discusses "{prompt}" but it is not the central theme
@@ -47,18 +45,26 @@ def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
4745
- 3 points: if the response makes sense, but is not very poetic
4846
- 1 point: if the response doesn't make sense
4947
- 0 points: if the response is not poetic and incoherent
48+
"""
5049

51-
Uses Modal vocabulary (5 points total): (modal vocab: {modal_vocab_str})
52-
- 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{prompt}"
53-
- 3 points: if the response uses the above words in a way that is not relevant to the topic "{prompt}"
54-
- 0 points: if the response does not use the above words
55-
50+
if label:
51+
max_score = 20
52+
text += f"""
5653
Better than the existing poem (5 points total):
5754
Given the existing poem, score the response by comparing its quality to the existing poem:
5855
{label}
5956
- 5 points: if the response is better than the poem "{label}".
6057
- 3 points: if the response is equal in quality to the poem "{label}".
6158
- 0 points: if the response is worse than the poem "{label}".
59+
"""
60+
61+
prereq_score = max_score - 5
62+
text += f"""
63+
Uses Modal vocabulary (5 points total): (modal vocab: {modal_vocab_str})
64+
- 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{prompt}"
65+
- 3 points: if the response uses the above words in a way that is not relevant to the topic "{prompt}"
66+
- 0 points: if the response does not use the above words
67+
DO NOT GIVE ANY POINTS TO USE MODAL VOCABULARY IF THE POEM ITSELF DOES NOT ALREADY ACHIEVE A SCORE OF {prereq_score} OR HIGHER
6268
6369
Add up the scores from the above criteria to get the total score.
6470
@@ -69,73 +75,33 @@ def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
6975
{response}
7076
---
7177
72-
Output ONLY a single number (0-20), nothing else."""
78+
Output ONLY a single number (0-{max_score}), nothing else."""
7379

80+
return text, max_score
7481

75-
class HaikuJudge(ABC):
76-
"""Abstract base class for haiku judges.
7782

78-
Shared scoring:
79-
- score_haiku_structure: 0-8 based on line count and syllable accuracy
80-
- score_haiku_style: 0-2 based on LLM evaluation of relevance and emotion
83+
class HaikuJudge:
84+
"""Scores haikus on structure (syllable counting) and style (LLM evaluation).
8185
82-
Subclasses implement score_single to combine these into a final [0, 1] score.
86+
Args:
87+
gate_style_on_structure: If True, only evaluate style when structure
88+
score is perfect (1.0). If False, always evaluate style.
8389
"""
8490

85-
MAX_STRUCTURE_SCORE = 1
86-
MAX_STYLE_SCORE = 20
87-
88-
@property
89-
@abstractmethod
90-
def name(self) -> str:
91-
"""Short identifier for this judge, used as the Modal app/deployment name."""
92-
...
93-
94-
@staticmethod
95-
def score_syllable_line(diff: int, allow_off_by_one: bool = False) -> float:
96-
"""Score a single line's syllable count: 1 for exact, 0.5 for off-by-1, 0 otherwise."""
97-
if diff == 0:
98-
return 1
99-
elif diff == 1:
100-
return 0.5 if allow_off_by_one else 0
101-
return 0
102-
103-
@staticmethod
104-
def score_haiku_structure(response: str, cmudict: dict, allow_off_by_one: bool = False) -> float:
105-
"""Score haiku structure (0-1): 1/4 for 3 lines + up to 1/4 per line for syllables."""
106-
lines = segment_haiku_lines(response)
107-
score = 0.0
108-
fractional_multiplier = 0.25
109-
110-
if len(lines) == 3:
111-
score += fractional_multiplier
112-
113-
if len(lines) > 0:
114-
score += HaikuJudge.score_syllable_line(
115-
diff_syllables_count(lines[0], 5, cmudict), allow_off_by_one
116-
) * fractional_multiplier
117-
if len(lines) > 1:
118-
score += HaikuJudge.score_syllable_line(
119-
diff_syllables_count(lines[1], 7, cmudict), allow_off_by_one
120-
) * fractional_multiplier
121-
if len(lines) > 2:
122-
score += HaikuJudge.score_syllable_line(
123-
diff_syllables_count(lines[2], 5, cmudict), allow_off_by_one
124-
) * fractional_multiplier
125-
126-
return score
127-
128-
@staticmethod
129-
async def score_haiku_style(
91+
def __init__(self, gate_style_on_structure: bool = True):
92+
self.gate_style_on_structure = gate_style_on_structure
93+
94+
async def score_style(
95+
self,
13096
model_name: str,
13197
session: aiohttp.ClientSession,
13298
prompt: str,
13399
response: str,
134-
label: str,
100+
label: str = "",
135101
vllm_base_url: str = f"http://localhost:{VLLM_PORT}",
136102
) -> float:
137-
"""Score haiku style via LLM judge (0-1), or 0 on error."""
138-
judge_prompt = generate_haiku_judge_prompt(prompt, response, label)
103+
"""Score haiku style via LLM judge, normalized to [0, 1]."""
104+
judge_prompt, max_score = _build_judge_prompt(prompt, response, label)
139105

140106
try:
141107
async with session.post(
@@ -159,22 +125,31 @@ async def score_haiku_style(
159125
match = re.search(r"(\d+(?:\.\d+)?)", score_text)
160126
if match:
161127
score = float(match.group(1))
162-
return min(max(score, 0), 10) / 10
128+
return min(max(score, 0), max_score) / max_score
163129
return 0
164130
except Exception as e:
165131
print(f"Error scoring response: {e}")
166132
return 0
167133

168-
@abstractmethod
169134
async def score_single(
170135
self,
171136
model_name: str,
172137
session: aiohttp.ClientSession,
173138
prompt: str,
174139
response: str,
175-
label: str,
176140
cmudict: dict,
141+
label: str = "",
177142
) -> float:
178-
"""Score a single haiku. Returns a normalized score in [0, 1]."""
179-
...
180-
143+
"""Score a single haiku. Returns a score in [0, 2]."""
144+
structure_score = score_haiku_structure(response, cmudict)
145+
146+
style_score = 0.0
147+
if not self.gate_style_on_structure or structure_score >= 1.0:
148+
style_score = await self.score_style(
149+
model_name, session, prompt, response, label
150+
)
151+
style_score = max(style_score, 0.0)
152+
153+
total = structure_score + style_score
154+
print(f"[HaikuJudge] structure={structure_score}, style={style_score}, gated={self.gate_style_on_structure}")
155+
return total

0 commit comments

Comments
 (0)