11"""
2- Abstract base class for haiku LLM judges.
2+ Haiku LLM judge — scores structure (syllable counting) and style (LLM evaluation).
33
4- Provides shared structure and style scoring. Subclasses implement
5- score_single to define their own weighting strategies.
4+ Structure scoring uses CMUdict for syllable counting.
5+ Style scoring uses a local vLLM instance to evaluate relevance, poetic quality, etc.
66"""
77
88import re
9- from abc import ABC , abstractmethod
109
1110import aiohttp
1211
1312from llm_judges .deploy import VLLM_PORT
14- from llm_judges .nlp import diff_syllables_count , segment_haiku_lines
13+ from llm_judges .nlp import score_haiku_structure
1514
1615
17-
18- # =============================================================================
19- # Shared Prompt Template
20- # =============================================================================
21-
2216MODAL_VOCABS = [
2317 "modal" ,
24- "volume"
18+ "volume" ,
2519 "function" ,
2620 "sandbox" ,
2721 "flash" ,
2822 "inference" ,
2923 "train" ,
3024]
3125
32- def generate_haiku_judge_prompt (prompt : str , response : str , label : str ) -> str :
26+
27+ def _build_judge_prompt (prompt : str , response : str , label : str = "" ) -> tuple [str , int ]:
28+ """Build the LLM judge prompt. Returns (prompt_text, max_score)."""
3329 modal_vocab_str = ", " .join (MODAL_VOCABS )
3430
35- return f"""You are evaluating a haiku poem.
31+ max_score = 15 # relevance(5) + poetic(5) + modal vocab(5)
32+
33+ text = f"""You are evaluating a haiku poem.
3634
3735 Score the response based on the following criteria:
38-
36+
3937 Relevance (5 points total)
4038 - 5 points: if the central theme and punchline of the haiku is "{ prompt } "
4139 - 3 points: if the response directly discusses "{ prompt } " but it is not the central theme
@@ -47,18 +45,26 @@ def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
4745 - 3 point: if the response makes sense, but is not very poetic
4846 - 1 point: if the response doesn't make sense
4947 - 0 points: if the response is not poetic and incoherent
48+ """
5049
51- Uses Modal vocabulary (5 points total): (modal vocab: { modal_vocab_str } )
52- - 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{ prompt } "
53- - 3 points: if the response uses the above words in a way that is not relevant to the topic "{ prompt } "
54- - 0 points: if the response does not use the above words
55-
50+ if label :
51+ max_score = 20
52+ text += f"""
5653 Better than the existing poem (5 points total):
5754 Given the existing poem, score the response by comparing its quality to the existing poem:
5855 { label }
5956 - 5 points: if the response is better than the poem "{ label } ".
6057 - 3 points: if the response is equal in quality to the poem "{ label } ".
6158 - 0 points: if the response is worse than the poem "{ label } ".
59+ """
60+
61+ prereq_score = max_score - 5
62+ text += f"""
63+ Uses Modal vocabulary (5 points total): (modal vocab: { modal_vocab_str } )
64+ - 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{ prompt } "
65+ - 3 points: if the response uses the above words in a way that is not relevant to the topic "{ prompt } "
66+ - 0 points: if the response does not use the above words
67+ DO NOT GIVE ANY POINTS TO USE MODAL VOCABULARY IF THE POEM ITSELF DOES NOT ALREADY ACHIEVE A SCORE OF { prereq_score } OR HIGHER
6268
6369 Add up the scores from the above criteria to get the total score.
6470
@@ -69,73 +75,33 @@ def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
6975 { response }
7076 ---
7177
72- Output ONLY a single number (0-20 ), nothing else."""
78+ Output ONLY a single number (0-{ max_score } ), nothing else."""
7379
80+ return text , max_score
7481
75- class HaikuJudge (ABC ):
76- """Abstract base class for haiku judges.
7782
78- Shared scoring:
79- - score_haiku_structure: 0-8 based on line count and syllable accuracy
80- - score_haiku_style: 0-2 based on LLM evaluation of relevance and emotion
83+ class HaikuJudge :
84+ """Scores haikus on structure (syllable counting) and style (LLM evaluation).
8185
82- Subclasses implement score_single to combine these into a final [0, 1] score.
86+ Args:
87+ gate_style_on_structure: If True, only evaluate style when structure
88+ score is perfect (1.0). If False, always evaluate style.
8389 """
8490
85- MAX_STRUCTURE_SCORE = 1
86- MAX_STYLE_SCORE = 20
87-
88- @property
89- @abstractmethod
90- def name (self ) -> str :
91- """Short identifier for this judge, used as the Modal app/deployment name."""
92- ...
93-
94- @staticmethod
95- def score_syllable_line (diff : int , allow_off_by_one : bool = False ) -> float :
96- """Score a single line's syllable count: 1 for exact, 0.5 for off-by-1, 0 otherwise."""
97- if diff == 0 :
98- return 1
99- elif diff == 1 :
100- return 0.5 if allow_off_by_one else 0
101- return 0
102-
103- @staticmethod
104- def score_haiku_structure (response : str , cmudict : dict , allow_off_by_one : bool = False ) -> float :
105- """Score haiku structure (0-1): 1/4 for 3 lines + up to 1/4 per line for syllables."""
106- lines = segment_haiku_lines (response )
107- score = 0.0
108- fractional_multiplier = 0.25
109-
110- if len (lines ) == 3 :
111- score += fractional_multiplier
112-
113- if len (lines ) > 0 :
114- score += HaikuJudge .score_syllable_line (
115- diff_syllables_count (lines [0 ], 5 , cmudict ), allow_off_by_one
116- ) * fractional_multiplier
117- if len (lines ) > 1 :
118- score += HaikuJudge .score_syllable_line (
119- diff_syllables_count (lines [1 ], 7 , cmudict ), allow_off_by_one
120- ) * fractional_multiplier
121- if len (lines ) > 2 :
122- score += HaikuJudge .score_syllable_line (
123- diff_syllables_count (lines [2 ], 5 , cmudict ), allow_off_by_one
124- ) * fractional_multiplier
125-
126- return score
127-
128- @staticmethod
129- async def score_haiku_style (
91+ def __init__ (self , gate_style_on_structure : bool = True ):
92+ self .gate_style_on_structure = gate_style_on_structure
93+
94+ async def score_style (
95+ self ,
13096 model_name : str ,
13197 session : aiohttp .ClientSession ,
13298 prompt : str ,
13399 response : str ,
134- label : str ,
100+ label : str = "" ,
135101 vllm_base_url : str = f"http://localhost:{ VLLM_PORT } " ,
136102 ) -> float :
137- """Score haiku style via LLM judge (0-1), or 0 on error."""
138- judge_prompt = generate_haiku_judge_prompt (prompt , response , label )
103+ """Score haiku style via LLM judge, normalized to [0, 1]."""
104+ judge_prompt , max_score = _build_judge_prompt (prompt , response , label )
139105
140106 try :
141107 async with session .post (
@@ -159,22 +125,31 @@ async def score_haiku_style(
159125 match = re .search (r"(\d+(?:\.\d+)?)" , score_text )
160126 if match :
161127 score = float (match .group (1 ))
162- return min (max (score , 0 ), 10 ) / 10
128+ return min (max (score , 0 ), max_score ) / max_score
163129 return 0
164130 except Exception as e :
165131 print (f"Error scoring response: { e } " )
166132 return 0
167133
168- @abstractmethod
169134 async def score_single (
170135 self ,
171136 model_name : str ,
172137 session : aiohttp .ClientSession ,
173138 prompt : str ,
174139 response : str ,
175- label : str ,
176140 cmudict : dict ,
141+ label : str = "" ,
177142 ) -> float :
178- """Score a single haiku. Returns a normalized score in [0, 1]."""
179- ...
180-
143+ """Score a single haiku. Returns a score in [0, 2]."""
144+ structure_score = score_haiku_structure (response , cmudict )
145+
146+ style_score = 0.0
147+ if not self .gate_style_on_structure or structure_score >= 1.0 :
148+ style_score = await self .score_style (
149+ model_name , session , prompt , response , label
150+ )
151+ style_score = max (style_score , 0.0 )
152+
153+ total = structure_score + style_score
154+ print (f"[HaikuJudge] structure={ structure_score } , style={ style_score } , gated={ self .gate_style_on_structure } " )
155+ return total
0 commit comments