modal-labs
diff --git a/haiku/llm_judges/base.py b/haiku/llm_judges/base.py
Lines changed: 55 additions & 80 deletions
@@ -1,41 +1,39 @@
 """
-Abstract base class for haiku LLM judges.
+Haiku LLM judge — scores structure (syllable counting) and style (LLM evaluation).
 
-Provides shared structure and style scoring. Subclasses implement
-score_single to define their own weighting strategies.
+Structure scoring uses CMUdict for syllable counting.
+Style scoring uses a local vLLM instance to evaluate relevance, poetic quality, etc.
 """
 
 import re
-from abc import ABC, abstractmethod
 
 import aiohttp
 
 from llm_judges.deploy import VLLM_PORT
-from llm_judges.nlp import diff_syllables_count, segment_haiku_lines
+from llm_judges.nlp import score_haiku_structure
 
 
-
-# =============================================================================
-# Shared Prompt Template
-# =============================================================================
-
 MODAL_VOCABS = [
     "modal",
-    "volume"
+    "volume",
     "function",
     "sandbox",
     "flash",
     "inference",
     "train",
 ]
 
-def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
+
+def _build_judge_prompt(prompt: str, response: str, label: str = "") -> tuple[str, int]:
+    """Build the LLM judge prompt. Returns (prompt_text, max_score); max_score is 15 without a label and 20 when a label poem is given."""
     modal_vocab_str = ", ".join(MODAL_VOCABS)
 
-    return f"""You are evaluating a haiku poem.
+    max_score = 15  # relevance(5) + poetic(5) + modal vocab(5)
+
+    text = f"""You are evaluating a haiku poem.
 
     Score the response based on the following criteria:
-    
+
     Relevance (5 points total)
     - 5 points: if the central theme and punchline of the haiku is "{prompt}"
     - 3 points: if the response directly discusses "{prompt}" but it is not the central theme
@@ -47,18 +45,26 @@ def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
     - 3 point: if the response makes sense, but is not very poetic
     - 1 point: if the response doesn't make sense
     - 0 points: if the response is not poetic and incoherent
+"""
 
-    Uses Modal vocabulary (5 points total): (modal vocab: {modal_vocab_str})
-    - 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{prompt}"
-    - 3 points: if the response uses the above words in a way that is not relevant to the topic "{prompt}"
-    - 0 points: if the response does not use the above words
-
+    if label:
+        max_score = 20
+        text += f"""
     Better than the existing poem (5 points total):
     Given the existing poem, score the response by comparing its quality to the existing poem:
     {label}
     - 5 points: if the response is better than the poem "{label}".
     - 3 points: if the response is equal in quality to the poem "{label}".
     - 0 points: if the response is worse than the poem "{label}".
+"""
+
+    prereq_score = max_score - 5
+    text += f"""
+    Uses Modal vocabulary (5 points total): (modal vocab: {modal_vocab_str})
+    - 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{prompt}"
+    - 3 points: if the response uses the above words in a way that is not relevant to the topic "{prompt}"
+    - 0 points: if the response does not use the above words
+    DO NOT GIVE ANY POINTS TO USE MODAL VOCABULARY IF THE POEM ITSELF DOES NOT ALREADY ACHIEVE A SCORE OF {prereq_score} OR HIGHER
 
     Add up the scores from the above criteria to get the total score.
 
@@ -69,73 +75,33 @@ def generate_haiku_judge_prompt(prompt: str, response: str, label: str) -> str:
     {response}
     ---
 
-    Output ONLY a single number (0-20), nothing else."""
+    Output ONLY a single number (0-{max_score}), nothing else."""
 
+    return text, max_score
 
-class HaikuJudge(ABC):
-    """Abstract base class for haiku judges.
 
-    Shared scoring:
-        - score_haiku_structure: 0-8 based on line count and syllable accuracy
-        - score_haiku_style: 0-2 based on LLM evaluation of relevance and emotion
+class HaikuJudge:
+    """Scores haikus on structure (syllable counting) and style (LLM evaluation).
 
-    Subclasses implement score_single to combine these into a final [0, 1] score.
+    Args:
+        gate_style_on_structure: If True, only evaluate style when structure
+            score is perfect (1.0). If False, always evaluate style.
     """
 
-    MAX_STRUCTURE_SCORE = 1
-    MAX_STYLE_SCORE = 20
-
-    @property
-    @abstractmethod
-    def name(self) -> str:
-        """Short identifier for this judge, used as the Modal app/deployment name."""
-        ...
-
-    @staticmethod
-    def score_syllable_line(diff: int, allow_off_by_one: bool = False) -> float:
-        """Score a single line's syllable count: 1 for exact, 0.5 for off-by-1, 0 otherwise."""
-        if diff == 0:
-            return 1
-        elif diff == 1:
-            return 0.5 if allow_off_by_one else 0
-        return 0
-
-    @staticmethod
-    def score_haiku_structure(response: str, cmudict: dict, allow_off_by_one: bool = False) -> float:
-        """Score haiku structure (0-1): 1/4 for 3 lines + up to 1/4 per line for syllables."""
-        lines = segment_haiku_lines(response)
-        score = 0.0
-        fractional_multiplier = 0.25
-
-        if len(lines) == 3:
-            score += fractional_multiplier
-
-        if len(lines) > 0:
-            score += HaikuJudge.score_syllable_line(
-                diff_syllables_count(lines[0], 5, cmudict), allow_off_by_one
-            ) * fractional_multiplier
-        if len(lines) > 1:
-            score += HaikuJudge.score_syllable_line(
-                diff_syllables_count(lines[1], 7, cmudict), allow_off_by_one
-            ) * fractional_multiplier
-        if len(lines) > 2:
-            score += HaikuJudge.score_syllable_line(
-                diff_syllables_count(lines[2], 5, cmudict), allow_off_by_one
-            ) * fractional_multiplier
-
-        return score
-
-    @staticmethod
-    async def score_haiku_style(
+    def __init__(self, gate_style_on_structure: bool = True):
+        self.gate_style_on_structure = gate_style_on_structure
+
+    async def score_style(
+        self,
         model_name: str,
         session: aiohttp.ClientSession,
         prompt: str,
         response: str,
-        label: str,
+        label: str = "",
         vllm_base_url: str = f"http://localhost:{VLLM_PORT}",
     ) -> float:
-        """Score haiku style via LLM judge (0-1), or 0 on error."""
-        judge_prompt = generate_haiku_judge_prompt(prompt, response, label)
+        """Score haiku style via LLM judge, clamped and normalized to [0, 1]; returns 0 if an exception is raised or no number is found in the reply."""
+        judge_prompt, max_score = _build_judge_prompt(prompt, response, label)
 
         try:
             async with session.post(
@@ -159,22 +125,31 @@ async def score_haiku_style(
                 match = re.search(r"(\d+(?:\.\d+)?)", score_text)
                 if match:
                     score = float(match.group(1))
-                    return min(max(score, 0), 10) / 10
+                    return min(max(score, 0), max_score) / max_score
                 return 0
         except Exception as e:
             print(f"Error scoring response: {e}")
             return 0
 
-    @abstractmethod
     async def score_single(
         self,
         model_name: str,
         session: aiohttp.ClientSession,
         prompt: str,
         response: str,
-        label: str,
         cmudict: dict,
+        label: str = "",
     ) -> float:
-        """Score a single haiku. Returns a normalized score in [0, 1]."""
-        ...
-
+        """Score a single haiku. Returns structure score + style score, nominally in [0, 2]; style stays 0.0 when gating is on and structure is below 1.0."""
+        structure_score = score_haiku_structure(response, cmudict)
+
+        style_score = 0.0
+        if not self.gate_style_on_structure or structure_score >= 1.0:
+            style_score = await self.score_style(
+                model_name, session, prompt, response, label
+            )
+            style_score = max(style_score, 0.0)
+
+        total = structure_score + style_score
+        print(f"[HaikuJudge] structure={structure_score}, style={style_score}, gated={self.gate_style_on_structure}")
+        return total