Simplify LanguageReward logic to focus on language detection only
Since ThinkingReward already enforces format (single vs multiple blocks),
LanguageReward now focuses purely on language detection with simplified logic:
Detection strategy:
- If exactly one thinking block: detect language of block content only
- Otherwise (no blocks or multiple blocks): detect language of whole response
- Returns match_reward (1.0) if language matches, no_match_reward (0.0) otherwise
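A minimal sketch of this detection strategy, assuming a pluggable `detect(text)` helper that returns an ISO language code (e.g. from a library such as `langdetect`); the function signature and parameter names here are illustrative, not the actual implementation:

```python
import re

def language_reward(response: str, target_lang: str, detect,
                    match_reward: float = 1.0, no_match_reward: float = 0.0) -> float:
    """Illustrative sketch: match_reward if the detected language matches the target."""
    blocks = re.findall(r"<思考>(.*?)</思考>", response, re.DOTALL)
    # Exactly one thinking block: detect the language of the block content only;
    # otherwise (no blocks, or multiple blocks) detect over the whole response.
    text = blocks[0] if len(blocks) == 1 else response
    return match_reward if detect(text) == target_lang else no_match_reward
```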
Changes:
- Removed partial_reward and fallback_reward parameters (now just match/no-match)
- Renamed full_reward to match_reward for clarity
- Updated all 29 tests to match new behavior (all passing)
- Updated README with clearer explanation of reward separation
- Updated debug script with new expected rewards
This separation of concerns allows each reward to specialize:
- ThinkingReward: format enforcement
- LanguageReward: language detection
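For contrast, the format-enforcement side could be sketched as follows (a hypothetical reconstruction of ThinkingReward's described behavior; the reward values and names are assumptions, not the repository's actual code):

```python
import re

def thinking_reward(response: str, full_reward: float = 1.0,
                    partial_reward: float = 0.5, no_reward: float = 0.0) -> float:
    """Sketch of format enforcement: exactly one well-formed block scores full reward."""
    n = len(re.findall(r"<思考>.*?</思考>", response, re.DOTALL))
    if n == 1:
        return full_reward      # exactly one thinking block
    if n > 1:
        return partial_reward   # multiple blocks: partial credit
    return no_reward            # no thinking blocks at all
```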
Changed file: sandbox/grpo_language/README.md (8 additions, 3 deletions)
```diff
@@ -44,11 +44,16 @@ You can use any of the config files from `apps/grpo/` (e.g., `qwen3_1_7b.yaml`,
 1. The model receives a math problem and is instructed to use `<思考>` tags for reasoning
 2. During training, the model generates responses with thinking blocks
 3. Three rewards are computed:
-   - Math correctness (did it get the right answer?)
-   - Thinking usage (did it use `<思考>` tags properly?)
-   - Language usage (did it think in Japanese?)
+   - **MathReward**: Did it get the right answer?
+   - **ThinkingReward**: Did it use `<思考>` tags properly? (single block = full reward, multiple blocks = partial reward)
+   - **LanguageReward**: Did it use the target language? Detection strategy:
+     - If exactly one thinking block: detect language of block content only
+     - Otherwise (no blocks or multiple blocks): detect language of whole response
+     - Returns match_reward (1.0) if detected language matches target, no_match_reward (0.0) otherwise
 4. The model is trained to maximize all three rewards
 
+**Note**: ThinkingReward enforces format (single vs multiple blocks), while LanguageReward focuses purely on language detection. This separation of concerns allows each reward to specialize in one aspect of the desired behavior.
```