
Commit e926707

Add LanguageReward for training models to think in target language (#515)
Co-authored-by: Jiyue Wang <[email protected]>
1 parent 7381c3f

7 files changed: +449 −15 lines

apps/grpo/main.py
Lines changed: 22 additions & 7 deletions

@@ -26,7 +26,7 @@
 from forge.actors.trainer import TitanTrainer
 from forge.controller.actor import ForgeActor
 from forge.controller.provisioner import init_provisioner, shutdown
-from forge.data.rewards import MathReward, ThinkingReward
+from forge.data.rewards import LanguageReward, MathReward, ThinkingReward
 from forge.data_models.completion import Completion
 from forge.observability.metric_actors import get_or_create_metric_logger
 from forge.observability.metrics import record_metric, Reduce

@@ -129,7 +129,7 @@ def simple_grpo_loss(
     ref_logprobs: torch.Tensor,
     advantages: torch.Tensor,
     padding_mask: torch.Tensor,
-    beta: float = 0.1,
+    beta: float = 1e-6,
 ) -> torch.Tensor:
     logprobs: torch.Tensor = compute_logprobs(logits, response)
     kl = torch.exp(ref_logprobs - logprobs) - (ref_logprobs - logprobs) - 1

@@ -237,10 +237,15 @@ async def setup(self):
         self._epoch = 0

         def gsm8k_transform(sample):
-            system_prompt = """
-            Put all your scratchpad work between <think> and </think> tags.
-            Your final answer should be between <answer> and </answer> tags otherwise it will not be scored.
-            """
+            system_prompt = """You are a helpful AI assistant that solves math problems.
+
+Please show your reasoning inside <思考></思考> tags, then provide your final numerical answer inside <answer></answer> tags.
+
+Example:
+Question: What is 12 + 5?
+<思考>12と5を足します。12 + 5 = 17です。</思考>
+<answer>17</answer>
+"""
             request: str = sample["question"]
             as_chat = [
                 {"role": "system", "content": system_prompt},

@@ -359,7 +364,17 @@ async def main(cfg: DictConfig):
         ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(),
         ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model),
         RewardActor.options(**cfg.services.reward_actor).as_service(
-            reward_functions=[MathReward(), ThinkingReward()]
+            reward_functions=[
+                MathReward(),
+                ThinkingReward(tag="思考"),  # Use Japanese tag
+                LanguageReward(
+                    target_language="ja",
+                    tag="思考",
+                    match_reward=2.0,
+                    debug=True,
+                    debug_sample_rate=0.1,
+                ),  # Japanese language reward with debug
+            ]
         ),
     )
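For reference, a minimal standalone sketch of the reward stack wired up above. How RewardActor aggregates the per-function scores is not shown in this diff, so the simple average at the end is an illustrative assumption, as is MathReward's exact return value for a correct answer:

# Hypothetical standalone check of the three rewards composed above.
from forge.data.rewards import LanguageReward, MathReward, ThinkingReward

prompt = "Question: What is 12 + 5?"
response = "<思考>12と5を足します。12 + 5 = 17です。</思考>\n<answer>17</answer>"

reward_functions = [
    MathReward(),
    ThinkingReward(tag="思考"),
    LanguageReward(target_language="ja", tag="思考", match_reward=2.0),
]
scores = [fn(prompt, response, target="17") for fn in reward_functions]
print(scores)  # e.g. [1.0, 1.0, 2.0] if the answer, tag format, and language all check out
print(sum(scores) / len(scores))  # simple average; actual aggregation lives in RewardActor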

apps/grpo/qwen3_1_7b.yaml
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 group_size: 8
 local_batch_size: 16 # per-device batch size
 max_req_tokens: 1024
-max_res_tokens: 1024
+max_res_tokens: 2048
 model: "Qwen/Qwen3-1.7B"
 off_by_n: 1 # Off by one by default

apps/grpo/qwen3_8b.yaml
Lines changed: 3 additions & 3 deletions

@@ -2,10 +2,10 @@
 # >>> python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml

 # Global configuration
-group_size: 8
-local_batch_size: 12 # per-device batch size
+group_size: 16
+local_batch_size: 4 # per-device batch size
 max_req_tokens: 1024
-max_res_tokens: 1024
+max_res_tokens: 2048
 model: "Qwen/Qwen3-8B"
 off_by_n: 1 # Off by one by default
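Rough arithmetic on the qwen3_8b.yaml change, assuming local_batch_size counts prompts per device and group_size counts completions per prompt: a step goes from 12 × 8 = 96 completions of at most 1024 response tokens (≈98K tokens) to 4 × 16 = 64 completions of at most 2048 tokens (≈131K tokens), so the worst-case response-token budget stays in the same ballpark while larger groups and longer completions leave room for tag-wrapped reasoning.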

pyproject.toml
Lines changed: 1 addition & 0 deletions

@@ -47,6 +47,7 @@ dev = [
     "anyio",
     "pytest-asyncio",
     "multiprocess",
+    "langid",
 ]
 docs = [
     "sphinx==7.2.6",

src/forge/data/rewards.py
Lines changed: 143 additions & 4 deletions

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import random
 import re


@@ -57,15 +58,28 @@ def _to_float(self, text: str) -> float | None:


 class ThinkingReward:
-    """Reward class for evaluating use of <think> tags in reasoning."""
+    """Reward class for evaluating use of thinking tags in reasoning.

-    def __init__(self, partial_reward: float = 0.2, full_reward: float = 1.0):
+    Args:
+        partial_reward: Reward for partial tag usage (incomplete/malformed)
+        full_reward: Reward for well-formed thinking blocks with content
+        tag: Tag name to use (default "think", can use "思考" for Japanese, etc.)
+    """
+
+    def __init__(
+        self, partial_reward: float = 0.2, full_reward: float = 1.0, tag: str = "think"
+    ):
         self.partial_reward = partial_reward
         self.full_reward = full_reward
+        self.tag = tag
+        # Build regex patterns for the specified tag
         self._THINK_BLOCK_RE = re.compile(
-            r"<\s*think\s*>(.*?)<\s*/\s*think\s*>", re.IGNORECASE | re.DOTALL
+            rf"<\s*{re.escape(tag)}\s*>(.*?)<\s*/\s*{re.escape(tag)}\s*>",
+            re.IGNORECASE | re.DOTALL,
+        )
+        self._THINK_TAG_ATTEMPT_RE = re.compile(
+            rf"<\s*/?\s*{re.escape(tag)}\s*>", re.IGNORECASE
         )
-        self._THINK_TAG_ATTEMPT_RE = re.compile(r"<\s*/?\s*think\s*>", re.IGNORECASE)

     def __call__(self, prompt: str, response: str, target: str | None = None) -> float:
         """Compute thinking reward."""
@@ -80,3 +94,128 @@ def __call__(self, prompt: str, response: str, target: str | None = None) -> float:
         elif has_attempt:
             return self.partial_reward
         return 0.0
+
+
+class LanguageReward:
+    """Reward class for evaluating the language used in responses.
+
+    This reward uses langid to detect the language and rewards responses that use
+    the target language. The detection strategy depends on the format:
+    - If exactly one thinking block: detect language of the block content
+    - Otherwise (no blocks or multiple blocks): detect language of whole response
+
+    Note: Format enforcement (single vs multiple blocks) is handled by ThinkingReward.
+    This reward focuses purely on language detection.
+
+    Args:
+        target_language: ISO 639-1 language code (e.g., 'en', 'ja', 'zh', 'es')
+        match_reward: Reward when detected language matches target (default: 1.0)
+        no_match_reward: Reward when language doesn't match (default: 0.0)
+        tag: Tag name to use (default "思考" for multilingual, can use "think", etc.)
+        debug: If True, print debug samples showing model outputs and detected language
+        debug_sample_rate: Fraction of calls to debug (e.g., 0.1 = 10% of calls)
+
+    Note: Requires langid to be installed. Install with: pip install langid
+    """
+
+    def __init__(
+        self,
+        target_language: str = "ja",
+        match_reward: float = 1.0,
+        no_match_reward: float = 0.0,
+        tag: str = "思考",
+        debug: bool = False,
+        debug_sample_rate: float = 0.1,
+    ):
+        self.target_language = target_language
+        self.match_reward = match_reward
+        self.no_match_reward = no_match_reward
+        self.tag = tag
+        self.debug = debug
+        self.debug_sample_rate = debug_sample_rate
+        self._debug_counter = 0
+        # Build regex pattern for the specified tag
+        self._THINK_BLOCK_RE = re.compile(
+            rf"<\s*{re.escape(tag)}\s*>(.*?)<\s*/\s*{re.escape(tag)}\s*>", re.DOTALL
+        )
+
+        # Lazy import langid with helpful error message
+        try:
+            import langid
+
+            self._langid = langid
+        except ImportError:
+            raise ImportError(
+                "langid is required for LanguageReward but is not installed. "
+                "Please install it with: pip install langid"
+            ) from None
+
+    def __call__(self, prompt: str, response: str, target: str | None = None) -> float:
+        """Compute language reward based on detected language.
+
+        Detection strategy:
+        - If exactly one thinking block: detect language of block content
+        - Otherwise: detect language of whole response
+
+        Args:
+            prompt: The input prompt (unused but kept for signature consistency)
+            response: The model response
+            target: Optional target string (unused but kept for signature consistency)
+
+        Returns:
+            match_reward if detected language matches target, no_match_reward otherwise
+        """
+        # TODO: refactor pending https://github.com/meta-pytorch/torchforge/issues/187
+        should_debug = self.debug and (random.random() < self.debug_sample_rate)
+
+        if not response:
+            if should_debug:
+                print(
+                    f"\n[LanguageReward] Empty response | Reward: {self.no_match_reward}"
+                )
+            return self.no_match_reward
+
+        # Extract all thinking blocks
+        matches = self._THINK_BLOCK_RE.findall(response)
+
+        # Determine what text to analyze
+        if len(matches) == 1:
+            # Single block: detect language of block content only
+            text_to_analyze = matches[0].strip()
+            detection_mode = "single block"
+        else:
+            # No blocks or multiple blocks: detect language of whole response
+            text_to_analyze = response.strip()
+            detection_mode = f"{len(matches)} blocks, using whole response"
+
+        # Remove extra whitespace
+        text_to_analyze = re.sub(r"\s+", " ", text_to_analyze).strip()
+
+        if not text_to_analyze:
+            if should_debug:
+                print(f"\n[LanguageReward] Empty text | Reward: {self.no_match_reward}")
+            return self.no_match_reward
+
+        # Detect language using langid
+        detected_lang, confidence = self._langid.classify(text_to_analyze)

+        # Check if language matches target
+        reward = (
+            self.match_reward
+            if detected_lang == self.target_language
+            else self.no_match_reward
+        )
+
+        if should_debug:
+            sample = text_to_analyze[:1000].replace("\n", " ")
+            match_symbol = "✓" if detected_lang == self.target_language else "✗"
+            print(
+                f"\n[LanguageReward] Detection mode: {detection_mode}"
+                f"\n  Target: {self.target_language} | Detected: {detected_lang} | "
+                f"Confidence: {confidence:.2f}"
+                f"\n  Sample: {sample}..."
+                f"\n  → Reward: {reward} {match_symbol}"
+            )
+
+        return reward
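A brief sketch of the detection strategy the docstring describes, with expected rather than guaranteed outcomes, since langid can be uncertain on short strings:

from forge.data.rewards import LanguageReward

reward = LanguageReward(target_language="ja", tag="思考")

# Exactly one well-formed block: only the Japanese content inside the tags is classified.
print(reward("", "<思考>これは日本語の思考です。</思考> The answer is 17."))  # expected: 1.0

# No block: the whole response (English here) is classified instead.
print(reward("", "The answer is 17."))  # expected: 0.0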
