Refactor to use configurable Japanese tags <思考> instead of English <think>

casteryh · casteryh · commit 2625b282d392 · 2025-10-31T15:19:31.000-07:00
BREAKING CHANGE: Default tag for LanguageReward changed from 'think' to '思考'

Key changes:
- Both ThinkingReward and LanguageReward now accept 'tag' parameter
- ThinkingReward default: 'think' (backward compatible)
- LanguageReward default: '思考' (Japanese, breaks English associations)
- Sandbox app uses <思考> tags throughout
- System prompt updated to Japanese with <思考> examples
- All tests updated and passing (29/29)
- Debug script updated

Rationale: Models may be heavily trained to think in English when using
<think> tags. Using Japanese tags <思考> (shikō = 'thinking') breaks
this association and encourages thinking in the target language.
diff --git a/sandbox/grpo_language/README.md b/sandbox/grpo_language/README.md
@@ -4,19 +4,25 @@ This sandbox app demonstrates using GRPO training with a language reward that en
 
 ## Overview
 
-This app extends the standard GRPO training (from `apps/grpo/`) by adding a `LanguageReward` that evaluates whether the model's thinking (text within `<think></think>` tags) is in the target language.
+This app extends the standard GRPO training (from `apps/grpo/`) by adding a `LanguageReward` that evaluates whether the model's thinking (text within `<思考></思考>` tags) is in the target language.
+
+**Key Insight**: Uses Japanese tags `<思考>` (shikō = "thinking") instead of English `<think>` tags to break the model's association between thinking tags and English language. This helps encourage multilingual thinking.
 
 ## Key Features
 
 - **Multi-objective training**: Combines three rewards:
   - `MathReward`: Evaluates correctness of math answers
-  - `ThinkingReward`: Encourages use of thinking tags
+  - `ThinkingReward`: Encourages use of `<思考>` tags
   - `LanguageReward`: Rewards thinking in target language (Japanese by default)
 
+- **Japanese thinking tags**: Uses `<思考>` instead of `<think>` to encourage non-English reasoning
+
 - **Language detection**: Uses `langid` to detect the language of thinking blocks
 
 - **Configurable target language**: While this app defaults to Japanese (`ja`), the `LanguageReward` can be configured for any ISO 639-1 language code
 
+- **Configurable tags**: Both rewards support custom tag names via the `tag` parameter
+
 ## Requirements
 
 Before running this app, install the required language detection library:
@@ -33,25 +39,29 @@ python -m sandbox.grpo_language.main --config sandbox/grpo_language/qwen3_1_7b.y
 
 ## How It Works
 
-1. The model receives a math problem and is instructed to use `<think>` tags for reasoning
+1. The model receives a math problem and is instructed to use `<思考>` tags for reasoning
 2. During training, the model generates responses with thinking blocks
 3. Three rewards are computed:
    - Math correctness (did it get the right answer?)
-   - Thinking usage (did it use thinking tags properly?)
+   - Thinking usage (did it use `<思考>` tags properly?)
    - Language usage (did it think in Japanese?)
 4. The model is trained to maximize all three rewards
 
 ## Configuration
 
-The target language is hardcoded as Japanese in `main.py` (line 321):
+### Target Language
+
+The target language is configured as Japanese in `main.py`:
 
 ```python
-LanguageReward(target_language="ja")
+LanguageReward(target_language="ja", tag="思考")
+ThinkingReward(tag="思考")
 ```
 
-To use a different language, modify this line with the appropriate ISO 639-1 code:
-- English: `"en"`
-- Chinese: `"zh"`
+To use a different language:
+1. Change `target_language` to the appropriate ISO 639-1 code:
+   - English: `"en"`
+   - Chinese: `"zh"`
 - Spanish: `"es"`
 - French: `"fr"`
 - etc.
diff --git a/sandbox/grpo_language/debug_reward.py b/sandbox/grpo_language/debug_reward.py
@@ -15,27 +15,27 @@
 # Test cases mimicking what the model might generate
 test_cases = [
     # Case 1: Perfect - Japanese in single thinking block
-    ("<think>これは数学の問題です。2+2=4です。</think><answer>4</answer>", "Perfect Japanese"),
+    ("<思考>これは数学の問題です。2+2=4です。</思考><answer>4</answer>", "Perfect Japanese"),
     # Case 2: English thinking (most likely during training)
     (
-        "<think>This is a math problem. 2+2=4.</think><answer>4</answer>",
+        "<思考>This is a math problem. 2+2=4.</思考><answer>4</answer>",
         "English thinking",
     ),
     # Case 3: No thinking blocks at all
     ("The answer is 4.<answer>4</answer>", "No thinking blocks"),
     # Case 4: Empty thinking blocks
-    ("<think></think><answer>4</answer>", "Empty thinking block"),
+    ("<思考></思考><answer>4</answer>", "Empty thinking block"),
     # Case 5: Multiple thinking blocks in Japanese
     (
-        "<think>最初の考え。</think><think>次の考え。</think><answer>4</answer>",
+        "<思考>最初の考え。</思考><思考>次の考え。</思考><answer>4</answer>",
         "Multiple Japanese blocks",
     ),
     # Case 6: Just the answer, no thinking
     ("<answer>4</answer>", "Just answer tag"),
     # Case 7: Thinking with mostly numbers/symbols
-    ("<think>2 + 2 = 4</think><answer>4</answer>", "Mostly numbers"),
+    ("<思考>2 + 2 = 4</思考><answer>4</answer>", "Mostly numbers"),
     # Case 8: Mixed English and Japanese
-    ("<think>Let me think... これは簡単です。</think><answer>4</answer>", "Mixed languages"),
+    ("<思考>Let me think... これは簡単です。</思考><answer>4</answer>", "Mixed languages"),
 ]
 
 print("=" * 80)
@@ -51,9 +51,7 @@
     import langid
 
     # Extract thinking content if exists
-    think_match = re.findall(
-        r"<\s*think\s*>(.*?)<\s*/\s*think\s*>", response, re.IGNORECASE | re.DOTALL
-    )
+    think_match = re.findall(r"<\s*思考\s*>(.*?)<\s*/\s*思考\s*>", response, re.DOTALL)
 
     if think_match:
         content = " ".join(think_match)
@@ -65,9 +63,7 @@
         print(f"  Detected language: {detected_lang} (confidence: {confidence:.3f})")
     else:
         # Check fallback
-        response_text = re.sub(
-            r"<\s*/?\s*think\s*>", "", response, flags=re.IGNORECASE
-        ).strip()
+        response_text = re.sub(r"<\s*/?\s*思考\s*>", "", response).strip()
         if response_text:
             detected_lang, confidence = langid.classify(response_text)
             print(f"\n{description}:")
diff --git a/sandbox/grpo_language/main.py b/sandbox/grpo_language/main.py
@@ -243,18 +243,18 @@ def gsm8k_transform(sample):
             system_prompt = """
 あなたは数学の問題を解くAIアシスタントです。以下の重要なルールに従ってください：
 
-CRITICAL RULES:
-1. Put ALL your reasoning inside <think> and </think> tags
-2. You MUST think in Japanese (日本語) inside the <think> tags - use hiragana, katakana, and kanji
-3. NEVER use English inside <think> tags
-4. Put your final numerical answer inside <answer> and </answer> tags
+重要なルール (CRITICAL RULES):
+1. すべての思考過程を <思考> と </思考> タグの中に入れてください
+2. <思考> タグの中では必ず日本語で考えてください（ひらがな、カタカナ、漢字を使用）
+3. <思考> タグの中では絶対に英語を使わないでください
+4. 最終的な数値の答えを <answer> と </answer> タグの中に入れてください
 
-Example:
+例 (Example):
 Question: What is 12 + 5?
-<think>12と5を足します。12 + 5 = 17です。したがって、答えは17です。</think>
+<思考>12と5を足します。12 + 5 = 17です。したがって、答えは17です。</思考>
 <answer>17</answer>
 
-Now solve the following problem using Japanese in your <think> tags:
+以下の問題を <思考> タグの中で日本語を使って解いてください:
             """
             request: str = sample["question"]
             as_chat = [
@@ -358,9 +358,9 @@ async def main(cfg: DictConfig):
         RewardActor.options(**cfg.services.reward_actor).as_service(
             reward_functions=[
                 MathReward(),
-                ThinkingReward(),
+                ThinkingReward(tag="思考"),  # Use Japanese tag
                 LanguageReward(
-                    target_language="ja", debug=True, debug_sample_rate=0.1
+                    target_language="ja", tag="思考", debug=True, debug_sample_rate=0.1
                 ),  # Japanese language reward with debug
             ]
         ),
diff --git a/src/forge/data/rewards.py b/src/forge/data/rewards.py
@@ -57,15 +57,28 @@ def _to_float(self, text: str) -> float | None:
 
 
 class ThinkingReward:
-    """Reward class for evaluating use of <think> tags in reasoning."""
+    """Reward class for evaluating use of thinking tags in reasoning.
 
-    def __init__(self, partial_reward: float = 0.2, full_reward: float = 1.0):
+    Args:
+        partial_reward: Reward for partial tag usage (incomplete/malformed)
+        full_reward: Reward for well-formed thinking blocks with content
+        tag: Tag name to use (default "think", can use "思考" for Japanese, etc.)
+    """
+
+    def __init__(
+        self, partial_reward: float = 0.2, full_reward: float = 1.0, tag: str = "think"
+    ):
         self.partial_reward = partial_reward
         self.full_reward = full_reward
+        self.tag = tag
+        # Build regex patterns for the specified tag
         self._THINK_BLOCK_RE = re.compile(
-            r"<\s*think\s*>(.*?)<\s*/\s*think\s*>", re.IGNORECASE | re.DOTALL
+            rf"<\s*{re.escape(tag)}\s*>(.*?)<\s*/\s*{re.escape(tag)}\s*>",
+            re.IGNORECASE | re.DOTALL,
+        )
+        self._THINK_TAG_ATTEMPT_RE = re.compile(
+            rf"<\s*/?\s*{re.escape(tag)}\s*>", re.IGNORECASE
         )
-        self._THINK_TAG_ATTEMPT_RE = re.compile(r"<\s*/?\s*think\s*>", re.IGNORECASE)
 
     def __call__(self, prompt: str, response: str, target: str | None = None) -> float:
         """Compute thinking reward."""
@@ -83,7 +96,7 @@ def __call__(self, prompt: str, response: str, target: str | None = None) -> flo
 
 
 class LanguageReward:
-    """Reward class for evaluating the language used in <think> tags.
+    """Reward class for evaluating the language used in thinking tags.
 
     This reward uses langid to detect the language of text within thinking blocks
     and rewards responses that use the target language.
@@ -94,6 +107,7 @@ class LanguageReward:
         partial_reward: Reward when language matches but format is wrong (multiple blocks)
         fallback_reward: Reward when no valid blocks but response text is in target language
         no_match_reward: Reward when language doesn't match
+        tag: Tag name to use (default "思考" for multilingual, can use "think", etc.)
         debug: If True, print debug samples showing model outputs and detected language
         debug_sample_rate: Fraction of calls to debug (e.g., 0.1 = 10% of calls)
 
@@ -107,6 +121,7 @@ def __init__(
         partial_reward: float = 0.5,
         fallback_reward: float = 0.2,
         no_match_reward: float = 0.0,
+        tag: str = "思考",
         debug: bool = False,
         debug_sample_rate: float = 0.1,
     ):
@@ -115,12 +130,15 @@ def __init__(
         self.partial_reward = partial_reward
         self.fallback_reward = fallback_reward
         self.no_match_reward = no_match_reward
+        self.tag = tag
         self.debug = debug
         self.debug_sample_rate = debug_sample_rate
         self._debug_counter = 0
+        # Build regex pattern for the specified tag
         self._THINK_BLOCK_RE = re.compile(
-            r"<\s*think\s*>(.*?)<\s*/\s*think\s*>", re.IGNORECASE | re.DOTALL
+            rf"<\s*{re.escape(tag)}\s*>(.*?)<\s*/\s*{re.escape(tag)}\s*>", re.DOTALL
         )
+        self._TAG_PATTERN = rf"<\s*/?\s*{re.escape(tag)}\s*>"
 
         # Lazy import langid with helpful error message
         try:
@@ -164,9 +182,7 @@ def __call__(self, prompt: str, response: str, target: str | None = None) -> flo
         # If no thinking blocks found, check if response text is in target language
         if len(matches) == 0:
             # Remove any partial tags that might exist
-            response_text = re.sub(
-                r"<\s*/?\s*think\s*>", "", response, flags=re.IGNORECASE
-            ).strip()
+            response_text = re.sub(self._TAG_PATTERN, "", response).strip()
 
             if not response_text:
                 if should_debug:
diff --git a/tests/unit_tests/rl/test_language_reward.py b/tests/unit_tests/rl/test_language_reward.py