Track Chinese characters

faresobeid · faresobeid · commit 15ba3a1364d2 · 2026-02-01T01:50:52.000Z
diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py
@@ -53,6 +53,7 @@
 from prime_rl.utils.temp_scheduling import compute_temperature
 from prime_rl.utils.utils import (
     clean_exit,
+    count_chinese_chars,
     get_env_ids_to_install,
     install_env,
     resolve_latest_ckpt_step,
@@ -489,6 +490,17 @@ def process_rollout(rollout: vf.State) -> list[TrainingSample] | None:
         # Gather individual reward function metrics
         metrics_df = pd.DataFrame([rollout["metrics"] for rollout in train_rollouts])
 
+        # Count Chinese characters in completions
+        chinese_stats = []
+        for rollout in train_rollouts:
+            trajectory = rollout["trajectory"]
+            last_step = trajectory[-1]
+            tokens = last_step["tokens"]
+            completion_text = tokenizer.decode(tokens["completion_ids"])
+            chinese_count, total_count = count_chinese_chars(completion_text)
+            chinese_stats.append({"chinese_chars": chinese_count, "total_chars": total_count, "has_chinese": chinese_count > 0})
+        chinese_df = pd.DataFrame(chinese_stats)
+
         val_results_df = (
             pd.DataFrame(
                 {
@@ -568,6 +580,14 @@ def process_rollout(rollout: vf.State) -> list[TrainingSample] | None:
             },
             # Env metrics
             **{f"metrics/{metric}": metrics_df[metric].mean() for metric in metrics_df.columns},
+            # Chinese character metrics
+            "chinese/char_count": chinese_df.chinese_chars.sum(),
+            "chinese/char_ratio": (
+                chinese_df.chinese_chars.sum() / chinese_df.total_chars.sum()
+                if chinese_df.total_chars.sum() > 0
+                else 0.0
+            ),
+            "chinese/rollout_ratio": chinese_df.has_chinese.mean(),
             # Time metrics
             "time/step": step_time,
             "time/generate_completions": generate_completions_time,
diff --git a/src/prime_rl/utils/utils.py b/src/prime_rl/utils/utils.py
@@ -300,3 +300,28 @@ def get_env_ids_to_install(env_configs: list[EnvConfig] | list[EvalEnvConfig]) -
         if "/" in env_config.id:
             env_ids_to_install.add(env_config.id)
     return env_ids_to_install
+
+
+def is_chinese_char(char: str) -> bool:
+    """Return True if the single character ``char`` is a CJK ideograph (Unified, Ext. A-I, Compatibility)."""
+    code_point = ord(char)  # ord() raises TypeError unless char is exactly one character
+    # Adjacent Unicode blocks are merged into a single range check below.
+    return (
+        0x4E00 <= code_point <= 0x9FFF  # CJK Unified Ideographs
+        or 0x3400 <= code_point <= 0x4DBF  # CJK Extension A
+        or 0x20000 <= code_point <= 0x2A6DF  # CJK Extension B
+        or 0x2A700 <= code_point <= 0x2EE5F  # CJK Extensions C, D, E, F, I (contiguous)
+        or 0x30000 <= code_point <= 0x323AF  # CJK Extensions G, H (contiguous)
+        or 0xF900 <= code_point <= 0xFAFF  # CJK Compatibility Ideographs
+        or 0x2F800 <= code_point <= 0x2FA1F  # CJK Compatibility Ideographs Supplement
+    )
+
+
+def count_chinese_chars(text: str) -> tuple[int, int]:
+    """Tally the Chinese characters found in ``text``.
+
+    Returns:
+        A ``(chinese_char_count, total_char_count)`` pair; the total is ``len(text)``.
+    """
+    total_chars = len(text)
+    return sum(map(is_chinese_char, text)), total_chars