Skip to content

Commit 81a0560

Browse files
committed
feat(rl): add opt-in legacy length check and max duration config
1 parent 9405b53 commit 81a0560

File tree

5 files changed

+100
-5
lines changed

5 files changed

+100
-5
lines changed

README_RL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ Sample logging:
199199
- `rl.kl_eps`: add a small epsilon to the KL denominator for extra numerical stability (default: 0.0).
200200
- `rl.density_eps`: add a small epsilon to Gaussian density weighting for stability (default: 0.0).
201201
- `rl.align_kl_steps`: share the ODE skip mask between policy/ref rollouts for a less noisy KL (default: `false`).
202+
- `rl.max_duration`: Maximum allowed mel frames (default: 4096). Batches whose longest sample exceeds this are skipped to avoid truncation.
203+
- `rl.legacy_length_check`: If `true`, restores the legacy F5R filtering that skips batches whose longest text is longer than the longest mel sequence. Note that enabling it bypasses the `rl.max_duration` check. Default: `false`.
202204
- `wer_mode`: `char | word` (default: `char`, matching F5R).
203205
- `ref_source`: `text | audio` (default: `text`; set `audio` to match ASR-vs-ASR reward in F5R).
204206

src/f5_tts/configs/F5TTS_RL.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ rl:
6262
kl_eps: 0.0 # set > 0 for numerical stability in KL denominator
6363
density_eps: 0.0 # set > 0 for stability in Gaussian density weighting
6464
align_kl_steps: false # share ODE skip mask between policy/ref for stable KL
65+
max_duration: 4096
66+
legacy_length_check: False # Set to True to replicate the legacy F5R behavior of skipping batches where text_len > mel_len (note: bypasses max_duration)
6567
ref_model_ckpt: null
6668
ref_model_use_ema: True
6769
rewards:

src/f5_tts/rl/trainer_grpo.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ def __init__(
118118
kl_eps: float = 0.0,
119119
density_eps: float = 0.0,
120120
align_kl_steps: bool = False,
121+
max_duration: int = 4096,
122+
legacy_length_check: bool = False,
121123
):
122124
if accelerate_kwargs is None:
123125
accelerate_kwargs = {}
@@ -176,8 +178,8 @@ def __init__(
176178
"kl_eps": kl_eps,
177179
"density_eps": density_eps,
178180
"align_kl_steps": align_kl_steps,
179-
"reward_mode": reward_combiner.mode,
180-
"reward_weights": reward_combiner.weights,
181+
"max_duration": max_duration,
182+
"legacy_length_check": legacy_length_check,
181183
"reward_providers": reward_providers,
182184
}
183185
self.accelerator.init_trackers(
@@ -258,6 +260,10 @@ def __init__(
258260
self.kl_eps = kl_eps
259261
self.density_eps = density_eps
260262
self.align_kl_steps = align_kl_steps
263+
self.max_duration = max_duration
264+
self.legacy_length_check = legacy_length_check
265+
self.max_duration = max_duration
266+
self.legacy_length_check = legacy_length_check
261267

262268
self.noise_scheduler = noise_scheduler
263269
self.duration_predictor = duration_predictor
@@ -554,8 +560,12 @@ def train(self, train_dataset: Dataset, num_workers: int = 16, resumable_with_se
554560
text_inputs = batch["text"]
555561
mel_spec = batch["mel"].permute(0, 2, 1)
556562
mel_lengths = batch["mel_lengths"]
557-
text_len = max(len(item) for item in text_inputs)
558-
if text_len > max(mel_lengths):
563+
564+
if self.legacy_length_check:
565+
text_len = max(len(item) for item in text_inputs)
566+
if text_len > max(mel_lengths):
567+
continue
568+
elif max(mel_lengths) > self.max_duration:
559569
continue
560570

561571
dur_loss = None

src/f5_tts/train/train_rl.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ def main(model_cfg):
8181
allow_extra_keys=model_cfg.ckpts.get("allow_extra_keys", False),
8282
bnb_optimizer=model_cfg.optim.get("bnb_optimizer", False),
8383
prompt_length_mode=model_cfg.rl.get("prompt_length_mode", "min"),
84+
max_duration=model_cfg.rl.get("max_duration", 4096),
85+
legacy_length_check=model_cfg.rl.get("legacy_length_check", False),
8486
)
8587

8688
train_dataset = load_dataset(model_cfg.datasets.name, tokenizer, mel_spec_kwargs=model_cfg.model.mel_spec)

tests/test_rl_integration.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def test_grpo_single_step_updates_params(tmp_path):
166166
)
167167
trainer.train(DummyDataset(), num_workers=0)
168168
after = model.transformer.proj_out.weight.detach()
169-
assert not torch.equal(before, after)
169+
assert not torch.equal(before.to(after.device), after)
170170

171171

172172
def test_grpo_kl_eps_stability(tmp_path):
@@ -694,3 +694,82 @@ def test_audio_pack_metadata():
694694
line = metadata.read_text(encoding="utf-8").splitlines()[0]
695695
audio_name = json.loads(line).get("audio")
696696
assert (pack_dir / audio_name).exists()
697+
698+
699+
def test_grpo_length_checks(tmp_path):
    """Verify the batch-skipping rules for the legacy and max-duration length checks."""
    model = _make_cfm(output_dist="gaussian", objective="grpo")
    combiner = RewardCombiner([DummyRewardProvider()])

    # Trainer wired with throwaway config; max_duration is kept tiny (100)
    # so the duration guard is easy to trip in the assertions below.
    trainer = GRPOTrainer(
        model,
        reward_combiner=combiner,
        epochs=1,
        learning_rate=1e-3,
        num_warmup_updates=0,
        save_per_updates=1000,
        keep_last_n_checkpoints=0,
        checkpoint_path=str(tmp_path),
        batch_size_per_gpu=1,
        batch_size_type="sample",
        max_samples=1,
        grad_accumulation_steps=1,
        max_grad_norm=1.0,
        logger=None,
        mel_spec_type="vocos",
        vocoder=DummyVocoder(),
        repeat_count=1,
        mini_repeat_count=1,
        prompt_frac_range=(0.5, 0.5),
        steps=3,
        cfg_strength=1.0,
        sway_sampling_coef=None,
        max_duration=100,  # Small duration for testing
    )

    # Stub out accelerator/optimizer pieces so no real training step runs.
    trainer.accelerator.sync_gradients = True

    class _NoOpContext:
        # Minimal stand-in for the context manager returned by accumulate().
        def __enter__(self):
            return None

        def __exit__(self, *exc_info):
            return None

    trainer.accelerator.accumulate = lambda _model: _NoOpContext()
    trainer.optimizer.step = lambda: None
    trainer.optimizer.zero_grad = lambda: None

    # Mirrors the skip logic of the training loop, since running the full
    # loop end-to-end is impractical here. Returns True when the batch
    # would be processed, False when it would be skipped.
    def would_process(batch):
        longest_mel = max(batch["mel_lengths"])
        if trainer.legacy_length_check:
            longest_text = max(len(item) for item in batch["text"])
            if longest_text > longest_mel:
                return False
        elif longest_mel > trainer.max_duration:
            return False
        return True

    oversized_text_batch = {
        "text": ["very long text string that exceeds mel length"],
        "mel_lengths": torch.tensor([10]),
    }

    # Case 1: legacy mode on, text longer than mel -> batch is dropped.
    trainer.legacy_length_check = True
    assert would_process(oversized_text_batch) is False

    # Case 2: legacy mode off, the very same batch is kept.
    trainer.legacy_length_check = False
    assert would_process(oversized_text_batch) is True

    # Case 3: mel length beyond max_duration (100) -> batch is dropped.
    long_mel_batch = {
        "text": ["short"],
        "mel_lengths": torch.tensor([101]),
    }
    assert would_process(long_mel_batch) is False

0 commit comments

Comments
 (0)