feat: add data shuffle and random seed option (#334)

ZhiyuLi-Nvidia · web-flow · commit c784dd999b54 · 2025-08-05T17:47:30.000Z
Signed-off-by: Zhiyu Li <zhiyul@nvidia.com>
Signed-off-by: Zhiyu Li <zhiyul@NVIDIA.com>
diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml
@@ -150,6 +150,7 @@ policy:
 data:
   dataset_name: "HelpSteer3"
   max_input_seq_length: ${policy.max_total_sequence_length}
+  shuffle: true
 logger:
   log_dir: "logs"  # Base directory for all logs
   wandb_enabled: false # Make sure you do a ``wandb login [Your API key]'' before running
diff --git a/examples/configs/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/grpo-deepscaler-1.5b-8K.yaml
@@ -10,6 +10,7 @@ grpo:
   val_at_start: false
   max_val_samples: 480
   val_batch_size: 32
+  seed: 42
 
 loss_fn:
   reference_policy_kl_penalty: 0.0
@@ -118,6 +119,7 @@ data:
   prompt_file: "examples/prompts/cot.txt"
   system_prompt_file: null
   dataset_name: "DeepScaler"
+  shuffle: true
 
 env:
   math:
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
@@ -10,6 +10,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 
 loss_fn:
   reference_policy_kl_penalty: 0.01
@@ -127,6 +128,7 @@ data:
   prompt_file: "examples/prompts/cot.txt"
   system_prompt_file: null
   dataset_name: "OpenMathInstruct-2"
+  shuffle: true
 
 env:
   math:
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
@@ -146,6 +146,7 @@ data:
   prompt_file: "examples/prompts/cot.txt"
   system_prompt_file: null
   dataset_name: "OpenMathInstruct-2"
+  shuffle: true
 
 env:
   math:
diff --git a/examples/configs/grpo_sliding_puzzle.yaml b/examples/configs/grpo_sliding_puzzle.yaml
@@ -44,6 +44,7 @@ policy:
 
 data:
   add_system_prompt: false
+  shuffle: false  # disable dataloader shuffle, shuffle is handled within the dataset
 
 env:
   sliding_puzzle_game:
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml
@@ -73,6 +73,7 @@ policy:
 data:
   dataset_name: "HelpSteer3"
   max_input_seq_length: ${policy.max_total_sequence_length}
+  shuffle: true
 
 logger:
   log_dir: "logs"
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
@@ -73,6 +73,7 @@ policy:
 data:
   dataset_name: "HelpSteer3"
   max_input_seq_length: ${policy.max_total_sequence_length}
+  shuffle: true
 
 logger:
   log_dir: "logs"
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml
@@ -106,6 +106,7 @@ policy:
 data:
   dataset_name: "HelpSteer3"
   max_input_seq_length: ${policy.max_total_sequence_length}
+  shuffle: true
 
 logger:
   log_dir: "logs"
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
@@ -106,6 +106,7 @@ policy:
 data:
   dataset_name: "HelpSteer3"
   max_input_seq_length: ${policy.max_total_sequence_length}
+  shuffle: true
 
 logger:
   log_dir: "logs"
diff --git a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml
@@ -74,6 +74,8 @@ policy:
 data:
   dataset_name: "HelpSteer3"
   max_input_seq_length: ${policy.max_total_sequence_length}
+  shuffle: true
+
 logger:
   log_dir: "logs"
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -104,6 +105,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
@@ -9,6 +9,7 @@ grpo:
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
+  seed: 42
 loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
@@ -105,6 +106,7 @@ data:
   prompt_file: examples/prompts/cot.txt
   system_prompt_file: null
   dataset_name: OpenMathInstruct-2
+  shuffle: true
 env:
   math:
     num_workers: 8
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml
@@ -57,6 +57,7 @@ data:
   add_bos: true
   add_eos: true
   add_generation_prompt: false
+  shuffle: true
 logger:
   log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml
@@ -57,6 +57,7 @@ data:
   add_bos: true
   add_eos: true
   add_generation_prompt: false
+  shuffle: true
 logger:
   log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml
@@ -101,6 +101,7 @@ data:
   dataset_name: squad
   add_bos: true
   add_eos: true
+  shuffle: true
 logger:
   log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp1
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml
@@ -57,6 +57,7 @@ data:
   add_bos: true
   add_eos: true
   add_generation_prompt: false
+  shuffle: true
 logger:
   log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml
@@ -57,6 +57,7 @@ data:
   add_bos: true
   add_eos: true
   add_generation_prompt: false
+  shuffle: true
 logger:
   log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt
   wandb_enabled: true
diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml
@@ -125,6 +125,7 @@ policy:
 data:
   max_input_seq_length: ${policy.max_total_sequence_length}
   dataset_name: "HelpSteer3"
+  shuffle: true
 
 logger:
   log_dir: "logs"  # Base directory for all logs
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
@@ -132,6 +132,7 @@ data:
   add_bos: true
   add_eos: true
   add_generation_prompt: false
+  shuffle: true
 
 logger:
   log_dir: "logs"  # Base directory for all logs
diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml
@@ -67,6 +67,7 @@ data:
   add_eos: true
   add_generation_prompt: true
   output_key: 'generated_solution'
+  shuffle: true
 
 logger:
   log_dir: "logs"  # Base directory for all logs
diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py
@@ -124,6 +124,7 @@ def setup_data(
     tokenizer: TokenizerType,
     data_config: DataConfig,
     env_configs: dict[str, Any],
+    seed: int,
 ) -> tuple[
     AllTaskProcessedDataset,
     Optional[AllTaskProcessedDataset],
@@ -140,12 +141,12 @@ def setup_data(
     # Load OpenMathInstruct2Dataset using nemo rl datasets
     if data_config["dataset_name"] == "OpenMathInstruct-2":
         print("Loading nvidia/OpenMathInstruct2Dataset for training and validation")
-        data: Any = OpenMathInstruct2Dataset()
+        data: Any = OpenMathInstruct2Dataset(seed=seed)
     elif data_config["dataset_name"] == "DeepScaler":
         print(
             "Loading agentica-org/DeepScaleR-Preview-Dataset for training and validation"
         )
-        data: Any = DeepScalerDataset()
+        data: Any = DeepScalerDataset(seed=seed)
     else:
         raise ValueError(f"No processor for dataset {data_config['dataset_name']}.")
 
@@ -236,7 +237,7 @@ def main() -> None:
         val_dataset,
         task_to_env,
         val_task_to_env,
-    ) = setup_data(tokenizer, config["data"], config["env"])
+    ) = setup_data(tokenizer, config["data"], config["env"], config["grpo"]["seed"])
 
     (
         policy,
diff --git a/examples/run_grpo_sliding_puzzle.py b/examples/run_grpo_sliding_puzzle.py
@@ -24,7 +24,7 @@
 from transformers import AutoTokenizer
 
 from nemo_rl.algorithms.grpo import MasterConfig, grpo_train, setup
-from nemo_rl.algorithms.utils import get_tokenizer
+from nemo_rl.algorithms.utils import get_tokenizer, set_seed
 from nemo_rl.data.interfaces import DatumSpec, LLMMessageLogType
 from nemo_rl.distributed.virtual_cluster import init_ray
 from nemo_rl.environments.games.sliding_puzzle import (
@@ -223,6 +223,8 @@ def main():
 
     init_ray()
 
+    set_seed(config["grpo"]["seed"])
+
     # setup tokenizer
     tokenizer = get_tokenizer(config["policy"]["tokenizer"])
     config["policy"]["generation"] = configure_generation_config(
diff --git a/examples/run_sft.py b/examples/run_sft.py
@@ -95,7 +95,9 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
     print("\n▶ Setting up data...")
     data_cls = data_config["dataset_name"]
     if data_cls == "open_assistant":
-        data = hf_datasets.OasstDataset(output_dir="/tmp/open_assistant")
+        data = hf_datasets.OasstDataset(
+            output_dir="/tmp/open_assistant", seed=data_config["seed"]
+        )
     elif data_cls == "squad":
         data = hf_datasets.SquadDataset()
     elif data_cls == "prompt_response_dataset":
@@ -110,6 +112,7 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
             split=data_config["split"],
             output_key=data_config["output_key"],
             prompt_file=data_config["prompt_file"],
+            seed=data_config["seed"],
         )
     elif data_cls == "openai_format":
         data = hf_datasets.OpenAIFormatDataset(
diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py
@@ -152,7 +152,7 @@ def setup(
     train_dataloader = StatefulDataLoader(
         train_dataset,
         batch_size=policy_config["train_global_batch_size"],
-        shuffle=True,
+        shuffle=data_config["shuffle"],
         collate_fn=partial(
             dpo_collate_fn,
             tokenizer=tokenizer,
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
diff --git a/nemo_rl/algorithms/rm.py b/nemo_rl/algorithms/rm.py
diff --git a/nemo_rl/algorithms/sft.py b/nemo_rl/algorithms/sft.py
diff --git a/nemo_rl/data/__init__.py b/nemo_rl/data/__init__.py
diff --git a/nemo_rl/data/hf_datasets/oasst.py b/nemo_rl/data/hf_datasets/oasst.py
diff --git a/tests/unit/data/test_data_shuffle_reproducity.py b/tests/unit/data/test_data_shuffle_reproducity.py