Skip to content

Commit b52809f

Browse files
committed
* prepare the initial config files for exp pipeline
1 parent b8bd0ba commit b52809f

File tree

5 files changed

+155
-2
lines changed

5 files changed

+155
-2
lines changed

docs/sphinx_doc/source/tutorial/example_data_functionalities.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,6 @@ If you are familiar with Data-Juicer, you will realize that Data-Juicer provides
103103
# This is a Data-Juicer data processing recipe
104104
project_name: 'gsm-8k-difficulty'
105105
106-
export_path: '/path/to/the/result/processed-dataset.jsonl'
107-
108106
process:
109107
- llm_difficulty_score_filter:
110108
api_or_hf_model: "qwen2.5-72b-instruct" # use "qwen2.5-72b-instruct" to calculate the difficulty scores.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# GRPO on GSM8K dataset with Experience Pipeline
2+
3+
This example shows the usage of GRPO on the GSM8K dataset, with an experience pipeline to reshape the rewards of experiences during training.
4+
5+
For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_data_functionalities.md).
6+
7+
The config files are located in [`gsm8k.yaml`](gsm8k.yaml) and [`train_gsm8k.yaml`](train_gsm8k.yaml).
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# This is a Data-Juicer data processing recipe
2+
project_name: 'gsm-8k-experience-quality'
3+
4+
process:
5+
- llm_quality_score_filter:
6+
api_or_hf_model: "qwen2.5-32b-instruct" # use "qwen2.5-32b-instruct" to calculate the quality scores.
7+
input_keys: ["prompt", "response"] # set input_keys and field_names to the existing key names in gsm-8k. Here the quality scores are calculated according to both prompts and responses.
8+
field_names: ["prompt", "response"]
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
project: "Trinity-RFT-gsm8k-experience-pipeline"
2+
name: "qwen2.5-1.5B-gsm8k-experience-pipeline"
3+
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
4+
algorithm:
5+
algorithm_type: grpo
6+
repeat_times: 8
7+
data_processor:
8+
data_processor_url: 'http://127.0.0.1:5005/data_processor'
9+
# experience pipeline related
10+
experience_pipeline:
11+
# I/O buffers
12+
input_buffers:
13+
- name: gsm8k_exp_output
14+
storage_type: queue
15+
path: 'sqlite:///gsm8k_exp_output.db'
16+
output_buffer:
17+
name: reshaped_gsm8k_exp_input
18+
storage_type: queue
19+
path: 'sqlite:///reshaped_gsm8k_exp_input.db'
20+
# format mapping
21+
format:
22+
prompt_key: 'prompt'
23+
response_key: 'response'
24+
reward_key: 'reward'
25+
# data active iterator related
26+
dj_config_path: 'examples/grpo_gsm8k_experience_pipeline/dj_scoring_exp.yaml'
27+
clean_strategy: 'iterative'
28+
29+
model:
30+
model_path: /PATH/TO/MODEL/
31+
max_prompt_tokens: 256
32+
max_response_tokens: 1024
33+
cluster:
34+
node_num: 1
35+
gpu_per_node: 8
36+
buffer:
37+
total_epochs: 1
38+
batch_size: 96
39+
max_retry_times: 3
40+
max_retry_interval: 1
41+
explorer_input:
42+
taskset:
43+
name: gsm8k
44+
storage_type: file
45+
path: 'openai/gsm8k'
46+
subset_name: 'main'
47+
split: 'train'
48+
format:
49+
prompt_key: 'question'
50+
response_key: 'answer'
51+
rollout_args:
52+
temperature: 1.0
53+
eval_tasksets:
54+
- name: gsm8k-eval
55+
storage_type: file
56+
path: 'openai/gsm8k'
57+
subset_name: 'main'
58+
split: 'test'
59+
format:
60+
prompt_key: 'question'
61+
response_key: 'answer'
62+
default_workflow_type: 'math_workflow'
63+
explorer_output:
64+
name: gsm8k_exp_output
65+
storage_type: queue
66+
path: 'sqlite:///gsm8k_exp_output.db'
67+
trainer_input:
68+
experience_buffer:
69+
name: reshaped_gsm8k_exp_input
70+
storage_type: queue
71+
path: 'sqlite:///reshaped_gsm8k_exp_input.db'
72+
explorer:
73+
eval_interval: 50
74+
runner_num: 32
75+
rollout_model:
76+
engine_type: vllm_async
77+
engine_num: 2
78+
tensor_parallel_size: 1
79+
enable_prefix_caching: false
80+
enforce_eager: true
81+
dtype: bfloat16
82+
seed: 42
83+
synchronizer:
84+
sync_method: 'nccl'
85+
sync_interval: 1
86+
sync_timeout: 1200
87+
trainer:
88+
trainer_type: 'verl'
89+
trainer_config_path: 'examples/grpo_gsm8k_experience_pipeline/train_gsm8k.yaml'
90+
save_interval: 100
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
actor_rollout_ref:
2+
hybrid_engine: True
3+
model:
4+
external_lib: null
5+
override_config: { }
6+
enable_gradient_checkpointing: True
7+
use_remove_padding: True # False
8+
actor:
9+
strategy: fsdp # This is for backward-compatibility
10+
ppo_mini_batch_size: 128
11+
ppo_micro_batch_size_per_gpu: 4
12+
use_dynamic_bsz: True # False
13+
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
14+
grad_clip: 1.0
15+
ppo_epochs: 1
16+
shuffle: False
17+
ulysses_sequence_parallel_size: 1 # sp size
18+
optim:
19+
lr: 1e-5
20+
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
21+
# min_lr_ratio: null # only useful for warmup with cosine
22+
warmup_style: constant # select from constant/cosine
23+
total_training_steps: -1 # must be overridden by the program
24+
fsdp_config:
25+
wrap_policy:
26+
# transformer_layer_cls_to_wrap: None
27+
min_num_params: 0
28+
param_offload: False
29+
optimizer_offload: False
30+
fsdp_size: -1
31+
ref:
32+
fsdp_config:
33+
param_offload: False
34+
wrap_policy:
35+
# transformer_layer_cls_to_wrap: None
36+
min_num_params: 0
37+
log_prob_micro_batch_size_per_gpu: 16
38+
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
39+
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
40+
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
41+
42+
trainer:
43+
balance_batch: True
44+
# total_training_steps: null
45+
# auto: find the last ckpt to resume from; if none is found, start from scratch
46+
resume_mode: auto # options: auto, or resume_path to resume from a specified checkpoint path
47+
default_hdfs_dir: null
48+
remove_previous_ckpt_in_save: False
49+
del_local_ckpt_after_load: False
50+
val_before_train: False

0 commit comments

Comments
 (0)