Skip to content

Commit 1430035

Browse files
committed
+ add basic reward shaping func
1 parent b52809f commit 1430035

File tree

5 files changed

+106
-23
lines changed

5 files changed

+106
-23
lines changed

trinity/cli/launcher.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import ray
1010

1111
from trinity.common.config import Config, DataPipelineConfig, load_config
12-
from trinity.common.constants import EXPLORER_NAME, TRAINER_NAME
12+
from trinity.common.constants import EXPLORER_NAME, TRAINER_NAME, DataProcessorPipelineType
1313
from trinity.explorer.explorer import Explorer
1414
from trinity.trainer.trainer import Trainer
1515
from trinity.utils.log import get_logger
@@ -126,14 +126,14 @@ def activate_data_module(data_processor_url: str, config_path: str):
126126
return
127127

128128

129-
def validate_data_pipeline(data_pipeline_config: DataPipelineConfig, pipeline_type: str):
129+
def validate_data_pipeline(data_pipeline_config: DataPipelineConfig, pipeline_type: DataProcessorPipelineType):
130130
"""
131131
Check if the data pipeline is valid. The config should:
132132
1. Non-empty input buffer
133133
2. Different input/output buffers
134134
135135
:param data_pipeline_config: the input data pipeline to be validated.
136-
:param pipeline_type: the type of pipeline, should be one of ["task", "experience"]
136+
:param pipeline_type: the type of pipeline, should be one of DataProcessorPipelineType
137137
"""
138138
input_buffers = data_pipeline_config.input_buffers
139139
output_buffer = data_pipeline_config.output_buffer
@@ -147,7 +147,7 @@ def validate_data_pipeline(data_pipeline_config: DataPipelineConfig, pipeline_ty
147147
if output_buffer.name in input_buffer_names:
148148
logger.warning("Output buffer exists in input buffers. Won't activate it.")
149149
return False
150-
if pipeline_type == "task":
150+
if pipeline_type == DataProcessorPipelineType.TASK:
151151
# task pipeline specific
152152
# "raw" field should be True for task pipeline because the data source must be raw data files
153153
for buffer in input_buffers:
@@ -156,12 +156,13 @@ def validate_data_pipeline(data_pipeline_config: DataPipelineConfig, pipeline_ty
156156
'Input buffers should be raw data files for task pipeline ("raw" field should be True). Won\'t activate it.'
157157
)
158158
return False
159-
elif pipeline_type == "experience":
159+
elif pipeline_type == DataProcessorPipelineType.EXPERIENCE:
160160
# experience pipeline specific
161-
raise NotImplementedError("experience_pipeline is not implemented yet.")
161+
# No special items need to be checked.
162+
pass
162163
else:
163164
logger.warning(
164-
f'Invalid pipeline type: {pipeline_type}. Should be one of ["task", "experience"].'
165+
f'Invalid pipeline type: {pipeline_type}..'
165166
)
166167
return False
167168
return True
@@ -177,19 +178,19 @@ def run(config_path: str, dlc: bool = False, plugin_dir: str = None):
177178
if (
178179
data_processor_config.data_processor_url
179180
and data_processor_config.task_pipeline
180-
and validate_data_pipeline(data_processor_config.task_pipeline, "task")
181+
and validate_data_pipeline(data_processor_config.task_pipeline, DataProcessorPipelineType.TASK)
181182
):
182183
activate_data_module(
183-
f"{data_processor_config.data_processor_url}/task_pipeline", config_path
184+
f"{data_processor_config.data_processor_url}/{DataProcessorPipelineType.TASK.value}", config_path
184185
)
185186
# try to activate experience pipeline for experiences
186187
if (
187188
data_processor_config.data_processor_url
188189
and data_processor_config.experience_pipeline
189-
and validate_data_pipeline(data_processor_config.experience_pipeline, "experience")
190+
and validate_data_pipeline(data_processor_config.experience_pipeline, DataProcessorPipelineType.EXPERIENCE)
190191
):
191192
activate_data_module(
192-
f"{data_processor_config.data_processor_url}/experience_pipeline", config_path
193+
f"{data_processor_config.data_processor_url}/{DataProcessorPipelineType.EXPERIENCE.value}", config_path
193194
)
194195
ray_namespace = config.ray_namespace
195196
if dlc:

trinity/common/config.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
StorageType,
1313
SyncMethod,
1414
TaskType,
15+
OpType
1516
)
1617
from trinity.utils.log import get_logger
1718

@@ -100,6 +101,13 @@ class StorageConfig:
100101
# ! DO NOT SET, automatically set corresponding to train/eval
101102
task_type: TaskType = TaskType.EXPLORE
102103

104+
@dataclass
class RewardShapingConfig:
    """Config for one reward shaping operation (experience pipeline only).

    The shaped reward is derived from an existing stats value, e.g. for
    ``OpType.ADD``: ``reward += weight * stats[stats_key]``.
    """

    # key in the sample's stats dict whose value drives the shaping
    stats_key: str = ""
    # arithmetic operator combining the current reward with the weighted stats value
    op_type: OpType = OpType.ADD
    # scaling factor applied to the stats value before combining
    weight: float = 1.0
103111

104112
@dataclass
105113
class DataPipelineConfig:
@@ -125,6 +133,9 @@ class DataPipelineConfig:
125133
priority_weights: Optional[Dict[str, float]] = None
126134
data_dist: Optional[str] = "gaussian" # one of ["gaussian", "uniform"]
127135

136+
# reward shaping related, only available for experience pipeline
137+
reward_shaping: Optional[List[RewardShapingConfig]] = field(default_factory=list)
138+
128139

129140
@dataclass
130141
class DataProcessorConfig:

trinity/common/constants.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,17 @@ class RunningStatus(Enum):
103103
RUNNING = "running"
104104
WAITING_SYNC = "waiting_sync"
105105
STOPPED = "stopped"
106+
107+
class DataProcessorPipelineType(Enum):
    """Data processor pipeline type.

    Each member's value doubles as the pipeline's URL path segment on the
    data processor service (e.g. ``{data_processor_url}/task_pipeline``).
    """

    # processes rollout experiences (reward shaping etc.)
    EXPERIENCE = "experience_pipeline"
    # prepares tasks from raw data files
    TASK = "task_pipeline"
112+
113+
class OpType(Enum):
    """Arithmetic operator type for reward shaping.

    Selects how a weighted stats value is combined with the existing
    reward (add / subtract / multiply / divide).
    """

    ADD = "add"
    SUB = "sub"
    MUL = "mul"
    DIV = "div"

trinity/data/controllers/active_iterator.py

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import os
22
import traceback
33
from numbers import Number
4-
from typing import Any, Dict, List
4+
from typing import Any, Dict, List, Optional
5+
from functools import partial
6+
from data_juicer.utils.constant import Fields
57

68
import ray
79

8-
from trinity.common.config import BufferConfig, DataPipelineConfig
10+
from trinity.common.config import BufferConfig, DataPipelineConfig, RewardShapingConfig
11+
from trinity.common.constants import DataProcessorPipelineType, OpType
912
from trinity.data.controllers.default_ops import DIMENSION_STATS_KEYS
1013
from trinity.data.controllers.task_parser import DataTaskParser
1114
from trinity.data.core.dataset import RftDataset
@@ -23,9 +26,24 @@ def __init__(
2326
self,
2427
config: DataPipelineConfig,
2528
buffer_config: BufferConfig,
29+
pipeline_type: Optional[DataProcessorPipelineType, str] = DataProcessorPipelineType.TASK,
2630
):
31+
"""
32+
The initialization method.
33+
34+
:param config: the data pipeline config.
35+
:param buffer_config: the buffer config.
36+
:param pipeline_type: the type of the activated pipeline.
37+
"""
2738
self.config = config
2839
self.buffer_config = buffer_config
40+
self.pipeline_type = pipeline_type
41+
if self.pipeline_type is None:
42+
self.pipeline_type = DataProcessorPipelineType.TASK
43+
if isinstance(self.pipeline_type, str):
44+
self.pipeline_type = DataProcessorPipelineType(pipeline_type)
45+
46+
# check if the llm agent is required
2947
if self.config.agent_model_name is not None and self.config.agent_model_config is not None:
3048
# get the api key
3149
api_key = os.environ.get("OPENAI_API_KEY")
@@ -42,6 +60,8 @@ def __init__(
4260
)
4361
else:
4462
self.llm_agent = None
63+
64+
# init task parser
4565
self.task_parser = DataTaskParser(config, self.llm_agent)
4666

4767
# Priority weights
@@ -153,34 +173,42 @@ def run(self):
153173
traceback.print_exc()
154174
return 6, "Grouping and computing priority score failed."
155175

156-
# step 7. track lineage if they are changed
176+
# step 7. reward shaping. Only available for experience pipeline and the reward shaping config is set
177+
try:
178+
if self.pipeline_type == DataProcessorPipelineType.EXPERIENCE and len(self.config.reward_shaping) > 0:
179+
reshaped_dataset = self._reward_shaping(scored_dataset)
180+
else:
181+
reshaped_dataset = scored_dataset
182+
except Exception:
183+
traceback.print_exc()
184+
return 7, "Reward shaping failed."
185+
186+
# step 8. track lineage if they are changed
157187
try:
158-
res_dataset = scored_dataset
188+
res_dataset = reshaped_dataset
159189
except Exception:
160190
traceback.print_exc()
161-
return 7, "Tracking lineage failed."
191+
return 8, "Tracking lineage failed."
162192

163-
# step 8
193+
# step 9, sort the dataset by the computed priority
164194
try:
165195
if "priority" in res_dataset.data.features:
166196
res_dataset.sort_by("priority", reverse=True)
167197
except Exception:
168198
traceback.print_exc()
169-
return 8, "Sorting results by priority failed."
199+
return 9, "Sorting results by priority failed."
170200

171-
# step 9. sort and export the result to the output buffer
201+
# step 10. sort and export the result to the output buffer
172202
try:
173203
res_dataset.write_to_buffer()
174204
except Exception:
175205
traceback.print_exc()
176-
return 9, "Exporting result to output buffer failed."
206+
return 10, "Exporting result to output buffer failed."
177207

178208
return 0, "success"
179209

180210
def _group_scores(self, dataset: RftDataset) -> RftDataset:
181211
# for perplexity, normalize them with the max value.
182-
from data_juicer.utils.constant import Fields
183-
184212
stats_min_max = {}
185213
for stats in dataset.data.features[Fields.stats]:
186214
all_stats = [
@@ -268,6 +296,35 @@ def _compute_priority_scores(self, dataset: RftDataset) -> RftDataset:
268296
dataset.data = dataset.data.map(self._compute_combined_score)
269297
return dataset
270298

299+
def _reward_shaping_single(self, sample, reward_shaping_config: RewardShapingConfig):
    """Apply one reward shaping operation to a single sample.

    Combines the sample's existing reward with a weighted stats value:
    ``reward <op>= weight * stats[stats_key]``.

    :param sample: a dataset sample carrying a stats dict (``Fields.stats``)
        and a reward field (``self.config.format.reward_key``).
    :param reward_shaping_config: the stats key, operator, and weight to use.
    :return: the sample, possibly with its reward modified in place.
    """
    tgt_stats = reward_shaping_config.stats_key
    op_type = reward_shaping_config.op_type
    # if the target stats does not exist, skip this stats and return the original sample
    if tgt_stats not in sample[Fields.stats]:
        return sample
    reward_key = self.config.format.reward_key
    operand = reward_shaping_config.weight * sample[Fields.stats][tgt_stats]
    if op_type == OpType.ADD:
        sample[reward_key] += operand
    elif op_type == OpType.MUL:
        sample[reward_key] *= operand
    elif op_type == OpType.SUB:
        sample[reward_key] -= operand
    elif op_type == OpType.DIV:
        # a zero weight*stats product would raise ZeroDivisionError inside
        # dataset.map and abort the whole reward shaping step; skip instead
        if operand == 0:
            return sample
        sample[reward_key] /= operand
    # unknown op types leave the reward untouched
    return sample
314+
315+
def _reward_shaping(self, rft_dataset: RftDataset) -> RftDataset:
    """Reshape the reward column of the dataset per the configured ops.

    :param rft_dataset: dataset whose samples carry stats and a reward field.
    :return: the same dataset object with rewards reshaped, or unchanged
        when it has no reward column.
    """
    # nothing to do when the dataset carries no reward column
    if self.config.format.reward_key not in rft_dataset.data.features:
        return rft_dataset
    shaped = rft_dataset.data
    # apply each configured shaping op in order across the whole dataset
    for shaping_cfg in self.config.reward_shaping:
        shaped = shaped.map(partial(self._reward_shaping_single, reward_shaping_config=shaping_cfg))
    rft_dataset.data = shaped
    return rft_dataset
327+
271328
@ray.method(num_returns=1)
272329
def select_batch(self, dataset: RftDataset, batch_size: int) -> List[Dict[str, Any]]:
273330
"""Select a batch of samples for training"""

trinity/data/server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def data_processor(pipeline_type):
3333
}
3434
)
3535

36-
iterator = DataActiveIterator(pipeline_config, config.buffer)
36+
iterator = DataActiveIterator(pipeline_config, config.buffer, pipeline_type=pipeline_type)
3737
ret, msg = iterator.run()
3838
return jsonify({"return_code": ret, "message": msg})
3939

0 commit comments

Comments (0)