Commit 5565309

Refactor Experiences to List[Experience]
1 parent d1d450c commit 5565309

9 files changed, +152 -129 lines changed

tests/trainer/trainer_test.py
Lines changed: 1 addition & 1 deletion

@@ -1325,7 +1325,7 @@ def tearDown(self):


 class TestTinkerTrainer(BaseTrainerCase):
-    @unittest.skip("Require tinker API key")
+    # @unittest.skip("Require tinker API key")
     def test_trainer(self):
         """Test GSM8K on tinker."""
         # test both mode

trinity/algorithm/sample_strategy/mix_sample_strategy.py
Lines changed: 17 additions & 20 deletions

@@ -8,7 +8,7 @@
 from trinity.algorithm.sample_strategy.utils import representative_sample
 from trinity.buffer import get_buffer_reader
 from trinity.common.config import BufferConfig
-from trinity.common.experience import CustomField, Experiences
+from trinity.common.experience import CustomField, Experience
 from trinity.utils.timer import Timer


@@ -53,7 +53,7 @@ def __init__(self, buffer_config: BufferConfig, **kwargs):
             expert_buffer_config,
         )

-    async def sample(self, step: int) -> Tuple[Experiences, Dict, List]:
+    async def sample(self, step: int) -> Tuple[List[Experience], Dict, List]:
         metrics = {}
         with Timer(metrics, "time/read_experience"):
             usual_exp_list = await self.usual_exp_buffer.read_async()
@@ -82,24 +82,21 @@ async def sample(self, step: int) -> Tuple[Experiences, Dict, List]:
         repr_samples = representative_sample(exp_list)

         self.set_model_version_metric(exp_list, metrics)
-        with Timer(metrics, "time/gather_experience"):
-            exps = Experiences.gather_experiences(
-                experiences=exp_list,
-                pad_token_id=self.pad_token_id,  # type: ignore [arg-type]
-                custom_fields=[
-                    CustomField(
-                        source_field="is_expert",
-                        destination_field="expert_mask",
-                        data_type=torch.bool,
-                    ),
-                    CustomField(
-                        source_field="step",
-                        destination_field="step",
-                        data_type=torch.int32,
-                    ),
-                ],
-            )  # type: ignore
-        return exps, metrics, repr_samples
+        custom_fields = [
+            CustomField(
+                source_field="is_expert",
+                destination_field="expert_mask",
+                data_type=torch.bool,
+            ),
+            CustomField(
+                source_field="step",
+                destination_field="step",
+                data_type=torch.int32,
+            ),
+        ]
+        for exp in exp_list:
+            exp.custom_fields = custom_fields
+        return exp_list, metrics, repr_samples

     @classmethod
     def default_args(cls) -> Dict:
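
Note: instead of padding and stacking everything into a single Experiences batch, the strategy now only attaches CustomField descriptors to each raw Experience and lets the trainer backend materialize tensors later. A minimal, self-contained sketch of that pattern follows; the CustomField and Experience classes below are simplified stand-ins mirroring only the fields used in this commit, not the real trinity implementations.

# A minimal, self-contained sketch of the per-Experience custom-field pattern.
# CustomField and Experience here are simplified stand-ins, not the real trinity classes.
from dataclasses import dataclass, field
from typing import Any, Dict, List

import torch


@dataclass
class CustomField:
    source_field: str       # key looked up in exp.info
    destination_field: str  # key written into the trainer's model_inputs
    data_type: torch.dtype


@dataclass
class Experience:
    info: Dict[str, Any]
    custom_fields: List[CustomField] = field(default_factory=list)


# The sample strategy only attaches the descriptors ...
custom_fields = [
    CustomField(source_field="is_expert", destination_field="expert_mask", data_type=torch.bool),
    CustomField(source_field="step", destination_field="step", data_type=torch.int32),
]
exp_list = [Experience(info={"is_expert": True, "step": 3})]
for exp in exp_list:
    exp.custom_fields = custom_fields

# ... and the trainer backend turns them into tensors per experience later on:
model_inputs = {
    cf.destination_field: torch.tensor(exp_list[0].info[cf.source_field], dtype=cf.data_type)
    for cf in exp_list[0].custom_fields
}
print(model_inputs)  # {'expert_mask': tensor(True), 'step': tensor(3, dtype=torch.int32)}

Deferring the tensor conversion to the trainer is also why the sampling path no longer needs pad_token_id (see the SampleStrategy change in the next file).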

trinity/algorithm/sample_strategy/sample_strategy.py
Lines changed: 8 additions & 12 deletions

@@ -4,15 +4,15 @@
 from trinity.algorithm.sample_strategy.utils import representative_sample
 from trinity.buffer import get_buffer_reader
 from trinity.common.config import BufferConfig
-from trinity.common.experience import Experience, Experiences
+from trinity.common.experience import Experience
 from trinity.utils.annotations import Deprecated
 from trinity.utils.monitor import gather_metrics
 from trinity.utils.timer import Timer


 class SampleStrategy(ABC):
     def __init__(self, buffer_config: BufferConfig, **kwargs) -> None:
-        self.pad_token_id = buffer_config.pad_token_id
+        pass

     def set_model_version_metric(self, exp_list: List[Experience], metrics: Dict):
         metric_list = [
@@ -23,14 +23,14 @@ def set_model_version_metric(self, exp_list: List[Experience], metrics: Dict):
         metrics.update(gather_metrics(metric_list, "sample"))

     @abstractmethod
-    async def sample(self, step: int) -> Tuple[Experiences, Dict, List]:
+    async def sample(self, step: int) -> Tuple[List[Experience], Dict, List]:
         """Sample data from buffer.

         Args:
             step (`int`): The step number of current step.

         Returns:
-            `Experiences`: The sampled Experiences data.
+            `List[Experience]`: The sampled List[Experience] data.
             `Dict`: Metrics for logging.
             `List`: Representative data for logging.
         """
@@ -54,15 +54,13 @@ def __init__(self, buffer_config: BufferConfig, **kwargs):
         super().__init__(buffer_config)
         self.exp_buffer = get_buffer_reader(buffer_config.trainer_input.experience_buffer)  # type: ignore[arg-type]

-    async def sample(self, step: int, **kwargs) -> Tuple[Experiences, Dict, List]:
+    async def sample(self, step: int, **kwargs) -> Tuple[List[Experience], Dict, List]:
         metrics = {}
         with Timer(metrics, "time/read_experience"):
             exp_list = await self.exp_buffer.read_async()
             repr_samples = representative_sample(exp_list)
         self.set_model_version_metric(exp_list, metrics)
-        with Timer(metrics, "time/gather_experience"):
-            exps = Experiences.gather_experiences(exp_list, self.pad_token_id)  # type: ignore
-        return exps, metrics, repr_samples
+        return exp_list, metrics, repr_samples

     @classmethod
     def default_args(cls) -> dict:
@@ -81,16 +79,14 @@ def __init__(self, buffer_config: BufferConfig, **kwargs):
         super().__init__(buffer_config)
         self.max_staleness = kwargs.get("max_staleness", float("inf"))

-    async def sample(self, step: int, **kwargs) -> Tuple[Experiences, Dict, List]:
+    async def sample(self, step: int, **kwargs) -> Tuple[List[Experience], Dict, List]:
         min_model_version = max(step - self.max_staleness, 0)
         metrics = {}
         with Timer(metrics, "time/read_experience"):
             exp_list = await self.exp_buffer.read_async(min_model_version=min_model_version)
             repr_samples = representative_sample(exp_list)
         self.set_model_version_metric(exp_list, metrics)
-        with Timer(metrics, "time/gather_experience"):
-            exps = Experiences.gather_experiences(exp_list, self.pad_token_id)  # type: ignore
-        return exps, metrics, repr_samples
+        return exp_list, metrics, repr_samples


 @Deprecated
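
Note: under the new contract every strategy returns the raw list of experiences plus metrics and representative samples; there is no batch-level gather step anymore. A toy strategy written against that shape is sketched below; FakeBufferReader and the Experience stand-in are hypothetical and only illustrate the return types, they are not trinity classes.

# Toy illustration of the new sample() contract returning a plain list of experiences.
import asyncio
from typing import Dict, List, Tuple


class Experience:  # stand-in: only the field used here
    def __init__(self, model_version: int) -> None:
        self.info = {"model_version": model_version}


class FakeBufferReader:  # stand-in for a trinity buffer reader
    async def read_async(self) -> List["Experience"]:
        return [Experience(model_version=7) for _ in range(4)]


class PassthroughSampleStrategy:
    """Reads one batch and returns it untouched; collation now happens in the trainer."""

    def __init__(self) -> None:
        self.exp_buffer = FakeBufferReader()

    async def sample(self, step: int) -> Tuple[List[Experience], Dict, List]:
        exp_list = await self.exp_buffer.read_async()
        metrics = {"sample/count": len(exp_list)}
        repr_samples: List = []
        return exp_list, metrics, repr_samples


exps, metrics, _ = asyncio.run(PassthroughSampleStrategy().sample(step=1))
print(len(exps), metrics)  # 4 {'sample/count': 4}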

trinity/common/experience.py
Lines changed: 4 additions & 0 deletions

@@ -136,6 +136,8 @@ class Experience:
     # for on-policy distillation
     teacher_logprobs: Optional[Tensor] = None  # [resp_length]

+    custom_fields: List[CustomField] = field(default_factory=list)
+
     def __init__(  # noqa: C901
         self,
         *,
@@ -161,6 +163,7 @@ def __init__(  # noqa: C901
         rejected_messages=None,
         multi_modal_inputs=None,
         teacher_logprobs=None,
+        custom_fields=None,
     ):
         if action_mask is not None:
             experience_type = "multi_turn"
@@ -250,6 +253,7 @@ def __init__(  # noqa: C901
             self.rejected = torch.tensor(self.rejected)
         if self.teacher_logprobs is not None and not isinstance(self.teacher_logprobs, Tensor):
             self.teacher_logprobs = torch.tensor(self.teacher_logprobs, dtype=torch.float32)
+        self.custom_fields = custom_fields or []

     def serialize(self) -> bytes:
         """Serialize the experience to bytes."""

trinity/trainer/tinker/utils.py
Lines changed: 38 additions & 37 deletions

@@ -4,7 +4,7 @@
 import torch
 from tinker import types

-from trinity.common.experience import Experiences
+from trinity.common.experience import Experience, split_dpo_experience_to_single_turn


 def pad_to_length(
@@ -23,60 +23,61 @@ def pad_to_length(


 def to_tinker_input(
-    experiences: Experiences, logger: Logger
+    experiences: List[Experience], logger: Logger
 ) -> Tuple[List[types.Datum], List[types.ModelInput], List[dict]]:
-    cumsum = torch.cumsum(experiences.attention_masks, dim=-1)
-    eos_mask_idx = cumsum.argmax(dim=-1)
-    prompt_length = experiences.prompt_length
+    assert len(experiences) > 0, "No experiences provided."
+    if experiences[0].experience_type == "dpo":
+        experiences = split_dpo_experience_to_single_turn(experiences)
+
     batch = []
     batch_input_tokens = []
     model_inputs_list = []
-    for i in range(len(experiences.tokens)):
-        tokens = experiences.tokens[i]
-        attention_mask = experiences.attention_masks[i]
-        response_mask = attention_mask[prompt_length:]
-        input_tokens = tokens[attention_mask].long()
-        exp_seq_length = sum(attention_mask)
-        exp_response_length = sum(response_mask)
+    for exp in experiences:
+        tokens = exp.tokens
+        input_tokens = tokens.long()
+        prompt_length = exp.prompt_length
+        total_length = len(tokens)  # type: ignore
+        response_length = total_length - prompt_length
         loss_fn_inputs = {
-            "weights": pad_to_length(
-                experiences.action_masks[i][response_mask].float(), exp_seq_length - 1  # type: ignore
+            "weights": torch.concat(
+                [
+                    torch.zeros(prompt_length - 1, dtype=torch.float32),
+                    exp.action_mask.float(),
+                ]
             ),
             "target_tokens": input_tokens.tolist()[1:],
         }
         model_inputs = {
-            "total_length": exp_seq_length,
-            "action_mask": experiences.action_masks[i][response_mask],  # type: ignore
+            "total_length": total_length,
+            "action_mask": exp.action_mask,
         }
-        if experiences.rewards is not None or experiences.token_level_rewards is not None:
-            assert experiences.logprobs is not None
-            if experiences.token_level_rewards is not None:
-                if experiences.rewards is not None:
+        if exp.reward is not None or exp.token_level_reward is not None:
+            assert exp.logprobs is not None
+            if exp.token_level_reward is not None:
+                if exp.reward is not None:
                     logger.warning(
-                        "Both experiences.rewards and experiences.token_level_rewards are provided. "
-                        "Using experiences.token_level_rewards."
+                        "Both exp.rewards and exp.token_level_rewards are provided. "
+                        "Using exp.token_level_rewards."
                     )
-                token_level_rewards = experiences.token_level_rewards[i][response_mask]
+                token_level_reward = exp.token_level_reward
             else:
-                token_level_rewards = torch.zeros(
-                    exp_response_length, dtype=experiences.rewards.dtype
-                )
-                token_level_rewards[eos_mask_idx[i] - prompt_length] = experiences.rewards[i]
+                token_level_reward = torch.zeros(response_length, dtype=torch.float32)
+                token_level_reward[-1] = exp.reward
             model_inputs.update(
                 {
-                    "token_level_scores": token_level_rewards,
-                    "old_logprob": experiences.logprobs[i][response_mask],  # type: ignore
+                    "token_level_scores": token_level_reward,
+                    "old_logprob": exp.logprobs,
                 }
             )
-        if experiences.advantages is not None:
-            model_inputs["advantages"] = experiences.advantages[i][response_mask]
-        if experiences.returns is not None:
-            model_inputs["returns"] = experiences.returns[i][response_mask]
+        for attr in ["advantages", "returns", "teacher_logprobs"]:
+            if getattr(exp, attr, None) is not None:
+                model_inputs[attr] = getattr(exp, attr)
         # TODO: if tinker support multi-modal input, we can add it here
-        if experiences.custom_fields:
-            for field in experiences.custom_fields:
-                if hasattr(experiences, field):
-                    model_inputs[field] = getattr(experiences, field)
+        for custom_field in exp.custom_fields:
+            model_inputs[custom_field.destination_field] = torch.tensor(
+                exp.info[custom_field.source_field],
+                dtype=custom_field.data_type,
+            )

         batch.append(
             types.Datum(
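
Note: in the per-experience path, the loss weights are built by prepending prompt_length - 1 zeros to the response action mask, so they line up with target_tokens = tokens[1:] after the usual next-token shift, and when only a sequence-level reward exists it is placed on the last response token. The toy example below works through just that arithmetic with made-up numbers; it is not the real Datum construction.

# Toy example of the per-experience weight / reward construction used above.
import torch

prompt_length = 4
tokens = torch.tensor([11, 12, 13, 14, 21, 22, 23])  # 4 prompt + 3 response tokens
action_mask = torch.ones(3)                           # train on all 3 response tokens
reward = 1.0

total_length = len(tokens)                      # 7
response_length = total_length - prompt_length  # 3

# Weights align with target_tokens = tokens[1:] (length total_length - 1 = 6):
# zeros over the prompt-side targets, the action mask over the response targets.
weights = torch.concat([
    torch.zeros(prompt_length - 1, dtype=torch.float32),  # 3 zeros
    action_mask.float(),                                   # 3 ones
])
target_tokens = tokens.tolist()[1:]
assert len(weights) == len(target_tokens) == total_length - 1

# With only a sequence-level reward, the scalar lands on the final response token.
token_level_reward = torch.zeros(response_length, dtype=torch.float32)
token_level_reward[-1] = reward
print(weights.tolist())             # [0.0, 0.0, 0.0, 1.0, 1.0, 1.0]
print(token_level_reward.tolist())  # [0.0, 0.0, 1.0]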

trinity/trainer/tinker_trainer.py
Lines changed: 4 additions & 4 deletions

@@ -1,5 +1,5 @@
 import os
-from typing import Dict
+from typing import Dict, List

 import ray
 import tinker
@@ -16,7 +16,7 @@
 from trinity.algorithm.policy_loss_fn import POLICY_LOSS_FN
 from trinity.algorithm.utils import prefix_metrics
 from trinity.common.config import Config
-from trinity.common.experience import Experiences
+from trinity.common.experience import Experience
 from trinity.manager.synchronizer import Synchronizer
 from trinity.trainer.tinker.utils import (
     compute_data_metrics,
@@ -196,11 +196,11 @@ def _loss_func(
         avg_metrics = {k: sum(v) / len(v) for k, v in metrics.items()}
         return total_loss, avg_metrics

-    async def train_step(self, batch_exps: Experiences) -> Dict:
+    async def train_step(self, batch_exps: List[Experience]) -> Dict:
         """Training one step.

         Args:
-            batch (Experiences): A batch of experiences to train.
+            batch (List[Experience]): A batch of experiences to train.

         Returns:
             Dict: Metrics of the training step.

trinity/trainer/trainer.py
Lines changed: 7 additions & 7 deletions

@@ -17,7 +17,7 @@
 from trinity.algorithm.sample_strategy.sample_strategy import SampleStrategy
 from trinity.common.config import Config
 from trinity.common.constants import RunningStatus, SyncMethod, SyncStyle
-from trinity.common.experience import Experiences
+from trinity.common.experience import Experience
 from trinity.manager.state_manager import StateManager
 from trinity.manager.synchronizer import Synchronizer
 from trinity.utils.log import get_logger
@@ -108,7 +108,7 @@ async def train(self) -> str:
         self.logger.info("--------------------\n> Trainer finished.\n--------------------")
         return self.config.trainer.name

-    async def train_step(self, exps: Experiences) -> Dict:
+    async def train_step(self, exps: List[Experience]) -> Dict:
         """Train one step.

         Returns:
@@ -123,16 +123,16 @@ async def train_step(self, exps: Experiences) -> Dict:
         metrics.update(train_metrics)
         return metrics

-    async def _sample_data(self) -> Tuple[Experiences, Dict, List[Dict]]:
+    async def _sample_data(self) -> Tuple[List[Experience], Dict, List[Dict]]:
         """Sample a batch of experiences.

         Returns:
-            Experiences: A batch of experiences.
+            List[Experience]: A batch of experiences.
             Dict: Metrics of the sampling step.
             List[Dict]: A list of representative samples for logging.
         """
         batch, metrics, repr_samples = await self.sample_strategy.sample(self.train_step_num + 1)
-        metrics["sample/task_count"] = len(set(eid.tid for eid in batch.eids))
+        metrics["sample/task_count"] = len(set(exp.eid.tid for exp in batch))
         return batch, metrics, repr_samples

     async def need_sync(self) -> bool:
@@ -239,11 +239,11 @@ def train_step_num(self) -> int:
         """Get the current training step number."""

     @abstractmethod
-    async def train_step(self, batch_exps: Experiences) -> Dict:
+    async def train_step(self, batch_exps: List[Experience]) -> Dict:
         """Training one step.

         Args:
-            batch_exps (Experiences): A batch of experiences to train.
+            batch_exps (List[Experience]): A batch of experiences to train.

         Returns:
             Dict: Metrics of the training step.
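
Note: with a plain list, batch-level metrics are now computed by iterating the experiences directly; the task count, for instance, counts distinct task ids carried on each experience id (exp.eid.tid). A stand-in illustration follows; the EID and Experience classes are simplified for the example and only mirror the attribute access in the diff.

# Stand-in illustration of the task-count metric over a List[Experience].
from dataclasses import dataclass


@dataclass
class EID:           # hypothetical stand-in exposing only .tid (task id)
    tid: str
    rid: int = 0     # e.g. repeat/rollout index within a task


@dataclass
class Experience:    # stand-in: only the field the metric needs
    eid: EID


# 4 experiences drawn from 2 distinct tasks
batch = [Experience(EID("task-a", r)) for r in range(3)] + [Experience(EID("task-b"))]
task_count = len(set(exp.eid.tid for exp in batch))
print(task_count)  # 2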
