Skip to content

Commit 7205eb2

Browse files
authored
Fix: interface change for sampling weights (#156)
* fix: refactor sampling weight — Signed-off-by: Mehant Kammakomati <[email protected]>
* fix: refactor sampling weight — Signed-off-by: Mehant Kammakomati <[email protected]>
* fix: refactor sampling weight — Signed-off-by: Mehant Kammakomati <[email protected]>

---------

Signed-off-by: Mehant Kammakomati <[email protected]>
1 parent 30c2c55 commit 7205eb2

File tree

2 files changed

+48
-28
lines changed

2 files changed

+48
-28
lines changed

plugins/online-data-mixing/src/fms_acceleration_odm/odm/dataset.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Standard
22
from logging import getLogger
3-
from typing import List, Optional
3+
from typing import Optional
44
import json
55
import math
66
import os
@@ -27,7 +27,7 @@ def __init__(
2727
collators_dict: dict,
2828
eval_dataset_dict: DatasetDict,
2929
eval_collators_dict: dict,
30-
sampling_weights: Optional[List[float]] = None,
30+
sampling_weights: Optional[dict] = None,
3131
gamma: float = 0.1,
3232
eta: float = 0.3,
3333
sampling_interval: int = 1,
@@ -51,7 +51,7 @@ def __init__(
5151
eval datasets.
5252
eval_collators_dict (dict): collator corresponding to each dataset
5353
used while constructing torch dataloader.
54-
sampling_weights (Optional[List[float]], optional): Initial
54+
sampling_weights (Optional[dict], optional): Initial
5555
set of sampling weights to start with. Defaults to equal weightage.
5656
gamma (float, optional): MAB hyperparameter. Defaults to 0.1.
5757
eta (float, optional): MAB hyperparameter. Defaults to 0.3.
@@ -123,9 +123,13 @@ def __init__(
123123
# are equally important. Weights based on the size of the datasets
124124
# and other such heuristics should be computed outside and passed
125125
# through sampling_weights while initializing this class.
126-
if sampling_weights is None:
127-
sampling_weights = [1] * self.total_categories
128-
self.sampling_weights = torch.tensor(sampling_weights, dtype=torch.float64)
126+
if not sampling_weights:
127+
self.sampling_weights = [1] * self.total_categories
128+
else:
129+
self.sampling_weights = []
130+
for cat in self.category_list:
131+
self.sampling_weights.append(sampling_weights[cat])
132+
self.sampling_weights = torch.tensor(self.sampling_weights, dtype=torch.float64)
129133
self.sampling_ratio = []
130134
self._update_sampling_ratio(self.sampling_weights)
131135

plugins/online-data-mixing/tests/test_online_data.py

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,43 @@
1313
# limitations under the License.
1414

1515
# Third Party
16+
from torch.utils.data import IterableDataset
17+
1618
# pylint: disable=import-error
1719
import pytest
1820
import torch
1921

2022
# First Party
2123
from fms_acceleration_odm import OnlineMixingDataset, Reward
2224

25+
26+
class SampleDataset(IterableDataset):
27+
def __init__(self, seq_length, vocab_size):
28+
self.seq_length = seq_length
29+
self.vocab_size = vocab_size
30+
31+
def __len__(self):
32+
pass
33+
34+
def __iter__(self):
35+
return self
36+
37+
def __next__(self):
38+
input_ids = torch.rand(self.seq_length)
39+
return {
40+
"input_ids": input_ids,
41+
"attention_mask": torch.ones(self.seq_length),
42+
"labels": input_ids,
43+
}
44+
45+
46+
def get_dataset(seq_len, vocab_size):
47+
return SampleDataset(seq_length=seq_len, vocab_size=vocab_size)
48+
49+
2350
PARAMETERS = [
2451
(
25-
[1, 100, 2],
52+
{"data_1": 1, "data_2": 100, "data_3": 2},
2653
[[1, 100, 1], [1, 200, 1], [1, 100, 1], [1, 1, 1000], [1, 1, 2000]],
2754
5,
2855
[1, 1, 1, 2, 2],
@@ -41,29 +68,18 @@ def test_online_data_mix_learning(
4168
batch_size = 100
4269
seq_length = 6
4370
vocab_size = 50
44-
input_ids = (
45-
torch.arange(batch_size * seq_length).reshape(batch_size, seq_length)
46-
% vocab_size
47-
)
48-
attention_mask = torch.tensor(
49-
[[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1]]
50-
)
51-
labels = input_ids
52-
train_data = {
53-
"input_ids": input_ids,
54-
"labels": labels,
55-
"attention_mask": attention_mask,
56-
}
57-
eval_data = {
58-
"input_ids": input_ids,
59-
"labels": labels,
60-
"attention_mask": attention_mask,
71+
72+
train_data_dict = {
73+
"data_1": get_dataset(seq_len=seq_length, vocab_size=vocab_size),
74+
"data_2": get_dataset(seq_len=seq_length, vocab_size=vocab_size),
75+
"data_3": get_dataset(seq_len=seq_length, vocab_size=vocab_size),
6176
}
77+
collators_dict = {"data_1": None, "data_2": None, "data_3": None}
6278
dataset = OnlineMixingDataset(
63-
train_data,
64-
None,
65-
eval_data,
66-
None,
79+
train_data_dict,
80+
collators_dict,
81+
train_data_dict,
82+
collators_dict,
6783
sampling_weights,
6884
0.1,
6985
0.3,

Comments (0)