BUG: remove printing.

msafari18 · msafari18 · commit b6bc2ff437a1 · 2025-05-12T18:11:06.000-04:00
diff --git a/barcodebert/datasets.py b/barcodebert/datasets.py
@@ -13,6 +13,7 @@
 from transformers import AutoTokenizer
 import torch.distributed as dist
 from torch.utils.data import get_worker_info
+import math
 
 
 class KmerTokenizer(object):
@@ -154,12 +155,32 @@ def parse_row(self, row):
         return tokens, torch.tensor(int(label), dtype=torch.int64), att_mask
 
     def __len__(self):
-        # count lines once at startup (cheap) so val‑throughput logging doesn't crash
-        if not hasattr(self, "_n_samples"):
+        # 1) count total lines in file (once)
+        if not hasattr(self, "_n_total"):
             with open(self.file_path, "r") as f:
-                # subtract header if CSV has one; adjust accordingly
-                self._n_samples = sum(1 for _ in f) - 1
-        return self._n_samples
+                # subtract 1 if there’s a header row
+                self._n_total = sum(1 for _ in f) - 1
+
+        # 2) figure out how many each *process* + *worker* should see
+        if dist.is_available() and dist.is_initialized():
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        else:
+            rank, world_size = 0, 1
+
+        worker_info = get_worker_info()
+        if worker_info is not None:
+            # subdivide further across DataLoader workers
+            total_workers = worker_info.num_workers
+            rank = rank * total_workers + worker_info.id
+            world_size = world_size * total_workers
+
+        # 3) even‐split the total lines, giving the first (rem) ranks one extra
+        base, rem = divmod(self._n_total, world_size)
+        local_n = base + (1 if rank < rem else 0)
+
+        # 4) batches per epoch for *this* process
+        return math.ceil(local_n / self._batch_size_per_gpu)
 
 
     def __iter__(self):