
Commit 8ff15e5

Gpu Shuffling (#40)
* CI Run for GPU Shuffling
* Linter
* Func type
* Test Infrastructure
* Added Range
* Fixed imports
* Obtain Batches
* DataLoader Fix
* Cuda check before tests
* Test Suite Ready to Go
* pre-commit run --all-files!
1 parent 3193230 commit 8ff15e5

File tree: 5 files changed, +379 -11 lines changed


src/tiledbsoma_ml/_mini_batch_iterable.py (80 additions & 0 deletions)
@@ -4,11 +4,13 @@
 from __future__ import annotations
 
 import logging
+import os
 from typing import Iterable, Iterator
 
 import attrs
 import numpy as np
 import pandas as pd
+import torch
 from scipy import sparse
 
 from tiledbsoma_ml._common import MiniBatch
@@ -27,6 +29,29 @@ class MiniBatchIterable(Iterable[MiniBatch]):
     use_eager_fetch: bool = True
     return_sparse_X: bool = False
 
+    gpu_shuffle: bool = False
+    gpu_shuffle_mode: str = "iobatch"
+    device: torch.device | None = None
+    seed: int | None = None
+    epoch: int = 0
+
+    def _gpu_perm(self, n: int) -> torch.Tensor:
+        """Deterministic permutation of range(n) seeded by (seed, epoch, pid)."""
+        base = int(self.seed or 0)
+        pid = os.getpid()
+        mixed = (base * 1315423911 + self.epoch * 2654435761 + pid) & 0xFFFFFFFF
+
+        gen_device = (
+            self.device
+            if (
+                self.device is not None and getattr(self.device, "type", None) == "cuda"
+            )
+            else "cpu"
+        )
+        g = torch.Generator(device=gen_device)
+        g.manual_seed(mixed)
+        return torch.randperm(n, generator=g, device=gen_device)
+
     def _iter(self) -> Iterator[MiniBatch]:
         batch_size = self.batch_size
         result: MiniBatch | None = None
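
The seeding scheme above mixes `seed`, `epoch`, and the worker PID through two multiplicative constants, so each worker and epoch draws a distinct but reproducible permutation. A minimal standalone sketch of that scheme (the `mix`/`perm` helpers below are illustrative, not part of the commit):

    import torch

    def mix(seed: int, epoch: int, pid: int) -> int:
        # Same mixing as `_gpu_perm`: two multiplicative constants, folded to 32 bits
        return (seed * 1315423911 + epoch * 2654435761 + pid) & 0xFFFFFFFF

    def perm(seed: int, epoch: int, pid: int, n: int) -> torch.Tensor:
        g = torch.Generator(device="cpu")
        g.manual_seed(mix(seed, epoch, pid))
        return torch.randperm(n, generator=g)

    # Same (seed, epoch, pid) -> identical permutation; bumping the epoch changes it
    assert torch.equal(perm(42, 0, 1234, 100), perm(42, 0, 1234, 100))
    assert not torch.equal(perm(42, 0, 1234, 100), perm(42, 1, 1234, 100))
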
@@ -35,6 +60,39 @@ def _iter(self) -> Iterator[MiniBatch]:
             iob_idx = 0  # current offset into io batch
             iob_len = X_io_batch.shape[0]
 
+            # GPU within-IO-batch shuffle (dense only)
+            if self.gpu_shuffle and self.gpu_shuffle_mode == "iobatch":
+                if self.return_sparse_X:
+                    logger.warning(
+                        "GPU shuffle requested but return_sparse_X=True; leaving IO-batch order unchanged."
+                    )
+                else:
+                    perm = self._gpu_perm(iob_len)
+                    perm_cpu = perm.to("cpu", non_blocking=False).numpy()
+
+                    X_full = X_io_batch.slice_tonumpy(slice(0, iob_len))
+                    X_t = torch.from_numpy(X_full)
+                    if (
+                        self.device is not None
+                        and getattr(self.device, "type", None) == "cuda"
+                    ):
+                        if not X_t.is_pinned():
+                            X_t = X_t.pin_memory()  # faster H2D
+                        X_t = X_t.to(self.device, non_blocking=True)
+                    X_t = X_t.index_select(0, perm).contiguous()
+                    X_cpu = X_t.to("cpu", non_blocking=False).numpy()
+
+                    obs_perm = obs_io_batch.iloc[perm_cpu].reset_index(drop=True)
+
+                    # Emit mini-batches from the permuted IO-batch
+                    for start in range(0, iob_len, self.batch_size):
+                        stop = min(start + self.batch_size, iob_len)
+                        yield (
+                            X_cpu[start:stop],
+                            obs_perm.iloc[start:stop].reset_index(drop=True),
+                        )
+                    continue  # done with this IO-batch
+
             while iob_idx < iob_len:
                 if result is None:
                     # perform zero copy slice where possible
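
The transfer pattern in this branch is: pin the host buffer, copy host-to-device asynchronously, permute rows with `index_select`, then copy back for CPU-side mini-batch slicing. A self-contained round-trip check of that pattern, falling back to CPU when CUDA is unavailable (mirroring the code's own guard):

    import numpy as np
    import torch

    device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")

    X = np.random.rand(8, 4).astype(np.float32)
    perm = torch.randperm(8, device=device)

    X_t = torch.from_numpy(X)
    if device.type == "cuda":
        X_t = X_t.pin_memory().to(device, non_blocking=True)  # pinned H2D copy
    X_t = X_t.index_select(0, perm).contiguous()

    # The GPU permutation round-trips to the same rows as numpy fancy indexing
    np.testing.assert_array_equal(X_t.cpu().numpy(), X[perm.cpu().numpy()])
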
@@ -76,6 +134,29 @@ def _iter(self) -> Iterator[MiniBatch]:
                 iob_idx += to_take
 
             X, obs = result
+
+            if (
+                self.gpu_shuffle
+                and self.gpu_shuffle_mode == "minibatch"
+                and not self.return_sparse_X
+            ):
+                mb_n = X.shape[0]
+                perm = self._gpu_perm(mb_n)
+                perm_cpu = perm.to("cpu", non_blocking=False).numpy()
+
+                X_t = torch.from_numpy(X)
+                if (
+                    self.device is not None
+                    and getattr(self.device, "type", None) == "cuda"
+                ):
+                    if not X_t.is_pinned():
+                        X_t = X_t.pin_memory()
+                    X_t = X_t.to(self.device, non_blocking=True)
+                X_t = X_t.index_select(0, perm).contiguous()
+                X = X_t.to("cpu", non_blocking=False).numpy()
+                obs = obs.iloc[perm_cpu].reset_index(drop=True)
+                result = (X, obs)  # re-pack, so the permuted batch is what `yield result` emits
+
             assert X.shape[0] == obs.shape[0]
             if X.shape[0] == batch_size:
                 yield result

src/tiledbsoma_ml/dataset.py (69 additions & 10 deletions)
@@ -5,7 +5,8 @@
 from __future__ import annotations
 
 import logging
-from typing import Iterator, List, Optional, Sequence, Tuple
+from enum import Enum
+from typing import Any, Iterator, List, Optional, Sequence, Tuple
 
 import numpy as np
 import torch
@@ -32,6 +33,25 @@
 DEFAULT_IO_BATCH_SIZE = 2**16
 
 
+class ShuffleMode(str, Enum):
+    """Shuffling backend selection."""
+
+    CPU = "cpu"
+    GPU_IOBATCH = "gpu_iobatch"  # Emulate CPU shuffling, permuting at the IO-batch level
+    GPU_MINIBATCH = "gpu_minibatch"  # Only shuffle within each mini-batch, on the GPU
+
+
+def _shuffle_mode_converter(v: Any) -> ShuffleMode:
+    if isinstance(v, ShuffleMode):
+        return v
+    if isinstance(v, str):
+        v = v.lower()
+        if v == "gpu":  # convenience alias
+            return ShuffleMode.GPU_IOBATCH
+        return ShuffleMode(v)  # "cpu" | "gpu_iobatch" | "gpu_minibatch"
+    return ShuffleMode(v)
+
+
 @define
 class ExperimentDataset(IterableDataset[MiniBatch]):  # type: ignore[misc]
     r"""An |IterableDataset| implementation that reads from an |ExperimentAxisQuery|.
@@ -117,7 +137,7 @@ class ExperimentDataset(IterableDataset[MiniBatch]):  # type: ignore[misc]
     """Names of ``obs`` columns to return."""
 
     # Configuration fields with defaults
-    batch_size: int = field(default=1, validator=and_(instance_of(int), gt(0)))
+    batch_size: int = field(default=1024, validator=and_(instance_of(int), gt(0)))
     """Number of rows of ``X`` and ``obs`` data to yield in each |MiniBatch|."""
     io_batch_size: int = field(
         default=DEFAULT_IO_BATCH_SIZE, validator=and_(instance_of(int), gt(0))
@@ -135,6 +155,17 @@ class ExperimentDataset(IterableDataset[MiniBatch]):  # type: ignore[misc]
     use_eager_fetch: bool = field(default=True)
     """Pre-fetch one "IO batch" and one "mini batch"."""
 
+    # GPU shuffle configuration
+    shuffle_mode: ShuffleMode = field(
+        default=ShuffleMode.CPU, converter=_shuffle_mode_converter
+    )
+    """Whether to shuffle on CPU or GPU (and at what granularity).
+
+    Only read when ``shuffle=True``.
+    """
+    device: Optional[torch.device] = field(default=None)
+    """Device to move ``X`` to; set to ``torch.device("cuda", N)`` to enable GPU shuffle."""
+
     # Internal state
     epoch: int = field(default=0, init=False)
     rank: int = field(init=False)
@@ -154,6 +185,8 @@ def __init__(
         seed: Optional[int] = None,
         return_sparse_X: bool = False,
         use_eager_fetch: bool = True,
+        shuffle_mode: ShuffleMode = ShuffleMode.CPU,
+        device: Optional[torch.device] = None,
     ):
         r"""Construct a new |ExperimentDataset|.
@@ -223,6 +256,7 @@ def __init__(
         In addition, when using shuffling in a distributed configuration (e.g., ``DDP``), you must provide a seed,
         ensuring that the same shuffle is used across all replicas.
         """
+
         if query and layer_name:
             if x_locator or query_ids:
                 raise ValueError(
@@ -255,21 +289,30 @@ def __init__(
             seed=seed,
             return_sparse_X=return_sparse_X,
             use_eager_fetch=use_eager_fetch,
+            shuffle_mode=shuffle_mode,
+            device=device,
         )
 
     def __attrs_post_init__(self) -> None:
         """Validate configuration and initialize distributed state."""
         obs_column_names = self.obs_column_names
         if not obs_column_names:
             raise ValueError("Must specify at least one value in `obs_column_names`")
-
         if self.shuffle:
             # Verify `io_batch_size` is a multiple of `shuffle_chunk_size`
             if self.io_batch_size % self.shuffle_chunk_size:
                 raise ValueError(
                     f"{self.io_batch_size=} is not a multiple of {self.shuffle_chunk_size=}"
                 )
 
+        # Sanity check for GPU shuffle: fall back to CPU if no CUDA device was given
+        if self.shuffle and self.shuffle_mode != ShuffleMode.CPU:
+            if self.device is None or getattr(self.device, "type", None) != "cuda":
+                logger.warning(
+                    "GPU shuffle requested but `device` is not CUDA; defaulting to CPU within-IO shuffle."
+                )
+                object.__setattr__(self, "shuffle_mode", ShuffleMode.CPU)
+
         if self.seed is None:
             object.__setattr__(
                 self, "seed", np.random.default_rng().integers(0, 2**32 - 1)
@@ -333,7 +376,6 @@ def __iter__(self) -> Iterator[MiniBatch]:
             experimental
         """
         self._multiproc_check()
-
         worker_id, n_workers = get_worker_id_and_num()
         partition = Partition(
             rank=self.rank,
@@ -342,15 +384,25 @@ def __iter__(self) -> Iterator[MiniBatch]:
             n_workers=n_workers,
         )
         query_ids = self.query_ids.partitioned(partition)
-        if self.shuffle:
-            chunks = query_ids.shuffle_chunks(
+        use_gpu_shuffle = False
+        gpu_shuffle_mode = "none"
+        if self.shuffle and getattr(self.device, "type", None) == "cuda":
+            if self.shuffle_mode == ShuffleMode.GPU_IOBATCH:
+                use_gpu_shuffle = True
+                gpu_shuffle_mode = "iobatch"
+            elif self.shuffle_mode == ShuffleMode.GPU_MINIBATCH:
+                use_gpu_shuffle = True
+                gpu_shuffle_mode = "minibatch"
+
+        if self.shuffle and self.shuffle_mode not in (ShuffleMode.GPU_MINIBATCH,):
+            # Provide `shuffle_chunk_size`-sized random chunks (upstream randomization)
+            chunks = query_ids.shuffle_chunks(
                 shuffle_chunk_size=self.shuffle_chunk_size,
                 seed=self.seed,
             )
         else:
-            # In no-shuffle mode, all the `obs_joinids` can be treated as one "shuffle chunk",
-            # which IO-batches will stride over.
-            chunks = [query_ids.obs_joinids]
+            # For no shuffling, or mini-batch-only shuffling, keep chunks in sequential order
+            chunks = [query_ids.obs_joinids]
 
         with self.x_locator.open() as (X, obs):
             io_batch_iter = IOBatchIterable(
@@ -361,7 +413,8 @@ def __iter__(self) -> Iterator[MiniBatch]:
                 X=X,
                 obs_column_names=self.obs_column_names,
                 seed=self.seed,
-                shuffle=self.shuffle,
+                # Disable internal shuffling when shuffling on the GPU instead
+                shuffle=(self.shuffle and not use_gpu_shuffle),
                 use_eager_fetch=self.use_eager_fetch,
             )
 
@@ -370,6 +423,12 @@ def __iter__(self) -> Iterator[MiniBatch]:
                 batch_size=self.batch_size,
                 use_eager_fetch=self.use_eager_fetch,
                 return_sparse_X=self.return_sparse_X,
+                # GPU shuffle params
+                gpu_shuffle=use_gpu_shuffle,
+                gpu_shuffle_mode=gpu_shuffle_mode,  # "iobatch" | "minibatch"
+                device=self.device,
+                seed=self.seed,
+                epoch=self.epoch,
             )
 
         self.epoch += 1
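
Because the dataset already emits `(X, obs)` mini-batches, consuming it would typically disable the loader's own batching. A hedged usage sketch (`ds` is the dataset from the earlier example; the worker count is an assumption):

    from torch.utils.data import DataLoader

    # batch_size=None: the dataset already yields mini-batches of `batch_size` rows
    loader = DataLoader(ds, batch_size=None, num_workers=2)
    for X, obs in loader:
        pass  # X: rows of the expression matrix; obs: the matching pd.DataFrame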

tests/_utils.py (53 additions & 0 deletions)
@@ -187,6 +187,59 @@ def add_sparse_array(
     a.write(tensor)
 
 
+def flatten_joinids(batches: List[MiniBatch]) -> List[int]:
+    return [int(i) for _, obs in batches for i in obs["soma_joinid"].tolist()]
+
+
+def minibatch_is_contiguous(ids: List[int]) -> bool:
+    if len(ids) <= 1:
+        return True
+    ids_sorted = sorted(ids)
+    return ids_sorted[-1] - ids_sorted[0] + 1 == len(ids_sorted)
+
+
+def assert_gpu_minibatch_no_upstream_mixing(batches: List[MiniBatch]) -> None:
+    """Each mini-batch should be a contiguous slice; slices increase strictly.
+
+    Test for ``gpu_minibatch`` shuffling.
+    """
+    prev_max = -1
+    for _, obs in batches:
+        ids = [int(i) for i in obs["soma_joinid"].tolist()]
+        assert minibatch_is_contiguous(ids), f"Non-contiguous minibatch: {ids}"
+        ids_sorted = sorted(ids)
+        assert (
+            ids_sorted[0] > prev_max
+        ), f"Detected upstream mixing: start={ids_sorted[0]} <= prev_max={prev_max}"
+        prev_max = ids_sorted[-1]
+
+
+def assert_gpu_iobatch_invariants(
+    batches: List[MiniBatch],
+    batch_size: int,
+    min_noncontig_ratio: float = 0.2,
+    num_workers: int = 1,  # currently unused
+) -> None:
+    """Property checks for IO-batch GPU shuffle (not exact order)."""
+    # Check for unnecessary non-full batches
+    sizes = [len(obs) for _, obs in batches]
+    assert all(1 <= s <= batch_size for s in sizes), f"Invalid sizes: {sizes}"
+    # If there are enough rows overall, expect at least one full mini-batch
+    if sum(sizes) >= batch_size:
+        assert any(s == batch_size for s in sizes), "No full minibatches produced"
+
+    # Measure dispersion between mini-batches; this should not consistently fail
+    non_contig = 0
+    for _, obs in batches:
+        ids = [int(i) for i in obs["soma_joinid"].tolist()]
+        if not minibatch_is_contiguous(ids):
+            non_contig += 1
+    if len(batches) >= 4:  # avoid tiny outliers
+        assert non_contig >= max(
+            1, int(len(batches) * min_noncontig_ratio)
+        ), "Low dispersion in IO-batch GPU shuffle; check upstream shuffle chunk selection."
+
+
 @contextmanager
 def mock_dist_is_initialized():
     with patch("torch.distributed.is_initialized") as mock_dist_is_initialized:
