make noise dtype passable #7 (#9)

gfdb · web-flow · commit 0641f2e8d92a · 2026-02-07T19:16:08.000-05:00
* make noise_dtype passable and rm dataloader

* make noise_dtype passable and rm dataloader

* bump version in uv.lock
diff --git a/tests/test_gpu_augmentations.py b/tests/test_gpu_augmentations.py
@@ -63,25 +63,21 @@ def test_freq_drop_no_nan_and_inplace():
     assert torch.isnan(out).logical_not().all()
 
 
-def test_add_noise_with_stub(monkeypatch):
-    def _stub_noise_like(ref, sample_rate, noise_dir):
-        return torch.zeros_like(ref)
+def test_add_noise_with_mock_loader():
+    """Test add_noise with a mock NoiseLoader."""
+    from unittest.mock import MagicMock
 
-    monkeypatch.setattr(
-        "wav2aug.gpu.noise_addition._sample_noise_like", _stub_noise_like
-    )
     waveforms = torch.ones(2, 128, device=DEVICE, dtype=torch.float32)
     ptr = waveforms.data_ptr()
-    out = add_noise(
-        waveforms,
-        16_000,  # sample_rate as positional argument
-        snr_low=0.0,
-        snr_high=0.0,
-        download=False,
-        noise_dir="ignored",
-    )
+
+    # Create mock loader that returns zeros
+    mock_loader = MagicMock()
+    mock_loader.get_batch.return_value = torch.zeros(2, 128)
+
+    out = add_noise(waveforms, mock_loader, snr_low=0.0, snr_high=0.0)
     assert out.data_ptr() == ptr
     assert torch.isfinite(out).all()
+    mock_loader.get_batch.assert_called_once_with(2, 128)
 
 
 def test_add_babble_noise_identity_for_singleton_batch():
@@ -127,7 +123,7 @@ def test_time_dropout_zeroes_segments():
 
 
 def test_wav2aug_runs_with_stubbed_noise(monkeypatch):
-    def _noop_add_noise(waveforms, sample_rate, **kwargs):
+    def _noop_add_noise(waveforms, loader, **kwargs):
         return waveforms
 
     monkeypatch.setattr("wav2aug.gpu.wav2aug.add_noise", _noop_add_noise)
@@ -168,3 +164,22 @@ def test_wav2aug_top_k_invalid_raises():
 
     with pytest.raises(ValueError, match="top_k must be between 1 and 9"):
         Wav2Aug(sample_rate=16_000, top_k=10)
+
+
+def test_wav2aug_noise_dtype(monkeypatch):
+    """Test that noise_dtype is passed to NoiseLoader."""
+
+    def _noop_add_noise(waveforms, loader, **kwargs):
+        return waveforms
+
+    monkeypatch.setattr("wav2aug.gpu.wav2aug.add_noise", _noop_add_noise)
+
+    # Default should be float32
+    aug = Wav2Aug(sample_rate=16_000)
+    assert aug.noise_dtype == torch.float32
+    assert aug._noise_loader.storage_dtype == torch.float32
+
+    # Custom dtype should be passed through
+    aug = Wav2Aug(sample_rate=16_000, noise_dtype=torch.float16)
+    assert aug.noise_dtype == torch.float16
+    assert aug._noise_loader.storage_dtype == torch.float16
diff --git a/uv.lock b/uv.lock
diff --git a/wav2aug/gpu/noise_addition.py b/wav2aug/gpu/noise_addition.py
@@ -6,8 +6,6 @@
 import torch.nn.functional as F
 from tqdm import tqdm
 
-from wav2aug.utils._aug_utils import _sample_noise_like
-
 _EPS = 1e-14
 _AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a"}
 
@@ -23,21 +21,17 @@ def _list_audio_files(root: str) -> list[str]:
 
 
 class NoiseLoader:
-    """Noise loader with preload-to-memory or on-demand loading.
+    """Noise loader that preloads all noise files into CPU RAM.
 
-    By default, loads all noise files into CPU RAM at initialization for
-    zero-I/O sampling during training. For memory-constrained environments,
-    set preload=False to load files on-demand.
+    Audio is decoded once at initialization, so sampling during training
+    reads only from the in-memory noise bank (zero I/O).
 
     Usage:
-        # Preload mode (default, recommended):
+        # Default:
         noise_loader = NoiseLoader(noise_dir, sample_rate=16000)
 
-        # On-demand mode (for memory-constrained systems):
-        noise_loader = NoiseLoader(noise_dir, sample_rate=16000, preload=False)
-
-        # Custom storage dtype (e.g., for even lower memory):
-        noise_loader = NoiseLoader(noise_dir, sample_rate=16000, storage_dtype=torch.float8_e4m3fn)
+        # Custom storage dtype (e.g., for lower memory):
+        noise_loader = NoiseLoader(noise_dir, sample_rate=16000, storage_dtype=torch.float16)
 
         # In training loop:
         noisy = add_noise(waveforms, noise_loader, snr_low=0, snr_high=10)
@@ -47,39 +41,27 @@ def __init__(
         self,
         noise_dir: str,
         sample_rate: int,
-        preload: bool = True,
-        storage_dtype: torch.dtype = torch.float16,
+        storage_dtype: torch.dtype = torch.float32,
     ):
         """Initialize the noise loader.
 
         Args:
             noise_dir: Directory containing noise audio files.
             sample_rate: Target sample rate for noise.
-            preload: If True (default), load all noise files into CPU RAM at
-                initialization. Sampling becomes a fast tensor slice operation
-                with no I/O. If False, load files on-demand (slower but uses
-                less memory).
             storage_dtype: Data type for storing preloaded audio in memory.
-                Defaults to float16 (~650MB for pointsource_noises). Use float32
-                for maximum precision, or float8 variants for minimum memory. Note: In
-                my experiments, float16 halved memory usage in exchange for an
-                extremely tiny performance degradation.
+                Defaults to float32. Use float16 for lower memory usage.
         """
         self.noise_dir = noise_dir
         self.sample_rate = sample_rate
-        self.preload = preload
         self.storage_dtype = storage_dtype
         self.files = _list_audio_files(noise_dir)
         if not self.files:
             raise ValueError(f"No audio files found in {noise_dir}")
 
         # Preloaded noise bank (1D tensor of all concatenated noise)
-        self._noise_bank: torch.Tensor | None = None
-
-        if preload:
-            self._preload_all()
+        self._noise_bank: torch.Tensor = self._preload_all()
 
-    def _preload_all(self) -> None:
+    def _preload_all(self) -> torch.Tensor:
         """Load all noise files into memory."""
         from torchcodec.decoders import AudioDecoder
 
@@ -101,17 +83,7 @@ def _preload_all(self) -> None:
                 f"No valid audio files could be loaded from {self.noise_dir}"
             )
 
-        self._noise_bank = torch.cat(chunks, dim=0)
-
-    def _load_one(self) -> torch.Tensor:
-        """Load a single noise sample directly (no preloading)."""
-        from torchcodec.decoders import AudioDecoder
-
-        idx = torch.randint(0, len(self.files), (1,)).item()
-        dec = AudioDecoder(self.files[idx], sample_rate=self.sample_rate)
-        samp = dec.get_all_samples()
-        audio = samp.data.contiguous().mean(dim=0)  # mono, shape [time]
-        return audio
+        return torch.cat(chunks, dim=0)
 
     def get_batch(self, batch_size: int, length: int) -> torch.Tensor:
         """Get a batch of noise samples.
@@ -123,54 +95,23 @@ def get_batch(self, batch_size: int, length: int) -> torch.Tensor:
         Returns:
             Tensor of shape [batch_size, length] on CPU.
         """
-        if self._noise_bank is not None:
-            # Fast path: slice from preloaded noise bank
-            bank_len = self._noise_bank.shape[0]
-
-            if bank_len <= length:
-                # Noise bank shorter than requested - pad it
-                noise = self._noise_bank.unsqueeze(0).expand(batch_size, -1)
-                noise = F.pad(noise, (0, length - bank_len))
-                return noise
-
-            # Generate random start indices for each sample
-            max_start = bank_len - length
-            starts = torch.randint(0, max_start + 1, (batch_size,))
-
-            # Vectorized slicing: create index tensor [batch_size, length]
-            # where each row is [start, start+1, ..., start+length-1]
-            offsets = torch.arange(length)
-            indices = starts.unsqueeze(1) + offsets.unsqueeze(0)  # [batch_size, length]
-            return self._noise_bank[indices]
-        else:
-            # On-demand loading
-            noises = []
-            for _ in range(batch_size):
-                noise = self._load_one()
-                noise = self._pad_or_crop(noise, length)
-                noises.append(noise)
-            return torch.stack(noises, dim=0)
-
-    def _pad_or_crop(self, noise: torch.Tensor, length: int) -> torch.Tensor:
-        """Pad or crop noise to target length."""
-        if noise.shape[0] < length:
-            noise = F.pad(noise, (0, length - noise.shape[0]))
-        elif noise.shape[0] > length:
-            start = torch.randint(0, noise.shape[0] - length + 1, (1,)).item()
-            noise = noise[start : start + length]
-        return noise
-
-    @property
-    def mode(self) -> str:
-        """Return current loading mode: 'preload' or 'on-demand'."""
-        return "preload" if self._noise_bank is not None else "on-demand"
-
-    @property
-    def preloaded_duration_seconds(self) -> float | None:
-        """Total duration of preloaded audio in seconds, or None if not preloaded."""
-        if self._noise_bank is not None:
-            return self._noise_bank.shape[0] / self.sample_rate
-        return None
+        bank_len = self._noise_bank.shape[0]
+
+        if bank_len <= length:
+            # Bank is not longer than the request - share it across rows, zero-pad the tail
+            noise = self._noise_bank.unsqueeze(0).expand(batch_size, -1)
+            noise = F.pad(noise, (0, length - bank_len))
+            return noise
+
+        # Generate random start indices for each sample
+        max_start = bank_len - length
+        starts = torch.randint(0, max_start + 1, (batch_size,))
+
+        # Vectorized slicing: create index tensor [batch_size, length]
+        # where each row is [start, start+1, ..., start+length-1]
+        offsets = torch.arange(length)
+        indices = starts.unsqueeze(1) + offsets.unsqueeze(0)  # [batch_size, length]
+        return self._noise_bank[indices]
 
 
 @torch.no_grad()
@@ -239,39 +180,25 @@ def _mix_noise(
 @torch.no_grad()
 def add_noise(
     waveforms: torch.Tensor,
-    sample_rate_or_loader: int | NoiseLoader,
+    loader: NoiseLoader,
     *,
     snr_low: float = 0.0,
     snr_high: float = 10.0,
-    noise_dir: str | None = None,
-    download: bool = True,
-    pack: str = "pointsource_noises",
 ) -> torch.Tensor:
     """Add point-source noise to each waveform in the batch.
 
     Args:
         waveforms (torch.Tensor): The input waveforms. Shape [batch, time].
-        sample_rate_or_loader: Either the sample rate (int) for legacy behavior,
-            or a NoiseLoader instance for efficient background loading.
+        loader: A NoiseLoader instance for efficient noise sampling.
         snr_low (float, optional): The minimum SNR in dB. Defaults to 0.0.
         snr_high (float, optional): The maximum SNR in dB. Defaults to 10.0.
-        noise_dir (str | None, optional): Directory containing noise files.
-            Only used when sample_rate_or_loader is an int. Defaults to None.
-        download (bool, optional): Whether to download noise files if not found.
-            Only used when sample_rate_or_loader is an int. Defaults to True.
-        pack (str, optional): The name of the noise pack to use.
-            Only used when sample_rate_or_loader is an int. Defaults to "pointsource_noises".
 
     Returns:
         torch.Tensor: The waveforms with point-source noise added.
 
     Example:
-        # Fast path with NoiseLoader (recommended):
-        loader = NoiseLoader("/path/to/noise", sample_rate=16000, num_workers=4)
+        loader = NoiseLoader("/path/to/noise", sample_rate=16000)
         noisy = add_noise(waveforms, loader, snr_low=0, snr_high=10)
-
-        # Legacy path (slower, loads from disk each call):
-        noisy = add_noise(waveforms, 16000, snr_low=0, snr_high=10, noise_dir="/path/to/noise")
     """
     if waveforms.ndim != 2:
         raise AssertionError("expected waveforms shaped [batch, time]")
@@ -283,26 +210,8 @@ def add_noise(
     device = waveforms.device
     dtype = waveforms.dtype
 
-    if isinstance(sample_rate_or_loader, NoiseLoader):
-        # Fast path: use the NoiseLoader
-        noise = sample_rate_or_loader.get_batch(batch, total_time)
-        noise = noise.to(device=device, dtype=dtype)
-    else:
-        # Legacy path: load noise synchronously
-        sample_rate = sample_rate_or_loader
-
-        if noise_dir is None and download:
-            from wav2aug.data.fetch import ensure_pack
-
-            noise_dir = ensure_pack(pack)
-
-        noises = []
-        for _ in range(batch):
-            ref = torch.empty(1, total_time, dtype=dtype)
-            sample = _sample_noise_like(ref, sample_rate, noise_dir)
-            noise_sample = sample.to(device=device, dtype=dtype).view(-1)
-            noises.append(noise_sample)
-        noise = torch.stack(noises, dim=0)
+    noise = loader.get_batch(batch, total_time)
+    noise = noise.to(device=device, dtype=dtype)
 
     return _mix_noise(
         waveforms,
diff --git a/wav2aug/gpu/wav2aug.py b/wav2aug/gpu/wav2aug.py
@@ -21,31 +21,34 @@ def __init__(
         self,
         sample_rate: int,
         noise_dir: str | None = None,
-        noise_preload: bool = True,
         top_k: int = 9,
+        noise_dtype: torch.dtype = torch.float32,
     ) -> None:
         """Initialize Wav2Aug.
 
         Args:
             sample_rate: Audio sample rate in Hz.
             noise_dir: Directory containing noise files. If None, will use the
                 default cached noise pack (auto-downloaded if needed).
-            noise_preload: If True (default), preload all noise files into CPU RAM
-                at initialization for fast sampling. If False, load files on-demand.
             top_k: Number of top augmentations to use, ordered by effectiveness.
                 Default is 9 (all augmentations). Common values: 3, 6, or 9.
                 Order (best to worst): Noise Addition, Freq Drop, Time Drop,
                 Speed Perturb, Amp Clip, Chunk Swap, Babble Noise, Amp Scale,
                 Polarity Inversion.
+            noise_dtype: Data type for storing preloaded noise in memory.
+                Defaults to float32. Use float16 for memory efficiency.
         """
         self.sample_rate = int(sample_rate)
+        self.noise_dtype = noise_dtype
 
         # Initialize noise loader
         if noise_dir is None:
             from wav2aug.data.fetch import ensure_pack
 
             noise_dir = ensure_pack("pointsource_noises")
-        self._noise_loader = NoiseLoader(noise_dir, sample_rate, preload=noise_preload)
+        self._noise_loader = NoiseLoader(
+            noise_dir, sample_rate, storage_dtype=noise_dtype
+        )
 
         # All ops ordered by effectiveness (best first)
         all_ops: List[Callable[[torch.Tensor, torch.Tensor | None], torch.Tensor]] = [