
Commit 8b9484a

feat(loss): add AMSE loss with SHT caching to address spectral double penalty (#164)
* feat(loss): add AMSE loss with SHT caching to address spectral double penalty
* test: add unit tests for AMSENormalizedLoss with gradient and CUDA checks
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Formatting: format code according to guidelines
* Fix: minor bug in code fixed
* Fix
* refactor: convert loss tests to pytest-style functions
* pre commit task
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Revert "pre commit task" (this reverts commit 62a23f1)
* Formatting code according to standards
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Removed node_modules folder
* Removed unnecessary comment and formatted according to standards

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c0fb0ea commit 8b9484a
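For reference, the AMSE loss added by this commit can be summarized as follows. The symbols below are ours, read directly off the code in the diff further down, and the small epsilon terms used for numerical stability are omitted. With per-degree spherical-harmonic power spectra P_pred(l) and P_tgt(l), per-degree coherence C(l), and per-feature variance sigma_c^2:

$$
\begin{aligned}
A(\ell) &= \Bigl(\sqrt{P_{\mathrm{pred}}(\ell)} - \sqrt{P_{\mathrm{tgt}}(\ell)}\Bigr)^{2} \\
D(\ell) &= 2\sqrt{P_{\mathrm{pred}}(\ell)\,P_{\mathrm{tgt}}(\ell)}\,\bigl(1 - C(\ell)\bigr) \\
\mathcal{L} &= \operatorname{mean}_{b,c}\Biggl[\frac{1}{\sigma_c^{2}}\sum_{\ell}\bigl(A(\ell) + D(\ell)\bigr)\Biggr]
\end{aligned}
$$

Here each power spectrum sums |coefficient|^2 over the order m, and C(l) is the real part of the cross-power summed over m, divided by sqrt(P_pred(l) P_tgt(l)).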

File tree

graph_weather/models/losses.py
tests/test_asme_loss.py

2 files changed: +205, -0 lines


graph_weather/models/losses.py

Lines changed: 103 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 import numpy as np
 import torch
+import torch.nn as nn
+import torch_harmonics as th
 
 
 class NormalizedMSELoss(torch.nn.Module):
@@ -90,3 +92,104 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor):
 
         assert not torch.isnan(out).any()
         return out.mean()
+
+
+# Spectrally Adjusted Mean Squared Error (AMSE) loss
+class AMSENormalizedLoss(nn.Module):
+    """
+    Spectrally Adjusted Mean Squared Error (AMSE) Loss.
+
+    This loss function is designed to address the "double penalty" issue in spatial
+    forecasting by separately penalizing amplitude and phase differences in the
+    spectral domain.
+
+    It applies the Spherical Harmonic Transform (SHT) to both predictions and targets,
+    computes the power spectral density (PSD), and then evaluates two terms:
+    1. Amplitude Error (difference in spectral amplitudes).
+    2. Decorrelation Error (phase misalignment / coherence loss).
+
+    This implementation follows the formulation in "Fixing the Double Penalty in
+    Data-Driven Weather Forecasting Through a Modified Spherical Harmonic Loss
+    Function" (ICML 2025 Poster).
+
+    Args:
+        feature_variance (list or torch.Tensor): Variance of each physical feature,
+            used for normalization (length C).
+        epsilon (float): Small constant for numerical stability.
+    """
+
+    def __init__(self, feature_variance: list | torch.Tensor, epsilon: float = 1e-9):
+        super().__init__()
+        if not isinstance(feature_variance, torch.Tensor):
+            feature_variance = torch.tensor(feature_variance, dtype=torch.float32)
+        else:
+            feature_variance = feature_variance.clone().detach().float()
+
+        self.register_buffer("feature_variance", feature_variance)
+
+        # SHT cache to avoid re-initializing on every forward pass, since the object
+        # performs some expensive pre-computation when it's initialized; doing this
+        # repeatedly inside the training loop can add unnecessary overhead.
+        self.epsilon = epsilon
+        self.sht_cache = {}
+
+    def _get_sht(self, nlat: int, nlon: int, device: torch.device) -> th.RealSHT:
+        """
+        Helper to get a cached SHT object, creating it if it doesn't exist.
+
+        This prevents re-initializing the SHT object on every forward pass.
+        """
+        key = (nlat, nlon, device)
+        if key not in self.sht_cache:
+            self.sht_cache[key] = th.RealSHT(nlat, nlon, grid="equiangular").to(device)
+        return self.sht_cache[key]
+
+    def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass to compute the AMSE loss.
+
+        Args:
+            pred (torch.Tensor): Predicted tensor of shape (B, C, H, W).
+            target (torch.Tensor): Ground truth tensor of shape (B, C, H, W).
+
+        Returns:
+            torch.Tensor: Scalar loss value (averaged over batch and features).
+        """
+        if pred.shape != target.shape:
+            raise ValueError("Prediction and target tensors must have the same shape.")
+        if pred.ndim != 4:
+            raise ValueError("Input tensors must be 4D: (batch, channels, lat, lon)")
+
+        batch_size, num_channels, nlat, nlon = pred.shape
+
+        # Reshape to (B*C, H, W) to process all variables at once
+        pred_reshaped = pred.view(batch_size * num_channels, nlat, nlon)
+        target_reshaped = target.view(batch_size * num_channels, nlat, nlon)
+
+        # Get the (potentially cached) SHT object
+        sht = self._get_sht(nlat, nlon, pred.device)
+        pred_coeffs = sht(pred_reshaped)  # (B*C, L, M) complex
+        target_coeffs = sht(target_reshaped)  # (B*C, L, M) complex
+
+        # Compute power spectral densities (PSD): sum |coeff|^2 over M
+        pred_psd = torch.sum(torch.abs(pred_coeffs) ** 2, dim=-1)  # (B*C, L)
+        target_psd = torch.sum(torch.abs(target_coeffs) ** 2, dim=-1)  # (B*C, L)
+
+        # Compute spectral coherence between prediction and target
+        cross_power = pred_coeffs * torch.conj(target_coeffs)  # (B*C, L, M)
+        coherence_num = torch.sum(cross_power.real, dim=-1)  # (B*C, L)
+        coherence_denom = torch.sqrt(pred_psd * target_psd)
+        coherence = coherence_num / (coherence_denom + self.epsilon)  # (B*C, L)
+
+        # Compute amplitude error: difference in sqrt(PSD)
+        amp_error = (
+            torch.sqrt(pred_psd + self.epsilon) - torch.sqrt(target_psd + self.epsilon)
+        ) ** 2
+
+        # Compute decorrelation error
+        decor_error = 2.0 * coherence_denom * (1.0 - coherence)
+
+        # Total spectral loss per sample
+        spectral_loss = torch.sum(amp_error + decor_error, dim=-1)  # (B*C,)
+
+        # Reshape back to (B, C)
+        spectral_loss = spectral_loss.view(batch_size, num_channels)
+
+        # Normalize by feature-wise variance and compute the mean loss
+        normalized_loss = spectral_loss / (self.feature_variance + self.epsilon)
+        return normalized_loss.mean()
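For context, and not part of the diff: a minimal sketch of how the new loss could be wired into a training step. The dimensions, the all-ones per-channel variances, and the random stand-in tensors below are illustrative assumptions, not values from this repository.

import torch

from graph_weather.models.losses import AMSENormalizedLoss

# Hypothetical dimensions: (batch, channels, lat, lon)
B, C, H, W = 2, 4, 32, 64
feature_variance = torch.ones(C)  # placeholder per-channel variances

criterion = AMSENormalizedLoss(feature_variance=feature_variance)

pred = torch.randn(B, C, H, W, requires_grad=True)  # stand-in for a model output
target = torch.randn(B, C, H, W)

loss = criterion(pred, target)  # scalar, normalized by per-feature variance
loss.backward()  # gradients flow back through the spherical harmonic transform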

tests/test_asme_loss.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+import pytest
+import torch
+import torch_harmonics as th
+
+from graph_weather.models.losses import AMSENormalizedLoss
+
+
+@pytest.fixture
+def default_shape() -> tuple[int, int, int, int]:
+    """Return a default tensor shape (B, C, H, W) for test inputs."""
+    return 2, 3, 32, 64
+
+
+@pytest.fixture
+def feature_variance(default_shape: tuple) -> torch.Tensor:
+    """Return a synthetic feature variance tensor, one value per channel."""
+    _, num_channels, _, _ = default_shape
+    return (torch.rand(num_channels) + 0.5).clone().detach()
+
+
+@pytest.fixture
+def loss_fn(feature_variance: torch.Tensor) -> AMSENormalizedLoss:
+    """Instantiate the AMSENormalizedLoss with mock feature variance."""
+    return AMSENormalizedLoss(feature_variance=feature_variance)
+
+
+def test_zero_loss_for_identical_inputs(loss_fn: AMSENormalizedLoss, default_shape: tuple):
+    """Loss should be zero when prediction and target tensors are identical."""
+    pred = torch.randn(default_shape)
+    target = pred.clone()
+    loss = loss_fn(pred, target)
+    assert torch.allclose(loss, torch.tensor(0.0), atol=1e-6)
+
+
+def test_positive_loss_for_different_inputs(loss_fn: AMSENormalizedLoss, default_shape: tuple):
+    """Loss should be strictly positive when inputs differ."""
+    pred = torch.randn(default_shape)
+    target = torch.randn(default_shape)
+    loss = loss_fn(pred, target)
+    assert loss.item() > 0.0
+
+
+def test_gradient_flow(loss_fn: AMSENormalizedLoss, default_shape: tuple):
+    """Check that gradients can flow through the loss for backpropagation."""
+    pred = torch.randn(default_shape, requires_grad=True)
+    target = torch.randn(default_shape)
+    loss = loss_fn(pred, target)
+    loss.backward()
+    assert pred.grad is not None
+    assert torch.sum(torch.abs(pred.grad)) > 0
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_cuda_execution(feature_variance: torch.Tensor, default_shape: tuple):
+    """Verify that the loss runs on GPU and returns a finite CUDA tensor."""
+    device = torch.device("cuda")
+    loss_fn_cuda = AMSENormalizedLoss(feature_variance=feature_variance).to(device)
+    pred = torch.randn(default_shape, device=device)
+    target = torch.randn(default_shape, device=device)
+    loss = loss_fn_cuda(pred, target)
+    assert loss.is_cuda
+    assert torch.isfinite(loss)
+
+
+def test_known_value_simple_case(feature_variance: torch.Tensor):
+    """
+    Validate loss against a known spectral case.
+
+    This test generates synthetic spectral coefficients and applies the inverse
+    spherical harmonic transform to ensure the AMSE loss produces expected values.
+    """
+    nlat, nlon = 16, 32
+    batch_size, num_channels = 1, feature_variance.shape[0]
+
+    sht_forward_temp = th.RealSHT(nlat, nlon, grid="equiangular")
+    lmax, mmax = sht_forward_temp.lmax, sht_forward_temp.mmax
+    coeffs_shape = (batch_size * num_channels, lmax, mmax)
+
+    # Place known energy in the (l=1, m=0) band
+    target_coeffs = torch.zeros(coeffs_shape, dtype=torch.complex64)
+    target_coeffs[:, 1, 0] = 1.0 + 0.0j
+    pred_coeffs = target_coeffs * 0.5
+
+    # Inverse SHT to get spatial-domain data
+    isht = th.InverseRealSHT(nlat, nlon, grid="equiangular")
+    target = isht(target_coeffs).view(batch_size, num_channels, nlat, nlon)
+    pred = isht(pred_coeffs).view(batch_size, num_channels, nlat, nlon)
+
+    # Manually compute the expected normalized spectral loss
+    psd_target_l1 = 1.0**2
+    psd_pred_l1 = 0.5**2
+    amp_error_l1 = (
+        torch.sqrt(torch.tensor(psd_pred_l1)) - torch.sqrt(torch.tensor(psd_target_l1))
+    ) ** 2
+    expected_spectral_loss_per_channel = amp_error_l1
+    expected_normalized_loss = (expected_spectral_loss_per_channel / feature_variance).mean()
+
+    # Compare to the actual loss
+    loss_fn = AMSENormalizedLoss(feature_variance=feature_variance)
+    actual_loss = loss_fn(pred, target)
+
+    assert torch.allclose(actual_loss, expected_normalized_loss, atol=1e-5)
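A quick sanity check on the expected value in test_known_value_simple_case, in exact arithmetic and ignoring the epsilon terms: because pred_coeffs is exactly 0.5 * target_coeffs, the coherence at l = 1 equals 1, the decorrelation term vanishes, and only the amplitude term survives (all other degrees carry zero power):

$$
\begin{aligned}
P_{\mathrm{tgt}}(1) &= |1|^{2} = 1, \qquad P_{\mathrm{pred}}(1) = |0.5|^{2} = 0.25, \\
C(1) &= \frac{\operatorname{Re}\bigl(0.5 \cdot \overline{1}\bigr)}{\sqrt{0.25 \cdot 1}} = \frac{0.5}{0.5} = 1, \\
D(1) &= 2\sqrt{0.25 \cdot 1}\,(1 - 1) = 0, \\
A(1) &= \bigl(\sqrt{0.25} - \sqrt{1}\bigr)^{2} = 0.25,
\end{aligned}
$$

so the expected loss is the mean over channels of 0.25 / feature_variance, which is what expected_normalized_loss computes in the test.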

0 commit comments
