Implement missing value normalisation and standardisation (#162)

rbeucher · web-flow · commit dbcd0743e508 · 2026-01-14T14:08:07.000+10:00
* Implement missing value normalization and standardization in CMIP6Vocabulary; add unit tests for functionality

* Pre-commit

* Fix trailing spaces
diff --git a/src/access_moppy/base.py b/src/access_moppy/base.py
@@ -397,6 +397,9 @@ def _preprocess(ds):
             self.ds = self.chunker.rechunk_dataset(self.ds)
             print("✅ Dataset rechunking completed")
 
+        # Normalize missing values to NaN early for consistent processing
+        self._normalize_missing_values_early()
+
     def _ensure_numeric_time_coordinates(self, ds: xr.Dataset) -> xr.Dataset:
         """
         Convert cftime objects in time-related coordinates to numeric values.
@@ -550,6 +553,73 @@ def drop_intermediates(self):
             if var in self.ds.data_vars and var != self.cmor_name:
                 self.ds = self.ds.drop_vars(var)
 
+    def _normalize_missing_values_early(self):
+        """
+        Replace sentinel missing values in ``self.ds`` with NaN (best effort).
+
+        Delegates to ``CMIP6Vocabulary.normalize_dataset_missing_values`` so that
+        later derivations can rely on NaN propagation. Import or runtime failures
+        are only printed as warnings; ``self.ds`` is not reassigned in that case.
+        """
+        try:
+            from access_moppy.vocabulary_processors import CMIP6Vocabulary
+
+            print("🔧 Normalizing missing values to NaN for consistent processing...")
+
+            # Static method: no vocabulary instance is needed for this step
+            self.ds = CMIP6Vocabulary.normalize_dataset_missing_values(self.ds)
+
+            print(
+                "✅ Missing values normalized to NaN - XArray will handle propagation correctly"
+            )
+        except ImportError:
+            print("⚠️  Could not import CMIP6Vocabulary for missing value normalization")
+        except Exception as e:
+            print(f"⚠️  Warning: Could not normalize missing values early: {e}")
+
+    def standardize_missing_values(self):
+        """
+        Replace NaN in the main variable with the vocabulary's missing value.
+
+        Missing data is expected to already be NaN at this point (see
+        ``_normalize_missing_values_early``), so ``convert_existing=False`` is
+        passed and only NaN is replaced. The value reported afterwards comes
+        from ``self.vocab.get_cmip_missing_value()``.
+
+        If ``self.vocab`` is absent/falsy or ``self.cmor_name`` is not a data
+        variable of ``self.ds``, nothing is changed and a warning is printed.
+        NOTE(review): that warning blames the vocabulary in both situations.
+        """
+        if (
+            hasattr(self, "vocab")
+            and self.vocab
+            and self.cmor_name in self.ds.data_vars
+        ):
+            print(
+                f"🔧 Applying final CMIP6 missing value standardization for {self.cmor_name}..."
+            )
+
+            # Get the main data variable
+            data_var = self.ds[self.cmor_name]
+
+            # At this point, data should have NaN for missing values
+            # Convert only NaN to CMIP6 standard (don't convert other values)
+            standardized_var = self.vocab.standardize_missing_values(
+                data_var,
+                convert_existing=False,  # Only convert NaN, preserve other values
+            )
+
+            # Update the dataset with the standardized variable
+            self.ds[self.cmor_name] = standardized_var
+
+            # Report the standardization
+            missing_value = self.vocab.get_cmip_missing_value()
+            print(f"✅ Final CMIP6 missing value applied: {missing_value}")
+        else:
+            print(
+                f"⚠️  Cannot standardize missing values for {self.cmor_name}: vocabulary not available"
+            )
+
     def update_attributes(self):
         raise NotImplementedError("Subclasses must implement update_attributes.")
 
@@ -839,6 +909,8 @@ def estimate_data_size(ds, cmor_name):
     def run(self, write_output: bool = False):
         self.select_and_process_variables()
         self.drop_intermediates()
+        # Standardize missing values to CMIP6 requirements after processing
+        self.standardize_missing_values()
         self.update_attributes()
         self.reorder()
         # Final rechunking before writing for optimal I/O performance
diff --git a/src/access_moppy/derivations/calc_utils.py b/src/access_moppy/derivations/calc_utils.py
@@ -32,7 +32,6 @@
 import click
 import numpy as np
 import xarray as xr
-from mopdb.utils import MopException
 
 # Global Variables
 # ----------------------------------------------------------------------
@@ -94,11 +93,11 @@ def time_resample(ctx, var, rfrq, tdim, sample="down", stats="mean"):
     """
     var_log = logging.getLogger(ctx.obj["var_log"])
     if not isinstance(var, xr.DataArray):
-        raise MopException("'var' must be a valid Xarray DataArray")
+        raise ValueError("'var' must be a valid Xarray DataArray")
     valid_stats = ["mean", "min", "max", "sum"]
     if stats not in valid_stats:
         var_log.error(f"Resample unrecognised stats {stats}")
-        raise MopException(f"{stats} not in valid list: {valid_stats}.")
+        raise ValueError(f"{stats} not in valid list: {valid_stats}.")
     offset = {
         "30m": [15, "min"],
         "h": [30, "min"],
@@ -123,16 +122,16 @@ def time_resample(ctx, var, rfrq, tdim, sample="down", stats="mean"):
             )
         except Exception as e:
             var_log.error(f"Resample error: {e}")
-            raise MopException(f"{e}")
+            raise ValueError(f"{e}")
     elif sample == "up":
         try:
             vout = var.resample({tdim: rfrq}).interpolate("linear")
         except Exception as e:
             var_log.error(f"Resample error: {e}")
-            raise MopException(f"{e}")
+            raise ValueError(f"{e}")
     else:
         var_log.error("Resample can only be up or down")
-        raise MopException("Sample is expected to be up or down")
+        raise ValueError("Sample is expected to be up or down")
     return vout
 
 
diff --git a/src/access_moppy/vocabulary_processors.py b/src/access_moppy/vocabulary_processors.py
@@ -5,6 +5,8 @@
 from importlib.resources import as_file, files
 from typing import Any, Dict, List, Optional
 
+import numpy as np
+
 from access_moppy import _creator
 
 
@@ -287,6 +289,233 @@ def get_variant_components(self) -> Dict[str, int]:
             raise ValueError(f"Invalid variant_label format: {self.variant_label}")
         return {k: int(v) for k, v in match.groupdict().items()}
 
+    def get_cmip_missing_value(self) -> float:
+        """
+        Return the CMIP6 missing value for this variable as a float.
+
+        Prefers the variable's own ``missing_value`` entry; otherwise reads the table
+        Header value for its type, defaulting to -999 (integer) or 1e20 (other types).
+
+        Returns:
+            float: The missing value (integer sentinels are returned as float too)
+        """
+        # Check if variable has specific missing value
+        if "missing_value" in self.variable:
+            return float(self.variable["missing_value"])
+
+        # Check variable type and use appropriate table default
+        var_type = self.variable.get("type", "real")
+        if var_type == "integer":
+            # Use integer missing value from table header
+            return float(self.cmip_table["Header"].get("int_missing_value", -999))
+        else:
+            # Use real missing value from table header
+            return float(self.cmip_table["Header"].get("missing_value", 1e20))
+
+    def get_cmip_fill_value(self) -> float:
+        """
+        Return the ``_FillValue`` to use for this variable.
+
+        Delegates to ``get_cmip_missing_value`` so both attributes always agree.
+
+        Returns:
+            float: Same value as ``get_cmip_missing_value()``
+        """
+        return self.get_cmip_missing_value()
+
+    def normalize_missing_values_to_nan(self, data_array):
+        """
+        Replace the array's declared missing/fill sentinels with NaN.
+
+        Only values equal to the ``missing_value`` or ``_FillValue`` attribute are
+        masked; attributes that cannot be cast to float, or are NaN, are skipped.
+        Both attributes are then set to NaN on the result, even if nothing was masked.
+
+        Parameters:
+            data_array: xarray.DataArray
+                The data array to normalize
+
+        Returns:
+            xarray.DataArray: Data array with missing values converted to NaN
+        """
+        # Create a shallow copy to preserve lazy evaluation
+        result = data_array.copy(deep=False)
+
+        # Get current missing/fill values from attributes
+        current_missing = data_array.attrs.get("missing_value")
+        current_fill = data_array.attrs.get("_FillValue")
+
+        # Build conditions for values that should become NaN
+        nan_conditions = []
+
+        # Check for current missing_value
+        if current_missing is not None:
+            try:
+                current_missing = float(current_missing)
+                if not np.isnan(current_missing):  # Don't double-convert NaN
+                    nan_conditions.append(result == current_missing)
+            except (ValueError, TypeError):
+                pass
+
+        # Check for current _FillValue
+        if current_fill is not None:
+            try:
+                current_fill = float(current_fill)
+                if not np.isnan(current_fill):  # Don't double-convert NaN
+                    nan_conditions.append(result == current_fill)
+            except (ValueError, TypeError):
+                pass
+
+        # Apply conversions using lazy operations
+        if nan_conditions:
+            combined_mask = nan_conditions[0]
+            for condition in nan_conditions[1:]:
+                combined_mask = combined_mask | condition
+
+            # Convert to NaN using xarray.where (preserves lazy evaluation)
+            result = result.where(~combined_mask, np.nan)
+
+        # Update attributes to reflect NaN as the missing value
+        result.attrs["missing_value"] = np.nan
+        result.attrs["_FillValue"] = np.nan
+
+        return result
+
+    @staticmethod
+    def normalize_dataset_missing_values(dataset):
+        """
+        Replace declared missing/fill sentinels with NaN in every data variable.
+
+        For each data variable, values equal to its ``missing_value`` or ``_FillValue``
+        attribute become NaN and both attributes are reset to NaN. Variables with no
+        usable (float-castable, non-NaN) sentinel attribute are left untouched,
+        including their attributes. No instance state is read, hence a staticmethod.
+
+        Parameters:
+            dataset: xarray.Dataset
+                The dataset to normalize
+
+        Returns:
+            xarray.Dataset: Dataset with all missing values converted to NaN
+        """
+        # Create a shallow copy to preserve lazy evaluation
+        result = dataset.copy(deep=False)
+
+        for var_name in result.data_vars:
+            var = result[var_name]
+
+            # Get current missing/fill values from attributes
+            current_missing = var.attrs.get("missing_value")
+            current_fill = var.attrs.get("_FillValue")
+
+            # Build conditions for values that should become NaN
+            nan_conditions = []
+
+            # Check for current missing_value
+            if current_missing is not None:
+                try:
+                    current_missing = float(current_missing)
+                    if not np.isnan(current_missing):  # Don't double-convert NaN
+                        nan_conditions.append(var == current_missing)
+                except (ValueError, TypeError):
+                    pass
+
+            # Check for current _FillValue
+            if current_fill is not None:
+                try:
+                    current_fill = float(current_fill)
+                    if not np.isnan(current_fill):  # Don't double-convert NaN
+                        nan_conditions.append(var == current_fill)
+                except (ValueError, TypeError):
+                    pass
+
+            # Apply conversions using lazy operations
+            if nan_conditions:
+                combined_mask = nan_conditions[0]
+                for condition in nan_conditions[1:]:
+                    combined_mask = combined_mask | condition
+
+                # Convert to NaN using xarray.where (preserves lazy evaluation)
+                result[var_name] = var.where(~combined_mask, np.nan)
+
+                # Update attributes to reflect NaN as the missing value
+                result[var_name].attrs["missing_value"] = np.nan
+                result[var_name].attrs["_FillValue"] = np.nan
+
+        return result
+
+    def standardize_missing_values(self, data_array, convert_existing: bool = True):
+        """
+        Replace missing values with the CMIP6 missing value and update attributes.
+
+        What gets replaced depends on ``convert_existing``:
+        - True: NaN plus any value equal to the array's current ``missing_value`` or
+          ``_FillValue`` attribute (attributes that cannot be cast to float are ignored)
+        - False: NaN only
+        Either way, ``missing_value``/``_FillValue`` are set from this vocabulary.
+
+        Parameters:
+            data_array: xarray.DataArray
+                The data array to standardize
+            convert_existing: bool
+                If True, convert existing missing values to CMIP6 standard.
+                If False, only standardize NaN values and update attributes.
+
+        Returns:
+            xarray.DataArray: Data array with standardized missing values
+        """
+        # Get the correct CMIP6 missing value
+        cmip_missing_value = self.get_cmip_missing_value()
+        cmip_fill_value = self.get_cmip_fill_value()
+
+        # Create a shallow copy to avoid modifying the original (preserves dask arrays)
+        result = data_array.copy(deep=False)
+
+        if convert_existing:
+            # Get current missing/fill values from attributes
+            current_missing = data_array.attrs.get("missing_value")
+            current_fill = data_array.attrs.get("_FillValue")
+
+            # Build conditions for missing values using xarray operations (lazy)
+            missing_conditions = []
+
+            # Check for NaN values
+            missing_conditions.append(np.isnan(result))
+
+            # Check for current missing_value
+            if current_missing is not None:
+                try:
+                    current_missing = float(current_missing)
+                    missing_conditions.append(result == current_missing)
+                except (ValueError, TypeError):
+                    pass
+
+            # Check for current _FillValue
+            if current_fill is not None:
+                try:
+                    current_fill = float(current_fill)
+                    missing_conditions.append(result == current_fill)
+                except (ValueError, TypeError):
+                    pass
+
+            # Combine the conditions; never empty, the NaN check is always present
+            if missing_conditions:
+                combined_mask = missing_conditions[0]
+                for condition in missing_conditions[1:]:
+                    combined_mask = combined_mask | condition
+
+                # Use xarray.where to preserve lazy evaluation
+                result = result.where(~combined_mask, cmip_missing_value)
+        else:
+            # Only convert NaN values to CMIP6 missing value (lazy operation)
+            result = result.where(~np.isnan(result), cmip_missing_value)
+
+        # Update attributes with correct CMIP6 values (this doesn't affect lazy evaluation)
+        result.attrs["missing_value"] = cmip_missing_value
+        result.attrs["_FillValue"] = cmip_fill_value
+
+        return result
+
     def _get_external_variables(self) -> Optional[str]:
         """
         Derive the list of external variables required for this CMOR variable.
diff --git a/tests/unit/test_missing_values.py b/tests/unit/test_missing_values.py