Skip to content

Commit dbcd074

Browse files
authored
Implement missing value normalisation and standardisation (#162)
* Implement missing value normalization and standardization in CMIP6Vocabulary; add unit tests for functionality * Pre-commit * Fix trailing spaces
1 parent 136911d commit dbcd074

File tree

4 files changed

+552
-6
lines changed

4 files changed

+552
-6
lines changed

src/access_moppy/base.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,9 @@ def _preprocess(ds):
397397
self.ds = self.chunker.rechunk_dataset(self.ds)
398398
print("✅ Dataset rechunking completed")
399399

400+
# Normalize missing values to NaN early for consistent processing
401+
self._normalize_missing_values_early()
402+
400403
def _ensure_numeric_time_coordinates(self, ds: xr.Dataset) -> xr.Dataset:
401404
"""
402405
Convert cftime objects in time-related coordinates to numeric values.
@@ -550,6 +553,73 @@ def drop_intermediates(self):
550553
if var in self.ds.data_vars and var != self.cmor_name:
551554
self.ds = self.ds.drop_vars(var)
552555

556+
def _normalize_missing_values_early(self):
557+
"""
558+
Normalize missing values to NaN early in the processing pipeline.
559+
560+
This enables XArray's built-in missing value handling to work correctly
561+
during derivation calculations, eliminating the need for custom safe
562+
arithmetic operations.
563+
"""
564+
try:
565+
from access_moppy.vocabulary_processors import CMIP6Vocabulary
566+
567+
print("🔧 Normalizing missing values to NaN for consistent processing...")
568+
569+
# Use the static method to normalize the entire dataset
570+
self.ds = CMIP6Vocabulary.normalize_dataset_missing_values(self.ds)
571+
572+
print(
573+
"✅ Missing values normalized to NaN - XArray will handle propagation correctly"
574+
)
575+
except ImportError:
576+
print("⚠️ Could not import CMIP6Vocabulary for missing value normalization")
577+
except Exception as e:
578+
print(f"⚠️ Warning: Could not normalize missing values early: {e}")
579+
580+
def standardize_missing_values(self):
581+
"""
582+
Standardize missing values in the main variable to CMIP6 requirements.
583+
584+
At this point, missing values should already be normalized to NaN from
585+
early processing, and XArray's built-in missing value propagation should
586+
have handled derivation calculations correctly. This method converts NaN
587+
to the final CMIP6-compliant missing value.
588+
589+
This is particularly important for:
590+
- Final CMIP6 compliance (converting NaN to 1e20)
591+
- Ensuring consistent metadata attributes
592+
"""
593+
if (
594+
hasattr(self, "vocab")
595+
and self.vocab
596+
and self.cmor_name in self.ds.data_vars
597+
):
598+
print(
599+
f"🔧 Applying final CMIP6 missing value standardization for {self.cmor_name}..."
600+
)
601+
602+
# Get the main data variable
603+
data_var = self.ds[self.cmor_name]
604+
605+
# At this point, data should have NaN for missing values
606+
# Convert only NaN to CMIP6 standard (don't convert other values)
607+
standardized_var = self.vocab.standardize_missing_values(
608+
data_var,
609+
convert_existing=False, # Only convert NaN, preserve other values
610+
)
611+
612+
# Update the dataset with the standardized variable
613+
self.ds[self.cmor_name] = standardized_var
614+
615+
# Report the standardization
616+
missing_value = self.vocab.get_cmip_missing_value()
617+
print(f"✅ Final CMIP6 missing value applied: {missing_value}")
618+
else:
619+
print(
620+
f"⚠️ Cannot standardize missing values for {self.cmor_name}: vocabulary not available"
621+
)
622+
553623
def update_attributes(self):
554624
raise NotImplementedError("Subclasses must implement update_attributes.")
555625

@@ -839,6 +909,8 @@ def estimate_data_size(ds, cmor_name):
839909
def run(self, write_output: bool = False):
840910
self.select_and_process_variables()
841911
self.drop_intermediates()
912+
# Standardize missing values to CMIP6 requirements after processing
913+
self.standardize_missing_values()
842914
self.update_attributes()
843915
self.reorder()
844916
# Final rechunking before writing for optimal I/O performance

src/access_moppy/derivations/calc_utils.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import click
3333
import numpy as np
3434
import xarray as xr
35-
from mopdb.utils import MopException
3635

3736
# Global Variables
3837
# ----------------------------------------------------------------------
@@ -94,11 +93,11 @@ def time_resample(ctx, var, rfrq, tdim, sample="down", stats="mean"):
9493
"""
9594
var_log = logging.getLogger(ctx.obj["var_log"])
9695
if not isinstance(var, xr.DataArray):
97-
raise MopException("'var' must be a valid Xarray DataArray")
96+
raise ValueError("'var' must be a valid Xarray DataArray")
9897
valid_stats = ["mean", "min", "max", "sum"]
9998
if stats not in valid_stats:
10099
var_log.error(f"Resample unrecognised stats {stats}")
101-
raise MopException(f"{stats} not in valid list: {valid_stats}.")
100+
raise ValueError(f"{stats} not in valid list: {valid_stats}.")
102101
offset = {
103102
"30m": [15, "min"],
104103
"h": [30, "min"],
@@ -123,16 +122,16 @@ def time_resample(ctx, var, rfrq, tdim, sample="down", stats="mean"):
123122
)
124123
except Exception as e:
125124
var_log.error(f"Resample error: {e}")
126-
raise MopException(f"{e}")
125+
raise ValueError(f"{e}")
127126
elif sample == "up":
128127
try:
129128
vout = var.resample({tdim: rfrq}).interpolate("linear")
130129
except Exception as e:
131130
var_log.error(f"Resample error: {e}")
132-
raise MopException(f"{e}")
131+
raise ValueError(f"{e}")
133132
else:
134133
var_log.error("Resample can only be up or down")
135-
raise MopException("Sample is expected to be up or down")
134+
raise ValueError("Sample is expected to be up or down")
136135
return vout
137136

138137

src/access_moppy/vocabulary_processors.py

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from importlib.resources import as_file, files
66
from typing import Any, Dict, List, Optional
77

8+
import numpy as np
9+
810
from access_moppy import _creator
911

1012

@@ -287,6 +289,233 @@ def get_variant_components(self) -> Dict[str, int]:
287289
raise ValueError(f"Invalid variant_label format: {self.variant_label}")
288290
return {k: int(v) for k, v in match.groupdict().items()}
289291

292+
def get_cmip_missing_value(self) -> float:
293+
"""
294+
Get the CMIP6-compliant missing value for this variable.
295+
296+
Returns the missing value as specified in the CMOR table for this variable,
297+
with fallback to table default or global default.
298+
299+
Returns:
300+
float: The CMIP6-compliant missing value
301+
"""
302+
# Check if variable has specific missing value
303+
if "missing_value" in self.variable:
304+
return float(self.variable["missing_value"])
305+
306+
# Check variable type and use appropriate table default
307+
var_type = self.variable.get("type", "real")
308+
if var_type == "integer":
309+
# Use integer missing value from table header
310+
return float(self.cmip_table["Header"].get("int_missing_value", -999))
311+
else:
312+
# Use real missing value from table header
313+
return float(self.cmip_table["Header"].get("missing_value", 1e20))
314+
315+
def get_cmip_fill_value(self) -> float:
316+
"""
317+
Get the CMIP6-compliant _FillValue for this variable.
318+
319+
For CMIP6, _FillValue should be the same as missing_value.
320+
321+
Returns:
322+
float: The CMIP6-compliant _FillValue
323+
"""
324+
return self.get_cmip_missing_value()
325+
326+
def normalize_missing_values_to_nan(self, data_array):
327+
"""
328+
Normalize various missing value representations to NaN for consistent processing.
329+
330+
This method converts different missing value conventions (e.g., -999, -1e20)
331+
to NaN, enabling XArray's built-in missing value handling to work properly
332+
during derivation calculations.
333+
334+
Parameters:
335+
data_array: xarray.DataArray
336+
The data array to normalize
337+
338+
Returns:
339+
xarray.DataArray: Data array with missing values converted to NaN
340+
"""
341+
# Create a shallow copy to preserve lazy evaluation
342+
result = data_array.copy(deep=False)
343+
344+
# Get current missing/fill values from attributes
345+
current_missing = data_array.attrs.get("missing_value")
346+
current_fill = data_array.attrs.get("_FillValue")
347+
348+
# Build conditions for values that should become NaN
349+
nan_conditions = []
350+
351+
# Check for current missing_value
352+
if current_missing is not None:
353+
try:
354+
current_missing = float(current_missing)
355+
if not np.isnan(current_missing): # Don't double-convert NaN
356+
nan_conditions.append(result == current_missing)
357+
except (ValueError, TypeError):
358+
pass
359+
360+
# Check for current _FillValue
361+
if current_fill is not None:
362+
try:
363+
current_fill = float(current_fill)
364+
if not np.isnan(current_fill): # Don't double-convert NaN
365+
nan_conditions.append(result == current_fill)
366+
except (ValueError, TypeError):
367+
pass
368+
369+
# Apply conversions using lazy operations
370+
if nan_conditions:
371+
combined_mask = nan_conditions[0]
372+
for condition in nan_conditions[1:]:
373+
combined_mask = combined_mask | condition
374+
375+
# Convert to NaN using xarray.where (preserves lazy evaluation)
376+
result = result.where(~combined_mask, np.nan)
377+
378+
# Update attributes to reflect NaN as the missing value
379+
result.attrs["missing_value"] = np.nan
380+
result.attrs["_FillValue"] = np.nan
381+
382+
return result
383+
384+
@staticmethod
385+
def normalize_dataset_missing_values(dataset):
386+
"""
387+
Normalize missing values to NaN across all data variables in a dataset.
388+
389+
This static method can be used to normalize missing values early in the
390+
processing pipeline, before any derivation calculations are performed.
391+
This enables XArray's built-in missing value propagation to handle
392+
everything correctly.
393+
394+
Parameters:
395+
dataset: xarray.Dataset
396+
The dataset to normalize
397+
398+
Returns:
399+
xarray.Dataset: Dataset with all missing values converted to NaN
400+
"""
401+
# Create a shallow copy to preserve lazy evaluation
402+
result = dataset.copy(deep=False)
403+
404+
for var_name in result.data_vars:
405+
var = result[var_name]
406+
407+
# Get current missing/fill values from attributes
408+
current_missing = var.attrs.get("missing_value")
409+
current_fill = var.attrs.get("_FillValue")
410+
411+
# Build conditions for values that should become NaN
412+
nan_conditions = []
413+
414+
# Check for current missing_value
415+
if current_missing is not None:
416+
try:
417+
current_missing = float(current_missing)
418+
if not np.isnan(current_missing): # Don't double-convert NaN
419+
nan_conditions.append(var == current_missing)
420+
except (ValueError, TypeError):
421+
pass
422+
423+
# Check for current _FillValue
424+
if current_fill is not None:
425+
try:
426+
current_fill = float(current_fill)
427+
if not np.isnan(current_fill): # Don't double-convert NaN
428+
nan_conditions.append(var == current_fill)
429+
except (ValueError, TypeError):
430+
pass
431+
432+
# Apply conversions using lazy operations
433+
if nan_conditions:
434+
combined_mask = nan_conditions[0]
435+
for condition in nan_conditions[1:]:
436+
combined_mask = combined_mask | condition
437+
438+
# Convert to NaN using xarray.where (preserves lazy evaluation)
439+
result[var_name] = var.where(~combined_mask, np.nan)
440+
441+
# Update attributes to reflect NaN as the missing value
442+
result[var_name].attrs["missing_value"] = np.nan
443+
result[var_name].attrs["_FillValue"] = np.nan
444+
445+
return result
446+
447+
def standardize_missing_values(self, data_array, convert_existing: bool = True):
448+
"""
449+
Standardize missing values in a data array to CMIP6 requirements.
450+
451+
This method ensures that:
452+
1. All missing/NaN values use the CMIP6-specified missing value
453+
2. Data with different missing values from derived calculations are standardized
454+
3. Attributes are updated with correct missing_value and _FillValue
455+
4. Lazy evaluation is preserved for dask arrays
456+
457+
Parameters:
458+
data_array: xarray.DataArray
459+
The data array to standardize
460+
convert_existing: bool
461+
If True, convert existing missing values to CMIP6 standard.
462+
If False, only standardize NaN values and update attributes.
463+
464+
Returns:
465+
xarray.DataArray: Data array with standardized missing values
466+
"""
467+
# Get the correct CMIP6 missing value
468+
cmip_missing_value = self.get_cmip_missing_value()
469+
cmip_fill_value = self.get_cmip_fill_value()
470+
471+
# Create a shallow copy to avoid modifying the original (preserves dask arrays)
472+
result = data_array.copy(deep=False)
473+
474+
if convert_existing:
475+
# Get current missing/fill values from attributes
476+
current_missing = data_array.attrs.get("missing_value")
477+
current_fill = data_array.attrs.get("_FillValue")
478+
479+
# Build conditions for missing values using xarray operations (lazy)
480+
missing_conditions = []
481+
482+
# Check for NaN values
483+
missing_conditions.append(np.isnan(result))
484+
485+
# Check for current missing_value
486+
if current_missing is not None:
487+
try:
488+
current_missing = float(current_missing)
489+
missing_conditions.append(result == current_missing)
490+
except (ValueError, TypeError):
491+
pass
492+
493+
# Check for current _FillValue
494+
if current_fill is not None:
495+
try:
496+
current_fill = float(current_fill)
497+
missing_conditions.append(result == current_fill)
498+
except (ValueError, TypeError):
499+
pass
500+
501+
# Combine all missing value conditions (this stays lazy with dask)
502+
if missing_conditions:
503+
combined_mask = missing_conditions[0]
504+
for condition in missing_conditions[1:]:
505+
combined_mask = combined_mask | condition
506+
507+
# Use xarray.where to preserve lazy evaluation
508+
result = result.where(~combined_mask, cmip_missing_value)
509+
else:
510+
# Only convert NaN values to CMIP6 missing value (lazy operation)
511+
result = result.where(~np.isnan(result), cmip_missing_value)
512+
513+
# Update attributes with correct CMIP6 values (this doesn't affect lazy evaluation)
514+
result.attrs["missing_value"] = cmip_missing_value
515+
result.attrs["_FillValue"] = cmip_fill_value
516+
517+
return result
518+
290519
def _get_external_variables(self) -> Optional[str]:
291520
"""
292521
Derive the list of external variables required for this CMOR variable.

0 commit comments

Comments
 (0)