|
5 | 5 | from importlib.resources import as_file, files |
6 | 6 | from typing import Any, Dict, List, Optional |
7 | 7 |
|
| 8 | +import numpy as np |
| 9 | + |
8 | 10 | from access_moppy import _creator |
9 | 11 |
|
10 | 12 |
|
@@ -287,6 +289,233 @@ def get_variant_components(self) -> Dict[str, int]: |
287 | 289 | raise ValueError(f"Invalid variant_label format: {self.variant_label}") |
288 | 290 | return {k: int(v) for k, v in match.groupdict().items()} |
289 | 291 |
|
def get_cmip_missing_value(self) -> float:
    """
    Return the missing value that this variable should carry in CMIP output.

    Lookup order:
      1. the ``missing_value`` entry of ``self.variable``, when that key exists;
      2. ``int_missing_value`` from the table ``Header`` (fallback ``-999``)
         when the variable's ``type`` is ``"integer"``;
      3. ``missing_value`` from the table ``Header`` (fallback ``1e20``) for
         any other ``type`` (a variable without ``type`` is treated as ``"real"``).

    Returns:
        float: The selected missing value, always passed through ``float()``.
    """
    variable = self.variable

    # A per-variable override wins over any table-wide default.
    if "missing_value" in variable:
        return float(variable["missing_value"])

    # Pick which header entry (and which hard-coded fallback) applies.
    if variable.get("type", "real") == "integer":
        header_key, fallback = "int_missing_value", -999
    else:
        header_key, fallback = "missing_value", 1e20

    return float(self.cmip_table["Header"].get(header_key, fallback))
| 314 | + |
def get_cmip_fill_value(self) -> float:
    """
    Return the ``_FillValue`` to use for this variable.

    No separate lookup is performed: the value is whatever
    ``get_cmip_missing_value()`` returns, so ``_FillValue`` and
    ``missing_value`` always agree.

    Returns:
        float: Same value as ``get_cmip_missing_value()``.
    """
    fill_value = self.get_cmip_missing_value()
    return fill_value
| 325 | + |
def normalize_missing_values_to_nan(self, data_array):
    """
    Replace the sentinel values declared on ``data_array`` with NaN.

    Only the values named by the array's own ``missing_value`` and
    ``_FillValue`` attributes are converted. An attribute is ignored when it
    is absent, cannot be converted with ``float()``, or is already NaN.
    When at least one usable sentinel is found, both attributes on the
    returned array are set to NaN; otherwise the attributes are left as is.

    Parameters:
        data_array: xarray.DataArray
            The data array to normalize. It is shallow-copied, not modified.

    Returns:
        xarray.DataArray: Data array with declared sentinels replaced by NaN
    """
    # Shallow copy; the masking below uses only ==, |, ~ and .where()
    # (intended to stay lazy on dask-backed arrays).
    normalized = data_array.copy(deep=False)

    sentinel_mask = None
    for attr_name in ("missing_value", "_FillValue"):
        declared = data_array.attrs.get(attr_name)
        if declared is None:
            continue
        try:
            sentinel = float(declared)
            if np.isnan(sentinel):
                # Nothing to convert: the sentinel already is NaN.
                continue
            matches = normalized == sentinel
        except (ValueError, TypeError):
            # Unusable attribute value: skip it rather than fail.
            continue
        sentinel_mask = matches if sentinel_mask is None else sentinel_mask | matches

    if sentinel_mask is None:
        return normalized

    # Keep every element that is not a sentinel; the rest become NaN.
    normalized = normalized.where(~sentinel_mask, np.nan)
    normalized.attrs.update({"missing_value": np.nan, "_FillValue": np.nan})
    return normalized
| 383 | + |
@staticmethod
def normalize_dataset_missing_values(dataset):
    """
    Replace declared sentinel values with NaN in every data variable.

    For each entry of ``dataset.data_vars`` the variable's own
    ``missing_value`` and ``_FillValue`` attributes are read. An attribute is
    ignored when it is absent, cannot be converted with ``float()``, or is
    already NaN. Variables with at least one usable sentinel are replaced by
    a masked version whose two attributes are set to NaN; all other
    variables are left untouched.

    Parameters:
        dataset: xarray.Dataset
            The dataset to normalize. It is shallow-copied, not modified.

    Returns:
        xarray.Dataset: Dataset with declared sentinels replaced by NaN
    """
    # Shallow copy; the masking below uses only ==, |, ~ and .where()
    # (intended to stay lazy on dask-backed arrays).
    normalized = dataset.copy(deep=False)

    for name in normalized.data_vars:
        variable = normalized[name]

        sentinel_mask = None
        for attr_name in ("missing_value", "_FillValue"):
            declared = variable.attrs.get(attr_name)
            if declared is None:
                continue
            try:
                sentinel = float(declared)
                if np.isnan(sentinel):
                    # Nothing to convert: the sentinel already is NaN.
                    continue
                matches = variable == sentinel
            except (ValueError, TypeError):
                # Unusable attribute value: skip it rather than fail.
                continue
            sentinel_mask = (
                matches if sentinel_mask is None else sentinel_mask | matches
            )

        if sentinel_mask is None:
            continue

        # Keep every element that is not a sentinel; the rest become NaN.
        normalized[name] = variable.where(~sentinel_mask, np.nan)
        normalized[name].attrs.update(
            {"missing_value": np.nan, "_FillValue": np.nan}
        )

    return normalized
| 446 | + |
def standardize_missing_values(self, data_array, convert_existing: bool = True):
    """
    Rewrite missing data in ``data_array`` to this variable's CMIP value.

    NaN elements are always replaced by ``get_cmip_missing_value()``. With
    ``convert_existing=True`` the values named by the array's current
    ``missing_value`` and ``_FillValue`` attributes are replaced as well.
    An attribute is ignored when it is absent, cannot be converted with
    ``float()``, or is NaN (NaN never compares equal, and NaN elements are
    already covered). The returned array's ``missing_value`` attribute is set
    to ``get_cmip_missing_value()`` and ``_FillValue`` to
    ``get_cmip_fill_value()``.

    Parameters:
        data_array: xarray.DataArray
            The data array to standardize. It is shallow-copied, not modified.
        convert_existing: bool
            If True, also convert the sentinels declared in the attributes.
            If False, only convert NaN values and update the attributes.

    Returns:
        xarray.DataArray: Data array with standardized missing values
    """
    cmip_missing_value = self.get_cmip_missing_value()
    cmip_fill_value = self.get_cmip_fill_value()

    # Shallow copy; everything below is element-wise (==, |, ~, .where()),
    # which is intended to stay lazy on dask-backed arrays.
    result = data_array.copy(deep=False)

    # NaN always counts as missing, whatever the attributes declare.
    missing_mask = np.isnan(result)

    if convert_existing:
        for attr_name in ("missing_value", "_FillValue"):
            declared = data_array.attrs.get(attr_name)
            if declared is None:
                continue
            try:
                sentinel = float(declared)
                if np.isnan(sentinel):
                    # An ``== nan`` comparison can never match; skip the
                    # pointless mask, as the normalize_* methods do.
                    continue
                matches = result == sentinel
            except (ValueError, TypeError):
                # Unusable attribute value: skip it rather than fail.
                continue
            missing_mask = missing_mask | matches

    # Keep every valid element; everything flagged gets the CMIP value.
    result = result.where(~missing_mask, cmip_missing_value)

    result.attrs["missing_value"] = cmip_missing_value
    result.attrs["_FillValue"] = cmip_fill_value

    return result
| 518 | + |
290 | 519 | def _get_external_variables(self) -> Optional[str]: |
291 | 520 | """ |
292 | 521 | Derive the list of external variables required for this CMOR variable. |
|
0 commit comments