Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/nisarqa/processing/stats_h5_writer/metrics_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,35 @@ def get_stats_name_descr(stat: str, component: str | None) -> tuple[str, str]:
)


def get_list_of_real_stats_names() -> list[str]:
    """
    Return the names of all statistics for real-valued datasets.

    The returned names follow NISAR conventions for the min/max/mean/std
    statistics (no real/imag component suffix).
    """
    return [
        get_stats_name_descr(stat, component=None)[0]
        for stat in ("min", "max", "mean", "std")
    ]


def get_list_of_imag_stats_names() -> list[str]:
    """
    Return the names of all statistics for complex-valued datasets.

    The returned names follow NISAR conventions for the min/max/mean/std
    statistics, with one name per ("real", "imag") component, real first.
    """
    return [
        get_stats_name_descr(stat, component=comp)[0]
        for comp in ("real", "imag")
        for stat in ("min", "max", "mean", "std")
    ]


def copy_non_insar_imagery_metrics(
product: nisarqa.NonInsarProduct, stats_h5: h5py.File
) -> None:
Expand Down
256 changes: 240 additions & 16 deletions src/nisarqa/validate/sanity_checks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations

import re
from collections.abc import Container
from collections.abc import Callable, Container
import copy
from typing import Any

import h5py
import numpy as np
Expand All @@ -12,6 +14,42 @@
objects_to_skip = nisarqa.get_all(name=__name__)


def _log_if_bad_string_value(val: str | list[str], path: str) -> bool:
    """
    Log an error if value is a known invalid string.

    Parameters
    ----------
    val : str or list of str
        Value to be checked.
    path : str
        Path to the dataset (and/or attribute) containing `val` to be used
        for logging. If `val` is the value of an attribute, suggest
        providing the dataset's path with the attribute name.

    Returns
    -------
    bad_value_found : bool
        True if `val` (or any element of `val`) is a known invalid string;
        False otherwise. (Callers branch on this to mark a check as failed,
        so the function must return an explicit bool rather than None.)
    """
    log = nisarqa.get_logger()

    # Normalize to a list so strings and lists of strings share one code path.
    # (No deep copy needed: the values are only read, never mutated.)
    values = [val] if isinstance(val, str) else val

    # Known placeholder/sentinel strings that indicate an unpopulated field.
    # Comparison is case-insensitive via `.upper()` (e.g. "None" -> "NONE").
    bad_values = (
        "",
        "0",
        "['0']",
        "['']",
        "['' '' '' '' '']",
        "NONE",
        "(NOT SPECIFIED)",
    )

    bad_value_found = False
    for v in values:
        if v.upper() in bad_values:
            log.error(
                f"Value is {val!r}, which is not valid for nominal NISAR data."
                f" Path: {path}"
            )
            bad_value_found = True

    return bad_value_found


def dataset_sanity_checks(product: nisarqa.NisarProduct) -> None:
"""
Perform a series of verification checks on the input product's datasets.
Expand All @@ -23,12 +61,210 @@ def dataset_sanity_checks(product: nisarqa.NisarProduct) -> None:
"""
with h5py.File(product.filepath, "r") as f:

check_metadata_conventions(h5_file=f)

identification_sanity_checks(
id_group=f[product.identification_path],
product_type=product.product_type,
)


def check_metadata_conventions(h5_file: h5py.File) -> None:
    """
    Check that all datasets and attributes meet certain NISAR conventions.

    Iterate through an HDF5 file to validate that all groups and datasets,
    including their attributes, meet certain NISAR conventions:
    1) populated (not empty)
    2) if string, that they are not variable-length strings and that
       they are not populated with known placeholder values.
    3) if an attribute is numeric and is in given set of names,
       that its dtype corresponds to its dataset's dtype.

    Any issues discovered are logged as errors.

    Parameters
    ----------
    h5_file : h5py.File
        The opened HDF5 file object to be inspected.

    Notes
    -----
    This function is general for all NISAR product types. It does not have
    special handling for specific datasets in specific products.

    This function does not compare dtypes against the dtypes denoted in the XML
    product specifications. For that functionality, please use the XML Checker.
    """
    log = nisarqa.get_logger()

    # Construct list of attributes whose dtypes should exactly-match the
    # dtype of the dataset/group they are attached to.
    exact_dtype_match = nisarqa.get_list_of_real_stats_names()
    exact_dtype_match += ["_FillValue", "valid_min", "valid_max"]

    # Construct list of attributes whose dtypes should be half-precision of the
    # dtype of the (complex-valued) dataset they are attached to.
    half_precision_match = nisarqa.get_list_of_imag_stats_names()

    def _validate_string_logic(
        name: str,
        dtype_: h5py.Datatype,
        value_provider: Callable[[], Any],
        label: str,
    ) -> None:
        """
        Unified logic to validate HDF5 string types and content.

        Parameters
        ----------
        name : str
            Path or name of the object.
        dtype_ : h5py.Datatype
            The HDF5 datatype to check.
        value_provider : callable
            A function/lambda that returns the actual value when called.
            Used to avoid reading data unless value has a string type.
        label : str
            Context label for logging (e.g., "Dataset" or "Attribute").
        """
        string_info = h5py.check_string_dtype(dtype_)
        if string_info is None:
            # object is not a string dtype. (It could be int, float, etc.)
            return

        if string_info.length is None:
            # NISAR convention: strings must be fixed-length.
            log.error(
                f"{label} is variable-length string; should be fixed-length."
                f" Path: {name}"
            )
        elif string_info.length == 0:
            log.error(f"{label} is an empty string. Path: {name}")
        else:
            # ONLY for strings do we read the value to check content.
            # Fixed-length strings are usually small metadata fields.
            raw_val = value_provider()
            ds_val = nisarqa.byte_string_to_python_str(raw_val)

            # A scalar becomes a single-element list; an array of strings
            # is iterated element-by-element.
            vals = [ds_val] if isinstance(ds_val, (str, bytes)) else ds_val
            for val in vals:
                _log_if_bad_string_value(val=val, path=name)

    def _check_attributes(item_name: str, item: h5py.HLObject) -> None:
        """Check all attributes of a specific HDF5 object."""
        for attr_name, attr_val in item.attrs.items():
            if attr_val is None or isinstance(attr_val, h5py.Empty):
                log.error(
                    f"Attribute '{attr_name}' is empty. Path: {item_name}"
                )
                continue

            attr_id = item.attrs.get_id(attr_name)

            if h5py.check_string_dtype(attr_id.dtype) is not None:
                # String attribute: validate string type and content.
                # Bind `attr_val` as a default arg so the lambda does not
                # late-bind to the loop variable.
                _validate_string_logic(
                    name=f"{item_name} -> {attr_name}",
                    dtype_=attr_id.dtype,
                    value_provider=lambda v=attr_val: v,
                    label="Attribute",
                )
                continue

            # Attribute is not a string dtype. (It could be int, float, etc.)
            # Validate that the dtype of the attribute matches the dtype
            # of the dataset.

            # The dtype comparisons below only make sense when the parent
            # object is a dataset; HDF5 groups (including the root group)
            # have no `.dtype`, so skip them to avoid an AttributeError.
            if not isinstance(item, h5py.Dataset):
                continue

            # Check attributes which should have an exact dtype match
            if attr_name in exact_dtype_match:
                if attr_id.dtype != item.dtype:
                    log.error(
                        f"Attribute has dtype {attr_id.dtype}, which does"
                        f" not match its dataset's dtype of {item.dtype}."
                        f" Path: {item_name} -> {attr_name}"
                    )

            # Check attributes which should use a half-precision dtype
            if attr_name in half_precision_match:
                is_c32 = nisarqa.is_complex32(item)
                # Short-circuit: complex32 datasets have a compound dtype,
                # so only compare against complex64 when not complex32.
                is_c64 = (not is_c32) and (item.dtype == np.complex64)

                if not (is_c32 or is_c64):
                    # Bug fix: previously this error was also logged when
                    # the dataset WAS complex and the attribute's dtype was
                    # correct (the `else` paired with the mismatch check),
                    # producing false positives on valid products. It should
                    # fire only for non-complex datasets.
                    log.error(
                        "Attribute is meant for a complex-valued dataset,"
                        " but is attached to a non-complex-valued dataset"
                        f" Path: {item_name} -> {attr_name}"
                    )
                elif (is_c32 and attr_id.dtype != np.float16) or (
                    is_c64 and attr_id.dtype != np.float32
                ):
                    log.error(
                        f"Attribute has dtype {attr_id.dtype}, which does"
                        " not match the half-precision of its dataset's"
                        f" dtype of {item.dtype}."
                        f" Path: {item_name} -> {attr_name}"
                    )

    def visitor_func(path: str) -> None:
        """Visitor function for h5py.visit."""

        # The `complex64` HDF5 object is neither a HDF5 group nor dataset, skip.
        if path.endswith("complex64"):
            return

        obj = h5_file[path]

        # 1. Always check attributes (This is safe for large datasets)
        _check_attributes(path, obj)

        # 2. Dataset-specific validation
        if isinstance(obj, h5py.Dataset):

            # 2a. For all datasets (numeric, string, etc.) check if the dataset
            # was populated with some value. (aka not an empty/null dataset)

            # Check if dataset is a 'null' space (Empty) without reading data.
            # This occurs when a dataset is written with a Python value of
            # `None` (there could be other causes). h5py Datasets with no data
            # have a shape of None or use the Empty class.
            if obj.shape is None:
                msg = f"Dataset has a null (Empty) space. Dataset: {obj.name}"
                log.error(msg)
                return

            # Check if storage was allocated (0 bytes means uninitialized/empty)
            if obj.id.get_storage_size() == 0:
                log.error(
                    "Dataset has no allocated storage (empty)."
                    f" Dataset: {obj.name}"
                )
                return

            # 2b. String Type/Content Check
            _validate_string_logic(
                name=obj.name,
                dtype_=obj.dtype,
                # Only called if dtype is string
                value_provider=lambda: obj[()],
                label="Dataset",
            )

            # 2c. Numeric Type/Content Check
            # Numeric datasets will need to be individually validated
            # via other sections in QA (XML Checker, qa_reports, Metadata LUT
            # checks, etc.)

    # Check root, then visit every other group/dataset in the file.
    _check_attributes("/", h5_file)
    h5_file.visit(visitor_func)


def identification_sanity_checks(
id_group: h5py.Group, product_type: str
) -> None:
Expand Down Expand Up @@ -443,25 +679,13 @@ def _verify_data_is_in_list(
if _dataset_exists(ds_name):
data = _get_string_dataset(ds_name=ds_name)
if data is not None:
# TODO: Use a regex for more flexible pattern matching.
if data in (
"",
"0",
"['0']",
"['']",
"['' '' '' '' '']",
"None",
"(NOT SPECIFIED)",
):
log.error(
f"Dataset value is {data!r}, which is not a valid value."
f" Dataset: {_full_path(ds_name)}"
)
ds_full_path = _full_path(ds_name)
if _log_if_bad_string_value(val=data, path=ds_full_path):
passes = False
else:
log.warning(
f"Dataset value is {data!r}, but it has not be automatically"
f" verified during checks. Dataset: {_full_path(ds_name)}"
f" verified during checks. Dataset: {ds_full_path}"
)
else:
passes = False
Expand Down