Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/nisarqa/processing/stats_h5_writer/metrics_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,35 @@ def get_stats_name_descr(stat: str, component: str | None) -> tuple[str, str]:
)


def get_list_of_real_stats_names() -> list[str]:
    """
    Return the names of all statistics for real-valued datasets.

    The returned names follow NISAR conventions for the min/max/mean/std
    statistics (no real/imag component suffix).
    """
    return [
        get_stats_name_descr(stat, component=None)[0]
        for stat in ("min", "max", "mean", "std")
    ]


def get_list_of_imag_stats_names() -> list[str]:
    """
    Return the names of all statistics for complex-valued datasets.

    The returned names follow NISAR conventions for the min/max/mean/std
    statistics, with one name per ("real", "imag") component, real first.
    """
    return [
        get_stats_name_descr(stat, component=comp)[0]
        for comp in ("real", "imag")
        for stat in ("min", "max", "mean", "std")
    ]


def copy_non_insar_imagery_metrics(
product: nisarqa.NonInsarProduct, stats_h5: h5py.File
) -> None:
Expand Down
256 changes: 240 additions & 16 deletions src/nisarqa/validate/sanity_checks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations

import re
from collections.abc import Container
from collections.abc import Callable, Container
import copy
from typing import Any

import h5py
import numpy as np
Expand All @@ -12,6 +14,42 @@
objects_to_skip = nisarqa.get_all(name=__name__)


def _log_if_bad_string_value(val: str | list[str], path: str) -> bool:
    """
    Log an error if value is a known invalid string.

    Parameters
    ----------
    val : str or list of str
        Value to be checked.
    path : str
        Path to the dataset (and/or attribute) containing `val` to be used
        for logging. If `val` is the value of an attribute, suggest
        providing the dataset's path with the attribute name.

    Returns
    -------
    bad_value_found : bool
        True if `val` (or any element of `val`) is a known invalid string;
        False otherwise. (Callers branch on this to mark a check as failed,
        so the function must return an explicit bool rather than None.)
    """
    log = nisarqa.get_logger()

    # Normalize to a list so strings and lists of strings share one code path.
    # (No deep copy needed: the values are only read, never mutated.)
    values = [val] if isinstance(val, str) else val

    # Known placeholder/sentinel strings that indicate an unpopulated field.
    # Comparison is case-insensitive via `.upper()` (e.g. "None" -> "NONE").
    bad_values = (
        "",
        "0",
        "['0']",
        "['']",
        "['' '' '' '' '']",
        "NONE",
        "(NOT SPECIFIED)",
    )

    bad_value_found = False
    for v in values:
        if v.upper() in bad_values:
            log.error(
                f"Value is {val!r}, which is not valid for nominal NISAR data."
                f" Path: {path}"
            )
            bad_value_found = True

    return bad_value_found


def dataset_sanity_checks(product: nisarqa.NisarProduct) -> None:
"""
Perform a series of verification checks on the input product's datasets.
Expand All @@ -23,12 +61,210 @@ def dataset_sanity_checks(product: nisarqa.NisarProduct) -> None:
"""
with h5py.File(product.filepath, "r") as f:

check_metadata_conventions(h5_file=f)

identification_sanity_checks(
id_group=f[product.identification_path],
product_type=product.product_type,
)


def check_metadata_conventions(h5_file: h5py.File) -> None:
    """
    Check that all datasets and attributes meet certain NISAR conventions.

    Iterate through an HDF5 file to validate that all groups and datasets,
    including their attributes, meet certain NISAR conventions:
    1) populated (not empty)
    2) if string, that they are not variable-length strings and that
       they are not populated with known placeholder values.
    3) if an attribute is numeric and is in given set of names,
       that its dtype corresponds to its dataset's dtype.

    Any issues discovered are logged as errors.

    Parameters
    ----------
    h5_file : h5py.File
        The opened HDF5 file object to be inspected.

    Notes
    -----
    This function is general for all NISAR product types. It does not have
    special handling for specific datasets in specific products.

    This function does not compare dtypes against the dtypes denoted in the XML
    product specifications. For that functionality, please use the XML Checker.
    """
    log = nisarqa.get_logger()

    # Construct list of attributes whose dtypes should exactly-match the
    # dtype of the dataset/group they are attached to.
    exact_dtype_match = nisarqa.get_list_of_real_stats_names()
    exact_dtype_match += ["_FillValue", "valid_min", "valid_max"]

    # Construct list of attributes whose dtypes should be half-precision of the
    # dtype of the (complex-valued) dataset they are attached to.
    half_precision_match = nisarqa.get_list_of_imag_stats_names()

    def _validate_string_logic(
        name: str,
        dtype_: h5py.Datatype,
        value_provider: Callable[[], Any],
        label: str,
    ) -> None:
        """
        Unified logic to validate HDF5 string types and content.

        Parameters
        ----------
        name : str
            Path or name of the object.
        dtype_ : h5py.Datatype
            The HDF5 datatype to check.
        value_provider : callable
            A function/lambda that returns the actual value when called.
            Used to avoid reading data unless value has a string type.
        label : str
            Context label for logging (e.g., "Dataset" or "Attribute").
        """
        string_info = h5py.check_string_dtype(dtype_)
        if string_info is None:
            # object is not a string dtype. (It could be int, float, etc.)
            return

        if string_info.length is None:
            # NISAR convention: strings must be fixed-length.
            log.error(
                f"{label} is variable-length string; should be fixed-length."
                f" Path: {name}"
            )
        elif string_info.length == 0:
            log.error(f"{label} is an empty string. Path: {name}")
        else:
            # ONLY for strings do we read the value to check content.
            # Fixed-length strings are usually small metadata fields.
            raw_val = value_provider()
            ds_val = nisarqa.byte_string_to_python_str(raw_val)

            # A scalar becomes a single-element list; an array of strings
            # is iterated element-by-element.
            vals = [ds_val] if isinstance(ds_val, (str, bytes)) else ds_val
            for val in vals:
                _log_if_bad_string_value(val=val, path=name)

    def _check_attributes(item_name: str, item: h5py.HLObject) -> None:
        """Check all attributes of a specific HDF5 object."""
        for attr_name, attr_val in item.attrs.items():
            if attr_val is None or isinstance(attr_val, h5py.Empty):
                log.error(
                    f"Attribute '{attr_name}' is empty. Path: {item_name}"
                )
                continue

            attr_id = item.attrs.get_id(attr_name)

            if h5py.check_string_dtype(attr_id.dtype) is not None:
                # String attribute: validate string type and content.
                # Bind `attr_val` as a default arg so the lambda does not
                # late-bind to the loop variable.
                _validate_string_logic(
                    name=f"{item_name} -> {attr_name}",
                    dtype_=attr_id.dtype,
                    value_provider=lambda v=attr_val: v,
                    label="Attribute",
                )
                continue

            # Attribute is not a string dtype. (It could be int, float, etc.)
            # Validate that the dtype of the attribute matches the dtype
            # of the dataset.

            # The dtype comparisons below only make sense when the parent
            # object is a dataset; HDF5 groups (including the root group)
            # have no `.dtype`, so skip them to avoid an AttributeError.
            if not isinstance(item, h5py.Dataset):
                continue

            # Check attributes which should have an exact dtype match
            if attr_name in exact_dtype_match:
                if attr_id.dtype != item.dtype:
                    log.error(
                        f"Attribute has dtype {attr_id.dtype}, which does"
                        f" not match its dataset's dtype of {item.dtype}."
                        f" Path: {item_name} -> {attr_name}"
                    )

            # Check attributes which should use a half-precision dtype
            if attr_name in half_precision_match:
                is_c32 = nisarqa.is_complex32(item)
                # Short-circuit: complex32 datasets have a compound dtype,
                # so only compare against complex64 when not complex32.
                is_c64 = (not is_c32) and (item.dtype == np.complex64)

                if not (is_c32 or is_c64):
                    # Bug fix: previously this error was also logged when
                    # the dataset WAS complex and the attribute's dtype was
                    # correct (the `else` paired with the mismatch check),
                    # producing false positives on valid products. It should
                    # fire only for non-complex datasets.
                    log.error(
                        "Attribute is meant for a complex-valued dataset,"
                        " but is attached to a non-complex-valued dataset"
                        f" Path: {item_name} -> {attr_name}"
                    )
                elif (is_c32 and attr_id.dtype != np.float16) or (
                    is_c64 and attr_id.dtype != np.float32
                ):
                    log.error(
                        f"Attribute has dtype {attr_id.dtype}, which does"
                        " not match the half-precision of its dataset's"
                        f" dtype of {item.dtype}."
                        f" Path: {item_name} -> {attr_name}"
                    )

    def visitor_func(path: str) -> None:
        """Visitor function for h5py.visit."""

        # The `complex64` HDF5 object is neither a HDF5 group nor dataset, skip.
        if path.endswith("complex64"):
            return

        obj = h5_file[path]

        # 1. Always check attributes (This is safe for large datasets)
        _check_attributes(path, obj)

        # 2. Dataset-specific validation
        if isinstance(obj, h5py.Dataset):

            # 2a. For all datasets (numeric, string, etc.) check if the dataset
            # was populated with some value. (aka not an empty/null dataset)

            # Check if dataset is a 'null' space (Empty) without reading data.
            # This occurs when a dataset is written with a Python value of
            # `None` (there could be other causes). h5py Datasets with no data
            # have a shape of None or use the Empty class.
            if obj.shape is None:
                msg = f"Dataset has a null (Empty) space. Dataset: {obj.name}"
                log.error(msg)
                return

            # Check if storage was allocated (0 bytes means uninitialized/empty)
            if obj.id.get_storage_size() == 0:
                log.error(
                    "Dataset has no allocated storage (empty)."
                    f" Dataset: {obj.name}"
                )
                return

            # 2b. String Type/Content Check
            _validate_string_logic(
                name=obj.name,
                dtype_=obj.dtype,
                # Only called if dtype is string
                value_provider=lambda: obj[()],
                label="Dataset",
            )

            # 2c. Numeric Type/Content Check
            # Numeric datasets will need to be individually validated
            # via other sections in QA (XML Checker, qa_reports, Metadata LUT
            # checks, etc.)

    # Check root, then visit every other group/dataset in the file.
    _check_attributes("/", h5_file)
    h5_file.visit(visitor_func)


def identification_sanity_checks(
id_group: h5py.Group, product_type: str
) -> None:
Expand Down Expand Up @@ -443,25 +679,13 @@ def _verify_data_is_in_list(
if _dataset_exists(ds_name):
data = _get_string_dataset(ds_name=ds_name)
if data is not None:
# TODO: Use a regex for more flexible pattern matching.
if data in (
"",
"0",
"['0']",
"['']",
"['' '' '' '' '']",
"None",
"(NOT SPECIFIED)",
):
log.error(
f"Dataset value is {data!r}, which is not a valid value."
f" Dataset: {_full_path(ds_name)}"
)
ds_full_path = _full_path(ds_name)
if _log_if_bad_string_value(val=data, path=ds_full_path):
passes = False
else:
log.warning(
f"Dataset value is {data!r}, but it has not be automatically"
f" verified during checks. Dataset: {_full_path(ds_name)}"
f" verified during checks. Dataset: {ds_full_path}"
)
else:
passes = False
Expand Down