Skip to content

Commit bb31de7

Browse files
mer-a-omran and others authored
breaking down PR/660 into smaller pieces -- part2, empty obs (#669)
* fetch empty obs and clean save obs diag * Update src/swell/tasks/get_observations.py Co-authored-by: Michael Anstett <michael.anstett@nasa.gov> * better handling of obs with some empty files * address test failure * write missing obs in jedi.yaml --------- Co-authored-by: Michael Anstett <michael.anstett@nasa.gov>
1 parent 52bc305 commit bb31de7

File tree

2 files changed

+121
-118
lines changed

2 files changed

+121
-118
lines changed

src/swell/tasks/get_observations.py

Lines changed: 116 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
# --------------------------------------------------------------------------------------------------
99

1010
import isodate
11+
import netCDF4 as nc
1112
import numpy as np
1213
import os
1314
import r2d2
14-
import netCDF4 as nc
15+
import shutil
1516
from typing import Union
1617

1718
from datetime import timedelta, datetime as dt
@@ -180,8 +181,22 @@ def execute(self) -> None:
180181
try:
181182
r2d2.fetch(**fetch_criteria)
182183
self.logger.info(f"Successfully fetched {target_file}")
183-
except Exception as e:
184-
self.logger.info(f"Failed to fetch {target_file}: {str(e)}")
184+
except Exception:
185+
self.logger.info(
186+
f"Failed to fetch {target_file}. "
187+
"Fetch empty observation instead."
188+
)
189+
190+
# fetch empty obs
191+
r2d2.fetch(
192+
item='observation',
193+
provider='empty_provider',
194+
observation_type='empty_type',
195+
file_extension='nc4',
196+
window_start='19700101T030000Z',
197+
window_length='PT6H',
198+
target_file=target_file,
199+
)
185200

186201
# Check how many of the combine_input_files exist in the cycle directory.
187202
# If all of them are missing proceed without creating an observation input
@@ -441,85 +456,106 @@ def read_and_combine(self, input_filenames: list, output_filename: str) -> None:
441456
existing_files = [f for f in input_filenames if os.path.exists(f)]
442457
input_filenames = existing_files
443458

444-
# Loop through the input files and get the total dimension size for each dimension
445-
# Location requires special handling to get the cumulative sum of the dimension size
446-
# ---------------------------------------------------------------------------------
447-
out_dim_size = {'Location': 0}
448-
for input_filename in input_filenames:
449-
with nc.Dataset(input_filename, 'r') as ds:
450-
for dim_name, dim in ds.dimensions.items():
451-
if dim_name == 'Location':
452-
out_dim_size[dim_name] += dim.size
459+
# Remove empty files from input_filenames
460+
# -------------------------------------------------------------
461+
valid_files = []
462+
463+
for fname in input_filenames:
464+
try:
465+
with nc.Dataset(fname, 'r') as ds:
466+
if 'Location' in ds.dimensions and ds.dimensions['Location'].size > 0:
467+
valid_files.append(fname)
453468
else:
454-
out_dim_size[dim_name] = dim.size
455-
456-
with nc.Dataset(output_filename, 'w') as out_ds:
457-
# Open the input NetCDF files for reading
458-
# ---------------------------------------
459-
self.logger.info(f"Combining files {input_filenames} ")
460-
461-
# Create an output file template based on the first input file
462-
# ------------------------------------------------------------
463-
with nc.Dataset(input_filenames[0], 'r') as ds:
464-
# Access groups and create dimensions
465-
# -----------------------------------
466-
input_groups = ds.groups.keys()
467-
468-
for dim_name, dim in ds.dimensions.items():
469-
out_ds.createDimension(dim_name, out_dim_size[dim_name])
470-
471-
# Loop through groups and process variables
472-
# -----------------------------------------
473-
for group_name in input_groups:
474-
group = ds[group_name]
475-
476-
# Create the groups in output file
477-
# --------------------------------
478-
out_group = out_ds.createGroup(group_name)
479-
480-
# Access variables within a group
481-
# -------------------------------
482-
variables_in_group = group.variables.keys()
483-
484-
# Loop over variables from input files, combine, and write to the new file
485-
# ------------------------------------------------------------------------
486-
for var_name in variables_in_group:
487-
list_data = []
488-
489-
# Get the dimensions of the variable
490-
# ----------------------------------
491-
var_dims = group[var_name].dimensions
492-
493-
# Loop over all the files and combine the variable data into a list
494-
# Channel dimensions remain the same, so we can break the loop
495-
# ----------------------------------------------------------------
496-
for input_file in input_filenames:
497-
list_data.append(self.get_data(input_file, group_name, var_name))
498-
# Only break if the first dimension is Channel
499-
if var_dims[0] == 'Channel':
500-
break
501-
502-
# Concatenate the masked arrays along the first dimension
503-
# --------------------------------------------------------
504-
variable_data = np.ma.concatenate(list_data, axis=0)
505-
506-
# Fill value needs to be assigned while creating variables
507-
# --------------------------------------------------------
508-
subset_var = out_group.createVariable(
509-
var_name,
510-
variable_data.dtype,
511-
var_dims,
512-
fill_value=group[var_name].getncattr('_FillValue')
513-
)
514-
for attr_name in group[var_name].ncattrs():
515-
if attr_name == '_FillValue':
516-
continue
517-
subset_var.setncattr(
518-
attr_name, group[var_name].getncattr(attr_name)
519-
)
469+
empty_template = fname
470+
except OSError:
471+
continue
520472

521-
# Write subset data to the new file
473+
input_filenames = valid_files
474+
475+
if input_filenames:
476+
# Loop through the input files and get the total dimension size for each dimension
477+
# Location requires special handling to get the cumulative sum of the dimension size
478+
# ---------------------------------------------------------------------------------
479+
out_dim_size = {'Location': 0}
480+
for input_filename in input_filenames:
481+
with nc.Dataset(input_filename, 'r') as ds:
482+
for dim_name, dim in ds.dimensions.items():
483+
if dim_name == 'Location':
484+
out_dim_size[dim_name] += dim.size
485+
else:
486+
out_dim_size[dim_name] = dim.size
487+
488+
with nc.Dataset(output_filename, 'w') as out_ds:
489+
# Open the input NetCDF files for reading
490+
# ---------------------------------------
491+
self.logger.info(f"Combining files {input_filenames} ")
492+
493+
# Create an output file template based on the first input file
494+
# ------------------------------------------------------------
495+
with nc.Dataset(input_filenames[0], 'r') as ds:
496+
# Access groups and create dimensions
497+
# -----------------------------------
498+
input_groups = ds.groups.keys()
499+
500+
for dim_name, dim in ds.dimensions.items():
501+
out_ds.createDimension(dim_name, out_dim_size[dim_name])
502+
503+
# Loop through groups and process variables
504+
# -----------------------------------------
505+
for group_name in input_groups:
506+
group = ds[group_name]
507+
508+
# Create the groups in output file
522509
# --------------------------------
523-
subset_var[:] = variable_data
510+
out_group = out_ds.createGroup(group_name)
511+
512+
# Access variables within a group
513+
# -------------------------------
514+
variables_in_group = group.variables.keys()
515+
516+
# Loop over variables from input files, combine, and write to the new file
517+
# ------------------------------------------------------------------------
518+
for var_name in variables_in_group:
519+
list_data = []
520+
521+
# Get the dimensions of the variable
522+
# ----------------------------------
523+
var_dims = group[var_name].dimensions
524+
525+
# Loop over all the files and combine the variable data into a list
526+
# Channel dimensions remain the same, so we can break the loop
527+
# ----------------------------------------------------------------
528+
for input_file in input_filenames:
529+
list_data.append(self.get_data(input_file, group_name, var_name))
530+
# Only break if the first dimension is Channel
531+
if var_dims[0] == 'Channel':
532+
break
533+
534+
# Concatenate the masked arrays along the first dimension
535+
# --------------------------------------------------------
536+
variable_data = np.ma.concatenate(list_data, axis=0)
537+
538+
# Fill value needs to be assigned while creating variables
539+
# --------------------------------------------------------
540+
subset_var = out_group.createVariable(
541+
var_name,
542+
variable_data.dtype,
543+
var_dims,
544+
fill_value=group[var_name].getncattr('_FillValue')
545+
)
546+
for attr_name in group[var_name].ncattrs():
547+
if attr_name == '_FillValue':
548+
continue
549+
subset_var.setncattr(
550+
attr_name, group[var_name].getncattr(attr_name)
551+
)
552+
553+
# Write subset data to the new file
554+
# --------------------------------
555+
subset_var[:] = variable_data
556+
557+
else:
524558

559+
# If all the files are empty, copy one of them as the output file
560+
shutil.copyfile(empty_template, output_filename)
525561
# ----------------------------------------------------------------------------------------------

src/swell/tasks/save_obs_diags.py

Lines changed: 5 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
# --------------------------------------------------------------------------------------------------
99

10-
import os
1110
import r2d2
1211
from swell.tasks.base.task_base import taskBase
1312
from swell.utilities.r2d2 import create_r2d2_config
@@ -61,51 +60,19 @@ def execute(self) -> None:
6160
self.logger.info(f'Checking input observation file: {input_obs_file}')
6261

6362
use_obs = check_obs(self.jedi_rendering.observing_system_records_path, observation,
64-
observation_dict, self.cycle_time_dto())
63+
observation_dict, self.cycle_time_dto(), input_and_output=True)
6564

66-
self.logger.info(f'Checking observation {observation}: use_obs = {use_obs}')
65+
# use_obs is false when obs input file (or feedback file) doesn't exist or is empty.
66+
# The case when the feedback file is listed in yaml but doesn't exist never happens,
67+
# as JEDI execution fails when input obs file is missing.
6768

6869
if not use_obs:
69-
self.logger.info(f'Input observation file analysis for {observation}:')
70-
self.logger.info(f' Expected file: {input_obs_file}')
71-
# Check if file exists and is readable
72-
# ---------------------------------------
73-
try:
74-
import netCDF4 as nc
75-
dataset = nc.Dataset(input_obs_file, 'r')
76-
dims = {dim_name: dim.size for dim_name, dim in dataset.dimensions.items()}
77-
self.logger.info(f' File exists but dimensions: {dims}')
78-
dataset.close()
79-
except Exception as e:
80-
self.logger.info(f' File exists but error reading: {str(e)}')
81-
82-
self.logger.info(f' Skipping {observation}')
70+
self.logger.info(f'Empty feedback (obs diag) {input_obs_file} file. Skip saving.')
8371
continue
8472

85-
# Store diagnostic/feedback files produced by JEDI executables
86-
# (e.g., variational, hofx, localensembleda).
87-
# --------------------------------------------------------------
88-
8973
name = observation_dict['obs space']['name']
9074
obs_path_file = observation_dict['obs space']['obsdataout']['engine']['obsfile']
9175

92-
self.logger.info(f'Looking for diagnostic output file: {obs_path_file}')
93-
94-
# Check for need to add 0000 to the file
95-
if not os.path.exists(obs_path_file):
96-
obs_path_file_name, obs_path_file_ext = os.path.splitext(obs_path_file)
97-
obs_path_file_0000 = obs_path_file_name + '_0000' + obs_path_file_ext
98-
self.logger.info(f'Primary file not found, checking: {obs_path_file_0000}')
99-
100-
if not os.path.exists(obs_path_file_0000):
101-
self.logger.info(f'Diagnostic output files not found for {observation}:')
102-
self.logger.info(f' Expected: {obs_path_file}')
103-
self.logger.info(f' Expected: {obs_path_file_0000}')
104-
self.logger.info(f' RunJediVariationalExecutable did not run successfully')
105-
self.logger.info(f' Skipping storage of {observation} diagnostic file')
106-
continue
107-
obs_path_file = obs_path_file_0000
108-
10976
self.logger.info(f'Found diagnostic output file: {obs_path_file}')
11077

11178
# Store to R2D2

0 commit comments

Comments
 (0)