diff --git a/CHANGELOG.md b/CHANGELOG.md index c506080..a8f8f32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added optional SLURM "constraint". - Added functionality to run on tile space of stretched cube-sphere grids. -- Added python package for post-processing ObsFcstAna output. +- Added (and later revised) python package for post-processing ObsFcstAna output. ### Changed diff --git a/GEOSldas_App/util/postproc/ObsFcstAna_stats/Get_ObsFcstAna_stats.py b/GEOSldas_App/util/postproc/ObsFcstAna_stats/Get_ObsFcstAna_sums.py similarity index 55% rename from GEOSldas_App/util/postproc/ObsFcstAna_stats/Get_ObsFcstAna_stats.py rename to GEOSldas_App/util/postproc/ObsFcstAna_stats/Get_ObsFcstAna_sums.py index cf7e61a..b2543ff 100644 --- a/GEOSldas_App/util/postproc/ObsFcstAna_stats/Get_ObsFcstAna_stats.py +++ b/GEOSldas_App/util/postproc/ObsFcstAna_stats/Get_ObsFcstAna_sums.py @@ -2,24 +2,21 @@ """ Sample script for post-processing GEOSldas ObsFcstAna output into data assimilation diagnostics. -First, compute and store monthly sums and sums of squares and cross-products of raw ObsFcstAna output. +Computes and stores monthly sums and sums of squares and cross-products of raw ObsFcstAna output. Data assimilation diagnostics ("stats") such as the mean and std-dev of the observation-minus-forecast -residuals can then be diagnosed quickly from these intermediate "sums" files. -Sample script optionally computes and plots: -- Maps of long-term data assimilation diagnostics (see also Plot_stats_maps.py). -- Monthly time series of spatially averaged data assimilation diagnostics (see also Plot_stats_timeseries.py). +residuals can then be diagnosed quickly from these intermediate "sums" files. See Plot_stats_*.py. Usage: 1. Edit "user_config.py" with experiments information. 2. Run this script as follows (on Discover): $ module load python/GEOSpyD - $ ./Get_ObsFcstAna_stats.py + $ ./Get_ObsFcstAna_sums.py # Background execution: - $ nohup ./Get_ObsFcstAna_stats.py > out.log & + $ nohup ./Get_ObsFcstAna_sums.py > out.log & Authors: Q. Liu, R. Reichle, A. Fox -Last Modified: May 2025 +Last Modified: June 2025 """ import sys; sys.path.append('../../shared/python/') @@ -64,33 +61,6 @@ def main(): # Compute and save monthly sums postproc.save_monthly_sums() - # -------------------------------------------------------------------------------------- - # Optionally compute long-term temporal/spatial statistics and create plots. - # The plotting scripts can also run standalone using the individual Plot_stats_*.py scripts, - # as long as the monthly sum files are available. 
- - plot_maps = False - plot_timeseries = False - - if plot_maps: # Compute long-term temporal stats and plot maps - - stats_file = out_path + 'temporal_stats_'+exp_list[0]['exptag']+ '_'+start_time.strftime('%Y%m%d')+'_'+ \ - (end_time+timedelta(days=-1)).strftime('%Y%m%d')+'.nc4' - - # temporal_stats is a dictionary that contains all mean/variances fields for computing long-term O-F/O-A stats - # each field has the dimension [N_tile, N_species] - - temporal_stats = postproc.calc_temporal_stats_from_sums(write_to_nc=True, fout_stats=stats_file) - - # Example to plot some O-F maps - from Plot_stats_maps import plot_OmF_maps - plot_OmF_maps(postproc, temporal_stats, fig_path=out_path ) - - if plot_timeseries: # Compute spatially averaged stats and plot monthly time series of stats - - from Plot_stats_timeseries import Plot_monthly_OmF_bars - Plot_monthly_OmF_bars(postproc, fig_path=out_path) - if __name__ == '__main__': main() diff --git a/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_maps.py b/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_maps.py index ed44a8d..18a4c09 100644 --- a/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_maps.py +++ b/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_maps.py @@ -2,98 +2,168 @@ """ Sample script for plotting maps of long-term data assimilation diagnostics. -Computes Nobs-weighted avg of each metric across all species. -Requires saved files with monthly sums (see Get_ObsFcstAna_stat.py). +Plots Nobs-weighted avg of each metric across all species. +Requires saved files with monthly sums (see Get_ObsFcstAna_sums.py). Stats of *normalized* OmFs are approximated! + +Usage: + 1. Edit "user_config.py" with experiments information. + 2. Run this script as follows (on Discover): + $ module load python/GEOSpyD + $ ./Plot_stats_maps.py """ import sys; sys.path.append('../../shared/python/') import warnings; warnings.filterwarnings("ignore") +import os import numpy as np import matplotlib.pyplot as plt from datetime import timedelta -from remap_1d_to_2d import remap_1d_to_2d +from netCDF4 import Dataset + +from tile_to_latlongrid import tile_to_latlongrid from plot import plotMap from EASEv2 import EASEv2_ind2latlon -def plot_OmF_maps(postproc_obj, stats, fig_path='./'): +from postproc_ObsFcstAna import postproc_ObsFcstAna +from user_config import get_config + +def Main_OmF_maps(): - start_time = postproc_obj.start_time - end_time = postproc_obj.end_time - domain = postproc_obj.domain - tc = postproc_obj.tilecoord + # Plot maps of long-term temporal stats + + config = get_config() # edit user-defined inputs in user_config.py + + exp_list = config['exp_list'] + start_time = config['start_time'] + end_time = config['end_time'] + sum_path = config['sum_path'] + out_path = config['out_path'] + + # min total number of individual O-Fs required for stats at a location (across all months in period) - # Sample of final compuation of selected diagnostic metrics - Nmin = 20 + + # ------------------------------------------------------------------------------------ + # Read or compute stats. Each field has dimension [N_tile, N_species] + # ------------------------------------------------------------------------------------ - # Then compute metrics of O-F, O-A, etc. 
based on above computed
-    N_data   = stats['N_data']
-    O_mean   = stats['obs_mean']
-    # mean(x-y) = E[x] - E[y]
-    OmF_mean = stats['obs_mean'] - stats['fcst_mean']
-    OmA_mean = stats['obs_mean'] - stats['ana_mean']
-    # var(x-y) = var(x) + var(y) - 2cov(x,y)
-    # cov(x,y) = E[xy] - E[x]E[y]
-    OmF_stdv = np.sqrt(stats['obs_variance'] + stats['fcst_variance'] - \
-                       2 * (stats['oxf_mean'] - stats['obs_mean']*stats['fcst_mean']))
-
-    OmA_stdv = np.sqrt(stats['obs_variance'] + stats['ana_variance'] - \
-                       2 * (stats['oxa_mean'] - stats['obs_mean']*stats['ana_mean']))
-
-    # *****************************************************************************************
-    # The time series mean and std-dev of the *normalized* OmF computed here are APPROXIMATED!
-    # *****************************************************************************************
-    # Here, we first compute the stats of the OmF time series and then normalize using
-    # the time-avg "obsvar" and "fcstvar" values.
-    # Since "fcstvar" changes with time, the OmF values should be normalized at each time
-    # step (as in the older matlab scripts), and then the time series stats can be computed.
-    # To compute the exact stats with this python package, the sum and sum-of-squares of
-    # the normalized OmF values would need to be added into the sums files.
-    #
-    OmF_norm_mean = OmF_mean / np.sqrt(stats['obsvar_mean'] + stats['fcstvar_mean'])       # APPROXIMATED stat!
-    OmF_norm_stdv = np.sqrt(OmF_stdv**2 / (stats['obsvar_mean'] + stats['fcstvar_mean']) ) # APPROXIMATED stat!
-
-    # Mask out data points with insufficent observations using the Nmin threshold
-    # Do NOT apply to N_data
+    # File name for long-term temporal stats
+    stats_file = out_path + 'temporal_stats_'+exp_list[0]['exptag']+'_'+ start_time.strftime('%Y%m%d')+'_'+ \
+                 (end_time+timedelta(days=-1)).strftime('%Y%m%d')+'.nc4'
+
+    # Read or compute long-term temporal stats. Each field has the dimension [N_tile, N_species].
+    if os.path.isfile(stats_file):
+
+        print('reading stats nc4 file '+ stats_file)
+        stats = {}
+        with Dataset(stats_file,'r') as nc:
+            for key, value in nc.variables.items():
+                stats[key] = value[:].filled(np.nan)
+
+    else:
+        # Initialize the postprocessing object and compute the stats from the monthly "sums" files
+        postproc_obj = postproc_ObsFcstAna(exp_list, start_time, end_time, sum_path=sum_path)
+        stats = postproc_obj.calc_temporal_stats_from_sums(write_to_nc=True, fout_stats=stats_file)
+
+    N_data        = stats['N_data']
+    OmF_mean      = stats['OmF_mean']
+    OmF_stdv      = stats['OmF_stdv']
+    OmA_mean      = stats['OmA_mean']
+    OmA_stdv      = stats['OmA_stdv']
+    OmF_norm_mean = stats['OmF_norm_mean']
+    OmF_norm_stdv = stats['OmF_norm_stdv']
+
+    # Screen stats with Nmin (except for N_data)
     OmF_mean[     N_data < Nmin] = np.nan
     OmF_stdv[     N_data < Nmin] = np.nan
+    OmA_mean[     N_data < Nmin] = np.nan
+    OmA_stdv[     N_data < Nmin] = np.nan
     OmF_norm_mean[N_data < Nmin] = np.nan
     OmF_norm_stdv[N_data < Nmin] = np.nan
-    OmA_mean[     N_data < Nmin] = np.nan
-    OmA_stdv[     N_data < Nmin] = np.nan
-    N_data[       N_data < Nmin] = 0
-
+
+    # Select/combine fields for plotting. The following provides an example that
+    # computes average stats across all species.
+    #
     # Compute Nobs-weighted avg of each metric across all species.
     # Typically used for SMAP Tb_h/h from asc and desc overpasses,
     # or ASCAT soil moisture from Metop-A/B/C.
     # DOES NOT MAKE SENSE IF, SAY, SPECIES HAVE DIFFERENT UNITS!
- Nobs_data = np.nansum( N_data, axis=1) - OmF_mean = np.nansum(OmF_mean *N_data, axis=1)/Nobs_data - OmF_stdv = np.nansum(OmF_stdv *N_data, axis=1)/Nobs_data - OmF_norm_mean = np.nansum(OmF_norm_mean*N_data, axis=1)/Nobs_data - OmF_norm_stdv = np.nansum(OmF_norm_stdv*N_data, axis=1)/Nobs_data - OmA_mean = np.nansum(OmA_mean *N_data, axis=1)/Nobs_data - OmA_stdv = np.nansum(OmA_stdv *N_data, axis=1)/Nobs_data + + Ndata_sum = np.nansum( N_data, axis=1) + + OmF_mean = np.nansum(OmF_mean *N_data, axis=1)/Ndata_sum + OmF_stdv = np.nansum(OmF_stdv *N_data, axis=1)/Ndata_sum + OmF_norm_mean = np.nansum(OmF_norm_mean*N_data, axis=1)/Ndata_sum + OmF_norm_stdv = np.nansum(OmF_norm_stdv*N_data, axis=1)/Ndata_sum + OmA_mean = np.nansum(OmA_mean *N_data, axis=1)/Ndata_sum + OmA_stdv = np.nansum(OmA_stdv *N_data, axis=1)/Ndata_sum + + N_data = Ndata_sum + + # The obs are assigned to tiles only for book-keeping purposes, and the resolution of + # the obs is often coarser than that of the model tile space. Therefore, typically many + # tiles are never assigned any obs and end up with N_data=0. + # Here, we remove these unwanted zeros from N_data to keep them out of the computation of + # spatial averages (global avg; mapping from tile space to lat/lon plotting grid) - # Plotting - exptag = postproc_obj.exptag + N_data[N_data == 0] = np.nan + + # -------------------------------------------------------------------------------- + # Plot stats on regular lat/lon grid, with grid spacing guided by the footprint + # scale (field-of-view; FOV) of the observations. + # --------------------------------------------------------------------------------- + + # Get geographic info about model grid, model tile space, and obs FOV + # (assumes that all species have same FOV and FOV_units) + + tc = exp_list[0]['tilecoord'] + tg = exp_list[0]['tilegrid_global'] + obs_fov = exp_list[0]['obsparam'][0]['FOV'] + obs_fov_units = exp_list[0]['obsparam'][0]['FOV_units'] + + # Determine spacing of lat/lon grid for plotting (degree) based on obs FOV. + # Can also set manually if desired. + + my_res = obs_fov*2. # Note: FOV is a radius, hence the factor of 2. + + # If applicable, convert to degree, assuming 100 km is about 1 deg. + if 'km' in obs_fov_units: + my_res = my_res/100. + + # If obs_fov=0, reset to match model resolution. + if obs_fov == 0: + my_res = np.sqrt( tg['dlat']*tg['dlon'] ) + + # pick a resolution from a sample vector of resolutions + + sample_res = [0.05, 0.1, 0.25, 0.5, 1.0, 2.0] + + my_res = min(sample_res, key=lambda x: abs(x - my_res)) + + print(f'resolution of lat/lon grid for plotting: {my_res} degree') + + # ---------------------------------------------------------------------------------- + # Plot + # ---------------------------------------------------------------------------------- + + exptag = exp_list[0]['exptag'] fig, axes = plt.subplots(2,2, figsize=(18,10)) plt.rcParams.update({'font.size':14}) - + for i in np.arange(2): for j in np.arange(2): units = '[k]' if i == 0 and j == 0: - tile_data = Nobs_data + tile_data = N_data # crange is [cmin, cmax] crange =[0, np.ceil((end_time-start_time).days/150)*300] colormap = plt.get_cmap('jet',20) - title_txt = exptag + ' Tb Nobs '+ start_time.strftime('%Y%m')+'_'+end_time.strftime('%Y%m') + title_txt = exptag + ' Tb Nobs (excl. 
0s) ' + start_time.strftime('%Y%m')+'_'+end_time.strftime('%Y%m')
                 units = '[-]'
             if i == 0 and j ==1:
                 tile_data = OmF_mean
@@ -112,58 +182,35 @@ def plot_OmF_maps(postproc_obj, stats, fig_path='./'):
                 title_txt = exptag + ' Tb normalized O-F stdv (approx!) '+ start_time.strftime('%Y%m%d')+'_'+end_time.strftime('%Y%m%d')
 
             colormap.set_bad(color='0.9') # light grey, 0-black, 1-white
+
+            # map tile_data onto a 2-d regular lat/lon grid for plotting
+            grid_data, lat_2d, lon_2d = tile_to_latlongrid(tile_data, tc, my_res)
 
-            # Regrid 1d tile_data to 2d grid_data for map plots
-            if '_M09_' in domain:   # special case
-                grid_data_M09 = np.zeros((1624, 3856)) + np.nan
-                grid_data_M09[tc['j_indg'],tc['i_indg']] = tile_data
-
-                # Reshape the data into 4x4 blocks
-                reshaped = grid_data_M09.reshape(1624//4, 4, 3856//4, 4)
-
-                # Combine each 4x4 M09 block into a M36 grid
-                #if i==0 and j==0:
-                #    grid_data = np.sum(reshaped,axis=(1, 3))
-                #else:
-                #    grid_data = np.nanmean(reshaped,axis=(1, 3))
-
-                grid_data = grid_data_M09[1::4, 2::4]
-
-                # NOT area weighted
-                wmean    = np.nanmean(grid_data)
-                wabsmean = np.nanmean(np.abs(grid_data))
-                if 'normalized' in title_txt:
-                    wabsmean = np.nanmean(np.abs(grid_data-1.))
-
-                lat_M36, lon_M36 = EASEv2_ind2latlon(np.arange(406), np.arange(964),'M36')
-                lon_2d,lat_2d = np.meshgrid(lon_M36,lat_M36)
-            else:
-                grid_data, uy, ux = remap_1d_to_2d(tile_data, lat_1d = tc['com_lat'], lon_1d = tc['com_lon'])
-                lon_2d,lat_2d = np.meshgrid(ux, uy)
-
-                # Aear weighted mean and mean(abs)
-                wmean    = np.nansum(       tile_data  * tc['area'])/np.nansum(~np.isnan(tile_data)*tc['area'])
-                wabsmean = np.nansum(np.abs(tile_data) * tc['area'])/np.nansum(~np.isnan(tile_data)*tc['area'])
-                if 'normalized' in title_txt:
-                    wabsmean = np.nansum(np.abs(tile_data-1.) * tc['area'])/np.nansum(~np.isnan(tile_data)*tc['area'])
-
+
+            # compute spatial avg of metric (directly from tile_data; no weighting)
+            mean     = np.nanmean(       tile_data )
+            absmean  = np.nanmean(np.abs(tile_data))
+
             if 'normalized' in title_txt:
-                title_txt = title_txt + '\n' + "avg=%.3f, avg(abs(nstdv-1))=%.3f" % (wmean, wabsmean)+' '+units
+                absmean = np.nanmean(np.abs(tile_data-1.)
) + + if 'normalized' in title_txt and 'stdv' in title_txt: + title_txt = title_txt + '\n' + "avg=%.3f, avg(abs(nstdv-1))=%.3f" % (mean, absmean)+' '+units elif 'mean' in title_txt: - title_txt = title_txt + '\n' + "avg=%.3f, avg(abs)=%.3f" % (wmean, wabsmean)+' '+units + title_txt = title_txt + '\n' + "avg=%.3f, avg(abs)=%.3f" % (mean, absmean)+' '+units else: - title_txt = title_txt + '\n' + "avg=%.2f" % (wmean) +' '+units - + title_txt = title_txt + '\n' + "avg=%.2f" % (mean )+' '+units + if 'normalized' in title_txt: grid_data = np.log10(grid_data) crange = [-0.6, 0.45] - + mm, cs = plotMap(grid_data, ax =axes[i,j], lat=lat_2d, lon=lon_2d, cRange=crange, \ - title=title_txt, cmap=colormap, bounding=[-60, 80, -180,180]) + title=title_txt, cmap=colormap, bounding=[-60, 80, -180,180]) plt.tight_layout() # Save figure to file - fig.savefig(fig_path+'Map_OmF_'+ exptag +'_'+start_time.strftime('%Y%m')+'_'+\ + fig.savefig(out_path+'Map_OmF_'+ exptag +'_'+start_time.strftime('%Y%m')+'_'+\ (end_time+timedelta(days=-1)).strftime('%Y%m')+'.png') #plt.show() plt.close(fig) @@ -171,31 +218,7 @@ def plot_OmF_maps(postproc_obj, stats, fig_path='./'): # ----------------------------------------------------------------------------------------------- if __name__ == '__main__': - - from postproc_ObsFcstAna import postproc_ObsFcstAna - from user_config import get_config - - config = get_config() # edit user-defined inputs in user_config.py - - exp_list = config['exp_list'] - start_time = config['start_time'] - end_time = config['end_time'] - sum_path = config['sum_path'] - out_path = config['out_path'] - - # ------------------------------------------------------------------------------------ - # Initialize the postprocessing object - postproc = postproc_ObsFcstAna(exp_list, start_time, end_time, sum_path=sum_path) - - # Compute long-term temporal stats and plot maps - stats_file = out_path + 'temporal_stats_'+exp_list[0]['exptag']+'_'+ start_time.strftime('%Y%m%d')+'_'+ \ - (end_time+timedelta(days=-1)).strftime('%Y%m%d')+'.nc4' - - # temporal_stats is a dictionary that contains all mean/variances fields for computing long-term O-F/O-A stats - # each field has the dimension [N_tile, N_species] - - temporal_stats = postproc.calc_temporal_stats_from_sums(write_to_nc=True, fout_stats=stats_file) - - plot_OmF_maps(postproc, temporal_stats, fig_path=out_path ) + + Main_OmF_maps() # ====================== EOF ========================================================= diff --git a/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_timeseries.py b/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_timeseries.py index db72e9f..2c499cf 100644 --- a/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_timeseries.py +++ b/GEOSldas_App/util/postproc/ObsFcstAna_stats/Plot_stats_timeseries.py @@ -12,109 +12,88 @@ import numpy as np import matplotlib.pyplot as plt +import pickle from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta + +from postproc_ObsFcstAna import postproc_ObsFcstAna +from user_config import get_config + +def Main_OmF_timeseries(): + + config = get_config() # edit user-defined inputs in user_config.py + + exp_list = config['exp_list'] + start_time = config['start_time'] + end_time = config['end_time'] + sum_path = config['sum_path'] + out_path = config['out_path'] + + # ------------------------------------------------------------------------------------ + # + # Initialize the postprocessing object + postproc_obj = postproc_ObsFcstAna(exp_list, start_time, 
end_time, sum_path=sum_path) -def Plot_monthly_OmF_bars(postproc_obj, fig_path='./'): - import pickle - exptag = postproc_obj.exptag - start_month = postproc_obj.start_time - end_month = postproc_obj.end_time - - stats_file = fig_path + 'spatial_stats_'+exptag+'_'+start_month.strftime('%Y%m')+ \ - '_'+(end_month+timedelta(days=-1)).strftime('%Y%m')+'.pkl' + exptag = postproc_obj.exptag + start_time = postproc_obj.start_time + end_time = postproc_obj.end_time + + stats_file = out_path + 'spatial_stats_'+exptag+'_'+start_time.strftime('%Y%m')+ \ + '_'+(end_time+timedelta(days=-1)).strftime('%Y%m')+'.pkl' + + # Read or compute monthly stats across space if os.path.isfile(stats_file): with open(stats_file,'rb') as file: - stats_dict = pickle.load(file) - date_vec = stats_dict['date_vec'] + print(f'reading from {stats_file}') + stats = pickle.load(file) else: - - # Initialize monthly metrics as list - OmF_mean =[] - OmF_stdv =[] - OmA_mean =[] - OmA_stdv =[] - Ndata =[] - date_vec =[] - - # Time loop - current_month = start_month - while current_month < end_month: - print('compute monthly spatial mean stats for '+ current_month.strftime('%Y%m')) - - OmFm,OmFs,OmAm,OmAs,Nobsm = postproc_obj.calc_spatial_stats_from_sums(current_month) - - # Compute Nobs-weighted avg of each metric across all species. - # Typically used for SMAP Tb_h/h from asc and desc overpasses, - # or ASCAT soil moisture from Metop-A/B/C. - # DOES NOT MAKE SENSE IF, SAY, SPECIES HAVE DIFFERENT UNITS! - Nobsm = np.nansum( Nobsm) - OmFm = np.nansum(OmFm*Nobsm)/Nobsm - OmFs = np.nansum(OmFs*Nobsm)/Nobsm - OmAm = np.nansum(OmAm*Nobsm)/Nobsm - OmAs = np.nansum(OmAs*Nobsm)/Nobsm - - Ndata.append(Nobsm) - - OmF_mean.append(OmFm) - OmF_stdv.append(OmFs) - OmA_mean.append(OmAm) - OmA_stdv.append(OmAs) - - date_vec.append(current_month.strftime('%Y%m')) - current_month = current_month + relativedelta(months=1) - - # Store stats in a dictionary for easier saving and referencing - stats_dict = {"OmF_mean":OmF_mean, "OmF_stdv":OmF_stdv, - "OmA_mean":OmA_mean, "OmA_stdv":OmA_stdv, - "Ndata": Ndata, "date_vec":date_vec} + + stats = postproc_obj.calc_spatial_stats_from_sums() if stats_file is not None: with open(stats_file,'wb') as file: - pickle.dump(stats_dict,file) + print(f'saving stats to {stats_file}') + pickle.dump(stats,file) + + date_vec = stats['date_vec'] + + # Compute Nobs-weighted avg of each metric across all species. + # Typically used for SMAP Tb_h/h from asc and desc overpasses, + # or ASCAT soil moisture from Metop-A/B/C. + # DOES NOT MAKE SENSE IF, SAY, SPECIES HAVE DIFFERENT UNITS! + stats_plot = {} + N_data = np.nansum(stats['N_data' ], axis=1) + stats_plot['N_data' ] = N_data + stats_plot['OmF_mean'] = np.nansum(stats['OmF_mean']*stats['N_data'], axis=1)/N_data + stats_plot['OmF_stdv'] = np.nansum(stats['OmF_stdv']*stats['N_data'], axis=1)/N_data + stats_plot['OmA_mean'] = np.nansum(stats['OmA_mean']*stats['N_data'], axis=1)/N_data + stats_plot['OmA_stdv'] = np.nansum(stats['OmA_stdv']*stats['N_data'], axis=1)/N_data plot_var = 'OmF_mean' fig, ax = plt.subplots(figsize=(10,4)) - bars = ax.bar(date_vec, stats_dict[plot_var]) + bars = ax.bar(date_vec, stats_plot[plot_var]) plt.grid(True, linestyle='--', alpha=0.5) plt.xticks(ticks=date_vec[::2], labels=date_vec[::2]) - plt.title(exptag+ 'monthly '+plot_var) + plt.title(exptag+' monthly '+plot_var) plt.xlim(-1, len(date_vec)+1) plt.ylim(-.1, 2.) 
plt.tight_layout() #plt.show() - plt.savefig(fig_path+'Bars_'+plot_var+exptag+date_vec[0]+'_'+\ + plt.savefig(out_path+'Bars_'+plot_var+exptag+date_vec[0]+'_'+\ date_vec[-1]+'.png') plt.close() if __name__ == "__main__": - - from postproc_ObsFcstAna import postproc_ObsFcstAna - from user_config import get_config - - config = get_config() # edit user-defined inputs in user_config.py - exp_list = config['exp_list'] - start_time = config['start_time'] - end_time = config['end_time'] - sum_path = config['sum_path'] - out_path = config['out_path'] - - # ------------------------------------------------------------------------------------ - # - # Initialize the postprocessing object - postproc = postproc_ObsFcstAna(exp_list, start_time, end_time, sum_path=sum_path) - - # Compute spatial stats and plot monthly O-F stats bars - Plot_monthly_OmF_bars(postproc, fig_path=out_path) + Main_OmF_timeseries() # ====================== EOF ========================================================= diff --git a/GEOSldas_App/util/postproc/ObsFcstAna_stats/helper/write_nc4.py b/GEOSldas_App/util/postproc/ObsFcstAna_stats/helper/write_nc4.py index 8090cc8..57ebf69 100644 --- a/GEOSldas_App/util/postproc/ObsFcstAna_stats/helper/write_nc4.py +++ b/GEOSldas_App/util/postproc/ObsFcstAna_stats/helper/write_nc4.py @@ -9,7 +9,7 @@ def write_sums_nc4(file_path, N_data, data_sum, data2_sum, oxf_sum, oxa_sum, fxa_sum, obs_param): - nc = Dataset(file_path,'w',formate='NETCDF4') + nc = Dataset(file_path,'w',format='NETCDF4') tile = nc.createDimension('tile', N_data.shape[0]) species = nc.createDimension('species', N_data.shape[1]) @@ -46,7 +46,7 @@ def write_sums_nc4(file_path, N_data, data_sum, data2_sum, oxf_sum, oxa_sum, fxa def write_stats_nc4(file_path, stats): - nc = Dataset(file_path,'w',formate='NETCDF4') + nc = Dataset(file_path,'w',format='NETCDF4') tile = nc.createDimension('tile', stats['N_data'].shape[0]) species = nc.createDimension('species', stats['N_data'].shape[1]) diff --git a/GEOSldas_App/util/postproc/ObsFcstAna_stats/postproc_ObsFcstAna.py b/GEOSldas_App/util/postproc/ObsFcstAna_stats/postproc_ObsFcstAna.py index 41b86df..00058e0 100644 --- a/GEOSldas_App/util/postproc/ObsFcstAna_stats/postproc_ObsFcstAna.py +++ b/GEOSldas_App/util/postproc/ObsFcstAna_stats/postproc_ObsFcstAna.py @@ -21,19 +21,21 @@ class postproc_ObsFcstAna: def __init__(self, exp_list, start_time, end_time, sum_path='./'): - self.exp_list = exp_list - self.expdir_list = [item['expdir'] for item in exp_list] - self.expid_list = [item['expid'] for item in exp_list] - self.exptag = exp_list[0]['exptag'] - self.domain = exp_list[0]['domain'] - self.start_time = start_time - self.end_time = end_time - self.da_t0 = exp_list[0]['da_t0'] - self.da_dt = exp_list[0]['da_dt'] - self.var_list = ['obs_obs','obs_obsvar','obs_fcst','obs_fcstvar','obs_ana','obs_anavar'] - self.tilecoord = exp_list[0]['tilecoord'] - self.obsparam_list = [item['obsparam'] for item in exp_list] - self.sum_path = sum_path + self.exp_list = exp_list + self.expdir_list = [item['expdir'] for item in exp_list] + self.expid_list = [item['expid'] for item in exp_list] + self.exptag = exp_list[0]['exptag'] + self.domain = exp_list[0]['domain'] + self.start_time = start_time + self.end_time = end_time + self.da_t0 = exp_list[0]['da_t0'] + self.da_dt = exp_list[0]['da_dt'] + self.var_list = ['obs_obs','obs_obsvar','obs_fcst','obs_fcstvar','obs_ana','obs_anavar'] + self.tilecoord = exp_list[0]['tilecoord'] + self.tilegrid_global = exp_list[0]['tilegrid_global'] + 
self.tilegrid_domain = exp_list[0]['tilegrid_domain'] + self.obsparam_list = [item['obsparam'] for item in exp_list] + self.sum_path = sum_path # Determine experiment that supplies obs data self.obs_from = -1 @@ -82,7 +84,6 @@ def compute_monthly_sums(self, date_time): expdir_list = self.expdir_list expid_list = self.expid_list - domain = self.domain tc = self.tilecoord obsparam_list = self.obsparam_list var_list = self.var_list @@ -92,8 +93,8 @@ def compute_monthly_sums(self, date_time): n_tile = tc['N_tile'] n_spec = len(obsparam_list[0]) - date_time = date_time.replace(hour=self.da_t0) - end_time = date_time + relativedelta(months=1) + date_time = date_time.replace(hour=int(self.da_t0), minute=int(np.mod(self.da_t0,1)*60)) + stop_time = date_time + relativedelta(months=1) data_sum = {} data2_sum = {} @@ -107,12 +108,12 @@ def compute_monthly_sums(self, date_time): data_sum[ var] = np.zeros((n_tile, n_spec)) data2_sum[var] = np.zeros((n_tile, n_spec)) - while date_time < end_time: + while date_time < stop_time: # read the list of experiments at each time step (OFA="ObsFcstAna") OFA_list = [] for i in range(len(expdir_list)): - fname = expdir_list[i]+expid_list[i]+'/output/'+domain+'/ana/ens_avg/Y'+ \ + fname = expdir_list[i]+expid_list[i]+'/output/'+self.domain+'/ana/ens_avg/Y'+ \ date_time.strftime('%Y') + '/M' + \ date_time.strftime('%m') + '/' + \ expid_list[i]+'.ens_avg.ldas_ObsFcstAna.' + \ @@ -151,31 +152,32 @@ def compute_monthly_sums(self, date_time): data_all.append(data_tile) - # cross-mask over all experiments - is_cross_valid = ~np.isnan(data_all[0]['obs_obs']) - for data in data_all[1:]: - mask = ~np.isnan(data['obs_obs']) - is_cross_valid = np.logical_and(is_cross_valid,mask) + if len(data_all) > 0: + # cross-mask over all experiments + is_cross_valid = ~np.isnan(data_all[0]['obs_obs']) + for data in data_all[1:]: + mask = ~np.isnan(data['obs_obs']) + is_cross_valid = np.logical_and(is_cross_valid,mask) - # reconstruct the output variable dictionary based on input options; - # obs_obs and obs_obsvar are from exp_list[obs_from], the rest are from exp_list[0] - data_tile = {} - for var in var_list: - if 'obs_obs' in var: - data_tile[var] = data_all[obs_from][var] - else: - data_tile[var] = data_all[0][var] + # reconstruct the output variable dictionary based on input options; + # obs_obs and obs_obsvar are from exp_list[obs_from], the rest are from exp_list[0] + data_tile = {} + for var in var_list: + if 'obs_obs' in var: + data_tile[var] = data_all[obs_from][var] + else: + data_tile[var] = data_all[0][var] - is_valid = is_cross_valid - - N_data[ is_valid] += 1 - oxf_sum[is_valid] += data_tile['obs_obs' ][is_valid] * data_tile['obs_fcst'][is_valid] - oxa_sum[is_valid] += data_tile['obs_obs' ][is_valid] * data_tile['obs_ana' ][is_valid] - fxa_sum[is_valid] += data_tile['obs_fcst'][is_valid] * data_tile['obs_ana' ][is_valid] + is_valid = is_cross_valid + + N_data[ is_valid] += 1 + oxf_sum[is_valid] += data_tile['obs_obs' ][is_valid] * data_tile['obs_fcst'][is_valid] + oxa_sum[is_valid] += data_tile['obs_obs' ][is_valid] * data_tile['obs_ana' ][is_valid] + fxa_sum[is_valid] += data_tile['obs_fcst'][is_valid] * data_tile['obs_ana' ][is_valid] - for var in var_list: - data_sum[ var][is_valid] += data_tile[var][is_valid] - data2_sum[var][is_valid] += data_tile[var][is_valid] **2 + for var in var_list: + data_sum[ var][is_valid] += data_tile[var][is_valid] + data2_sum[var][is_valid] += data_tile[var][is_valid] **2 date_time = date_time + timedelta(seconds=da_dt) @@ -190,12 +192,11 
@@ def compute_monthly_sums(self, date_time): def save_monthly_sums(self): expdir_list = self.expdir_list expid_list = self.expid_list - domain = self.domain var_list = self.var_list tc = self.tilecoord obsparam_list = self.obsparam_list - date_time = self.start_time + date_time = self.start_time while date_time < self.end_time: # loop through months @@ -229,109 +230,156 @@ def save_monthly_sums(self): def calc_temporal_stats_from_sums(self, write_to_nc=True, fout_stats='./stats.nc4'): - if os.path.isfile(fout_stats): - - print('reading stats nc4 file '+ fout_stats) - stats = {} - with Dataset(fout_stats,'r') as nc: - for key, value in nc.variables.items(): - stats[key] = value[:].filled(np.nan) + # Variable list for computing sum and sum-of-squares + var_list = self.var_list - else: - # Variable list for computing sum and sum-of-squares - var_list = self.var_list + # Read tilecoord and obsparam for tile and obs species information + n_tile = self.tilecoord['N_tile'] + n_spec = len(self.obsparam_list[0]) - # Read tilecoord and obsparam for tile and obs species information - n_tile = self.tilecoord['N_tile'] - n_spec = len(self.obsparam_list[0]) + # --------------------------------------------------------------- + # + # compute accumulated sums for period (start_time, end_time) + + # Initialize statistical metrics + data_sum = {} + data2_sum = {} + N_data = np.zeros((n_tile, n_spec)) + oxf_sum = np.zeros((n_tile, n_spec)) + oxa_sum = np.zeros((n_tile, n_spec)) + fxa_sum = np.zeros((n_tile, n_spec)) - # Initialize statistical metrics - data_sum = {} - data2_sum = {} - N_data = np.zeros((n_tile, n_spec)) - oxf_sum = np.zeros((n_tile, n_spec)) - oxa_sum = np.zeros((n_tile, n_spec)) - fxa_sum = np.zeros((n_tile, n_spec)) + for var in var_list: + data_sum[ var] = np.zeros((n_tile, n_spec)) + data2_sum[var] = np.zeros((n_tile, n_spec)) - for var in var_list: - data_sum[ var] = np.zeros((n_tile, n_spec)) - data2_sum[var] = np.zeros((n_tile, n_spec)) + # Time loop: processing data at monthly time step + + date_time = self.start_time - # Time loop: processing data at monthly time step + while date_time < self.end_time: # loop through months - date_time = self.start_time - - while date_time < self.end_time: # loop through months + fpath = self.sum_path + '/Y'+ date_time.strftime('%Y') + '/M' + date_time.strftime('%m') + '/' - fpath = self.sum_path + '/Y'+ date_time.strftime('%Y') + '/M' + date_time.strftime('%m') + '/' - - fout = self.exptag + '.ens_avg.ldas_ObsFcstAna_sums.' + date_time.strftime('%Y%m') +'.nc4' - - fout = fpath + fout - - # Read monthly data - if os.path.isfile(fout): - print('read sums from monthly file: '+fout) - mdata_sum = {} - mdata2_sum = {} - with Dataset(fout,'r') as nc: - mN_data = nc.variables['N_data' ][:] - moxf_sum = nc.variables['obsxfcst_sum'][:] - moxa_sum = nc.variables['obsxana_sum' ][:] - mfxa_sum = nc.variables['fcstxana_sum'][:] - for var in var_list: - mdata_sum[ var] = nc.variables[var+'_sum' ][:] - mdata2_sum[var] = nc.variables[var+'2_sum'][:] - - # Aggregate monthly data - N_data += mN_data - oxf_sum += moxf_sum - oxa_sum += moxa_sum - fxa_sum += mfxa_sum - + fout = self.exptag + '.ens_avg.ldas_ObsFcstAna_sums.' 
+ date_time.strftime('%Y%m') +'.nc4'
+
+            fout = fpath + fout
+
+            # Read and accumulate monthly sums data
+            if os.path.isfile(fout):
+                print('read sums from monthly file: '+fout)
+                mdata_sum  = {}
+                mdata2_sum = {}
+                with Dataset(fout,'r') as nc:
+                    mN_data  = nc.variables['N_data'      ][:]
+                    moxf_sum = nc.variables['obsxfcst_sum'][:]
+                    moxa_sum = nc.variables['obsxana_sum' ][:]
+                    mfxa_sum = nc.variables['fcstxana_sum'][:]
+                    for var in var_list:
+                        mdata_sum[ var] = nc.variables[var+'_sum' ][:]
+                        mdata2_sum[var] = nc.variables[var+'2_sum' ][:]
+
+                # Aggregate monthly data
+                N_data  += mN_data
+                oxf_sum += moxf_sum
+                oxa_sum += moxa_sum
+                fxa_sum += mfxa_sum
+
+                for var in var_list:
+                    data_sum[ var] += mdata_sum[ var]
+                    data2_sum[var] += mdata2_sum[var]
+            else:
+                raise FileNotFoundError(f"File {fout} does not exist, run save_monthly_sums() first")
+
+            date_time = date_time + relativedelta(months=1)
 
-            # Compute stats (DA diagnostics) from accumulated sums data.
-            data_mean  = {}
-            data2_mean = {}
-            data_var   = {}
+        # --------------------------------------------------------------------
+        #
+        # Compute stats (DA diagnostics) from accumulated sums data.
+
+        data_mean  = {}
+        data2_mean = {}
+        data_var   = {}
 
-            # calculation
-            for var in var_list:
-                data_sum[var][ N_data == 0] = np.nan
-                data2_sum[var][N_data == 0] = np.nan
-
-                data_mean[ var] = data_sum[ var] / N_data
-                data2_mean[var] = data2_sum[var] / N_data
-                # var(x) = E[x2] - (E[x])^2
-                data_var[var] = data2_mean[var] - data_mean[var]**2
-
-            oxf_sum[N_data == 0] = np.nan
-            oxa_sum[N_data == 0] = np.nan
-            fxa_sum[N_data == 0] = np.nan
-            # E[xy]
-            oxf_mean = oxf_sum / N_data
-            oxa_mean = oxa_sum / N_data
-            fxa_mean = fxa_sum / N_data
-
-            stats = {}
-            for var in var_list:
-                stats[var[4:]+'_mean']     = data_mean[var]
-                stats[var[4:]+'_variance'] = data_var[ var]
-            stats['oxf_mean'] = oxf_mean
-            stats['oxa_mean'] = oxa_mean
-            stats['fxa_mean'] = fxa_mean
-            stats['N_data']   = N_data
-
-            if write_to_nc:
-                print('writing stats nc4 file: '+fout_stats)
-                write_stats_nc4(fout_stats, stats)
+        # calculation
+        for var in var_list:
+            data_sum[var][ N_data == 0] = np.nan
+            data2_sum[var][N_data == 0] = np.nan
+
+            data_mean[ var] = data_sum[ var] / N_data
+            data2_mean[var] = data2_sum[var] / N_data
+            # var(x) = E[x2] - (E[x])^2
+            data_var[var] = data2_mean[var] - data_mean[var]**2
+
+        oxf_sum[N_data == 0] = np.nan
+        oxa_sum[N_data == 0] = np.nan
+        fxa_sum[N_data == 0] = np.nan
+        # E[xy]
+        oxf_mean = oxf_sum / N_data
+        oxa_mean = oxa_sum / N_data
+        fxa_mean = fxa_sum / N_data
+
+        # Compute metrics of O-F, O-A, etc. from the means and variances computed above
+
+        O_mean = data_mean['obs_obs']
+        F_mean = data_mean['obs_fcst']
+        A_mean = data_mean['obs_ana']
+
+        O_stdv = np.sqrt(data_var['obs_obs'])
+        F_stdv = np.sqrt(data_var['obs_fcst'])
+        A_stdv = np.sqrt(data_var['obs_ana'])
+
+        # mean(x-y) = E[x] - E[y]
+        OmF_mean = O_mean - F_mean
+        OmA_mean = O_mean - A_mean
+
+        # var(x-y) = var(x) + var(y) - 2cov(x,y)
+        # cov(x,y) = E[xy] - E[x]E[y]
+        OmF_stdv = np.sqrt(O_stdv**2 + F_stdv**2 - 2 * (oxf_mean - O_mean*F_mean))
+        OmA_stdv = np.sqrt(O_stdv**2 + A_stdv**2 - 2 * (oxa_mean - O_mean*A_mean))
+
+        # *****************************************************************************************
+        # The time series mean and std-dev of the *normalized* OmF computed here are APPROXIMATED!
+ # ***************************************************************************************** + # Here, we first compute the stats of the OmF time series and then normalize using + # the time-avg "obsvar" and "fcstvar" values. + # Since "fcstvar" changes with time, the OmF values should be normalized at each time + # step (as in the older matlab scripts), and then the time series stats can be computed. + # To compute the exact stats with this python package, the sum and sum-of-squares of + # the normalized OmF values would need to be added into the sums files. + # + OmF_norm_mean = OmF_mean / np.sqrt(data_mean['obs_obsvar'] + data_mean['obs_fcstvar']) # APPROXIMATED stat! + OmF_norm_stdv = np.sqrt(OmF_stdv**2 / (data_mean['obs_obsvar'] + data_mean['obs_fcstvar'])) # APPROXIMATED stat! + + # Mask out data points without any obs (NOTE: apply Nmin threshold when plotting or computing map avg) + # Do NOT apply to N_data + O_mean[ N_data < 1] = np.nan + O_stdv[ N_data < 1] = np.nan + F_mean[ N_data < 1] = np.nan + F_stdv[ N_data < 1] = np.nan + A_mean[ N_data < 1] = np.nan + A_stdv[ N_data < 1] = np.nan + + OmF_mean[ N_data < 1] = np.nan + OmF_stdv[ N_data < 1] = np.nan + OmF_norm_mean[N_data < 1] = np.nan + OmF_norm_stdv[N_data < 1] = np.nan + OmA_mean[ N_data < 1] = np.nan + OmA_stdv[ N_data < 1] = np.nan + + stats = { + 'O_mean' : O_mean, 'O_stdv' : O_stdv, + 'F_mean' : F_mean, 'F_stdv' : F_stdv, + 'A_mean' : A_mean, 'A_stdv' : A_stdv, + 'OmF_mean' : OmF_mean, 'OmF_stdv' : OmF_stdv, + 'OmA_mean' : OmA_mean, 'OmA_stdv' : OmA_stdv, + 'OmF_norm_mean': OmF_norm_mean, 'OmF_norm_stdv': OmF_norm_stdv, + 'N_data' : N_data, + } + + if write_to_nc: + print('writing stats nc4 file: '+fout_stats) + write_stats_nc4(fout_stats, stats) return stats @@ -345,79 +393,126 @@ def calc_temporal_stats_from_sums(self, write_to_nc=True, fout_stats='./stats.nc # the straight spatial avg across a map of a given DA diagnostic. The latter approach gives # the same weight to each location, regardless of how many obs are available at the location. - def calc_spatial_stats_from_sums(self, date_time): + def calc_spatial_stats_from_sums(self): + start_time = self.start_time + end_time = self.end_time + var_list = ['obs_obs', 'obs_fcst','obs_ana'] + + O_mean = [] + O_stdv = [] + F_mean = [] + F_stdv = [] + A_mean = [] + A_stdv = [] + OmF_mean = [] + OmF_stdv = [] + OmA_mean = [] + OmA_stdv = [] + N_data = [] + date_vec = [] - mo_path = self.sum_path + '/Y'+ date_time.strftime('%Y') + '/M' + date_time.strftime('%m') + '/' - fnc4_sums = mo_path + self.exptag + '.ens_avg.ldas_ObsFcstAna_sums.' 
+ date_time.strftime('%Y%m') +'.nc4' + # Time loop + current_time = start_time + while current_time < end_time: - mdata_sum = {} - mdata2_sum = {} - - try: - with Dataset(fnc4_sums,'r') as nc: - mN_data = nc.variables['N_data' ][:] - moxf_sum = nc.variables['obsxfcst_sum'][:] - moxa_sum = nc.variables['obsxana_sum' ][:] - moxf_sum[mN_data == 0] = np.nan - moxa_sum[mN_data == 0] = np.nan - for var in var_list: - mdata_sum[ var] = nc.variables[var+'_sum' ][:] - mdata2_sum[var] = nc.variables[var+'2_sum'][:] - mdata_sum[ var][mN_data == 0] = np.nan - mdata2_sum[var][mN_data == 0] = np.nan - except FileNotFoundError: - print(f"Error: File '{filename}' not found.") - sys.exit(1) - except Exception as e: - print(f"An unexpected error occurred: {e}") - sys.exit(1) - - # Make sure only aggregate tiles with valid values for all variables - for var in var_list: - mN_data = mN_data.astype(float) - mN_data[np.isnan(mdata_sum[var])] = np.nan - mN_data[mN_data == 0] = np.nan - - # cross mask before aggregating tile values - for var in var_list: - mdata_sum[ var][np.isnan(mN_data)] = np.nan - mdata2_sum[var][np.isnan(mN_data)] = np.nan - moxf_sum[np.isnan(mN_data)] = np.nan - moxa_sum[np.isnan(mN_data)] = np.nan - - # Aggregate data of all tiles - N_data = np.nansum(mN_data, axis=0) - OxF_mean = np.nansum(moxf_sum,axis=0)/N_data - OxA_mean = np.nansum(moxa_sum,axis=0)/N_data - data_mean = {} - data2_mean = {} - data_var = {} - for var in var_list: - data_mean[ var] = np.nansum(mdata_sum[var ],axis=0)/N_data - data2_mean[var] = np.nansum(mdata2_sum[var],axis=0)/N_data - # var(x) = E[x2] - (E[x])^2 - data_var[var] = data2_mean[var] - data_mean[var]**2 - - # Compute metrics of O-F, O-A, etc. based on above stats - O_mean = data_mean['obs_obs'] - F_mean = data_mean['obs_fcst'] - A_mean = data_mean['obs_ana'] - - O_var = data_var[ 'obs_obs'] - F_var = data_var[ 'obs_fcst'] - A_var = data_var[ 'obs_ana'] - - # mean(x-y) = E[x] - E[y] - OmF_mean = O_mean - F_mean - OmA_mean = O_mean - A_mean + mo_path = self.sum_path + '/Y'+ current_time.strftime('%Y') + '/M' + current_time.strftime('%m') + '/' + fnc4_sums = mo_path + self.exptag + '.ens_avg.ldas_ObsFcstAna_sums.' + current_time.strftime('%Y%m') +'.nc4' + + mdata_sum = {} + mdata2_sum = {} + + try: + with Dataset(fnc4_sums,'r') as nc: + mN_data = nc.variables['N_data' ][:] + moxf_sum = nc.variables['obsxfcst_sum'][:] + moxa_sum = nc.variables['obsxana_sum' ][:] + moxf_sum[mN_data == 0] = np.nan + moxa_sum[mN_data == 0] = np.nan + for var in var_list: + mdata_sum[ var] = nc.variables[var+'_sum' ][:] + mdata2_sum[var] = nc.variables[var+'2_sum'][:] + mdata_sum[ var][mN_data == 0] = np.nan + mdata2_sum[var][mN_data == 0] = np.nan + except FileNotFoundError: + print(f"Error: File '{fnc4_sums}' not found. 
Run Get_ObsFcstAna_sums.py first.") + sys.exit(1) + except Exception as e: + print(f"An unexpected error occurred: {e}") + sys.exit(1) + + # Make sure only aggregate tiles with valid values for all variables + for var in var_list: + mN_data = mN_data.astype(float) + mN_data[np.isnan(mdata_sum[var])] = np.nan + mN_data[mN_data == 0] = np.nan - # var(x-y) = var(x) + var(y) - 2cov(x,y) - # cov(x,y) = E[xy] - E[x]E[y] - OmF_stdv = np.sqrt(O_var + F_var - 2 * (OxF_mean - O_mean*F_mean)) - OmA_stdv = np.sqrt(O_var + A_var - 2 * (OxA_mean - O_mean*A_mean)) + # cross mask before aggregating tile values + for var in var_list: + mdata_sum[ var][np.isnan(mN_data)] = np.nan + mdata2_sum[var][np.isnan(mN_data)] = np.nan + moxf_sum[ np.isnan(mN_data)] = np.nan + moxa_sum[ np.isnan(mN_data)] = np.nan + + # Aggregate data of all tiles + N_data_mo = np.nansum(mN_data, axis=0) + with np.errstate(divide='ignore', invalid='ignore'): + OxF_mean = np.nansum(moxf_sum, axis=0) / np.where(N_data_mo > 0, N_data_mo, np.nan) + OxA_mean = np.nansum(moxa_sum, axis=0) / np.where(N_data_mo > 0, N_data_mo, np.nan) + data_mean = {} + data2_mean = {} + data_var = {} + for var in var_list: + with np.errstate(divide='ignore', invalid='ignore'): + data_mean[ var] = np.nansum(mdata_sum[var ], axis=0) / np.where(N_data_mo > 0, N_data_mo, np.nan) + data2_mean[var] = np.nansum(mdata2_sum[var], axis=0) / np.where(N_data_mo > 0, N_data_mo, np.nan) + # var(x) = E[x2] - (E[x])^2 + data_var[var] = data2_mean[var] - data_mean[var]**2 + + # Compute monthly metrics of O-F, O-A, etc. based on above stats + O_mean_mo = data_mean['obs_obs'] + F_mean_mo = data_mean['obs_fcst'] + A_mean_mo = data_mean['obs_ana'] + + O_var = data_var['obs_obs'] + F_var = data_var['obs_fcst'] + A_var = data_var['obs_ana'] + + # mean(x-y) = E[x] - E[y] + OmF_mean_mo = O_mean_mo - F_mean_mo + OmA_mean_mo = O_mean_mo - A_mean_mo + + # var(x-y) = var(x) + var(y) - 2cov(x,y) + # cov(x,y) = E[xy] - E[x]E[y] + OmF_stdv_mo = np.sqrt(O_var + F_var - 2 * (OxF_mean - O_mean_mo*F_mean_mo)) + OmA_stdv_mo = np.sqrt(O_var + A_var - 2 * (OxA_mean - O_mean_mo*A_mean_mo)) + + # Extend timeseries + N_data.append( N_data_mo ) + O_mean.append( O_mean_mo ) + O_stdv.append( np.sqrt(O_var)) + F_mean.append( F_mean_mo ) + F_stdv.append( np.sqrt(F_var)) + A_mean.append( A_mean_mo ) + A_stdv.append( np.sqrt(A_var)) + OmF_mean.append(OmF_mean_mo ) + OmF_stdv.append(OmF_stdv_mo ) + OmA_mean.append(OmA_mean_mo ) + OmA_stdv.append(OmA_stdv_mo ) + + date_vec.append(current_time.strftime('%Y%m')) + current_time = current_time + relativedelta(months=1) + + stats = { + 'O_mean' : np.array(O_mean), 'O_stdv' : np.array(O_stdv), + 'F_mean' : np.array(F_mean), 'F_stdv' : np.array(F_stdv), + 'A_mean' : np.array(A_mean), 'A_stdv' : np.array(A_stdv), + 'OmF_mean': np.array(OmF_mean), 'OmF_stdv': np.array(OmF_stdv), + 'OmA_mean': np.array(OmA_mean), 'OmA_stdv': np.array(OmA_stdv), + 'N_data' : np.array(N_data), 'date_vec': date_vec + } - return OmF_mean, OmF_stdv, OmA_mean, OmA_stdv, N_data + return stats # ============== EOF ==================================================================== diff --git a/GEOSldas_App/util/postproc/ObsFcstAna_stats/user_config.py b/GEOSldas_App/util/postproc/ObsFcstAna_stats/user_config.py index 27e484f..169bbcf 100644 --- a/GEOSldas_App/util/postproc/ObsFcstAna_stats/user_config.py +++ b/GEOSldas_App/util/postproc/ObsFcstAna_stats/user_config.py @@ -6,7 +6,7 @@ import numpy as np from datetime import datetime, timedelta -from read_GEOSldas import read_tilecoord, 
read_obs_param
+from read_GEOSldas import read_tilecoord, read_tilegrids, read_obs_param
 from postproc_ObsFcstAna import postproc_ObsFcstAna
 
 def get_config():
@@ -20,7 +20,7 @@ def get_config():
     start_year  = 2015
     start_month = 4
     last_year   = 2016
-    last_month  = 4
+    last_month  = 3
 
     # Sums or stats will be processed for exp_main:
 
@@ -39,7 +39,8 @@ def get_config():
     # Optional experiment(s) can be added for cross-masking or extracting obs from a different experiment.
     #
-    # All optional experiments and the main experiment must have identical tilecoords.
+    # All optional experiments and the main experiment must have identical tile space (BCs resolution) and domain.
+    #
     # If the default "species" number/order do not match, set "species_list" accordingly to force a match.
     # Output will be cross-masked between all specified experiments.
     #
@@ -52,7 +53,6 @@ def get_config():
     exp_sup1 = {
         'expdir'       : '/discover/nobackup/projects/gmao/merra/iau/merra_land/SMAP_runs/SMAP_Nature_v11/',
         'expid'        : 'DAv8_M36',
-        'domain'       : 'SMAP_EASEv2_M36_GLOBAL',
         'use_obs'      : True,              # if True, use obs data from this exp
         'species_list' : [1,2,3,4],         # indices of species to be processed,
                                             #   must identify same species as selected in main exp
@@ -66,7 +66,7 @@ def get_config():
 
     # Top level directory for all output from this package:
 
-    out_path = '/discover/nobackup/[USERNAME]/SMAP_test/'
+    out_path = '/discover/nobackup/[USERNAME]/'
 
     # Directory for monthly sum files:
     # - Can use the experiment directory or a different path.
@@ -90,11 +90,12 @@ def get_config():
     end_time   = datetime( end_year,   end_month,   1)
 
     # Get tilecoord and obsparam information for each experiment
+
+    domain = exp_list[0]['domain']
 
     for exp in exp_list:
         expdir = exp['expdir']
         expid  = exp['expid']
-        domain = exp['domain']
 
         YYYY = exp['obsparam_time'][0:4]
         MM   = exp['obsparam_time'][4:6]
@@ -114,8 +115,11 @@ def get_config():
         ftc = expdir+expid+'/output/'+ domain+'/rc_out/'+ expid+'.ldas_tilecoord.bin'
         tc  = read_tilecoord(ftc)
 
+        ftg = expdir+expid+'/output/'+ domain+'/rc_out/'+ expid+'.ldas_tilegrids.bin'
+        tg_global, tg_domain = read_tilegrids(ftg)
+
         # add tilecoord and obsparam into to exp
-        exp.update({'tilecoord':tc, 'obsparam':obsparam})
+        exp.update({'tilecoord':tc, 'obsparam':obsparam, 'tilegrid_global':tg_global, 'tilegrid_domain':tg_domain})
 
     # verify that obs species match across experiments
     for exp_idx, exp in enumerate(exp_list) :
diff --git a/GEOSldas_App/util/shared/python/read_GEOSldas.py b/GEOSldas_App/util/shared/python/read_GEOSldas.py
index 96ba417..ccc1aa1 100644
--- a/GEOSldas_App/util/shared/python/read_GEOSldas.py
+++ b/GEOSldas_App/util/shared/python/read_GEOSldas.py
@@ -1,7 +1,6 @@
 # collection of readers for GEOSldas output files
 
-import numpy as np
 import struct
 import os
 import numpy as np
@@ -99,6 +98,63 @@ def read_tilecoord(fname):
     print("done reading file")
     return tile_coord
 
+# ----------------------------------------------------------------------------
+#
+# reader for GEOSldas tilegrids file (binary)
+
+def read_tilegrids(fname):
+    """
+    Read tile grid information from file and return global and domain grid structures.
+
+    Parameters:
+    ----------
+    fname : str
+        Path to the input file (binary)
+
+    Returns:
+    -------
+    tile_grid_g : dict
+        Global tile grid information
+    tile_grid_d : dict
+        Domain tile grid information
+    """
+
+    # Set endian format
+    machfmt = '<'  # '>' for big-endian, '<' for little-endian
+
+    # Read binary file
+    print(f'reading from {fname}')
+
+    with open(fname, 'rb') as ifp:
+        # Read "global" and "domain" records
+        for grid in ['global','domain']:
+
+            tile_grid = {}
+
+            fortran_tag = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['gridtype'] = ifp.read(40).decode('ascii').strip('\x00')
+            tile_grid['ind_base'] = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['i_dir']    = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['j_dir']    = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['N_lon']    = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['N_lat']    = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['i_offg']   = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['j_offg']   = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+            tile_grid['ll_lon']   = struct.unpack(f'{machfmt}f', ifp.read(4))[0]
+            tile_grid['ll_lat']   = struct.unpack(f'{machfmt}f', ifp.read(4))[0]
+            tile_grid['ur_lon']   = struct.unpack(f'{machfmt}f', ifp.read(4))[0]
+            tile_grid['ur_lat']   = struct.unpack(f'{machfmt}f', ifp.read(4))[0]
+            tile_grid['dlon']     = struct.unpack(f'{machfmt}f', ifp.read(4))[0]
+            tile_grid['dlat']     = struct.unpack(f'{machfmt}f', ifp.read(4))[0]
+            fortran_tag = struct.unpack(f'{machfmt}i', ifp.read(4))[0]
+
+            if 'global' in grid:
+                tile_grid_g = tile_grid
+            else:
+                tile_grid_d = tile_grid
+
+    return tile_grid_g, tile_grid_d
+
 # ----------------------------------------------------------------------------
 #
 # reader for GEOSldas ObsFcstAna file (binary)
diff --git a/GEOSldas_App/util/shared/python/remap_1d_to_2d.py b/GEOSldas_App/util/shared/python/remap_1d_to_2d.py
deleted file mode 100644
index d0a1d9f..0000000
--- a/GEOSldas_App/util/shared/python/remap_1d_to_2d.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-import numpy as np
-
-# convert data from 1-d array ("compressed") format to 2-d array (grid) format based on lat/lon coordinates
-#
-# 1-d input data can have one additional dimension (e.g., time)
-#
-# 2-d output data is lat-by-lon[-by-time]
-
-def remap_1d_to_2d( data_1d, *, lat_1d, lon_1d):
-
-    # Extract unique coordinates
-    unique_lats, indY = np.unique(lat_1d, return_inverse=True)
-    unique_lons, indX = np.unique(lon_1d, return_inverse=True)
-
-    ny = len(unique_lats)
-    nx = len(unique_lons)
-
-    if data_1d.ndim == 2:   #[n_1d, ntime]
-        ntime = data_1d.shape[1]
-        data_2d = np.full([ny, nx, ntime], np.nan)
-        data_2d[indY, indX, :] = data_1d
-    elif data_1d.ndim == 1:  #[n_1d]
-        data_2d = np.full([ny, nx], np.nan)
-        data_2d[indY, indX   ] = data_1d
-
-    return data_2d, unique_lats, unique_lons
-
-
diff --git a/GEOSldas_App/util/shared/python/tile2grid.py b/GEOSldas_App/util/shared/python/tile2grid.py
new file mode 100644
index 0000000..096d45b
--- /dev/null
+++ b/GEOSldas_App/util/shared/python/tile2grid.py
@@ -0,0 +1,66 @@
+
+
+import sys
+import numpy as np
+
+def tile2grid(tile_data, tile_coord, tile_grid):
+
+    """
+    Map (1d) tile-space data onto the (2d) grid that is associated with the tile space.
+
+    Parameters:
+    ----------
+    tile_data  : Array in tile space, shape (N_tile, N_fields)
+    tile_coord : Dictionary containing tile coordinate information
+    tile_grid  : Dictionary containing tile grid information
+
+    Returns:
+    -------
+    grid_data : Array in grid space, shape (N_lat, N_lon, N_fields)
+    """
+
+    # Verify input data size
+    if tile_data.shape[0] != tile_coord['N_tile']:
+        print('Error: size of tile2grid input data does not match N_tile')
+        sys.exit()
+
+    # if tile_data is 1-D [N_tile], expand into 2-D [N_tile,1]
+    if tile_data.ndim == 1:
+        tile_data = np.expand_dims(tile_data, axis=1)
+
+    N_fields = tile_data.shape[-1]
+    # Initialize grid data array
+    grid_data = np.zeros((tile_grid['N_lat'], tile_grid['N_lon'], N_fields))
+
+    for k in range(N_fields):
+        # Initialize weight grid for current field
+        wgrid = np.zeros((tile_grid['N_lat'], tile_grid['N_lon']))
+
+        # Loop through tile space
+        for n in range(tile_coord['N_tile']):
+            # Calculate grid indices (adjust for Python's 0-based indexing)
+            i = tile_coord['i_indg'][n] - (tile_grid['i_offg'] - (1-tile_grid['ind_base'])) - 1
+            j = tile_coord['j_indg'][n] - (tile_grid['j_offg'] - (1-tile_grid['ind_base'])) - 1
+
+            # Get weight
+            w = tile_coord['frac_cell'][n]
+
+            # Check if current tile data is valid (not no-data)
+            if ~np.isnan(tile_data[n, k]):
+                # Accumulate weighted data
+                grid_data[j, i, k] += w * tile_data[n,k]
+                wgrid[    j, i]    += w
+
+        # Normalize and set no-data values
+        for i in range(tile_grid['N_lon']):
+            for j in range(tile_grid['N_lat']):
+                if wgrid[j, i] > 0.0:
+                    # Normalize by total weight
+                    grid_data[j, i, k] = grid_data[j, i, k] / wgrid[j, i]
+                else:
+                    # Set no-data value
+                    grid_data[j, i, k] = np.nan
+
+    return grid_data.squeeze()
+
+# ============ EOF ===============================================
diff --git a/GEOSldas_App/util/shared/python/tile_to_latlongrid.py b/GEOSldas_App/util/shared/python/tile_to_latlongrid.py
new file mode 100644
index 0000000..da870be
--- /dev/null
+++ b/GEOSldas_App/util/shared/python/tile_to_latlongrid.py
@@ -0,0 +1,65 @@
+import sys
+import numpy as np
+
+#
+
+def tile_to_latlongrid( tile_data, tile_coord, resolution, nodata=1.e15, nodata_tol_frac=1.e-4):
+
+    """
+    Map (1d) tile-space data onto a (2d) regular lat/lon grid of arbitrary "resolution".
+
+    Use "tile2grid.py" to map tile data onto the grid that is associated with the tile space.
+
+    Parameters:
+    ----------
+    tile_data       : Array in tile space, shape (N_tile, N_fields)
+    tile_coord      : Dictionary containing tile coordinate information
+    resolution      : Target grid resolution
+    nodata          : Value for missing data
+    nodata_tol_frac : Tolerance fraction for comparing values to nodata
+
+    Returns:
+    -------
+    grid_data : Array in regular grid space, shape (N_lat, N_lon, N_fields)
+    lat_2d    : Lat array in regular grid space, shape (N_lat, N_lon)
+    lon_2d    : Lon array in regular grid space, shape (N_lat, N_lon)
+
+    """
+
+    tile_data[np.abs(tile_data - nodata) < nodata*nodata_tol_frac] = np.nan
+
+    # Verify input data size
+    if tile_data.shape[0] != tile_coord['N_tile']:
+        print('Error: size of tile_to_latlongrid input data does not match N_tile')
+        sys.exit()
+
+    # if tile_data is 1-D [N_tile], expand into 2-D [N_tile,1]
+    if tile_data.ndim == 1:
+        tile_data = np.expand_dims(tile_data, axis=1)
+
+    nf = tile_data.shape[-1]
+
+    lat_grid = np.arange( -90+resolution/2,  90+resolution/2, resolution)
+    lon_grid = np.arange(-180+resolution/2, 180+resolution/2, resolution)
+
+    lon_2d, lat_2d = np.meshgrid(lon_grid, lat_grid)
+
+    grid_data = np.zeros((len(lat_grid), len(lon_grid), nf))
+    N_tilecnt = np.zeros((len(lat_grid), len(lon_grid), nf))
+
+
+    for k in range(tile_coord['N_tile']):
+        for v in range(nf):
+            if ~np.isnan(tile_data[k,v]):
+                j = np.searchsorted(lat_grid + resolution/2., tile_coord['com_lat'][k])
+                i = np.searchsorted(lon_grid + resolution/2., tile_coord['com_lon'][k])
+                # grid value takes the mean of all tiles within
+                # new_mean = old_mean + (new_value - old_mean)/count
+                N_tilecnt[j,i,v] += 1
+                grid_data[j,i,v]  = grid_data[j,i,v] + (tile_data[k,v]-grid_data[j,i,v])/N_tilecnt[j,i,v]
+
+    grid_data[N_tilecnt == 0] = np.nan
+
+    return grid_data.squeeze(), lat_2d, lon_2d
+
+# =========================== EOF ============================================================
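
Usage sketch (illustrative, not part of the patch): a minimal example of how the pieces added
here fit together, assuming the monthly "sums" files already exist (run Get_ObsFcstAna_sums.py
first) and that the snippet is run from GEOSldas_App/util/postproc/ObsFcstAna_stats/ so that
user_config.py and postproc_ObsFcstAna.py are importable. The 0.25-degree grid spacing is an
arbitrary illustrative choice.

    import sys; sys.path.append('../../shared/python/')
    import numpy as np

    from tile_to_latlongrid  import tile_to_latlongrid
    from postproc_ObsFcstAna import postproc_ObsFcstAna
    from user_config         import get_config

    config = get_config()   # edit user-defined inputs in user_config.py

    # accumulate the monthly sums into long-term temporal stats;
    # each field in 'stats' has dimension [N_tile, N_species]
    pp    = postproc_ObsFcstAna(config['exp_list'], config['start_time'], config['end_time'],
                                sum_path=config['sum_path'])
    stats = pp.calc_temporal_stats_from_sums(write_to_nc=False)

    # map the O-F mean of the first species onto a 0.25-degree lat/lon grid for plotting
    tc = config['exp_list'][0]['tilecoord']
    grid_data, lat_2d, lon_2d = tile_to_latlongrid(stats['OmF_mean'][:, 0:1], tc, 0.25)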