Merge pull request #133 from c-hydro/dev

ltrotter · web-flow · commit bf9fdc549877 · 2025-03-26T10:45:47.000+01:00
Dev
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "door"
-version = "2.3.3"
+version = "2.3.4"
 description = "A package for operational retrieval of raster data from different sources"
 authors = [
     { name = "Andrea Libertino", email = "andrea.libertino@cimafoundation.org" },
diff --git a/src/door/base_downloaders.py b/src/door/base_downloaders.py
@@ -256,7 +256,14 @@ def set_options(self, options: dict) -> None:
         if 'variables' in options:
             variables = options['variables']
             self.set_variables(variables)
-    
+
+    def set_product(self, product: str) -> None:
+        """
+        Select the product to download and copy its settings onto the instance.
+
+        The name is matched case-insensitively: it is lower-cased before being
+        looked up in `self.available_products`. Every key of the matching entry
+        is then set as an attribute of the same name on `self`.
+
+        Raises:
+            ValueError: if the lower-cased name is not a key of `self.available_products`.
+        """
+        product_key = product.lower()
+        if product_key not in self.available_products:
+            # list(...) so the message shows the plain names rather than "dict_keys([...])"
+            raise ValueError(f'Product {product} not available. Choose one of {list(self.available_products)}')
+
+        # only commit the product once it is known to be valid, so that a failed
+        # call does not leave the instance pointing at a product that does not exist
+        self.product = product_key
+        for key, value in self.available_products[product_key].items():
+            setattr(self, key, value)
+
     def set_variables(self, variables: list) -> None:
         available_variables = self.available_variables
         if hasattr(self, 'product') and self.product in available_variables:
@@ -273,7 +280,14 @@ def get_last_ts(self, **kwargs) -> tuple[ts.TimeStep]:
         
         last_ts_output = None
 
-        variables = list(self.variables.keys()) if hasattr(self, 'variables') else ['__var__']
+        if hasattr(self, 'variables'):
+            if isinstance(self.variables, list):
+                variables = self.variables
+            elif isinstance(self.variables, dict):
+                variables = list(self.variables.keys())
+        else:
+            variables = ['__var__']
+
         tiles = self.destination.tile_names
 
         for i, variable in enumerate(variables):
diff --git a/src/door/data_sources/__init__.py b/src/door/data_sources/__init__.py
@@ -9,4 +9,5 @@
 from .persiann import *
 from .jaxa import *
 from .noaa import *
-from .eobs import *
+from .eobs import *
+from .jra import *
diff --git a/src/door/data_sources/cds/era5_downloader.py b/src/door/data_sources/cds/era5_downloader.py
@@ -69,11 +69,12 @@ def __init__(self, product = 'reanalysis-era5-single-levels') -> None:
             self.log.error(msg)
             raise ValueError(msg)
 
-    def set_variables(self, variables: list[str]) -> None:
+    def set_variables(self, variables: str|list[str]) -> None:
         """
         Set the variables to download.
         """
-
+        if isinstance(variables, str):
+            variables = [variables]
         super().set_variables(variables)
 
         agg_options = self.agg_method
diff --git a/src/door/data_sources/earthdata/viirsmodis_downloader.py b/src/door/data_sources/earthdata/viirsmodis_downloader.py
@@ -180,9 +180,17 @@ def set_attributes(self, dataset: xr.DataArray, varopts: dict) -> xr.DataArray:
         """
         Set the attributes of the dataset.
         """
-        dataset.attrs['valid_range'] = varopts.get('valid_range', None)
-        dataset.attrs['_FillValue'] = varopts.get('fill_value', None)
-        dataset.attrs['scale_factor'] = varopts.get('scale_factor', None)
+
+        valid_range = varopts['valid_range']
+        fill_value = varopts['fill_value']
+        scale_factor = varopts.get('scale_factor', None)
+
+        if valid_range is not None:
+            dataset = dataset.where((dataset >= valid_range[0]) & (dataset <= valid_range[1]), fill_value)
+
+        dataset.attrs['valid_range']  = valid_range
+        dataset.attrs['_FillValue']   = fill_value
+        dataset.attrs['scale_factor'] = scale_factor
 
         return dataset
 
@@ -287,9 +295,9 @@ class MODISDownloader(VIIRSMODISDownloader):
 
     available_variables = {
         'et': {
-            'ET'    : {'id': 0, 'valid_range': (-32767, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
-            'PET'   : {'id': 2, 'valid_range': (-32767, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
-            'ET_QC' : {'id': 4, 'valid_range': (0,254), 'fill_value' : 255, 'scale_factor': 1   },
+            'ET'    : {'id': 0, 'valid_range': (0, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
+            'PET'   : {'id': 2, 'valid_range': (0, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
+            'ET_QC' : {'id': 4, 'valid_range': (0,254),    'fill_value' : 255,   'scale_factor': 1   },
         }
     }
 
diff --git a/src/door/data_sources/eobs/eobs_downloader.py b/src/door/data_sources/eobs/eobs_downloader.py
@@ -89,42 +89,28 @@ def get_last_published_ts(self, **kwargs) -> ts.TimeRange:
         last_date = self.get_last_published_date(**kwargs)
 
         # get the timestep of the last date
-        ts_per_year = self.ts_per_year if hasattr(self, 'ts_per_year') else 365
-        last_date_timestep = FixedNTimeStep(last_date, ts_per_year)
+        last_date_timestep = ts.Day.from_date(last_date)
 
-        # if the last date is the last day of its timestep, return the last timestep
-        if last_date == last_date_timestep.end:
-            return last_date_timestep
-        # else, return the timestep before the one of the last date
-        else:
-            return last_date_timestep - 1
+        return last_date_timestep
 
     def get_last_published_date(self, **kwargs) -> dt.datetime:
 
         """
         Get the last published date for the dataset.
         """
 
-        self.metadata = "https://psl.noaa.gov/thredds/iso/Datasets/cpc_global_precip/precip.{year}.nc?catalog=http://psl.noaa.gov/thredds/catalog/Datasets/cpc_global_precip/catalog.html&dataset=Datasets/cpc_global_precip/precip.{year}.nc"
-
-        import xml.etree.ElementTree as ET
-
-        year = dt.datetime.now().year
-        with requests.get(self.metadata.format(year = year)) as response:
-            root = ET.fromstring(response.content)
-
-        # Parse the XML file
-        tree = ET.parse('your_xml_file.xml')
-        root = tree.getroot()
-
-        # Find the gml:endPosition element
-        end_position = root.find('.//gml:endPosition', namespaces={'gml': 'http://www.opengis.net/gml/3.2'})
-        if end_position is not None:
-            end_date = end_position.text
-
-        # Convert to datetime object if needed
-        end_date_dt = dt.datetime.fromisoformat(end_date.replace('Z', '+00:00')).date()
-        return end_date_dt
+        # Walk backwards from the current month until the monthly file answers a HEAD
+        # request. The search is bounded so that a wrong URL template or a server
+        # outage raises an error instead of looping forever.
+        max_lookback = 36  # months
+        this_month = ts.Month.from_date(dt.datetime.now())
+        for _ in range(max_lookback):
+            url = self.month_url.format(variable = self.variables[0], resolution = self.resolution, year = this_month.year, month = this_month.month)
+            try:
+                # explicit timeout: requests waits indefinitely by default
+                r = requests.head(url, timeout = 60)
+                r.raise_for_status()
+            except requests.exceptions.HTTPError:
+                # no file for this month (yet): try the previous one
+                this_month -= 1
+            else:
+                # the file exists: the data is published up to the end of this month
+                return this_month.end
+
+        raise RuntimeError(f'No published data found in the last {max_lookback} months (last URL tried: {url})')
 
     def _get_data_ts(self,
                      timestep: TimeStep,
diff --git a/src/door/data_sources/jra/__init__.py b/src/door/data_sources/jra/__init__.py
@@ -0,0 +1 @@
+from .jra_downloader import JRADownloader
diff --git a/src/door/data_sources/jra/jra_downloader.py b/src/door/data_sources/jra/jra_downloader.py
@@ -0,0 +1,188 @@
+import os
+from typing import Generator, Optional, Sequence
+import xarray as xr
+import datetime as dt
+import requests
+import tempfile
+
+from ...base_downloaders import URLDownloader
+
+from d3tools import timestepping as ts
+from d3tools.timestepping.timestep import TimeStep
+from d3tools.timestepping.fixed_num_timestep import FixedNTimeStep
+from d3tools.spatial import BoundingBox, crop_to_bb
+
+class JRADownloader(URLDownloader):
+    """
+    Downloader for JRA data (product "jra-3q") from the THREDDS server at thredds.rda.ucar.edu.
+
+    Data is downloaded one monthly netCDF file at a time, restricted to the
+    requested timestep, cropped to the requested bounding box and aggregated
+    over time with the aggregation method(s) set for each variable.
+    """
+
+    source = "JRA"
+    name = "JRA_downloader"
+
+    single_temp_folder = True
+    separate_vars = True
+
+    default_options = {
+        "resolution": 0.375,
+        "freq" : 'd',
+        'variables'   : 'precipitation',
+        'agg_method'  : None
+    }
+
+    # maps the `resolution` option to the grid code used in file and variable names
+    grid_codes = {
+        0.375 : 'gauss',
+        1.25  : 'll125'
+    }
+
+    home = "https://thredds.rda.ucar.edu/thredds/fileServer/files/g/d640000/"
+
+    available_agg_methods = ['mean', 'max', 'min', 'sum']
+
+    # for each product: the URL template of the monthly files ("url_blank") and
+    # the catalog page listing the available months ("data_list")
+    available_products: dict = {
+        "jra-3q": {
+            "url_blank" : home + "{dataset}/{month.start:%Y%m}/jra3q.{dataset}.{var_code}.{var_name}-{grid_code}.{month.start:%Y%m%d}00_{month.end:%Y%m%d}23.nc",
+            "data_list" : "https://thredds.rda.ucar.edu/thredds/catalog/files/g/d640000/{dataset}/catalog.html"
+        }
+    }
+
+    available_variables: dict = {
+        "jra-3q": {
+            "precipitation": {
+                "dataset" : 'fcst_phy2m',
+                "var_code" : '0_1_52',
+                "var_name" : "tprate1have-sfc-fc", # this is a rate in mm/s, will need to multiply by 3600 to get mm/h and then sum to get total precipitation
+                "agg_method" : 'sum'
+            }
+        }
+    }
+
+    def __init__(self, product: str) -> None:
+        """
+        Create a downloader for `product`.
+
+        The product is set first because `set_product` is what defines
+        `self.url_blank`, which is then passed to the parent constructor.
+        """
+        self.set_product(product)
+        super().__init__(self.url_blank, protocol = 'http')
+
+    def set_variables(self, variables: str|list[str]) -> None:
+        """
+        Set the variables to download and their aggregation methods.
+
+        `variables` may be a single name or a list of names. `self.agg_method`
+        must provide one entry per variable (a single non-list value counts as
+        one entry); each entry is validated with `check_agg` and stored as a
+        list under the 'agg_method' key of the corresponding `self.variables` item.
+
+        Raises:
+            ValueError: if the number of aggregation methods differs from the
+                number of variables, or if a method is not available.
+        """
+        if isinstance(variables, str):
+            variables = [variables]
+        super().set_variables(variables)
+
+        agg_options = self.agg_method
+        if not isinstance(agg_options, list):
+            agg_options = [agg_options]
+
+        if len(agg_options) != len(variables):
+            msg = 'The number of aggregation methods must be the same as the number of variables'
+            self.log.error(msg)
+            raise ValueError(msg)
+
+        for agg, var in zip(agg_options, variables):
+            agg = self.check_agg(agg)
+            self.variables[var].update({'agg_method': agg})
+
+    def check_agg(self, agg):
+        """
+        Validate one aggregation method or a list of them.
+
+        Returns the input as a list. Raises ValueError (after logging the
+        message) if any entry is not in `self.available_agg_methods`.
+        """
+        if not isinstance(agg, list): agg = [agg]
+        for a in agg:
+            if a not in self.available_agg_methods:
+                msg = f'Aggregation method {a} not available'
+                self.log.error(msg)
+                raise ValueError(msg)
+        return agg
+
+    def get_last_published_ts(self, **kwargs) -> ts.TimeRange:
+
+        """
+        Get the last timestep that is completely published.
+
+        The timestep frequency is `self.freq` ('d' if the attribute is not set).
+        """
+
+        last_date = self.get_last_published_date(**kwargs)
+
+        # get the timestep of the last date
+        freq = self.freq if hasattr(self, 'freq') else 'd'
+        last_date_timestep = ts.TimeStep.from_unit(freq).from_date(last_date)
+
+        # if the last date is the last day of its timestep, return the last timestep
+        if last_date == last_date_timestep.end:
+            return last_date_timestep
+        # else, return the timestep before the one of the last date
+        else:
+            return last_date_timestep - 1
+
+    def get_last_published_date(self, **kwargs) -> dt.datetime:
+
+        """
+        Get the last published date for the dataset.
+
+        For each variable, the catalog page of its dataset is scanned for the
+        links to the monthly folders (named YYYYMM) and the most recent month
+        is taken. The result is the end of the earliest of these months, i.e.
+        the last date for which all the variables are available.
+
+        Raises:
+            ValueError: if no variables are set, if a variable has no 'dataset',
+                or if the catalog page lists no monthly folders.
+            requests.exceptions.HTTPError: if the catalog page cannot be retrieved.
+        """
+        import re
+
+        if not self.variables:
+            raise ValueError('No variables set: cannot determine the last published date')
+
+        # this is 100% not the best way to do this, but it works for now
+        month_link = re.compile(r'href="(\d{4})(\d{2})/catalog.html"')
+
+        last_month = None
+        for variable in self.variables:
+            if 'dataset' not in self.variables[variable]:
+                raise ValueError(f'Dataset not defined for variable {variable}')
+
+            url = self.data_list.format(dataset = self.variables[variable]['dataset'])
+            # explicit timeout: requests waits indefinitely by default
+            with requests.get(url, timeout = 60) as response:
+                # do not try to parse an error page
+                response.raise_for_status()
+                matches = month_link.findall(response.text)
+
+            if not matches:
+                raise ValueError(f'No monthly folders found in {url}')
+
+            # (YYYY, MM) pairs of fixed-width digit strings sort chronologically,
+            # so max() finds the latest month whatever the order of the links
+            year, month = max(matches)
+            this_last_month = ts.Month(int(year), int(month))
+            last_month = this_last_month if last_month is None else min(last_month, this_last_month)
+
+        return last_month.end
+
+    def _tmp_file_name(self, month) -> str:
+        """
+        Name of the temporary file holding the data of `month` for the current variable.
+
+        The variable is part of the name because all variables share the same
+        temporary folder: without it, the file of one variable would be mistaken
+        for the file of another.
+        """
+        return f'temp_{self.product}_{self.variable}_{month.year}{month.month}.nc'
+
+    def _get_data_ts(self,
+                     timestep: TimeStep,
+                     space_bounds: BoundingBox,
+                     tmp_path: str) -> Generator[tuple[xr.DataArray, dict], None, None]:
+        """
+        Yield the data of the current variable (`self.variable`) for `timestep`.
+
+        The monthly file containing the timestep is downloaded to `tmp_path`
+        unless it is already there. The data is then restricted to the dates of
+        the timestep, cropped to `space_bounds` and aggregated over time.
+
+        Yields:
+            One (data, tags) tuple per aggregation method of the variable; the
+            tags are 'agg_method', 'variable' and 'resolution'.
+
+        Raises:
+            ValueError: if an aggregation method is not recognized.
+        """
+
+        this_var = self.variables[self.variable]
+        this_month = ts.Month(timestep.year, timestep.month)
+        grid_code = self.grid_codes[self.resolution]
+
+        # check if the file is not already downloaded in the tmp_path
+        tmp_destination = os.path.join(tmp_path, self._tmp_file_name(this_month))
+        if not os.path.exists(tmp_destination):
+            tags = {
+                'dataset' : this_var['dataset'],
+                'var_code' : this_var['var_code'],
+                'var_name' : this_var['var_name'],
+                'grid_code' : grid_code,
+                'month' : this_month
+            }
+            # download the file
+            self.download(tmp_destination, min_size = 2000, missing_action = 'warning', **tags)
+
+            # once we download a month, we can delete the previous month
+            prev_file = os.path.join(tmp_path, self._tmp_file_name(this_month - 1))
+            if os.path.exists(prev_file):
+                os.remove(prev_file)
+
+        # open the file
+        raw_data = xr.open_dataset(tmp_destination, engine = 'netcdf4')
+        vardata = raw_data[f"{this_var['var_name']}-{grid_code}"]
+
+        # only select the relevant time range
+        inrange = (vardata.time.dt.date >= timestep.start.date()) & (vardata.time.dt.date <= timestep.end.date())
+        vardata = vardata.sel(time = inrange)
+
+        # crop the data
+        cropped = crop_to_bb(vardata, space_bounds)
+
+        # if this is precipitation data, we need to transform it to mm/h
+        if this_var['var_name'] == 'tprate1have-sfc-fc':
+            cropped *= 3600
+
+        # aggregate the data
+        for agg_method in this_var['agg_method']:
+            if agg_method == 'sum':
+                aggregated = cropped.sum(dim = 'time')
+            elif agg_method == 'mean':
+                aggregated = cropped.mean(dim = 'time')
+            elif agg_method == 'max':
+                aggregated = cropped.max(dim = 'time')
+            elif agg_method == 'min':
+                aggregated = cropped.min(dim = 'time')
+            else:
+                # report the method that failed, not the whole `agg_method` option
+                raise ValueError(f'Aggregation method {agg_method} not recognized')
+
+            yield aggregated, {'agg_method': agg_method, 'variable': self.variable, 'resolution': str(self.resolution).replace('.', '')}
diff --git a/src/door/data_sources/noaa/noaa_downloader.py b/src/door/data_sources/noaa/noaa_downloader.py
diff --git a/workflow_examples/option_files/jra-3Q_example.json b/workflow_examples/option_files/jra-3Q_example.json

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .jra_downloader import JRADownloader`