Skip to content

Commit bf9fdc5

Browse files
authored
Merge pull request #133 from c-hydro/dev
Dev
2 parents c7279cf + c90280a commit bf9fdc5

File tree

10 files changed

+277
-55
lines changed

10 files changed

+277
-55
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "door"
7-
version = "2.3.3"
7+
version = "2.3.4"
88
description = "A package for operational retrieval of raster data from different sources"
99
authors = [
1010
{ name = "Andrea Libertino", email = "andrea.libertino@cimafoundation.org" },

src/door/base_downloaders.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,14 @@ def set_options(self, options: dict) -> None:
256256
if 'variables' in options:
257257
variables = options['variables']
258258
self.set_variables(variables)
259-
259+
260+
def set_product(self, product: str) -> None:
261+
self.product = product.lower()
262+
if self.product not in self.available_products:
263+
raise ValueError(f'Product {product} not available. Choose one of {self.available_products.keys()}')
264+
for key in self.available_products[self.product]:
265+
setattr(self, key, self.available_products[self.product][key])
266+
260267
def set_variables(self, variables: list) -> None:
261268
available_variables = self.available_variables
262269
if hasattr(self, 'product') and self.product in available_variables:
@@ -273,7 +280,14 @@ def get_last_ts(self, **kwargs) -> tuple[ts.TimeStep]:
273280

274281
last_ts_output = None
275282

276-
variables = list(self.variables.keys()) if hasattr(self, 'variables') else ['__var__']
283+
if hasattr(self, 'variables'):
284+
if isinstance(self.variables, list):
285+
variables = self.variables
286+
elif isinstance(self.variables, dict):
287+
variables = list(self.variables.keys())
288+
else:
289+
variables = ['__var__']
290+
277291
tiles = self.destination.tile_names
278292

279293
for i, variable in enumerate(variables):

src/door/data_sources/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@
99
from .persiann import *
1010
from .jaxa import *
1111
from .noaa import *
12-
from .eobs import *
12+
from .eobs import *
13+
from .jra import *

src/door/data_sources/cds/era5_downloader.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,12 @@ def __init__(self, product = 'reanalysis-era5-single-levels') -> None:
6969
self.log.error(msg)
7070
raise ValueError(msg)
7171

72-
def set_variables(self, variables: list[str]) -> None:
72+
def set_variables(self, variables: str|list[str]) -> None:
7373
"""
7474
Set the variables to download.
7575
"""
76-
76+
if isinstance(variables, str):
77+
variables = [variables]
7778
super().set_variables(variables)
7879

7980
agg_options = self.agg_method

src/door/data_sources/earthdata/viirsmodis_downloader.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,17 @@ def set_attributes(self, dataset: xr.DataArray, varopts: dict) -> xr.DataArray:
180180
"""
181181
Set the attributes of the dataset.
182182
"""
183-
dataset.attrs['valid_range'] = varopts.get('valid_range', None)
184-
dataset.attrs['_FillValue'] = varopts.get('fill_value', None)
185-
dataset.attrs['scale_factor'] = varopts.get('scale_factor', None)
183+
184+
valid_range = varopts['valid_range']
185+
fill_value = varopts['fill_value']
186+
scale_factor = varopts.get('scale_factor', None)
187+
188+
if valid_range is not None:
189+
dataset = dataset.where((dataset >= valid_range[0]) & (dataset <= valid_range[1]), fill_value)
190+
191+
dataset.attrs['valid_range'] = valid_range
192+
dataset.attrs['_FillValue'] = fill_value
193+
dataset.attrs['scale_factor'] = scale_factor
186194

187195
return dataset
188196

@@ -287,9 +295,9 @@ class MODISDownloader(VIIRSMODISDownloader):
287295

288296
available_variables = {
289297
'et': {
290-
'ET' : {'id': 0, 'valid_range': (-32767, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
291-
'PET' : {'id': 2, 'valid_range': (-32767, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
292-
'ET_QC' : {'id': 4, 'valid_range': (0,254), 'fill_value' : 255, 'scale_factor': 1 },
298+
'ET' : {'id': 0, 'valid_range': (0, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
299+
'PET' : {'id': 2, 'valid_range': (0, 32700), 'fill_value' : 32767, 'scale_factor': 0.1},
300+
'ET_QC' : {'id': 4, 'valid_range': (0,254), 'fill_value' : 255, 'scale_factor': 1 },
293301
}
294302
}
295303

src/door/data_sources/eobs/eobs_downloader.py

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -89,42 +89,28 @@ def get_last_published_ts(self, **kwargs) -> ts.TimeRange:
8989
last_date = self.get_last_published_date(**kwargs)
9090

9191
# get the timestep of the last date
92-
ts_per_year = self.ts_per_year if hasattr(self, 'ts_per_year') else 365
93-
last_date_timestep = FixedNTimeStep(last_date, ts_per_year)
92+
last_date_timestep = ts.Day.from_date(last_date)
9493

95-
# if the last date is the last day of its timestep, return the last timestep
96-
if last_date == last_date_timestep.end:
97-
return last_date_timestep
98-
# else, return the timestep before the one of the last date
99-
else:
100-
return last_date_timestep - 1
94+
return last_date_timestep
10195

10296
def get_last_published_date(self, **kwargs) -> dt.datetime:
10397

10498
"""
10599
Get the last published date for the dataset.
106100
"""
107101

108-
self.metadata = "https://psl.noaa.gov/thredds/iso/Datasets/cpc_global_precip/precip.{year}.nc?catalog=http://psl.noaa.gov/thredds/catalog/Datasets/cpc_global_precip/catalog.html&dataset=Datasets/cpc_global_precip/precip.{year}.nc"
109-
110-
import xml.etree.ElementTree as ET
111-
112-
year = dt.datetime.now().year
113-
with requests.get(self.metadata.format(year = year)) as response:
114-
root = ET.fromstring(response.content)
115-
116-
# Parse the XML file
117-
tree = ET.parse('your_xml_file.xml')
118-
root = tree.getroot()
119-
120-
# Find the gml:endPosition element
121-
end_position = root.find('.//gml:endPosition', namespaces={'gml': 'http://www.opengis.net/gml/3.2'})
122-
if end_position is not None:
123-
end_date = end_position.text
124-
125-
# Convert to datetime object if needed
126-
end_date_dt = dt.datetime.fromisoformat(end_date.replace('Z', '+00:00')).date()
127-
return end_date_dt
102+
this_month = ts.Month.from_date(dt.datetime.now())
103+
has_data = False
104+
while not has_data:
105+
try:
106+
url = self.month_url.format(variable = self.variables[0], resolution = self.resolution, year = this_month.year, month = this_month.month)
107+
r = requests.head(url)
108+
r.raise_for_status()
109+
has_data = True
110+
except requests.exceptions.HTTPError:
111+
this_month -= 1
112+
113+
return this_month.end
128114

129115
def _get_data_ts(self,
130116
timestep: TimeStep,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .jra_downloader import JRADownloader
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import os
2+
from typing import Generator, Optional, Sequence
3+
import xarray as xr
4+
import datetime as dt
5+
import requests
6+
import tempfile
7+
8+
from ...base_downloaders import URLDownloader
9+
10+
from d3tools import timestepping as ts
11+
from d3tools.timestepping.timestep import TimeStep
12+
from d3tools.timestepping.fixed_num_timestep import FixedNTimeStep
13+
from d3tools.spatial import BoundingBox, crop_to_bb
14+
15+
class JRADownloader(URLDownloader):
16+
source = "JRA"
17+
name = "JRA_downloader"
18+
19+
single_temp_folder = True
20+
separate_vars = True
21+
22+
default_options = {
23+
"resolution": 0.375,
24+
"freq" : 'd',
25+
'variables' : 'precipitation',
26+
'agg_method' : None
27+
}
28+
29+
grid_codes = {
30+
0.375 : 'gauss',
31+
1.25 : 'll125'
32+
}
33+
34+
home = "https://thredds.rda.ucar.edu/thredds/fileServer/files/g/d640000/"
35+
36+
37+
available_agg_methods = ['mean', 'max', 'min', 'sum']
38+
39+
available_products: dict = {
40+
"jra-3q": {
41+
"url_blank" : home + "{dataset}/{month.start:%Y%m}/jra3q.{dataset}.{var_code}.{var_name}-{grid_code}.{month.start:%Y%m%d}00_{month.end:%Y%m%d}23.nc",
42+
"data_list" : "https://thredds.rda.ucar.edu/thredds/catalog/files/g/d640000/{dataset}/catalog.html"
43+
}
44+
}
45+
46+
available_variables: dict = {
47+
"jra-3q": {
48+
"precipitation": {
49+
"dataset" : 'fcst_phy2m',
50+
"var_code" : '0_1_52',
51+
"var_name" : "tprate1have-sfc-fc", # this is a rate in mm/s, will need to multiply by 3600 to get mm/h and then sum to get total precipitation
52+
"agg_method" : 'sum'
53+
}
54+
}
55+
}
56+
57+
def __init__(self, product: str) -> None:
58+
self.set_product(product)
59+
super().__init__(self.url_blank, protocol = 'http')
60+
61+
def set_variables(self, variables: str|list[str]) -> None:
62+
"""
63+
Set the variables to download.
64+
"""
65+
if isinstance(variables, str):
66+
variables = [variables]
67+
super().set_variables(variables)
68+
69+
agg_options = self.agg_method
70+
if not isinstance(agg_options, list):
71+
agg_options = [agg_options]
72+
73+
if len(agg_options) != len(variables):
74+
msg = 'The number of aggregation methods must be the same as the number of variables'
75+
self.log.error(msg)
76+
raise ValueError(msg)
77+
78+
for agg, var in zip(agg_options, variables):
79+
agg = self.check_agg(agg)
80+
self.variables[var].update({'agg_method': agg})
81+
82+
def check_agg(self, agg):
83+
if not isinstance(agg, list): agg = [agg]
84+
for a in agg:
85+
if a not in self.available_agg_methods:
86+
msg = f'Aggregation method {a} not available'
87+
self.log.error(msg)
88+
raise ValueError(msg)
89+
return agg
90+
91+
def get_last_published_ts(self, **kwargs) -> ts.TimeRange:
92+
93+
"""
94+
Get the last published date for the dataset.
95+
"""
96+
97+
last_date = self.get_last_published_date(**kwargs)
98+
99+
# get the timestep of the last date
100+
freq = self.freq if hasattr(self, 'freq') else 'd'
101+
last_date_timestep = ts.TimeStep.from_unit(freq).from_date(last_date)
102+
103+
# if the last date is the last day of its timestep, return the last timestep
104+
if last_date == last_date_timestep.end:
105+
return last_date_timestep
106+
# else, return the timestep before the one of the last date
107+
else:
108+
return last_date_timestep - 1
109+
110+
def get_last_published_date(self, **kwargs) -> dt.datetime:
111+
112+
"""
113+
Get the last published date for the dataset.
114+
"""
115+
import re
116+
last_month = None
117+
for variable in self.variables:
118+
if 'dataset' not in self.variables[variable]:
119+
raise ValueError(f'Dataset not defined for variable {variable}')
120+
121+
url = self.data_list.format(dataset = self.variables[variable]['dataset'])
122+
with requests.get(url) as response:
123+
# this is 100% not the best way to do this, but it works for now
124+
matches = re.findall(r'href="(\d{4})(\d{2})/catalog.html"', response.text)
125+
126+
this_last_month = ts.Month(int(matches[-1][0]), int(matches[-1][1]))
127+
last_month = this_last_month if last_month is None else min(last_month, this_last_month)
128+
129+
return last_month.end
130+
131+
def _get_data_ts(self,
132+
timestep: TimeStep,
133+
space_bounds: BoundingBox,
134+
tmp_path: str) -> Generator[tuple[xr.DataArray, dict], None, None]:
135+
136+
this_var = self.variables[self.variable]
137+
this_month = ts.Month(timestep.year, timestep.month)
138+
tmp_file_nc = f'temp_{self.product}{this_month.year}{this_month.month}.nc'
139+
140+
# check if the file is not already downloaded in the tmp_path
141+
tmp_destination = os.path.join(tmp_path, tmp_file_nc)
142+
if not os.path.exists(tmp_destination):
143+
tags = {
144+
'dataset' : this_var['dataset'],
145+
'var_code' : this_var['var_code'],
146+
'var_name' : this_var['var_name'],
147+
'grid_code' : self.grid_codes[self.resolution],
148+
'month' : this_month
149+
}
150+
# download the file
151+
self.download(tmp_destination, min_size = 2000, missing_action = 'warning', **tags)
152+
153+
# once we download a month, we can delete the previous month
154+
prev_month = this_month - 1
155+
prev_file = f'temp_{self.product}{prev_month.year}{prev_month.month}.nc'
156+
prev_file = os.path.join(tmp_path, prev_file)
157+
if os.path.exists(prev_file):
158+
os.remove(prev_file)
159+
160+
# open the file
161+
raw_data = xr.open_dataset(tmp_destination, engine = 'netcdf4')
162+
vardata = raw_data[f"{this_var['var_name']}-{self.grid_codes[self.resolution]}"]
163+
164+
# only select the relevant time range
165+
inrange = (vardata.time.dt.date >= timestep.start.date()) & (vardata.time.dt.date <= timestep.end.date())
166+
vardata = vardata.sel(time = inrange)
167+
168+
# crop the data
169+
cropped = crop_to_bb(vardata, space_bounds)
170+
171+
# if this is precipitation data, we need to transform it to mm/h
172+
if this_var['var_name'] == 'tprate1have-sfc-fc':
173+
cropped *= 3600
174+
175+
# aggregate the data
176+
for agg_method in this_var['agg_method']:
177+
if agg_method == 'sum':
178+
aggregated = cropped.sum(dim = 'time')
179+
elif agg_method == 'mean':
180+
aggregated = cropped.mean(dim = 'time')
181+
elif agg_method == 'max':
182+
aggregated = cropped.max(dim = 'time')
183+
elif agg_method == 'min':
184+
aggregated = cropped.min(dim = 'time')
185+
else:
186+
raise ValueError(f'Aggregation method {self.agg_method} not recognized')
187+
188+
yield aggregated, {'agg_method': agg_method, 'variable': self.variable, 'resolution': str(self.resolution).replace('.', '')}

0 commit comments

Comments
 (0)