22from pathlib import Path
33from typing import Annotated
44
5- import pandas as pd
65import pooch
76import typer
87import xarray as xr
98
10- from ref_sample_data import CMIP6Request , DataRequest , Obs4MIPsRequest
9+ from ref_sample_data import CMIP6Request , DataRequest , Obs4MIPsRequest , Obs4REFRequest
1110
1211OUTPUT_PATH = Path ("data" )
1312app = typer .Typer ()
1413
1514
16- def deduplicate_datasets (datasets : pd .DataFrame ) -> pd .DataFrame :
17- """
18- Deduplicate a dataset collection.
19-
20- Uses the metadata from the first dataset in each group,
21- but expands the time range to the min/max timespan of the group.
22-
23- Parameters
24- ----------
25- datasets
26- The dataset collection
27-
28- Returns
29- -------
30- pd.DataFrame
31- The deduplicated dataset collection spanning the times requested
32- """
33-
34- def _deduplicate_group (group : pd .DataFrame ) -> pd .DataFrame :
35- first = group .iloc [0 ].copy ()
36- first .time_start = group .time_start .min ()
37- first .time_end = group .time_end .max ()
38-
39- return first
40-
41- return datasets .groupby ("key" ).apply (_deduplicate_group , include_groups = False ).reset_index ()
42-
43-
4415def process_sample_data_request (
4516 request : DataRequest , decimate : bool , output_directory : Path , quiet : bool
4617) -> None :
@@ -61,7 +32,6 @@ def process_sample_data_request(
6132 Whether to suppress progress messages
6233 """
6334 datasets = request .fetch_datasets ()
64- datasets = deduplicate_datasets (datasets )
6535
6636 for _ , dataset in datasets .iterrows ():
6737 for ds_filename in dataset ["files" ]:
@@ -183,6 +153,8 @@ def process_sample_data_request(
183153 remove_ensembles = False ,
184154 time_span = ("2002" , "2016" ),
185155 ),
156+ # All unpublished obs4mips datasets
157+ Obs4REFRequest (),
186158]
187159
188160
0 commit comments