Skip to content

Commit 6770eee

Browse files
committed
Move preprocessor output filename configuration to new config
1 parent d91377a commit 6770eee

File tree

14 files changed

+242
-72
lines changed

14 files changed

+242
-72
lines changed

esmvalcore/_recipe/recipe.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
GRIB_FORMATS,
2828
_dates_to_timerange,
2929
_get_multiproduct_filename,
30-
_get_output_file,
3130
_parse_period,
3231
_truncate_dates,
3332
)
@@ -38,6 +37,7 @@
3837
MULTI_MODEL_FUNCTIONS,
3938
PreprocessingTask,
4039
PreprocessorFile,
40+
_get_preprocessor_filename,
4141
)
4242
from esmvalcore.preprocessor._area import _update_shapefile_path
4343
from esmvalcore.preprocessor._multimodel import _get_stat_identifier
@@ -678,11 +678,7 @@ def _get_preprocessor_products(
678678
_schedule_for_download(input_datasets)
679679
_log_input_files(input_datasets)
680680
logger.info("Found input files for %s", dataset.summary(shorten=True))
681-
682-
filename = _get_output_file(
683-
dataset.facets,
684-
dataset.session.preproc_dir,
685-
)
681+
filename = _get_preprocessor_filename(dataset)
686682
product = PreprocessorFile(
687683
filename=filename,
688684
attributes=dataset.facets,

esmvalcore/config/_config_object.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import dask.config
1212

1313
import esmvalcore
14-
from esmvalcore.config._config import load_config_developer
1514
from esmvalcore.config._config_validators import (
1615
_deprecated_options_defaults,
1716
_deprecators,
@@ -129,10 +128,6 @@ def load_from_dirs(self, dirs: Iterable[str | Path]) -> None:
129128
new_config_dict = self._get_config_dict_from_dirs(dirs)
130129
self.clear()
131130
self.update(new_config_dict)
132-
# Add known projects from config-developer file while we still have it.
133-
for project in load_config_developer(self["config_developer_file"]):
134-
if project not in self["projects"]:
135-
self["projects"][project] = {}
136131
self.check_missing()
137132

138133
def reload(self) -> None:

esmvalcore/config/_config_validators.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ def validate_projects(
375375
options_for_project: dict[str, Callable[[Any], Any]] = {
376376
"data": validate_dict, # TODO: try to create data sources here
377377
"extra_facets": validate_dict,
378+
"preprocessor_filename_template": validate_string,
378379
}
379380
for project, project_config in mapping.items():
380381
for option, val in project_config.items():
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Templates for the filenames used to write preprocessor output.
2+
projects:
3+
# ESGF projects.
4+
CMIP3:
5+
preprocessor_filename_template: "{project}_{institute}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}"
6+
CMIP5:
7+
preprocessor_filename_template: "{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}"
8+
CMIP6:
9+
preprocessor_filename_template: "{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}_{grid}"
10+
CORDEX:
11+
preprocessor_filename_template: "{project}_{institute}_{dataset}_{rcm_version}_{driver}_{domain}_{mip}_{exp}_{ensemble}_{short_name}"
12+
obs4MIPs:
13+
preprocessor_filename_template: "{project}_{dataset}_{short_name}"
14+
# Observational and reanalysis data that has been CMORized by ESMValTool according to the CMIP5 standard.
15+
OBS:
16+
preprocessor_filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}"
17+
# Observational and reanalysis data that has been CMORized by ESMValTool according to the CMIP6 standard.
18+
OBS6:
19+
preprocessor_filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}"
20+
# Observational and reanalysis data that can be read in its native format by ESMValCore.
21+
native6:
22+
preprocessor_filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}"
23+
# Data from various climate models in their native output format.
24+
ACCESS:
25+
preprocessor_filename_template: "{project}_{dataset}_{mip}_{exp}_{institute}_{sub_dataset}_{freq_attribute}_{short_name}"
26+
CESM:
27+
preprocessor_filename_template: "{project}_{dataset}_{case}_{gcomp}_{scomp}_{type}_{mip}_{short_name}"
28+
EMAC:
29+
preprocessor_filename_template: "{project}_{dataset}_{exp}_{channel}_{mip}_{short_name}"
30+
ICON:
31+
preprocessor_filename_template: "{project}_{dataset}_{exp}_{var_type}_{mip}_{short_name}"
32+
IPSLCM:
33+
preprocessor_filename_template: "{dataset}_{account}_{model}_{status}_{exp}_{simulation}_{freq}_{short_name}"

esmvalcore/dataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
)
2828
from esmvalcore.config._data_sources import _get_data_sources
2929
from esmvalcore.exceptions import InputFilesNotFound, RecipeError
30-
from esmvalcore.io.local import _dates_to_timerange, _get_output_file
31-
from esmvalcore.preprocessor import preprocess
30+
from esmvalcore.io.local import _dates_to_timerange
31+
from esmvalcore.preprocessor import _get_preprocessor_filename, preprocess
3232

3333
if TYPE_CHECKING:
3434
from collections.abc import Iterable, Iterator, Sequence
@@ -815,7 +815,7 @@ def load(self) -> Cube:
815815
supplementary_cube = supplementary_dataset._load() # noqa: SLF001
816816
supplementary_cubes.append(supplementary_cube)
817817

818-
output_file = _get_output_file(self.facets, self.session.preproc_dir)
818+
output_file = _get_preprocessor_filename(self)
819819
cubes = preprocess(
820820
[cube],
821821
"add_supplementary_variables",
@@ -833,7 +833,7 @@ def _load(self) -> Cube:
833833
msg = check.get_no_data_message(self)
834834
raise InputFilesNotFound(msg)
835835

836-
output_file = _get_output_file(self.facets, self.session.preproc_dir)
836+
output_file = _get_preprocessor_filename(self)
837837
fix_dir_prefix = Path(
838838
self.session._fixed_file_dir, # noqa: SLF001
839839
self._get_joined_summary_facets("_", join_lists=True) + "_",

esmvalcore/io/local.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@
5858
from netCDF4 import Dataset
5959

6060
import esmvalcore.io.protocol
61-
from esmvalcore.config._config import get_project_config
6261
from esmvalcore.exceptions import RecipeError
6362
from esmvalcore.iris_helpers import ignore_warnings_context
6463

@@ -697,27 +696,6 @@ def _templates_to_regex(self) -> str:
697696
return pattern
698697

699698

700-
def _get_output_file(variable: dict[str, Any], preproc_dir: Path) -> Path:
701-
"""Return the full path to the output (preprocessed) file."""
702-
cfg = get_project_config(variable["project"])
703-
704-
# Join different experiment names
705-
if isinstance(variable.get("exp"), (list, tuple)):
706-
variable = dict(variable)
707-
variable["exp"] = "-".join(variable["exp"])
708-
outfile = _replace_tags(cfg["output_file"], variable)[0]
709-
if "timerange" in variable:
710-
timerange = variable["timerange"].replace("/", "-")
711-
outfile = Path(f"{outfile}_{timerange}")
712-
outfile = Path(f"{outfile}.nc")
713-
return Path(
714-
preproc_dir,
715-
variable.get("diagnostic", ""),
716-
variable.get("variable_group", ""),
717-
outfile,
718-
)
719-
720-
721699
def _get_multiproduct_filename(attributes: dict, preproc_dir: Path) -> Path:
722700
"""Get ensemble/multi-model filename depending on settings."""
723701
relevant_keys = [

esmvalcore/local.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,18 @@
99

1010
import logging
1111
import os.path
12+
import textwrap
1213
import warnings
1314
from pathlib import Path
14-
from typing import TYPE_CHECKING
15+
from typing import TYPE_CHECKING, Any
1516

1617
from esmvalcore.config import CFG
1718
from esmvalcore.config._config import get_ignored_warnings, get_project_config
1819
from esmvalcore.io.local import (
1920
LocalDataSource,
2021
LocalFile,
2122
_filter_versions_called_latest,
23+
_replace_tags,
2224
_select_latest_version,
2325
)
2426

@@ -255,3 +257,39 @@ def find_files(
255257
globs.extend(data_source._get_glob_patterns(**facets)) # noqa: SLF001
256258
return files, sorted(globs)
257259
return files
260+
261+
262+
_GET_OUTPUT_PATH_WARNED: set[str] = set()
263+
264+
265+
def _get_output_file(variable: dict[str, Any], preproc_dir: Path) -> Path:
    """Return the full path to the output (preprocessed) file.

    This is the deprecated code path: the filename is built from the
    ``output_file`` entry of the project configuration returned by
    ``get_project_config``. The first time a project is seen, a warning is
    logged that shows the equivalent ``preprocessor_filename_template``
    setting.

    Parameters
    ----------
    variable:
        Facets describing the variable. Must contain ``project``; ``exp``,
        ``timerange``, ``diagnostic`` and ``variable_group`` are used when
        present.
    preproc_dir:
        Directory below which the preprocessed file is stored.

    Returns
    -------
    :
        ``preproc_dir/<diagnostic>/<variable_group>/<filename>.nc``. Absent
        ``diagnostic`` or ``variable_group`` facets contribute an empty path
        segment, which ``Path`` ignores.
    """
    project = variable["project"]
    cfg = get_project_config(project)
    if project not in _GET_OUTPUT_PATH_WARNED:
        # Remember the project so the deprecation warning is logged only once
        # per project instead of once per file.
        _GET_OUTPUT_PATH_WARNED.add(project)
        msg = textwrap.dedent(
            f"""
            Defining 'output_file' in config-developer.yml is deprecated and will be removed in version 2.16.0. Please use the following configuration instead:
            projects:
              {project}:
                preprocessor_filename_template: "{cfg["output_file"]}"
            """.rstrip(),
        )
        logger.warning(msg)

    # Join different experiment names
    if isinstance(variable.get("exp"), (list, tuple)):
        # Work on a copy so the caller's facets are not modified.
        variable = dict(variable)
        variable["exp"] = "-".join(variable["exp"])

    filename = str(_replace_tags(cfg["output_file"], variable)[0])
    if "timerange" in variable:
        # "/" separates start and end of the time range but is not allowed
        # inside a single filename.
        timerange = variable["timerange"].replace("/", "-")
        filename = f"{filename}_{timerange}"
    filename = f"{filename}.nc"

    return Path(
        preproc_dir,
        variable.get("diagnostic", ""),
        variable.get("variable_group", ""),
        filename,
    )

esmvalcore/preprocessor/__init__.py

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import copy
66
import inspect
77
import logging
8+
import re
9+
from collections.abc import Sequence
10+
from pathlib import Path
811
from pprint import pformat
912
from typing import TYPE_CHECKING, Any
1013

@@ -14,7 +17,10 @@
1417
from esmvalcore._task import BaseTask
1518
from esmvalcore.cmor.check import cmor_check_data, cmor_check_metadata
1619
from esmvalcore.cmor.fix import fix_data, fix_file, fix_metadata
20+
from esmvalcore.exceptions import RecipeError
21+
from esmvalcore.io.local import _parse_period
1722
from esmvalcore.io.protocol import DataElement
23+
from esmvalcore.local import _get_output_file
1824
from esmvalcore.preprocessor._area import (
1925
area_statistics,
2026
extract_named_regions,
@@ -102,14 +108,14 @@
102108
from esmvalcore.preprocessor._weighting import weighting_landsea_fraction
103109

104110
if TYPE_CHECKING:
105-
from collections.abc import Callable, Iterable, Sequence
106-
from pathlib import Path
111+
from collections.abc import Callable, Iterable
107112

108113
import prov.model
109114
from dask.delayed import Delayed
110115
from iris.cube import CubeList
111116

112117
from esmvalcore.dataset import Dataset
118+
from esmvalcore.typing import FacetValue
113119

114120
logger = logging.getLogger(__name__)
115121

@@ -254,6 +260,71 @@
254260
}
255261

256262

263+
def _get_preprocessor_filename(dataset: Dataset) -> Path:
    """Get a filename for storing a preprocessed dataset.

    The filename is built from the ``preprocessor_filename_template``
    configured for the dataset's project. If no template is configured, the
    deprecated ``_get_output_file`` is tried first; if that raises
    ``RecipeError``, a default template made of the dataset's minimal facets
    (sorted by name) is used.

    Parameters
    ----------
    dataset:
        The dataset that will be preprocessed.

    Returns
    -------
    :
        A path for storing a preprocessed file.
    """

    def is_facet_value(value: Any) -> bool:  # noqa: ANN401
        """Check if a value is of type `esmvalcore.typing.FacetValue`."""
        return isinstance(value, str | int) or (
            isinstance(value, Sequence)
            and all(isinstance(v, str) for v in value)
        )

    def normalize(value: FacetValue) -> str:
        """Normalize a facet value to a string that can be used in a filename.

        Each run of characters other than ASCII letters and digits becomes a
        single "-", and the result is cut to 25 characters. The items of a
        sequence are normalized one by one and joined with "-".
        """
        if isinstance(value, str | int):
            return re.sub("[^a-zA-Z0-9]+", "-", str(value))[:25]
        return "-".join(normalize(v) for v in value)

    # ``None`` is an explicit "nothing configured" marker; this avoids
    # relying on object identity between two strings.
    template = (
        dataset.session["projects"]
        .get(dataset.facets["project"], {})
        .get("preprocessor_filename_template")
    )
    if template is None:
        try:
            # Use config-developer.yml for backward compatibility, remove in v2.16.
            return _get_output_file(
                dataset.facets,
                dataset.session.preproc_dir,
            )
        except RecipeError:
            # Deliberate best effort: the legacy lookup failed, so fall back
            # to the default template built below.
            pass

        # These facets are encoded elsewhere: "timerange" is appended to the
        # filename and "diagnostic"/"variable_group" become directories.
        excluded = ("timerange", "diagnostic", "variable_group", "preprocessor")
        minimal_facets = dataset.minimal_facets
        template = "_".join(
            f"{{{key}}}"
            for key in sorted(minimal_facets)
            if key not in excluded and is_facet_value(minimal_facets[key])
        )

    normalized_facets = {
        k: normalize(v) for k, v in dataset.facets.items() if is_facet_value(v)
    }
    filename = template.format(**normalized_facets)
    if "timerange" in dataset.facets:
        start_time, end_time = _parse_period(dataset.facets["timerange"])
        filename += f"_{start_time}-{end_time}"
    filename += ".nc"
    return Path(
        dataset.session.preproc_dir,
        dataset.facets.get("diagnostic", ""),  # type: ignore[arg-type]
        dataset.facets.get("variable_group", ""),  # type: ignore[arg-type]
        filename,
    )
326+
327+
257328
def _get_itype(step: str) -> str:
258329
"""Get the input type of a preprocessor function."""
259330
function = globals()[step]
@@ -520,7 +591,7 @@ def __init__(
520591
filename: Path,
521592
attributes: dict[str, Any] | None = None,
522593
settings: dict[str, Any] | None = None,
523-
datasets: list | None = None,
594+
datasets: list[Dataset] | None = None,
524595
) -> None:
525596
if datasets is not None:
526597
# Load data using a Dataset

tests/integration/io/test_local.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,9 @@
1313
from esmvalcore.io.local import (
1414
LocalDataSource,
1515
LocalFile,
16-
_get_output_file,
1716
_parse_period,
1817
)
19-
from esmvalcore.local import _select_drs, find_files
18+
from esmvalcore.local import _get_output_file, _select_drs, find_files
2019

2120
# Load test configuration
2221
with open(

0 commit comments

Comments
 (0)