Skip to content

Commit e8f47a2

Browse files
authored
Feature/mars schema (#325)
* Implement minimal schema for mars adaptor * Bug fixes for data_format * Conditional tag values become strings
1 parent 166d5dd commit e8f47a2

File tree

5 files changed

+281
-13
lines changed

5 files changed

+281
-13
lines changed

cads_adaptors/adaptors/cds.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def normalise_request(self, request: Request) -> Request:
314314
self.ensure_list_values(conditions)
315315
if self.satisfy_conditions(self.intersected_requests, conditions):
316316
hidden_tag = f"__{tag}"
317-
request[hidden_tag] = True
317+
request[hidden_tag] = "true"
318318
except Exception as e:
319319
self.context.add_stdout(
320320
f"An error occured while attempting conditional tagging: {e!r}"

cads_adaptors/adaptors/mars.py

Lines changed: 86 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,81 @@ def execute_mars(
144144
return target
145145

146146

147+
def minimal_mars_schema(
    allow_duplicate_values_keys=None,
    remove_duplicate_values=False,
    key_regex=None,
    value_regex=None,
):
    """Return a minimal schema for MARS requests.

    The schema requires the request to be a non-empty dict whose keys match
    ``key_regex`` and whose values are non-empty lists of strings matching
    ``value_regex``. In addition, keys that are not recognised as
    post-processing keys must not hold duplicate values.

    Args:
        allow_duplicate_values_keys: Optional key names which, in addition to
            "grid" and "area", are permitted to hold duplicate values. A bare
            string is treated as a single key name. Characters are matched
            literally (regex metacharacters are escaped) and empty names are
            ignored.
        remove_duplicate_values: Stored, negated, under the
            "_noRemoveDuplicates" schema key, i.e. when true duplicates are
            to be removed rather than rejected.
        key_regex: Optional regex overriding the default pattern for valid
            key names.
        value_regex: Optional regex overriding the default pattern for valid
            values.

    Returns:
        The schema as a dict. Keys starting with an underscore ("_draft",
        "_onErrorShowPattern", "_noRemoveDuplicates") are not standard JSON
        schema keywords; presumably they are interpreted by the project's
        validation layer -- confirm against that code.
    """
    import re

    # Regular expressions for valid keys and values. The one_char_minimum regex
    # matches any number of non-whitespace and space characters as long as there
    # is at least one non-whitespace.
    one_char_minimum = r"[\S ]*\S[\S ]*"
    whitespace = r"[ \t]*"
    default_regex = rf"^{whitespace}{one_char_minimum}{whitespace}\Z"
    key_regex = key_regex or default_regex
    value_regex = value_regex or default_regex

    # These are the only keys permitted to have duplicate values. Duplicate
    # values for field-selection keys sometimes leads to MARS rejecting the
    # request but other times can result in duplicate output fields which can
    # cause downstream problems. Manuel advises not to rely on specific MARS
    # behaviour when given duplicate values so we reject them in advance.
    if isinstance(allow_duplicate_values_keys, str):
        # A bare string is one key name, not a sequence of one-letter keys
        allow_duplicate_values_keys = [allow_duplicate_values_keys]
    postproc_keys = ["grid", "area"] + list(allow_duplicate_values_keys or [])

    # Form a regex that matches any key that will not be interpreted as one of
    # postproc_keys by MARS. i.e. a case-insensitive regex that matches anything
    # other than optional whitespace followed by a sequence of characters whose
    # lead characters match lead characters of one of postproc_keys. This
    # complexity is required because MARS will interpret all of the following as
    # area: "ArEa", "areaXYZ", "are", "arefoo".
    # Note that everything after the first character of each alternative is
    # optional, so any key starting with the first character of a
    # post-processing key is exempt from the uniqueness requirement.
    same_lead_chars = []
    for key in postproc_keys:
        # Escape each character so that key names containing regex
        # metacharacters are matched literally and cannot break the pattern
        chars = [re.escape(char) for char in key]
        if not chars:
            # An empty alternative would match every key and so silently
            # disable the uniqueness requirement altogether
            continue
        same_lead_chars.append("(".join(chars) + ")?" * (len(chars) - 1))
    not_postproc_key = r"(?i)^(?!\s*(" + "|".join(same_lead_chars) + "))"

    # Minimal schema
    schema = {
        "_draft": "7",
        "allOf": [  # All following schemas must match.
            # Basic requirements for all keys
            {
                "type": "object",  # Item is a dict
                "minProperties": 1,  # ...with at least 1 key
                "patternProperties": {
                    key_regex: {  # ...with names matching this
                        "type": "array",  # ...must hold lists
                        "minItems": 1,  # ...of at least 1 item
                        "items": {
                            "type": "string",  # ...which are strings
                            "pattern": value_regex,  # ...matching this regex
                            "_onErrorShowPattern": False,  # (error msg control)
                        },
                    }
                },
                "additionalProperties": False,  # ...with no non-matching keys
                "_onErrorShowPattern": False,  # (error msg control)
            },
            # Additional requirement for some keys
            {
                "type": "object",  # Item is a dict
                "patternProperties": {
                    not_postproc_key: {  # ...in which non post-processing keys
                        "type": "array",
                        "uniqueItems": True,  # ...containing duplicates
                        # ... are rejected or have duplicates removed
                        "_noRemoveDuplicates": not remove_duplicate_values,
                    }
                },
            },
        ],
    }

    return schema
220+
221+
147222
class DirectMarsCdsAdaptor(cds.AbstractCdsAdaptor):
148223
resources = {"MARS_CLIENT": 1}
149224

@@ -157,9 +232,12 @@ def retrieve(self, request: Request) -> BinaryIO:
157232

158233

159234
class MarsCdsAdaptor(cds.AbstractCdsAdaptor):
160-
def __init__(self, *args, **kwargs) -> None:
161-
super().__init__(*args, **kwargs)
235+
def __init__(self, *args, **config) -> None:
    """Initialise the adaptor and, unless disabled, install the MARS schema.

    All arguments are forwarded to the parent constructor. The optional
    "schema_options" entry of ``config`` is a dict of keyword arguments for
    ``minimal_mars_schema``; it may also contain "disable_adaptor_schema",
    which, when true, prevents ``adaptor_schema`` from being set here.
    """
    super().__init__(*args, **config)
    self.data_format: str | None = None
    # Work on a copy so the caller's configuration is not mutated, and
    # tolerate an explicit ``schema_options=None``
    schema_options = dict(config.get("schema_options") or {})
    # "disable_adaptor_schema" is a switch for this constructor only. It must
    # not be forwarded because minimal_mars_schema does not accept it and
    # would raise TypeError when the key is present but false.
    if not schema_options.pop("disable_adaptor_schema", False):
        self.adaptor_schema = minimal_mars_schema(**schema_options)
163241

164242
def convert_format(self, *args, **kwargs):
165243
from cads_adaptors.tools.convertors import convert_format
@@ -189,19 +267,21 @@ def pre_mapping_modifications(self, request: dict[str, Any]) -> dict[str, Any]:
189267
"therefore it is not guaranteed to work."
190268
)
191269
# Remove "format" from request if it exists
192-
data_format = request.pop("format", "grib")
270+
data_format = request.pop("format", ["grib"])
193271
data_format = handle_data_format(request.get("data_format", data_format))
194272

195273
# Account for some horribleness from the legacy system:
196274
if data_format.lower() in ["netcdf.zip", "netcdf_zip", "netcdf4.zip"]:
197275
data_format = "netcdf"
198-
request.setdefault("download_format", "zip")
276+
request.setdefault("download_format", ["zip"])
199277

200278
# Enforce value of data_format to normalized value
201-
request["data_format"] = data_format
279+
request["data_format"] = [data_format]
202280

203281
default_download_format = "as_source"
204-
download_format = request.pop("download_format", default_download_format)
282+
download_format = ensure_list(
283+
request.pop("download_format", default_download_format)
284+
)[0]
205285
self.set_download_format(
206286
download_format, default_download_format=default_download_format
207287
)

cads_adaptors/adaptors/multi.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from cads_adaptors import AbstractCdsAdaptor, mapping
44
from cads_adaptors.adaptors import Request
5+
from cads_adaptors.adaptors.mars import minimal_mars_schema
56
from cads_adaptors.exceptions import (
67
CdsConfigurationError,
78
InvalidRequest,
@@ -187,7 +188,8 @@ def split_adaptors(
187188
def pre_mapping_modifications(self, request: dict[str, Any]) -> dict[str, Any]:
188189
request = super().pre_mapping_modifications(request)
189190

190-
download_format = request.pop("download_format", "zip")
191+
download_format = request.pop("download_format", ["zip"])
192+
download_format = ensure_list(download_format)[0]
191193
self.set_download_format(download_format)
192194

193195
return request
@@ -232,6 +234,12 @@ def retrieve_list_of_results(self, request: Request) -> list[str]:
232234

233235

234236
class MultiMarsCdsAdaptor(MultiAdaptor):
237+
def __init__(self, *args, schema_options=None, **kwargs) -> None:
    """Initialise the adaptor and, unless disabled, install the MARS schema.

    ``schema_options`` is an optional dict of keyword arguments for
    ``minimal_mars_schema``; it may also contain "disable_adaptor_schema",
    which, when true, prevents ``adaptor_schema`` from being set here. All
    other arguments are forwarded to the parent constructor.
    """
    super().__init__(*args, **kwargs)
    # Work on a copy so the caller's dict is not mutated
    schema_options = dict(schema_options or {})
    # "disable_adaptor_schema" is a switch for this constructor only. It must
    # not be forwarded because minimal_mars_schema does not accept it and
    # would raise TypeError when the key is present but false.
    if not schema_options.pop("disable_adaptor_schema", False):
        self.adaptor_schema = minimal_mars_schema(**schema_options)
242+
235243
def convert_format(self, *args, **kwargs) -> list[str]:
236244
from cads_adaptors.tools.convertors import convert_format
237245

@@ -242,16 +250,18 @@ def pre_mapping_modifications(self, request: dict[str, Any]) -> dict[str, Any]:
242250
request = super().pre_mapping_modifications(request)
243251

244252
# TODO: Remove legacy syntax all together
245-
data_format = request.pop("format", "grib")
253+
data_format = request.pop("format", ["grib"])
246254
data_format = request.pop("data_format", data_format)
255+
data_format = ensure_list(data_format)[0]
247256

248257
# Account for some horribleness from the legacy system:
249258
if data_format.lower() in ["netcdf.zip", "netcdf_zip", "netcdf4.zip"]:
250259
data_format = "netcdf"
251-
request.setdefault("download_format", "zip")
260+
request.setdefault("download_format", ["zip"])
252261

253262
default_download_format = "as_source"
254-
download_format = request.pop("download_format", default_download_format)
263+
download_format = request.pop("download_format", [default_download_format])
264+
download_format = ensure_list(download_format)[0]
255265
self.set_download_format(
256266
download_format, default_download_format=default_download_format
257267
)

cads_adaptors/validation/error_message.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def message(cls, error):
194194

195195
# Property is forbidden because the name doesn't match a regex?
196196
m = re.match(
197-
r"^'(.*)' does not match any of the regexes:(.*)",
197+
r"^['\"](.*)['\"] does not match any of the regexes:(.*)",
198198
error.message,
199199
flags=re.DOTALL,
200200
)

tests/test_15_mars.py

Lines changed: 179 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,28 @@
1+
import logging
12
import os
3+
import re
4+
import string as mstring
25

6+
import pytest
37
import requests
48

5-
from cads_adaptors.adaptors import mars
9+
from cads_adaptors.adaptors import Context, mars, multi
10+
from cads_adaptors.exceptions import InvalidRequest
611

712
TEST_GRIB_FILE = "https://sites.ecmwf.int/repository/earthkit-data/test-data/era5-levels-members.grib"
13+
logger = logging.getLogger(__name__)
14+
15+
# Characters treated as plain whitespace by the schema tests: space and tab
WHITESPACE_CHARS = {" ", "\t"}
# Every character with a code point below 256
EXTENDED_ASCII_CHARS = {chr(code) for code in range(256)}

# Valid key characters: everything matched by \S (minus string.whitespace),
# plus the space character
VALID_KEY_CHARS = {
    char for char in EXTENDED_ASCII_CHARS if re.match(r"\S", char)
}.difference(mstring.whitespace).union(" ")
# Invalid key characters: string.whitespace other than space and tab
INVALID_KEY_CHARS = set(mstring.whitespace).difference(WHITESPACE_CHARS)

# The value character sets are built by the same rules as the key sets
VALID_VALUE_CHARS = {
    char for char in EXTENDED_ASCII_CHARS if re.match(r"\S", char)
}.difference(mstring.whitespace).union(" ")
INVALID_VALUE_CHARS = set(mstring.whitespace).difference(WHITESPACE_CHARS)
826

927

1028
def test_get_mars_servers():
@@ -63,3 +81,163 @@ def test_convert_format(tmp_path, monkeypatch):
6381
_, out_ext = os.path.splitext(converted_files[0])
6482
assert out_ext == ".nc"
6583
assert "/test_subdir/" in converted_files[0]
84+
85+
86+
def test_schema_null():
    """Null-like request inputs must be rejected by the schema."""
    # A request that is not a dict at all
    _check_schema_fail("", "request: '' is not of type 'dict'")

    # A dict with no keys
    _check_schema_fail({}, "request: {} should be non-empty")

    # Empty or whitespace-only strings, used first as a key then as a value
    null_strings = [""]
    null_strings.extend(sorted(WHITESPACE_CHARS))
    for text in null_strings:
        text_repr = repr(text).strip("'")

        bad_key_msg = f"request: '{text_repr}' is an invalid key name"
        _check_schema_fail({text: "1"}, bad_key_msg)

        bad_value_msg = f"request['param'][0]: invalid value: '{text}'"
        _check_schema_fail({"param": text}, bad_value_msg)
107+
108+
109+
def test_schema_whitespace():
    """Check how a space or tab inside a key or value is treated."""
    base = "ab"
    for ws_char in sorted(WHITESPACE_CHARS):
        # Insert the character at the start, in the middle and at the end
        for index in (0, 1, 2):
            text = base[:index] + ws_char + base[index:]
            text_repr = repr(text).strip("'")

            # Only a tab in the middle of the string is expected to be
            # rejected; tabs at either end and spaces anywhere are accepted
            tab_in_middle = ws_char == "\t" and index == 1
            if not tab_in_middle:
                _check_schema_pass({text: "1"}, {text: ["1"]})
                _check_schema_pass({"param": text}, {"param": [text]})
                continue

            _check_schema_fail(
                {text: "1"}, f"request: '{text_repr}' is an invalid key name"
            )
            _check_schema_fail(
                {"param": text}, f"request['param'][0]: invalid value: '{text}'"
            )
130+
131+
132+
def test_schema_invalid_key_chars():
    """Keys containing invalid characters must be rejected by default."""
    base = "ab"
    for bad_char in sorted(INVALID_KEY_CHARS):
        # Insert the character at the start, in the middle and at the end
        for index in (0, 1, 2):
            key = base[:index] + bad_char + base[index:]
            key_repr = repr(key)[1:-1]
            request = {key: "1"}

            # Rejected under the default key regex...
            _check_schema_fail(
                request, f"request: '{key_repr}' is an invalid key name"
            )

            # ...but accepted when the key regex is configured to allow it
            _check_schema_pass(request, {key: ["1"]}, key_regex=re.escape(key))
150+
151+
152+
def test_schema_invalid_value_chars():
    """Values containing invalid characters must be rejected by default."""
    base = "ab"
    for bad_char in sorted(INVALID_VALUE_CHARS):
        # Insert the character at the start, in the middle and at the end
        for index in (0, 1, 2):
            value = base[:index] + bad_char + base[index:]
            request = {"a": value}

            # Rejected under the default value regex...
            _check_schema_fail(
                request, f"request['a'][0]: invalid value: '{value}'"
            )

            # ...but accepted when the value regex is configured to allow it
            _check_schema_pass(
                request, {"a": [value]}, value_regex=re.escape(value)
            )
169+
170+
171+
def test_good_requests():
    """A selection of ordinary-looking requests must pass the schema."""
    all_key_chars = "".join(sorted(VALID_KEY_CHARS))
    all_value_chars = "".join(sorted(VALID_VALUE_CHARS))

    # (input request, expected normalised request), checked in this order
    cases = [
        ({"a": 1}, {"a": ["1"]}),
        ({"A": "a"}, {"A": ["a"]}),
        ({"0": ["a"]}, {"0": ["a"]}),
        ({"_": 1}, {"_": ["1"]}),
        (
            {" abc ": [3, 2, 1, "foo-bar"], "\txyz\t\t": "3/2/1/foo-bar"},
            {" abc ": ["3", "2", "1", "foo-bar"], "\txyz\t\t": ["3/2/1/foo-bar"]},
        ),
        (
            {"step": "1/to/24/by/3", "param_FOO": ["152.128", "203.210"]},
            {"step": ["1/to/24/by/3"], "param_FOO": ["152.128", "203.210"]},
        ),
        (
            {"area": [10, -10.0, -20.1, 10.1]},
            {"area": ["10", "-10.0", "-20.1", "10.1"]},
        ),
        ({"area": "10/-10./-20.1/10.1"}, {"area": ["10/-10./-20.1/10.1"]}),
        (
            {"x": ["1E+10", "-1.E-10", ".1E0", "-.1E0", "12.13e45", "-12.13.e-45"]},
            {"x": ["1E+10", "-1.E-10", ".1E0", "-.1E0", "12.13e45", "-12.13.e-45"]},
        ),
        # One key and one value made of every valid character
        ({all_key_chars: all_value_chars}, {all_key_chars: [all_value_chars]}),
    ]
    for req_in, req_out in cases:
        _check_schema_pass(req_in, req_out)
196+
197+
198+
def test_schema_duplicates():
    """Check how repeated entries in a value list are handled."""
    # area and grid (matched case-insensitively) may hold repeated values
    permitted = [
        ({"area": [1, 1]}, {"area": ["1", "1"]}),
        ({"grid": ["1", "1"]}, {"grid": ["1", "1"]}),
        ({"GriD": [1, 1]}, {"GriD": ["1", "1"]}),
    ]
    for req_in, req_out in permitted:
        _check_schema_pass(req_in, req_out)

    # Any other key is rejected by default
    _check_schema_fail(
        {"param": [1, 1]}, "request['param']: has repeated values in the list, e.g. '1'"
    )

    # ...unless that key is configured to permit duplicates
    _check_schema_pass(
        {"param": [1, 1]}, {"param": ["1", "1"]}, allow_duplicate_values_keys=["param"]
    )

    # ...or duplicates are configured to be removed automatically
    _check_schema_pass(
        {"param": [1, 1]}, {"param": ["1"]}, remove_duplicate_values=True
    )
219+
220+
221+
def _check_schema_fail(request, error_msg, **schema_options):
    """Check a request fails the schema with the expected error message.

    The check is run against both MarsCdsAdaptor and MultiMarsCdsAdaptor.

    Args:
        request: The request passed to ``normalise_request``.
        error_msg: The exact message expected as the first argument of the
            raised InvalidRequest.
        **schema_options: Optional schema options passed to the adaptor
            constructor, as in ``_check_schema_pass``.
    """
    for cls in [mars.MarsCdsAdaptor, multi.MultiMarsCdsAdaptor]:
        adp = cls(
            form=None, context=Context(logger=logger), schema_options=schema_options
        )
        # pytest.raises itself fails the test if no InvalidRequest is raised
        with pytest.raises(InvalidRequest) as einfo:
            adp.normalise_request(request)

        actual_msg = einfo.value.args[0]
        assert actual_msg == error_msg, (
            "Schema error message not as expected: "
            f"{actual_msg!r} != {error_msg!r}"
        )
234+
235+
236+
def _check_schema_pass(req_in, req_out, **schema_options):
    """Check a request is accepted and normalised to the expected output.

    The check is run against both MarsCdsAdaptor and MultiMarsCdsAdaptor,
    each constructed with the given schema options.
    """
    adaptor_classes = (mars.MarsCdsAdaptor, multi.MultiMarsCdsAdaptor)
    for adaptor_cls in adaptor_classes:
        context = Context(logger=logger)
        adaptor = adaptor_cls(
            form=None, context=context, schema_options=schema_options
        )
        normalised = adaptor.normalise_request(req_in)
        assert normalised == req_out

0 commit comments

Comments
 (0)