Skip to content

Commit 0de6b1b

Browse files
authored
Error/warn when an illegal data type is discovered (#367)
* Implement `get_concrete_data_types`
* Detect when a variable data type is unsupported and error/warn accordingly
* Make the exception publicly available
1 parent 4a44d56 commit 0de6b1b

File tree

8 files changed

+134
-43
lines changed

8 files changed

+134
-43
lines changed

src/dapla_metadata/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from datadoc_model.all_optional import model
44

5+
from ._merge import InconsistentDatasetsError
56
from ._merge import InconsistentDatasetsWarning
67
from .core import Datadoc
78
from .dapla_dataset_path_info import DaplaDatasetPathInfo

src/dapla_metadata/datasets/_merge.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,21 +189,25 @@ def check_variables_consistency(
189189
return results
190190

191191

192-
def check_ready_to_merge(
193-
results: list[DatasetConsistencyStatus], *, errors_as_warnings: bool
192+
def report_metadata_consistency(
193+
results: list[DatasetConsistencyStatus],
194+
*,
195+
errors_as_warnings: bool,
196+
message: str = INCONSISTENCIES_MESSAGE,
194197
) -> None:
195198
"""Check if the datasets are consistent enough to make a successful merge of metadata.
196199
197200
Args:
198201
results: List of dicts with property name and boolean success flag
199202
errors_as_warnings: True if failing checks should be raised as warnings, not errors.
203+
message: The primary message to be displayed.
200204
201205
Raises:
202206
InconsistentDatasetsError: If inconsistencies are found and `errors_as_warnings == False`
203207
"""
204208
if failures := [result for result in results if not result.success]:
205209
messages_list = "\n - ".join(str(f) for f in failures)
206-
msg = f"{INCONSISTENCIES_MESSAGE}\n - {messages_list}"
210+
msg = f"{message}\n - {messages_list}"
207211
if errors_as_warnings:
208212
warnings.warn(
209213
message=msg,

src/dapla_metadata/datasets/core.py

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,18 @@
1818
from dapla_metadata.dapla import user_info
1919
from dapla_metadata.datasets._merge import DatasetConsistencyStatus
2020
from dapla_metadata.datasets._merge import check_dataset_consistency
21-
from dapla_metadata.datasets._merge import check_ready_to_merge
2221
from dapla_metadata.datasets._merge import check_variables_consistency
2322
from dapla_metadata.datasets._merge import merge_metadata
24-
from dapla_metadata.datasets.compatibility import is_metadata_in_container_structure
25-
from dapla_metadata.datasets.compatibility import upgrade_metadata
23+
from dapla_metadata.datasets._merge import report_metadata_consistency
24+
from dapla_metadata.datasets.compatibility._utils import (
25+
is_metadata_in_container_structure,
26+
)
27+
from dapla_metadata.datasets.compatibility.model_backwards_compatibility import (
28+
upgrade_metadata,
29+
)
2630
from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
2731
from dapla_metadata.datasets.dataset_parser import DatasetParser
32+
from dapla_metadata.datasets.dataset_parser import pretty_print_supported_types
2833
from dapla_metadata.datasets.model_validation import ValidateDatadocMetadata
2934
from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping
3035
from dapla_metadata.datasets.utility.constants import (
@@ -126,6 +131,7 @@ def __init__(
126131
self.variables_lookup: dict[str, VariableType] = {}
127132
self.explicitly_defined_metadata_document = False
128133
self.dataset_consistency_status: list[DatasetConsistencyStatus] = []
134+
self.concrete_data_types_lookup: dict[str, str] = {}
129135
if metadata_document_path:
130136
self.metadata_document = UPath(metadata_document_path)
131137
self.explicitly_defined_metadata_document = True
@@ -169,42 +175,37 @@ def _extract_metadata_from_files(self) -> None:
169175
self.metadata_document,
170176
)
171177

172-
if (
173-
self.dataset_path is not None
174-
and self.dataset == all_optional_model.Dataset()
175-
and len(self.variables) == 0
176-
):
178+
if self.dataset_path:
177179
extracted_metadata = self._extract_metadata_from_dataset(self.dataset_path)
180+
self.dataset_consistency_status.extend(
181+
self.check_illegal_variable_data_type(
182+
extracted_metadata.variables or [], self.concrete_data_types_lookup
183+
)
184+
)
178185

179186
if (
180187
self.dataset_path
181188
and self.metadata_document
182189
and extracted_metadata
183190
and existing_metadata
184-
):
185-
self.dataset_consistency_status = check_dataset_consistency(
186-
self.dataset_path,
187-
self.metadata_document,
191+
) and self.explicitly_defined_metadata_document:
192+
self.dataset_consistency_status.extend(
193+
check_dataset_consistency(
194+
self.dataset_path,
195+
self.metadata_document,
196+
)
188197
)
189198
self.dataset_consistency_status.extend(
190199
check_variables_consistency(
191200
extracted_metadata.variables or [],
192201
existing_metadata.variables or [],
193202
)
194203
)
195-
196-
if (
197-
self.dataset_path
198-
and self.explicitly_defined_metadata_document
199-
and self.metadata_document is not None
200-
and self.metadata_document.exists()
201-
and extracted_metadata is not None
202-
and existing_metadata is not None
203-
):
204-
check_ready_to_merge(
204+
report_metadata_consistency(
205205
self.dataset_consistency_status,
206206
errors_as_warnings=self.errors_as_warnings,
207207
)
208+
# Merge existing metadata with a new dataset
208209
merged_metadata = merge_metadata(
209210
extracted_metadata,
210211
existing_metadata,
@@ -215,8 +216,31 @@ def _extract_metadata_from_files(self) -> None:
215216
self.dataset_path,
216217
)
217218
self._set_metadata(merged_metadata)
218-
else:
219-
self._set_metadata(existing_metadata or extracted_metadata)
219+
return
220+
221+
report_metadata_consistency(
222+
self.dataset_consistency_status,
223+
errors_as_warnings=self.errors_as_warnings,
224+
message="Problems were detected with the metadata.",
225+
)
226+
self._set_metadata(existing_metadata or extracted_metadata)
227+
228+
def check_illegal_variable_data_type(
229+
self, variables: VariableListType, concrete_data_types_lookup: dict[str, str]
230+
) -> list[DatasetConsistencyStatus]:
231+
"""Check whether any of the variable types are unsupported.
232+
233+
When we encounter a variable which is unsupported, the `DatasetParser` sets the variable `data_type` to `None`.
234+
This function detects that situation and creates a friendly error message to inform of the situation.
235+
"""
236+
return [
237+
DatasetConsistencyStatus(
238+
message=f"Unsupported data type for variable '{v.short_name}' type: '{concrete_data_types_lookup.get(v.short_name, 'unknown')}' from dataset {self.dataset_path}\nPlease change the type of the variable to one of the supported options:\n{pretty_print_supported_types()}",
239+
success=False,
240+
)
241+
for v in variables
242+
if v.short_name and not v.data_type
243+
]
220244

221245
def _set_metadata(
222246
self,
@@ -369,6 +393,14 @@ def _extract_metadata_from_dataset(
369393
spatial_coverage_description=DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
370394
)
371395
metadata.variables = DatasetParser.for_file(dataset).get_fields()
396+
try:
397+
self.concrete_data_types_lookup = DatasetParser.for_file(
398+
dataset
399+
).get_concrete_data_types()
400+
except RuntimeError:
401+
logger.exception(
402+
"Failed to get concrete data types for dataset %s", dataset
403+
)
372404
return metadata
373405

374406
@staticmethod

src/dapla_metadata/datasets/dataset_parser.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from abc import ABC
1010
from abc import abstractmethod
1111
from typing import TYPE_CHECKING
12+
from typing import ClassVar
1213

1314
import pandas as pd
1415
from datadoc_model.all_optional.model import DataType
@@ -24,6 +25,11 @@
2425
import pyarrow as pa
2526
from upath.types import ReadablePathLike
2627

28+
29+
PARQUET_FILE_SUFFIX = ".parquet"
30+
PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
31+
SAS7BDAT_FILE_SUFFIX = ".sas7bdat"
32+
2733
KNOWN_INTEGER_TYPES = (
2834
"int",
2935
"int_",
@@ -98,6 +104,11 @@
98104
TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))
99105

100106

107+
def pretty_print_supported_types() -> str:
108+
"""Return a human-readable string of the supported data types."""
109+
return "\n".join(f"{t[1].value}: {t[0]}" for t in TYPE_CORRESPONDENCE)
110+
111+
101112
class DatasetParser(ABC):
102113
"""Abstract Base Class for all Dataset parsers.
103114
@@ -162,12 +173,18 @@ def transform_data_type(data_type: str) -> DataType | None:
162173

163174
@abstractmethod
164175
def get_fields(self) -> list[Variable]:
165-
"""Abstract method, must be implemented by subclasses."""
176+
"""Extract the variable names and abstract data types for this dataset."""
177+
178+
@abstractmethod
179+
def get_concrete_data_types(self) -> dict[str, str]:
180+
"""Extract the variable names and concrete data types for this dataset."""
166181

167182

168183
class DatasetParserParquet(DatasetParser):
169184
"""Concrete implementation for parsing parquet files."""
170185

186+
_EXCLUDED_VARIABLE_NAMES: ClassVar[set[str]] = {"__index_level_0__"}
187+
171188
def __init__(self, dataset: UPath) -> None:
172189
"""Call the super init method for initialization.
173190
@@ -178,18 +195,24 @@ def __init__(self, dataset: UPath) -> None:
178195

179196
def get_fields(self) -> list[Variable]:
180197
"""Extract the fields from this dataset."""
181-
with self.dataset.open(mode="rb") as f:
182-
schema: pa.Schema = pq.read_schema(f) # type: ignore [arg-type, assignment]
183198
return [
184199
Variable(
185-
short_name=data_field.name.strip(),
186-
data_type=self.transform_data_type(str(data_field.type)), # type: ignore [attr-defined]
200+
short_name=data_field[0],
201+
data_type=self.transform_data_type(data_field[1]),
187202
)
188-
for data_field in schema
189-
if data_field.name
190-
!= "__index_level_0__" # Index columns should not be documented
203+
for data_field in self.get_concrete_data_types().items()
191204
]
192205

206+
def get_concrete_data_types(self) -> dict[str, str]:
207+
"""Extract the variable names and concrete data types for this dataset."""
208+
with self.dataset.open(mode="rb") as f:
209+
schema: pa.Schema = pq.read_schema(f)
210+
return {
211+
data_field.name.strip(): str(data_field.type)
212+
for data_field in schema
213+
if data_field.name not in self._EXCLUDED_VARIABLE_NAMES
214+
}
215+
193216

194217
class DatasetParserSas7Bdat(DatasetParser):
195218
"""Concrete implementation for parsing SAS7BDAT files."""
@@ -240,10 +263,10 @@ def get_fields(self) -> list[Variable]:
240263

241264
return fields
242265

266+
def get_concrete_data_types(self) -> dict[str, str]:
267+
"""Extract the variable names and concrete data types for this dataset."""
268+
raise NotImplementedError
243269

244-
PARQUET_FILE_SUFFIX = ".parquet"
245-
PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
246-
SAS7BDAT_FILE_SUFFIX = ".sas7bdat"
247270

248271
SUPPORTED_DATASET_FILE_SUFFIXES: dict[
249272
str,
2.09 KB
Binary file not shown.

tests/datasets/test_datadoc_metadata.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pydantic import ValidationError
2626

2727
from dapla_metadata.dapla.user_info import TestUserInfo
28+
from dapla_metadata.datasets._merge import InconsistentDatasetsError
2829
from dapla_metadata.datasets.core import Datadoc
2930
from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping
3031
from dapla_metadata.datasets.utility.constants import (
@@ -602,3 +603,12 @@ def test_merge_with_fewer_variables_in_existing_metadata(tmp_path):
602603
"bankinnskudd",
603604
"dato",
604605
]
606+
607+
608+
def test_unknown_data_type():
609+
with pytest.raises(
610+
InconsistentDatasetsError, match="Unsupported data type for variable"
611+
):
612+
Datadoc(
613+
dataset_path="tests/datasets/resources/datasets/category_data_type.parquet"
614+
)

tests/datasets/test_dataset_consistency.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
from dapla_metadata.datasets._merge import InconsistentDatasetsError
2323
from dapla_metadata.datasets._merge import InconsistentDatasetsWarning
2424
from dapla_metadata.datasets._merge import check_dataset_consistency
25-
from dapla_metadata.datasets._merge import check_ready_to_merge
2625
from dapla_metadata.datasets._merge import check_variables_consistency
26+
from dapla_metadata.datasets._merge import report_metadata_consistency
2727
from dapla_metadata.datasets.core import Datadoc
2828
from tests.datasets.constants import TEST_BUCKET_NAMING_STANDARD_COMPATIBLE_PATH
2929
from tests.datasets.constants import VARIABLE_DATA_TYPES
@@ -140,7 +140,7 @@ def test_check_dataset_consistency_inconsistent_paths(
140140
[True, False],
141141
ids=["warnings", "errors"],
142142
)
143-
def test_check_ready_to_merge_errors_as_warnings(
143+
def test_report_metadata_consistency_errors_as_warnings(
144144
dataset_consistency_status: list[DatasetConsistencyStatus],
145145
errors_as_warnings: bool,
146146
):
@@ -149,7 +149,7 @@ def test_check_ready_to_merge_errors_as_warnings(
149149
stack.enter_context(pytest.warns(InconsistentDatasetsWarning))
150150
else:
151151
stack.enter_context(pytest.raises(InconsistentDatasetsError))
152-
check_ready_to_merge(
152+
report_metadata_consistency(
153153
dataset_consistency_status,
154154
errors_as_warnings=errors_as_warnings,
155155
)

tests/datasets/test_dataset_parser.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,28 @@ def test_get_fields_parquet(local_parser: DatasetParserParquet):
4949
assert local_parser.get_fields() == expected_fields
5050

5151

52+
@pytest.mark.parametrize(
53+
"local_parser",
54+
[
55+
DatasetParser.for_file(TEST_PARQUET_FILEPATH),
56+
DatasetParser.for_file(TEST_PARQUET_GZIP_FILEPATH),
57+
],
58+
)
59+
def test_get_concrete_data_types_parquet(local_parser: DatasetParserParquet):
60+
expected_fields = {
61+
"alm_inntekt": "int64",
62+
"ber_bruttoformue": "int64",
63+
"fullf_utdanning": "string",
64+
"hoveddiagnose": "string",
65+
"pers_id": "string",
66+
"sivilstand": "string",
67+
"sykepenger": "int64",
68+
"tidspunkt": "timestamp[us]",
69+
}
70+
71+
assert local_parser.get_concrete_data_types() == expected_fields
72+
73+
5274
def test_get_fields_sas7bdat():
5375
expected_fields = [
5476
Variable(
@@ -101,8 +123,7 @@ def test_transform_datatype_unknown_type():
101123
],
102124
)
103125
def test_transform_datatype(expected: DataType, concrete_type: str):
104-
actual = DatasetParser.transform_data_type(concrete_type)
105-
assert actual == expected
126+
assert DatasetParser.transform_data_type(concrete_type) == expected
106127

107128

108129
@pytest.fixture

0 commit comments

Comments (0)