Skip to content

Commit 1dc31e4

Browse files
authored
Export service json validator fixes (#382)
* Assorted fixes for the export service to get closer to passing the validator
* Accept date or date-time
* Hook the validator check into the jsonl output
* Get the exported json lines passing the validator
* mypy and flake8 fixes
1 parent 86e66eb commit 1dc31e4

17 files changed

+622
-58
lines changed

isamples_metadata/GEOMETransformer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ def produced_by_responsibilities(self) -> typing.List[dict[str, str]]:
351351
return responsibilities
352352
return []
353353

354-
def _produced_by_result_time_impl(self) -> str:
354+
def produced_by_result_time(self) -> str:
355355
parent_record = self._source_record_parent_record()
356356
if parent_record is not None:
357357
return self._formatted_date(
@@ -633,7 +633,7 @@ def produced_by_responsibilities(self) -> list:
633633
# TODO: who did the tissue extract, if available -- where does this live, if anywhere?
634634
return []
635635

636-
def _produced_by_result_time_impl(self) -> str:
636+
def produced_by_result_time(self) -> str:
637637
# TODO: time the tissue extract was done, if available -- where does this live?
638638
return ""
639639

isamples_metadata/OpenContextTransformer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ def produced_by_responsibilities(self) -> typing.List[dict[str, str]]:
420420
responsibilities.append(Transformer._responsibility_dict("collector", OpenContextTransformer._get_oc_str_or_dict_item_label(contributor)))
421421
return responsibilities
422422

423-
def _produced_by_result_time_impl(self) -> str:
423+
def produced_by_result_time(self) -> str:
    """Return the source record's "published" value, or Transformer.NOT_PROVIDED when the key is absent."""
    published = self.source_record.get("published", Transformer.NOT_PROVIDED)
    return published
425425

426426
def sampling_site_description(self) -> str:

isamples_metadata/SESARTransformer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ def produced_by_responsibilities(self) -> list:
467467

468468
return responsibilities
469469

470-
def _produced_by_result_time_impl(self) -> str:
470+
def produced_by_result_time(self) -> str:
471471
result_time = Transformer.NOT_PROVIDED
472472
description = self._source_record_description()
473473
if "collectionStartDate" in description:

isamples_metadata/SmithsonianTransformer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def produced_by_responsibilities(self) -> list[dict[str, str]]:
208208
responsibilities.append({"role": "identified by", "name": current.strip()})
209209
return responsibilities
210210

211-
def _produced_by_result_time_impl(self) -> str:
211+
def produced_by_result_time(self) -> str:
212212
return self._formatted_date(
213213
self.source_record.get("year", ""),
214214
self.source_record.get("month", ""),

isamples_metadata/Transformer.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -260,17 +260,8 @@ def produced_by_responsibilities(self) -> typing.List[dict[str, str]]:
260260
"""The responsibility list for the producedBy dictionary"""
261261
pass
262262

263-
def produced_by_result_time(self) -> str:
264-
"""The result time for the producedBy dictionary"""
265-
result_time = self._produced_by_result_time_impl()
266-
if result_time is not None:
267-
# JSON schema expects this to be YYYY-MM-dd, so chop off any timestamps
268-
if len(result_time) > 10:
269-
result_time = result_time[:10]
270-
return result_time
271-
272263
@abstractmethod
273-
def _produced_by_result_time_impl(self) -> str:
264+
def produced_by_result_time(self) -> str:
    """The result time for the producedBy dictionary"""
    pass
275266

276267
@abstractmethod

isb_lib/utilities/solr_result_transformer.py

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
METADATA_REGISTRANT, METADATA_SAMPLE_LOCATION, METADATA_ELEVATION, METADATA_SAMPLING_SITE, \
1313
METADATA_RESULT_TIME, METADATA_HAS_FEATURE_OF_INTEREST, METADATA_DESCRIPTION, METADATA_INFORMAL_CLASSIFICATION, \
1414
METADATA_KEYWORDS, METADATA_HAS_SPECIMEN_CATEGORY, METADATA_HAS_MATERIAL_CATEGORY, METADATA_HAS_CONTEXT_CATEGORY, \
15-
METADATA_LABEL, METADATA_SAMPLE_IDENTIFIER, METADATA_AT_ID, METADATA_RESPONSIBILITY, METADATA_PRODUCED_BY, \
16-
METADATA_NAME
15+
METADATA_LABEL, METADATA_SAMPLE_IDENTIFIER, METADATA_RESPONSIBILITY, METADATA_PRODUCED_BY, \
16+
METADATA_NAME, METADATA_KEYWORD, METADATA_IDENTIFIER, METADATA_ROLE
1717
from isamples_metadata.solr_field_constants import SOLR_PRODUCED_BY_SAMPLING_SITE_PLACE_NAME, SOLR_AUTHORIZED_BY, \
1818
SOLR_COMPLIES_WITH, SOLR_PRODUCED_BY_SAMPLING_SITE_LOCATION_LONGITUDE, \
1919
SOLR_PRODUCED_BY_SAMPLING_SITE_LOCATION_LATITUDE, SOLR_RELATED_RESOURCE_ISB_CORE_ID, SOLR_CURATION_RESPONSIBILITY, \
@@ -65,6 +65,19 @@ def transform(table: Table, dest_path_no_extension: str, append: bool) -> str:
6565

6666

6767
class JSONExportTransformer(AbstractExportTransformer):
68+
69+
@staticmethod
def filter_null_values(obj):
    """
    Recursively drop None entries from nested dictionaries and lists.

    Dictionary values and list elements that are None are removed; every other
    value is returned as-is. Containers that end up empty are kept.
    """
    recurse = JSONExportTransformer.filter_null_values
    if isinstance(obj, list):
        return [recurse(item) for item in obj if item is not None]
    if not isinstance(obj, dict):
        # Scalars (and any other type) pass through untouched.
        return obj
    cleaned = {}
    for key, value in obj.items():
        if value is not None:
            cleaned[key] = recurse(value)
    return cleaned
80+
6881
@staticmethod
6982
def transform(table: Table, dest_path_no_extension: str, append: bool) -> str:
7083
if append:
@@ -73,7 +86,7 @@ def transform(table: Table, dest_path_no_extension: str, append: bool) -> str:
7386
dest_path = f"{dest_path_no_extension}.{extension}"
7487
with open(dest_path, "w") as file:
7588
for row in petl.util.base.dicts(table):
76-
json.dump(row, file)
89+
json.dump(JSONExportTransformer.filter_null_values(row), file)
7790
file.write("\n")
7891
return dest_path
7992

@@ -90,22 +103,39 @@ def _add_to_dict(self, target_dict: dict, target_key: str, source_dict: dict, so
90103
if source_value is not None:
91104
target_dict[target_key] = source_value
92105

106+
def _add_responsibilities_to_container(self,
107+
rec: dict,
108+
responsibility_key_solr: str,
109+
responsibility_key: str,
110+
container: dict):
111+
responsibilities = rec.get(responsibility_key_solr, [])
112+
responsibility_dicts = []
113+
for responsibility in responsibilities:
114+
pieces = responsibility.split(":")
115+
responsibility_dicts.append({METADATA_ROLE: pieces[0], METADATA_NAME: pieces[1]})
116+
if len(responsibility_dicts) > 0:
117+
container[responsibility_key] = responsibility_dicts
118+
93119
def _curation_dict(self, rec: dict) -> dict:
    """Build the curation dictionary for ``rec``.

    Copies the curation label, description and location when present, converts the
    curation responsibilities into role/name dictionaries, and splits the
    pipe-delimited access constraints into a list.

    Args:
        rec: The record to read the curation fields from.

    Returns:
        The curation dictionary; keys whose source value is missing are omitted.
    """
    curation_dict: dict = {}
    self._add_to_dict(curation_dict, METADATA_LABEL, rec, SOLR_CURATION_LABEL)
    self._add_to_dict(curation_dict, METADATA_DESCRIPTION, rec, SOLR_CURATION_DESCRIPTION)
    self._add_to_dict(curation_dict, METADATA_CURATION_LOCATION, rec, SOLR_CURATION_LOCATION)
    self._add_responsibilities_to_container(rec, SOLR_CURATION_RESPONSIBILITY, METADATA_RESPONSIBILITY, curation_dict)
    # "".split("|") evaluates to [""], so splitting a defaulted empty string would
    # always emit a one-element list holding an empty constraint. Only split when
    # there is real text; this also avoids calling .split on a None value.
    access_constraints = rec.get(SOLR_CURATION_ACCESS_CONSTRAINTS)
    if access_constraints:
        curation_dict[METADATA_ACCESS_CONSTRAINTS] = access_constraints.split("|")
    return curation_dict
101129

102130
def _produced_by_dict(self, rec: dict) -> dict:
103131
produced_by_dict: dict = {}
104-
self._add_to_dict(produced_by_dict, METADATA_AT_ID, rec, SOLR_PRODUCED_BY_ISB_CORE_ID)
132+
self._add_to_dict(produced_by_dict, METADATA_IDENTIFIER, rec, SOLR_PRODUCED_BY_ISB_CORE_ID)
105133
self._add_to_dict(produced_by_dict, METADATA_LABEL, rec, SOLR_PRODUCED_BY_LABEL)
106-
self._add_to_dict(produced_by_dict, METADATA_RESPONSIBILITY, rec, SOLR_PRODUCED_BY_RESPONSIBILITY)
107134
self._add_to_dict(produced_by_dict, METADATA_DESCRIPTION, rec, SOLR_PRODUCED_BY_DESCRIPTION)
108-
self._add_to_dict(produced_by_dict, METADATA_RESULT_TIME, rec, SOLR_PRODUCED_BY_RESULT_TIME)
135+
result_time = rec.get(SOLR_PRODUCED_BY_RESULT_TIME)
136+
if result_time is not None:
137+
result_time = result_time[:10]
138+
produced_by_dict[METADATA_RESULT_TIME] = result_time
109139
self._add_to_dict(produced_by_dict, METADATA_HAS_FEATURE_OF_INTEREST, rec, SOLR_PRODUCED_BY_FEATURE_OF_INTEREST)
110140
sampling_site_dict: dict = {}
111141
produced_by_dict[METADATA_SAMPLING_SITE] = sampling_site_dict
@@ -117,10 +147,27 @@ def _produced_by_dict(self, rec: dict) -> dict:
117147
self._add_to_dict(sample_location_dict, METADATA_ELEVATION, rec, SOLR_PRODUCED_BY_SAMPLING_SITE_ELEVATION_IN_METERS)
118148
self._add_to_dict(sample_location_dict, METADATA_LATITUDE, rec, SOLR_PRODUCED_BY_SAMPLING_SITE_LOCATION_LATITUDE)
119149
self._add_to_dict(sample_location_dict, METADATA_LONGITUDE, rec, SOLR_PRODUCED_BY_SAMPLING_SITE_LOCATION_LONGITUDE)
150+
self._add_responsibilities_to_container(rec, SOLR_PRODUCED_BY_RESPONSIBILITY, METADATA_RESPONSIBILITY, produced_by_dict)
120151
return produced_by_dict
121152

153+
def _formatted_controlled_vocabulary(self, rec: dict, key: str) -> list[dict]:
154+
values = rec.get(key, [])
155+
return [{METADATA_LABEL: value} for value in values]
156+
157+
def _has_specimen_categories(self, rec: dict) -> list:
    """Return the specimen category values of ``rec`` formatted as label dictionaries."""
    specimen_categories = self._formatted_controlled_vocabulary(rec, SOLR_HAS_SPECIMEN_CATEGORY)
    return specimen_categories
159+
160+
def _has_material_categories(self, rec: dict) -> list:
    """Return the material category values of ``rec`` formatted as label dictionaries."""
    material_categories = self._formatted_controlled_vocabulary(rec, SOLR_HAS_MATERIAL_CATEGORY)
    return material_categories
162+
163+
def _has_context_categories(self, rec: dict) -> list:
    """Return the context category values of ``rec`` formatted as label dictionaries."""
    context_categories = self._formatted_controlled_vocabulary(rec, SOLR_HAS_CONTEXT_CATEGORY)
    return context_categories
165+
166+
def _keywords(self, rec: dict) -> list:
    """Wrap each keyword of ``rec`` in a single-entry keyword dictionary.

    Returns:
        One ``{METADATA_KEYWORD: keyword}`` dictionary per keyword, or an empty list
        when the keywords field is missing or holds None.
    """
    # ``or []`` covers a keywords field that is present but None, which the default
    # argument of ``rec.get`` does not.
    keywords = rec.get(SOLR_KEYWORDS) or []
    return [{METADATA_KEYWORD: keyword} for keyword in keywords]
168+
122169
def _registrant_dict(self, rec: dict) -> dict:
    """Build the registrant dictionary from the first registrant value of ``rec``.

    Returns:
        ``{METADATA_NAME: <first registrant>}``, or an empty dictionary when the
        registrant field is missing, None or empty.
    """
    registrants = rec.get(SOLR_REGISTRANT)
    if not registrants:
        # Indexing a missing, None or empty value would raise and abort the export.
        # NOTE(review): confirm an empty registrant object is acceptable downstream.
        return {}
    return {METADATA_NAME: registrants[0]}
124171

125172
def _rename_table_columns_csv(self):
126173
"""Renames the solr columns to the public names in the public metadata schema, while maintaining CSV tabular format"""
@@ -168,11 +215,11 @@ def _rename_table_columns_jsonl(self):
168215
mappings[METADATA_LABEL] = SOLR_LABEL
169216
mappings[METADATA_DESCRIPTION] = SOLR_DESCRIPTION
170217
mappings["source_collection"] = SOLR_SOURCE # this isn't present in the exported metadata
171-
mappings[METADATA_HAS_SPECIMEN_CATEGORY] = SOLR_HAS_SPECIMEN_CATEGORY
172-
mappings[METADATA_HAS_MATERIAL_CATEGORY] = SOLR_HAS_MATERIAL_CATEGORY
173-
mappings[METADATA_HAS_CONTEXT_CATEGORY] = SOLR_HAS_CONTEXT_CATEGORY
218+
mappings[METADATA_HAS_SPECIMEN_CATEGORY] = self._has_specimen_categories
219+
mappings[METADATA_HAS_MATERIAL_CATEGORY] = self._has_material_categories
220+
mappings[METADATA_HAS_CONTEXT_CATEGORY] = self._has_context_categories
174221
mappings[METADATA_INFORMAL_CLASSIFICATION] = SOLR_INFORMAL_CLASSIFICATION
175-
mappings[METADATA_KEYWORDS] = SOLR_KEYWORDS
222+
mappings[METADATA_KEYWORDS] = self._keywords
176223
mappings[METADATA_PRODUCED_BY] = self._produced_by_dict
177224
mappings[METADATA_REGISTRANT] = self._registrant_dict
178225
mappings[METADATA_SAMPLING_PURPOSE] = SOLR_SAMPLING_PURPOSE

tests/test_data/OpenContext/test/ark-28722-k26d5xr5z-test.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
"name": "Marshall C. Agnew"
5353
}
5454
],
55-
"result_time": "2007-11-11",
55+
"result_time": "2007-11-11T00:00:00Z",
5656
"sampling_site": {
5757
"description": "",
5858
"label": "",

tests/test_data/OpenContext/test/ark-28722-k26h4xk1f-test.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
"name": "Joanna Smith"
5151
}
5252
],
53-
"result_time": "2017-01-30",
53+
"result_time": "2017-01-30T22:57:28Z",
5454
"sampling_site": {
5555
"description": "",
5656
"label": "",

tests/test_data/OpenContext/test/ark-28722-k2b85cg1p-test.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
"name": "Katrina M. Haile"
4949
}
5050
],
51-
"result_time": "2007-11-11",
51+
"result_time": "2007-11-11T00:00:00Z",
5252
"sampling_site": {
5353
"description": "",
5454
"label": "",

tests/test_data/OpenContext/test/ark-28722-k2vq31x46-test.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
"name": "Elizabeth E. Payne"
4949
}
5050
],
51-
"result_time": "2007-11-11",
51+
"result_time": "2007-11-11T00:00:00Z",
5252
"sampling_site": {
5353
"description": "",
5454
"label": "",

0 commit comments

Comments
 (0)