Skip to content

Commit 4c71251

Browse files
committed
#118 GeoNode5: parser
1 parent f0c2885 commit 4c71251

File tree

5 files changed

+176
-166
lines changed

5 files changed

+176
-166
lines changed

rndt/apps.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,6 @@ def run_setup_hooks(*args, **kwargs):
3838
settings.METADATA_PARSERS.extend([RNDT_PARSER_FUNCTION])
3939
setattr(settings, "METADATA_PARSERS", settings.METADATA_PARSERS)
4040

41-
RNDT_STORER_FUNCTION = "rndt.metadata.storer.rndt_storer"
42-
43-
rndt_storers = [RNDT_STORER_FUNCTION]
44-
if not getattr(settings, "METADATA_STORERS", None):
45-
setattr(settings, "METADATA_STORERS", rndt_storers)
46-
elif rndt_storers[0] not in settings.METADATA_STORERS:
47-
settings.METADATA_STORERS.extend(rndt_storers)
48-
setattr(settings, "METADATA_STORERS", settings.METADATA_STORERS)
49-
5041
urlpatterns += [
5142
re_path(r"^", include("rndt.api.urls")),
5243
re_path(r"^catalogue/", include("rndt.catalogue.urls")),

rndt/locale/it/LC_MESSAGES/django.po

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,9 @@
22
# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
33
# This file is distributed under the same license as the PACKAGE package.
44

5-
msgid "Data Constraints"
6-
msgstr "Vincoli sui dati"
7-
8-
msgid "Access constraints"
9-
msgstr "Vincoli di accesso"
10-
11-
msgid "Use constraints"
12-
msgstr "Vincoli di fruibilità"
13-
14-
msgid "Free text"
15-
msgstr "Testo Libero"
16-
17-
msgid "Resolution"
18-
msgstr "Risoluzione"
19-
205
msgid "Public Administration"
216
msgstr "Pubblica Amministrazione"
227

238
msgid "Additional info"
249
msgstr "Informazioni aggiuntive"
2510

26-
msgid "Positional Accuracy"
27-
msgstr "Accuratezza posizionale"

rndt/metadata/parser.py

Lines changed: 172 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from owslib import util
1111
from owslib.iso import get_namespaces
1212

13-
ACCESS_CONSTRAINTS_URL = "http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations"
13+
logger = logging.getLogger(__name__)
1414

1515

1616
def rndt_parser(xml, uuid="", vals={}, regions=[], keywords=[], custom={}):
@@ -25,21 +25,96 @@ def rndt_parser(xml, uuid="", vals={}, regions=[], keywords=[], custom={}):
2525
exml = list(exml)[0]
2626

2727
rndt_parser = RNDTMetadataParser(exml)
28-
28+
rndt_parser.get_freq(vals)
2929
keywords, discarded = rndt_parser.resolve_keywords()
3030
custom["rejected_keywords"] = discarded
3131

32-
custom["rndt"] = {}
33-
34-
use_constr = rndt_parser.get_access_costraints(custom)
35-
rndt_parser.get_use_costraints(vals, use_constr)
36-
rndt_parser.get_resolutions(custom)
37-
rndt_parser.get_accuracy(custom)
38-
rndt_parser.get_freq(vals)
32+
# Next calls parse and store metadata in a jsonschema compliant way (geonode5)
33+
jsoninstance = custom.setdefault("jsoninstance", {})
34+
resolver = RNDTMetadataResolver(jsoninstance)
35+
resolver.resolve_constraints(rndt_parser.parse_constraints())
36+
resolver.resolve_resolution(rndt_parser.parse_resolution())
37+
resolver.resolve_accuracy(rndt_parser.parse_accuracy())
3938

4039
return uuid, vals, regions, keywords, custom
4140

4241

42+
class RNDTMetadataResolver:
    """
    Store parsed RNDT metadata into a jsonschema-style instance dict.

    The resolver receives values already extracted from the XML document and
    writes them under the ``rndt_*`` keys of the dict passed at construction.
    """

    # Restriction codes whose gmd:otherConstraints content is examined;
    # constraints carrying any other code are skipped.
    _HANDLED_CODES = ("otherRestrictions", "limitation not listed")

    def __init__(self, jsoninstance: dict):
        # Kept by reference: the resolve_* methods mutate the caller's dict.
        self.jsoninstance = jsoninstance

    @staticmethod
    def _find_keyword(href, thesaurus_identifier):
        """Return the first keyword whose ``about`` is href in the given thesaurus, or None."""
        return ThesaurusKeyword.objects.filter(about=href).filter(
            thesaurus__identifier=thesaurus_identifier
        ).first()

    def resolve_constraints(self, constraints: list):
        """
        Fill rndt_LimitationsOnPublicAccess and rndt_ConditionsApplyingToAccessAndUse.

        :param constraints: list of dict with the keys ``code``, ``href`` and ``text``

        - constraints whose code is not in _HANDLED_CODES are skipped
        - a constraint without href contributes its text to the free text
        - an href found in the LimitationsOnPublicAccess thesaurus sets the access limitation
        - an href found in the ConditionsApplyingToAccessAndUse thesaurus sets the use condition
        - any other href is logged and ignored
        When the same kind is found more than once, the last one wins.
        The free text is stored only when no use-condition URL was found.
        """
        freetext_parts = []
        access = None
        use = None

        for constr in constraints:
            logger.debug("Resolving constraint: --> %s", constr)

            if constr["code"] not in self._HANDLED_CODES:
                logger.debug("Skipping constraint %s", constr)
                continue

            href = constr["href"]
            text = constr["text"]

            if not href:
                logger.debug("Collecting text from %s", constr)
                # Empty/missing text is dropped so that the literal "None" or
                # blank lines never end up in the stored free text.
                cleaned = text.strip() if text else ""
                if cleaned:
                    freetext_parts.append(cleaned)
                continue

            # rndt_LimitationsOnPublicAccess -> url
            # rndt_ConditionsApplyingToAccessAndUse -> text or url

            keyword = self._find_keyword(href, "LimitationsOnPublicAccess")
            if keyword:
                if access:
                    logger.warning("Duplicate LimitationsOnPublicAccess overridden %s", access)
                access = {"id": href, "label": keyword.alt_label}
                continue

            if self._find_keyword(href, "ConditionsApplyingToAccessAndUse"):
                if use:
                    logger.warning("Duplicate ConditionsApplyingToAccessAndUse overridden %s", use)
                use = {"inspire_url": True, "url": href}
                continue

            logger.warning("Skipping unknown URL %s", constr)
            # we may try and parse license URLs: that's beyond RNDT requirements, but it would be nice

        freetext = "\n".join(freetext_parts)

        if access:
            self.jsoninstance["rndt_LimitationsOnPublicAccess"] = access
        else:
            logger.info("LimitationsOnPublicAccess not found")

        if use:
            self.jsoninstance["rndt_ConditionsApplyingToAccessAndUse"] = use
            if freetext:
                # A URL takes precedence over any collected free text.
                logger.warning("Ignoring freetext constraint [%s]", freetext)
        elif freetext:
            self.jsoninstance["rndt_ConditionsApplyingToAccessAndUse"] = {
                "inspire_url": False,
                "freetext": freetext,
            }
        else:
            logger.info("ConditionsApplyingToAccessAndUse not found")

    def resolve_resolution(self, val):
        """Store val under ``rndt_resolution``; a None value leaves the dict untouched."""
        if val is not None:
            self.jsoninstance["rndt_resolution"] = val

    def resolve_accuracy(self, val):
        """Store val under ``rndt_accuracy``; a None value leaves the dict untouched."""
        if val is not None:
            self.jsoninstance["rndt_accuracy"] = val
116+
117+
43118
class RNDTMetadataParser:
44119
"""
45120
A metadata parser compliant with the RNDT specification
@@ -55,136 +130,112 @@ def __init__(self, exml):
55130
)
56131
)
57132

58-
def get_freq(self, vals):
59-
freq_elem = self.exml.find(
60-
util.nspath_eval(
61-
"gmd:identificationInfo/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency/gmd:MD_MaintenanceFrequencyCode",
62-
self.namespaces,
63-
)
64-
)
133+
def parse_codelist(self, xpath):
    """
    Look up a codelist element. This should be moved into the base parser.

    Returns the tuple ``(codeListValue, text)`` of the first element matching
    xpath, or None when nothing matches. ``codeListValue`` is None when the
    element does not carry that attribute.
    """
    node = self.exml.find(util.nspath_eval(xpath, self.namespaces))
    if node is None:
        return None
    return node.attrib.get("codeListValue"), node.text
65137

66-
freq = freq_elem.attrib.get("codeListValue", None) if freq_elem is not None else None
67-
vals["maintenance_frequency"] = freq or "unknown"
68-
69-
def get_access_costraints(self, custom):
138+
def parse_constraints(self) -> list:
    """
    Function responsible to parse the access constraints elements
    - returns a list of dict, one for each gmd:MD_LegalConstraints element
      having a restriction code:
        - type: "accessConstraints" or "useConstraints", i.e. the element the
          MD_RestrictionCode was found under (accessConstraints is tried first)
        - code: restriction codeListValue ("" when the attribute is missing)
        - href: if gmx:Anchor in gmd:otherConstraints, its href; None otherwise
        - text: text content of gmd:otherConstraints, either if CharacterString or Anchor;
          None when neither of them is present
    - elements without a restriction code are logged and skipped
    """
    legal_constraints = self.exml.findall(
        util.nspath_eval(
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:resourceConstraints/gmd:MD_LegalConstraints",
            self.namespaces,
        )
    )
    restriction_paths = (
        ("accessConstraints", "gmd:accessConstraints/gmd:MD_RestrictionCode"),
        ("useConstraints", "gmd:useConstraints/gmd:MD_RestrictionCode"),
    )

    ret = []
    for node in legal_constraints:
        constr = {}
        logger.debug(f"Parsing constraint: --> {node}")

        for constr_type, path in restriction_paths:
            md_restriction_code = node.find(util.nspath_eval(path, self.namespaces))
            if md_restriction_code is not None:
                constr["type"] = constr_type
                break
        else:
            logger.warning("Missing known restrictioncode")
            continue

        constr["code"] = md_restriction_code.attrib.get("codeListValue", "")

        anchor = node.find(util.nspath_eval("gmd:otherConstraints/gmx:Anchor", self.namespaces))
        if anchor is not None:
            constr["href"] = anchor.attrib.get("{http://www.w3.org/1999/xlink}href")
            constr["text"] = anchor.text
        else:
            charstring = node.find(
                util.nspath_eval("gmd:otherConstraints/gco:CharacterString", self.namespaces)
            )
            constr["href"] = None
            # otherConstraints may be missing altogether: do not fail on it
            constr["text"] = charstring.text if charstring is not None else None

        ret.append(constr)

    return ret
188+
189+
def parse_resolution(self, default=None):
    """
    Parse the spatial resolution (gco:Distance) of the document.

    :param default: value returned when the resolution is missing or is not a number
    :return: the resolution as an int or a float, or ``default``
    """
    resolution = self.exml.find(
        util.nspath_eval(
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:spatialResolution/gmd:MD_Resolution/gmd:distance/gco:Distance",
            self.namespaces,
        )
    )

    if resolution is None:
        logger.info("Resolution not found")
        return default
    if isinstance(resolution, (float, int)):
        return resolution

    text = (resolution.text or "").strip()
    try:
        res = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError) as e:
        # literal_eval raises SyntaxError (not ValueError) on empty text or
        # on text that is not a Python literal, e.g. "10 m"
        logger.warning(f"Error parsing resolution '{resolution.text}': {e}")
        return default

    # bool is a subclass of int, but "True"/"False" are not valid resolutions
    if isinstance(res, (float, int)) and not isinstance(res, bool):
        return res

    logger.warning(f"Resolution cannot be parsed: [{resolution.text}]")
    return default
213+
214+
def parse_accuracy(self, default=None):
    """
    Parse the absolute external positional accuracy (gco:Real) of the document.

    :param default: value returned when the accuracy is missing or is not a number
    :return: the accuracy as a float, or ``default``
    """
    acc = self.exml.find(
        util.nspath_eval(
            "gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:report/gmd:DQ_AbsoluteExternalPositionalAccuracy/gmd:result/gmd:DQ_QuantitativeResult/gmd:value/gco:Record/gco:Real",
            self.namespaces,
        )
    )

    if acc is None:
        logger.info("Accuracy not found")
        return default
    if isinstance(acc, (float, int)):
        return float(acc)

    text = (acc.text or "").strip()
    try:
        eval_acc = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError) as e:
        # literal_eval raises SyntaxError (not ValueError) on empty text or
        # on text that is not a Python literal, e.g. "1,5"
        logger.warning(f"Error parsing accuracy '{acc.text}': {e}")
        return default

    # bool is a subclass of int, but "True"/"False" are not valid accuracies
    if isinstance(eval_acc, (float, int)) and not isinstance(eval_acc, bool):
        return float(eval_acc)

    logger.warning(f"Accuracy cannot be parsed: [{acc.text}]")
    return default
238+
188239

189240
def resolve_keywords(self):
190241
"""
@@ -282,3 +333,14 @@ def _get_keywords(keywords, thesaurus_info):
282333
else:
283334
not_tkey.append(text)
284335
return available, not_tkey, discarded
336+
337+
def parse_frequency(self):
    """
    Parse the maintenance frequency codelist. This should be moved into the base parser.

    Returns whatever parse_codelist returns for the MD_MaintenanceFrequencyCode path.
    """
    frequency_xpath = (
        "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:resourceMaintenance"
        "/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency"
        "/gmd:MD_MaintenanceFrequencyCode"
    )
    return self.parse_codelist(frequency_xpath)
340+
341+
def get_freq(self, vals):
    """
    Set vals["maintenance_frequency"] from the parsed frequency codelist.

    The first item returned by parse_frequency is used; when it is missing
    or empty the value "unknown" is stored instead.
    """
    parsed = self.parse_frequency()
    if parsed is None:
        logger.info("Frequency not found")
    frequency_code = parsed[0] if parsed else None
    vals["maintenance_frequency"] = frequency_code if frequency_code else "unknown"

0 commit comments

Comments
 (0)