Skip to content

Commit 1e1c132

Browse files
committed
Improving GBIF metrics and dealing with exceptions
1 parent 484bbc5 commit 1e1c132

File tree

4 files changed

+94
-179
lines changed

4 files changed

+94
-179
lines changed

api/evaluator.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,7 @@ def rda_i3_01m(self, **kwargs):
14321432
if row["text_value"].split("/")[-1] not in self.item_id:
14331433
id_list.append(row["text_value"])
14341434
points, msg_list = self.eval_persistency(id_list)
1435+
return (points, msg_list)
14351436

14361437
def rda_i3_01d(self):
14371438
"""Indicator RDA-A1-01M.
@@ -1858,14 +1859,16 @@ def rda_r1_3_01d(self, **kwargs):
18581859
terms_reusability_richness_list = terms_reusability_richness["list"]
18591860
terms_reusability_richness_metadata = terms_reusability_richness["metadata"]
18601861

1861-
element = terms_reusability_richness_metadata.loc[
1862-
terms_reusability_richness_metadata["element"].isin(["availableFormats"]),
1863-
"text_value",
1864-
].values[0]
1865-
for form in element:
1866-
availableFormats.append(form["label"])
1867-
18681862
try:
1863+
element = terms_reusability_richness_metadata.loc[
1864+
terms_reusability_richness_metadata["element"].isin(
1865+
["availableFormats"]
1866+
),
1867+
"text_value",
1868+
].values[0]
1869+
for form in element:
1870+
availableFormats.append(form["label"])
1871+
18691872
f = open(path)
18701873
f.close()
18711874

api/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,10 +707,10 @@ def orcid_basic_info(orcid):
707707
item = xmlTree.findall(
708708
".//{http://www.orcid.org/ns/common}assertion-origin-name"
709709
)
710+
basic_info = "ORCID Name: %s" % item[0].text
710711
except Exception as e:
711712
logging.error(e)
712713
return basic_info
713-
basic_info = "ORCID Name: %s" % item[0].text
714714
return basic_info
715715

716716

plugins/gbif/config.ini

Lines changed: 36 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -37,33 +37,29 @@ terms_findability_richness = ['Title',
3737
'Format',
3838
'Identifier',
3939
'Language']
40+
4041
[gbif]
4142
# (meta)data terms to find the resource identifier
42-
identifier_term = [['alternateIdentifier','']]
43-
identifier_term_data = [['alternateIdentifier','']]
43+
identifier_term = [['dataset','alternateIdentifier']]
44+
identifier_term_data = [['dataset','alternateIdentifier']]
4445

4546
# Metadata terms to check richness (generic). These terms should be included [term, qualifier]. None means no qualifier
46-
terms_quali_generic = [['contributor',None],
47-
['date', None],
48-
['description', None],
49-
['identifier', None],
50-
['publisher', None],
51-
['rights', None],
52-
['title', None],
53-
['subject', None]]
47+
terms_quali_generic = [['dataset.creator', 'givenName'],
48+
['dataset.creator', 'surName'],
49+
['dataset', 'pubDate'],
50+
['dataset.abstract', 'para'],
51+
['dataset.intellectualRights.para.ulink', 'citetitle'],
52+
['dataset', 'title'],
53+
['dataset.keywordSet', 'keyword']]
5454

5555
# Metadata terms to check richness (disciplinary). These terms should be included [term, qualifier]
56-
terms_quali_disciplinar = [['contributor', None],
57-
['date', None],
58-
['description', None],
59-
['identifier', None],
60-
['publisher', None],
61-
['rights', None],
62-
['title', None],
63-
['subject', None]]
56+
terms_quali_disciplinar = [['dataset.coverage.geographicCoverage', 'geographicDescription'],
57+
['dataset.coverage.temporalCoverage.rangeOfDates.beginDate', 'calendarDate'],
58+
['dataset.coverage.temporalCoverage.rangeOfDates.endDate', 'calendarDate'],
59+
['dataset.coverage.taxonomicCoverage.taxonomicClassification', 'taxonRankValue']]
6460

6561
# Metadata terms that defines accessibility (case sensitive)
66-
terms_access = [['access', ''], ['rights', '']]
62+
terms_access = [['dataset.intellectualRights.para.ulink', 'citetitle']]
6763

6864
# Metadata terms to check discoverability richness.
6965
#
@@ -84,37 +80,37 @@ terms_access = [['access', ''], ['rights', '']]
8480
# Format File format availableFormats
8581
# Identifier Data Unique ID DOI
8682
# Language NA NA
87-
terms_findability_richness = [['title',''],
88-
['keywords',''],
89-
['description',''],
90-
['type','relatedDataProducts'],
91-
['paths', 'spatial'],
92-
['temporalCoverage','relatedDataProducts'],
93-
['dataProvider','relatedDataProducts'],
94-
['license',''],
95-
['availableFormats',''],
96-
['identifiers','relatedDataProducts']]
83+
terms_findability_richness = [['dataset', 'title'],
84+
['dataset.keywordSet', 'keyword'],
85+
['dataset.abstract', 'para'],
86+
['dataset.coverage.geographicCoverage', 'geographicDescription'],
87+
['dataset.coverage.temporalCoverage.rangeOfDates.beginDate', 'calendarDate'],
88+
['dataset.coverage.temporalCoverage.rangeOfDates.endDate', 'calendarDate'],
89+
['dataset.intellectualRights.para.ulink', 'citetitle'],
90+
['dataset','alternateIdentifier']]
9791

9892
# Metadata terms to check reusability richness
99-
terms_reusability_richness = [['rigths',''],
100-
['license','']]
101-
93+
terms_reusability_richness = [['dataset','alternateIdentifier'],
94+
['additionalMetadata.metadata.gbif', 'hierarchyLevel']]
10295

10396

10497
# Metadata terms which include controlled vocabularies. More controlled vocabularies can be implemented in plugins
105-
terms_cv = [['coverage', 'spatial'], ['subject', 'lcsh']]
98+
terms_cv = [['dataset.creator', 'userId']]
10699

107100
# List of data formats that are standard for the community
108101
supported_data_formats = [".txt", ".pdf", ".csv", ".nc", ".doc", ".xls", ".zip", ".rar", ".tar", ".png", ".jpg"]
109102

110103
# Metadata terms that define links or relations with authors, contributors (preferably in ORCID format)
111-
terms_qualified_references = [['contributor', None]]
104+
terms_qualified_references = [['dataset.creator', 'userId'],
105+
['dataset.contact', 'userId'],
106+
['dataset.project.personnel', 'userId'],
107+
['dataset.metadataProvider', 'userId' ]]
112108

113109
# Metadata terms that define links or relations with other resources (preferably in ORCID format, URIs or persistent identifiers)
114-
terms_relations = [['relation', None]]
110+
terms_relations = [['dataset.creator', 'userId']]
115111

116112
# Metadata terms that defines the license type
117-
terms_license = [['rights', '']]
113+
terms_license = [['dataset.intellectualRights.para.ulink', 'citetitle']]
118114

119115
# Metadata terms that defines metadata about provenance
120116
terms_provenance =[['curationAndProvenanceObligations','']]
@@ -123,7 +119,7 @@ terms_provenance =[['curationAndProvenanceObligations','']]
123119
terms_access_protocols =['http','https','ftp']
124120

125121
# Manual metadata access
126-
metadata_access_manual = ['https://github.com/epos-eu/Hands-On-EPOS-API']
122+
metadata_access_manual = ['https://techdocs.gbif.org/en/openapi/']
127123

128124
# Manual data access
129125
data_access_manual = ['https://techdocs.gbif.org/en/openapi/']
@@ -153,9 +149,9 @@ terms_vocabularies=[['identifiers','relatedDataProducts'],
153149
['license',''],
154150
['contactPoints','relatedDataProducts']]
155151

156-
157-
api_user = ma8
158-
api_pass = safdsaRY
152+
api_mail =
153+
api_user =
154+
api_pass =
159155

160156

161157
[fairsharing]

plugins/gbif/plugin.py

Lines changed: 47 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import requests
1313

1414
from api.evaluator import Evaluator
15-
from plugins.gbif.gbif_data import ICA, gbif_doi_download
15+
from plugins.gbif.gbif_data import ICA, gbif_doi_download, gbif_doi_search
1616

1717
logging.basicConfig(
1818
stream=sys.stdout, level=logging.DEBUG, format="'%(name)s:%(lineno)s' | %(message)s"
@@ -115,149 +115,65 @@ def get_metadata(self):
115115
final_url = final_url.replace("www.gbif.org/", "api.gbif.org/v1/")
116116
final_url = final_url + "/document"
117117
response = requests.get(final_url, verify=False)
118-
tree = ET.fromstring(response.text)
119-
120-
print("gbif5")
121-
eml_schema = "{eml://ecoinformatics.org/eml-2.1.1}"
122-
metadata_sample = []
123-
elementos = tree.find(".//")
124-
for e in elementos:
125-
if e.text != "" or e.text != "\n " or e.text != "\n":
126-
metadata_sample.append([eml_schema, e.tag, e.text, None])
127-
for i in e.iter():
128-
if len(list(i.iter())) > 0:
129-
for se in i.iter():
130-
metadata_sample.append(
131-
[eml_schema, e.tag + "." + i.tag, se.text, se.tag]
132-
)
133-
elif i.tag != e.tag and (
134-
i.text != "" or i.text != "\n " or i.text != "\n"
135-
):
136-
metadata_sample.append([eml_schema, e.tag, i.text, i.tag])
118+
119+
def print_hierarchy_with_qualifier(elem, namespace, metadata_sample, path=""):
    """Flatten an XML element tree into rows appended to ``metadata_sample``.

    The tree rooted at ``elem`` is walked depth-first. Each childless element
    that carries text and has a non-empty ancestor ``path`` contributes one
    row ``[namespace, element, text_value, qualifier]``.

    Parameters
    ----------
    elem : xml.etree.ElementTree.Element
        Element currently being visited.
    namespace : str
        Value stored in the first column of every row; ``namespace + "."`` is
        also stripped from the dotted ancestor path.
    metadata_sample : list
        Output list, mutated in place.
    path : str
        Dot-separated tags of the ancestors of ``elem`` ("" for the root).

    Returns
    -------
    None
        Results are delivered through ``metadata_sample``.
    """
    if len(elem) == 0 and elem.text is not None:
        # Leaf with text: the ancestor path becomes the element name and the
        # leaf's own tag becomes the qualifier. A leaf without any ancestor
        # (empty path) is not recorded.
        if path:
            metadata_sample.append(
                [
                    namespace,
                    # str.replace removes every occurrence of the prefix
                    # string, not only the leading one.
                    path.replace(namespace + ".", ""),
                    elem.text,
                    elem.tag,
                ]
            )
    else:
        # Element with children (or a childless one without text): extend the
        # path with this tag and keep descending through the hierarchy.
        new_path = f"{path}.{elem.tag}" if path else elem.tag
        for child in elem:
            print_hierarchy_with_qualifier(
                child, namespace, metadata_sample, new_path
            )
142+
143+
def parse_and_print_xml(response):
    """Build a metadata DataFrame from the XML carried in ``response.text``.

    The XML is parsed, its root tag is used as the namespace, and the tree is
    flattened with ``print_hierarchy_with_qualifier``. The collected rows are
    returned as a DataFrame with the columns ``metadata_schema``, ``element``,
    ``text_value`` and ``qualifier``.
    """
    root = ET.fromstring(response.text)
    rows = []
    # The root tag doubles as the namespace value stored in every row.
    print_hierarchy_with_qualifier(root, root.tag, rows)
    column_names = ["metadata_schema", "element", "text_value", "qualifier"]
    return pd.DataFrame(rows, columns=column_names)
152+
153+
metadata_sample = parse_and_print_xml(response)
154+
137155
return metadata_sample
138156

139157
def rda_a1_01m(self):
    """Score this indicator by looking the item up with ``gbif_doi_search``.

    Returns
    -------
    tuple
        ``(100, "Data found")`` when the search for ``self.item_id`` yields a
        result of non-zero length, otherwise ``(0, "Data is not accessible")``.
    """
    # If your ID is not a standard one (e.g. an internal identifier), this
    # method should be redefined.
    search_result = gbif_doi_search(self.item_id)
    if len(search_result) > 0:
        return (100, "Data found")
    return (0, "Data is not accessible")
144167

145168
def rda_a1_02m(self):
146169
# IF your ID is not an standard one (like internal), this method should be redefined
147170
points = 0
148171
msg = "Data is not accessible"
149-
return (points, msg)
150-
151-
def rda_i1_02m(self):
152-
"""Indicator RDA-A1-01M
153-
This indicator is linked to the following principle: I1: (Meta)data use a formal, accessible,
154-
shared, and broadly applicable language for knowledge representation. More information
155-
about that principle can be found here.
156-
157-
This indicator focuses on the machine-understandability aspect of the metadata. This means
158-
that metadata should be readable and thus interoperable for machines without any
159-
requirements such as specific translators or mappings.
160-
161-
Technical proposal:
162-
163-
Parameters
164-
----------
165-
item_id : str
166-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
167-
identifier from the repo)
168-
169-
Returns
170-
-------
171-
points
172-
A number between 0 and 100 to indicate how well this indicator is supported
173-
msg
174-
Message with the results or recommendations to improve this indicator
175-
"""
176-
177-
# TO REDEFINE
178-
points = 0
179-
msg = "No machine-actionable metadata format found. OAI-PMH endpoint may help"
180-
return (points, msg)
181-
182-
def rda_i1_02d(self):
183-
"""Indicator RDA-A1-01M
184-
This indicator is linked to the following principle: I1: (Meta)data use a formal, accessible,
185-
shared, and broadly applicable language for knowledge representation. More information
186-
about that principle can be found here.
187-
188-
This indicator focuses on the machine-understandability aspect of the data. This means that
189-
data should be readable and thus interoperable for machines without any requirements such
190-
as specific translators or mappings.
191-
192-
Technical proposal:
193-
194-
Parameters
195-
----------
196-
item_id : str
197-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
198-
identifier from the repo)
199-
200-
Returns
201-
-------
202-
points
203-
A number between 0 and 100 to indicate how well this indicator is supported
204-
msg
205-
Message with the results or recommendations to improve this indicator
206-
"""
207-
return self.rda_i1_02m()
208-
209-
def rda_r1_3_01m(self):
210-
"""Indicator RDA-A1-01M
211-
This indicator is linked to the following principle: R1.3: (Meta)data meet domain-relevant
212-
community standards.
213-
214-
This indicator requires that metadata complies with community standards.
215-
216-
Technical proposal:
217-
218-
Parameters
219-
----------
220-
item_id : str
221-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
222-
identifier from the repo)
223-
224-
Returns
225-
-------
226-
points
227-
A number between 0 and 100 to indicate how well this indicator is supported
228-
msg
229-
Message with the results or recommendations to improve this indicator
230-
"""
231-
# TO REDEFINE
232-
points = 0
233-
msg = _(
234-
"Currently, this repo does not include community-bsed schemas. If you need to include yours, please contact."
235-
)
236-
return (points, msg)
237-
238-
def rda_r1_3_01d(self):
239-
"""Indicator RDA_R1.3_01D.
240-
241-
Technical proposal:
172+
data_res = gbif_doi_search(self.item_id)
173+
if len(data_res) > 0:
174+
points = 100
175+
msg = "Data found"
242176

243-
Parameters
244-
----------
245-
item_id : str
246-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
247-
identifier from the repo)
248-
249-
Returns
250-
-------
251-
points
252-
A number between 0 and 100 to indicate how well this indicator is supported
253-
msg
254-
Message with the results or recommendations to improve this indicator
255-
"""
256-
# TO REDEFINE
257-
points = 0
258-
msg = _(
259-
"Currently, this repo does not include community-bsed schemas. If you need to include yours, please contact."
260-
)
261177
return (points, msg)
262178

263179
def data_01(self):

0 commit comments

Comments
 (0)