Skip to content

Commit 8657dfe

Browse files
committed
Improving GBIF metrics and dealing with exceptions
1 parent 67480f2 commit 8657dfe

File tree

4 files changed

+94
-179
lines changed

4 files changed

+94
-179
lines changed

api/evaluator.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,7 @@ def rda_i3_01m(self, **kwargs):
14281428
if row["text_value"].split("/")[-1] not in self.item_id:
14291429
id_list.append(row["text_value"])
14301430
points, msg_list = self.eval_persistency(id_list)
1431+
return (points, msg_list)
14311432

14321433
def rda_i3_01d(self):
14331434
"""Indicator RDA-I3-01D.
@@ -1854,14 +1855,16 @@ def rda_r1_3_01d(self, **kwargs):
18541855
terms_reusability_richness_list = terms_reusability_richness["list"]
18551856
terms_reusability_richness_metadata = terms_reusability_richness["metadata"]
18561857

1857-
element = terms_reusability_richness_metadata.loc[
1858-
terms_reusability_richness_metadata["element"].isin(["availableFormats"]),
1859-
"text_value",
1860-
].values[0]
1861-
for form in element:
1862-
availableFormats.append(form["label"])
1863-
18641858
try:
1859+
element = terms_reusability_richness_metadata.loc[
1860+
terms_reusability_richness_metadata["element"].isin(
1861+
["availableFormats"]
1862+
),
1863+
"text_value",
1864+
].values[0]
1865+
for form in element:
1866+
availableFormats.append(form["label"])
1867+
18651868
f = open(path)
18661869
f.close()
18671870

api/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -710,10 +710,10 @@ def orcid_basic_info(orcid):
710710
item = xmlTree.findall(
711711
".//{http://www.orcid.org/ns/common}assertion-origin-name"
712712
)
713+
basic_info = "ORCID Name: %s" % item[0].text
713714
except Exception as e:
714715
logging.error(e)
715716
return basic_info
716-
basic_info = "ORCID Name: %s" % item[0].text
717717
return basic_info
718718

719719

plugins/gbif/config.ini

Lines changed: 36 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -37,33 +37,29 @@ terms_findability_richness = ['Title',
3737
'Format',
3838
'Identifier',
3939
'Language']
40+
4041
[gbif]
4142
# (meta)data terms to find the resource identifier
42-
identifier_term = [['alternateIdentifier','']]
43-
identifier_term_data = [['alternateIdentifier','']]
43+
identifier_term = [['dataset','alternateIdentifier']]
44+
identifier_term_data = [['dataset','alternateIdentifier']]
4445

4546
# Metadata terms to check richness (generic). These terms should be included [term, qualifier]. None means no qualifier
46-
terms_quali_generic = [['contributor',None],
47-
['date', None],
48-
['description', None],
49-
['identifier', None],
50-
['publisher', None],
51-
['rights', None],
52-
['title', None],
53-
['subject', None]]
47+
terms_quali_generic = [['dataset.creator', 'givenName'],
48+
['dataset.creator', 'surName'],
49+
['dataset', 'pubDate'],
50+
['dataset.abstract', 'para'],
51+
['dataset.intellectualRights.para.ulink', 'citetitle'],
52+
['dataset', 'title'],
53+
['dataset.keywordSet', 'keyword']]
5454

5555
# Metadata terms to check richness (disciplinar). These terms should be included [term, qualifier]
56-
terms_quali_disciplinar = [['contributor', None],
57-
['date', None],
58-
['description', None],
59-
['identifier', None],
60-
['publisher', None],
61-
['rights', None],
62-
['title', None],
63-
['subject', None]]
56+
terms_quali_disciplinar = [['dataset.coverage.geographicCoverage', 'geographicDescription'],
57+
['dataset.coverage.temporalCoverage.rangeOfDates.beginDate', 'calendarDate'],
58+
['dataset.coverage.temporalCoverage.rangeOfDates.endDate', 'calendarDate'],
59+
['dataset.coverage.taxonomicCoverage.taxonomicClassification', 'taxonRankValue']]
6460

6561
# Metadata terms that defines accessibility (case sensitive)
66-
terms_access = [['access', ''], ['rights', '']]
62+
terms_access = [['dataset.intellectualRights.para.ulink', 'citetitle']]
6763

6864
# Metadata terms to check discoverability richness.
6965
#
@@ -84,37 +80,37 @@ terms_access = [['access', ''], ['rights', '']]
8480
# Format File format availableFormats
8581
# Identifier Data Unique ID DOI
8682
# Language NA NA
87-
terms_findability_richness = [['title',''],
88-
['keywords',''],
89-
['description',''],
90-
['type','relatedDataProducts'],
91-
['paths', 'spatial'],
92-
['temporalCoverage','relatedDataProducts'],
93-
['dataProvider','relatedDataProducts'],
94-
['license',''],
95-
['availableFormats',''],
96-
['identifiers','relatedDataProducts']]
83+
terms_findability_richness = [['dataset', 'title'],
84+
['dataset.keywordSet', 'keyword'],
85+
['dataset.abstract', 'para'],
86+
['dataset.coverage.geographicCoverage', 'geographicDescription'],
87+
['dataset.coverage.temporalCoverage.rangeOfDates.beginDate', 'calendarDate'],
88+
['dataset.coverage.temporalCoverage.rangeOfDates.endDate', 'calendarDate'],
89+
['dataset.intellectualRights.para.ulink', 'citetitle'],
90+
['dataset','alternateIdentifier']]
9791

9892
# Metadata terms to check reusability richness
99-
terms_reusability_richness = [['rigths',''],
100-
['license','']]
101-
93+
terms_reusability_richness = [['dataset','alternateIdentifier'],
94+
['additionalMetadata.metadata.gbif', 'hierarchyLevel']]
10295

10396

10497
# Metadata terms which include controlled vocabularies. More controlled vocabularies can be implemented in plugins
105-
terms_cv = [['coverage', 'spatial'], ['subject', 'lcsh']]
98+
terms_cv = [['dataset.creator', 'userId']]
10699

107100
# List of data formats that are standard for the community
108101
supported_data_formats = [".txt", ".pdf", ".csv", ".nc", ".doc", ".xls", ".zip", ".rar", ".tar", ".png", ".jpg"]
109102

110103
# Metadata terms that define links or relations with authors and contributors (preferably in ORCID format)
111-
terms_qualified_references = [['contributor', None]]
104+
terms_qualified_references = [['dataset.creator', 'userId'],
105+
['dataset.contact', 'userId'],
106+
['dataset.project.personnel', 'userId'],
107+
['dataset.metadataProvider', 'userId' ]]
112108

113109
# Metadata terms that define links or relations with other resources (preferably in ORCID format, URIs or persistent identifiers)
114-
terms_relations = [['relation', None]]
110+
terms_relations = [['dataset.creator', 'userId']]
115111

116112
# Metadata terms that defines the license type
117-
terms_license = [['rights', '']]
113+
terms_license = [['dataset.intellectualRights.para.ulink', 'citetitle']]
118114

119115
# Metadata terms that defines metadata about provenance
120116
terms_provenance =[['curationAndProvenanceObligations','']]
@@ -123,7 +119,7 @@ terms_provenance =[['curationAndProvenanceObligations','']]
123119
terms_access_protocols =['http','https','ftp']
124120

125121
# Manual metadata access
126-
metadata_access_manual = ['https://github.com/epos-eu/Hands-On-EPOS-API']
122+
metadata_access_manual = ['https://techdocs.gbif.org/en/openapi/']
127123

128124
# Manual data access
129125
data_access_manual = ['https://techdocs.gbif.org/en/openapi/']
@@ -153,9 +149,9 @@ terms_vocabularies=[['identifiers','relatedDataProducts'],
153149
['license',''],
154150
['contactPoints','relatedDataProducts']]
155151

156-
157-
api_user = ma8
158-
api_pass = safdsaRY
152+
api_mail =
153+
api_user =
154+
api_pass =
159155

160156

161157
[fairsharing]

plugins/gbif/plugin.py

Lines changed: 47 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import requests
1313

1414
from api.evaluator import Evaluator
15-
from plugins.gbif.gbif_data import ICA, gbif_doi_download
15+
from plugins.gbif.gbif_data import ICA, gbif_doi_download, gbif_doi_search
1616

1717
logging.basicConfig(
1818
stream=sys.stdout, level=logging.DEBUG, format="'%(name)s:%(lineno)s' | %(message)s"
@@ -114,149 +114,65 @@ def get_metadata(self):
114114
final_url = final_url.replace("www.gbif.org/", "api.gbif.org/v1/")
115115
final_url = final_url + "/document"
116116
response = requests.get(final_url, verify=False)
117-
tree = ET.fromstring(response.text)
118-
119-
print("gbif5")
120-
eml_schema = "{eml://ecoinformatics.org/eml-2.1.1}"
121-
metadata_sample = []
122-
elementos = tree.find(".//")
123-
for e in elementos:
124-
if e.text != "" or e.text != "\n " or e.text != "\n":
125-
metadata_sample.append([eml_schema, e.tag, e.text, None])
126-
for i in e.iter():
127-
if len(list(i.iter())) > 0:
128-
for se in i.iter():
129-
metadata_sample.append(
130-
[eml_schema, e.tag + "." + i.tag, se.text, se.tag]
131-
)
132-
elif i.tag != e.tag and (
133-
i.text != "" or i.text != "\n " or i.text != "\n"
134-
):
135-
metadata_sample.append([eml_schema, e.tag, i.text, i.tag])
117+
118+
def print_hierarchy_with_qualifier(elem, namespace, metadata_sample, path=""):
119+
parts = path.split(".")
120+
md_schema = parts[0]
121+
quali = parts[-1]
122+
if len(elem) == 0 and elem.text != None:
123+
# If the element has no children and has a parent, store its tag as the qualifier
124+
if path:
125+
qualifier = f"{path}.{elem.tag}"
126+
metadata_sample.append(
127+
[
128+
namespace,
129+
path.replace(namespace + ".", ""),
130+
elem.text,
131+
elem.tag,
132+
]
133+
)
134+
else:
135+
# If it has children, keep walking down the hierarchy
136+
new_path = f"{path}.{elem.tag}" if path else elem.tag
137+
for child in elem:
138+
print_hierarchy_with_qualifier(
139+
child, namespace, metadata_sample, new_path
140+
)
141+
142+
def parse_and_print_xml(response):
143+
tree = ET.fromstring(response.text)
144+
namespace = tree.tag
145+
metadata_sample = []
146+
print_hierarchy_with_qualifier(tree, namespace, metadata_sample)
147+
return pd.DataFrame(
148+
metadata_sample,
149+
columns=["metadata_schema", "element", "text_value", "qualifier"],
150+
)
151+
152+
metadata_sample = parse_and_print_xml(response)
153+
136154
return metadata_sample
137155

138156
def rda_a1_01m(self):
139157
# If your ID is not a standard one (e.g. an internal one), this method should be redefined
140158
points = 0
141159
msg = "Data is not accessible"
160+
data_res = gbif_doi_search(self.item_id)
161+
if len(data_res) > 0:
162+
points = 100
163+
msg = "Data found"
164+
142165
return (points, msg)
143166

144167
def rda_a1_02m(self):
145168
# If your ID is not a standard one (e.g. an internal one), this method should be redefined
146169
points = 0
147170
msg = "Data is not accessible"
148-
return (points, msg)
149-
150-
def rda_i1_02m(self):
151-
"""Indicator RDA-A1-01M
152-
This indicator is linked to the following principle: I1: (Meta)data use a formal, accessible,
153-
shared, and broadly applicable language for knowledge representation. More information
154-
about that principle can be found here.
155-
156-
This indicator focuses on the machine-understandability aspect of the metadata. This means
157-
that metadata should be readable and thus interoperable for machines without any
158-
requirements such as specific translators or mappings.
159-
160-
Technical proposal:
161-
162-
Parameters
163-
----------
164-
item_id : str
165-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
166-
identifier from the repo)
167-
168-
Returns
169-
-------
170-
points
171-
A number between 0 and 100 to indicate how well this indicator is supported
172-
msg
173-
Message with the results or recommendations to improve this indicator
174-
"""
175-
176-
# TO REDEFINE
177-
points = 0
178-
msg = "No machine-actionable metadata format found. OAI-PMH endpoint may help"
179-
return (points, msg)
180-
181-
def rda_i1_02d(self):
182-
"""Indicator RDA-A1-01M
183-
This indicator is linked to the following principle: I1: (Meta)data use a formal, accessible,
184-
shared, and broadly applicable language for knowledge representation. More information
185-
about that principle can be found here.
186-
187-
This indicator focuses on the machine-understandability aspect of the data. This means that
188-
data should be readable and thus interoperable for machines without any requirements such
189-
as specific translators or mappings.
190-
191-
Technical proposal:
192-
193-
Parameters
194-
----------
195-
item_id : str
196-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
197-
identifier from the repo)
198-
199-
Returns
200-
-------
201-
points
202-
A number between 0 and 100 to indicate how well this indicator is supported
203-
msg
204-
Message with the results or recommendations to improve this indicator
205-
"""
206-
return self.rda_i1_02m()
207-
208-
def rda_r1_3_01m(self):
209-
"""Indicator RDA-A1-01M
210-
This indicator is linked to the following principle: R1.3: (Meta)data meet domain-relevant
211-
community standards.
212-
213-
This indicator requires that metadata complies with community standards.
214-
215-
Technical proposal:
216-
217-
Parameters
218-
----------
219-
item_id : str
220-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
221-
identifier from the repo)
222-
223-
Returns
224-
-------
225-
points
226-
A number between 0 and 100 to indicate how well this indicator is supported
227-
msg
228-
Message with the results or recommendations to improve this indicator
229-
"""
230-
# TO REDEFINE
231-
points = 0
232-
msg = _(
233-
"Currently, this repo does not include community-bsed schemas. If you need to include yours, please contact."
234-
)
235-
return (points, msg)
236-
237-
def rda_r1_3_01d(self):
238-
"""Indicator RDA_R1.3_01D.
239-
240-
Technical proposal:
171+
data_res = gbif_doi_search(self.item_id)
172+
if len(data_res) > 0:
173+
points = 100
174+
msg = "Data found"
241175

242-
Parameters
243-
----------
244-
item_id : str
245-
Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
246-
identifier from the repo)
247-
248-
Returns
249-
-------
250-
points
251-
A number between 0 and 100 to indicate how well this indicator is supported
252-
msg
253-
Message with the results or recommendations to improve this indicator
254-
"""
255-
# TO REDEFINE
256-
points = 0
257-
msg = _(
258-
"Currently, this repo does not include community-bsed schemas. If you need to include yours, please contact."
259-
)
260176
return (points, msg)
261177

262178
def data_01(self):

0 commit comments

Comments
 (0)