DIGITAL.CSIC plugin working

ferag · ferag · commit de9bcde9566e · 2024-10-17T16:02:01.000+02:00
diff --git a/api/evaluator.py b/api/evaluator.py
@@ -1172,6 +1172,7 @@ def rda_i1_01d(self):
         return (points, msg_list)
 
     def rda_i1_02m(self):
+        # TOFIX - This is very OAI-PMH dependent
         """Indicator RDA-A1-01M.
 
         This indicator is linked to the following principle: I1: (Meta)data use a formal, accessible,
@@ -1460,56 +1461,58 @@ def rda_i3_04m(self):
         return self.rda_i3_03m()
 
     # REUSABLE
-    def rda_r1_01m(self):
-        """Indicator RDA-A1-01M
+    @ConfigTerms(term_id="terms_reusability_richness")
+    def rda_r1_01m(self, **kwargs):
+        """Indicator RDA-R1-01M: Plurality of accurate and relevant attributes are provided to allow reuse.
+
         This indicator is linked to the following principle: R1: (Meta)data are richly described with a
         plurality of accurate and relevant attributes. More information about that principle can be
         found here.
+
         The indicator concerns the quantity but also the quality of metadata provided in order to
         enhance data reusability.
-        Technical proposal:
-        Parameters
-        ----------
-        item_id : str
-            Digital Object identifier, which can be a generic one (DOI, PID), or an internal (e.g. an
-            identifier from the repo)
+
         Returns
         -------
         points
-            A number between 0 and 100 to indicate how well this indicator is supported
+            Proportional to the number of terms considered to enhance reusability
         msg
             Message with the results or recommendations to improve this indicator
         """
-        # Depending on the metadata schema used, checks that at least the mandatory terms are filled (75%)
-        # and the number of terms are high (25%)
-        msg_list = []
-        logger.debug(_("Checking Dublin Core as multidisciplinar schema"))
+        points = 0
 
-        md_term_list = pd.DataFrame(
-            self.terms_quali_disciplinar, columns=["term", "qualifier"]
-        )
-        md_term_list = ut.check_metadata_terms(self.metadata, md_term_list)
-        points = (
-            100
-            * (len(md_term_list) - (len(md_term_list) - sum(md_term_list["found"])))
-            / len(md_term_list)
-        )
-        if points == 100:
-            msg_list.append(
-                {"message": _("All mandatory terms included"), "points": points}
+        terms_reusability_richness = kwargs["terms_reusability_richness"]
+        terms_reusability_richness_list = terms_reusability_richness["list"]
+        terms_reusability_richness_metadata = terms_reusability_richness["metadata"]
+
+        reusability_element_list = []
+        for element in terms_reusability_richness_list:
+            element_df = terms_reusability_richness_metadata.loc[
+                terms_reusability_richness_metadata["element"].isin([element[0]]),
+                "text_value",
+            ]
+
+            element_values = element_df.values
+            if len(element_values) > 0:
+                reusability_element_list.extend(element_values)
+        if len(reusability_element_list) > 0:
+            msg = "Found %s metadata elements that enhance reusability: %s" % (
+                len(reusability_element_list),
+                reusability_element_list,
             )
         else:
-            for i, e in md_term_list.iterrows():
-                if e["found"] == 0:
-                    msg_list.append(
-                        {
-                            "message": _("Missing term")
-                            + ": %s, qualifier: %s" % (e["term"], e["qualifier"]),
-                            "points": points,
-                        }
-                    )
+            msg = "Could not find any metadata element that enhance reusability"
+        points = (
+            len(
+                terms_reusability_richness_metadata.groupby(
+                    ["element", "qualifier"]
+                ).count()
+            )
+            / len(terms_reusability_richness["list"])
+            * 100
+        )
 
-        return (points, msg_list)
+        return (points, [{"message": msg, "points": points}])
 
     @ConfigTerms(term_id="terms_license")
     def rda_r1_1_01m(self, license_list=[], **kwargs):
diff --git a/api/utils.py b/api/utils.py
@@ -651,6 +651,31 @@ def check_standard_project_relation(value):
         return False
 
 
+def oai_request(oai_base, action):
+    oai = requests.get(oai_base + action, verify=False)  # Request to the server
+    try:
+        xmlTree = ET.fromstring(oai.text)
+    except Exception as e:
+        logging.error("OAI_RQUEST: %s" % e)
+        xmlTree = ET.fromstring("<OAI-PMH></OAI-PMH>")
+    return xmlTree
+
+
+def oai_metadataFormats(oai_base):
+    action = "?verb=ListMetadataFormats"
+    xmlTree = oai_request(oai_base, action)
+    metadataFormats = {}
+    for e in xmlTree.findall(".//{http://www.openarchives.org/OAI/2.0/}metadataFormat"):
+        metadataPrefix = e.find(
+            "{http://www.openarchives.org/OAI/2.0/}metadataPrefix"
+        ).text
+        namespace = e.find(
+            "{http://www.openarchives.org/OAI/2.0/}metadataNamespace"
+        ).text
+        metadataFormats[metadataPrefix] = namespace
+    return metadataFormats
+
+
 def get_rdf_metadata_format(oai_base):
     rdf_schemas = []
     try:
@@ -664,6 +689,109 @@ def get_rdf_metadata_format(oai_base):
     return rdf_schemas
 
 
+def oai_check_record_url(oai_base, metadata_prefix, pid):
+    endpoint_root = urllib.parse.urlparse(oai_base).netloc
+    try:
+        pid_type = idutils.detect_identifier_schemes(pid)[0]
+    except Exception as e:
+        pid_type = "internal"
+        logging.error(e)
+    if pid_type != "internal":
+        oai_pid = idutils.normalize_pid(pid, pid_type)
+    else:
+        oai_pid = pid
+    action = "?verb=GetRecord"
+
+    test_id = "oai:%s:%s" % (endpoint_root, oai_pid)
+    params = "&metadataPrefix=%s&identifier=%s" % (metadata_prefix, test_id)
+    url_final = ""
+    url = oai_base + action + params
+    response = requests.get(url, verify=False, allow_redirects=True)
+    logging.debug("Trying ID v1: url: %s | status: %i" % (url, response.status_code))
+    error = 0
+    for tags in ET.fromstring(response.text).findall(
+        ".//{http://www.openarchives.org/OAI/2.0/}error"
+    ):
+        error = error + 1
+    if error == 0:
+        url_final = url
+
+    test_id = "%s" % (oai_pid)
+    params = "&metadataPrefix=%s&identifier=%s" % (metadata_prefix, test_id)
+
+    url = oai_base + action + params
+    logging.debug("Trying: " + url)
+    response = requests.get(url, verify=False)
+    error = 0
+    for tags in ET.fromstring(response.text).findall(
+        ".//{http://www.openarchives.org/OAI/2.0/}error"
+    ):
+        error = error + 1
+    if error == 0:
+        url_final = url
+
+    test_id = "%s:%s" % (pid_type, oai_pid)
+    params = "&metadataPrefix=%s&identifier=%s" % (metadata_prefix, test_id)
+
+    url = oai_base + action + params
+    logging.debug("Trying: " + url)
+    response = requests.get(url, verify=False)
+    error = 0
+    for tags in ET.fromstring(response.text).findall(
+        ".//{http://www.openarchives.org/OAI/2.0/}error"
+    ):
+        error = error + 1
+    if error == 0:
+        url_final = url
+
+    test_id = "oai:%s:%s" % (
+        endpoint_root,
+        oai_pid[oai_pid.rfind(".") + 1 : len(oai_pid)],
+    )
+    params = "&metadataPrefix=%s&identifier=%s" % (metadata_prefix, test_id)
+
+    url = oai_base + action + params
+    logging.debug("Trying: " + url)
+    response = requests.get(url, verify=False)
+    error = 0
+    for tags in ET.fromstring(response.text).findall(
+        ".//{http://www.openarchives.org/OAI/2.0/}error"
+    ):
+        error = error + 1
+    if error == 0:
+        url_final = url
+
+    test_id = "oai:%s:b2rec/%s" % (
+        endpoint_root,
+        oai_pid[oai_pid.rfind(".") + 1 : len(oai_pid)],
+    )
+    params = "&metadataPrefix=%s&identifier=%s" % (metadata_prefix, test_id)
+
+    url = oai_base + action + params
+    logging.debug("Trying: " + url)
+    response = requests.get(url, verify=False)
+    error = 0
+    for tags in ET.fromstring(response.text).findall(
+        ".//{http://www.openarchives.org/OAI/2.0/}error"
+    ):
+        error = error + 1
+    if error == 0:
+        url_final = url
+
+    return url_final
+
+
+def oai_get_metadata(url):
+    logger.debug("Metadata from: %s" % url)
+    oai = requests.get(url, verify=False, allow_redirects=True)
+    try:
+        xmlTree = ET.fromstring(oai.text)
+    except Exception as e:
+        logger.error("OAI_RQUEST: %s" % e)
+        xmlTree = None
+    return xmlTree
+
+
 def licenses_list():
     url = "https://spdx.org/licenses/licenses.json"
     headers = {"Accept": "application/json"}  # Type of response accpeted
diff --git a/plugins/digital_csic/config.ini.template b/plugins/digital_csic/config.ini.template
@@ -0,0 +1,96 @@
+[Generic]
+endpoint=http://digital.csic.es/dspace-oai/request
+[digital_csic]
+db_host =
+db_port =
+db_user =
+db_pass =
+db_db   =
+oai_base = http://digital.csic.es/dspace-oai/request
+
+# (meta)data terms to find the resource identifier
+identifier_term = [['identifier', 'doi'], ['identifier', 'uri']]
+identifier_term_data = [['identifier', 'doi'], ['identifier', 'uri']]
+
+
+# Metadata terms to check richness (generic). These terms should be included
+terms_quali_generic = [['contributor','author'],
+            ['date', 'issued'],
+            ['title', ''],
+            ['identifier', 'citation'],
+            ['publisher', ''],
+            ['identifier', 'uri'],
+            ['type', ''],
+            ['language', 'iso'],
+            ['relation', 'csic'],
+            ['rights', '']]
+
+terms_quali_disciplinar = [['contributor','author'],
+            ['date', 'issued'],
+            ['title', ''],
+            ['identifier', 'citation'],
+            ['publisher', ''],
+            ['identifier', 'uri'],
+            ['type', ''],
+            ['language', 'iso'],
+            ['relation', 'csic'],
+            ['rights', '']]
+
+terms_access = [['access', ''], ['rights', '']]
+
+# Accepted access protocols
+terms_access_protocols =['http','https']
+
+# Manual metadata access
+metadata_access_manual = ['TODO']
+
+# Manual data access
+data_access_manual = ['TODO']
+
+#Policy of metadata persistence
+metadata_persistence = []
+
+#Authentication for EPOS
+metadata_authentication = []
+
+#terms that use vocabularies and vocabularies used
+dict_vocabularies= {'ROR': 'https://ror.org/', 'PIC': 'https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/how-to-participate/participant-register', 'imtypes': 'https://www.iana.org/assignments/media-types/media-types.xhtml', 'TRL': 'TRL', 'temporal': 'https://www.iso.org/iso-8601-date-and-time-format.html', 'Rolecode': 'Rolecode', 'spdx': 'https://spdx.org/licenses/', 'ORCID': 'https://orcid.org/'}
+
+terms_vocabularies=[['identifiers','relatedDataProducts'],
+                   ['availableFormats',''],
+                   ['temporalCoverage','relatedDataProducts'],#no temporal metadata
+                   ['license',''],
+                   ['contactPoints','relatedDataProducts']]
+
+
+terms_cv = [['coverage', 'spatial'], ['subject', 'lcsh'], ['subject', 'uri'], ['type', 'coar']]
+supported_data_formats = [".tif", ".aig", ".asc", ".agr", ".grd", ".nc", ".hdf", ".hdf5",
+                        ".pdf", ".odf", ".doc", ".docx", ".csv", ".jpg", ".png", ".gif",
+                        ".mp4", ".xml", ".rdf", ".txt", ".mp3", ".wav", ".zip", ".rar",
+                        ".tar", ".tar.gz", ".jpeg", ".xls", ".xlsx"]
+
+terms_qualified_references = [['identifier','funder']]
+terms_relations = [['relation', 'uri'], ['relation', ''], ['contributor','orcid'], ['contributor', 'funder']]
+terms_relations_only_data = [['relation', 'uri'], ['relation', ''], ['relation','isbasedon'], ['relation', 'isreferencedby']]
+
+
+terms_license = [['rights', ''], ['license', '']]
+
+# Metadata terms to check reusability richness
+terms_reusability_richness = [['contributor','author'],
+                            ['date', 'issued'],
+                            ['title', ''],
+                            ['identifier', 'citation'],
+                            ['publisher', ''],
+                            ['identifier', 'uri'],
+                            ['type', ''],
+                            ['language', 'iso'],
+                            ['relation', 'csic'],
+                            ['rights', '']]
+
+#metadata standard
+metadata_standard = ['dc']
+
+metadata_schemas = {'dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/'}
+
+prov_terms = [['description', 'provenance']]
diff --git a/plugins/digital_csic/plugin.py b/plugins/digital_csic/plugin.py