feat: update metadata scripts to latest version

schnaitter · schnaitter · commit 25f75eed35cc · 2026-02-18T12:49:08.000+01:00
diff --git a/quadriga/metadata/create_bibtex.py b/quadriga/metadata/create_bibtex.py
@@ -275,9 +275,7 @@ def create_bibtex_from_cff() -> bool | None:
             if "repository-code" in pref and "note" not in pref:
                 bibtex_lines.append(f"  note      = {{Repository: {pref['repository-code']}}},")
 
-            # Add version info
-            if "version" in pref:
-                bibtex_lines.append(f"  version   = {{{pref['version']}}},")
+            # Note: version is already added in the common fields section above
 
             # Add software-specific details as howpublished if not present
             if ("howpublished" not in pref) and ("repository-code" in pref or "url" in pref):
@@ -331,7 +329,7 @@ def create_bibtex_from_cff() -> bool | None:
             with citation_bib_path.open("w", encoding="utf-8") as f:
                 f.write(bibtex)
         except OSError:
-            logger.exception("Error writing to {citation_bib_path}")
+            logger.exception("Error writing to %s", citation_bib_path)
             return False
         else:
             logger.info("BibTeX citation successfully created at %s", citation_bib_path)
diff --git a/quadriga/metadata/create_jsonld.py b/quadriga/metadata/create_jsonld.py
@@ -161,6 +161,7 @@ def transform_learning_objective(objective_data: Any) -> dict[str, Any]:
     - learning-objective -> schema:teaches / lrmi:teaches (closeMatch)
     - competency -> maps to modalia:Skill
     - blooms-category -> part of educational alignment
+    - assessment -> lrmi:assesses / schema:assesses (closeMatch)
 
     Args:
         objective_data (dict): Learning objective dictionary
@@ -199,6 +200,10 @@ def transform_learning_objective(objective_data: Any) -> dict[str, Any]:
         else:
             objective["targetDescription"] = f"Data Flow: {objective_data['data-flow']}"
 
+    # assessment -> lrmi:assesses / schema:assesses (closeMatch)
+    if "assessment" in objective_data:
+        objective["lrmi:assesses"] = objective_data["assessment"]
+
     return objective
 
 
@@ -555,8 +560,19 @@ def create_jsonld() -> bool | None:
             jsonld["funding"] = metadata["context-of-creation"]
             logger.info("Added context of creation")
 
-        # quality-assurance is not included in JSON-LD
-        # It's in active development and has no standard schema.org mapping
+        # learning-resource-type -> schema:learningResourceType (closeMatch)
+        #                        -> lrmi:learningResourceType (closeMatch)
+        #                        -> dcterms:type (broadMatch)
+        #                        -> dc:type (broadMatch)
+        if "learning-resource-type" in metadata:
+            jsonld["learningResourceType"] = metadata["learning-resource-type"]
+            jsonld["lrmi:learningResourceType"] = metadata["learning-resource-type"]
+            jsonld["dcterms:type"] = metadata["learning-resource-type"]
+            jsonld["dc:type"] = metadata["learning-resource-type"]
+            logger.info("Added learning resource type: %s", metadata["learning-resource-type"])
+
+        # quality-assurance: not mapped to JSON-LD
+        # All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output
 
         # Write JSON-LD file
         try:
diff --git a/quadriga/metadata/create_rdfxml.py b/quadriga/metadata/create_rdfxml.py
@@ -12,8 +12,8 @@
 from __future__ import annotations
 
 import logging
-import os
 import sys
+import xml.etree.ElementTree as ET
 from pathlib import Path
 from typing import Any
 
@@ -22,13 +22,6 @@
 
 from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file
 
-# Ensure deterministic hash seed for consistent RDF serialization
-# This must be set before any dictionaries/sets are created
-if "PYTHONHASHSEED" not in os.environ:
-    os.environ["PYTHONHASHSEED"] = "0"
-    # Re-exec the script with the environment variable set
-    os.execv(sys.executable, [sys.executable] + sys.argv)
-
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
 
@@ -38,6 +31,24 @@
 LRMI = Namespace("http://purl.org/dcx/lrmi-terms/")
 
 
+def _sort_xml_element(element: ET.Element) -> None:
+    """
+    Recursively sort child elements in-place for deterministic XML output.
+
+    Siblings are ordered by tag, sorted attributes and text; ties (e.g. nodes
+    differing only in nested content) fall back to their own serialization.
+
+    Args:
+        element: XML element whose children will be sorted in-place
+    """
+    children = list(element)
+    for child in children:
+        _sort_xml_element(child)
+    # Children are already sorted here, so ET.tostring(e) is a stable tie-breaker.
+    children.sort(key=lambda e: (e.tag, sorted(e.attrib.items()), e.text or "", ET.tostring(e)))
+    element[:] = children
+
+
 def clean_orcid(orcid_string: str) -> str | None:
     """
     Extract ORCID identifier from an ORCID string or URL.
@@ -171,6 +182,7 @@ def add_learning_objective(
     - learning-objective -> schema:teaches / lrmi:teaches (closeMatch)
     - competency -> maps to modalia:Skill
     - blooms-category -> part of educational alignment
+    - assessment -> lrmi:assesses / schema:assesses (closeMatch)
 
     Args:
         graph: RDF graph to add triples to
@@ -217,6 +229,10 @@ def add_learning_objective(
     if descriptions:
         graph.add((obj_uri, SCHEMA.targetDescription, Literal(" | ".join(descriptions))))
 
+    # assessment -> lrmi:assesses (closeMatch)
+    if "assessment" in objective_data:
+        graph.add((obj_uri, LRMI.assesses, Literal(objective_data["assessment"])))
+
     return obj_uri
 
 
@@ -562,28 +578,53 @@ def create_rdfxml() -> bool | None:
             graph.add((resource_uri, SCHEMA.funding, Literal(metadata["context-of-creation"])))
             logger.info("Added context of creation")
 
-        # quality-assurance is not included in RDF
-        # It's in active development and has no standard schema.org mapping
-
-        # Sort triples for deterministic output
-        # This ensures consistent ordering regardless of Python's hash randomization
-        logger.info("Sorting triples for deterministic output...")
-        sorted_triples = sorted(graph, key=lambda t: (str(t[0]), str(t[1]), str(t[2])))
-
-        # Create a new graph with sorted triples
-        sorted_graph = Graph()
-        for prefix, namespace in graph.namespaces():
-            sorted_graph.bind(prefix, namespace)
-
-        for triple in sorted_triples:
-            sorted_graph.add(triple)
-
-        logger.info("Sorted %d triples", len(sorted_triples))
+        # learning-resource-type -> schema:learningResourceType (closeMatch)
+        #                        -> lrmi:learningResourceType (closeMatch)
+        #                        -> dcterms:type (broadMatch)
+        #                        -> dc:type (broadMatch)
+        if "learning-resource-type" in metadata:
+            lrt = Literal(metadata["learning-resource-type"])
+            graph.add((resource_uri, SCHEMA.learningResourceType, lrt))
+            graph.add((resource_uri, LRMI.learningResourceType, lrt))
+            graph.add((resource_uri, DCTERMS.type, lrt))
+            graph.add((resource_uri, DC.type, lrt))
+            logger.info("Added learning resource type: %s", metadata["learning-resource-type"])
+
+        # quality-assurance: not mapped to RDF
+        # All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output
+
+        # Serialize to RDF/XML and post-process for deterministic output.
+        # rdflib's pretty-xml serializer uses Python dicts internally, so element
+        # and namespace ordering varies across process invocations due to hash
+        # randomization. We sort the XML elements after serialization to guarantee
+        # reproducible output regardless of PYTHONHASHSEED.
+        logger.info("Serializing %d triples to RDF/XML...", len(graph))
 
-        # Write RDF/XML file
         try:
-            with rdf_path.open("wb") as f:
-                sorted_graph.serialize(f, format="pretty-xml", encoding="utf-8")
+            xml_bytes = graph.serialize(format="pretty-xml", encoding="utf-8")
+            xml_str = xml_bytes.decode("utf-8") if isinstance(xml_bytes, bytes) else xml_bytes
+
+            # Register namespace prefixes so ElementTree preserves them
+            for prefix, uri in [
+                ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
+                ("schema", str(SCHEMA)),
+                ("dc", str(DC)),
+                ("dcterms", str(DCTERMS)),
+                ("lrmi", str(LRMI)),
+                ("skos", str(SKOS)),
+            ]:
+                ET.register_namespace(prefix, uri)
+
+            # Parse, sort elements recursively, and re-serialize
+            root = ET.fromstring(xml_str)  # noqa: S314 — parsing our own rdflib output
+            _sort_xml_element(root)
+            ET.indent(root, space="  ")
+
+            sorted_xml = ET.tostring(root, encoding="unicode", xml_declaration=True)
+
+            with rdf_path.open("w", encoding="utf-8") as f:
+                f.write(sorted_xml)
+                f.write("\n")
         except OSError:
             logger.exception("Error writing to %s", rdf_path)
             return False
diff --git a/quadriga/metadata/create_zenodo_json.py b/quadriga/metadata/create_zenodo_json.py
@@ -250,15 +250,33 @@ def create_zenodo_json() -> bool | None:
             return False
 
         # description
-        description = citation_data.get("abstract")
-        if not description:
-            description = metadata.get("description")
-
-        if description:
-            zenodo_metadata["description"] = description
-            logger.info("Added description")
-        else:
-            logger.warning("No description/abstract found")
+        description = "<p>" + (metadata.get("description") or "") + "</p>"  # tolerate missing text
+        quality_assurance = metadata.get("quality-assurance") or {}  # optional metadata section
+        description_base = f"""
+<p>Das interaktive Lehrbuch kann als <a href="{metadata.get("url")}" target="_blank">Web-Version</a> verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch bei <a href="{metadata.get("git")}" target="_blank">GitHub</a> zur Verfügung.</p>
+<p>Die QUADRIGA OER sind nach einem einheitlichen <a href="https://quadriga-dk.github.io/Book_Template" target="_blank">Template</a> gestaltet, werden nach einem <a href="{quality_assurance.get("description", "TODO")}" target="_blank">standardisierten Verfahren qualitätsgeprüft</a> und <a href="https://doi.org/10.5281/zenodo.18184772" target="_blank">mit Metadaten ausgezeichnet</a>.</p>
+<h5>QUADRIGA Datenkompetenzzentrum</h5>
+<p>QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozesse möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes <a href="https://doi.org/10.5281/zenodo.14747822" target="_blank">Datenkompetenzframework</a> zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentativen Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.</p>
+<p>Zielgruppen von QUADRIGA sind in erster Linie promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.</p>
+<p>QUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom <a href="https://www.bmftr.bund.de/DE/Forschung/Wissenschaftssystem/Forschungsdaten/DatenkompetenzenInDerWissenschaft/datenkompetenzeninderwissenschaft.html" target="_blank">Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR)</a> und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartern zählen:
+  <ul>
+    <li>Universität Potsdam (Verbundkoordination) <span style="font-size: small">(Förderkennzeichen: 16DKZ2034A)</span></li>
+    <li>Filmuniversität Babelsberg <span style="font-size: small">(Förderkennzeichen: 16DKZ2034B)</span></li>
+    <li>Fachhochschule Potsdam <span style="font-size: small">(Förderkennzeichen: 16DKZ2034C)</span></li>
+    <li>Fraunhofer FOKUS <span style="font-size: small">(Förderkennzeichen: 16DKZ2034D)</span></li>
+    <li>Freie Universität Berlin <span style="font-size: small">(Förderkennzeichen: 16DKZ2034E)</span></li>
+    <li>Technische Universität Berlin <span style="font-size: small">(Förderkennzeichen: 16DKZ2034F)</span></li>
+    <li>Gesellschaft für Informatik <span style="font-size: small">(Förderkennzeichen: 16DKZ2034G)</span></li>
+    <li>Humboldt-Universität zu Berlin <span style="font-size: small">(Förderkennzeichen: 16DKZ2034H)</span></li>
+  </ul>
+</p>
+
+<p>Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im <a href="https://doi.org/10.5281/zenodo.10805015" target="_blank">Umsetzungskonzept</a> erfahren.</p>
+
+<p>Weitere Informationen sowie Publikationen finden Sie auf der <a href="https://www.quadriga-dk.de" target="_blank">Webseite</a>, in der <a href="https://zenodo.org/communities/quadriga" target="_blank">Zenodo-Community</a> und der <a href="https://github.com/quadriga-dk" target="_blank">GitHub-Organisation</a> des Projekts.</p>
+"""
+        zenodo_metadata["description"] = description + description_base
+        logger.info("Added description")
 
         # publication date
         publication_date = None
@@ -356,7 +374,7 @@ def create_zenodo_json() -> bool | None:
             with zenodo_json_path.open("w", encoding="utf-8") as f:
                 json.dump(zenodo_metadata, f, ensure_ascii=False, indent=2)
         except OSError:
-            logger.exception("Error writing to {zenodo_json_path}")
+            logger.exception("Error writing to %s", zenodo_json_path)
             return False
         else:
             logger.info("Zenodo metadata successfully created at %s", zenodo_json_path)
diff --git a/quadriga/metadata/extract_from_book_config.py b/quadriga/metadata/extract_from_book_config.py
@@ -72,7 +72,6 @@ def extract_and_update() -> bool | None:
 
         # Extract information from _config.yml
         title = config_data.get("title", "")
-        config_data.get("author", "")
 
         if not title:
             logger.warning("No title found in _config.yml")
@@ -115,7 +114,7 @@ def extract_and_update() -> bool | None:
                     # Add to the list of chapters
                     toc_chapters.append(chapter_title)
                 except Exception:
-                    logger.exception("Error processing chapter {chapter.get('file', 'unknown')}")
+                    logger.exception("Error processing chapter %s", chapter.get("file", "unknown"))
                     # Add a placeholder with the filename if possible
                     try:
                         toc_chapters.append(f"[Error: {p.stem}]")
diff --git a/quadriga/metadata/run_all.py b/quadriga/metadata/run_all.py
@@ -18,6 +18,7 @@
 from quadriga.metadata.create_zenodo_json import create_zenodo_json
 from quadriga.metadata.extract_from_book_config import extract_and_update
 from quadriga.metadata.update_citation_cff import update_citation
+from quadriga.metadata.validate_schema import validate_schema
 
 logger = logging.getLogger(__name__)
 
@@ -34,6 +35,16 @@ def main() -> bool | None:
 
         logger.info("Running all metadata update scripts...")
 
+        # Validate metadata.yml against QUADRIGA schema first
+        try:
+            logger.info("Validating metadata.yml against QUADRIGA schema...")
+            if not validate_schema():
+                logger.error("Schema validation failed.")
+                return False
+        except Exception:
+            logger.exception("Unexpected error during schema validation")
+            return False
+
         # Execute extract_and_update with error handling
         try:
             logger.info("Extracting metadata from _config.yml and _toc.yml...")
diff --git a/quadriga/metadata/update_citation_cff.py b/quadriga/metadata/update_citation_cff.py
diff --git a/quadriga/metadata/utils.py b/quadriga/metadata/utils.py
diff --git a/quadriga/metadata/validate_schema.py b/quadriga/metadata/validate_schema.py