Skip to content

Commit 25f75ee

Browse files
committed
feat: update metadata scripts to latest version
1 parent 3423643 commit 25f75ee

File tree

9 files changed

+380
-92
lines changed

9 files changed

+380
-92
lines changed

quadriga/metadata/create_bibtex.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -275,9 +275,7 @@ def create_bibtex_from_cff() -> bool | None:
275275
if "repository-code" in pref and "note" not in pref:
276276
bibtex_lines.append(f" note = {{Repository: {pref['repository-code']}}},")
277277

278-
# Add version info
279-
if "version" in pref:
280-
bibtex_lines.append(f" version = {{{pref['version']}}},")
278+
# Note: version is already added in the common fields section above
281279

282280
# Add software-specific details as howpublished if not present
283281
if ("howpublished" not in pref) and ("repository-code" in pref or "url" in pref):
@@ -331,7 +329,7 @@ def create_bibtex_from_cff() -> bool | None:
331329
with citation_bib_path.open("w", encoding="utf-8") as f:
332330
f.write(bibtex)
333331
except OSError:
334-
logger.exception("Error writing to {citation_bib_path}")
332+
logger.exception("Error writing to %s", citation_bib_path)
335333
return False
336334
else:
337335
logger.info("BibTeX citation successfully created at %s", citation_bib_path)

quadriga/metadata/create_jsonld.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ def transform_learning_objective(objective_data: Any) -> dict[str, Any]:
161161
- learning-objective -> schema:teaches / lrmi:teaches (closeMatch)
162162
- competency -> maps to modalia:Skill
163163
- blooms-category -> part of educational alignment
164+
- assessment -> lrmi:assesses / schema:assesses (closeMatch)
164165
165166
Args:
166167
objective_data (dict): Learning objective dictionary
@@ -199,6 +200,10 @@ def transform_learning_objective(objective_data: Any) -> dict[str, Any]:
199200
else:
200201
objective["targetDescription"] = f"Data Flow: {objective_data['data-flow']}"
201202

203+
# assessment -> lrmi:assesses / schema:assesses (closeMatch)
204+
if "assessment" in objective_data:
205+
objective["lrmi:assesses"] = objective_data["assessment"]
206+
202207
return objective
203208

204209

@@ -555,8 +560,19 @@ def create_jsonld() -> bool | None:
555560
jsonld["funding"] = metadata["context-of-creation"]
556561
logger.info("Added context of creation")
557562

558-
# quality-assurance is not included in JSON-LD
559-
# It's in active development and has no standard schema.org mapping
563+
# learning-resource-type -> schema:learningResourceType (closeMatch)
564+
# -> lrmi:learningResourceType (closeMatch)
565+
# -> dcterms:type (broadMatch)
566+
# -> dc:type (broadMatch)
567+
if "learning-resource-type" in metadata:
568+
jsonld["learningResourceType"] = metadata["learning-resource-type"]
569+
jsonld["lrmi:learningResourceType"] = metadata["learning-resource-type"]
570+
jsonld["dcterms:type"] = metadata["learning-resource-type"]
571+
jsonld["dc:type"] = metadata["learning-resource-type"]
572+
logger.info("Added learning resource type: %s", metadata["learning-resource-type"])
573+
574+
# quality-assurance: not mapped to JSON-LD
575+
# All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output
560576

561577
# Write JSON-LD file
562578
try:

quadriga/metadata/create_rdfxml.py

Lines changed: 69 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from __future__ import annotations
1313

1414
import logging
15-
import os
1615
import sys
16+
import xml.etree.ElementTree as ET
1717
from pathlib import Path
1818
from typing import Any
1919

@@ -22,13 +22,6 @@
2222

2323
from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file
2424

25-
# Ensure deterministic hash seed for consistent RDF serialization
26-
# This must be set before any dictionaries/sets are created
27-
if "PYTHONHASHSEED" not in os.environ:
28-
os.environ["PYTHONHASHSEED"] = "0"
29-
# Re-exec the script with the environment variable set
30-
os.execv(sys.executable, [sys.executable] + sys.argv)
31-
3225
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
3326
logger = logging.getLogger(__name__)
3427

@@ -38,6 +31,24 @@
3831
LRMI = Namespace("http://purl.org/dcx/lrmi-terms/")
3932

4033

34+
def _sort_xml_element(element: ET.Element) -> None:
35+
"""
36+
Recursively sort child elements for deterministic XML output.
37+
38+
Sorts by tag name, then by attributes (as sorted key-value pairs),
39+
then by text content. This ensures identical output regardless of
40+
Python's hash randomization (PYTHONHASHSEED).
41+
42+
Args:
43+
element: XML element whose children will be sorted in-place
44+
"""
45+
children = list(element)
46+
for child in children:
47+
_sort_xml_element(child)
48+
children.sort(key=lambda e: (e.tag, sorted(e.attrib.items()), e.text or ""))
49+
element[:] = children
50+
51+
4152
def clean_orcid(orcid_string: str) -> str | None:
4253
"""
4354
Extract ORCID identifier from an ORCID string or URL.
@@ -171,6 +182,7 @@ def add_learning_objective(
171182
- learning-objective -> schema:teaches / lrmi:teaches (closeMatch)
172183
- competency -> maps to modalia:Skill
173184
- blooms-category -> part of educational alignment
185+
- assessment -> lrmi:assesses / schema:assesses (closeMatch)
174186
175187
Args:
176188
graph: RDF graph to add triples to
@@ -217,6 +229,10 @@ def add_learning_objective(
217229
if descriptions:
218230
graph.add((obj_uri, SCHEMA.targetDescription, Literal(" | ".join(descriptions))))
219231

232+
# assessment -> lrmi:assesses (closeMatch)
233+
if "assessment" in objective_data:
234+
graph.add((obj_uri, LRMI.assesses, Literal(objective_data["assessment"])))
235+
220236
return obj_uri
221237

222238

@@ -562,28 +578,53 @@ def create_rdfxml() -> bool | None:
562578
graph.add((resource_uri, SCHEMA.funding, Literal(metadata["context-of-creation"])))
563579
logger.info("Added context of creation")
564580

565-
# quality-assurance is not included in RDF
566-
# It's in active development and has no standard schema.org mapping
567-
568-
# Sort triples for deterministic output
569-
# This ensures consistent ordering regardless of Python's hash randomization
570-
logger.info("Sorting triples for deterministic output...")
571-
sorted_triples = sorted(graph, key=lambda t: (str(t[0]), str(t[1]), str(t[2])))
572-
573-
# Create a new graph with sorted triples
574-
sorted_graph = Graph()
575-
for prefix, namespace in graph.namespaces():
576-
sorted_graph.bind(prefix, namespace)
577-
578-
for triple in sorted_triples:
579-
sorted_graph.add(triple)
580-
581-
logger.info("Sorted %d triples", len(sorted_triples))
581+
# learning-resource-type -> schema:learningResourceType (closeMatch)
582+
# -> lrmi:learningResourceType (closeMatch)
583+
# -> dcterms:type (broadMatch)
584+
# -> dc:type (broadMatch)
585+
if "learning-resource-type" in metadata:
586+
lrt = Literal(metadata["learning-resource-type"])
587+
graph.add((resource_uri, SCHEMA.learningResourceType, lrt))
588+
graph.add((resource_uri, LRMI.learningResourceType, lrt))
589+
graph.add((resource_uri, DCTERMS.type, lrt))
590+
graph.add((resource_uri, DC.type, lrt))
591+
logger.info("Added learning resource type: %s", metadata["learning-resource-type"])
592+
593+
# quality-assurance: not mapped to RDF
594+
# All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output
595+
596+
# Serialize to RDF/XML and post-process for deterministic output.
597+
# rdflib's pretty-xml serializer uses Python dicts internally, so element
598+
# and namespace ordering varies across process invocations due to hash
599+
# randomization. We sort the XML elements after serialization to guarantee
600+
# reproducible output regardless of PYTHONHASHSEED.
601+
logger.info("Serializing %d triples to RDF/XML...", len(graph))
582602

583-
# Write RDF/XML file
584603
try:
585-
with rdf_path.open("wb") as f:
586-
sorted_graph.serialize(f, format="pretty-xml", encoding="utf-8")
604+
xml_bytes = graph.serialize(format="pretty-xml", encoding="utf-8")
605+
xml_str = xml_bytes.decode("utf-8") if isinstance(xml_bytes, bytes) else xml_bytes
606+
607+
# Register namespace prefixes so ElementTree preserves them
608+
for prefix, uri in [
609+
("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
610+
("schema", str(SCHEMA)),
611+
("dc", str(DC)),
612+
("dcterms", str(DCTERMS)),
613+
("lrmi", str(LRMI)),
614+
("skos", str(SKOS)),
615+
]:
616+
ET.register_namespace(prefix, uri)
617+
618+
# Parse, sort elements recursively, and re-serialize
619+
root = ET.fromstring(xml_str) # noqa: S314 — parsing our own rdflib output
620+
_sort_xml_element(root)
621+
ET.indent(root, space=" ")
622+
623+
sorted_xml = ET.tostring(root, encoding="unicode", xml_declaration=True)
624+
625+
with rdf_path.open("w", encoding="utf-8") as f:
626+
f.write(sorted_xml)
627+
f.write("\n")
587628
except OSError:
588629
logger.exception("Error writing to %s", rdf_path)
589630
return False

quadriga/metadata/create_zenodo_json.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -250,15 +250,33 @@ def create_zenodo_json() -> bool | None:
250250
return False
251251

252252
# description
253-
description = citation_data.get("abstract")
254-
if not description:
255-
description = metadata.get("description")
256-
257-
if description:
258-
zenodo_metadata["description"] = description
259-
logger.info("Added description")
260-
else:
261-
logger.warning("No description/abstract found")
253+
description = "<p>" + metadata.get("description") + "</p>"
254+
255+
description_base = f"""
256+
<p>Das interaktive Lehrbuch kann als <a href="{metadata.get("url")}" target="_blank">Web-Version</a> verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch bei <a href="{metadata.get("git")}" target="_blank">GitHub</a> zur Verfügung.</p>
257+
<p>Die QUADRIGA OER sind nach einem einheitlichen <a href="https://quadriga-dk.github.io/Book_Template" target="_blank">Template</a> gestaltet, werden nach einem <a href="{metadata.get("quality-assurance").get("description", "TODO")}" target="_blank">standardisierten Verfahren qualitätsgeprüft</a> und <a href="https://doi.org/10.5281/zenodo.18184772" target="_blank">mit Metadaten ausgezeichnet</a>.</p>
258+
<h5>QUADRIGA Datenkompetenzzentrum</h5>
259+
<p>QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um die Forschungsprozesse möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes <a href="https://doi.org/10.5281/zenodo.14747822" target="_blank">Datenkompetenzframework</a> zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentative Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.</p>
260+
<p>Zielgruppen von QUADRIGA sind in erster Linie promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.</p>
261+
<p>QUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom <a href="https://www.bmftr.bund.de/DE/Forschung/Wissenschaftssystem/Forschungsdaten/DatenkompetenzenInDerWissenschaft/datenkompetenzeninderwissenschaft.html" target="_blank">Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR)</a> und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen:
262+
<ul>
263+
<li>Universität Potsdam (Verbundkoordination) <span style="font-size: small">(Förderkennzeichen: 16DKZ2034A)</span></li>
264+
<li>Filmuniversität Babelsberg <span style="font-size: small">(Förderkennzeichen: 16DKZ2034B)</span></li>
265+
<li>Fachhochschule Potsdam <span style="font-size: small">(Förderkennzeichen: 16DKZ2034C)</span></li>
266+
<li>Fraunhofer FOKUS <span style="font-size: small">(Förderkennzeichen: 16DKZ2034D)</span></li>
267+
<li>Freie Universität Berlin <span style="font-size: small">(Förderkennzeichen: 16DKZ2034E)</span></li>
268+
<li>Technische Universität Berlin <span style="font-size: small">(Förderkennzeichen: 16DKZ2034F)</span></li>
269+
<li>Gesellschaft für Informatik <span style="font-size: small">(Förderkennzeichen: 16DKZ2034G)</span></li>
270+
<li>Humboldt-Universität zu Berlin <span style="font-size: small">(Förderkennzeichen: 16DKZ2034H)</span></li>
271+
</ul>
272+
</p>
273+
274+
<p>Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im <a href="https://doi.org/10.5281/zenodo.10805015" target="_blank">Umsetzungskonzept</a> erfahren.</p>
275+
276+
<p>Weitere Informationen sowie Publikationen finden Sie auf der <a href="https://www.quadriga-dk.de" target="_blank">Webseite</a>, in der <a href="https://zenodo.org/communities/quadriga" target="_blank">Zenodo-Community</a> und der <a href="https://github.com/quadriga-dk" target="_blank">GitHub-Organisation</a> des Projekts.</p>
277+
"""
278+
zenodo_metadata["description"] = description + description_base
279+
logger.info("Added description")
262280

263281
# publication date
264282
publication_date = None
@@ -356,7 +374,7 @@ def create_zenodo_json() -> bool | None:
356374
with zenodo_json_path.open("w", encoding="utf-8") as f:
357375
json.dump(zenodo_metadata, f, ensure_ascii=False, indent=2)
358376
except OSError:
359-
logger.exception("Error writing to {zenodo_json_path}")
377+
logger.exception("Error writing to %s", zenodo_json_path)
360378
return False
361379
else:
362380
logger.info("Zenodo metadata successfully created at %s", zenodo_json_path)

quadriga/metadata/extract_from_book_config.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ def extract_and_update() -> bool | None:
7272

7373
# Extract information from _config.yml
7474
title = config_data.get("title", "")
75-
config_data.get("author", "")
7675

7776
if not title:
7877
logger.warning("No title found in _config.yml")
@@ -115,7 +114,7 @@ def extract_and_update() -> bool | None:
115114
# Add to the list of chapters
116115
toc_chapters.append(chapter_title)
117116
except Exception:
118-
logger.exception("Error processing chapter {chapter.get('file', 'unknown')}")
117+
logger.exception("Error processing chapter %s", chapter.get("file", "unknown"))
119118
# Add a placeholder with the filename if possible
120119
try:
121120
toc_chapters.append(f"[Error: {p.stem}]")

quadriga/metadata/run_all.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from quadriga.metadata.create_zenodo_json import create_zenodo_json
1919
from quadriga.metadata.extract_from_book_config import extract_and_update
2020
from quadriga.metadata.update_citation_cff import update_citation
21+
from quadriga.metadata.validate_schema import validate_schema
2122

2223
logger = logging.getLogger(__name__)
2324

@@ -34,6 +35,16 @@ def main() -> bool | None:
3435

3536
logger.info("Running all metadata update scripts...")
3637

38+
# Validate metadata.yml against QUADRIGA schema first
39+
try:
40+
logger.info("Validating metadata.yml against QUADRIGA schema...")
41+
if not validate_schema():
42+
logger.error("Schema validation failed.")
43+
return False
44+
except Exception:
45+
logger.exception("Unexpected error during schema validation")
46+
return False
47+
3748
# Execute extract_and_update with error handling
3849
try:
3950
logger.info("Extracting metadata from _config.yml and _toc.yml...")

0 commit comments

Comments
 (0)