|
12 | 12 | from __future__ import annotations |
13 | 13 |
|
14 | 14 | import logging |
15 | | -import os |
16 | 15 | import sys |
| 16 | +import xml.etree.ElementTree as ET |
17 | 17 | from pathlib import Path |
18 | 18 | from typing import Any |
19 | 19 |
|
|
22 | 22 |
|
23 | 23 | from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file |
24 | 24 |
|
25 | | -# Ensure deterministic hash seed for consistent RDF serialization |
26 | | -# This must be set before any dictionaries/sets are created |
27 | | -if "PYTHONHASHSEED" not in os.environ: |
28 | | - os.environ["PYTHONHASHSEED"] = "0" |
29 | | - # Re-exec the script with the environment variable set |
30 | | - os.execv(sys.executable, [sys.executable] + sys.argv) |
31 | | - |
32 | 25 | logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") |
33 | 26 | logger = logging.getLogger(__name__) |
34 | 27 |
|
|
38 | 31 | LRMI = Namespace("http://purl.org/dcx/lrmi-terms/") |
39 | 32 |
|
40 | 33 |
|
def _sort_xml_element(element: ET.Element) -> None:
    """
    Recursively sort child elements for deterministic XML output.

    Siblings are ordered by tag name, then by attributes (as sorted
    key-value pairs), then by text content, and finally by the canonical
    keys of their own (already sorted) children. The last component breaks
    ties between siblings that share tag, attributes and text but differ in
    nested content, so the result does not depend on the input order.

    Children of an element marked ``rdf:parseType="Collection"`` are NOT
    reordered: in RDF/XML their document order defines the order of the RDF
    list, so sorting them would change the meaning of the graph. Their
    descendants are still sorted.

    Args:
        element: XML element whose children will be sorted in-place
    """
    # ElementTree exposes namespaced attributes in Clark notation.
    parse_type_attr = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}parseType"

    def sort_and_key(node: ET.Element) -> tuple:
        # Sort the subtree bottom-up; each child's key is computed after its
        # own children are in canonical order, so keys are input-order free.
        keyed = [(sort_and_key(child), child) for child in node]
        if node.get(parse_type_attr) != "Collection":
            keyed.sort(key=lambda pair: pair[0])
        node[:] = [child for _, child in keyed]
        return (
            node.tag,
            sorted(node.attrib.items()),
            node.text or "",
            [key for key, _ in keyed],
        )

    sort_and_key(element)
| 50 | + |
| 51 | + |
41 | 52 | def clean_orcid(orcid_string: str) -> str | None: |
42 | 53 | """ |
43 | 54 | Extract ORCID identifier from an ORCID string or URL. |
@@ -171,6 +182,7 @@ def add_learning_objective( |
171 | 182 | - learning-objective -> schema:teaches / lrmi:teaches (closeMatch) |
172 | 183 | - competency -> maps to modalia:Skill |
173 | 184 | - blooms-category -> part of educational alignment |
| 185 | + - assessment -> lrmi:assesses / schema:assesses (closeMatch) |
174 | 186 |
|
175 | 187 | Args: |
176 | 188 | graph: RDF graph to add triples to |
@@ -217,6 +229,10 @@ def add_learning_objective( |
217 | 229 | if descriptions: |
218 | 230 | graph.add((obj_uri, SCHEMA.targetDescription, Literal(" | ".join(descriptions)))) |
219 | 231 |
|
| 232 | + # assessment -> lrmi:assesses (closeMatch) |
| 233 | + if "assessment" in objective_data: |
| 234 | + graph.add((obj_uri, LRMI.assesses, Literal(objective_data["assessment"]))) |
| 235 | + |
220 | 236 | return obj_uri |
221 | 237 |
|
222 | 238 |
|
@@ -562,28 +578,53 @@ def create_rdfxml() -> bool | None: |
562 | 578 | graph.add((resource_uri, SCHEMA.funding, Literal(metadata["context-of-creation"]))) |
563 | 579 | logger.info("Added context of creation") |
564 | 580 |
|
565 | | - # quality-assurance is not included in RDF |
566 | | - # It's in active development and has no standard schema.org mapping |
567 | | - |
568 | | - # Sort triples for deterministic output |
569 | | - # This ensures consistent ordering regardless of Python's hash randomization |
570 | | - logger.info("Sorting triples for deterministic output...") |
571 | | - sorted_triples = sorted(graph, key=lambda t: (str(t[0]), str(t[1]), str(t[2]))) |
572 | | - |
573 | | - # Create a new graph with sorted triples |
574 | | - sorted_graph = Graph() |
575 | | - for prefix, namespace in graph.namespaces(): |
576 | | - sorted_graph.bind(prefix, namespace) |
577 | | - |
578 | | - for triple in sorted_triples: |
579 | | - sorted_graph.add(triple) |
580 | | - |
581 | | - logger.info("Sorted %d triples", len(sorted_triples)) |
| 581 | + # learning-resource-type -> schema:learningResourceType (closeMatch) |
| 582 | + # -> lrmi:learningResourceType (closeMatch) |
| 583 | + # -> dcterms:type (broadMatch) |
| 584 | + # -> dc:type (broadMatch) |
| 585 | + if "learning-resource-type" in metadata: |
| 586 | + lrt = Literal(metadata["learning-resource-type"]) |
| 587 | + graph.add((resource_uri, SCHEMA.learningResourceType, lrt)) |
| 588 | + graph.add((resource_uri, LRMI.learningResourceType, lrt)) |
| 589 | + graph.add((resource_uri, DCTERMS.type, lrt)) |
| 590 | + graph.add((resource_uri, DC.type, lrt)) |
| 591 | + logger.info("Added learning resource type: %s", metadata["learning-resource-type"]) |
| 592 | + |
| 593 | + # quality-assurance: not mapped to RDF |
| 594 | + # All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output |
| 595 | + |
| 596 | + # Serialize to RDF/XML and post-process for deterministic output. |
| 597 | + # rdflib's pretty-xml serializer uses Python dicts internally, so element |
| 598 | + # and namespace ordering varies across process invocations due to hash |
| 599 | + # randomization. We sort the XML elements after serialization to guarantee |
| 600 | + # reproducible output regardless of PYTHONHASHSEED. |
| 601 | + logger.info("Serializing %d triples to RDF/XML...", len(graph)) |
582 | 602 |
|
583 | | - # Write RDF/XML file |
584 | 603 | try: |
585 | | - with rdf_path.open("wb") as f: |
586 | | - sorted_graph.serialize(f, format="pretty-xml", encoding="utf-8") |
| 604 | + xml_bytes = graph.serialize(format="pretty-xml", encoding="utf-8") |
| 605 | + xml_str = xml_bytes.decode("utf-8") if isinstance(xml_bytes, bytes) else xml_bytes |
| 606 | + |
| 607 | + # Register namespace prefixes so ElementTree preserves them |
| 608 | + for prefix, uri in [ |
| 609 | + ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"), |
| 610 | + ("schema", str(SCHEMA)), |
| 611 | + ("dc", str(DC)), |
| 612 | + ("dcterms", str(DCTERMS)), |
| 613 | + ("lrmi", str(LRMI)), |
| 614 | + ("skos", str(SKOS)), |
| 615 | + ]: |
| 616 | + ET.register_namespace(prefix, uri) |
| 617 | + |
| 618 | + # Parse, sort elements recursively, and re-serialize |
| 619 | + root = ET.fromstring(xml_str) # noqa: S314 — parsing our own rdflib output |
| 620 | + _sort_xml_element(root) |
| 621 | + ET.indent(root, space=" ") |
| 622 | + |
| 623 | + sorted_xml = ET.tostring(root, encoding="unicode", xml_declaration=True) |
| 624 | + |
| 625 | + with rdf_path.open("w", encoding="utf-8") as f: |
| 626 | + f.write(sorted_xml) |
| 627 | + f.write("\n") |
587 | 628 | except OSError: |
588 | 629 | logger.exception("Error writing to %s", rdf_path) |
589 | 630 | return False |
|
0 commit comments