feat: syllabus import with creator and cca:course

phette23 · phette23 · commit 5e3f5690c89f · 2025-05-09T13:32:41.000-07:00
add local/courseInfo/faculty -> Creators in crosswalk
some style edits/comment fixes in names.py
logic for Creator for syllabus items with no mods/name
add 2 constants to utils
tests for syllabus creator and resource type
diff --git a/docs/crosswalk.html b/docs/crosswalk.html
@@ -48,6 +48,7 @@ <h1>EQUELLA -> InvenioRDM Crosswalk</h1>
         VLDEPT["department"] --> CDEPTCODE["cca:course.department"]
         VCIDEPT["courseInfo/department"] --> CDEPT["cca:course.department_code"]
         VFACULTY["courseInfo/faculty"] --> CINSTRUCTORS["cca:course.instructors"]
+        VFACULTY --> |If we do not have a mods/name| CREATORS["Creators (1-n)"]
         VSECTION["courseInfo/section"] --> CSECTION["cca:course.section"]
         VSEMESTER["courseInfo/semester"] --> CTERM["cca:course.term"]
         VSECTION --> CSECTIONCALCID["cca:course.section_calc_id constructed from section & term"]
diff --git a/migrate/names.py b/migrate/names.py
@@ -1,5 +1,6 @@
-""" Parse names and lists of names from a variety of formats into {given_name, family_name} dicts
-This is used by Record.creator only. It does not relate to the Invenio names.yaml vocabulary."""
+"""Parse names and lists of names from a variety of formats into {given_name, family_name} dicts
+This is used by Record.creator only. It does not relate to the Invenio names.yaml vocabulary.
+"""
 
 import re
 
@@ -48,8 +49,8 @@ def n(d) -> dict[str, str]:
 
 
 def parse_name(namePart):
-    """parse wild variety of name strings into {givename, familyname}
-    or, if it looks like an orgnaization name, return only {name}"""
+    """Parse wild variety of name strings into {given_name, family_name}
+    or, if it looks like an organization name, return only {name}."""
 
     # semi-colon separated list of names
     if "; " in namePart:
@@ -58,7 +59,7 @@ def parse_name(namePart):
     if " + " in namePart:
         return [parse_name(p) for p in namePart.split(" + ")]
 
-    # usually Surname, Givenname but sometimes other things
+    # usually Surname, Given Name but sometimes other things
     if "," in namePart:
         # last, first
         parts = namePart.split(", ")
@@ -67,7 +68,7 @@ def parse_name(namePart):
             if "Calif.)" in namePart:
                 return n({"name": namePart})
             return n({"given_name": parts[1], "family_name": parts[0]})
-        # name with a DOB/dath date string after a second comma
+        # name with a DOB/death date string after a second comma
         if len(parts) == 3 and re.match(r"[0-9]{4}\-([0-9]{4})?", parts[2].strip()):
             return n({"given_name": parts[1], "family_name": parts[0]})
         # two or more commas, maybe we have a comma-separated list of names?
diff --git a/migrate/record.py b/migrate/record.py
@@ -16,7 +16,15 @@
 
 from names import parse_name
 from maps import *
-from utils import find_items, get_url, mklist, to_edtf, visual_mime_type_sort
+from utils import (
+    cca_affiliation,
+    find_items,
+    get_url,
+    mklist,
+    syllabus_collection_uuid,
+    to_edtf,
+    visual_mime_type_sort,
+)
 from subjects import find_subjects, Subject
 
 
@@ -56,6 +64,7 @@ def __init__(self, item):
         self.title: str = item.get("name", "Untitled")
         # default to current date in ISO 8601 format
         self.createdDate: str = item.get("createdDate", date.today().isoformat())
+        self.vault_collection: str = item.get("collection", {}).get("uuid", "")
         self.vault_url: str = ""
         if item.get("uuid") and item.get("version"):
             self.vault_url = (
@@ -147,9 +156,8 @@ def course(self) -> dict[str, Any] | None:
     def creators(self) -> list[dict[str, Any]]:
         # mods/name
         # https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n
-        namesx = mklist(self.xml.get("mods", {}).get("name"))
-        creators = []
-        for namex in namesx:
+        creators: list[dict[str, Any]] = []
+        for namex in mklist(self.xml.get("mods", {}).get("name")):
             # @usage = primary, secondary | ignoring this but could say sec. -> contributor, not creator
             partsx = namex.get("namePart")
             if type(partsx) == str:
@@ -196,9 +204,9 @@ def creators(self) -> list[dict[str, Any]]:
                 if type(names) == dict:
                     creators.append(
                         {
+                            "affiliations": creator["affiliations"],
                             "person_or_org": names,
                             "role": creator["role"],
-                            "affiliations": creator["affiliations"],
                         }
                     )
                 # implies type(names) == list, similar to below, if parse_name returns a
@@ -224,6 +232,34 @@ def creators(self) -> list[dict[str, Any]]:
                 for partx in partsx:
                     for name in mklist(parse_name(partx)):
                         creators.append({"person_or_org": name})
+
+        # Syllabi: we have no mods/name but list faculty in courseInfo/faculty
+        if len(creators) == 0:
+            faculty = self.xml.get("local", {}).get("courseInfo", {}).get("faculty")
+            if faculty:
+                names = parse_name(faculty)
+                if type(names) == dict:
+                    creators.append(
+                        {
+                            "affiliations": cca_affiliation,
+                            "person_or_org": names,
+                            "role": {"id": "creator"},
+                        }
+                    )
+                elif type(names) == list:
+                    for name in names:
+                        creators.append(
+                            {
+                                "affiliations": cca_affiliation,
+                                "person_or_org": name,
+                                "role": {"id": "creator"},
+                            }
+                        )
+
+        # If we _still_ have no creators, we cannot create a record b/c it is a
+        # required field but for our test data I do not want to specify creators.
+        if len(creators) == 0 and "pytest" not in sys.modules:
+            raise Exception(f"Record has no creators: {self.title}\n{self.vault_url}")
         return creators
 
     @property
@@ -478,6 +514,11 @@ def resource_type(self) -> dict[str, str]:
         # There are many fields that could be used to determine the resource type. Priority:
         # 1. mods/typeOfResource, 2. local/courseWorkType, 3. TBD (there are more...)
         # mods/typeOfResourceWrapper/typeOfResource
+
+        # Syllabus Collection only contains syllabi
+        if self.vault_collection == syllabus_collection_uuid:
+            return {"id": "publication-syllabus"}
+
         # Take the first typeOfResource value we find
         wrapper = self.xml.get("mods", {}).get("typeOfResourceWrapper")
         if type(wrapper) == list:
diff --git a/migrate/tests.py b/migrate/tests.py
@@ -5,7 +5,13 @@
 from names import parse_name
 from record import Record
 from subjects import find_subjects, subjects_from_xmldict, Subject, TYPES
-from utils import get_url, mklist, to_edtf, visual_mime_type_sort
+from utils import (
+    get_url,
+    mklist,
+    syllabus_collection_uuid,
+    to_edtf,
+    visual_mime_type_sort,
+)
 
 
 @pytest.mark.parametrize(
@@ -360,6 +366,13 @@ def test_parse_name(input, expect):
                 {"type": "personal", "given_name": "Joe", "family_name": "Pesci"},
             ],
         ),
+        # Syllabus: no mods/name but local/courseInfo/faculty
+        (
+            x(
+                "<local><courseInfo><faculty>Barbara Kruger</faculty></courseInfo></local>"
+            ),
+            [{"type": "personal", "given_name": "Barbara", "family_name": "Kruger"}],
+        ),
     ],
 )
 def test_creators(input, expect):
@@ -743,23 +756,30 @@ def test_dates(input, expect):
             x(
                 "<mods><typeOfResourceWrapper><typeOfResource>Event documentation</typeOfResource></typeOfResourceWrapper></mods>"
             ),
-            {"id": "event"},
+            "event",
         ),
         (  # multiple <typeOfResource> elements
             x(
                 "<mods><typeOfResourceWrapper><typeOfResource>moving image</typeOfResource><typeOfResource>mixed material</typeOfResource></typeOfResourceWrapper></mods>"
             ),
-            {"id": "image"},
+            "image",
+        ),
+        (  # Items in Syllabus Collection = publication-syllabus
+            {
+                "collection": {"uuid": syllabus_collection_uuid},
+                "metadata": "<xml></xml>",
+            },
+            "publication-syllabus",
         ),
         (  # default to publication
             x("<mods></mods>"),
-            {"id": "publication"},
+            "publication",
         ),
     ],
 )
 def test_type(input, expect):
     r = Record(input)
-    assert m(r)["resource_type"] == expect
+    assert m(r)["resource_type"].get("id", "") == expect
 
 
 # Publisher
diff --git a/migrate/utils.py b/migrate/utils.py
@@ -5,6 +5,10 @@
 
 from edtf import text_to_edtf
 
+# Constants
+cca_affiliation: list[dict[str, str]] = [{"id": "01mmcf932"}]  # ROR ID for CCA
+syllabus_collection_uuid: str = "9ec74523-e018-4e01-ab4e-be4dd06cdd68"
+
 
 # support three types of files: single item json, search results json with
 # multiple items in "results" property, and XML metadata with no item JSON