Skip to content

Commit 5e3f569

Browse files
committed
feat: syllabus import with creator and cca:course
add local/courseInfo/faculty -> Creators in crosswalk; some style edits/comment fixes in names.py; logic for Creator for syllabus items with no mods/name; add 2 constants to utils; tests for syllabus creator and resource type
1 parent 72e638e commit 5e3f569

File tree

5 files changed

+83
-16
lines changed

5 files changed

+83
-16
lines changed

docs/crosswalk.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ <h1>EQUELLA -> InvenioRDM Crosswalk</h1>
4848
VLDEPT["department"] --> CDEPTCODE["cca:course.department"]
4949
VCIDEPT["courseInfo/department"] --> CDEPT["cca:course.department_code"]
5050
VFACULTY["courseInfo/faculty"] --> CINSTRUCTORS["cca:course.instructors"]
51+
VFACULTY --> |If we do not have a mods/name| CREATORS["Creators (1-n)"]
5152
VSECTION["courseInfo/section"] --> CSECTION["cca:course.section"]
5253
VSEMESTER["courseInfo/semester"] --> CTERM["cca:course.term"]
5354
VSECTION --> CSECTIONCALCID["cca:course.section_calc_id constructed from section & term"]

migrate/names.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
""" Parse names and lists of names from a variety of formats into {given_name, family_name} dicts
2-
This is used by Record.creator only. It does not relate to the Invenio names.yaml vocabulary."""
1+
"""Parse names and lists of names from a variety of formats into {given_name, family_name} dicts
2+
This is used by Record.creator only. It does not relate to the Invenio names.yaml vocabulary.
3+
"""
34

45
import re
56

@@ -48,8 +49,8 @@ def n(d) -> dict[str, str]:
4849

4950

5051
def parse_name(namePart):
51-
"""parse wild variety of name strings into {givename, familyname}
52-
or, if it looks like an orgnaization name, return only {name}"""
52+
"""Parse wild variety of name strings into {given_name, family_name}
53+
or, if it looks like an organization name, return only {name}."""
5354

5455
# semi-colon separated list of names
5556
if "; " in namePart:
@@ -58,7 +59,7 @@ def parse_name(namePart):
5859
if " + " in namePart:
5960
return [parse_name(p) for p in namePart.split(" + ")]
6061

61-
# usually Surname, Givenname but sometimes other things
62+
# usually Surname, Given Name but sometimes other things
6263
if "," in namePart:
6364
# last, first
6465
parts = namePart.split(", ")
@@ -67,7 +68,7 @@ def parse_name(namePart):
6768
if "Calif.)" in namePart:
6869
return n({"name": namePart})
6970
return n({"given_name": parts[1], "family_name": parts[0]})
70-
# name with a DOB/dath date string after a second comma
71+
# name with a DOB/death date string after a second comma
7172
if len(parts) == 3 and re.match(r"[0-9]{4}\-([0-9]{4})?", parts[2].strip()):
7273
return n({"given_name": parts[1], "family_name": parts[0]})
7374
# two or more commas, maybe we have a comma-separated list of names?

migrate/record.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,15 @@
1616

1717
from names import parse_name
1818
from maps import *
19-
from utils import find_items, get_url, mklist, to_edtf, visual_mime_type_sort
19+
from utils import (
20+
cca_affiliation,
21+
find_items,
22+
get_url,
23+
mklist,
24+
syllabus_collection_uuid,
25+
to_edtf,
26+
visual_mime_type_sort,
27+
)
2028
from subjects import find_subjects, Subject
2129

2230

@@ -56,6 +64,7 @@ def __init__(self, item):
5664
self.title: str = item.get("name", "Untitled")
5765
# default to current date in ISO 8601 format
5866
self.createdDate: str = item.get("createdDate", date.today().isoformat())
67+
self.vault_collection: str = item.get("collection", {}).get("uuid", "")
5968
self.vault_url: str = ""
6069
if item.get("uuid") and item.get("version"):
6170
self.vault_url = (
@@ -147,9 +156,8 @@ def course(self) -> dict[str, Any] | None:
147156
def creators(self) -> list[dict[str, Any]]:
148157
# mods/name
149158
# https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n
150-
namesx = mklist(self.xml.get("mods", {}).get("name"))
151-
creators = []
152-
for namex in namesx:
159+
creators: list[dict[str, Any]] = []
160+
for namex in mklist(self.xml.get("mods", {}).get("name")):
153161
# @usage = primary, secondary | ignoring this but could say sec. -> contributor, not creator
154162
partsx = namex.get("namePart")
155163
if type(partsx) == str:
@@ -196,9 +204,9 @@ def creators(self) -> list[dict[str, Any]]:
196204
if type(names) == dict:
197205
creators.append(
198206
{
207+
"affiliations": creator["affiliations"],
199208
"person_or_org": names,
200209
"role": creator["role"],
201-
"affiliations": creator["affiliations"],
202210
}
203211
)
204212
# implies type(names) == list, similar to below, if parse_name returns a
@@ -224,6 +232,34 @@ def creators(self) -> list[dict[str, Any]]:
224232
for partx in partsx:
225233
for name in mklist(parse_name(partx)):
226234
creators.append({"person_or_org": name})
235+
236+
# Syllabi: we have no mods/name but list faculty in courseInfo/faculty
237+
if len(creators) == 0:
238+
faculty = self.xml.get("local", {}).get("courseInfo", {}).get("faculty")
239+
if faculty:
240+
names = parse_name(faculty)
241+
if type(names) == dict:
242+
creators.append(
243+
{
244+
"affiliations": cca_affiliation,
245+
"person_or_org": names,
246+
"role": {"id": "creator"},
247+
}
248+
)
249+
elif type(names) == list:
250+
for name in names:
251+
creators.append(
252+
{
253+
"affiliations": cca_affiliation,
254+
"person_or_org": name,
255+
"role": {"id": "creator"},
256+
}
257+
)
258+
259+
# If we _still_ have no creators, we cannot create a record b/c it is a
260+
# required field but for our test data I do not want to specify creators.
261+
if len(creators) == 0 and "pytest" not in sys.modules:
262+
raise Exception(f"Record has no creators: {self.title}\n{self.vault_url}")
227263
return creators
228264

229265
@property
@@ -478,6 +514,11 @@ def resource_type(self) -> dict[str, str]:
478514
# There are many fields that could be used to determine the resource type. Priority:
479515
# 1. mods/typeOfResource, 2. local/courseWorkType, 3. TBD (there are more...)
480516
# mods/typeOfResourceWrapper/typeOfResource
517+
518+
# Syllabus Collection only contains syllabi
519+
if self.vault_collection == syllabus_collection_uuid:
520+
return {"id": "publication-syllabus"}
521+
481522
# Take the first typeOfResource value we find
482523
wrapper = self.xml.get("mods", {}).get("typeOfResourceWrapper")
483524
if type(wrapper) == list:

migrate/tests.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,13 @@
55
from names import parse_name
66
from record import Record
77
from subjects import find_subjects, subjects_from_xmldict, Subject, TYPES
8-
from utils import get_url, mklist, to_edtf, visual_mime_type_sort
8+
from utils import (
9+
get_url,
10+
mklist,
11+
syllabus_collection_uuid,
12+
to_edtf,
13+
visual_mime_type_sort,
14+
)
915

1016

1117
@pytest.mark.parametrize(
@@ -360,6 +366,13 @@ def test_parse_name(input, expect):
360366
{"type": "personal", "given_name": "Joe", "family_name": "Pesci"},
361367
],
362368
),
369+
# Syllabus: no mods/name but local/courseInfo/faculty
370+
(
371+
x(
372+
"<local><courseInfo><faculty>Barbara Kruger</faculty></courseInfo></local>"
373+
),
374+
[{"type": "personal", "given_name": "Barbara", "family_name": "Kruger"}],
375+
),
363376
],
364377
)
365378
def test_creators(input, expect):
@@ -743,23 +756,30 @@ def test_dates(input, expect):
743756
x(
744757
"<mods><typeOfResourceWrapper><typeOfResource>Event documentation</typeOfResource></typeOfResourceWrapper></mods>"
745758
),
746-
{"id": "event"},
759+
"event",
747760
),
748761
( # multiple <typeOfResource> elements
749762
x(
750763
"<mods><typeOfResourceWrapper><typeOfResource>moving image</typeOfResource><typeOfResource>mixed material</typeOfResource></typeOfResourceWrapper></mods>"
751764
),
752-
{"id": "image"},
765+
"image",
766+
),
767+
( # Items in Syllabus Collection = publication-syllabus
768+
{
769+
"collection": {"uuid": syllabus_collection_uuid},
770+
"metadata": "<xml></xml>",
771+
},
772+
"publication-syllabus",
753773
),
754774
( # default to publication
755775
x("<mods></mods>"),
756-
{"id": "publication"},
776+
"publication",
757777
),
758778
],
759779
)
760780
def test_type(input, expect):
761781
r = Record(input)
762-
assert m(r)["resource_type"] == expect
782+
assert m(r)["resource_type"].get("id", "") == expect
763783

764784

765785
# Publisher

migrate/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55

66
from edtf import text_to_edtf
77

8+
# Constants
9+
cca_affiliation: list[dict[str, str]] = [{"id": "01mmcf932"}] # ROR ID for CCA
10+
syllabus_collection_uuid: str = "9ec74523-e018-4e01-ab4e-be4dd06cdd68"
11+
812

913
# support three types of files: single item json, search results json with
1014
# multiple items in "results" property, and XML metadata with no item JSON

0 commit comments

Comments
 (0)