Skip to content

Commit 919d8b3

Browse files
committed
TIND, PyPDF: Make metadata consistent and coherent
Due to the way LangChain uses LanceDB, the Arrow schema cannot change between documents. By default, the PyPDF document loader from LangChain ingests all metadata from the PDF (from both properties and XMP) and saves them as keys in the ``metadata`` dictionary. Unfortunately, the PDFs we are processing do not use metadata keys uniformly across files. Some have Company set to ``UC Berkeley``, some don't; some have author information, some don't. What's more, the way we parse TIND records into further metadata meant that for a given key X, some records had a ``null`` value, some had a single ``str``, and some had a ``list`` of ``str``. This commit ensures that only the PDF metadata LangChain uses to process documents is included, and that all TIND properties are lists of strings. Empty values are now a list containing a single empty string. The alternative would be to define our own LanceDB connection, define our own schema ahead of time, and hope it never needs to change. It would also make LangChain integration more difficult. Additionally, fix up unit tests to handle metadata being lists now. Closes: AP-462
1 parent d565e2d commit 919d8b3

File tree

4 files changed

+20
-21
lines changed

4 files changed

+20
-21
lines changed

tests/tind/test_format_validate_pymarc.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def test_multiple_subfields_returns_array_of_values(self) -> None:
6060
def test_metadata_simple(self) -> None:
6161
"""Test parsing of a simple record into metadata."""
6262
result = format_validate_pymarc.pymarc_to_metadata(self.pymarc_records[0])
63-
self.assertEqual(result['tind_id'], '19217')
63+
self.assertEqual(result['tind_id'], ['19217'])
6464
self.assertListEqual(result['subject'], ['Ranches', 'Persea americana', 'Agriculture'])
6565

6666
def test_metadata_multi_subject(self) -> None:
@@ -72,7 +72,7 @@ def test_metadata_multi_subject(self) -> None:
7272
})
7373

7474
result = format_validate_pymarc.pymarc_to_metadata(self.pymarc_records[0])
75-
self.assertEqual(result['tind_id'], '19217')
75+
self.assertEqual(result['tind_id'], ['19217'])
7676
self.assertListEqual(result['subject'], ['Ranches', 'Persea americana', 'Agriculture',
7777
'Testing'])
7878

@@ -87,7 +87,7 @@ def test_metadata_multi_lists(self) -> None:
8787
})
8888

8989
result = format_validate_pymarc.pymarc_to_metadata(self.pymarc_records[0])
90-
self.assertEqual(result['tind_id'], '19217')
90+
self.assertEqual(result['tind_id'], ['19217'])
9191
self.assertListEqual(result['subject'], ['Ranches', 'Persea americana', 'Agriculture',
9292
'Element 1', 'Element 2'])
9393

@@ -102,7 +102,7 @@ def test_metadata_multi_none(self) -> None:
102102
})
103103

104104
result = format_validate_pymarc.pymarc_to_metadata(self.pymarc_records[0])
105-
self.assertEqual(result['tind_id'], '19217')
105+
self.assertEqual(result['tind_id'], ['19217'])
106106
self.assertListEqual(result['subject'], ['Ranches', 'Persea americana', 'Agriculture'])
107107

108108
format_validate_pymarc.parse_pymarc = parse_pymarc

willa/lcvendor/pypdf.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,11 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
122122
# Normalize key with others PDF parser
123123
new_metadata[map_key[k]] = v
124124
new_metadata[k] = v
125-
elif isinstance(v, str):
126-
new_metadata[k] = v.strip()
127-
elif isinstance(v, int):
128-
new_metadata[k] = v
125+
elif k in _STD_METADATA_KEYS:
126+
if isinstance(v, str):
127+
new_metadata[k] = v.strip()
128+
elif isinstance(v, int):
129+
new_metadata[k] = v
129130
return new_metadata
130131

131132

willa/tind/format_tind_context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def get_tind_context(docs: list) -> str:
3030
tind_data = ''
3131
tind_ids = {}
3232
for doc in docs:
33-
tind_id = doc.metadata['tind_metadata']['tind_id']
33+
tind_id = doc.metadata['tind_metadata']['tind_id'][0]
3434
if tind_id in tind_ids:
3535
continue
3636

@@ -58,6 +58,6 @@ def process_fields(tind_rec: dict) -> str:
5858
elif tind_rec[field] is not None:
5959
formatted_str += f"{DISPLAY_MAPPINGS[field]} {tind_rec[field]}\n\n"
6060

61-
formatted_str += f"Catalogue Link: {get_tind_url(tind_rec['tind_id'])}"
61+
formatted_str += f"Catalogue Link: {get_tind_url(tind_rec['tind_id'][0])}"
6262

6363
return formatted_str

willa/tind/format_validate_pymarc.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -138,29 +138,27 @@ def pymarc_to_metadata(record: Record) -> dict:
138138
"""
139139
marc_values = parse_pymarc(record)
140140

141-
metadata: dict[str, str | list | None] = {}
141+
metadata: dict[str, list] = {}
142142
for key, value in marc_values.items():
143+
if value is None:
144+
continue # Skip adding blanks to content.
145+
143146
meta_key = KEY_MAPPINGS[key]
144147
if meta_key in metadata:
145-
if value is None:
146-
continue # Skip adding blanks to existing content.
147-
148-
if isinstance(metadata[meta_key], str):
149-
metadata[meta_key] = [metadata[meta_key]] # Turn our str into a one-element list.
150-
elif metadata[meta_key] is None:
151-
metadata[meta_key] = []
152-
153148
if isinstance(value, list):
154149
# Add our list to the list.
155150
metadata[meta_key].extend(value) # type: ignore[union-attr]
156151
else:
157152
# Add our value to the list.
158153
metadata[meta_key].append(value) # type: ignore[union-attr]
159154
else:
160-
metadata[meta_key] = value
155+
if isinstance(value, list):
156+
metadata[meta_key] = value
157+
else:
158+
metadata[meta_key] = [value]
161159

162160
for meta_key in set(KEY_MAPPINGS.values()):
163161
if meta_key not in metadata:
164-
metadata[meta_key] = None
162+
metadata[meta_key] = ['']
165163

166164
return metadata

0 commit comments

Comments
 (0)