Skip to content

Commit e31d3bb

Browse files
authored
Added convenience method DocDetails.make_filename (#1006)
1 parent 4e56369 commit e31d3bb

File tree

2 files changed

+39
-1
lines changed

2 files changed

+39
-1
lines changed

paperqa/types.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -531,7 +531,13 @@ class DocDetails(Doc):
531531
)
532532
doi: str | None = None
533533
doi_url: str | None = None
534-
doc_id: str | None = None
534+
doc_id: str | None = Field(
535+
default=None,
536+
description=(
537+
"Unique ID for this document. Simple ways to acquire one include"
538+
" hashing the DOI or a stringifying a UUID."
539+
),
540+
)
535541
file_location: str | os.PathLike | None = None
536542
license: str | None = Field(
537543
default=None,
@@ -811,6 +817,29 @@ def __getitem__(self, item: str):
811817
except AttributeError:
812818
return self.other[item]
813819

820+
def make_filename(self, title_limit: int | None = 48) -> str:
821+
"""
822+
Make a filesystem-safe filename that has the doc ID appended, but no extension.
823+
824+
Args:
825+
title_limit: Character limit on the title.
826+
827+
Returns:
828+
Filename that is filesystem safe (e.g. non-safe chars are replaced with dash).
829+
"""
830+
if not self.title or not self.doc_id:
831+
raise ValueError("Unable to create filename without both title and doc_id.")
832+
# SEE: https://stackoverflow.com/a/71199182
833+
encoded_title = re.sub(
834+
r"[/\\?%*:|\"<>\x7F\x00-\x1F]", "-", self.title[:title_limit]
835+
)
836+
# NOTE: we append the doc ID for a few reasons:
837+
# 1. Prevent collisions for identical titles
838+
# SEE: https://stackoverflow.com/a/71761675
839+
# 2. Filenames shouldn't end in a period,
840+
# so append the doc ID to circumvent that gotcha
841+
return "_".join((encoded_title, self.doc_id))
842+
814843
@computed_field # type: ignore[prop-decorator]
815844
@property
816845
def formatted_citation(self) -> str:

tests/test_paperqa.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,6 +1551,7 @@ def test_docdetails_doc_id_roundtrip() -> None:
15511551
# now let's do this with a doi
15521552
doc_details_with_doi_no_doc_id = DocDetails(
15531553
doi=test_doi,
1554+
title=r"A Stub | \emph{Stub Title}",
15541555
docname="test_doc",
15551556
citation="Test Citation",
15561557
dockey="test_dockey",
@@ -1566,6 +1567,10 @@ def test_docdetails_doc_id_roundtrip() -> None:
15661567
assert (
15671568
doc_details_with_doi_no_doc_id.dockey == doc_details_with_doi_no_doc_id.doc_id
15681569
)
1570+
assert (
1571+
doc_details_with_doi_no_doc_id.make_filename()
1572+
== "A Stub - -emph{Stub Title}_7f8a71c920c202c5"
1573+
)
15691574

15701575
# round-trip serialization should keep the same doc_id
15711576
new_with_doi_no_doc_id = DocDetails(
@@ -1574,6 +1579,10 @@ def test_docdetails_doc_id_roundtrip() -> None:
15741579
assert (
15751580
new_with_doi_no_doc_id.doc_id == doc_details_with_doi_no_doc_id.doc_id
15761581
), "DocDetails with doc_id should keep the same doc_id after serialization"
1582+
assert (
1583+
new_with_doi_no_doc_id.make_filename()
1584+
== "A Stub - -emph{Stub Title}_7f8a71c920c202c5"
1585+
)
15771586

15781587
# since validation runs on assignment, make sure we can assign correctly
15791588
doc_details_with_doi_no_doc_id.doc_id = test_specified_doc_id

0 commit comments

Comments
 (0)