Skip to content

Commit 965b98e

Browse files
authored
BUG: Cope with different ISO date length (#1999)
Ensure compatibility with all optional date field formats as specified in the PDF 1.7 specification. closes #1972 closes #1996
1 parent b0cf830 commit 965b98e

File tree

3 files changed

+67
-12
lines changed

3 files changed

+67
-12
lines changed

pypdf/_reader.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
deprecation_no_replacement,
6060
deprecation_with_replacement,
6161
logger_warning,
62+
parse_iso8824_date,
6263
read_non_whitespace,
6364
read_previous_line,
6465
read_until_whitespace,
@@ -240,12 +241,7 @@ def producer_raw(self) -> Optional[str]:
240241
@property
241242
def creation_date(self) -> Optional[datetime]:
242243
"""Read-only property accessing the document's creation date."""
243-
text = self._get_text(DI.CREATION_DATE)
244-
if text is None:
245-
return None
246-
return datetime.strptime(
247-
text.replace("Z", "+").replace("'", ""), "D:%Y%m%d%H%M%S%z"
248-
)
244+
return parse_iso8824_date(self._get_text(DI.CREATION_DATE))
249245

250246
@property
251247
def creation_date_raw(self) -> Optional[str]:
@@ -264,12 +260,7 @@ def modification_date(self) -> Optional[datetime]:
264260
265261
The date and time the document was most recently modified.
266262
"""
267-
text = self._get_text(DI.MOD_DATE)
268-
if text is None:
269-
return None
270-
return datetime.strptime(
271-
text.replace("Z", "+").replace("'", ""), "D:%Y%m%d%H%M%S%z"
272-
)
263+
return parse_iso8824_date(self._get_text(DI.MOD_DATE))
273264

274265
@property
275266
def modification_date_raw(self) -> Optional[str]:

pypdf/_utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import logging
3434
import warnings
3535
from dataclasses import dataclass
36+
from datetime import datetime, timezone
3637
from io import DEFAULT_BUFFER_SIZE, BytesIO
3738
from os import SEEK_CUR
3839
from typing import (
@@ -76,6 +77,38 @@
7677
DEPR_MSG_HAPPENED = "{} is deprecated and was removed in pypdf {}. Use {} instead."
7778

7879

80+
def parse_iso8824_date(text: Optional[str]) -> Optional[datetime]:
    """
    Convert a PDF date string (``D:YYYYMMDDHHmmSSOHH'mm'``) to a ``datetime``.

    Every field after the year is optional, so the string is matched against
    progressively longer formats until one fits.

    Args:
        text: The raw date string, with or without the leading ``D:``.
            ``None`` or an empty string means that no date is available.

    Returns:
        The parsed ``datetime``, or ``None`` if ``text`` is ``None`` or empty.
        The result is timezone-aware only when the string carries a UTC
        offset (``Z``/``z``, ``+HH'mm'`` or ``-HH'mm'``); otherwise it is
        naive.

    Raises:
        ValueError: If ``text`` matches none of the supported layouts.
    """
    orgtext = text
    # An empty string must be caught here as well: indexing text[0] below
    # would otherwise raise IndexError instead of reporting "no date".
    if not text:
        return None
    # The "D:" prefix is optional in practice; add it so that a single set
    # of format strings covers both spellings.
    if text[0].isdigit():
        text = "D:" + text
    # A bare trailing "Z" carries no digits; give it an explicit zero offset.
    if text.endswith(("Z", "z")):
        text += "0000"
    # Rewrite "Z" as a "+" sign and drop the apostrophes so the offset has
    # the "+HHMM" shape that strptime's %z accepts.
    text = text.replace("z", "+").replace("Z", "+").replace("'", "")
    i = max(text.find("+"), text.find("-"))
    # A sign that is not followed by exactly four characters is treated as
    # an hours-only offset; pad it with zero minutes.
    if i > 0 and i != len(text) - 5:
        text += "00"
    for f in (
        "D:%Y",
        "D:%Y%m",
        "D:%Y%m%d",
        "D:%Y%m%d%H",
        "D:%Y%m%d%H%M",
        "D:%Y%m%d%H%M%S",
        "D:%Y%m%d%H%M%S%z",
    ):
        try:
            d = datetime.strptime(text, f)  # noqa: DTZ007
        except ValueError:
            continue
        else:
            # Normalise a zero offset to the canonical UTC tzinfo object.
            if text[-5:] == "+0000":
                d = d.replace(tzinfo=timezone.utc)
            return d
    raise ValueError(f"Can not convert date: {orgtext}")
110+
111+
79112
def _get_max_pdf_version_header(header1: bytes, header2: bytes) -> bytes:
80113
versions = (
81114
b"%PDF-1.3",

tests/test_utils.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
deprecation_no_replacement,
1616
mark_location,
1717
matrix_multiply,
18+
parse_iso8824_date,
1819
read_block_backwards,
1920
read_previous_line,
2021
read_until_regex,
@@ -337,3 +338,33 @@ def test_file_class():
337338
f = File(name="image.png", data=b"")
338339
assert str(f) == "File(name=image.png, data: 0 Byte)"
339340
assert repr(f) == "File(name=image.png, data: 0 Byte, hash: 0)"
341+
342+
343+
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("D:20210318000756", "2021-03-18T00:07:56"),
        ("20210318000756", "2021-03-18T00:07:56"),
        ("D:2021", "2021-01-01T00:00:00"),
        ("D:202103", "2021-03-01T00:00:00"),
        ("D:20210304", "2021-03-04T00:00:00"),
        ("D:2021030402", "2021-03-04T02:00:00"),
        ("D:20210408054711", "2021-04-08T05:47:11"),
        ("D:20210408054711Z", "2021-04-08T05:47:11+00:00"),
        ("D:20210408054711Z00", "2021-04-08T05:47:11+00:00"),
        ("D:20210408054711Z0000", "2021-04-08T05:47:11+00:00"),
        ("D:20210408075331+02'00'", "2021-04-08T07:53:31+02:00"),
        ("D:20210408075331-03'00'", "2021-04-08T07:53:31-03:00"),
    ],
)
def test_parse_datetime(text, expected):
    """Check that each supported PDF date layout parses to the expected value."""
    date = parse_iso8824_date(text)
    # Compare the full ISO string rather than a prefix of it: isoformat()
    # already includes the offset for aware values, and an exact match also
    # fails when a result that should be naive carries an unexpected offset.
    assert date.isoformat() == expected
364+
365+
366+
def test_parse_datetime_err():
    """Check the error raised for a malformed date and that offset-less input stays naive."""
    malformed = "D:20210408T054711Z"
    with pytest.raises(ValueError) as exc_info:
        parse_iso8824_date(malformed)
    message = exc_info.value.args[0]
    assert message == "Can not convert date: D:20210408T054711Z"

    # A date string without any offset must not be given a timezone.
    naive = parse_iso8824_date("D:20210408054711")
    assert naive.tzinfo is None

0 commit comments

Comments
 (0)