Skip to content

Commit 6a41b67

Browse files
committed
Switch date-time validation to regex
Benchmarking showed that the manual, non-regex implementation is significantly slower than using a regex to do most of the work. This commit therefore implements a regex-based check, with a citation to the prior art in rfc3339-validator.
1 parent 749e8a6 commit 6a41b67

File tree

1 file changed

+89
-109
lines changed
  • src/check_jsonschema/formats/implementations

1 file changed

+89
-109
lines changed
Lines changed: 89 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -1,118 +1,98 @@
1-
def validate(date_str: str) -> bool:
2-
"""Validate a string as a RFC3339 date-time.
1+
import re
2+
import typing as t
33

4-
This check does the fastest possible validation of the date string (in Python),
5-
deferring operations as much as possible to avoid unnecessary work.
6-
"""
7-
try:
8-
# the following chars MUST be fixed values:
9-
# YYYY-MM-DDTHH:MM:SSZ
10-
# ^ ^ ^ ^ ^
11-
#
12-
# so start by checking them first
13-
# this keeps us as fast as possible in failure cases
14-
#
15-
# (note: "T" and "t" are both allowed under ISO8601)
16-
if (
17-
date_str[4] != "-"
18-
or date_str[7] != "-"
19-
or date_str[10] not in ("T", "t")
20-
or date_str[13] != ":"
21-
or date_str[16] != ":"
22-
):
23-
return False
4+
# this regex is based on the one from the rfc3339-validator package
5+
# credit to the original author
6+
# original license:
7+
#
8+
# MIT License
9+
#
10+
# Copyright (c) 2019, Nicolas Aimetti
11+
#
12+
# Permission is hereby granted, free of charge, to any person obtaining a copy
13+
# of this software and associated documentation files (the "Software"), to deal
14+
# in the Software without restriction, including without limitation the rights
15+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16+
# copies of the Software, and to permit persons to whom the Software is
17+
# furnished to do so, subject to the following conditions:
18+
#
19+
# The above copyright notice and this permission notice shall be included in all
20+
# copies or substantial portions of the Software.
21+
#
22+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28+
# SOFTWARE.
29+
#
30+
# modifications have been made for additional corner cases and speed
31+
RFC3339_REGEX = re.compile(
32+
r"""
33+
^
34+
(?:\d{4})
35+
-
36+
(?:0[1-9]|1[0-2])
37+
-
38+
(?:\d{2})
39+
(?:T|t)
40+
(?:[01]\d|2[0123])
41+
:
42+
(?:[0-5]\d)
43+
:
44+
(?:[0-5]\d)
45+
# (optional) fractional seconds
46+
(?:(\.|,)\d+)?
47+
# UTC or offset
48+
(?:
49+
Z
50+
| z
51+
| [+-](?:[01]\d|2[0123]):[0-5]\d
52+
)
53+
$
54+
""",
55+
re.VERBOSE | re.ASCII,
56+
)
2457

25-
# check for fractional seconds, which pushes the location of the offset/Z
26-
# record the discovered start position of the offset segment
27-
offset_start = 19
28-
if date_str[19] in (".", ","):
29-
offset_start = date_str.find("Z", 20)
30-
if offset_start == -1:
31-
offset_start = date_str.find("z", 20)
32-
if offset_start == -1:
33-
offset_start = date_str.find("+", 20)
34-
if offset_start == -1:
35-
offset_start = date_str.find("-", 20)
36-
# if we can't find an offset after `.` or `,` as a separator, it's wrong
37-
if offset_start == -1:
38-
return False
3958

40-
# fractional seconds are checked to be numeric
41-
# the spec seems to allow for any number of digits (?) so there's no
42-
# length check here
43-
frac_seconds = date_str[20:offset_start]
44-
if not frac_seconds:
45-
return False
46-
if not frac_seconds.isnumeric():
47-
return False
59+
def validate(date_str: t.Any) -> bool:
60+
"""Validate a string as a RFC3339 date-time."""
61+
if not isinstance(date_str, str):
62+
return False
63+
if not RFC3339_REGEX.match(date_str):
64+
return False
4865

49-
# now, handle Z vs offset
50-
# (note: "Z" and "z" are both allowed under ISO8601)
51-
z_offset = date_str[offset_start:] in ("Z", "z")
52-
if z_offset and len(date_str) != offset_start + 1:
53-
return False
54-
if not z_offset:
55-
if len(date_str) != offset_start + 6:
56-
return False
57-
if date_str[offset_start] not in ("+", "-"):
58-
return False
59-
if date_str[offset_start + 3] != ":":
60-
return False
66+
year, month, day = int(date_str[:4]), int(date_str[5:7]), int(date_str[8:10])
6167

62-
year = date_str[:4]
63-
if not year.isnumeric():
64-
return False
65-
year_val = int(year)
68+
if month in {4, 6, 9, 11}:
69+
max_day = 30
70+
elif month == 2:
71+
max_day = 29 if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0) else 28
72+
else:
73+
max_day = 31
74+
if not 1 <= day <= max_day:
75+
return False
76+
return True
6677

67-
month = date_str[5:7]
68-
if not month.isnumeric():
69-
return False
70-
month_val = int(month)
71-
if not 1 <= month_val <= 12:
72-
return False
7378

74-
day = date_str[8:10]
75-
if not day.isnumeric():
76-
return False
77-
max_day = 31
78-
if month_val in (4, 6, 9, 11):
79-
max_day = 30
80-
elif month_val == 2:
81-
max_day = (
82-
29
83-
if year_val % 4 == 0 and (year_val % 100 != 0 or year_val % 400 == 0)
84-
else 28
85-
)
86-
if not 1 <= int(day) <= max_day:
87-
return False
79+
if __name__ == "__main__":
80+
import timeit
8881

89-
hour = date_str[11:13]
90-
if not hour.isnumeric():
91-
return False
92-
if not 0 <= int(hour) <= 23:
93-
return False
94-
minute = date_str[14:16]
95-
if not minute.isnumeric():
96-
return False
97-
if not 0 <= int(minute) <= 59:
98-
return False
99-
second = date_str[17:19]
100-
if not second.isnumeric():
101-
return False
102-
if not 0 <= int(second) <= 59:
103-
return False
82+
N = 100_000
83+
long_fracsec = "2018-12-31T23:59:59.8446519776713Z"
84+
basic = "2018-12-31T23:59:59Z"
85+
in_february = "2018-02-12T23:59:59Z"
86+
in_february_invalid = "2018-02-29T23:59:59Z"
10487

105-
if not z_offset:
106-
offset_hour = date_str[offset_start + 1 : offset_start + 3]
107-
if not offset_hour.isnumeric():
108-
return False
109-
if not 0 <= int(offset_hour) <= 23:
110-
return False
111-
offset_minute = date_str[offset_start + 4 : offset_start + 6]
112-
if not offset_minute.isnumeric():
113-
return False
114-
if not 0 <= int(offset_minute) <= 59:
115-
return False
116-
except (IndexError, ValueError):
117-
return False
118-
return True
88+
print("benchmarking")
89+
for name, val in (
90+
("long_fracsec", long_fracsec),
91+
("basic", basic),
92+
("february", in_february),
93+
("february_invalid", in_february_invalid),
94+
):
95+
all_times = timeit.repeat(
96+
f"validate({val!r})", globals=globals(), repeat=3, number=N
97+
)
98+
print(f"{name} (valid={validate(val)}): {int(min(all_times) / N * 10**9)}ns")

0 commit comments

Comments
 (0)