Skip to content

Commit 29a49c6

Browse files
feat(datetime): Migrate standard datetime parsing to whenever library
Co-Authored-By: Aaron <AJ> Steers <[email protected]>
1 parent 34f24f2 commit 29a49c6

File tree

3 files changed: +274 -17 lines changed

airbyte_cdk/utils/datetime_helpers.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@
8686

8787
from dateutil import parser
8888
from typing_extensions import Never
89-
from whenever import Instant, LocalDateTime, ZonedDateTime
89+
from whenever import Instant, LocalDateTime, OffsetDateTime, ZonedDateTime
9090

9191

9292
class AirbyteDateTime(datetime):
@@ -423,7 +423,40 @@ def ab_datetime_parse(dt_str: str | int) -> AirbyteDateTime:
423423
if ":" in dt_str and dt_str.count("-") < 2 and dt_str.count("/") < 2:
424424
raise ValueError(f"Missing date part in datetime string: {dt_str}")
425425

426-
# Try parsing with dateutil for timezone handling
426+
# Try parsing standard ISO/RFC formats with whenever
427+
# Only attempt whenever parsing for specific ISO/RFC formats
428+
if (
429+
isinstance(dt_str, str)
430+
and "/" not in dt_str # Exclude non-standard date separators
431+
and (
432+
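# ISO format with T delimiter and a Z, +HH:MM or -HH:MM offset (note: "-" also matches the date separators, so any hyphenated "T"-delimited string passes this check)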
433+
(("T" in dt_str) and ("Z" in dt_str or "+" in dt_str or "-" in dt_str))
434+
# ISO format with space delimiter and Z timezone
435+
or (" " in dt_str and "Z" in dt_str)
436+
)
437+
):
438+
# First try Instant.parse_common_iso for UTC formats
439+
try:
440+
instant = Instant.parse_common_iso(dt_str)
441+
return AirbyteDateTime.from_datetime(instant.py_datetime())
442+
except Exception:
443+
pass
444+
445+
# Then try Instant.parse_rfc3339 which is more flexible
446+
try:
447+
instant = Instant.parse_rfc3339(dt_str)
448+
return AirbyteDateTime.from_datetime(instant.py_datetime())
449+
except Exception:
450+
pass
451+
452+
# Try OffsetDateTime for non-UTC timezones
453+
try:
454+
offset_dt = OffsetDateTime.parse_common_iso(dt_str)
455+
return AirbyteDateTime.from_datetime(offset_dt.py_datetime())
456+
except Exception:
457+
pass
458+
459+
# Fall back to dateutil for other formats
427460
try:
428461
parsed = parser.parse(dt_str)
429462
if parsed.tzinfo is None:
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
import time
2+
import statistics
3+
from datetime import datetime, timezone
4+
from dateutil import parser
5+
from whenever import Instant, OffsetDateTime
6+
7+
from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse
8+
9+
# Test formats
10+
test_formats = [
11+
'2023-03-14T15:09:26Z', # ISO format with T delimiter and Z timezone
12+
'2023-03-14T15:09:26+00:00', # ISO format with +00:00 timezone
13+
'2023-03-14T15:09:26.123456Z', # ISO format with microseconds
14+
'2023-03-14T15:09:26-04:00', # ISO format with non-UTC timezone
15+
'2023-03-14 15:09:26Z', # Missing T delimiter
16+
'2023-03-14 15:09:26', # Missing T delimiter and timezone
17+
'2023-03-14', # Date-only format
18+
'14/03/2023 15:09:26', # Different date format
19+
'2023/03/14T15:09:26Z', # Non-standard date separator
20+
]
21+
22+
# Number of iterations for each test
23+
iterations = 1000
24+
25+
print("Performance Test: Datetime Parsing")
26+
print("=" * 60)
27+
print(f"Running {iterations} iterations for each format")
28+
print("-" * 60)
29+
30+
results = {}
31+
32+
for dt_str in test_formats:
33+
print(f"\nFormat: {dt_str}")
34+
35+
# Test whenever parsing
36+
whenever_times = []
37+
whenever_success = False
38+
39+
# Try Instant.parse_common_iso
40+
try:
41+
# Warmup
42+
Instant.parse_common_iso(dt_str)
43+
44+
for _ in range(iterations):
45+
start = time.perf_counter()
46+
Instant.parse_common_iso(dt_str)
47+
end = time.perf_counter()
48+
whenever_times.append((end - start) * 1000)
49+
50+
whenever_success = True
51+
whenever_method = "Instant.parse_common_iso"
52+
except Exception:
53+
pass
54+
55+
# If parse_common_iso failed, try parse_rfc3339
56+
if not whenever_success:
57+
try:
58+
# Warmup
59+
Instant.parse_rfc3339(dt_str)
60+
61+
whenever_times = []
62+
for _ in range(iterations):
63+
start = time.perf_counter()
64+
Instant.parse_rfc3339(dt_str)
65+
end = time.perf_counter()
66+
whenever_times.append((end - start) * 1000)
67+
68+
whenever_success = True
69+
whenever_method = "Instant.parse_rfc3339"
70+
except Exception:
71+
pass
72+
73+
# If both Instant methods failed, try OffsetDateTime.parse_common_iso
74+
if not whenever_success:
75+
try:
76+
# Warmup
77+
OffsetDateTime.parse_common_iso(dt_str)
78+
79+
whenever_times = []
80+
for _ in range(iterations):
81+
start = time.perf_counter()
82+
OffsetDateTime.parse_common_iso(dt_str)
83+
end = time.perf_counter()
84+
whenever_times.append((end - start) * 1000)
85+
86+
whenever_success = True
87+
whenever_method = "OffsetDateTime.parse_common_iso"
88+
except Exception:
89+
whenever_method = "None"
90+
91+
# Test dateutil parsing
92+
dateutil_times = []
93+
try:
94+
# Warmup
95+
parser.parse(dt_str)
96+
97+
for _ in range(iterations):
98+
start = time.perf_counter()
99+
parser.parse(dt_str)
100+
end = time.perf_counter()
101+
dateutil_times.append((end - start) * 1000)
102+
103+
dateutil_success = True
104+
except Exception:
105+
dateutil_success = False
106+
107+
# Test ab_datetime_parse
108+
ab_times = []
109+
try:
110+
# Warmup
111+
ab_datetime_parse(dt_str)
112+
113+
for _ in range(iterations):
114+
start = time.perf_counter()
115+
ab_datetime_parse(dt_str)
116+
end = time.perf_counter()
117+
ab_times.append((end - start) * 1000)
118+
119+
ab_success = True
120+
except Exception:
121+
ab_success = False
122+
123+
# Print results
124+
if whenever_success:
125+
whenever_avg = statistics.mean(whenever_times)
126+
whenever_min = min(whenever_times)
127+
whenever_max = max(whenever_times)
128+
print(f" {whenever_method}: avg={whenever_avg:.3f}ms, min={whenever_min:.3f}ms, max={whenever_max:.3f}ms")
129+
else:
130+
print(f" whenever: Failed to parse")
131+
132+
if dateutil_success:
133+
dateutil_avg = statistics.mean(dateutil_times)
134+
dateutil_min = min(dateutil_times)
135+
dateutil_max = max(dateutil_times)
136+
print(f" dateutil.parser.parse: avg={dateutil_avg:.3f}ms, min={dateutil_min:.3f}ms, max={dateutil_max:.3f}ms")
137+
else:
138+
print(f" dateutil.parser.parse: Failed to parse")
139+
140+
if ab_success:
141+
ab_avg = statistics.mean(ab_times)
142+
ab_min = min(ab_times)
143+
ab_max = max(ab_times)
144+
print(f" ab_datetime_parse: avg={ab_avg:.3f}ms, min={ab_min:.3f}ms, max={ab_max:.3f}ms")
145+
else:
146+
print(f" ab_datetime_parse: Failed to parse")
147+
148+
# Compare performance if both succeeded
149+
if whenever_success and dateutil_success:
150+
speedup = dateutil_avg / whenever_avg
151+
print(f" Performance: whenever is {speedup:.2f}x faster than dateutil")
152+
153+
# Store results for summary
154+
results[dt_str] = {
155+
'whenever': {
156+
'success': whenever_success,
157+
'method': whenever_method,
158+
'avg': statistics.mean(whenever_times) if whenever_success else None,
159+
},
160+
'dateutil': {
161+
'success': dateutil_success,
162+
'avg': statistics.mean(dateutil_times) if dateutil_success else None,
163+
},
164+
'ab_datetime_parse': {
165+
'success': ab_success,
166+
'avg': statistics.mean(ab_times) if ab_success else None,
167+
}
168+
}
169+
170+
# Print summary
171+
print("\n\nSummary")
172+
print("=" * 60)
173+
print("Format | whenever | dateutil | ab_datetime_parse | Speedup")
174+
print("-" * 60)
175+
176+
for dt_str, result in results.items():
177+
whenever_avg = result['whenever']['avg']
178+
dateutil_avg = result['dateutil']['avg']
179+
ab_avg = result['ab_datetime_parse']['avg']
180+
181+
whenever_str = f"{whenever_avg:.3f}ms" if whenever_avg else "N/A"
182+
dateutil_str = f"{dateutil_avg:.3f}ms" if dateutil_avg else "N/A"
183+
ab_str = f"{ab_avg:.3f}ms" if ab_avg else "N/A"
184+
185+
if whenever_avg and dateutil_avg:
186+
speedup = dateutil_avg / whenever_avg
187+
speedup_str = f"{speedup:.2f}x"
188+
else:
189+
speedup_str = "N/A"
190+
191+
print(f"{dt_str} | {whenever_str} | {dateutil_str} | {ab_str} | {speedup_str}")

unit_tests/utils/test_datetime_helpers.py

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -272,11 +272,15 @@ def test_epoch_millis():
272272
# Formats that use the whenever parser
273273
("2023-03-14", "whenever"), # Date-only format
274274
(1678806566, "whenever"), # Unix timestamp
275-
# Formats that use the dateutil parser
276-
("2023-03-14T15:09:26Z", "dateutil"), # ISO format with T delimiter
277-
("2023-03-14T15:09:26+00:00", "dateutil"), # ISO format with timezone
278-
("2023-03-14 15:09:26", "dateutil"), # Missing T delimiter
275+
("2023-03-14T15:09:26Z", "whenever"), # ISO format with T delimiter
276+
("2023-03-14T15:09:26+00:00", "whenever"), # ISO format with timezone
277+
("2023-03-14T15:09:26.123456Z", "whenever"), # ISO format with microseconds
278+
("2023-03-14T15:09:26-04:00", "whenever"), # ISO format with non-UTC timezone
279+
("2023-03-14 15:09:26Z", "whenever"), # Missing T delimiter but with Z
280+
# Formats that still use the dateutil parser
281+
("2023-03-14 15:09:26", "dateutil"), # Missing T delimiter and timezone
279282
("14/03/2023 15:09:26", "dateutil"), # Different date format
283+
("2023/03/14T15:09:26Z", "dateutil"), # Non-standard date separator
280284
],
281285
)
282286
def test_datetime_parser_selection(input_value, expected_parser, monkeypatch):
@@ -286,31 +290,43 @@ def test_datetime_parser_selection(input_value, expected_parser, monkeypatch):
286290
dateutil_called = False
287291

288292
# Store original functions
289-
original_instant_module = __import__("whenever").Instant
293+
original_instant = __import__("whenever").Instant
294+
original_offset_dt = __import__("whenever").OffsetDateTime
290295
original_parser_parse = parser.parse
291296

292-
# Create a spy for Instant.from_timestamp
297+
# Create spies for whenever methods
293298
def spy_from_timestamp(*args, **kwargs):
294299
nonlocal whenever_called
295300
whenever_called = True
296-
# Call the original function
297-
return original_instant_module.from_timestamp(*args, **kwargs)
301+
return original_instant.from_timestamp(*args, **kwargs)
298302

299-
# Create a spy for Instant.from_utc
300303
def spy_from_utc(*args, **kwargs):
301304
nonlocal whenever_called
302305
whenever_called = True
303-
# Call the original function
304-
return original_instant_module.from_utc(*args, **kwargs)
306+
return original_instant.from_utc(*args, **kwargs)
307+
308+
def spy_parse_common_iso(*args, **kwargs):
309+
nonlocal whenever_called
310+
whenever_called = True
311+
return original_instant.parse_common_iso(*args, **kwargs)
312+
313+
def spy_parse_rfc3339(*args, **kwargs):
314+
nonlocal whenever_called
315+
whenever_called = True
316+
return original_instant.parse_rfc3339(*args, **kwargs)
317+
318+
def spy_offset_parse_common_iso(*args, **kwargs):
319+
nonlocal whenever_called
320+
whenever_called = True
321+
return original_offset_dt.parse_common_iso(*args, **kwargs)
305322

306323
# Create a spy for parser.parse
307324
def spy_parser_parse(*args, **kwargs):
308325
nonlocal dateutil_called
309326
dateutil_called = True
310-
# Call the original function
311327
return original_parser_parse(*args, **kwargs)
312328

313-
# Create a mock Instant class with our spy methods
329+
# Create mock classes with our spy methods
314330
class MockInstant:
315331
@staticmethod
316332
def from_timestamp(*args, **kwargs):
@@ -320,13 +336,30 @@ def from_timestamp(*args, **kwargs):
320336
def from_utc(*args, **kwargs):
321337
return spy_from_utc(*args, **kwargs)
322338

323-
# Add any other methods that might be called
339+
@staticmethod
340+
def parse_common_iso(*args, **kwargs):
341+
return spy_parse_common_iso(*args, **kwargs)
342+
343+
@staticmethod
344+
def parse_rfc3339(*args, **kwargs):
345+
return spy_parse_rfc3339(*args, **kwargs)
346+
347+
@staticmethod
348+
def py_datetime():
349+
return original_instant.py_datetime()
350+
351+
class MockOffsetDateTime:
352+
@staticmethod
353+
def parse_common_iso(*args, **kwargs):
354+
return spy_offset_parse_common_iso(*args, **kwargs)
355+
324356
@staticmethod
325357
def py_datetime():
326-
return original_instant_module.py_datetime()
358+
return original_offset_dt.py_datetime()
327359

328360
# Apply the mocks at the module level
329361
monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant)
362+
monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.OffsetDateTime", MockOffsetDateTime)
330363
monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", spy_parser_parse)
331364

332365
# Parse the datetime

0 commit comments

Comments
 (0)