Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 62 additions & 50 deletions airbyte_cdk/utils/datetime_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@
from datetime import datetime, timedelta, timezone
from typing import Any, Optional, Union, overload

from dateutil import parser
from dateutil import parser as dateutil_parser
from typing_extensions import Never
from whenever import Instant, LocalDateTime, ZonedDateTime
from whenever import Instant, LocalDateTime, OffsetDateTime, ZonedDateTime


class AirbyteDateTime(datetime):
Expand Down Expand Up @@ -358,7 +358,12 @@ def ab_datetime_now() -> AirbyteDateTime:
return AirbyteDateTime.from_datetime(datetime.now(timezone.utc))


def ab_datetime_parse(
    dt_str: str | int,
    formats: list[str] | None = None,
    *,
    disallow_other_formats: bool = False,
) -> AirbyteDateTime:
    """Parse a datetime string or Unix timestamp into a timezone-aware AirbyteDateTime.

    Parsing is attempted in the following order; the first step that succeeds wins:

    1. Integers are treated as Unix timestamps in seconds (UTC).
    2. If ``formats`` is given, each pattern is tried in order with
       ``OffsetDateTime.strptime``.
    3. ``Instant.parse_common_iso`` is tried as the fast path for ISO strings.
    4. Integer-like strings (optionally with a leading ``-``) are treated as
       Unix timestamps and validated exactly like integers.
    5. ``YYYY-MM-DD`` date-only strings become midnight UTC of that day.
    6. Everything else is handed to ``dateutil``; a naive result is assumed to be UTC.

    Args:
        dt_str: The datetime string or integer Unix timestamp to parse.
        formats: Optional strptime patterns to try before the generic parsers.
            NOTE(review): whenever's ``OffsetDateTime.strptime`` is expected to
            require an offset directive (``%z``) in the pattern - confirm against
            the installed whenever version.
        disallow_other_formats: When true and none of ``formats`` matches, raise
            instead of falling back to the generic parsers. Has no effect when
            ``formats`` is empty or ``None``.

    Returns:
        The parsed value as an ``AirbyteDateTime``.

    Raises:
        ValueError: If the input is neither ``str`` nor ``int``, is a negative or
            more-than-10-digit timestamp, has no date part, or cannot be parsed
            by any of the strategies above.

    Examples:
        >>> ab_datetime_parse("2023-03-14")  # Date-only
        '2023-03-14T00:00:00+00:00'
    """
    # Handle integers as Unix timestamps (UTC)
    if isinstance(dt_str, int):
        if dt_str < 0:
            raise ValueError("Timestamp cannot be negative")
        # More than 10 digits cannot be a timestamp in seconds (likely ms/us)
        if len(str(dt_str)) > 10:
            raise ValueError("Timestamp value too large")
        instant = Instant.from_timestamp(dt_str)
        return AirbyteDateTime.from_datetime(instant.py_datetime())

    if not isinstance(dt_str, str):
        raise ValueError(
            f"Could not parse datetime string: expected string or integer, got {type(dt_str)}"
        )

    # Caller-supplied strptime patterns take precedence over the generic parsers
    if formats:
        format_errors: list[Exception] = []
        for fmt in formats:
            try:
                offset_dt = OffsetDateTime.strptime(dt_str, fmt)
            except Exception as ex:
                # Collected so the failure can be reported if every pattern fails
                format_errors.append(ex)
            else:
                # Convert so every code path returns the declared AirbyteDateTime
                return AirbyteDateTime.from_datetime(offset_dt.py_datetime())

        if disallow_other_formats:
            raise ValueError(
                f"Could not parse datetime string with provided formats [{formats}]: {str(format_errors)}"
            )

    # Fast path: standard ISO/RFC strings parsed by whenever
    try:
        instant = Instant.parse_common_iso(dt_str)
    except Exception:
        # Deliberate best-effort: not a strict ISO instant, try the slower strategies
        pass
    else:
        return AirbyteDateTime.from_datetime(instant.py_datetime())

    # Integer-like strings are Unix timestamps; reuse the integer validation above
    # so the "negative" / "too large" errors propagate instead of being swallowed.
    if dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit()):
        return ab_datetime_parse(int(dt_str))

    # Date-only strings (YYYY-MM-DD) are validated explicitly and mapped to midnight UTC
    if ":" not in dt_str and dt_str.count("-") == 2 and "/" not in dt_str:
        try:
            year, month, day = map(int, dt_str.split("-"))
            if not (1 <= month <= 12 and 1 <= day <= 31):
                raise ValueError(f"Invalid date format: {dt_str}")
            instant = Instant.from_utc(year, month, day, 0, 0, 0)
        except (ValueError, TypeError) as err:
            raise ValueError(f"Invalid date format: {dt_str}") from err
        return AirbyteDateTime.from_datetime(instant.py_datetime())

    # Reject time-only strings: dateutil would otherwise fill in a date by itself
    if ":" in dt_str and dt_str.count("-") < 2 and dt_str.count("/") < 2:
        raise ValueError(f"Missing date part in datetime string: {dt_str}")

    # Fall back to dateutil for other formats
    try:
        parsed = dateutil_parser.parse(dt_str)
    except (ValueError, TypeError, OverflowError) as err:
        raise ValueError(
            f"Could not parse datetime string: {dt_str}, ({type(dt_str).__name__})"
        ) from err

    # Naive results are assumed to be UTC
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)

    return AirbyteDateTime.from_datetime(parsed)


def ab_datetime_try_parse(dt_str: str) -> AirbyteDateTime | None:
Expand Down
196 changes: 196 additions & 0 deletions performance_test_datetime_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import statistics
import time
from datetime import datetime, timezone

from dateutil import parser
from whenever import Instant, OffsetDateTime

from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse

# Representative inputs, ranging from strict ISO 8601 to loosely formatted strings
test_formats = [
    "2023-03-14T15:09:26Z",  # ISO format with T delimiter and Z timezone
    "2023-03-14T15:09:26+00:00",  # ISO format with +00:00 timezone
    "2023-03-14T15:09:26.123456Z",  # ISO format with microseconds
    "2023-03-14T15:09:26-04:00",  # ISO format with non-UTC timezone
    "2023-03-14 15:09:26Z",  # Missing T delimiter
    "2023-03-14 15:09:26",  # Missing T delimiter and timezone
    "2023-03-14",  # Date-only format
    "14/03/2023 15:09:26",  # Different date format
    "2023/03/14T15:09:26Z",  # Non-standard date separator
]

# Number of timed calls per parser and input string
iterations = 1000

# Filled by main(): maps each input string to its per-parser summary
results = {}


def time_parser(parse, dt_str: str, count: int) -> "list[float] | None":
    """Time ``count`` calls of ``parse(dt_str)``.

    One untimed warm-up call is made first; it also detects inputs the parser
    cannot handle.

    Args:
        parse: Callable taking the datetime string. ``None`` (or any
            non-callable) is reported as a failure rather than raising.
        dt_str: The datetime string to parse.
        count: Number of timed calls.

    Returns:
        Per-call durations in milliseconds, or ``None`` if any call raised.
    """
    try:
        parse(dt_str)  # Warmup

        durations = []
        for _ in range(count):
            start = time.perf_counter()
            parse(dt_str)
            end = time.perf_counter()
            durations.append((end - start) * 1000)
    except Exception:
        # A parser that cannot handle this input is an expected benchmark outcome
        return None
    return durations


def format_ms(avg: "float | None") -> str:
    """Render an average in milliseconds, or ``"N/A"`` when the parser failed."""
    # Compare against None explicitly: a 0.0 average is a valid measurement
    return "N/A" if avg is None else f"{avg:.3f}ms"


def speedup_ratio(dateutil_avg: "float | None", whenever_avg: "float | None") -> "float | None":
    """Return how many times faster whenever is than dateutil.

    Returns ``None`` when either measurement is missing or when the whenever
    average is zero (the ratio would be undefined).
    """
    if dateutil_avg is None or not whenever_avg:
        return None
    return dateutil_avg / whenever_avg


def report_timings(label: str, durations: "list[float] | None", failure_label=None) -> "float | None":
    """Print avg/min/max for one parser and return the average.

    Args:
        label: Name printed in front of the statistics.
        durations: Per-call durations in milliseconds, or ``None`` on failure.
        failure_label: Name printed when parsing failed; defaults to ``label``.

    Returns:
        The mean duration in milliseconds, or ``None`` if parsing failed.
    """
    if durations is None:
        print(f" {failure_label or label}: Failed to parse")
        return None

    avg = statistics.mean(durations)
    print(f" {label}: avg={avg:.3f}ms, min={min(durations):.3f}ms, max={max(durations):.3f}ms")
    return avg


def benchmark_format(dt_str: str) -> dict:
    """Benchmark every parser on ``dt_str``, print the results, and return a summary.

    The whenever parsers are tried in order and the first one that accepts the
    input is the one that gets reported.
    """
    print(f"\nFormat: {dt_str}")

    # getattr with a default tolerates whenever versions that lack one of these
    # methods: a missing method is simply counted as "failed to parse".
    whenever_parsers = (
        ("Instant.parse_common_iso", getattr(Instant, "parse_common_iso", None)),
        ("Instant.parse_rfc3339", getattr(Instant, "parse_rfc3339", None)),
        ("OffsetDateTime.parse_common_iso", getattr(OffsetDateTime, "parse_common_iso", None)),
    )

    whenever_method = "None"
    whenever_times = None
    for method, parse in whenever_parsers:
        whenever_times = time_parser(parse, dt_str, iterations)
        if whenever_times is not None:
            whenever_method = method
            break

    dateutil_times = time_parser(parser.parse, dt_str, iterations)
    ab_times = time_parser(ab_datetime_parse, dt_str, iterations)

    # Print results
    whenever_avg = report_timings(whenever_method, whenever_times, failure_label="whenever")
    dateutil_avg = report_timings("dateutil.parser.parse", dateutil_times)
    ab_avg = report_timings("ab_datetime_parse", ab_times)

    # Compare performance if both succeeded
    speedup = speedup_ratio(dateutil_avg, whenever_avg)
    if speedup is not None:
        print(f" Performance: whenever is {speedup:.2f}x faster than dateutil")

    return {
        "whenever": {
            "success": whenever_times is not None,
            "method": whenever_method,
            "avg": whenever_avg,
        },
        "dateutil": {
            "success": dateutil_times is not None,
            "avg": dateutil_avg,
        },
        "ab_datetime_parse": {
            "success": ab_times is not None,
            "avg": ab_avg,
        },
    }


def print_summary(summary: dict) -> None:
    """Print one line per input string with each parser's average and the speedup."""
    print("\n\nSummary")
    print("=" * 60)
    print("Format | whenever | dateutil | ab_datetime_parse | Speedup")
    print("-" * 60)

    for dt_str, result in summary.items():
        whenever_avg = result["whenever"]["avg"]
        dateutil_avg = result["dateutil"]["avg"]
        ab_avg = result["ab_datetime_parse"]["avg"]

        speedup = speedup_ratio(dateutil_avg, whenever_avg)
        speedup_str = "N/A" if speedup is None else f"{speedup:.2f}x"

        print(
            f"{dt_str} | {format_ms(whenever_avg)} | {format_ms(dateutil_avg)} | {format_ms(ab_avg)} | {speedup_str}"
        )


def main() -> None:
    """Run the benchmark for every entry in ``test_formats`` and print a summary."""
    print("Performance Test: Datetime Parsing")
    print("=" * 60)
    print(f"Running {iterations} iterations for each format")
    print("-" * 60)

    for dt_str in test_formats:
        results[dt_str] = benchmark_format(dt_str)

    print_summary(results)


if __name__ == "__main__":
    main()
Loading
Loading