diff --git a/airbyte_cdk/utils/datetime_helpers.py b/airbyte_cdk/utils/datetime_helpers.py
index 99cf1ad23..c3c49ed1b 100644
--- a/airbyte_cdk/utils/datetime_helpers.py
+++ b/airbyte_cdk/utils/datetime_helpers.py
@@ -84,9 +84,9 @@
 from datetime import datetime, timedelta, timezone
 from typing import Any, Optional, Union, overload
 
-from dateutil import parser
+from dateutil import parser as dateutil_parser
 from typing_extensions import Never
-from whenever import Instant, LocalDateTime, ZonedDateTime
+from whenever import Instant, LocalDateTime, OffsetDateTime, ZonedDateTime
 
 
 class AirbyteDateTime(datetime):
@@ -358,7 +358,12 @@ def ab_datetime_now() -> AirbyteDateTime:
     return AirbyteDateTime.from_datetime(datetime.now(timezone.utc))
 
 
-def ab_datetime_parse(dt_str: str | int) -> AirbyteDateTime:
+def ab_datetime_parse(
+    dt_str: str | int,
+    formats: list[str] | None = None,
+    *,
+    disallow_other_formats: bool = False,
+) -> AirbyteDateTime:
     """Parses a datetime string or timestamp into an AirbyteDateTime with timezone awareness.
 
     This implementation is as flexible as possible to handle various datetime formats.
@@ -389,57 +394,62 @@ def ab_datetime_parse(dt_str: str | int) -> AirbyteDateTime:
     >>> ab_datetime_parse("2023-03-14")  # Date-only
     '2023-03-14T00:00:00+00:00'
     """
-    try:
-        # Handle numeric values as Unix timestamps (UTC)
-        if isinstance(dt_str, int) or (
-            isinstance(dt_str, str)
-            and (dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit()))
-        ):
-            timestamp = int(dt_str)
-            if timestamp < 0:
-                raise ValueError("Timestamp cannot be negative")
-            if len(str(abs(timestamp))) > 10:
-                raise ValueError("Timestamp value too large")
-            instant = Instant.from_timestamp(timestamp)
-            return AirbyteDateTime.from_datetime(instant.py_datetime())
-
-        if not isinstance(dt_str, str):
+    # Handle numeric values as Unix timestamps (UTC)
+    if isinstance(dt_str, int):
+        timestamp: int = dt_str
+        if timestamp < 0:
+            raise ValueError("Timestamp cannot be negative")
+        if len(str(abs(timestamp))) > 10:
+            raise ValueError("Timestamp value too large")
+        instant = Instant.from_timestamp(timestamp)
+        return AirbyteDateTime.from_datetime(instant.py_datetime())
+
+    if not isinstance(dt_str, str):
+        raise ValueError(
+            f"Could not parse datetime string: expected string or integer, got {type(dt_str)}"
+        )
+
+    # Try caller-supplied strptime formats first, in the order given
+    if formats:
+        ex_list: list[Exception] = []
+        for fmt in formats:
+            try:
+                result = OffsetDateTime.strptime(dt_str, fmt)
+            except Exception as ex:
+                ex_list.append(ex)
+            else:
+                # Convert so the declared AirbyteDateTime return type holds
+                return AirbyteDateTime.from_datetime(result.py_datetime())
+
+        if disallow_other_formats:
             raise ValueError(
-                f"Could not parse datetime string: expected string or integer, got {type(dt_str)}"
+                f"Could not parse datetime string with provided formats [{formats}]: {str(ex_list)}"
             )
 
-        # Handle date-only format first
-        if ":" not in dt_str and dt_str.count("-") == 2 and "/" not in dt_str:
-            try:
-                year, month, day = map(int, dt_str.split("-"))
-                if not (1 <= month <= 12 and 1 <= day <= 31):
-                    raise ValueError(f"Invalid date format: {dt_str}")
-                instant = Instant.from_utc(year, month, day, 0, 0, 0)
-                return AirbyteDateTime.from_datetime(instant.py_datetime())
-            except (ValueError, TypeError):
-                raise ValueError(f"Invalid date format: {dt_str}")
-
-        # Reject time-only strings without date
-        if ":" in dt_str and dt_str.count("-") < 2 and dt_str.count("/") < 2:
-            raise ValueError(f"Missing date part in datetime string: {dt_str}")
-
-        # Try parsing with dateutil for timezone handling
-        try:
-            parsed = parser.parse(dt_str)
-            if parsed.tzinfo is None:
-                parsed = parsed.replace(tzinfo=timezone.utc)
-
-            return AirbyteDateTime.from_datetime(parsed)
-        except (ValueError, TypeError):
-            raise ValueError(f"Could not parse datetime string: {dt_str}")
-    except ValueError as e:
-        if "Invalid date format:" in str(e):
-            raise
-        if "Timestamp cannot be negative" in str(e):
-            raise
-        if "Timestamp value too large" in str(e):
-            raise
-        raise ValueError(f"Could not parse datetime string: {dt_str}")
+    # Else, value is a string
+
+    # Try parsing standard ISO/RFC formats with whenever
+    try:
+        instant = Instant.parse_common_iso(dt_str)
+        return AirbyteDateTime.from_datetime(instant.py_datetime())
+    except Exception:
+        pass
+
+    # Int-like strings reuse the integer branch so its range errors still propagate
+    if (
+        dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit())
+    ):
+        return ab_datetime_parse(int(dt_str))
+
+    # Fall back to dateutil for other formats
+    try:
+        parsed = dateutil_parser.parse(dt_str)
+        if parsed.tzinfo is None:
+            parsed = parsed.replace(tzinfo=timezone.utc)
+
+        return AirbyteDateTime.from_datetime(parsed)
+    except (ValueError, TypeError):
+        raise ValueError(f"Could not parse datetime string: {dt_str}, ({type(dt_str).__name__})")
 
 
 def ab_datetime_try_parse(dt_str: str) -> AirbyteDateTime | None:
diff --git a/performance_test_datetime_parsing.py b/performance_test_datetime_parsing.py
new file mode 100644
index 000000000..d72eedfb1
--- /dev/null
+++ b/performance_test_datetime_parsing.py
@@ -0,0 +1,196 @@
+import statistics
+import time
+from datetime import datetime, timezone
+
+from dateutil import parser
+from whenever import Instant, OffsetDateTime
+
+from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse
+
+# Test formats
+test_formats = [
+    "2023-03-14T15:09:26Z",  # ISO format with T delimiter and Z timezone
+    "2023-03-14T15:09:26+00:00",  # ISO format with +00:00 timezone
+    "2023-03-14T15:09:26.123456Z",  # ISO format with microseconds
+    "2023-03-14T15:09:26-04:00",  # ISO format with non-UTC timezone
+    "2023-03-14 15:09:26Z",  # Missing T delimiter
+    "2023-03-14 15:09:26",  # Missing T delimiter and timezone
+    "2023-03-14",  # Date-only format
+    "14/03/2023 15:09:26",  # Different date format
+    "2023/03/14T15:09:26Z",  # Non-standard date separator
+]
+
+# Number of iterations for each test
+iterations = 1000
+
+print("Performance Test: Datetime Parsing")
+print("=" * 60)
+print(f"Running {iterations} iterations for each format")
+print("-" * 60)
+
+results = {}
+
+for dt_str in test_formats:
+    print(f"\nFormat: {dt_str}")
+
+    # Test whenever parsing
+    whenever_times = []
+    whenever_success = False
+
+    # Try Instant.parse_common_iso
+    try:
+        # Warmup
+        Instant.parse_common_iso(dt_str)
+
+        for _ in range(iterations):
+            start = time.perf_counter()
+            Instant.parse_common_iso(dt_str)
+            end = time.perf_counter()
+            whenever_times.append((end - start) * 1000)
+
+        whenever_success = True
+        whenever_method = "Instant.parse_common_iso"
+    except Exception:
+        pass
+
+    # If parse_common_iso failed, try parse_rfc3339
+    if not whenever_success:
+        try:
+            # Warmup
+            Instant.parse_rfc3339(dt_str)
+
+            whenever_times = []
+            for _ in range(iterations):
+                start = time.perf_counter()
+                Instant.parse_rfc3339(dt_str)
+                end = time.perf_counter()
+                whenever_times.append((end - start) * 1000)
+
+            whenever_success = True
+            whenever_method = "Instant.parse_rfc3339"
+        except Exception:
+            pass
+
+    # If both Instant methods failed, try OffsetDateTime.parse_common_iso
+    if not whenever_success:
+        try:
+            # Warmup
+            OffsetDateTime.parse_common_iso(dt_str)
+
+            whenever_times = []
+            for _ in range(iterations):
+                start = time.perf_counter()
+                OffsetDateTime.parse_common_iso(dt_str)
+                end = time.perf_counter()
+                whenever_times.append((end - start) * 1000)
+
+            whenever_success = True
+            whenever_method = "OffsetDateTime.parse_common_iso"
+        except Exception:
+            whenever_method = "None"
+
+    # Test dateutil parsing
+    dateutil_times = []
+    try:
+        # Warmup
+        parser.parse(dt_str)
+
+        for _ in range(iterations):
+            start = time.perf_counter()
+            parser.parse(dt_str)
+            end = time.perf_counter()
+            dateutil_times.append((end - start) * 1000)
+
+        dateutil_success = True
+    except Exception:
+        dateutil_success = False
+
+    # Test ab_datetime_parse
+    ab_times = []
+    try:
+        # Warmup
+        ab_datetime_parse(dt_str)
+
+        for _ in range(iterations):
+            start = time.perf_counter()
+            ab_datetime_parse(dt_str)
+            end = time.perf_counter()
+            ab_times.append((end - start) * 1000)
+
+        ab_success = True
+    except Exception:
+        ab_success = False
+
+    # Print results
+    if whenever_success:
+        whenever_avg = statistics.mean(whenever_times)
+        whenever_min = min(whenever_times)
+        whenever_max = max(whenever_times)
+        print(
+            f" {whenever_method}: avg={whenever_avg:.3f}ms, min={whenever_min:.3f}ms, max={whenever_max:.3f}ms"
+        )
+    else:
+        print(f" whenever: Failed to parse")
+
+    if dateutil_success:
+        dateutil_avg = statistics.mean(dateutil_times)
+        dateutil_min = min(dateutil_times)
+        dateutil_max = max(dateutil_times)
+        print(
+            f" dateutil.parser.parse: avg={dateutil_avg:.3f}ms, min={dateutil_min:.3f}ms, max={dateutil_max:.3f}ms"
+        )
+    else:
+        print(f" dateutil.parser.parse: Failed to parse")
+
+    if ab_success:
+        ab_avg = statistics.mean(ab_times)
+        ab_min = min(ab_times)
+        ab_max = max(ab_times)
+        print(f" ab_datetime_parse: avg={ab_avg:.3f}ms, min={ab_min:.3f}ms, max={ab_max:.3f}ms")
+    else:
+        print(f" ab_datetime_parse: Failed to parse")
+
+    # Compare performance if both succeeded
+    if whenever_success and dateutil_success:
+        speedup = dateutil_avg / whenever_avg
+        print(f" Performance: whenever is {speedup:.2f}x faster than dateutil")
+
+    # Store results for summary
+    results[dt_str] = {
+        "whenever": {
+            "success": whenever_success,
+            "method": whenever_method,
+            "avg": statistics.mean(whenever_times) if whenever_success else None,
+        },
+        "dateutil": {
+            "success": dateutil_success,
+            "avg": statistics.mean(dateutil_times) if dateutil_success else None,
+        },
+        "ab_datetime_parse": {
+            "success": ab_success,
+            "avg": statistics.mean(ab_times) if ab_success else None,
+        },
+    }
+
+# Print summary
+print("\n\nSummary")
+print("=" * 60)
+print("Format | whenever | dateutil | ab_datetime_parse | Speedup")
+print("-" * 60)
+
+for dt_str, result in results.items():
+    whenever_avg = result["whenever"]["avg"]
+    dateutil_avg = result["dateutil"]["avg"]
+    ab_avg = result["ab_datetime_parse"]["avg"]
+
+    whenever_str = f"{whenever_avg:.3f}ms" if whenever_avg else "N/A"
+    dateutil_str = f"{dateutil_avg:.3f}ms" if dateutil_avg else "N/A"
+    ab_str = f"{ab_avg:.3f}ms" if ab_avg else "N/A"
+
+    if whenever_avg and dateutil_avg:
+        speedup = dateutil_avg / whenever_avg
+        speedup_str = f"{speedup:.2f}x"
+    else:
+        speedup_str = "N/A"
+
+    print(f"{dt_str} | {whenever_str} | {dateutil_str} | {ab_str} | {speedup_str}")
diff --git a/unit_tests/utils/test_datetime_helpers.py b/unit_tests/utils/test_datetime_helpers.py
index 88c61ef95..67d2bbcd7 100644
--- a/unit_tests/utils/test_datetime_helpers.py
+++ b/unit_tests/utils/test_datetime_helpers.py
@@ -6,6 +6,8 @@
 
 import freezegun
 import pytest
+from dateutil import parser
+from whenever import Instant, OffsetDateTime
 
 from airbyte_cdk.utils.datetime_helpers import (
     AirbyteDateTime,
@@ -262,3 +264,112 @@ def test_epoch_millis():
     # Test roundtrip conversion
     dt3 = AirbyteDateTime.from_epoch_millis(dt.to_epoch_millis())
     assert dt3 == dt
+
+
+@pytest.mark.parametrize(
+    "input_value,expected_parser",
+    [
+        # Formats that use the whenever parser
+        (1678806566, "whenever"),  # Unix timestamp
+        ("2023-03-14T15:09:26Z", "whenever"),  # ISO format with T delimiter
+        ("2023-03-14T15:09:26+00:00", "whenever"),  # ISO format with timezone
+        ("2023-03-14T15:09:26.123456Z", "whenever"),  # ISO format with microseconds
+        ("2023-03-14T15:09:26-04:00", "whenever"),  # ISO format with non-UTC timezone
+        ("2023-03-14 15:09:26Z", "whenever"),  # Missing T delimiter but with Z
+        # Formats that still use the dateutil parser
+        ("2023-03-14", "dateutil"),  # Date-only format
+        ("2023-03-14 15:09:26", "dateutil"),  # Missing T delimiter and timezone
+        ("14/03/2023 15:09:26", "dateutil"),  # Different date format
+        ("2023/03/14T15:09:26Z", "dateutil"),  # Non-standard date separator
+    ],
+)
+def test_datetime_parser_selection(input_value, expected_parser, monkeypatch):
+    """Test that the correct parser is used based on the input format."""
+    # Create tracking variables
+    whenever_called = False
+    dateutil_called = False
+
+    # Store original functions
+    original_instant = Instant
+    original_offset_dt = OffsetDateTime
+    original_parser_parse = parser.parse
+
+    # Create spies for whenever methods. A parser only counts as "used" once it
+    # returns successfully, because ab_datetime_parse tries whenever first and
+    # silently falls back to dateutil when that attempt raises.
+    def spy_from_timestamp(*args, **kwargs):
+        nonlocal whenever_called
+        result = original_instant.from_timestamp(*args, **kwargs)
+        whenever_called = True
+        return result
+
+    def spy_from_utc(*args, **kwargs):
+        nonlocal whenever_called
+        result = original_instant.from_utc(*args, **kwargs)
+        whenever_called = True
+        return result
+
+    def spy_parse_common_iso(*args, **kwargs):
+        nonlocal whenever_called
+        result = original_instant.parse_common_iso(*args, **kwargs)
+        whenever_called = True
+        return result
+
+    def spy_parse_rfc3339(*args, **kwargs):
+        nonlocal whenever_called
+        result = original_instant.parse_rfc3339(*args, **kwargs)
+        whenever_called = True
+        return result
+
+    def spy_offset_parse_common_iso(*args, **kwargs):
+        nonlocal whenever_called
+        result = original_offset_dt.parse_common_iso(*args, **kwargs)
+        whenever_called = True
+        return result
+
+    # Create a spy for parser.parse
+    def spy_parser_parse(*args, **kwargs):
+        nonlocal dateutil_called
+        result = original_parser_parse(*args, **kwargs)
+        dateutil_called = True
+        return result
+
+    # Create mock classes with our spy methods
+    class MockInstant:
+        @staticmethod
+        def from_timestamp(*args, **kwargs):
+            return spy_from_timestamp(*args, **kwargs)
+
+        @staticmethod
+        def from_utc(*args, **kwargs):
+            return spy_from_utc(*args, **kwargs)
+
+        @staticmethod
+        def parse_common_iso(*args, **kwargs):
+            return spy_parse_common_iso(*args, **kwargs)
+
+        @staticmethod
+        def parse_rfc3339(*args, **kwargs):
+            return spy_parse_rfc3339(*args, **kwargs)
+
+    class MockOffsetDateTime:
+        @staticmethod
+        def parse_common_iso(*args, **kwargs):
+            return spy_offset_parse_common_iso(*args, **kwargs)
+
+    # Apply the mocks at the module level
+    monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant)
+    monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.OffsetDateTime", MockOffsetDateTime)
+    # Patch via the module object: datetime_helpers imports it as `dateutil_parser`
+    monkeypatch.setattr(parser, "parse", spy_parser_parse)
+
+    # Parse the datetime
+    ab_datetime_parse(input_value)
+
+    # Check which parser was used
+    if expected_parser == "whenever":
+        assert whenever_called, f"Expected whenever parser to be used for {input_value}"
+        assert not dateutil_called, f"Did not expect dateutil parser to be used for {input_value}"
+    else:
+        assert dateutil_called, f"Expected dateutil parser to be used for {input_value}"
+        assert not whenever_called, f"Did not expect whenever parser to be used for {input_value}"