From 9a20e6f01172d0e89064ed369b23623002a02dba Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 02:07:01 +0000 Subject: [PATCH 1/7] tests: Add unit tests for datetime parser code paths Co-Authored-By: Aaron Steers --- unit_tests/utils/test_datetime_helpers.py | 159 ++++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/unit_tests/utils/test_datetime_helpers.py b/unit_tests/utils/test_datetime_helpers.py index 88c61ef95..50cd79a44 100644 --- a/unit_tests/utils/test_datetime_helpers.py +++ b/unit_tests/utils/test_datetime_helpers.py @@ -6,6 +6,8 @@ import freezegun import pytest +from dateutil import parser +from whenever import Instant from airbyte_cdk.utils.datetime_helpers import ( AirbyteDateTime, @@ -262,3 +264,160 @@ def test_epoch_millis(): # Test roundtrip conversion dt3 = AirbyteDateTime.from_epoch_millis(dt.to_epoch_millis()) assert dt3 == dt + + +@pytest.mark.parametrize( + "input_value,expected_parser", + [ + # Formats that use the whenever parser + ("2023-03-14", "whenever"), # Date-only format + (1678806566, "whenever"), # Unix timestamp + # Formats that use the dateutil parser + ("2023-03-14T15:09:26Z", "dateutil"), # ISO format with T delimiter + ("2023-03-14T15:09:26+00:00", "dateutil"), # ISO format with timezone + ("2023-03-14 15:09:26", "dateutil"), # Missing T delimiter + ("14/03/2023 15:09:26", "dateutil"), # Different date format + ], +) +def test_datetime_parser_selection(input_value, expected_parser, monkeypatch): + """Test that the correct parser is used based on the input format.""" + # Create tracking variables + whenever_called = False + dateutil_called = False + + # Store original functions + original_instant_module = __import__("whenever").Instant + original_parser_parse = parser.parse + + # Create a spy for Instant.from_timestamp + def spy_from_timestamp(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + # Call the original function + return original_instant_module.from_timestamp(*args, **kwargs) + + # Create a spy for Instant.from_utc + def spy_from_utc(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + # Call the original function + return original_instant_module.from_utc(*args, **kwargs) + + # Create a spy for parser.parse + def spy_parser_parse(*args, **kwargs): + nonlocal dateutil_called + dateutil_called = True + # Call the original function + return original_parser_parse(*args, **kwargs) + + # Create a mock Instant class with our spy methods + class MockInstant: + @staticmethod + def from_timestamp(*args, **kwargs): + return spy_from_timestamp(*args, **kwargs) + + @staticmethod + def from_utc(*args, **kwargs): + return spy_from_utc(*args, **kwargs) + + # Add any other methods that might be called + @staticmethod + def py_datetime(): + return original_instant_module.py_datetime() + + # Apply the mocks at the module level + monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant) + monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", spy_parser_parse) + + # Skip formats that would be rejected by validation checks + if isinstance(input_value, str) and "March" in input_value: + # Skip this test case as it would be rejected by validation + return + + # Parse the datetime + ab_datetime_parse(input_value) + + # Check which parser was used + if expected_parser == "whenever": + assert whenever_called, f"Expected whenever parser to be used for {input_value}" + assert not dateutil_called, f"Did not expect dateutil parser to be used for {input_value}" + else: + assert dateutil_called, f"Expected dateutil parser to be used for {input_value}" + assert not whenever_called, f"Did not expect whenever parser to be used for {input_value}" + + +def test_whenever_parser_for_iso_formats(monkeypatch): + """Test that the whenever parser is used for certain formats even when dateutil is unavailable.""" + + # Create a mock dateutil.parser.parse that always raises an exception + def mock_parser_parse(dt_str, **kwargs): + raise ValueError("dateutil parser is unavailable") + + # Apply the mock at the module level + monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", mock_parser_parse) + + # These formats should still parse correctly using the whenever parser + whenever_formats = [ + "2023-03-14", # Date-only format + 1678806566, # Unix timestamp + ] + + for dt_str in whenever_formats: + # This should not raise an exception because the whenever parser should be used + result = ab_datetime_parse(dt_str) + assert isinstance(result, AirbyteDateTime) + + +def test_dateutil_fallback_for_non_iso_formats(monkeypatch): + """Test that the dateutil parser is used as a fallback for non-ISO/RFC compliant formats.""" + # Create tracking variables + whenever_called = False + dateutil_called = False + + # Store original functions + original_instant_module = __import__("whenever").Instant + original_parser_parse = parser.parse + + # Create a mock Instant class with methods that always raise exceptions + class MockInstant: + @staticmethod + def from_timestamp(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + raise ValueError("whenever parser is unavailable") + + @staticmethod + def from_utc(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + raise ValueError("whenever parser is unavailable") + + # Create a spy for parser.parse + def spy_parser_parse(*args, **kwargs): + nonlocal dateutil_called + dateutil_called = True + return original_parser_parse(*args, **kwargs) + + # Apply the mocks at the module level + monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant) + monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", spy_parser_parse) + + # These non-ISO/RFC formats should use the dateutil parser + non_iso_formats = [ + "2023-03-14T15:09:26Z", # ISO format with T delimiter + "2023-03-14T15:09:26+00:00", # ISO format with timezone + "2023-03-14 15:09:26", # Missing T delimiter + "14/03/2023 15:09:26", # Different date format + ] + + for dt_str in non_iso_formats: + # Skip formats that would be rejected by validation checks + if "March" in dt_str: + continue + + # This should not raise an exception because the dateutil parser should be used + result = ab_datetime_parse(dt_str) + assert isinstance(result, AirbyteDateTime) + assert dateutil_called, f"Expected dateutil parser to be used for {dt_str}" + # Reset the flag for the next iteration + dateutil_called = False From 34f24f2032b4f791ed1631b2da746a05bdbfde5c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 02:30:18 +0000 Subject: [PATCH 2/7] tests: Address PR comments by removing redundant test functions Co-Authored-By: Aaron Steers --- unit_tests/utils/test_datetime_helpers.py | 82 ----------------------- 1 file changed, 82 deletions(-) diff --git a/unit_tests/utils/test_datetime_helpers.py b/unit_tests/utils/test_datetime_helpers.py index 50cd79a44..59e868632 100644 --- a/unit_tests/utils/test_datetime_helpers.py +++ b/unit_tests/utils/test_datetime_helpers.py @@ -329,11 +329,6 @@ def py_datetime(): monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant) monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", spy_parser_parse) - # Skip formats that would be rejected by validation checks - if isinstance(input_value, str) and "March" in input_value: - # Skip this test case as it would be rejected by validation - return - # Parse the datetime ab_datetime_parse(input_value) @@ -344,80 +339,3 @@ def py_datetime(): else: assert dateutil_called, f"Expected dateutil parser to be used for {input_value}" assert not whenever_called, f"Did not expect whenever parser to be used for {input_value}" - - -def test_whenever_parser_for_iso_formats(monkeypatch): - """Test that the whenever parser is used for certain formats even when dateutil is unavailable.""" - - # Create a mock dateutil.parser.parse that always raises an exception - def mock_parser_parse(dt_str, **kwargs): - raise ValueError("dateutil parser is unavailable") - - # Apply the mock at the module level - monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", mock_parser_parse) - - # These formats should still parse correctly using the whenever parser - whenever_formats = [ - "2023-03-14", # Date-only format - 1678806566, # Unix timestamp - ] - - for dt_str in whenever_formats: - # This should not raise an exception because the whenever parser should be used - result = ab_datetime_parse(dt_str) - assert isinstance(result, AirbyteDateTime) - - -def test_dateutil_fallback_for_non_iso_formats(monkeypatch): - """Test that the dateutil parser is used as a fallback for non-ISO/RFC compliant formats.""" - # Create tracking variables - whenever_called = False - dateutil_called = False - - # Store original functions - original_instant_module = __import__("whenever").Instant - original_parser_parse = parser.parse - - # Create a mock Instant class with methods that always raise exceptions - class MockInstant: - @staticmethod - def from_timestamp(*args, **kwargs): - nonlocal whenever_called - whenever_called = True - raise ValueError("whenever parser is unavailable") - - @staticmethod - def from_utc(*args, **kwargs): - nonlocal whenever_called - whenever_called = True - raise ValueError("whenever parser is unavailable") - - # Create a spy for parser.parse - def spy_parser_parse(*args, **kwargs): - nonlocal dateutil_called - dateutil_called = True - return original_parser_parse(*args, **kwargs) - - # Apply the mocks at the module level - monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant) - monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", spy_parser_parse) - - # These non-ISO/RFC formats should use the dateutil parser - non_iso_formats = [ - "2023-03-14T15:09:26Z", # ISO format with T delimiter - "2023-03-14T15:09:26+00:00", # ISO format with timezone - "2023-03-14 15:09:26", # Missing T delimiter - "14/03/2023 15:09:26", # Different date format - ] - - for dt_str in non_iso_formats: - # Skip formats that would be rejected by validation checks - if "March" in dt_str: - continue - - # This should not raise an exception because the dateutil parser should be used - result = ab_datetime_parse(dt_str) - assert isinstance(result, AirbyteDateTime) - assert dateutil_called, f"Expected dateutil parser to be used for {dt_str}" - # Reset the flag for the next iteration - dateutil_called = False From 29a49c6799ff37114d1e5eb72c3e617f8c8d6ff6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 02:44:35 +0000 Subject: [PATCH 3/7] feat(datetime): Migrate standard datetime parsing to whenever library Co-Authored-By: Aaron Steers --- airbyte_cdk/utils/datetime_helpers.py | 37 ++++- performance_test_datetime_parsing.py | 191 ++++++++++++++++++++++ unit_tests/utils/test_datetime_helpers.py | 63 +++++-- 3 files changed, 274 insertions(+), 17 deletions(-) create mode 100644 performance_test_datetime_parsing.py diff --git a/airbyte_cdk/utils/datetime_helpers.py b/airbyte_cdk/utils/datetime_helpers.py index 99cf1ad23..832a950e9 100644 --- a/airbyte_cdk/utils/datetime_helpers.py +++ b/airbyte_cdk/utils/datetime_helpers.py @@ -86,7 +86,7 @@ from dateutil import parser from typing_extensions import Never -from whenever import Instant, LocalDateTime, ZonedDateTime +from whenever import Instant, LocalDateTime, OffsetDateTime, ZonedDateTime class AirbyteDateTime(datetime): @@ -423,7 +423,40 @@ def ab_datetime_parse(dt_str: str | int) -> AirbyteDateTime: if ":" in dt_str and dt_str.count("-") < 2 and dt_str.count("/") < 2: raise ValueError(f"Missing date part in datetime string: {dt_str}") - # Try parsing with dateutil for timezone handling + # Try parsing standard ISO/RFC formats with whenever + # Only attempt whenever parsing for specific ISO/RFC formats + if ( + isinstance(dt_str, str) + and "/" not in dt_str # Exclude non-standard date separators + and ( + # ISO format with T delimiter and Z timezone or +00:00 timezone + (("T" in dt_str) and ("Z" in dt_str or "+" in dt_str or "-" in dt_str)) + # ISO format with space delimiter and Z timezone + or (" " in dt_str and "Z" in dt_str) + ) + ): + # First try Instant.parse_common_iso for UTC formats + try: + instant = Instant.parse_common_iso(dt_str) + return AirbyteDateTime.from_datetime(instant.py_datetime()) + except Exception: + pass + + # Then try Instant.parse_rfc3339 which is more flexible + try: + instant = Instant.parse_rfc3339(dt_str) + return AirbyteDateTime.from_datetime(instant.py_datetime()) + except Exception: + pass + + # Try OffsetDateTime for non-UTC timezones + try: + offset_dt = OffsetDateTime.parse_common_iso(dt_str) + return AirbyteDateTime.from_datetime(offset_dt.py_datetime()) + except Exception: + pass + + # Fall back to dateutil for other formats try: parsed = parser.parse(dt_str) if parsed.tzinfo is None: diff --git a/performance_test_datetime_parsing.py b/performance_test_datetime_parsing.py new file mode 100644 index 000000000..6f658d451 --- /dev/null +++ b/performance_test_datetime_parsing.py @@ -0,0 +1,191 @@ +import time +import statistics +from datetime import datetime, timezone +from dateutil import parser +from whenever import Instant, OffsetDateTime + +from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse + +# Test formats +test_formats = [ + '2023-03-14T15:09:26Z', # ISO format with T delimiter and Z timezone + '2023-03-14T15:09:26+00:00', # ISO format with +00:00 timezone + '2023-03-14T15:09:26.123456Z', # ISO format with microseconds + '2023-03-14T15:09:26-04:00', # ISO format with non-UTC timezone + '2023-03-14 15:09:26Z', # Missing T delimiter + '2023-03-14 15:09:26', # Missing T delimiter and timezone + '2023-03-14', # Date-only format + '14/03/2023 15:09:26', # Different date format + '2023/03/14T15:09:26Z', # Non-standard date separator +] + +# Number of iterations for each test +iterations = 1000 + +print("Performance Test: Datetime Parsing") +print("=" * 60) +print(f"Running {iterations} iterations for each format") +print("-" * 60) + +results = {} + +for dt_str in test_formats: + print(f"\nFormat: {dt_str}") + + # Test whenever parsing + whenever_times = [] + whenever_success = False + + # Try Instant.parse_common_iso + try: + # Warmup + Instant.parse_common_iso(dt_str) + + for _ in range(iterations): + start = time.perf_counter() + Instant.parse_common_iso(dt_str) + end = time.perf_counter() + whenever_times.append((end - start) * 1000) + + whenever_success = True + whenever_method = "Instant.parse_common_iso" + except Exception: + pass + + # If parse_common_iso failed, try parse_rfc3339 + if not whenever_success: + try: + # Warmup + Instant.parse_rfc3339(dt_str) + + whenever_times = [] + for _ in range(iterations): + start = time.perf_counter() + Instant.parse_rfc3339(dt_str) + end = time.perf_counter() + whenever_times.append((end - start) * 1000) + + whenever_success = True + whenever_method = "Instant.parse_rfc3339" + except Exception: + pass + + # If both Instant methods failed, try OffsetDateTime.parse_common_iso + if not whenever_success: + try: + # Warmup + OffsetDateTime.parse_common_iso(dt_str) + + whenever_times = [] + for _ in range(iterations): + start = time.perf_counter() + OffsetDateTime.parse_common_iso(dt_str) + end = time.perf_counter() + whenever_times.append((end - start) * 1000) + + whenever_success = True + whenever_method = "OffsetDateTime.parse_common_iso" + except Exception: + whenever_method = "None" + + # Test dateutil parsing + dateutil_times = [] + try: + # Warmup + parser.parse(dt_str) + + for _ in range(iterations): + start = time.perf_counter() + parser.parse(dt_str) + end = time.perf_counter() + dateutil_times.append((end - start) * 1000) + + dateutil_success = True + except Exception: + dateutil_success = False + + # Test ab_datetime_parse + ab_times = [] + try: + # Warmup + ab_datetime_parse(dt_str) + + for _ in range(iterations): + start = time.perf_counter() + ab_datetime_parse(dt_str) + end = time.perf_counter() + ab_times.append((end - start) * 1000) + + ab_success = True + except Exception: + ab_success = False + + # Print results + if whenever_success: + whenever_avg = statistics.mean(whenever_times) + whenever_min = min(whenever_times) + whenever_max = max(whenever_times) + print(f" {whenever_method}: avg={whenever_avg:.3f}ms, min={whenever_min:.3f}ms, max={whenever_max:.3f}ms") + else: + print(f" whenever: Failed to parse") + + if dateutil_success: + dateutil_avg = statistics.mean(dateutil_times) + dateutil_min = min(dateutil_times) + dateutil_max = max(dateutil_times) + print(f" dateutil.parser.parse: avg={dateutil_avg:.3f}ms, min={dateutil_min:.3f}ms, max={dateutil_max:.3f}ms") + else: + print(f" dateutil.parser.parse: Failed to parse") + + if ab_success: + ab_avg = statistics.mean(ab_times) + ab_min = min(ab_times) + ab_max = max(ab_times) + print(f" ab_datetime_parse: avg={ab_avg:.3f}ms, min={ab_min:.3f}ms, max={ab_max:.3f}ms") + else: + print(f" ab_datetime_parse: Failed to parse") + + # Compare performance if both succeeded + if whenever_success and dateutil_success: + speedup = dateutil_avg / whenever_avg + print(f" Performance: whenever is {speedup:.2f}x faster than dateutil") + + # Store results for summary + results[dt_str] = { + 'whenever': { + 'success': whenever_success, + 'method': whenever_method, + 'avg': statistics.mean(whenever_times) if whenever_success else None, + }, + 'dateutil': { + 'success': dateutil_success, + 'avg': statistics.mean(dateutil_times) if dateutil_success else None, + }, + 'ab_datetime_parse': { + 'success': ab_success, + 'avg': statistics.mean(ab_times) if ab_success else None, + } + } + +# Print summary +print("\n\nSummary") +print("=" * 60) +print("Format | whenever | dateutil | ab_datetime_parse | Speedup") +print("-" * 60) + +for dt_str, result in results.items(): + whenever_avg = result['whenever']['avg'] + dateutil_avg = result['dateutil']['avg'] + ab_avg = result['ab_datetime_parse']['avg'] + + whenever_str = f"{whenever_avg:.3f}ms" if whenever_avg else "N/A" + dateutil_str = f"{dateutil_avg:.3f}ms" if dateutil_avg else "N/A" + ab_str = f"{ab_avg:.3f}ms" if ab_avg else "N/A" + + if whenever_avg and dateutil_avg: + speedup = dateutil_avg / whenever_avg + speedup_str = f"{speedup:.2f}x" + else: + speedup_str = "N/A" + + print(f"{dt_str} | {whenever_str} | {dateutil_str} | {ab_str} | {speedup_str}") diff --git a/unit_tests/utils/test_datetime_helpers.py b/unit_tests/utils/test_datetime_helpers.py index 59e868632..67d2bbcd7 100644 --- a/unit_tests/utils/test_datetime_helpers.py +++ b/unit_tests/utils/test_datetime_helpers.py @@ -272,11 +272,15 @@ def test_epoch_millis(): # Formats that use the whenever parser ("2023-03-14", "whenever"), # Date-only format (1678806566, "whenever"), # Unix timestamp - # Formats that use the dateutil parser - ("2023-03-14T15:09:26Z", "dateutil"), # ISO format with T delimiter - ("2023-03-14T15:09:26+00:00", "dateutil"), # ISO format with timezone - ("2023-03-14 15:09:26", "dateutil"), # Missing T delimiter + ("2023-03-14T15:09:26Z", "whenever"), # ISO format with T delimiter + ("2023-03-14T15:09:26+00:00", "whenever"), # ISO format with timezone + ("2023-03-14T15:09:26.123456Z", "whenever"), # ISO format with microseconds + ("2023-03-14T15:09:26-04:00", "whenever"), # ISO format with non-UTC timezone + ("2023-03-14 15:09:26Z", "whenever"), # Missing T delimiter but with Z + # Formats that still use the dateutil parser + ("2023-03-14 15:09:26", "dateutil"), # Missing T delimiter and timezone ("14/03/2023 15:09:26", "dateutil"), # Different date format + ("2023/03/14T15:09:26Z", "dateutil"), # Non-standard date separator ], ) def test_datetime_parser_selection(input_value, expected_parser, monkeypatch): @@ -286,31 +290,43 @@ def test_datetime_parser_selection(input_value, expected_parser, monkeypatch): dateutil_called = False # Store original functions - original_instant_module = __import__("whenever").Instant + original_instant = __import__("whenever").Instant + original_offset_dt = __import__("whenever").OffsetDateTime original_parser_parse = parser.parse - # Create a spy for Instant.from_timestamp + # Create spies for whenever methods def spy_from_timestamp(*args, **kwargs): nonlocal whenever_called whenever_called = True - # Call the original function - return original_instant_module.from_timestamp(*args, **kwargs) + return original_instant.from_timestamp(*args, **kwargs) - # Create a spy for Instant.from_utc def spy_from_utc(*args, **kwargs): nonlocal whenever_called whenever_called = True - # Call the original function - return original_instant_module.from_utc(*args, **kwargs) + return original_instant.from_utc(*args, **kwargs) + + def spy_parse_common_iso(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + return original_instant.parse_common_iso(*args, **kwargs) + + def spy_parse_rfc3339(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + return original_instant.parse_rfc3339(*args, **kwargs) + + def spy_offset_parse_common_iso(*args, **kwargs): + nonlocal whenever_called + whenever_called = True + return original_offset_dt.parse_common_iso(*args, **kwargs) # Create a spy for parser.parse def spy_parser_parse(*args, **kwargs): nonlocal dateutil_called dateutil_called = True - # Call the original function return original_parser_parse(*args, **kwargs) - # Create a mock Instant class with our spy methods + # Create mock classes with our spy methods class MockInstant: @staticmethod def from_timestamp(*args, **kwargs): @@ -320,13 +336,30 @@ def from_timestamp(*args, **kwargs): def from_utc(*args, **kwargs): return spy_from_utc(*args, **kwargs) - # Add any other methods that might be called + @staticmethod + def parse_common_iso(*args, **kwargs): + return spy_parse_common_iso(*args, **kwargs) + + @staticmethod + def parse_rfc3339(*args, **kwargs): + return spy_parse_rfc3339(*args, **kwargs) + + @staticmethod + def py_datetime(): + return original_instant.py_datetime() + + class MockOffsetDateTime: + @staticmethod + def parse_common_iso(*args, **kwargs): + return spy_offset_parse_common_iso(*args, **kwargs) + @staticmethod def py_datetime(): - return original_instant_module.py_datetime() + return original_offset_dt.py_datetime() # Apply the mocks at the module level monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.Instant", MockInstant) + monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.OffsetDateTime", MockOffsetDateTime) monkeypatch.setattr("airbyte_cdk.utils.datetime_helpers.parser.parse", spy_parser_parse) # Parse the datetime From bd60307025d877c2f604e852294ee57ceffc3353 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 02:45:57 +0000 Subject: [PATCH 4/7] chore: Fix Ruff format and lint issues Co-Authored-By: Aaron Steers --- performance_test_datetime_parsing.py | 105 ++++++++++++++------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/performance_test_datetime_parsing.py b/performance_test_datetime_parsing.py index 6f658d451..d72eedfb1 100644 --- a/performance_test_datetime_parsing.py +++ b/performance_test_datetime_parsing.py @@ -1,6 +1,7 @@ -import time import statistics +import time from datetime import datetime, timezone + from dateutil import parser from whenever import Instant, OffsetDateTime @@ -8,15 +9,15 @@ # Test formats test_formats = [ - '2023-03-14T15:09:26Z', # ISO format with T delimiter and Z timezone - '2023-03-14T15:09:26+00:00', # ISO format with +00:00 timezone - '2023-03-14T15:09:26.123456Z', # ISO format with microseconds - '2023-03-14T15:09:26-04:00', # ISO format with non-UTC timezone - '2023-03-14 15:09:26Z', # Missing T delimiter - '2023-03-14 15:09:26', # Missing T delimiter and timezone - '2023-03-14', # Date-only format - '14/03/2023 15:09:26', # Different date format - '2023/03/14T15:09:26Z', # Non-standard date separator + "2023-03-14T15:09:26Z", # ISO format with T delimiter and Z timezone + "2023-03-14T15:09:26+00:00", # ISO format with +00:00 timezone + "2023-03-14T15:09:26.123456Z", # ISO format with microseconds + "2023-03-14T15:09:26-04:00", # ISO format with non-UTC timezone + "2023-03-14 15:09:26Z", # Missing T delimiter + "2023-03-14 15:09:26", # Missing T delimiter and timezone + "2023-03-14", # Date-only format + "14/03/2023 15:09:26", # Different date format + "2023/03/14T15:09:26Z", # Non-standard date separator ] # Number of iterations for each test @@ -31,112 +32,116 @@ for dt_str in test_formats: print(f"\nFormat: {dt_str}") - + # Test whenever parsing whenever_times = [] whenever_success = False - + # Try Instant.parse_common_iso try: # Warmup Instant.parse_common_iso(dt_str) - + for _ in range(iterations): start = time.perf_counter() Instant.parse_common_iso(dt_str) end = time.perf_counter() whenever_times.append((end - start) * 1000) - + whenever_success = True whenever_method = "Instant.parse_common_iso" except Exception: pass - + # If parse_common_iso failed, try parse_rfc3339 if not whenever_success: try: # Warmup Instant.parse_rfc3339(dt_str) - + whenever_times = [] for _ in range(iterations): start = time.perf_counter() Instant.parse_rfc3339(dt_str) end = time.perf_counter() whenever_times.append((end - start) * 1000) - + whenever_success = True whenever_method = "Instant.parse_rfc3339" except Exception: pass - + # If both Instant methods failed, try OffsetDateTime.parse_common_iso if not whenever_success: try: # Warmup OffsetDateTime.parse_common_iso(dt_str) - + whenever_times = [] for _ in range(iterations): start = time.perf_counter() OffsetDateTime.parse_common_iso(dt_str) end = time.perf_counter() whenever_times.append((end - start) * 1000) - + whenever_success = True whenever_method = "OffsetDateTime.parse_common_iso" except Exception: whenever_method = "None" - + # Test dateutil parsing dateutil_times = [] try: # Warmup parser.parse(dt_str) - + for _ in range(iterations): start = time.perf_counter() parser.parse(dt_str) end = time.perf_counter() dateutil_times.append((end - start) * 1000) - + dateutil_success = True except Exception: dateutil_success = False - + # Test ab_datetime_parse ab_times = [] try: # Warmup ab_datetime_parse(dt_str) - + for _ in range(iterations): start = time.perf_counter() ab_datetime_parse(dt_str) end = time.perf_counter() ab_times.append((end - start) * 1000) - + ab_success = True except Exception: ab_success = False - + # Print results if whenever_success: whenever_avg = statistics.mean(whenever_times) whenever_min = min(whenever_times) whenever_max = max(whenever_times) - print(f" {whenever_method}: avg={whenever_avg:.3f}ms, min={whenever_min:.3f}ms, max={whenever_max:.3f}ms") + print( + f" {whenever_method}: avg={whenever_avg:.3f}ms, min={whenever_min:.3f}ms, max={whenever_max:.3f}ms" + ) else: print(f" whenever: Failed to parse") - + if dateutil_success: dateutil_avg = statistics.mean(dateutil_times) dateutil_min = min(dateutil_times) dateutil_max = max(dateutil_times) - print(f" dateutil.parser.parse: avg={dateutil_avg:.3f}ms, min={dateutil_min:.3f}ms, max={dateutil_max:.3f}ms") + print( + f" dateutil.parser.parse: avg={dateutil_avg:.3f}ms, min={dateutil_min:.3f}ms, max={dateutil_max:.3f}ms" + ) else: print(f" dateutil.parser.parse: Failed to parse") - + if ab_success: ab_avg = statistics.mean(ab_times) ab_min = min(ab_times) @@ -144,27 +149,27 @@ print(f" ab_datetime_parse: avg={ab_avg:.3f}ms, min={ab_min:.3f}ms, max={ab_max:.3f}ms") else: print(f" ab_datetime_parse: Failed to parse") - + # Compare performance if both succeeded if whenever_success and dateutil_success: speedup = dateutil_avg / whenever_avg print(f" Performance: whenever is {speedup:.2f}x faster than dateutil") - + # Store results for summary results[dt_str] = { - 'whenever': { - 'success': whenever_success, - 'method': whenever_method, - 'avg': statistics.mean(whenever_times) if whenever_success else None, + "whenever": { + "success": whenever_success, + "method": whenever_method, + "avg": statistics.mean(whenever_times) if whenever_success else None, + }, + "dateutil": { + "success": dateutil_success, + "avg": statistics.mean(dateutil_times) if dateutil_success else None, }, - 'dateutil': { - 'success': dateutil_success, - 'avg': statistics.mean(dateutil_times) if dateutil_success else None, + "ab_datetime_parse": { + "success": ab_success, + "avg": statistics.mean(ab_times) if ab_success else None, }, - 'ab_datetime_parse': { - 'success': ab_success, - 'avg': statistics.mean(ab_times) if ab_success else None, - } } # Print summary @@ -174,18 +179,18 @@ print("-" * 60) for dt_str, result in results.items(): - whenever_avg = result['whenever']['avg'] - dateutil_avg = result['dateutil']['avg'] - ab_avg = result['ab_datetime_parse']['avg'] - + whenever_avg = result["whenever"]["avg"] + dateutil_avg = result["dateutil"]["avg"] + ab_avg = result["ab_datetime_parse"]["avg"] + whenever_str = f"{whenever_avg:.3f}ms" if whenever_avg else "N/A" dateutil_str = f"{dateutil_avg:.3f}ms" if dateutil_avg else "N/A" ab_str = f"{ab_avg:.3f}ms" if ab_avg else "N/A" - + if whenever_avg and dateutil_avg: speedup = dateutil_avg / whenever_avg speedup_str = f"{speedup:.2f}x" else: speedup_str = "N/A" - + print(f"{dt_str} | {whenever_str} | {dateutil_str} | {ab_str} | {speedup_str}") From 50bb81fa1710609d3af709ccb9824a6c5ca88878 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Thu, 6 Mar 2025 22:07:51 -0800 Subject: [PATCH 5/7] update datetime parse logic --- airbyte_cdk/utils/datetime_helpers.py | 138 +++++++++++--------------- 1 file changed, 59 insertions(+), 79 deletions(-) diff --git a/airbyte_cdk/utils/datetime_helpers.py b/airbyte_cdk/utils/datetime_helpers.py index 832a950e9..9cf06e90f 100644 --- a/airbyte_cdk/utils/datetime_helpers.py +++ b/airbyte_cdk/utils/datetime_helpers.py @@ -84,7 +84,7 @@ from datetime import datetime, timedelta, timezone from typing import Any, Optional, Union, overload -from dateutil import parser +from dateutil import parser as dateutil_parser from typing_extensions import Never from whenever import Instant, LocalDateTime, OffsetDateTime, ZonedDateTime @@ -358,7 +358,12 @@ def ab_datetime_now() -> AirbyteDateTime: return AirbyteDateTime.from_datetime(datetime.now(timezone.utc)) -def ab_datetime_parse(dt_str: str | int) -> AirbyteDateTime: +def ab_datetime_parse( + dt_str: str | int, + formats: list[str] | None = None, + *, + disallow_other_formats: bool = False, +) -> AirbyteDateTime: """Parses a datetime string or timestamp into an AirbyteDateTime with timezone awareness. This implementation is as flexible as possible to handle various datetime formats. @@ -389,89 +394,64 @@ def ab_datetime_parse(dt_str: str | int) -> AirbyteDateTime: >>> ab_datetime_parse("2023-03-14") # Date-only '2023-03-14T00:00:00+00:00' """ - try: - # Handle numeric values as Unix timestamps (UTC) - if isinstance(dt_str, int) or ( - isinstance(dt_str, str) - and (dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit())) - ): - timestamp = int(dt_str) - if timestamp < 0: - raise ValueError("Timestamp cannot be negative") - if len(str(abs(timestamp))) > 10: - raise ValueError("Timestamp value too large") - instant = Instant.from_timestamp(timestamp) - return AirbyteDateTime.from_datetime(instant.py_datetime()) - - if not isinstance(dt_str, str): - raise ValueError( - f"Could not parse datetime string: expected string or integer, got {type(dt_str)}" - ) - - # Handle date-only format first - if ":" not in dt_str and dt_str.count("-") == 2 and "/" not in dt_str: + # Handle numeric values as Unix timestamps (UTC) + if isinstance(dt_str, int): + if timestamp < 0: + raise ValueError("Timestamp cannot be negative") + if len(str(abs(timestamp))) > 10: + raise ValueError("Timestamp value too large") + instant = Instant.from_timestamp(timestamp) + return AirbyteDateTime.from_datetime(instant.py_datetime()) + + if formats: + ex_list: list[Exception] = [] + for format in formats: try: - year, month, day = map(int, dt_str.split("-")) - if not (1 <= month <= 12 and 1 <= day <= 31): - raise ValueError(f"Invalid date format: {dt_str}") - instant = Instant.from_utc(year, month, day, 0, 0, 0) - return AirbyteDateTime.from_datetime(instant.py_datetime()) - except (ValueError, TypeError): - raise ValueError(f"Invalid date format: {dt_str}") - - # Reject time-only strings without date - if ":" in dt_str and dt_str.count("-") < 2 and dt_str.count("/") < 2: - raise ValueError(f"Missing date part in datetime string: {dt_str}") - - # Try parsing standard ISO/RFC formats with whenever - # Only attempt whenever parsing for specific ISO/RFC formats - if ( - isinstance(dt_str, str) - and "/" not in dt_str # Exclude non-standard date separators - and ( - # ISO format with T delimiter and Z timezone or +00:00 timezone - (("T" in dt_str) and ("Z" in dt_str or "+" in dt_str or "-" in dt_str)) - # ISO format with space delimiter and Z timezone - or (" " in dt_str and "Z" in dt_str) + result = OffsetDateTime.strptime(dt_str, format) + except Exception as ex: + ex_list.append(ex) + else: + # No exception + return result + + if disallow_other_formats: + raise ValueError( + f"Could not parse datetime string. {str(ex_list)}" ) - ): - # First try Instant.parse_common_iso for UTC formats - try: - instant = Instant.parse_common_iso(dt_str) - return AirbyteDateTime.from_datetime(instant.py_datetime()) - except Exception: - pass - # Then try Instant.parse_rfc3339 which is more flexible - try: - instant = Instant.parse_rfc3339(dt_str) - return AirbyteDateTime.from_datetime(instant.py_datetime()) - except Exception: - pass + if not isinstance(dt_str, str): + raise ValueError( + f"Could not parse datetime string: expected string or integer, got {type(dt_str)}" + ) - # Try OffsetDateTime for non-UTC timezones - try: - offset_dt = OffsetDateTime.parse_common_iso(dt_str) - return AirbyteDateTime.from_datetime(offset_dt.py_datetime()) - except Exception: - pass + # Else, value is a string - # Fall back to dateutil for other formats + # Try parsing standard ISO/RFC formats with whenever + try: + instant = Instant.parse_common_iso(dt_str) + return AirbyteDateTime.from_datetime(instant.py_datetime()) + except Exception: + pass + + # Handle int-like strings + if ( + isinstance(dt_str, str) and ( + dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit()) + ) + ): try: - parsed = parser.parse(dt_str) - if parsed.tzinfo is None: - parsed = parsed.replace(tzinfo=timezone.utc) - - return AirbyteDateTime.from_datetime(parsed) - except (ValueError, TypeError): - raise ValueError(f"Could not parse datetime string: {dt_str}") - except ValueError as e: - if "Invalid date format:" in str(e): - raise - if "Timestamp cannot be negative" in str(e): - raise - if "Timestamp value too large" in str(e): - raise + return ab_datetime_format(int(dt_str)) + except Exception: + pass + + # Fall back to dateutil for other formats + try: + parsed = dateutil_parser.parse(dt_str) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=timezone.utc) + + return AirbyteDateTime.from_datetime(parsed) + except (ValueError, TypeError): raise ValueError(f"Could not parse datetime string: {dt_str}") From b4c5751ba3e7712166e8b0adbd4e9cbee6483744 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Thu, 6 Mar 2025 22:17:40 -0800 Subject: [PATCH 6/7] fix declaration --- airbyte_cdk/utils/datetime_helpers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/utils/datetime_helpers.py b/airbyte_cdk/utils/datetime_helpers.py index 9cf06e90f..db5c936bb 100644 --- a/airbyte_cdk/utils/datetime_helpers.py +++ b/airbyte_cdk/utils/datetime_helpers.py @@ -396,6 +396,7 @@ def ab_datetime_parse( """ # Handle numeric values as Unix timestamps (UTC) if isinstance(dt_str, int): + timestamp: int = dt_str if timestamp < 0: raise ValueError("Timestamp cannot be negative") if len(str(abs(timestamp))) > 10: From 8e9f982f91a956d1d9412104d13debc8f46da484 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Thu, 6 Mar 2025 22:28:58 -0800 Subject: [PATCH 7/7] improve handling --- airbyte_cdk/utils/datetime_helpers.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/utils/datetime_helpers.py b/airbyte_cdk/utils/datetime_helpers.py index db5c936bb..c3c49ed1b 100644 --- a/airbyte_cdk/utils/datetime_helpers.py +++ b/airbyte_cdk/utils/datetime_helpers.py @@ -404,6 +404,11 @@ def ab_datetime_parse( instant = Instant.from_timestamp(timestamp) return AirbyteDateTime.from_datetime(instant.py_datetime()) + if not isinstance(dt_str, str): + raise ValueError( + f"Could not parse datetime string: expected string or integer, got {type(dt_str)}" + ) + if formats: ex_list: list[Exception] = [] for format in formats: @@ -417,14 +422,9 @@ def ab_datetime_parse( if disallow_other_formats: raise ValueError( - f"Could not parse datetime string. {str(ex_list)}" + f"Could not parse datetime string with provided formats [{formats}]: {str(ex_list)}" ) - if not isinstance(dt_str, str): - raise ValueError( - f"Could not parse datetime string: expected string or integer, got {type(dt_str)}" - ) - # Else, value is a string # Try parsing standard ISO/RFC formats with whenever @@ -436,9 +436,7 @@ def ab_datetime_parse( # Handle int-like strings if ( - isinstance(dt_str, str) and ( - dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit()) - ) + dt_str.isdigit() or (dt_str.startswith("-") and dt_str[1:].isdigit()) ): try: return ab_datetime_format(int(dt_str)) @@ -453,7 +451,7 @@ def ab_datetime_parse( return AirbyteDateTime.from_datetime(parsed) except (ValueError, TypeError): - raise ValueError(f"Could not parse datetime string: {dt_str}") + raise ValueError(f"Could not parse datetime string: {dt_str}, ({type(dt_str).__name__})") def ab_datetime_try_parse(dt_str: str) -> AirbyteDateTime | None: