Skip to content

Commit dfca9c5

Browse files
Fix get_date_time() for backdated liberator ingestion (#2417)
* amend get_date_time and tests * fixup: pattern with time * formatting
1 parent 7d32aee commit dfca9c5

File tree

2 files changed

+91
-9
lines changed

2 files changed

+91
-9
lines changed

lambdas/rds_snapshot_export_s3_to_s3_copier/main.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,28 @@ def get_date_time(source_identifier: str) -> tuple[str, str, str, str]:
1919
source_identifier (str): source identifier taken from the
2020
event, as implemented this will include datetime for the snapshot as
2121
applicable in the form sql-to-parquet-yy-mm-dd-hhmmss or
22-
sql-to-parquet-yy-mm-dd-hhmmss-backdated
22+
sql-to-parquet-yyyy-mm-dd-backdated
2323
2424
Returns:
2525
tuple(str, str, str, str): year, month, day, date
2626
"""
2727

28-
pattern = r"^sql-to-parquet-(\d{2})-(\d{2})-(\d{2})-(\d{6})(-backdated)?$"
29-
30-
if not re.match(pattern, source_identifier):
28+
pattern_with_time = r"^sql-to-parquet-(\d{2})-(\d{2})-(\d{2})-(\d{6})$"
29+
pattern_backdated = r"^sql-to-parquet-(\d{4})-(\d{2})-(\d{2})-backdated$"
30+
31+
if re.match(pattern_with_time, source_identifier):
32+
split_identifier = source_identifier.split("-")
33+
day = split_identifier[5]
34+
month = split_identifier[4]
35+
year = "20" + split_identifier[3]
36+
elif re.match(pattern_backdated, source_identifier):
37+
split_identifier = source_identifier.split("-")
38+
day = split_identifier[5]
39+
month = split_identifier[4]
40+
year = split_identifier[3]
41+
else:
3142
raise ValueError("Invalid source identifier format")
3243

33-
split_identifier = source_identifier.split("-")
34-
day = split_identifier[5]
35-
month = split_identifier[4]
36-
year = "20" + split_identifier[3]
37-
3844
date = f"{year}{month}{day}"
3945
return year, month, day, date
4046

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import pytest
2+
from main import get_date_time
3+
4+
5+
class TestGetDateTime:
6+
def test_get_date_time_yy_format_with_time(self):
7+
source_identifier = "sql-to-parquet-23-12-25-143000"
8+
year, month, day, date = get_date_time(source_identifier)
9+
10+
assert year == "2023"
11+
assert month == "12"
12+
assert day == "25"
13+
assert date == "20231225"
14+
15+
def test_get_date_time_yyyy_format_backdated(self):
16+
source_identifier = "sql-to-parquet-2023-12-25-backdated"
17+
year, month, day, date = get_date_time(source_identifier)
18+
19+
assert year == "2023"
20+
assert month == "12"
21+
assert day == "25"
22+
assert date == "20231225"
23+
24+
def test_get_date_time_yyyy_format_backdated_different_date(self):
25+
source_identifier = "sql-to-parquet-2024-01-15-backdated"
26+
year, month, day, date = get_date_time(source_identifier)
27+
28+
assert year == "2024"
29+
assert month == "01"
30+
assert day == "15"
31+
assert date == "20240115"
32+
33+
def test_get_date_time_yy_format_different_time(self):
34+
source_identifier = "sql-to-parquet-24-03-10-090000"
35+
year, month, day, date = get_date_time(source_identifier)
36+
37+
assert year == "2024"
38+
assert month == "03"
39+
assert day == "10"
40+
assert date == "20240310"
41+
42+
def test_get_date_time_invalid_format_raises_error(self):
43+
invalid_identifiers = [
44+
"sql-to-parquet-2023-12-25", # Missing -backdated for yyyy format
45+
"sql-to-parquet-23-12-25", # Missing time for yy format
46+
"sql-to-parquet-23-12-25-143000-backdated", # Invalid: yy format cannot have -backdated
47+
"invalid-format-23-12-25-143000", # Wrong prefix
48+
"sql-to-parquet-23-12-25-14300", # Wrong time format (5 digits)
49+
"sql-to-parquet-23-12-25-1430000", # Wrong time format (7 digits)
50+
"sql-to-parquet-2023-12-backdated", # Missing day
51+
"sql-to-parquet-123-12-25-143000", # 3-digit year
52+
]
53+
54+
for invalid_id in invalid_identifiers:
55+
with pytest.raises(ValueError, match="Invalid source identifier format"):
56+
get_date_time(invalid_id)
57+
58+
def test_get_date_time_edge_cases(self):
59+
# Test with single digit month/day (should still work with zero padding)
60+
source_identifier = "sql-to-parquet-23-01-05-000000"
61+
year, month, day, date = get_date_time(source_identifier)
62+
63+
assert year == "2023"
64+
assert month == "01"
65+
assert day == "05"
66+
assert date == "20230105"
67+
68+
def test_get_date_time_leap_year(self):
69+
# Test leap year date
70+
source_identifier = "sql-to-parquet-2024-02-29-backdated"
71+
year, month, day, date = get_date_time(source_identifier)
72+
73+
assert year == "2024"
74+
assert month == "02"
75+
assert day == "29"
76+
assert date == "20240229"

0 commit comments

Comments
 (0)