Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions preprocessing/series_semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,23 @@ def check_type(self, series: pl.Series):
return self.column_type(series.dtype)


def parse_time_military(s: pl.Series) -> pl.Series:
    """Parse time-of-day strings into ``pl.Time``, trying several formats.

    Despite the name, both 24-hour (``HH:MM:SS``, ``HH:MM``) and 12-hour
    (``HH:MM:SS AM/PM``, ``HH:MM AM/PM``) layouts are attempted.  Results are
    merged per value: each element keeps the parse from the first format
    that matched it, so a column mixing layouts is not truncated to whichever
    format happened to be tried first.

    Args:
        s: Series of strings to parse.

    Returns:
        A ``pl.Time`` series with the same length as ``s``.  Values that
        match none of the formats are null; if nothing parses at all, an
        all-null ``pl.Time`` series is returned.
    """
    formats_to_try = ("%H:%M:%S", "%H:%M", "%I:%M:%S %p", "%I:%M %p")

    merged = None
    for fmt in formats_to_try:
        try:
            candidate = s.str.strptime(pl.Time, format=fmt, strict=False)
        except Exception:
            # Best-effort parsing: a format that cannot be applied is simply
            # skipped.  Unlike a bare ``except:``, this lets
            # KeyboardInterrupt / SystemExit propagate.
            continue

        if merged is None:
            merged = candidate
        else:
            # Keep values that already parsed; fill the remaining gaps from
            # the current format's result.
            merged = merged.zip_with(merged.is_not_null(), candidate)

        if merged.null_count() == 0:
            # Everything parsed; later formats cannot add anything.
            break

    if merged is None or merged.is_not_null().sum() == 0:
        # No format matched any value: return nulls of the expected dtype.
        return pl.Series([None] * s.len(), dtype=pl.Time)
    return merged


def parse_datetime_with_tz(s: pl.Series) -> pl.Series:
"""Parse datetime strings with timezone info (both abbreviations and offsets)"""
import warnings
Expand Down Expand Up @@ -97,6 +114,14 @@ def parse_datetime_with_tz(s: pl.Series) -> pl.Series:
data_type="datetime",
)

# Semantic for string columns holding time-of-day values.  Conversion is
# delegated to parse_time_military; a value is considered valid when it
# parsed to a non-null result.
time_military = SeriesSemantic(
    semantic_name="time_military",
    data_type="datetime",
    column_type=pl.String,
    try_convert=parse_time_military,
    validate_result=lambda s: s.is_not_null(),
)

datetime_string = SeriesSemantic(
semantic_name="datetime",
column_type=pl.String,
Expand Down Expand Up @@ -191,6 +216,7 @@ def parse_datetime_with_tz(s: pl.Series) -> pl.Series:
datetime_string,
date_string,
time_string,
time_military,
timestamp_seconds,
timestamp_milliseconds,
url,
Expand Down
26 changes: 26 additions & 0 deletions preprocessing/test_series_semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
native_date,
native_datetime,
parse_datetime_with_tz,
parse_time_military,
text_catch_all,
time_string,
)
Expand Down Expand Up @@ -122,6 +123,21 @@ def test_parse_datetime_with_tz():
assert result.is_not_null().all()


def test_parse_time_military():
    """Check that 24-hour strings, with and without seconds, parse to pl.Time."""
    samples_by_layout = {
        # 24-hour format without seconds (HH:MM)
        "HH:MM": ["23:39", "12:45", "00:30"],
        # 24-hour format with seconds (HH:MM:SS)
        "HH:MM:SS": ["14:30:15", "09:15:30", "23:59:59"],
    }

    for raw_values in samples_by_layout.values():
        parsed = parse_time_military(pl.Series(raw_values))
        assert parsed.dtype == pl.Time
        assert parsed.is_not_null().all()


def test_parse_datetime_with_tz_no_timezone():
"""Test datetime parsing without timezone suffix"""
series = pl.Series(["2025-02-28 00:36:15", "2025-02-28 00:36:13"])
Expand Down Expand Up @@ -186,6 +202,16 @@ def test_parse_datetime_mixed_timezones_warning():
assert result.is_not_null().all()


def test_time_military_semantic_inference():
    """Verify that a column of HH:MM strings is inferred as ``time_military``."""
    hh_mm_values = ["23:47", "14:30", "09:15", "00:00", "12:45"]

    inferred = infer_series_semantic(pl.Series(hh_mm_values))

    # Inference must yield a semantic, and it must be the time one.
    assert inferred is not None
    assert inferred.semantic_name == "time_military"
    assert inferred.data_type == "datetime"


# Edge cases
def test_all_none_series():
"""Test series with all null values"""
Expand Down
Loading