From 26ada90c835f9b4f678d1dea539978695d659f16 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Wed, 12 Nov 2025 23:10:21 -0500 Subject: [PATCH 1/9] Initial tests --- pandas/tests/tools/test_to_datetime.py | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 67b1e8668e5f6..caa5b8f6fda48 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3819,3 +3819,33 @@ def test_to_datetime_lxml_elementunicoderesult_with_format(cache): out = to_datetime(Series([val]), format="%Y-%m-%d %H:%M:%S", cache=cache) assert out.iloc[0] == Timestamp(s) + + +class TestForIncreasedRobustness: + def test_parse_with_no_malformed_components(self): + res = to_datetime( + "2018-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f" + ) + expected = Timestamp("2018-10-01 12:00:00.0000000011") + assert res == expected + + def test_parse_with_malformed_day(self): + res = to_datetime( + "2018-10-. 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f" + ) + expected = NaT + assert res == expected + + def test_parse_with_malformed_day_iso(self): + res = to_datetime("2018-10-.", format="ISO8601") + expected = NaT + assert res == expected + + def test_parse_with_half_malformed_components(self): + res = to_datetime("2018-10-. 12:.:.", format="%Y-%m-%d %H:%M:%S") + expected = NaT + assert res == expected + + def test_parse_with_too_many_malformed_components(self): + with pytest.raises(ValueError, match="^time data *"): + to_datetime("2018-.-. 12:.:.", format="%Y-%m-%d %H:%M:%S") From 1c10bbf6fd0e182f113675bf62646ebd0eaa9591 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Tue, 18 Nov 2025 22:51:48 -0500 Subject: [PATCH 2/9] Year parsing and added tests Year parsing, added tests Year parsing, added tests --- .../include/pandas/datetime/pd_datetime.h | 7 +- .../numpy/datetime/np_datetime_strings.h | 3 +- .../numpy/datetime/np_datetime_strings.c | 545 ++++++++++++++++-- pandas/_libs/tslibs/np_datetime.pxd | 3 +- pandas/_libs/tslibs/np_datetime.pyx | 5 +- pandas/_libs/tslibs/strptime.pyx | 34 +- pandas/core/tools/datetimes.py | 46 +- pandas/tests/tools/test_to_datetime.py | 69 ++- 8 files changed, 634 insertions(+), 78 deletions(-) diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index 98e5521af2506..fe4d19c39b81d 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -48,7 +48,7 @@ typedef struct { PyArray_DatetimeMetaData (*get_datetime_metadata_from_dtype)(PyArray_Descr *); int (*parse_iso_8601_datetime)(const char *, int, int, npy_datetimestruct *, NPY_DATETIMEUNIT *, int *, int *, const char *, - int, FormatRequirement); + int, FormatRequirement, double); int (*get_datetime_iso_8601_strlen)(int, NPY_DATETIMEUNIT); int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, size_t, int, NPY_DATETIMEUNIT); @@ -94,10 +94,11 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL; PandasDateTimeAPI->get_datetime_metadata_from_dtype((dtype)) #define parse_iso_8601_datetime(str, len, want_exc, out, out_bestunit, \ out_local, out_tzoffset, format, format_len, \ - format_requirement) \ + format_requirement, threshold) \ PandasDateTimeAPI->parse_iso_8601_datetime( \ (str), (len), (want_exc), (out), (out_bestunit), (out_local), \ - (out_tzoffset), (format), (format_len), (format_requirement)) + (out_tzoffset), (format), (format_len), (format_requirement), \ + (threshold)) #define get_datetime_iso_8601_strlen(local, base) \ PandasDateTimeAPI->get_datetime_iso_8601_strlen((local), (base)) #define make_iso_8601_datetime(dts, outstr, outlen, utc, base) \ diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h index 75e69f30ada1e..53f1c2d05e213 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h @@ -67,7 +67,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char *format, int format_len, - FormatRequirement format_requirement); + FormatRequirement format_requirement, + double threshold); /* * Provides a string length to use for converting datetime diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index a46f5bc467c5d..b10e907d6021a 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -82,6 +82,7 @@ static DatetimePartParseResult compare_format(const char **format, int *characters_remaining, const char *compare_to, int n, const FormatRequirement format_requirement) { + printf("\n%s\n%s\n", *format, compare_to); if (format_requirement == INFER_FORMAT) { return COMPARISON_SUCCESS; } @@ -112,7 +113,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char *format, int format_len, - FormatRequirement format_requirement) { + FormatRequirement format_requirement, + double threshold) { if (len < 0 || format_len < 0) goto parse_error; int year_leap = 0; @@ -146,6 +148,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr = str; sublen = len; + int invalid_components = 0; + int valid_components = 0; + /* Skip leading whitespace */ while (sublen > 0 && isspace(*substr)) { ++substr; @@ -170,11 +175,27 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ + printf("Start: "); + printf("%s", str); + printf("\n"); comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); + + int to_month = 0; + if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr + 1)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto find_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } @@ -186,6 +207,63 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 4; sublen -= 4; + } else if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2]) && !isdigit(substr[3])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[3] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + substr += 3; + sublen -= 3; + to_month = 1; + goto find_sep; + } + } else if (sublen >= 3 && isdigit(substr[0]) && isdigit(substr[1]) && + !isdigit(substr[2])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[2] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + substr += 2; + sublen -= 2; + to_month = 1; + goto find_sep; + } + goto find_sep; + } else if (sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[1] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + substr += 1; + sublen -= 1; + to_month = 1; + goto find_sep; + } + } else if (sublen >= 1 && !isdigit(substr[0])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[0] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + to_month = 1; + goto find_sep; + } } /* Negate the year if necessary */ @@ -201,12 +279,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, *out_local = 0; } if (format_len) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr + 1)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto find_sep; } bestunit = NPY_FR_Y; + valid_components++; goto finish; } +find_sep: if (!isdigit(*substr)) { for (i = 0; i < valid_ymd_sep_len; ++i) { if (*substr == valid_ymd_sep[i]) { @@ -214,52 +303,118 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } if (i == valid_ymd_sep_len) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; } has_ymd_sep = 1; ymd_sep = valid_ymd_sep[i]; + printf("Sep: %c\n", ymd_sep); ++substr; --sublen; + printf("Before!: %c\n", ymd_sep); comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + if (to_month) { + goto month; + } + if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto month; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* Cannot have trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { + if (!isdigit(*substr)) { + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto month; + } else if (sublen == 0) { goto parse_error; } } + valid_components++; /* PARSE THE MONTH */ +month: + printf("\nI-V after year-parsing: %d-%d\n", invalid_components, + valid_components); comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ out->month = (*substr - '0'); ++substr; --sublen; + /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->month = 10 * out->month + (*substr - '0'); ++substr; --sublen; } else if (!has_ymd_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } if (out->month < 1 || out->month > 12) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; } - goto error; + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } /* Next character must be the separator, start of day, or end of string */ @@ -267,44 +422,105 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, bestunit = NPY_FR_M; /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ if (!has_ymd_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } if (format_len) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } if (out_local != NULL) { *out_local = 0; } + valid_components++; goto finish; } if (has_ymd_sep) { /* Must have separator, but cannot be trailing */ if (*substr != ymd_sep || sublen == 1) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } ++substr; --sublen; comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto day; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } } + valid_components++; /* PARSE THE DAY */ +day: + printf("\nI-V after month-parsing: %d-%d\n", invalid_components, + valid_components); comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ if (!isdigit(*substr)) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } out->day = (*substr - '0'); ++substr; @@ -315,15 +531,27 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; } else if (!has_ymd_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } if (out->day < 1 || out->day > days_per_month_table[year_leap][out->month - 1]) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; } - goto error; + goto hour; } /* Next character must be a 'T', ' ', or end of string */ @@ -332,36 +560,84 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, *out_local = 0; } if (format_len) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } bestunit = NPY_FR_D; + valid_components++; goto finish; } if ((*substr != 'T' && *substr != ' ') || sublen == 1) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } comparison = compare_format(&format, &format_len, substr, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } ++substr; --sublen; + valid_components++; /* PARSE THE HOURS */ +hour: + printf("\nI-V after day-parsing: %d-%d\n", invalid_components, + valid_components); + fflush(stdout); comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ if (!isdigit(*substr)) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } out->hour = (*substr - '0'); bestunit = NPY_FR_h; @@ -374,23 +650,44 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (out->hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", str); + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; } - goto error; + goto minute; } } /* Next character must be a ':' or the end of the string */ if (sublen == 0) { if (!hour_was_2_digits) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } if (format_len) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } bestunit = NPY_FR_h; + valid_components++; goto finish; } @@ -400,28 +697,65 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; /* Cannot have a trailing separator */ if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } comparison = compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } + valid_components++; goto parse_timezone; } + valid_components++; /* PARSE THE MINUTES */ +minute: comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ @@ -435,21 +769,42 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (out->min >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", str); + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; } - goto error; + if (sublen == 0) { + goto finish; + } + goto second; } } else if (!has_hms_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } if (sublen == 0) { bestunit = NPY_FR_m; if (format_len) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } + valid_components++; goto finish; } @@ -459,27 +814,56 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } ++substr; --sublen; /* Cannot have a trailing ':' */ if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } } else if (!has_hms_sep && isdigit(*substr)) { } else { + valid_components++; goto parse_timezone; } + valid_components++; /* PARSE THE SECONDS */ +second: comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto microsecond; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ @@ -492,14 +876,26 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (out->sec >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", str); + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; } - goto error; + if (sublen == 0) { + goto finish; + } + goto microsecond; } } else if (!has_hms_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto microsecond; } /* Next character may be a '.' indicating fractional seconds */ @@ -509,21 +905,42 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ".", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto microsecond; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } } else { bestunit = NPY_FR_s; + valid_components++; goto parse_timezone; } + valid_components++; /* PARSE THE MICROSECONDS (0 to 6 digits) */ +microsecond: comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto picosecond; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } numdigits = 0; @@ -543,10 +960,13 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } else { bestunit = NPY_FR_ms; } + valid_components++; goto parse_timezone; } + valid_components++; /* PARSE THE PICOSECONDS (0 to 6 digits) */ +picosecond: numdigits = 0; for (i = 0; i < 6; ++i) { out->ps *= 10; @@ -564,8 +984,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } else { bestunit = NPY_FR_ns; } + valid_components++; goto parse_timezone; } + valid_components++; /* PARSE THE ATTOSECONDS (0 to 6 digits) */ numdigits = 0; @@ -584,6 +1006,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } else { bestunit = NPY_FR_fs; } + valid_components++; parse_timezone: /* trim any whitespace between time/timezone */ @@ -739,6 +1162,20 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } finish: + printf("\nI-V at end: %d-%d\n", invalid_components, valid_components); + if (invalid_components > 0 && + (double)valid_components / (valid_components + invalid_components) >= + threshold) { + return -2; // sentinel for NaT + } + + if ((double)valid_components / (valid_components + invalid_components) < + threshold) { + printf("Bad 1\n%s\n%d\n%d\n", str, valid_components, invalid_components); + fflush(stdout); + goto parse_error; // threshold not met, raise exception + } + if (out_bestunit != NULL) { *out_bestunit = bestunit; } diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 3e5654b70cd92..7732223a7e886 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -90,7 +90,8 @@ cdef int string_to_dts( int* out_tzoffset, bint want_exc, str format = *, - bint exact = * + bint exact = *, + double threshold = *, ) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 0fc7a6945d2e0..3d64c9c8ed89a 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -65,7 +65,7 @@ cdef extern from "pandas/datetime/pd_datetime.h": NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char *format, int format_len, - FormatRequirement exact) + FormatRequirement exact, double threshold) # ---------------------------------------------------------------------- # numpy object inspection @@ -348,6 +348,7 @@ cdef int string_to_dts( bint want_exc, str format=None, bint exact=True, + double threshold = 1.0, ) except? -1: cdef: Py_ssize_t length @@ -367,7 +368,7 @@ cdef int string_to_dts( return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, format_buf, format_length, - format_requirement) + format_requirement, threshold) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d22150f9aefab..7209f02bb5bae 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -355,6 +355,7 @@ def array_strptime( errors="raise", bint utc=False, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, + double threshold = 1.0, ): """ Calculates the datetime structs represented by the passed array of strings @@ -367,6 +368,23 @@ def array_strptime( errors : string specifying error handling, {'raise', 'coerce'} creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC Set to NPY_FR_GENERIC to infer a resolution. + + /// INSERT DOCUMENTATION UPDATE HERE /// + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## """ cdef: @@ -453,15 +471,23 @@ def array_strptime( out_tzoffset = 0 if fmt == "ISO8601": - string_to_dts_succeeded = not string_to_dts( + res = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, None, False + &out_tzoffset, False, None, False, threshold ) + if res == -2: # sentinel for NaT + iresult[i] = NPY_NAT + continue + string_to_dts_succeeded = not res elif iso_format: - string_to_dts_succeeded = not string_to_dts( + res = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, fmt, exact + &out_tzoffset, False, fmt, exact, threshold ) + string_to_dts_succeeded = not res + if res == -2: # sentinel for NaT + iresult[i] = NPY_NAT + continue if string_to_dts_succeeded: # No error reported by string_to_dts, pick back up # where we left off diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a2c18ccb59899..6ea91c317a54f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -327,6 +327,7 @@ def _convert_listlike_datetimes( dayfirst: bool | None = None, yearfirst: bool | None = None, exact: bool = True, + threshold: float = 1.0, ): """ Helper function for to_datetime. Performs the conversions of 1D listlike @@ -351,6 +352,23 @@ def _convert_listlike_datetimes( exact : bool, default True exact format matching behavior from to_datetime + /// INSERT DOCUMENTATION UPDATE HERE /// + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + Returns ------- Index-like of parsed dates @@ -432,7 +450,9 @@ def _convert_listlike_datetimes( # `format` could be inferred, or user didn't ask for mixed-format parsing. if format is not None and format != "mixed": - return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) + return _array_strptime_with_fallback( + arg, name, utc, format, exact, errors, threshold + ) result, tz_parsed = objects_to_datetime64( arg, @@ -462,11 +482,14 @@ def _array_strptime_with_fallback( fmt: str, exact: bool, errors: str, + threshold: float = 1.0, ) -> Index: """ Call array_strptime, with fallback behavior depending on 'errors'. """ - result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + result, tz_out = array_strptime( + arg, fmt, exact=exact, errors=errors, utc=utc, threshold=threshold + ) if tz_out is not None: unit = np.datetime_data(result.dtype)[0] dtype = DatetimeTZDtype(tz=tz_out, unit=unit) @@ -682,6 +705,7 @@ def to_datetime( unit: str | None = None, origin: str = "unix", cache: bool = True, + threshold: float = 1.0, ) -> DatetimeIndex | Series | DatetimeScalar | NaTType: """ Convert argument to datetime. @@ -791,6 +815,23 @@ def to_datetime( out-of-bounds values will render the cache unusable and may slow down parsing. + /// INSERT DOCUMENTATION UPDATE HERE /// + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + ######################## + Returns ------- datetime @@ -1008,6 +1049,7 @@ def to_datetime( yearfirst=yearfirst, errors=errors, exact=exact, # type: ignore[arg-type] + threshold=threshold, ) result: Timestamp | NaTType | Series | Index diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index caa5b8f6fda48..8ba97495b3103 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3824,28 +3824,75 @@ def test_to_datetime_lxml_elementunicoderesult_with_format(cache): class TestForIncreasedRobustness: def test_parse_with_no_malformed_components(self): res = to_datetime( - "2018-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f" + "2018-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, ) expected = Timestamp("2018-10-01 12:00:00.0000000011") assert res == expected + res = to_datetime( + "2018-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=1.0, + ) + assert res == expected + + def test_parse_with_malformed_year(self): + res = to_datetime( + "12-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", threshold=0.5 + ) + assert isna(res) + + def test_parse_with_malformed_year_iso(self): + res = to_datetime("12-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + """def test_parse_with_malformed_month(self): + res = to_datetime( + "2018-202-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_malformed_month_iso(self): + res = to_datetime("2018-202-01", format="ISO8601", threshold=0.5) + assert isna(res) def test_parse_with_malformed_day(self): res = to_datetime( - "2018-10-. 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f" + "2018-10-202 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, ) - expected = NaT - assert res == expected + assert isna(res) def test_parse_with_malformed_day_iso(self): - res = to_datetime("2018-10-.", format="ISO8601") - expected = NaT - assert res == expected + res = to_datetime("2018-10-202", format="ISO8601", threshold=0.5) + assert isna(res) def test_parse_with_half_malformed_components(self): - res = to_datetime("2018-10-. 12:.:.", format="%Y-%m-%d %H:%M:%S") - expected = NaT - assert res == expected + res = to_datetime( + "2018-10-202 12:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ) + assert isna(res) def test_parse_with_too_many_malformed_components(self): with pytest.raises(ValueError, match="^time data *"): - to_datetime("2018-.-. 12:.:.", format="%Y-%m-%d %H:%M:%S") + to_datetime( + "2018-202-202 12:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ) + + def test_parse_with_too_many_malformed_components_all(self): + with pytest.raises(ValueError, match="^time data *"): + to_datetime( + "2018-10-202 12:00:00", format="%Y-%m-%d %H:%M:%S", threshold=1.0 + ) + + def test_parse_with_too_many_malformed_components_iso(self): + with pytest.raises(ValueError, match="^time data *"): + to_datetime("18-10-202", format="ISO8601", threshold=0.5) + + def test_parse_with_too_many_malformed_components_iso_all(self): + with pytest.raises(ValueError, match="^time data *"): + to_datetime("2018-10-202", format="ISO8601", threshold=1.0)""" From 334ec7a85a731a871fbaf28bd0c7f61861814366 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Thu, 20 Nov 2025 19:46:00 -0500 Subject: [PATCH 3/9] Increased robustness -- full parsing --- .../numpy/datetime/np_datetime_strings.c | 350 ++++++++++++------ pandas/tests/tools/test_to_datetime.py | 273 +++++++++++++- 2 files changed, 498 insertions(+), 125 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index b10e907d6021a..fa665f5ea19d0 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -82,7 +82,6 @@ static DatetimePartParseResult compare_format(const char **format, int *characters_remaining, const char *compare_to, int n, const FormatRequirement format_requirement) { - printf("\n%s\n%s\n", *format, compare_to); if (format_requirement == INFER_FORMAT) { return COMPARISON_SUCCESS; } @@ -175,9 +174,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ - printf("Start: "); - printf("%s", str); - printf("\n"); + printf("Start: %s\n", str); comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); @@ -266,6 +263,27 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } + /* Invalidates the component if there is more than 4 digits */ + int still_more = 1; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (*substr == valid_ymd_sep[i]) { + still_more = 0; + break; + } + } + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto find_sep; + } + /* Negate the year if necessary */ if (str[0] == '-') { out->year = -out->year; @@ -303,7 +321,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } if (i == valid_ymd_sep_len) { - invalid_components++; + if (invalid_components + valid_components < 1) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -315,11 +334,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } has_ymd_sep = 1; ymd_sep = valid_ymd_sep[i]; - printf("Sep: %c\n", ymd_sep); ++substr; --sublen; - printf("Before!: %c\n", ymd_sep); comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (to_month) { @@ -327,7 +344,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if (comparison == COMPARISON_ERROR) { - invalid_components++; + if (invalid_components + valid_components < 1) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -337,12 +355,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto month; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; + if (invalid_components + valid_components < 1) + valid_components++; goto finish; } /* Cannot have trailing separator */ if (!isdigit(*substr)) { - invalid_components++; + if (invalid_components + valid_components < 1) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -355,7 +375,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } } - valid_components++; + if (invalid_components + valid_components < 1) + valid_components++; /* PARSE THE MONTH */ month: @@ -390,6 +411,31 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->month = 10 * out->month + (*substr - '0'); ++substr; --sublen; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (*substr == valid_ymd_sep[i]) { + still_more = 0; + break; + } + } + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + comparison = compare_format(&format, &format_len, &ymd_sep, 1, + format_requirement); + goto month_sep; + } + } } else if (!has_ymd_sep) { invalid_components++; while (sublen > 0 && !isdigit(*substr)) { @@ -401,28 +447,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); - goto day; + goto month_sep; } if (out->month < 1 || out->month > 12) { invalid_components++; - while (sublen > 0 && !isdigit(*substr)) { - substr++; - sublen--; - } - if (sublen == 0) { - goto finish; - } - comparison = - compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); - goto day; + goto month_sep; } +month_sep: /* Next character must be the separator, start of day, or end of string */ if (sublen == 0) { bestunit = NPY_FR_M; /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ if (!has_ymd_sep) { - invalid_components++; + if (invalid_components + valid_components < 2) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -435,7 +474,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto day; } if (format_len) { - invalid_components++; + if (invalid_components + valid_components < 2) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -450,14 +490,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_local != NULL) { *out_local = 0; } - valid_components++; + if (invalid_components + valid_components < 2) + valid_components++; goto finish; } if (has_ymd_sep) { /* Must have separator, but cannot be trailing */ if (*substr != ymd_sep || sublen == 1) { - invalid_components++; + if (invalid_components + valid_components < 2) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -474,7 +516,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - invalid_components++; + if (invalid_components + valid_components < 2) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -484,11 +527,13 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto day; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; + if (invalid_components + valid_components < 2) + valid_components++; goto finish; } } - valid_components++; + if (invalid_components + valid_components < 2) + valid_components++; /* PARSE THE DAY */ day: @@ -505,7 +550,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto hour; + goto day_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { valid_components++; goto finish; @@ -520,7 +565,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto hour; + goto day_sep; } out->day = (*substr - '0'); ++substr; @@ -530,6 +575,31 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->day = 10 * out->day + (*substr - '0'); ++substr; --sublen; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (*substr == valid_ymd_sep[i]) { + still_more = 0; + break; + } + } + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + comparison = compare_format(&format, &format_len, &ymd_sep, 1, + format_requirement); + goto day_sep; + } + } } else if (!has_ymd_sep) { invalid_components++; while (sublen > 0 && !isdigit(*substr)) { @@ -539,28 +609,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto hour; + goto day_sep; } if (out->day < 1 || out->day > days_per_month_table[year_leap][out->month - 1]) { invalid_components++; - while (sublen > 0 && !isdigit(*substr)) { - substr++; - sublen--; - } - if (sublen == 0) { - goto finish; - } - goto hour; + goto day_sep; } +day_sep: /* Next character must be a 'T', ' ', or end of string */ if (sublen == 0) { if (out_local != NULL) { *out_local = 0; } if (format_len) { - invalid_components++; + if (invalid_components + valid_components < 3) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -576,7 +641,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if ((*substr != 'T' && *substr != ' ') || sublen == 1) { - invalid_components++; + if (invalid_components + valid_components < 3) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -589,7 +655,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, substr, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - invalid_components++; + if (invalid_components + valid_components < 3) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -599,18 +666,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto hour; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; + if (invalid_components + valid_components < 3) + valid_components++; goto finish; } ++substr; --sublen; - valid_components++; + if (invalid_components + valid_components < 3) + valid_components++; /* PARSE THE HOURS */ hour: printf("\nI-V after day-parsing: %d-%d\n", invalid_components, valid_components); - fflush(stdout); comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -622,7 +690,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto minute; + goto hour_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { valid_components++; goto finish; @@ -637,7 +705,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto minute; + goto hour_sep; } out->hour = (*substr - '0'); bestunit = NPY_FR_h; @@ -649,23 +717,39 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->hour = 10 * out->hour + (*substr - '0'); ++substr; --sublen; - if (out->hour >= 24) { - invalid_components++; - while (sublen > 0 && !isdigit(*substr)) { - substr++; - sublen--; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + if (!isdigit(substr[0])) { + still_more = 0; } - if (sublen == 0) { - goto finish; + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto hour_sep; } - goto minute; + } + + if (out->hour >= 24) { + invalid_components++; + goto hour_sep; } } +hour_sep: /* Next character must be a ':' or the end of the string */ if (sublen == 0) { if (!hour_was_2_digits) { - invalid_components++; + if (invalid_components + valid_components < 4) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -676,7 +760,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto minute; } if (format_len) { - invalid_components++; + if (invalid_components + valid_components < 4) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -687,7 +772,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto minute; } bestunit = NPY_FR_h; - valid_components++; + if (invalid_components + valid_components < 4) + valid_components++; goto finish; } @@ -697,7 +783,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; /* Cannot have a trailing separator */ if (sublen == 0 || !isdigit(*substr)) { - invalid_components++; + if (invalid_components + valid_components < 4) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -710,7 +797,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - invalid_components++; + if (invalid_components + valid_components < 4) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -720,12 +808,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto minute; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; + if (invalid_components + valid_components < 4) + valid_components++; goto finish; } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { - invalid_components++; + if (invalid_components + valid_components < 4) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -735,13 +825,17 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto minute; } - valid_components++; + if (invalid_components + valid_components < 4) + valid_components++; goto parse_timezone; } - valid_components++; + if (invalid_components + valid_components < 4) + valid_components++; /* PARSE THE MINUTES */ minute: + printf("\nI-V after hour-parsing: %d-%d\n", invalid_components, + valid_components); comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -753,7 +847,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto second; + goto minute_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { valid_components++; goto finish; @@ -768,16 +862,30 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->min = 10 * out->min + (*substr - '0'); ++substr; --sublen; - if (out->min >= 60) { - invalid_components++; - while (sublen > 0 && !isdigit(*substr)) { - substr++; - sublen--; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + if (!isdigit(substr[0])) { + still_more = 0; } - if (sublen == 0) { - goto finish; + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto minute_sep; } - goto second; + } + + if (out->min >= 60) { + invalid_components++; + goto minute_sep; } } else if (!has_hms_sep) { invalid_components++; @@ -788,13 +896,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto second; + goto minute_sep; } +minute_sep: if (sublen == 0) { bestunit = NPY_FR_m; if (format_len) { - invalid_components++; + if (invalid_components + valid_components < 5) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -804,7 +914,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto second; } - valid_components++; + if (invalid_components + valid_components < 5) + valid_components++; goto finish; } @@ -814,7 +925,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - invalid_components++; + if (invalid_components + valid_components < 5) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -824,14 +936,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto second; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; + if (invalid_components + valid_components < 5) + valid_components++; goto finish; } ++substr; --sublen; /* Cannot have a trailing ':' */ if (sublen == 0 || !isdigit(*substr)) { - invalid_components++; + if (invalid_components + valid_components < 5) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -843,13 +957,17 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } else if (!has_hms_sep && isdigit(*substr)) { } else { - valid_components++; + if (invalid_components + valid_components < 5) + valid_components++; goto parse_timezone; } - valid_components++; + if (invalid_components + valid_components < 5) + valid_components++; /* PARSE THE SECONDS */ second: + printf("\nI-V after minute-parsing: %d-%d\n", invalid_components, + valid_components); comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -861,7 +979,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto microsecond; + goto second_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { valid_components++; goto finish; @@ -875,16 +993,30 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->sec = 10 * out->sec + (*substr - '0'); ++substr; --sublen; - if (out->sec >= 60) { - invalid_components++; - while (sublen > 0 && !isdigit(*substr)) { - substr++; - sublen--; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + if (!isdigit(substr[0])) { + still_more = 0; } - if (sublen == 0) { - goto finish; + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto second_sep; } - goto microsecond; + } + + if (out->sec >= 60) { + invalid_components++; + goto second_sep; } } else if (!has_hms_sep) { invalid_components++; @@ -895,9 +1027,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - goto microsecond; + goto second_sep; } +second_sep: /* Next character may be a '.' indicating fractional seconds */ if (sublen > 0 && *substr == '.') { ++substr; @@ -905,7 +1038,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ".", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - invalid_components++; + if (invalid_components + valid_components < 6) + invalid_components++; while (sublen > 0 && !isdigit(*substr)) { substr++; sublen--; @@ -915,32 +1049,28 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } goto microsecond; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; + if (invalid_components + valid_components < 6) + valid_components++; goto finish; } } else { bestunit = NPY_FR_s; - valid_components++; + if (invalid_components + valid_components < 6) + valid_components++; goto parse_timezone; } - valid_components++; + if (invalid_components + valid_components < 6) + valid_components++; /* PARSE THE MICROSECONDS (0 to 6 digits) */ microsecond: + printf("\nI-V after second-parsing: %d-%d\n", invalid_components, + valid_components); comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - invalid_components++; - while (sublen > 0 && !isdigit(*substr)) { - substr++; - sublen--; - } - if (sublen == 0) { - goto finish; - } - goto picosecond; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - valid_components++; goto finish; } numdigits = 0; @@ -960,13 +1090,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } else { bestunit = NPY_FR_ms; } - valid_components++; goto parse_timezone; } - valid_components++; /* PARSE THE PICOSECONDS (0 to 6 digits) */ -picosecond: numdigits = 0; for (i = 0; i < 6; ++i) { out->ps *= 10; @@ -984,10 +1111,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } else { bestunit = NPY_FR_ns; } - valid_components++; goto parse_timezone; } - valid_components++; /* PARSE THE ATTOSECONDS (0 to 6 digits) */ numdigits = 0; @@ -1006,7 +1131,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } else { bestunit = NPY_FR_fs; } - valid_components++; parse_timezone: /* trim any whitespace between time/timezone */ @@ -1171,8 +1295,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((double)valid_components / (valid_components + invalid_components) < threshold) { - printf("Bad 1\n%s\n%d\n%d\n", str, valid_components, invalid_components); - fflush(stdout); goto parse_error; // threshold not met, raise exception } diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 8ba97495b3103..3e1103f52d682 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3830,6 +3830,7 @@ def test_parse_with_no_malformed_components(self): ) expected = Timestamp("2018-10-01 12:00:00.0000000011") assert res == expected + res = to_datetime( "2018-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", @@ -3837,17 +3838,51 @@ def test_parse_with_no_malformed_components(self): ) assert res == expected - def test_parse_with_malformed_year(self): + def test_parse_with_five_digit_year(self): + res = to_datetime( + "20012-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_three_digit_year(self): + res = to_datetime( + "212-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_two_digit_year(self): res = to_datetime( "12-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", threshold=0.5 ) assert isna(res) - def test_parse_with_malformed_year_iso(self): + def test_parse_with_one_digit_year(self): + res = to_datetime( + "1-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", threshold=0.5 + ) + assert isna(res) + + def test_parse_with_five_digit_year_iso(self): + res = to_datetime("20012-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_three_digit_year_iso(self): + res = to_datetime("201-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_two_digit_year_iso(self): res = to_datetime("12-10-01", format="ISO8601", threshold=0.5) assert isna(res) - """def test_parse_with_malformed_month(self): + def test_parse_with_one_digit_year_iso(self): + res = to_datetime("1-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_three_digit_month(self): res = to_datetime( "2018-202-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", @@ -3855,11 +3890,35 @@ def test_parse_with_malformed_year_iso(self): ) assert isna(res) - def test_parse_with_malformed_month_iso(self): + def test_parse_with_one_digit_month(self): + res = to_datetime( + "2018-0-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-2-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 12:00:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_month_iso(self): res = to_datetime("2018-202-01", format="ISO8601", threshold=0.5) assert isna(res) - def test_parse_with_malformed_day(self): + def test_parse_with_one_digit_month_iso(self): + res = to_datetime("2018-0-01", format="ISO8601", threshold=0.5) + assert isna(res) + + res = to_datetime("2018-2-01", format="ISO8601", threshold=0.5) + expected = Timestamp("2018-02-01") + assert res == expected + + def test_parse_with_three_digit_day(self): res = to_datetime( "2018-10-202 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", @@ -3867,32 +3926,224 @@ def test_parse_with_malformed_day(self): ) assert isna(res) - def test_parse_with_malformed_day_iso(self): + def test_parse_with_one_digit_day(self): + res = to_datetime( + "2018-10-0 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-1 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 12:00:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_day_iso(self): res = to_datetime("2018-10-202", format="ISO8601", threshold=0.5) assert isna(res) + def test_parse_with_one_digit_day_iso(self): + res = to_datetime("2018-10-0", format="ISO8601", threshold=0.5) + assert isna(res) + + res = to_datetime("2018-02-1", format="ISO8601", threshold=0.5) + expected = Timestamp("2018-02-01") + assert res == expected + + def test_parse_with_three_digit_hour(self): + res = to_datetime( + "2018-07-01 121:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_hour(self): + res = to_datetime( + "2018-10-01 24:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-01 1:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 01:00:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_minute(self): + res = to_datetime( + "2018-07-01 12:121:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_minute(self): + res = to_datetime( + "2018-10-01 23:60:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-01 10:1:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 10:01:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_second(self): + res = to_datetime( + "2018-07-01 12:12:121.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_second(self): + res = to_datetime( + "2018-10-01 23:00:60.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-01 10:00:1.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 10:00:01.0000000011") + assert res == expected + def test_parse_with_half_malformed_components(self): res = to_datetime( "2018-10-202 12:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 ) assert isna(res) + res = to_datetime( + "20118-101-01 23:60:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "20118-101-01 23:60:100.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + def test_parse_with_too_many_malformed_components(self): + res = to_datetime( + "2018-111-111 10:00:01.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + with pytest.raises(ValueError, match="^time data *"): - to_datetime( + _ = to_datetime( "2018-202-202 12:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 ) def test_parse_with_too_many_malformed_components_all(self): with pytest.raises(ValueError, match="^time data *"): - to_datetime( + _ = to_datetime( "2018-10-202 12:00:00", format="%Y-%m-%d %H:%M:%S", threshold=1.0 ) def test_parse_with_too_many_malformed_components_iso(self): - with pytest.raises(ValueError, match="^time data *"): - to_datetime("18-10-202", format="ISO8601", threshold=0.5) + res = to_datetime("2018-10-111", format="ISO8601", threshold=0.5) + assert isna(res) + + with pytest.raises( + ValueError, match="^Time data 18-10-202 is not ISO8601 format" + ): + _ = to_datetime("18-10-202", format="ISO8601", threshold=0.5) def test_parse_with_too_many_malformed_components_iso_all(self): + with pytest.raises( + ValueError, match="^Time data 2018-100-202 is not ISO8601 format" + ): + _ = to_datetime("2018-100-202", format="ISO8601", threshold=1.0) + + def test_parse_with_all_malformed_components(self): + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "201-202-202 121:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ) + + res = to_datetime( + "201-202-202 121:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.0 + ) + assert isna(res) + + def test_series(self): + series = Series(["2020", "1999", "2011"]) + result = to_datetime(series, format="%Y") + expected = Series( + [ + Timestamp("2020-01-01"), + Timestamp("1999-01-01"), + Timestamp("2011-01-01"), + ] + ) + tm.assert_series_equal(result, expected) + + series = Series(["199"]) + result = to_datetime(series, format="%Y", threshold=0.0) + assert isna(result[0]) + + series = Series(["2020-01-101", "1999-101-01", "211-10-11", "2020-01-01"]) + result = to_datetime(series, format="ISO8601", threshold=0.5) + assert isna(result[0]) + assert isna(result[1]) + assert isna(result[2]) + assert not isna(result[3]) + + series = Series(["2020-01-101", "1999-101-01", "2011-101-101"]) + with pytest.raises( + ValueError, match="^Time data 2011-101-101 is not ISO8601 format" + ): + _ = to_datetime(series, format="ISO8601", threshold=0.5) + + def test_errors_is_coerce(self): + series = Series(["2020", "20xx"]) + result = to_datetime(series, format="%Y", errors="coerce", threshold=1.0) + expected = Series([Timestamp("2020-01-01"), NaT]) + tm.assert_series_equal(result, expected) + + def test_iso_and_format_have_same_threshold_behavior(self): + assert isna(to_datetime("2018-202-01", format="ISO8601", threshold=0.5)) + assert isna(to_datetime("2018-202-01", format="%Y-%m-%d", threshold=0.5)) + + def test_microseconds_does_not_count(self): with pytest.raises(ValueError, match="^time data *"): - to_datetime("2018-10-202", format="ISO8601", threshold=1.0)""" + _ = to_datetime( + "20181-021-011 111:010:010.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.01, + ) + + def test_one_component(self): + res = to_datetime("20181", format="%Y", threshold=0.0) + assert isna(res) + + def test_parse_mixed_format_threshold(self): + series = Series(["2020-01-01", "01/02/2021", "2021-13-01"]) + result = to_datetime(series, format="mixed", threshold=0.5, errors="coerce") + expected = Series([Timestamp("2020-01-01"), Timestamp("2021-01-02"), NaT]) + tm.assert_series_equal(result, expected) From 05e83c578e0dcfcd4f3d5e1971736e0df2bede05 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Thu, 20 Nov 2025 21:58:08 -0500 Subject: [PATCH 4/9] Fixed problem with dates with no ymd separator --- .../numpy/datetime/np_datetime_strings.c | 105 ++++++++++++------ 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index fa665f5ea19d0..697b9706309d7 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -174,7 +174,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ - printf("Start: %s\n", str); comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); @@ -182,7 +181,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (comparison == COMPARISON_ERROR) { invalid_components++; - while (sublen > 0 && !isdigit(*substr + 1)) { + while (sublen > 1 && !isdigit(substr[1])) { substr++; sublen--; } @@ -219,6 +218,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, to_month = 1; goto find_sep; } + } else if (sublen == 3 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2])) { + invalid_components++; + substr += 3; + sublen -= 3; + goto finish; } else if (sublen >= 3 && isdigit(substr[0]) && isdigit(substr[1]) && !isdigit(substr[2])) { int valid_sep = 0; @@ -235,6 +240,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto find_sep; } goto find_sep; + } else if (sublen == 2 && isdigit(substr[0]) && isdigit(substr[1])) { + invalid_components++; + substr += 2; + sublen -= 2; + goto finish; } else if (sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) { int valid_sep = 0; for (i = 0; i < valid_ymd_sep_len; ++i) { @@ -249,6 +259,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, to_month = 1; goto find_sep; } + } else if (sublen == 1 && isdigit(substr[0])) { + invalid_components++; + substr++; + sublen--; + goto finish; } else if (sublen >= 1 && !isdigit(substr[0])) { int valid_sep = 0; for (i = 0; i < valid_ymd_sep_len; ++i) { @@ -264,7 +279,37 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* Invalidates the component if there is more than 4 digits */ - int still_more = 1; + int has_sep = 0; + int j = 0; + for (j = 0; j < (sublen > 4 ? 4 : sublen); ++j) { + char c = substr[j]; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (c == valid_ymd_sep[i]) { + has_sep = 1; + break; + } + } + if (has_sep || !isdigit(c)) { + break; + } + } + if (has_sep && j != 0) { + invalid_components++; + substr += j; + sublen -= j; + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto find_sep; + } + if (!has_sep && sublen < 4) { + invalid_components++; + substr += sublen; + sublen = 0; + goto finish; + } + /*int still_more = 1; for (i = 0; i < valid_ymd_sep_len; ++i) { if (*substr == valid_ymd_sep[i]) { still_more = 0; @@ -282,7 +327,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } to_month = 1; goto find_sep; - } + }*/ /* Negate the year if necessary */ if (str[0] == '-') { @@ -298,7 +343,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if (format_len) { invalid_components++; - while (sublen > 0 && !isdigit(*substr + 1)) { + while (sublen > 1 && !isdigit(substr[1])) { substr++; sublen--; } @@ -380,8 +425,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE MONTH */ month: - printf("\nI-V after year-parsing: %d-%d\n", invalid_components, - valid_components); comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); @@ -414,26 +457,35 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* Invalidates the component if there is more than 2 digits */ if (sublen > 0) { - int still_more = 1; - for (i = 0; i < valid_ymd_sep_len; ++i) { - if (*substr == valid_ymd_sep[i]) { - still_more = 0; + int has_sep = 0; + int j = 0; + for (j = 0; j < (sublen > 2 ? 2 : sublen); ++j) { + char c = substr[j]; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (c == valid_ymd_sep[i]) { + has_sep = 1; + break; + } + } + if (has_sep || !isdigit(c)) { break; } } - if (still_more) { + if (has_sep && j != 0) { invalid_components++; - while (sublen > 0 && isdigit(substr[0])) { - substr++; - sublen--; - } + substr += j; + sublen -= j; if (sublen == 0) { goto finish; } to_month = 1; - comparison = compare_format(&format, &format_len, &ymd_sep, 1, - format_requirement); - goto month_sep; + goto find_sep; + } + if (!has_sep && sublen < 2) { + invalid_components++; + substr += sublen; + sublen = 0; + goto finish; } } } else if (!has_ymd_sep) { @@ -537,8 +589,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE DAY */ day: - printf("\nI-V after month-parsing: %d-%d\n", invalid_components, - valid_components); comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -594,7 +644,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - to_month = 1; comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); goto day_sep; @@ -677,8 +726,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE HOURS */ hour: - printf("\nI-V after day-parsing: %d-%d\n", invalid_components, - valid_components); comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -733,7 +780,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - to_month = 1; goto hour_sep; } } @@ -834,8 +880,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE MINUTES */ minute: - printf("\nI-V after hour-parsing: %d-%d\n", invalid_components, - valid_components); comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -878,7 +922,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - to_month = 1; goto minute_sep; } } @@ -966,8 +1009,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE SECONDS */ second: - printf("\nI-V after minute-parsing: %d-%d\n", invalid_components, - valid_components); comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -1009,7 +1050,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { goto finish; } - to_month = 1; goto second_sep; } } @@ -1064,8 +1104,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE MICROSECONDS (0 to 6 digits) */ microsecond: - printf("\nI-V after second-parsing: %d-%d\n", invalid_components, - valid_components); comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -1286,7 +1324,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } finish: - printf("\nI-V at end: %d-%d\n", invalid_components, valid_components); if (invalid_components > 0 && (double)valid_components / (valid_components + invalid_components) >= threshold) { From bb79b135bcf828535b7aef0904e1f3ede467bbc7 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Thu, 20 Nov 2025 22:34:15 -0500 Subject: [PATCH 5/9] Fixed minor month parsing bug with wrong goto --- .../numpy/datetime/np_datetime_strings.c | 46 ++++++------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 697b9706309d7..87596852e8b80 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -114,6 +114,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, int format_len, FormatRequirement format_requirement, double threshold) { + printf("Start %s\n", str); if (len < 0 || format_len < 0) goto parse_error; int year_leap = 0; @@ -189,7 +190,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } to_month = 1; - goto find_sep; + goto year_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { valid_components++; goto finish; @@ -216,7 +217,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 3; sublen -= 3; to_month = 1; - goto find_sep; + goto year_sep; } } else if (sublen == 3 && isdigit(substr[0]) && isdigit(substr[1]) && isdigit(substr[2])) { @@ -237,9 +238,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 2; sublen -= 2; to_month = 1; - goto find_sep; + goto year_sep; } - goto find_sep; + goto year_sep; } else if (sublen == 2 && isdigit(substr[0]) && isdigit(substr[1])) { invalid_components++; substr += 2; @@ -257,7 +258,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 1; sublen -= 1; to_month = 1; - goto find_sep; + goto year_sep; } } else if (sublen == 1 && isdigit(substr[0])) { invalid_components++; @@ -274,7 +275,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (valid_sep) { invalid_components++; to_month = 1; - goto find_sep; + goto year_sep; } } @@ -301,7 +302,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } to_month = 1; - goto find_sep; + goto year_sep; } if (!has_sep && sublen < 4) { invalid_components++; @@ -309,26 +310,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, sublen = 0; goto finish; } - /*int still_more = 1; - for (i = 0; i < valid_ymd_sep_len; ++i) { - if (*substr == valid_ymd_sep[i]) { - still_more = 0; - break; - } - } - if (still_more) { - invalid_components++; - while (sublen > 0 && isdigit(substr[0])) { - substr++; - sublen--; - } - if (sublen == 0) { - goto finish; - } - to_month = 1; - goto find_sep; - }*/ +year_sep: + printf("Now %s\n", substr); /* Negate the year if necessary */ if (str[0] == '-') { out->year = -out->year; @@ -342,7 +326,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, *out_local = 0; } if (format_len) { - invalid_components++; + if (invalid_components + valid_components < 1) + invalid_components++; while (sublen > 1 && !isdigit(substr[1])) { substr++; sublen--; @@ -351,14 +336,13 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } to_month = 1; - goto find_sep; } bestunit = NPY_FR_Y; - valid_components++; + if (invalid_components + valid_components < 1) + valid_components++; goto finish; } -find_sep: if (!isdigit(*substr)) { for (i = 0; i < valid_ymd_sep_len; ++i) { if (*substr == valid_ymd_sep[i]) { @@ -479,7 +463,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } to_month = 1; - goto find_sep; + goto month_sep; } if (!has_sep && sublen < 2) { invalid_components++; From 9da60e4fab4fffd909d06d359015b7c16a47a93f Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Thu, 20 Nov 2025 23:01:46 -0500 Subject: [PATCH 6/9] Fixed incorrect parsing bug when T separates date and time --- .../src/vendored/numpy/datetime/np_datetime_strings.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 87596852e8b80..71854877c386d 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -114,7 +114,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, int format_len, FormatRequirement format_requirement, double threshold) { - printf("Start %s\n", str); if (len < 0 || format_len < 0) goto parse_error; int year_leap = 0; @@ -312,7 +311,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } year_sep: - printf("Now %s\n", substr); /* Negate the year if necessary */ if (str[0] == '-') { out->year = -out->year; @@ -612,14 +610,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* Invalidates the component if there is more than 2 digits */ if (sublen > 0) { - int still_more = 1; - for (i = 0; i < valid_ymd_sep_len; ++i) { - if (*substr == valid_ymd_sep[i]) { - still_more = 0; - break; - } - } - if (still_more) { + if (isdigit(substr[0])) { invalid_components++; while (sublen > 0 && isdigit(substr[0])) { substr++; From 0a767888786f16fdf5b16521fd09a34aab9b2ed0 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Thu, 20 Nov 2025 23:53:28 -0500 Subject: [PATCH 7/9] Documentation update --- .../numpy/datetime/np_datetime_strings.c | 2 +- pandas/_libs/tslibs/strptime.pyx | 18 +----- pandas/core/tools/datetimes.py | 61 ++++++++----------- pandas/tests/tools/test_to_datetime.py | 8 +++ 4 files changed, 36 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 71854877c386d..fe230dcb83db4 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -441,7 +441,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0) { int has_sep = 0; int j = 0; - for (j = 0; j < (sublen > 2 ? 2 : sublen); ++j) { + for (j = 0; j < (sublen > 2 && !has_ymd_sep ? 2 : sublen); ++j) { char c = substr[j]; for (i = 0; i < valid_ymd_sep_len; ++i) { if (c == valid_ymd_sep[i]) { diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 7209f02bb5bae..cec1d2ea74106 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -368,23 +368,7 @@ def array_strptime( errors : string specifying error handling, {'raise', 'coerce'} creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC Set to NPY_FR_GENERIC to infer a resolution. - - /// INSERT DOCUMENTATION UPDATE HERE /// - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## + threshold : minimum fraction of valid datetime components required """ cdef: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6ea91c317a54f..8e3612cb6acbe 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -351,23 +351,8 @@ def _convert_listlike_datetimes( yearfirst parsing behavior from to_datetime exact : bool, default True exact format matching behavior from to_datetime - - /// INSERT DOCUMENTATION UPDATE HERE /// - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## + threshold : float + Minimum fraction of valid datetime components required Returns ------- @@ -660,6 +645,7 @@ def to_datetime( unit: str | None = ..., origin=..., cache: bool = ..., + threshold: float = ..., ) -> Timestamp: ... @@ -675,6 +661,7 @@ def to_datetime( unit: str | None = ..., origin=..., cache: bool = ..., + threshold: float = ..., ) -> Series: ... @@ -690,6 +677,7 @@ def to_datetime( unit: str | None = ..., origin=..., cache: bool = ..., + threshold: float = ..., ) -> DatetimeIndex: ... @@ -814,24 +802,19 @@ def to_datetime( is only used when there are at least 50 values. The presence of out-of-bounds values will render the cache unusable and may slow down parsing. - - /// INSERT DOCUMENTATION UPDATE HERE /// - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - ######################## - + threshold : float + Minimum fraction of valid datetime components required to consider parsing + successful. Components include year, month, day, hour, minute, and second + if present in the input. An invalid component has too many or too few digits + or a number outside the possible range (e.g., month outside [1, 12]). Behavior + depends on the threshold: + + - 1.0 (default): all components must be valid, else raises error (unless + ``errors='coerce'``). + - 0.0: any invalid component produces NaT, else returns a valid datetime. + - Values between 0 and 1: if all components are valid, returns a valid + datetime; if the fraction of valid components >= threshold, returns NaT; + otherwise raises error. Returns ------- datetime @@ -1032,6 +1015,14 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[us, UTC]', freq=None) + + - Input string with one invalid component returns NaT if threshold allows + partial validity + + >>> pd.to_datetime( + ... "2018-100-26 12:00:00", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ... ) + NaT """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3e1103f52d682..6965ebef87b33 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -4147,3 +4147,11 @@ def test_parse_mixed_format_threshold(self): result = to_datetime(series, format="mixed", threshold=0.5, errors="coerce") expected = Series([Timestamp("2020-01-01"), Timestamp("2021-01-02"), NaT]) tm.assert_series_equal(result, expected) + + def test_example(self): + result = to_datetime( + "2018-100-26 12:00:00", + format="%Y-%m-%d %H:%M:%S", + threshold=0.5, + ) + assert isna(result) From 7fafd4a826c61bc05cb11b6c309c287bbd6ac4dc Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Fri, 21 Nov 2025 00:00:43 -0500 Subject: [PATCH 8/9] Edge case: threshold outside of [0.0, 1.0] --- pandas/core/tools/datetimes.py | 3 +++ pandas/tests/tools/test_to_datetime.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8e3612cb6acbe..6b6a41a95121f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1029,6 +1029,9 @@ def to_datetime( if arg is None: return NaT + if not (0.0 <= threshold <= 1.0): + raise ValueError(f"`threshold` must be between 0.0 and 1.0, got {threshold}") + if origin != "unix": arg = _adjust_to_origin(arg, origin, unit) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6965ebef87b33..6d1ff46421b43 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -4155,3 +4155,22 @@ def test_example(self): threshold=0.5, ) assert isna(result) + + def test_bad_threshold(self): + with pytest.raises( + ValueError, match="`threshold` must be between 0.0 and 1.0, got -0.5" + ): + _ = to_datetime( + "2020-01-01 12:20:20", + format="%Y-%m-%d %H:%M:%S", + threshold=-0.5, + ) + + with pytest.raises( + ValueError, match="`threshold` must be between 0.0 and 1.0, got 2.0" + ): + _ = to_datetime( + "2020-01-01 12:20:20", + format="%Y-%m-%d %H:%M:%S", + threshold=2.0, + ) From acdbc411cc3accb581367e19da280d5e40deb0c3 Mon Sep 17 00:00:00 2001 From: Jake Hirsch Date: Fri, 21 Nov 2025 00:02:33 -0500 Subject: [PATCH 9/9] Minor documentation update --- pandas/core/tools/datetimes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6b6a41a95121f..ff9de351cbfb6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -804,10 +804,10 @@ def to_datetime( parsing. threshold : float Minimum fraction of valid datetime components required to consider parsing - successful. Components include year, month, day, hour, minute, and second - if present in the input. An invalid component has too many or too few digits - or a number outside the possible range (e.g., month outside [1, 12]). Behavior - depends on the threshold: + successful. Must be between 0.0 and 1.0. Components include year, month, + day, hour, minute, and second if present in the input. An invalid component + has too many or too few digits or a number outside the possible range + (e.g., month outside [1, 12]). Behavior depends on the threshold: - 1.0 (default): all components must be valid, else raises error (unless ``errors='coerce'``).