diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index 98e5521af2506..fe4d19c39b81d 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -48,7 +48,7 @@ typedef struct { PyArray_DatetimeMetaData (*get_datetime_metadata_from_dtype)(PyArray_Descr *); int (*parse_iso_8601_datetime)(const char *, int, int, npy_datetimestruct *, NPY_DATETIMEUNIT *, int *, int *, const char *, - int, FormatRequirement); + int, FormatRequirement, double); int (*get_datetime_iso_8601_strlen)(int, NPY_DATETIMEUNIT); int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, size_t, int, NPY_DATETIMEUNIT); @@ -94,10 +94,11 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL; PandasDateTimeAPI->get_datetime_metadata_from_dtype((dtype)) #define parse_iso_8601_datetime(str, len, want_exc, out, out_bestunit, \ out_local, out_tzoffset, format, format_len, \ - format_requirement) \ + format_requirement, threshold) \ PandasDateTimeAPI->parse_iso_8601_datetime( \ (str), (len), (want_exc), (out), (out_bestunit), (out_local), \ - (out_tzoffset), (format), (format_len), (format_requirement)) + (out_tzoffset), (format), (format_len), (format_requirement), \ + (threshold)) #define get_datetime_iso_8601_strlen(local, base) \ PandasDateTimeAPI->get_datetime_iso_8601_strlen((local), (base)) #define make_iso_8601_datetime(dts, outstr, outlen, utc, base) \ diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h index 75e69f30ada1e..53f1c2d05e213 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h @@ -67,7 +67,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char *format, int format_len, - FormatRequirement format_requirement); + FormatRequirement format_requirement, + double threshold); /* * Provides a string length to use for converting datetime diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index a46f5bc467c5d..fe230dcb83db4 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -112,7 +112,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char *format, int format_len, - FormatRequirement format_requirement) { + FormatRequirement format_requirement, + double threshold) { if (len < 0 || format_len < 0) goto parse_error; int year_leap = 0; @@ -146,6 +147,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr = str; sublen = len; + int invalid_components = 0; + int valid_components = 0; + /* Skip leading whitespace */ while (sublen > 0 && isspace(*substr)) { ++substr; @@ -172,9 +176,22 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE YEAR (4 digits) */ comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); + + int to_month = 0; + if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 1 && !isdigit(substr[1])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto year_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } @@ -186,8 +203,114 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 4; sublen -= 4; + } else if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2]) && !isdigit(substr[3])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[3] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + substr += 3; + sublen -= 3; + to_month = 1; + goto year_sep; + } + } else if (sublen == 3 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2])) { + invalid_components++; + substr += 3; + sublen -= 3; + goto finish; + } else if (sublen >= 3 && isdigit(substr[0]) && isdigit(substr[1]) && + !isdigit(substr[2])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[2] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + substr += 2; + sublen -= 2; + to_month = 1; + goto year_sep; + } + goto year_sep; + } else if (sublen == 2 && isdigit(substr[0]) && isdigit(substr[1])) { + invalid_components++; + substr += 2; + sublen -= 2; + goto finish; + } else if (sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[1] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + substr += 1; + sublen -= 1; + to_month = 1; + goto year_sep; + } + } else if (sublen == 1 && isdigit(substr[0])) { + invalid_components++; + substr++; + sublen--; + goto finish; + } else if (sublen >= 1 && !isdigit(substr[0])) { + int valid_sep = 0; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (substr[0] == valid_ymd_sep[i]) { + valid_sep = 1; + } + } + if (valid_sep) { + invalid_components++; + to_month = 1; + goto year_sep; + } } + /* Invalidates the component if there is more than 4 digits */ + int has_sep = 0; + int j = 0; + for (j = 0; j < (sublen > 4 ? 4 : sublen); ++j) { + char c = substr[j]; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (c == valid_ymd_sep[i]) { + has_sep = 1; + break; + } + } + if (has_sep || !isdigit(c)) { + break; + } + } + if (has_sep && j != 0) { + invalid_components++; + substr += j; + sublen -= j; + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto year_sep; + } + if (!has_sep && sublen < 4) { + invalid_components++; + substr += sublen; + sublen = 0; + goto finish; + } + +year_sep: /* Negate the year if necessary */ if (str[0] == '-') { out->year = -out->year; @@ -201,9 +324,20 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, *out_local = 0; } if (format_len) { - goto parse_error; + if (invalid_components + valid_components < 1) + invalid_components++; + while (sublen > 1 && !isdigit(substr[1])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; } bestunit = NPY_FR_Y; + if (invalid_components + valid_components < 1) + valid_components++; goto finish; } @@ -214,7 +348,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } if (i == valid_ymd_sep_len) { - goto parse_error; + if (invalid_components + valid_components < 1) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + to_month = 1; } has_ymd_sep = 1; ymd_sep = valid_ymd_sep[i]; @@ -223,88 +366,238 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + if (to_month) { + goto month; + } + if (comparison == COMPARISON_ERROR) { - goto parse_error; + if (invalid_components + valid_components < 1) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto month; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + if (invalid_components + valid_components < 1) + valid_components++; goto finish; } /* Cannot have trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { + if (!isdigit(*substr)) { + if (invalid_components + valid_components < 1) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto month; + } else if (sublen == 0) { goto parse_error; } } + if (invalid_components + valid_components < 1) + valid_components++; /* PARSE THE MONTH */ +month: comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ out->month = (*substr - '0'); ++substr; --sublen; + /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->month = 10 * out->month + (*substr - '0'); ++substr; --sublen; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int has_sep = 0; + int j = 0; + for (j = 0; j < (sublen > 2 && !has_ymd_sep ? 2 : sublen); ++j) { + char c = substr[j]; + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (c == valid_ymd_sep[i]) { + has_sep = 1; + break; + } + } + if (has_sep || !isdigit(c)) { + break; + } + } + if (has_sep && j != 0) { + invalid_components++; + substr += j; + sublen -= j; + if (sublen == 0) { + goto finish; + } + to_month = 1; + goto month_sep; + } + if (!has_sep && sublen < 2) { + invalid_components++; + substr += sublen; + sublen = 0; + goto finish; + } + } } else if (!has_ymd_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto month_sep; } if (out->month < 1 || out->month > 12) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); - } - goto error; + invalid_components++; + goto month_sep; } +month_sep: /* Next character must be the separator, start of day, or end of string */ if (sublen == 0) { bestunit = NPY_FR_M; /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ if (!has_ymd_sep) { - goto parse_error; + if (invalid_components + valid_components < 2) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } if (format_len) { - goto parse_error; + if (invalid_components + valid_components < 2) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } if (out_local != NULL) { *out_local = 0; } + if (invalid_components + valid_components < 2) + valid_components++; goto finish; } if (has_ymd_sep) { /* Must have separator, but cannot be trailing */ if (*substr != ymd_sep || sublen == 1) { - goto parse_error; + if (invalid_components + valid_components < 2) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); + goto day; } ++substr; --sublen; comparison = compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + if (invalid_components + valid_components < 2) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto day; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + if (invalid_components + valid_components < 2) + valid_components++; goto finish; } } + if (invalid_components + valid_components < 2) + valid_components++; /* PARSE THE DAY */ +day: comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto day_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ if (!isdigit(*substr)) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto day_sep; } out->day = (*substr - '0'); ++substr; @@ -314,54 +607,127 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->day = 10 * out->day + (*substr - '0'); ++substr; --sublen; + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + if (isdigit(substr[0])) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + comparison = compare_format(&format, &format_len, &ymd_sep, 1, + format_requirement); + goto day_sep; + } + } } else if (!has_ymd_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto day_sep; } if (out->day < 1 || out->day > days_per_month_table[year_leap][out->month - 1]) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); - } - goto error; + invalid_components++; + goto day_sep; } +day_sep: /* Next character must be a 'T', ' ', or end of string */ if (sublen == 0) { if (out_local != NULL) { *out_local = 0; } if (format_len) { - goto parse_error; + if (invalid_components + valid_components < 3) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } bestunit = NPY_FR_D; + valid_components++; goto finish; } if ((*substr != 'T' && *substr != ' ') || sublen == 1) { - goto parse_error; + if (invalid_components + valid_components < 3) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } comparison = compare_format(&format, &format_len, substr, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + if (invalid_components + valid_components < 3) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + if (invalid_components + valid_components < 3) + valid_components++; goto finish; } ++substr; --sublen; + if (invalid_components + valid_components < 3) + valid_components++; /* PARSE THE HOURS */ +hour: comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ if (!isdigit(*substr)) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour_sep; } out->hour = (*substr - '0'); bestunit = NPY_FR_h; @@ -373,24 +739,62 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->hour = 10 * out->hour + (*substr - '0'); ++substr; --sublen; - if (out->hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", str); + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + if (!isdigit(substr[0])) { + still_more = 0; + } + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto hour_sep; } - goto error; + } + + if (out->hour >= 24) { + invalid_components++; + goto hour_sep; } } +hour_sep: /* Next character must be a ':' or the end of the string */ if (sublen == 0) { if (!hour_was_2_digits) { - goto parse_error; + if (invalid_components + valid_components < 4) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } if (format_len) { - goto parse_error; + if (invalid_components + valid_components < 4) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } bestunit = NPY_FR_h; + if (invalid_components + valid_components < 4) + valid_components++; goto finish; } @@ -400,28 +804,71 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; /* Cannot have a trailing separator */ if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; + if (invalid_components + valid_components < 4) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } comparison = compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + if (invalid_components + valid_components < 4) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + if (invalid_components + valid_components < 4) + valid_components++; goto finish; } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { - goto parse_error; + if (invalid_components + valid_components < 4) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute; } + if (invalid_components + valid_components < 4) + valid_components++; goto parse_timezone; } + if (invalid_components + valid_components < 4) + valid_components++; /* PARSE THE MINUTES */ +minute: comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ @@ -434,22 +881,59 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->min = 10 * out->min + (*substr - '0'); ++substr; --sublen; - if (out->min >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", str); + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + if (!isdigit(substr[0])) { + still_more = 0; } - goto error; + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute_sep; + } + } + + if (out->min >= 60) { + invalid_components++; + goto minute_sep; } } else if (!has_hms_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto minute_sep; } +minute_sep: if (sublen == 0) { bestunit = NPY_FR_m; if (format_len) { - goto parse_error; + if (invalid_components + valid_components < 5) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } + if (invalid_components + valid_components < 5) + valid_components++; goto finish; } @@ -459,27 +943,61 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + if (invalid_components + valid_components < 5) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + if (invalid_components + valid_components < 5) + valid_components++; goto finish; } ++substr; --sublen; /* Cannot have a trailing ':' */ if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; + if (invalid_components + valid_components < 5) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second; } } else if (!has_hms_sep && isdigit(*substr)) { } else { + if (invalid_components + valid_components < 5) + valid_components++; goto parse_timezone; } + if (invalid_components + valid_components < 5) + valid_components++; /* PARSE THE SECONDS */ +second: comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second_sep; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + valid_components++; goto finish; } /* First digit required */ @@ -491,17 +1009,43 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->sec = 10 * out->sec + (*substr - '0'); ++substr; --sublen; - if (out->sec >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", str); + + /* Invalidates the component if there is more than 2 digits */ + if (sublen > 0) { + int still_more = 1; + if (!isdigit(substr[0])) { + still_more = 0; + } + if (still_more) { + invalid_components++; + while (sublen > 0 && isdigit(substr[0])) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second_sep; } - goto error; + } + + if (out->sec >= 60) { + invalid_components++; + goto second_sep; } } else if (!has_hms_sep) { - goto parse_error; + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto second_sep; } +second_sep: /* Next character may be a '.' indicating fractional seconds */ if (sublen > 0 && *substr == '.') { ++substr; @@ -509,16 +1053,32 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, comparison = compare_format(&format, &format_len, ".", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + if (invalid_components + valid_components < 6) + invalid_components++; + while (sublen > 0 && !isdigit(*substr)) { + substr++; + sublen--; + } + if (sublen == 0) { + goto finish; + } + goto microsecond; } else if (comparison == COMPLETED_PARTIAL_MATCH) { + if (invalid_components + valid_components < 6) + valid_components++; goto finish; } } else { bestunit = NPY_FR_s; + if (invalid_components + valid_components < 6) + valid_components++; goto parse_timezone; } + if (invalid_components + valid_components < 6) + valid_components++; /* PARSE THE MICROSECONDS (0 to 6 digits) */ +microsecond: comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); if (comparison == COMPARISON_ERROR) { @@ -739,6 +1299,17 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } finish: + if (invalid_components > 0 && + (double)valid_components / (valid_components + invalid_components) >= + threshold) { + return -2; // sentinel for NaT + } + + if ((double)valid_components / (valid_components + invalid_components) < + threshold) { + goto parse_error; // threshold not met, raise exception + } + if (out_bestunit != NULL) { *out_bestunit = bestunit; } diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 3e5654b70cd92..7732223a7e886 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -90,7 +90,8 @@ cdef int string_to_dts( int* out_tzoffset, bint want_exc, str format = *, - bint exact = * + bint exact = *, + double threshold = *, ) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 0fc7a6945d2e0..3d64c9c8ed89a 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -65,7 +65,7 @@ cdef extern from "pandas/datetime/pd_datetime.h": NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char *format, int format_len, - FormatRequirement exact) + FormatRequirement exact, double threshold) # ---------------------------------------------------------------------- # numpy object inspection @@ -348,6 +348,7 @@ cdef int string_to_dts( bint want_exc, str format=None, bint exact=True, + double threshold = 1.0, ) except? -1: cdef: Py_ssize_t length @@ -367,7 +368,7 @@ cdef int string_to_dts( return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, format_buf, format_length, - format_requirement) + format_requirement, threshold) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d22150f9aefab..cec1d2ea74106 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -355,6 +355,7 @@ def array_strptime( errors="raise", bint utc=False, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, + double threshold = 1.0, ): """ Calculates the datetime structs represented by the passed array of strings @@ -367,6 +368,7 @@ def array_strptime( errors : string specifying error handling, {'raise', 'coerce'} creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC Set to NPY_FR_GENERIC to infer a resolution. + threshold : minimum fraction of valid datetime components required """ cdef: @@ -453,15 +455,23 @@ def array_strptime( out_tzoffset = 0 if fmt == "ISO8601": - string_to_dts_succeeded = not string_to_dts( + res = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, None, False + &out_tzoffset, False, None, False, threshold ) + if res == -2: # sentinel for NaT + iresult[i] = NPY_NAT + continue + string_to_dts_succeeded = not res elif iso_format: - string_to_dts_succeeded = not string_to_dts( + res = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, fmt, exact + &out_tzoffset, False, fmt, exact, threshold ) + string_to_dts_succeeded = not res + if res == -2: # sentinel for NaT + iresult[i] = NPY_NAT + continue if string_to_dts_succeeded: # No error reported by string_to_dts, pick back up # where we left off diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a2c18ccb59899..ff9de351cbfb6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -327,6 +327,7 @@ def _convert_listlike_datetimes( dayfirst: bool | None = None, yearfirst: bool | None = None, exact: bool = True, + threshold: float = 1.0, ): """ Helper function for to_datetime. Performs the conversions of 1D listlike @@ -350,6 +351,8 @@ def _convert_listlike_datetimes( yearfirst parsing behavior from to_datetime exact : bool, default True exact format matching behavior from to_datetime + threshold : float + Minimum fraction of valid datetime components required Returns ------- @@ -432,7 +435,9 @@ def _convert_listlike_datetimes( # `format` could be inferred, or user didn't ask for mixed-format parsing. if format is not None and format != "mixed": - return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) + return _array_strptime_with_fallback( + arg, name, utc, format, exact, errors, threshold + ) result, tz_parsed = objects_to_datetime64( arg, @@ -462,11 +467,14 @@ def _array_strptime_with_fallback( fmt: str, exact: bool, errors: str, + threshold: float = 1.0, ) -> Index: """ Call array_strptime, with fallback behavior depending on 'errors'. """ - result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + result, tz_out = array_strptime( + arg, fmt, exact=exact, errors=errors, utc=utc, threshold=threshold + ) if tz_out is not None: unit = np.datetime_data(result.dtype)[0] dtype = DatetimeTZDtype(tz=tz_out, unit=unit) @@ -637,6 +645,7 @@ def to_datetime( unit: str | None = ..., origin=..., cache: bool = ..., + threshold: float = ..., ) -> Timestamp: ... @@ -652,6 +661,7 @@ def to_datetime( unit: str | None = ..., origin=..., cache: bool = ..., + threshold: float = ..., ) -> Series: ... @@ -667,6 +677,7 @@ def to_datetime( unit: str | None = ..., origin=..., cache: bool = ..., + threshold: float = ..., ) -> DatetimeIndex: ... @@ -682,6 +693,7 @@ def to_datetime( unit: str | None = None, origin: str = "unix", cache: bool = True, + threshold: float = 1.0, ) -> DatetimeIndex | Series | DatetimeScalar | NaTType: """ Convert argument to datetime. @@ -790,7 +802,19 @@ def to_datetime( is only used when there are at least 50 values. The presence of out-of-bounds values will render the cache unusable and may slow down parsing. - + threshold : float + Minimum fraction of valid datetime components required to consider parsing + successful. Must be between 0.0 and 1.0. Components include year, month, + day, hour, minute, and second if present in the input. An invalid component + has too many or too few digits or a number outside the possible range + (e.g., month outside [1, 12]). Behavior depends on the threshold: + + - 1.0 (default): all components must be valid, else raises error (unless + ``errors='coerce'``). + - 0.0: any invalid component produces NaT, else returns a valid datetime. + - Values between 0 and 1: if all components are valid, returns a valid + datetime; if the fraction of valid components >= threshold, returns NaT; + otherwise raises error. Returns ------- datetime @@ -991,12 +1015,23 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[us, UTC]', freq=None) + + - Input string with one invalid component returns NaT if threshold allows + partial validity + + >>> pd.to_datetime( + ... "2018-100-26 12:00:00", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ... ) + NaT """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") if arg is None: return NaT + if not (0.0 <= threshold <= 1.0): + raise ValueError(f"`threshold` must be between 0.0 and 1.0, got {threshold}") + if origin != "unix": arg = _adjust_to_origin(arg, origin, unit) @@ -1008,6 +1043,7 @@ def to_datetime( yearfirst=yearfirst, errors=errors, exact=exact, # type: ignore[arg-type] + threshold=threshold, ) result: Timestamp | NaTType | Series | Index diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 67b1e8668e5f6..6d1ff46421b43 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3819,3 +3819,358 @@ def test_to_datetime_lxml_elementunicoderesult_with_format(cache): out = to_datetime(Series([val]), format="%Y-%m-%d %H:%M:%S", cache=cache) assert out.iloc[0] == Timestamp(s) + + +class TestForIncreasedRobustness: + def test_parse_with_no_malformed_components(self): + res = to_datetime( + "2018-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-10-01 12:00:00.0000000011") + assert res == expected + + res = to_datetime( + "2018-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=1.0, + ) + assert res == expected + + def test_parse_with_five_digit_year(self): + res = to_datetime( + "20012-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_three_digit_year(self): + res = to_datetime( + "212-10-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_two_digit_year(self): + res = to_datetime( + "12-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", threshold=0.5 + ) + assert isna(res) + + def test_parse_with_one_digit_year(self): + res = to_datetime( + "1-10-01 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f", threshold=0.5 + ) + assert isna(res) + + def test_parse_with_five_digit_year_iso(self): + res = to_datetime("20012-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_three_digit_year_iso(self): + res = to_datetime("201-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_two_digit_year_iso(self): + res = to_datetime("12-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_one_digit_year_iso(self): + res = to_datetime("1-10-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_three_digit_month(self): + res = to_datetime( + "2018-202-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_month(self): + res = to_datetime( + "2018-0-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-2-01 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 12:00:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_month_iso(self): + res = to_datetime("2018-202-01", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_one_digit_month_iso(self): + res = to_datetime("2018-0-01", format="ISO8601", threshold=0.5) + assert isna(res) + + res = to_datetime("2018-2-01", format="ISO8601", threshold=0.5) + expected = Timestamp("2018-02-01") + assert res == expected + + def test_parse_with_three_digit_day(self): + res = to_datetime( + "2018-10-202 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_day(self): + res = to_datetime( + "2018-10-0 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-1 12:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 12:00:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_day_iso(self): + res = to_datetime("2018-10-202", format="ISO8601", threshold=0.5) + assert isna(res) + + def test_parse_with_one_digit_day_iso(self): + res = to_datetime("2018-10-0", format="ISO8601", threshold=0.5) + assert isna(res) + + res = to_datetime("2018-02-1", format="ISO8601", threshold=0.5) + expected = Timestamp("2018-02-01") + assert res == expected + + def test_parse_with_three_digit_hour(self): + res = to_datetime( + "2018-07-01 121:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_hour(self): + res = to_datetime( + "2018-10-01 24:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-01 1:00:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 01:00:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_minute(self): + res = to_datetime( + "2018-07-01 12:121:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_minute(self): + res = to_datetime( + "2018-10-01 23:60:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-01 10:1:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 10:01:00.0000000011") + assert res == expected + + def test_parse_with_three_digit_second(self): + res = to_datetime( + "2018-07-01 12:12:121.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + def test_parse_with_one_digit_second(self): + res = to_datetime( + "2018-10-01 23:00:60.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + res = to_datetime( + "2018-02-01 10:00:1.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + expected = Timestamp("2018-02-01 10:00:01.0000000011") + assert res == expected + + def test_parse_with_half_malformed_components(self): + res = to_datetime( + "2018-10-202 12:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ) + assert isna(res) + + res = to_datetime( + "20118-101-01 23:60:00.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "20118-101-01 23:60:100.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + + def test_parse_with_too_many_malformed_components(self): + res = to_datetime( + "2018-111-111 10:00:01.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.5, + ) + assert isna(res) + + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "2018-202-202 12:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ) + + def test_parse_with_too_many_malformed_components_all(self): + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "2018-10-202 12:00:00", format="%Y-%m-%d %H:%M:%S", threshold=1.0 + ) + + def test_parse_with_too_many_malformed_components_iso(self): + res = to_datetime("2018-10-111", format="ISO8601", threshold=0.5) + assert isna(res) + + with pytest.raises( + ValueError, match="^Time data 18-10-202 is not ISO8601 format" + ): + _ = to_datetime("18-10-202", format="ISO8601", threshold=0.5) + + def test_parse_with_too_many_malformed_components_iso_all(self): + with pytest.raises( + ValueError, match="^Time data 2018-100-202 is not ISO8601 format" + ): + _ = to_datetime("2018-100-202", format="ISO8601", threshold=1.0) + + def test_parse_with_all_malformed_components(self): + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "201-202-202 121:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.5 + ) + + res = to_datetime( + "201-202-202 121:202:202", format="%Y-%m-%d %H:%M:%S", threshold=0.0 + ) + assert isna(res) + + def test_series(self): + series = Series(["2020", "1999", "2011"]) + result = to_datetime(series, format="%Y") + expected = Series( + [ + Timestamp("2020-01-01"), + Timestamp("1999-01-01"), + Timestamp("2011-01-01"), + ] + ) + tm.assert_series_equal(result, expected) + + series = Series(["199"]) + result = to_datetime(series, format="%Y", threshold=0.0) + assert isna(result[0]) + + series = Series(["2020-01-101", "1999-101-01", "211-10-11", "2020-01-01"]) + result = to_datetime(series, format="ISO8601", threshold=0.5) + assert isna(result[0]) + assert isna(result[1]) + assert isna(result[2]) + assert not isna(result[3]) + + series = Series(["2020-01-101", "1999-101-01", "2011-101-101"]) + with pytest.raises( + ValueError, match="^Time data 2011-101-101 is not ISO8601 format" + ): + _ = to_datetime(series, format="ISO8601", threshold=0.5) + + def test_errors_is_coerce(self): + series = Series(["2020", "20xx"]) + result = to_datetime(series, format="%Y", errors="coerce", threshold=1.0) + expected = Series([Timestamp("2020-01-01"), NaT]) + tm.assert_series_equal(result, expected) + + def test_iso_and_format_have_same_threshold_behavior(self): + assert isna(to_datetime("2018-202-01", format="ISO8601", threshold=0.5)) + assert isna(to_datetime("2018-202-01", format="%Y-%m-%d", threshold=0.5)) + + def test_microseconds_does_not_count(self): + with pytest.raises(ValueError, match="^time data *"): + _ = to_datetime( + "20181-021-011 111:010:010.0000000011", + format="%Y-%m-%d %H:%M:%S.%f", + threshold=0.01, + ) + + def test_one_component(self): + res = to_datetime("20181", format="%Y", threshold=0.0) + assert isna(res) + + def test_parse_mixed_format_threshold(self): + series = Series(["2020-01-01", "01/02/2021", "2021-13-01"]) + result = to_datetime(series, format="mixed", threshold=0.5, errors="coerce") + expected = Series([Timestamp("2020-01-01"), Timestamp("2021-01-02"), NaT]) + tm.assert_series_equal(result, expected) + + def test_example(self): + result = to_datetime( + "2018-100-26 12:00:00", + format="%Y-%m-%d %H:%M:%S", + threshold=0.5, + ) + assert isna(result) + + def test_bad_threshold(self): + with pytest.raises( + ValueError, match="`threshold` must be between 0.0 and 1.0, got -0.5" + ): + _ = to_datetime( + "2020-01-01 12:20:20", + format="%Y-%m-%d %H:%M:%S", + threshold=-0.5, + ) + + with pytest.raises( + ValueError, match="`threshold` must be between 0.0 and 1.0, got 2.0" + ): + _ = to_datetime( + "2020-01-01 12:20:20", + format="%Y-%m-%d %H:%M:%S", + threshold=2.0, + )