RelationalAI · Drvi · Mar 10, 2025 · Mar 1, 2025 · Mar 3, 2025 · Mar 3, 2025
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -17,8 +17,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
-          - '1.8'
+          - '1.10'
           - '1'
           - 'nightly'
         os:
@@ -33,7 +32,7 @@ jobs:
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: actions/cache@v2
+      - uses: actions/cache@v4
         env:
           cache-name: cache-artifacts
         with:

diff --git a/Project.toml b/Project.toml
@@ -13,22 +13,24 @@ SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
 TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
 
 [compat]
+BenchmarkTools = "1.6"
 ChunkedBase = "0.3"
 Dates = "1"
 FixedPointDecimals = "0.4.3, 0.5, 0.6"
 Parsers = "2.7"
 SentinelArrays = "1"
 SnoopPrecompile = "1"
 TimeZones = "1"
-julia = "1.6"
+julia = "1.10"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CodecZlibNG = "642d12eb-acb5-4437-bcfc-a25e07ad685c"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [targets]
-test = ["Aqua", "CodecZlibNG", "JET", "Logging", "Test", "UUIDs"]
+test = ["Aqua", "BenchmarkTools", "CodecZlibNG", "JET", "Logging", "Test", "UUIDs"]
diff --git a/src/type_parsers/datetime_parser.jl b/src/type_parsers/datetime_parser.jl
@@ -16,6 +16,18 @@ It will parse the following formats:
 - `yyyy-mm-ddTHH:MM:SSZ`
 - `yyyy-mm-ddTHH:MM:SS.sZ`
 
+Negative years are also supported. The smallest DateTime value that can be represented is
+`-292277024-05-15T16:47:04.192` and the largest is `292277025-08-17T07:12:55.807`, since
+they are backed by a 64 bit integer with millisecond precision. These values correspond to
+`DateTime(Dates.UTM(typemin(Int)))` and `DateTime(Dates.UTM(typemax(Int)))` respectively.
+
+Additionally, since some systems use 32 bit integers to represent years and we don't want to
+fail loudly parsing these even though we can't represent them exactly, all valid
+timestamps within the range `[-2147483648-01-01T00:00:00.000, -292277024-05-15T16:47:04.191]`
+will be clamped to the minimal representable DateTime, `-292277024-05-15T16:47:04.192`, and all valid
+timestamps within the range `[292277025-08-17T07:12:55.808, 2147483647-12-31T23:59:59.999]`
+will be clamped to the maximal representable DateTime, `292277025-08-17T07:12:55.807`.
+
 # Examples:
 ```julia
 julia> Parsers.xparse(ChunkedCSV.GuessDateTime, "2014-01-01")
@@ -49,108 +61,155 @@ function _unsafe_datetime(y=0, m=1, d=1, h=0, mi=0, s=0, ms=0)
     rata = ms + 1000 * (s + 60mi + 3600h + 86400 * Dates.totaldays(y, m, d))
     return DateTime(Dates.UTM(rata))
 end
+function _clamped_datetime(y, m, d, h=0, mi=0, s=0, ms=0)
+    dt = _unsafe_datetime(y, m, d, h, mi, s, ms) # plain Int arithmetic: may wrap (possibly more than once) for extreme years
+    (y > 292277025 || (y == 292277025 && dt < ZERO_DATETIME)) && return MAX_DATETIME # beyond the boundary year always clamp; inside it a single wrap shows up as a sign flip
+    (y < -292277024 || (y == -292277024 && dt > ZERO_DATETIME)) && return MIN_DATETIME # mirror image of the check above for the lower limit
+    return dt
+end
+
+function _clamped_datetime_from_zoned(year::Int, zdt::ZonedDateTime)
+    dt = DateTime(zdt, TimeZones.UTC) # NOTE(review): assumes the UTC shift of a near-limit value wraps rather than throws - confirm against TimeZones.jl
+    (year > 292277025 || (year == 292277025 && dt < ZERO_DATETIME)) && return MAX_DATETIME # years past the boundary year clamp regardless of the offset
+    (year < -292277024 || (year == -292277024 && dt > ZERO_DATETIME)) && return MIN_DATETIME # same rule for the lower limit
+    return dt
+end
+
+const MAX_DATETIME = DateTime(Dates.UTM(typemax(Int))) # largest DateTime used as the upper clamp (millisecond count == typemax(Int))
+const MIN_DATETIME = DateTime(Dates.UTM(typemin(Int))) # smallest DateTime used as the lower clamp (millisecond count == typemin(Int))
+const ZERO_DATETIME = DateTime(Dates.UTM(0)) # zero-millisecond instant; the clamping helpers compare against it to detect a sign flip
 
-# [y]yyy-[m]m-[d]d((T|\s)HH:MM:SS(\.s{1,3}})?)?(zzzz|ZZZ|\Z)?
+# [-]y{1,10}-mm-dd((T|\s)HH:MM:SS(\.s{1,3})?)?(zzzz|ZZZ|\Z)?
 Base.@propagate_inbounds function _default_tryparse_timestamp(buf, pos, len, code, b, options)
     delim = options.delim.token
     cq = options.cq.token
     rounding = options.rounding
-    # ensure there is enough room for at least yyyy-m-d
-    if len - pos < 8
+    # ensure there is enough room for at least y-mm-dd
+    if len - pos + 1 < 7
         (b != delim) && (code |= Parsers.INVALID)
         (pos >= len) && (code |= Parsers.EOF)
         return _unsafe_datetime(0), code, pos
     end
+    sign_mul = 1 # becomes -1 when the year carries a leading '-'
+    if b == UInt8('-')
+        sign_mul = -1
+        pos += 1
+        b = buf[pos]
+    end
 
     year = 0
-    for i in 1:4
-        b -= 0x30
-        b > 0x09 && (return _unsafe_datetime(0), code | Parsers.INVALID, pos)
-        year = Int(b) + 10 * year
-        b = buf[pos += 1]
-        (i > 2 && b == UInt8('-')) && break
+    for i in 1:10 # 10 digits max, since that is the maximum length of a 32 bit integer, anything larger is invalid
+        b0 = b - 0x30
+        b0 > 0x09 && (return _unsafe_datetime(0), code | Parsers.INVALID, pos)
+        year = Int(b0) + 10 * year
+        pos += 1
+        pos > len && (return _unsafe_datetime(0), code | Parsers.INVALID | Parsers.EOF, pos)
+        b = buf[pos]
+        b == UInt8('-') && break
     end
-    b != UInt8('-')  && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
-    b = buf[pos += 1]
+    year *= sign_mul
+    # If the year is larger than what can be represented by a 32 bit integer, fail to parse,
+    # values between typemin(Int32) and MIN_DATETIME are clamped to MIN_DATETIME
+    # values between typemax(Int32) and MAX_DATETIME are clamped to MAX_DATETIME
+    overflowed = (year > typemax(Int32) || year < typemin(Int32))
+    if b != UInt8('-') || overflowed
+        (overflowed || (b - 0x30) <= 0x09) && (code |= Parsers.OVERFLOW) # flag OVERFLOW when the value left the Int32 range or an 11th year digit follows; other trailing bytes are plain INVALID
+        return (_unsafe_datetime(year), code | Parsers.INVALID, pos)
+    end
+    pos += 1
+    pos > len && (return _unsafe_datetime(0), code | Parsers.INVALID | Parsers.EOF, pos)
+    b = buf[pos]
 
     month = 0
     for _ in 1:2
-        b -= 0x30
-        b > 0x09 && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
-        month = Int(b) + 10 * month
-        b = buf[pos += 1]
-        b == UInt8('-') && break
+        b0 = b - 0x30
+        b0 > 0x09 && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
+        month = Int(b0) + 10 * month
+        pos += 1
+        pos > len && (return _unsafe_datetime(year), code | Parsers.INVALID | Parsers.EOF, pos)
+        b = buf[pos]
     end
-    month > 12 && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
+    (month == 0 || month > 12) && (return _unsafe_datetime(year), code | Parsers.INVALID, pos) # reject month 00 here, before it reaches Dates.daysinmonth below
-    b != UInt8('-')  && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
-    b = buf[pos += 1]
+    b != UInt8('-') && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
+    pos += 1
+    pos > len && (return _unsafe_datetime(year, month), code | Parsers.INVALID | Parsers.EOF, pos)
+    b = buf[pos]
 
     day = 0
-    for _ in 1:2
-        b -= 0x30
-        b > 0x09 && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
-        day = Int(b) + 10 * day
-        pos == len && (code |= Parsers.EOF; break)
-        b = buf[pos += 1]
-        (b == UInt8('T') ||  b == UInt8(' ')) && break
+    for i in 1:2
+        b0 = b - 0x30
+        b0 > 0x09 && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
+        day = Int(b0) + 10 * day
+        pos += 1
+        if pos > len
+            code |= Parsers.EOF;
+            if i == 2
+                break # 2 digit day at the very end of the buffer
+            else # 1 digit day is an error
+                return (_unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
+            end
+        else
+            b = buf[pos]
+        end
     end
-    day > Dates.daysinmonth(year, month) && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
+    (day == 0 || day > Dates.daysinmonth(year, month)) && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos) # day 00 is not a calendar day and must not parse as OK
-    if (pos >= len || (b != UInt8('T') && b != UInt8(' ')))
-        if !(b == delim || b == cq)
-            code |= Parsers.EOF
-            pos += 1
-        end
-        return _unsafe_datetime(year, month, day), code | Parsers.OK, pos
+    if (pos > len) || (b != UInt8('T') && b != UInt8(' '))
+        return _clamped_datetime(year, month, day), code | Parsers.OK, pos
     end
     # ensure there is enough room for at least HH:MM:DD
-    len - pos < 8 && (return _unsafe_datetime(0), code | Parsers.INVALID, len)
+    len - pos + 1 < 8 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
     b = buf[pos += 1]
 
     hour = 0
     for _ in 1:2
-        b -= 0x30
-        b > 0x09 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
-        hour = Int(b) + 10 * hour
+        b0 = b - 0x30
+        b0 > 0x09 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
+        hour = Int(b0) + 10 * hour
         b = buf[pos += 1]
     end
-    hour > 24 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
+    hour >= 24 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
     b != UInt8(':') && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
     b = buf[pos += 1]
 
     minute = 0
     for _ in 1:2
-        b -= 0x30
-        b > 0x09 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
-        minute = Int(b) + 10 * minute
+        b0 = b - 0x30
+        b0 > 0x09 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
+        minute = Int(b0) + 10 * minute
         b = buf[pos += 1]
     end
-    minute > 60 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
+    minute >= 60 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
     b != UInt8(':') && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
     b = buf[pos += 1]
 
     second = 0
     for _ in 1:2
-        b -= 0x30
-        b > 0x09 && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
-        second = Int(b) + 10 * second
-        pos == len && break
-        b = buf[pos += 1]
+        b0 = b - 0x30
+        b0 > 0x09 && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
+        second = Int(b0) + 10 * second
+        pos += 1
+        pos > len && (code |= Parsers.EOF; break)
+        b = buf[pos]
     end
-    if (pos == len || b == delim || b == cq)
-        code |= isnothing(Dates.validargs(DateTime, year, month, day, hour, minute, second, 0)) ? Parsers.OK : Parsers.INVALID
-        if !(b == delim || b == cq)
-            code |= Parsers.EOF
-            pos += 1
-        end
-        return _unsafe_datetime(year, month, day, hour, minute, second), code, pos
+    second >= 60 && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
+    if pos > len
+        return _clamped_datetime(year, month, day, hour, minute, second), code | Parsers.OK, pos
     end
 
     millisecond = 0
     if b == UInt8('.')
         i = 0
-        while pos < len && ((b = (buf[pos += 1] - UInt8('0'))) <= 0x09)
-            millisecond = Int(b) + 10 * millisecond
+        pos += 1
+        pos > len && (return _unsafe_datetime(year, month, day, hour, minute, second), code | Parsers.INVALID | Parsers.EOF, pos)
+        b = buf[pos]
+        while true
+            b0 = b - UInt8('0')
+            b0 > 0x09 && break
             i += 1
+            millisecond = Int(b0) + 10 * millisecond
+            pos += 1
+            pos > len && break
+            b = buf[pos]
         end
 
         i == 0 && (return _unsafe_datetime(year, month, day, hour, minute, second), code | Parsers.INVALID, pos)
@@ -181,34 +240,26 @@ Base.@propagate_inbounds function _default_tryparse_timestamp(buf, pos, len, cod
                 throw(ArgumentError("invalid rounding mode: $rounding"))
             end
         end
-
-        b += UInt8('0')
-        if (pos == len || b == delim || b == cq)
-            code |= isnothing(Dates.validargs(DateTime, year, month, day, hour, minute, second, millisecond)) ? Parsers.OK : Parsers.INVALID
-            if !(b == delim || b == cq)
-                code |= Parsers.EOF
-                pos += 1
-            end
-            return _unsafe_datetime(year, month, day, hour, minute, second, millisecond), code, pos
-        end
+        millisecond >= 1000 && (return _unsafe_datetime(year, month, day, hour, minute, second), code | Parsers.INVALID, pos)
     end
     b == UInt8(' ') && pos < len && (b = buf[pos += 1])
+
     tz, pos, code = _tryparse_timezone(buf, pos, b, len, code)
-    pos >= len && (code |= Parsers.EOF)
-    Parsers.invalid(code) && (return _unsafe_datetime(year, month, day, hour, minute, second, millisecond), code , pos)
-    if isnothing(Dates.validargs(ZonedDateTime, year, month, day, hour, minute, second, millisecond, tz))
-        code |= Parsers.OK
+    pos > len && (code |= Parsers.EOF)
+
+    dt = _clamped_datetime(year, month, day, hour, minute, second, millisecond)
+    code |= Parsers.OK
+    if isnothing(tz)
+        return (dt, code, pos)
+    else
         if tz === _Z
             # Avoiding TimeZones.ZonedDateTime to save some allocations in case the `tz`
             # corresponds to a UTC time zone.
-            return (_unsafe_datetime(year, month, day, hour, minute, second, millisecond), code, pos)
+            return (dt, code, pos)
         else
-            dt = _unsafe_datetime(year, month, day, hour, minute, second, millisecond)
-            ztd = TimeZones.ZonedDateTime(dt, TimeZones.TimeZone(tz))
-            return (DateTime(ztd, TimeZones.UTC), code, pos)
+            zdt = TimeZones.ZonedDateTime(dt, TimeZones.TimeZone(tz))
+            return (_clamped_datetime_from_zoned(year, zdt), code, pos)
         end
-    else
-        return (_unsafe_datetime(0), code | Parsers.INVALID, pos)
     end
 end
 
@@ -217,6 +268,12 @@ end
 # This is needed until https://github.com/JuliaTime/TimeZones.jl/issues/271 is fixed
 const _Z = SubString("Z", 1:1)
 @inline function _tryparse_timezone(buf, pos, b, len, code)
+    # At this point we don't even know whether there is a timezone to parse; we might be at the
+    # end of the field. So if the TZ parse comes back invalid, we return the _original_ code
+    # together with whatever `tz` the failed parse produced (expected to be `nothing` - TODO
+    # confirm), as if we had never attempted to parse it. If this _was_ a truly invalid timezone,
+    # the other layers in Parsers.jl will mark the value as invalid, because non-whitespace
+    # characters would be left between the end of the value and the delimiter.
     nb = len - pos
     @inbounds if b == UInt8('+') || b == UInt8('-')
         if nb > 1 && buf[pos+1] == UInt8('0')
@@ -232,8 +289,8 @@ const _Z = SubString("Z", 1:1)
                 end
             end
         end
-        (tz, pos, _, code) = Parsers.tryparsenext(Dates.DatePart{'z'}(4, false), buf, pos, len, b, code)
-        return tz, pos, code
+        (tz, pos, _, code_tz) = Parsers.tryparsenext(Dates.DatePart{'z'}(4, false), buf, pos, len, b, code)
+        return tz, pos, Parsers.invalid(code_tz) ? code : code_tz # on failure, hand back the caller's code instead of the parser's INVALID one
     end
 
     @inbounds if b == UInt8('G')
@@ -247,8 +304,8 @@ const _Z = SubString("Z", 1:1)
             return (_Z, pos+3, code) # UTC
         end
     end
-    (tz, pos, _, code) = Parsers.tryparsenext(Dates.DatePart{'Z'}(3, false), buf, pos, len, b, code)
-    return tz, pos, code
+    (tz, pos, _, code_tz) = Parsers.tryparsenext(Dates.DatePart{'Z'}(3, false), buf, pos, len, b, code)
+    return tz, pos, Parsers.invalid(code_tz) ? code : code_tz # same rule as the offset branch: an invalid TZ parse leaves `code` untouched
 end
 
 function Parsers.typeparser(::Parsers.AbstractConf{GuessDateTime}, source::AbstractVector{UInt8}, pos, len, b, code, pl, options)