Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ jobs:
fail-fast: false
matrix:
version:
- '1.6'
- '1.8'
- '1.10'
- '1'
- 'nightly'
os:
Expand All @@ -33,7 +32,7 @@ jobs:
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v2
- uses: actions/cache@v4
env:
cache-name: cache-artifacts
with:
Expand Down
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,24 @@ SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"

[compat]
BenchmarkTools = "1.6"
ChunkedBase = "0.3"
Dates = "1"
FixedPointDecimals = "0.4.3, 0.5, 0.6"
Parsers = "2.7"
SentinelArrays = "1"
SnoopPrecompile = "1"
TimeZones = "1"
julia = "1.6"
julia = "1.10"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CodecZlibNG = "642d12eb-acb5-4437-bcfc-a25e07ad685c"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[targets]
test = ["Aqua", "CodecZlibNG", "JET", "Logging", "Test", "UUIDs"]
test = ["Aqua", "BenchmarkTools", "CodecZlibNG", "JET", "Logging", "Test", "UUIDs"]
213 changes: 135 additions & 78 deletions src/type_parsers/datetime_parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@ It will parse the following formats:
- `yyyy-mm-ddTHH:MM:SSZ`
- `yyyy-mm-ddTHH:MM:SS.sZ`

Negative years are also supported. The smallest DateTime value that can be represented is
`-292277024-05-15T16:47:04.192` and the largest is `292277025-08-17T07:12:55.807`, since
they are backed by a 64 bit integer with millisecond precision. These values correspond to
`DateTime(Dates.UTM(typemin(Int)))` and `DateTime(Dates.UTM(typemax(Int)))` respectively.

Additionally, since some systems use 32 bit integers to represent years and we don't want to
fail loudly parsing these even though we can't represent them exactly, all valid
timestamps within the range `[-2147483648-01-01T00:00:00.000, -292277024-05-15T16:47:04.191]`
will be clamped to the minimal representable DateTime, `-292277024-05-15T16:47:04.192`, and all valid
timestamps within the range `[292277025-08-17T07:12:55.808, 2147483647-12-31T23:59:59.999]`
will be clamped to the maximal representable DateTime, `292277025-08-17T07:12:55.807`.

# Examples:
```julia
julia> Parsers.xparse(ChunkedCSV.GuessDateTime, "2014-01-01")
Expand Down Expand Up @@ -49,108 +61,155 @@ function _unsafe_datetime(y=0, m=1, d=1, h=0, mi=0, s=0, ms=0)
rata = ms + 1000 * (s + 60mi + 3600h + 86400 * Dates.totaldays(y, m, d))
return DateTime(Dates.UTM(rata))
end
"""
    _clamped_datetime(y, m, d, h=0, mi=0, s=0, ms=0)

Build a `DateTime` from the given components, saturating to `MIN_DATETIME` or
`MAX_DATETIME` when the components describe an instant outside the representable range.

The components are forwarded to `_unsafe_datetime`, which does no range checking of its
own. With `Int` components its millisecond arithmetic wraps around on overflow, and for
years far beyond the representable range it can wrap more than once and land back on the
"right" side of zero. The sign of the wrapped result is therefore only a reliable overflow
signal within the boundary years themselves; years strictly beyond them are clamped on the
year alone.
"""
function _clamped_datetime(y, m, d, h=0, mi=0, s=0, ms=0)
    # Every instant in a year strictly past the boundary year is out of range, no matter
    # what the month/day/time components are.
    y > 292277025 && return MAX_DATETIME
    y < -292277024 && return MIN_DATETIME
    dt = _unsafe_datetime(y, m, d, h, mi, s, ms)
    # Inside a boundary year the arithmetic can overflow by less than one year's worth of
    # milliseconds, i.e. wrap at most once, which flips the sign of the result.
    y == 292277025 && dt < ZERO_DATETIME && return MAX_DATETIME
    y == -292277024 && dt > ZERO_DATETIME && return MIN_DATETIME
    return dt
end

"""
    _clamped_datetime_from_zoned(year::Int, zdt::ZonedDateTime)

Convert `zdt` to a UTC `DateTime`, saturating to `MIN_DATETIME` or `MAX_DATETIME` when the
instant is not representable.

`year` is the year the caller parsed from the input. Years strictly beyond the boundary
years are clamped without looking at `zdt`: the representable range ends months before the
end of year 292277025 (and starts months after the beginning of year -292277024), so a
time zone offset cannot bring such an instant back into range. NOTE(review): this assumes
UTC offsets are far smaller than that margin.

Within a boundary year, applying the offset can push the value just past the range, which
shows up as a sign flip of the converted value.
"""
function _clamped_datetime_from_zoned(year::Int, zdt::ZonedDateTime)
    year > 292277025 && return MAX_DATETIME
    year < -292277024 && return MIN_DATETIME
    dt = DateTime(zdt, TimeZones.UTC)
    # A single wrap-around caused by the offset flips the sign of the result.
    year == 292277025 && dt < ZERO_DATETIME && return MAX_DATETIME
    year == -292277024 && dt > ZERO_DATETIME && return MIN_DATETIME
    return dt
end

# `DateTime` whose underlying millisecond count is zero; the clamping helpers compare
# against it to detect a sign flip.
const ZERO_DATETIME = DateTime(Dates.UTM(0))
# Smallest and largest `DateTime` values: the extremes of the underlying `Int`
# millisecond count. Out-of-range timestamps are clamped to these.
const MIN_DATETIME = DateTime(Dates.UTM(typemin(Int)))
const MAX_DATETIME = DateTime(Dates.UTM(typemax(Int)))

# [y]yyy-[m]m-[d]d((T|\s)HH:MM:SS(\.s{1,3}})?)?(zzzz|ZZZ|\Z)?
# [-]y{1,10}-[m]m-[d]d((T|\s)HH:MM:SS(\.s{1,3})?)?(zzzz|ZZZ|\Z)?
Base.@propagate_inbounds function _default_tryparse_timestamp(buf, pos, len, code, b, options)
delim = options.delim.token
cq = options.cq.token
rounding = options.rounding
# ensure there is enough room for at least yyyy-m-d
if len - pos < 8
# ensure there is enough room for at least y-mm-dd
if len - pos + 1 < 7
(b != delim) && (code |= Parsers.INVALID)
(pos >= len) && (code |= Parsers.EOF)
return _unsafe_datetime(0), code, pos
end
sign_mul = 1
if b == UInt8('-')
sign_mul = -1
pos += 1
b = buf[pos]
end

year = 0
for i in 1:4
b -= 0x30
b > 0x09 && (return _unsafe_datetime(0), code | Parsers.INVALID, pos)
year = Int(b) + 10 * year
b = buf[pos += 1]
(i > 2 && b == UInt8('-')) && break
for i in 1:10 # 10 digits max, since that is the maximum length of a 32 bit integer, anything larger is invalid
b0 = b - 0x30
b0 > 0x09 && (return _unsafe_datetime(0), code | Parsers.INVALID, pos)
year = Int(b0) + 10 * year
pos += 1
pos > len && (return _unsafe_datetime(0), code | Parsers.INVALID | Parsers.EOF, pos)
b = buf[pos]
b == UInt8('-') && break
end
b != UInt8('-') && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
b = buf[pos += 1]
year *= sign_mul
# If the year is larger than what can be represented by a 32 bit integer, fail to parse,
# values between typemin(Int32) and MIN_DATETIME are clamped to MIN_DATETIME
# values between typemax(Int32) and MAX_DATETIME are clamped to MAX_DATETIME
overflowed = (year > typemax(Int32) || year < typemin(Int32))
if b != UInt8('-') || overflowed
overflowed || (code |= Parsers.OVERFLOW)
return (_unsafe_datetime(year), code | Parsers.INVALID, pos)
end
pos += 1
pos > len && (return _unsafe_datetime(0), code | Parsers.INVALID | Parsers.EOF, pos)
b = buf[pos]

month = 0
for _ in 1:2
b -= 0x30
b > 0x09 && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
month = Int(b) + 10 * month
b = buf[pos += 1]
b == UInt8('-') && break
b0 = b - 0x30
b0 > 0x09 && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
month = Int(b0) + 10 * month
pos += 1
pos > len && (return _unsafe_datetime(year), code | Parsers.INVALID | Parsers.EOF, pos)
b = buf[pos]
end
month > 12 && (return _unsafe_datetime(year), code | Parsers.INVALID, pos)
b != UInt8('-') && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
b = buf[pos += 1]
b != UInt8('-') && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
pos += 1
pos > len && (return _unsafe_datetime(year, month), code | Parsers.INVALID | Parsers.EOF, pos)
b = buf[pos]

day = 0
for _ in 1:2
b -= 0x30
b > 0x09 && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
day = Int(b) + 10 * day
pos == len && (code |= Parsers.EOF; break)
b = buf[pos += 1]
(b == UInt8('T') || b == UInt8(' ')) && break
for i in 1:2
b0 = b - 0x30
b0 > 0x09 && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
day = Int(b0) + 10 * day
pos += 1
if pos > len
code |= Parsers.EOF;
if i == 2
break # 2 digit day at the very end of the buffer
else # 1 digit day is an error
return (_unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
end
else
b = buf[pos]
end
end
day > Dates.daysinmonth(year, month) && (return _unsafe_datetime(year, month), code | Parsers.INVALID, pos)
if (pos >= len || (b != UInt8('T') && b != UInt8(' ')))
if !(b == delim || b == cq)
code |= Parsers.EOF
pos += 1
end
return _unsafe_datetime(year, month, day), code | Parsers.OK, pos
if (pos > len) || (b != UInt8('T') && b != UInt8(' '))
return _clamped_datetime(year, month, day), code | Parsers.OK, pos
end
# ensure there is enough room for at least HH:MM:SS
len - pos < 8 && (return _unsafe_datetime(0), code | Parsers.INVALID, len)
len - pos + 1 < 8 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
b = buf[pos += 1]

hour = 0
for _ in 1:2
b -= 0x30
b > 0x09 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
hour = Int(b) + 10 * hour
b0 = b - 0x30
b0 > 0x09 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
hour = Int(b0) + 10 * hour
b = buf[pos += 1]
end
hour > 24 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
hour >= 24 && (return _unsafe_datetime(year, month, day), code | Parsers.INVALID, pos)
b != UInt8(':') && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
b = buf[pos += 1]

minute = 0
for _ in 1:2
b -= 0x30
b > 0x09 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
minute = Int(b) + 10 * minute
b0 = b - 0x30
b0 > 0x09 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
minute = Int(b0) + 10 * minute
b = buf[pos += 1]
end
minute > 60 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
minute >= 60 && (return _unsafe_datetime(year, month, day, hour), code | Parsers.INVALID, pos)
b != UInt8(':') && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
b = buf[pos += 1]

second = 0
for _ in 1:2
b -= 0x30
b > 0x09 && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
second = Int(b) + 10 * second
pos == len && break
b = buf[pos += 1]
b0 = b - 0x30
b0 > 0x09 && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
second = Int(b0) + 10 * second
pos += 1
pos > len && (code |= Parsers.EOF; break)
b = buf[pos]
end
if (pos == len || b == delim || b == cq)
code |= isnothing(Dates.validargs(DateTime, year, month, day, hour, minute, second, 0)) ? Parsers.OK : Parsers.INVALID
if !(b == delim || b == cq)
code |= Parsers.EOF
pos += 1
end
return _unsafe_datetime(year, month, day, hour, minute, second), code, pos
second >= 60 && (return _unsafe_datetime(year, month, day, hour, minute), code | Parsers.INVALID, pos)
if pos > len
return _clamped_datetime(year, month, day, hour, minute, second), code | Parsers.OK, pos
end

millisecond = 0
if b == UInt8('.')
i = 0
while pos < len && ((b = (buf[pos += 1] - UInt8('0'))) <= 0x09)
millisecond = Int(b) + 10 * millisecond
pos += 1
pos > len && (return _unsafe_datetime(year, month, day, hour, minute, second), code | Parsers.INVALID | Parsers.EOF, pos)
b = buf[pos]
while true
b0 = b - UInt8('0')
b0 > 0x09 && break
i += 1
millisecond = Int(b0) + 10 * millisecond
pos += 1
pos > len && break
b = buf[pos]
end

i == 0 && (return _unsafe_datetime(year, month, day, hour, minute, second), code | Parsers.INVALID, pos)
Expand Down Expand Up @@ -181,34 +240,26 @@ Base.@propagate_inbounds function _default_tryparse_timestamp(buf, pos, len, cod
throw(ArgumentError("invalid rounding mode: $rounding"))
end
end

b += UInt8('0')
if (pos == len || b == delim || b == cq)
code |= isnothing(Dates.validargs(DateTime, year, month, day, hour, minute, second, millisecond)) ? Parsers.OK : Parsers.INVALID
if !(b == delim || b == cq)
code |= Parsers.EOF
pos += 1
end
return _unsafe_datetime(year, month, day, hour, minute, second, millisecond), code, pos
end
millisecond >= 1000 && (return _unsafe_datetime(year, month, day, hour, minute, second), code | Parsers.INVALID, pos)
end
b == UInt8(' ') && pos < len && (b = buf[pos += 1])

tz, pos, code = _tryparse_timezone(buf, pos, b, len, code)
pos >= len && (code |= Parsers.EOF)
Parsers.invalid(code) && (return _unsafe_datetime(year, month, day, hour, minute, second, millisecond), code , pos)
if isnothing(Dates.validargs(ZonedDateTime, year, month, day, hour, minute, second, millisecond, tz))
code |= Parsers.OK
pos > len && (code |= Parsers.EOF)

dt = _clamped_datetime(year, month, day, hour, minute, second, millisecond)
code |= Parsers.OK
if isnothing(tz)
return (dt, code, pos)
else
if tz === _Z
# Avoiding TimeZones.ZonedDateTime to save some allocations in case the `tz`
# corresponds to a UTC time zone.
return (_unsafe_datetime(year, month, day, hour, minute, second, millisecond), code, pos)
return (dt, code, pos)
else
dt = _unsafe_datetime(year, month, day, hour, minute, second, millisecond)
ztd = TimeZones.ZonedDateTime(dt, TimeZones.TimeZone(tz))
return (DateTime(ztd, TimeZones.UTC), code, pos)
zdt = TimeZones.ZonedDateTime(dt, TimeZones.TimeZone(tz))
return (_clamped_datetime_from_zoned(year, zdt), code, pos)
end
else
return (_unsafe_datetime(0), code | Parsers.INVALID, pos)
end
end

Expand All @@ -217,6 +268,12 @@ end
# This is needed until https://github.com/JuliaTime/TimeZones.jl/issues/271 is fixed
const _Z = SubString("Z", 1:1)
@inline function _tryparse_timezone(buf, pos, b, len, code)
# At this point we don't even know if there is a timezone to parse; we might be at the end of
# the field. So if we get an invalid TZ here, we return the _original_ code and `nothing`
# for the timezone, as if we had never attempted to parse it.
# If this _was_ a truly invalid timezone, the other layers in Parsers.jl will mark the value
# as invalid, because we stop before the end of the field and leave non-whitespace characters
# between the end of the value and the delimiter.
nb = len - pos
@inbounds if b == UInt8('+') || b == UInt8('-')
if nb > 1 && buf[pos+1] == UInt8('0')
Expand All @@ -232,8 +289,8 @@ const _Z = SubString("Z", 1:1)
end
end
end
(tz, pos, _, code) = Parsers.tryparsenext(Dates.DatePart{'z'}(4, false), buf, pos, len, b, code)
return tz, pos, code
(tz, pos, _, code_tz) = Parsers.tryparsenext(Dates.DatePart{'z'}(4, false), buf, pos, len, b, code)
return tz, pos, Parsers.invalid(code_tz) ? code : code_tz
end

@inbounds if b == UInt8('G')
Expand All @@ -247,8 +304,8 @@ const _Z = SubString("Z", 1:1)
return (_Z, pos+3, code) # UTC
end
end
(tz, pos, _, code) = Parsers.tryparsenext(Dates.DatePart{'Z'}(3, false), buf, pos, len, b, code)
return tz, pos, code
(tz, pos, _, code_tz) = Parsers.tryparsenext(Dates.DatePart{'Z'}(3, false), buf, pos, len, b, code)
return tz, pos, Parsers.invalid(code_tz) ? code : code_tz
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, don't think i fully understood the changes in this function -- which tests show this new behaviour? is it

        res = Parsers.xparse(ChunkedCSV.GuessDateTime, string(ChunkedCSV.MIN_DATETIME, "Z"))
        @test res.val == ChunkedCSV.MIN_DATETIME
        @test Parsers.ok(res.code)

should we / do we have a test with an actual invalid timezone to check we still end up with the invalid code?

Copy link
Member Author

@Drvi Drvi Mar 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this is tricky to explain, because I'm trying to fulfill the unwritten contract between Parsers.typeparser, which we're implementing here, and the other layers in Parsers.jl. Basically, the type parser must consume all valid bytes and stop once it encounters a byte that doesn't "belong" to the type it's parsing. Sometimes that byte is a delimiter or a whitespace in which case you shouldn't mark it as INVALID and let the other layers handle it, but sometimes certain structure is dictated by the parsing grammar, like if you have a float with a scientific notation exponent, then a number or sign must follow the e, otherwise this is an invalid float. Except, you are not supposed to rely on knowing what the delimiter or quote is, since these should only be handled by the other layers in Parsers.jl. And since we must consume all bytes that belong to the type, we must attempt to parse the timezone string, just in case it is there, and in case it is not there, we must handle it gracefully. So this is what I'm doing here. I need to handle both the case when there is a timezone, e.g. when we're here:

2024-01-01 00:00:00.000Z,
                       ^

and the other case

2024-01-01 00:00:00.000,
                       ^

Since the timezone is optional. If the timezone is just wrong:

2024-01-01 00:00:00.000!,
                       ^

We'll say -- well, this byte doesn't seem to belong to us, so we won't skip past it (and let the other layers in Parsers.jl handle it and mark it as invalid) but we also don't say this value is invalid. The other layers in Parsers.jl don't understand what a valid timezone is or isn't, but they understand there must be no more non-space characters after we consumed all the bytes for the type, so it will mark the parsing as invalid because of that.

So this code path is exercised in all test cases that have a time component (i.e. not just the date part), since in all of those, we'll reach this function. I've explicitly added the test set "parsing in context" to make sure we always return the expected code in these cases.

end

function Parsers.typeparser(::Parsers.AbstractConf{GuessDateTime}, source::AbstractVector{UInt8}, pos, len, b, code, pl, options)
Expand Down
Loading
Loading