Skip to content

Commit a77e8c2

Browse files
Defer as.Date() coercion to R level (#6107)
* Refactor colClasses handling for readability * remove debug prints * C-level changes needed * update git hash again :\ * revert atime test, defer to follow-up * re-site NEWS
1 parent e9087ce commit a77e8c2

File tree

4 files changed

+24
-17
lines changed

4 files changed

+24
-17
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ rowwiseDT(
5757

5858
6. Fixed a segfault in `fcase()`, [#6448](https://github.com/Rdatatable/data.table/issues/6448). Thanks @ethanbsmith for reporting with reprex, @aitap for finding the root cause, and @MichaelChirico for the PR.
5959

60+
7. `fread()` performance improves when specifying `Date` among `colClasses`, [#6105](https://github.com/Rdatatable/data.table/issues/6105). One implication of the change is that the column will be an `IDate` (which also inherits from `Date`), which may affect code strongly relying on the column class to be `Date` exactly; computations with `IDate` and `Date` columns should otherwise be the same. If you strongly prefer the `Date` class, run `as.Date()` explicitly following `fread()`. Thanks @scipima for the report and @MichaelChirico for the fix.
61+
6062
## NOTES
6163

6264
1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.

inst/tests/S4.Rraw

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,14 +90,14 @@ ans_print = utils::capture.output(print(ans))
9090
if (TZnotUTC) {
9191
test(5.2, options=list(datatable.old.fread.datetime.character=NULL),
9292
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""),
93-
ans, output=ans_print)
93+
copy(ans)[, a := as.IDate(a)], output=ans_print)
9494
test(5.3, options=list(datatable.old.fread.datetime.character=NULL),
9595
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""),
96-
data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
96+
data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
9797
} else {
9898
test(5.4, options=list(datatable.old.fread.datetime.character=NULL),
9999
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
100-
ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
100+
ans<-data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
101101
test(5.5, options=list(datatable.old.fread.datetime.character=NULL),
102102
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)),
103103
ans, output=ans_print)

inst/tests/tests.Rraw

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11042,20 +11042,20 @@ test(1743.044, fread("a\n1", colClasses=""), data.table(a=1L))
1104211042
# Issue #1634: 'fread doesn't check colClasses to be valid type'
1104311043
# Currently using BioGenerics, which doesn't support USE.NAMES
1104411044
## Date supported character/list colClasses
11045-
test(1743.05, sapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), c(a="Date", b="integer"))
11046-
test(1743.06, sapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), c(a="Date", b="integer"))
11047-
test(1743.07, sapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), c(a="Date", b="Date"))
11048-
test(1743.08, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
11049-
test(1743.09, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
11050-
test(1743.10, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), c(a="Date",b="integer",c="complex",d="integer"))
11051-
test(1743.11, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), c(a="Date",b="integer",c="complex",d="raw"))
11045+
test(1743.05, lapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), list(a=c("IDate", "Date"), b="integer"))
11046+
test(1743.06, lapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), list(a=c("IDate", "Date"), b="integer"))
11047+
test(1743.07, lapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), list(a=c("IDate", "Date"), b=c("IDate", "Date")))
11048+
test(1743.08, lapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), list(a=c("IDate", "Date"), b="integer", c="complex"))
11049+
test(1743.09, lapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), list(a=c("IDate", "Date"), b="integer", c="complex"))
11050+
test(1743.10, lapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), list(a=c("IDate", "Date"), b="integer", c="complex", d="integer"))
11051+
test(1743.11, lapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), list(a=c("IDate", "Date"), b="integer", c="complex", d="raw"))
1105211052
test(1743.121, sapply(fread("a,b\n2015-01-01,2015-01-01", colClasses=c(NA,"IDate")), inherits, what="IDate"), c(a=TRUE, b=TRUE))
11053-
test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.Date("2015-01-01")))
11053+
test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.IDate("2015-01-01")))
1105411054
test(1743.123, fread("a,b\n1+3i,2015-01-01", colClasses=c(NA,"IDate")), data.table(a="1+3i", b=as.IDate("2015-01-01")))
1105511055

1105611056
## Attempts to impose incompatible colClasses is a warning (not an error)
1105711057
## and does not change the value of the columns
11058-
test(1743.13, sapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=c(a="character", b="Date"), warning=base_messages$ambiguous_date_fmt)
11058+
test(1743.13, lapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=list(a="character", b=c("IDate", "Date")), warning=base_messages$ambiguous_date_fmt)
1105911059

1106011060
## Just invalid
1106111061
test(1743.14, options = c(useFancyQuotes = FALSE),
@@ -11085,6 +11085,9 @@ test(1743.201, fread("A,B,C,D\nA,B,X,4", select=list(factor="A"), colClasses="ch
1108511085
test(1743.202, fread("A,B,C,D\nA,B,X,4", select=c(A="factor"), colClasses="character"), error="select= is a named vector.*but colClasses= has been provided as well. Please remove colClasses=.")
1108611086
test(1743.203, fread("A,B,C,D\nA,B,X,4", select=list(character="D", factor="B")), data.table(D="4", B=factor("B")))
1108711087
test(1743.204, fread("A,B,C,D\nA,B,X,4", select=list(character=4, character=2)), data.table(D="4", B="B"))
11088+
## Date+select
11089+
test(1743.205, fread("a,b\n2017-01-01,1", colClasses="Date", select="a"), data.table(a=as.IDate("2017-01-01")))
11090+
test(1743.206, fread("a,b\n2017-01-01,1", select=list(Date="a")), data.table(a=as.IDate("2017-01-01")))
1108811091

1108911092
## factors
1109011093
test(1743.211, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = 1L), select = 2:3), class), y = c(b="integer", c="character"))
@@ -16958,7 +16961,7 @@ test(2150.10, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+00:XX')])
1695816961
test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local time for backwards compatibility
1695916962
data.table(a=as.POSIXct("2015-01-01"), b=as.POSIXct("2015-01-01")))
1696016963
test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage
16961-
data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01")))
16964+
data.table(a=as.IDate("2015-01-01"), b=as.POSIXct("2015-01-01")))
1696216965
test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2", tz=""), # no Z, tz="" needed for this test from v1.14.0
1696316966
if (TZnotUTC) data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2))
1696416967
else data.table(a=setattr(as.POSIXct(c("2015-01-01 00:00:00", "2015-01-02 01:02:03"), tz="UTC"), "tzone", "UTC"), b=c(1.1, 1.2)))

src/freadR.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -335,10 +335,10 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
335335
type[i]=CT_STRING; // e.g. CT_ISO8601_DATE changed to character here so that as.POSIXct treats the date-only as local time in tests 1743.122 and 2150.11
336336
SET_STRING_ELT(colClassesAs, i, tt);
337337
}
338-
} else {
338+
} else if (type[i] != CT_ISO8601_DATE || tt != char_Date) {
339339
type[i] = typeEnum[w-1]; // freadMain checks bump up only not down
340340
if (w==NUT) SET_STRING_ELT(colClassesAs, i, tt);
341-
}
341+
} // else (when colClasses="Date" and fread found an IDate), don't update type[i] and don't signal any coercion needed on R side
342342
}
343343
} else { // selectColClasses==true
344344
if (!selectInts) internal_error(__func__, "selectInts is NULL but selectColClasses is true"); // # nocov
@@ -355,7 +355,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
355355
type[y-1]=CT_STRING;
356356
SET_STRING_ELT(colClassesAs, y-1, tt);
357357
}
358-
} else {
358+
} else if (type[i] != CT_ISO8601_DATE || tt != char_Date) {
359359
type[y-1] = typeEnum[w-1];
360360
if (w==NUT) SET_STRING_ELT(colClassesAs, y-1, tt);
361361
}
@@ -405,7 +405,9 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
405405
}
406406
if (selectRankD) selectRankD[colIdx-1] = rank++;
407407
// NB: mark as negative to indicate 'seen'
408-
if (colClassType == CT_ISO8601_TIME && type[colIdx-1]!=CT_ISO8601_TIME) {
408+
if (type[colIdx-1]==CT_ISO8601_DATE && colClassType==CT_STRING && STRING_ELT(listNames, i) == char_Date) {
409+
type[colIdx-1] *= -1;
410+
} else if (colClassType == CT_ISO8601_TIME && type[colIdx-1]!=CT_ISO8601_TIME) {
409411
type[colIdx-1] = -CT_STRING; // don't use in-built UTC parser, defer to character and as.POSIXct afterwards which reads in local time
410412
SET_STRING_ELT(colClassesAs, colIdx-1, STRING_ELT(listNames, i));
411413
} else {

0 commit comments

Comments
 (0)