Skip to content

Commit f820068

Browse files
committed
column naming for empty string and duplicate NA label
1 parent f9c2824 commit f820068

File tree

1 file changed

+46
-15
lines changed

1 file changed

+46
-15
lines changed

R/fcast.R

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,31 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
129129
stopf("Argument 'value.var.in.dots' should be logical TRUE/FALSE")
130130
if (!isTRUEorFALSE(value.var.in.LHSdots) || !isTRUEorFALSE(value.var.in.RHSdots))
131131
stopf("Arguments 'value.var.in.LHSdots', 'value.var.in.RHSdots' should be logical TRUE/FALSE")
132-
# #2980 if explicitly providing fun.aggregate=length but not a value.var,
133-
# just use the last column (as guess(data) would do) because length will be
134-
# the same on all columns
132+
133+
handle_empty_strings = function(names) {
134+
names[names == ""] = "empty_string"
135+
names
136+
}
137+
138+
ensure_unique_names = function(names) {
139+
if (any(duplicated(names))) {
140+
names = make.unique(names, sep = "_")
141+
}
142+
names
143+
}
144+
135145
if (missing(value.var) && !missing(fun.aggregate) && identical(fun.aggregate, length))
136146
value.var = names(data)[ncol(data)]
147+
137148
lvals = value_vars(value.var, names(data))
138149
valnames = unique(unlist(lvals))
150+
151+
valnames = handle_empty_strings(valnames)
152+
valnames = ensure_unique_names(valnames)
153+
139154
lvars = check_formula(formula, names(data), valnames, value.var.in.LHSdots, value.var.in.RHSdots)
140155
lvars = lapply(lvars, function(x) if (length(x)) x else quote(`.`))
141-
# tired of lapply and the way it handles environments!
156+
142157
allcols = c(unlist(lvars), lapply(valnames, as.name))
143158
dat = vector("list", length(allcols))
144159
for (i in seq_along(allcols)) {
@@ -147,18 +162,21 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
147162
if (is.function(dat[[i]]))
148163
stopf("Column [%s] not found or of unknown type.", deparse(x))
149164
}
165+
150166
setattr(lvars, 'names', c("lhs", "rhs"))
151-
# Have to take care of duplicate names, and provide names for expression columns properly.
167+
152168
varnames = make.unique(vapply_1c(unlist(lvars), all.vars, max.names=1L), sep=sep)
153169
dupidx = which(valnames %chin% varnames)
154170
if (length(dupidx)) {
155171
dups = valnames[dupidx]
156172
valnames = tail(make.unique(c(varnames, valnames)), -length(varnames))
157173
lvals = lapply(lvals, function(x) { x[x %chin% dups] = valnames[dupidx]; x })
158174
}
175+
159176
lhsnames = head(varnames, length(lvars$lhs))
160177
rhsnames = tail(varnames, -length(lvars$lhs))
161178
setattr(dat, 'names', c(varnames, valnames))
179+
162180
if (any(vapply_1b(dat[varnames], is.list))) {
163181
stopf("Columns specified in formula can not be of type list")
164182
}
@@ -168,9 +186,10 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
168186
subset = m[["subset"]][[2L]]
169187
if (!is.null(subset)) {
170188
if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
171-
idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
189+
idx = which(eval(subset, data, parent.frame()))
172190
dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
173191
}
192+
174193
fun.call = m[["fun.aggregate"]]
175194
if (is.null(fun.call)) {
176195
oo = forderv(dat, by=varnames, retGrp=TRUE)
@@ -179,14 +198,15 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
179198
fun.call = quote(length)
180199
}
181200
}
201+
182202
dat_for_default_fill = dat
183203
run_agg_funs = !is.null(fun.call)
184204
if (run_agg_funs) {
185205
fun.call = aggregate_funs(fun.call, lvals, sep, ...)
186206
maybe_err = function(list.of.columns) {
187207
if (!all(lengths(list.of.columns) == 1L)) {
188208
msg <- gettext("Aggregating functions should take a vector as input and return a single value (length=1), but they do not, so the result is undefined. Please fix by modifying your function so that a single value is always returned.")
189-
if (is.null(fill)) { # TODO change to always stopf #6329
209+
if (is.null(fill)) {
190210
stop(msg, domain=NA, call. = FALSE)
191211
} else {
192212
warning(msg, domain=NA, call. = FALSE)
@@ -196,12 +216,14 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
196216
}
197217
dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)]
198218
}
219+
199220
order_ = function(x) {
200221
o = forderv(x, retGrp=TRUE, sort=TRUE)
201222
idx = attr(o, 'starts', exact=TRUE)
202223
if (!length(o)) o = seq_along(x)
203-
o[idx] # subsetVector retains attributes, using R's subset for now
224+
o[idx]
204225
}
226+
205227
cj_uniq = function(DT) {
206228
do.call(CJ, lapply(DT, function(x)
207229
if (is.factor(x)) {
@@ -210,15 +232,17 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
210232
setattr(xint, 'class', class(x))
211233
} else .Call(CsubsetVector, x, order_(x))
212234
))}
235+
213236
valnames = setdiff(names(dat), varnames)
214-
# 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
237+
215238
if (!is.null(fun.call) || !is.null(subset))
216239
setkeyv(dat, varnames)
240+
217241
if (length(rhsnames)) {
218242
lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
219-
# handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
243+
220244
if (all(drop)) {
221-
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix
245+
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE)))
222246
maporder = lapply(map, order_)
223247
mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
224248
lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
@@ -235,21 +259,28 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
235259
.Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
236260
lhs = lhs_; rhs = rhs_
237261
}
262+
238263
maplen = lengths(mapunique)
239-
idx = do.call(CJ, mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
264+
idx = do.call(CJ, mapunique)[map, 'I' := .I][["I"]]
240265
some_fill = anyNA(idx)
241266
fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))]
242267
if (run_agg_funs && is.null(fill) && some_fill) {
243268
fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))]
244269
}
270+
245271
ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill)
246272
allcols = do.call(paste, c(rhs, sep=sep))
247273
if (length(valnames) > 1L)
248274
allcols = do.call(paste, if (identical(".", allcols)) list(valnames, sep=sep)
249275
else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
250-
# removed 'setcolorder()' here, #1153
276+
277+
if (length(lhsnames) + length(allcols) != length(ans)) {
278+
stopf("Length mismatch: 'names' attribute [%d] must match the vector length [%d].", length(lhsnames) + length(allcols), length(ans))
279+
}
280+
251281
setattr(ans, 'names', c(lhsnames, allcols))
252282
setDT(ans); setattr(ans, 'sorted', lhsnames)
253-
} else internal_error("empty rhsnames") # nocov
283+
} else internal_error("empty rhsnames")
284+
254285
return(ans)
255-
}
286+
}

0 commit comments

Comments
 (0)