@@ -129,16 +129,31 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
129129 stopf(" Argument 'value.var.in.dots' should be logical TRUE/FALSE" )
130130 if (! isTRUEorFALSE(value.var.in.LHSdots ) || ! isTRUEorFALSE(value.var.in.RHSdots ))
131131 stopf(" Arguments 'value.var.in.LHSdots', 'value.var.in.RHSdots' should be logical TRUE/FALSE" )
132- # #2980 if explicitly providing fun.aggregate=length but not a value.var,
133- # just use the last column (as guess(data) would do) because length will be
134- # the same on all columns
132+
133+ handle_empty_strings = function (names ) {
134+ names [names == " " ] = " empty_string"
135+ names
136+ }
137+
138+ ensure_unique_names = function (names ) {
139+ if (any(duplicated(names ))) {
140+ names = make.unique(names , sep = " _" )
141+ }
142+ names
143+ }
144+
135145 if (missing(value.var ) && ! missing(fun.aggregate ) && identical(fun.aggregate , length ))
136146 value.var = names(data )[ncol(data )]
147+
137148 lvals = value_vars(value.var , names(data ))
138149 valnames = unique(unlist(lvals ))
150+
151+ valnames = handle_empty_strings(valnames )
152+ valnames = ensure_unique_names(valnames )
153+
139154 lvars = check_formula(formula , names(data ), valnames , value.var.in.LHSdots , value.var.in.RHSdots )
140155 lvars = lapply(lvars , function (x ) if (length(x )) x else quote(`.` ))
141- # tired of lapply and the way it handles environments!
156+
142157 allcols = c(unlist(lvars ), lapply(valnames , as.name ))
143158 dat = vector(" list" , length(allcols ))
144159 for (i in seq_along(allcols )) {
@@ -147,18 +162,21 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
147162 if (is.function(dat [[i ]]))
148163 stopf(" Column [%s] not found or of unknown type." , deparse(x ))
149164 }
165+
150166 setattr(lvars , ' names' , c(" lhs" , " rhs" ))
151- # Have to take care of duplicate names, and provide names for expression columns properly.
167+
152168 varnames = make.unique(vapply_1c(unlist(lvars ), all.vars , max.names = 1L ), sep = sep )
153169 dupidx = which(valnames %chin % varnames )
154170 if (length(dupidx )) {
155171 dups = valnames [dupidx ]
156172 valnames = tail(make.unique(c(varnames , valnames )), - length(varnames ))
157173 lvals = lapply(lvals , function (x ) { x [x %chin % dups ] = valnames [dupidx ]; x })
158174 }
175+
159176 lhsnames = head(varnames , length(lvars $ lhs ))
160177 rhsnames = tail(varnames , - length(lvars $ lhs ))
161178 setattr(dat , ' names' , c(varnames , valnames ))
179+
162180 if (any(vapply_1b(dat [varnames ], is.list ))) {
163181 stopf(" Columns specified in formula can not be of type list" )
164182 }
@@ -168,9 +186,10 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
168186 subset = m [[" subset" ]][[2L ]]
169187 if (! is.null(subset )) {
170188 if (is.name(subset )) subset = as.call(list (quote(`(` ), subset ))
171- idx = which(eval(subset , data , parent.frame())) # any advantage thro' secondary keys?
189+ idx = which(eval(subset , data , parent.frame()))
172190 dat = .Call(CsubsetDT , dat , idx , seq_along(dat ))
173191 }
192+
174193 fun.call = m [[" fun.aggregate" ]]
175194 if (is.null(fun.call )) {
176195 oo = forderv(dat , by = varnames , retGrp = TRUE )
@@ -179,14 +198,15 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
179198 fun.call = quote(length )
180199 }
181200 }
201+
182202 dat_for_default_fill = dat
183203 run_agg_funs = ! is.null(fun.call )
184204 if (run_agg_funs ) {
185205 fun.call = aggregate_funs(fun.call , lvals , sep , ... )
186206 maybe_err = function (list.of.columns ) {
187207 if (! all(lengths(list.of.columns ) == 1L )) {
188208 msg <- gettext(" Aggregating functions should take a vector as input and return a single value (length=1), but they do not, so the result is undefined. Please fix by modifying your function so that a single value is always returned." )
189- if (is.null(fill )) { # TODO change to always stopf #6329
209+ if (is.null(fill )) {
190210 stop(msg , domain = NA , call. = FALSE )
191211 } else {
192212 warning(msg , domain = NA , call. = FALSE )
@@ -196,12 +216,14 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
196216 }
197217 dat = dat [, maybe_err(eval(fun.call )), by = c(varnames )]
198218 }
219+
199220 order_ = function (x ) {
200221 o = forderv(x , retGrp = TRUE , sort = TRUE )
201222 idx = attr(o , ' starts' , exact = TRUE )
202223 if (! length(o )) o = seq_along(x )
203- o [idx ] # subsetVector retains attributes, using R's subset for now
224+ o [idx ]
204225 }
226+
205227 cj_uniq = function (DT ) {
206228 do.call(CJ , lapply(DT , function (x )
207229 if (is.factor(x )) {
@@ -210,15 +232,17 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
210232 setattr(xint , ' class' , class(x ))
211233 } else .Call(CsubsetVector , x , order_(x ))
212234 ))}
235+
213236 valnames = setdiff(names(dat ), varnames )
214- # 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
237+
215238 if (! is.null(fun.call ) || ! is.null(subset ))
216239 setkeyv(dat , varnames )
240+
217241 if (length(rhsnames )) {
218242 lhs = shallow(dat , lhsnames ); rhs = shallow(dat , rhsnames ); val = shallow(dat , valnames )
219- # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
243+
220244 if (all(drop )) {
221- map = setDT(lapply(list (lhsnames , rhsnames ), function (cols ) frankv(dat , cols = cols , ties.method = " dense" , na.last = FALSE ))) # #2202 fix
245+ map = setDT(lapply(list (lhsnames , rhsnames ), function (cols ) frankv(dat , cols = cols , ties.method = " dense" , na.last = FALSE )))
222246 maporder = lapply(map , order_ )
223247 mapunique = lapply(seq_along(map ), function (i ) .Call(CsubsetVector , map [[i ]], maporder [[i ]]))
224248 lhs = .Call(CsubsetDT , lhs , maporder [[1L ]], seq_along(lhs ))
@@ -235,21 +259,28 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
235259 .Call(Csetlistelt , mapunique , 2L , seq_len(nrow(rhs_ )))
236260 lhs = lhs_ ; rhs = rhs_
237261 }
262+
238263 maplen = lengths(mapunique )
239- idx = do.call(CJ , mapunique )[map , ' I' : = .I ][[" I" ]] # TO DO: move this to C and avoid materialising the Cross Join.
264+ idx = do.call(CJ , mapunique )[map , ' I' : = .I ][[" I" ]]
240265 some_fill = anyNA(idx )
241266 fill.default = if (run_agg_funs && is.null(fill ) && some_fill ) dat_for_default_fill [, maybe_err(eval(fun.call ))]
242267 if (run_agg_funs && is.null(fill ) && some_fill ) {
243268 fill.default = dat_for_default_fill [0L ][, maybe_err(eval(fun.call ))]
244269 }
270+
245271 ans = .Call(Cfcast , lhs , val , maplen [[1L ]], maplen [[2L ]], idx , fill , fill.default , is.null(fun.call ), some_fill )
246272 allcols = do.call(paste , c(rhs , sep = sep ))
247273 if (length(valnames ) > 1L )
248274 allcols = do.call(paste , if (identical(" ." , allcols )) list (valnames , sep = sep )
249275 else c(CJ(valnames , allcols , sorted = FALSE ), sep = sep ))
250- # removed 'setcolorder()' here, #1153
276+
277+ if (length(lhsnames ) + length(allcols ) != length(ans )) {
278+ stopf(" Length mismatch: 'names' attribute [%d] must match the vector length [%d]." , length(lhsnames ) + length(allcols ), length(ans ))
279+ }
280+
251281 setattr(ans , ' names' , c(lhsnames , allcols ))
252282 setDT(ans ); setattr(ans , ' sorted' , lhsnames )
253- } else internal_error(" empty rhsnames" ) # nocov
283+ } else internal_error(" empty rhsnames" )
284+
254285 return (ans )
255- }
286+ }
0 commit comments