diff --git a/DESCRIPTION b/DESCRIPTION index ebd01913c..e2e5304f2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,11 @@ Authors@R: person(given = "Lona", family = "Koers", role = "ctb", - email = "lona.koers@gmail.com")) + email = "lona.koers@gmail.com"), + person(given = "Alexander", + family = "Winterstetter", + role = "ctb", + email = "alexander.winterstetter@gmail.com")) Description: Dataflow programming toolkit that enriches 'mlr3' with a diverse set of pipelining operators ('PipeOps') that can be composed into graphs. Operations exist for data preprocessing, model fitting, and ensemble diff --git a/R/PipeOpImpute.R b/R/PipeOpImpute.R index 087435ed1..fc45b844c 100644 --- a/R/PipeOpImpute.R +++ b/R/PipeOpImpute.R @@ -329,7 +329,9 @@ PipeOpImpute = R6Class("PipeOpImpute", logical = c(TRUE, FALSE), numeric = 0, # see PipeOpImputeMean and PipeOpImputeMedian ordered = levels(feature), # see above - character = "" + character = "", + Date = as.Date(0), + POSIXct = as.POSIXct(0) ) }, diff --git a/R/PipeOpImputeConstant.R b/R/PipeOpImputeConstant.R index fd47a3424..f649ba9c3 100644 --- a/R/PipeOpImputeConstant.R +++ b/R/PipeOpImputeConstant.R @@ -83,7 +83,7 @@ PipeOpImputeConstant = R6Class("PipeOpImputeConstant", check_levels = p_lgl(init = TRUE, tags = c("train", "required")) ) super$initialize(id, param_set = ps, param_vals = param_vals, empty_level_control = "always", - feature_types = c("logical", "integer", "numeric", "character", "factor", "ordered", "POSIXct")) + feature_types = c("logical", "integer", "numeric", "character", "factor", "ordered", "POSIXct", "Date")) } ), private = list( @@ -96,7 +96,8 @@ PipeOpImputeConstant = R6Class("PipeOpImputeConstant", "character" = assert_string(constant), "factor" = assert_string_or_factor(constant), "ordered" = assert_string_or_factor(constant), - "POSIXct" = assert_posixct(constant, any.missing = FALSE, len = 1L) + "POSIXct" = assert_posixct(constant, any.missing = FALSE, len = 1L), + "Date" = 
assert_date(constant, any.missing = FALSE, len = 1L) ) if (type %in% c("ordered", "factor") && self$param_set$values$check_levels) { if (!isTRUE(check_choice(as.character(constant), levels(feature)))) { diff --git a/R/PipeOpImputeHist.R b/R/PipeOpImputeHist.R index 389eb870f..2fec53d93 100644 --- a/R/PipeOpImputeHist.R +++ b/R/PipeOpImputeHist.R @@ -1,11 +1,11 @@ -#' @title Impute Numerical Features by Histogram +#' @title Impute Numeric, Integer, POSIXct or Date Features by Histogram #' #' @usage NULL #' @name mlr_pipeops_imputehist #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpImpute`]/[`PipeOp`]. #' #' @description -#' Impute numerical features by histogram. +#' Impute numeric, integer, POSIXct or Date features by histogram. #' #' During training, a histogram is fitted on each column using R's [`hist()`][graphics::hist] function. #' The fitted histogram is then sampled from for imputation. Sampling happens in a two-step process: @@ -27,7 +27,7 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpImpute`]. #' -#' The output is the input [`Task`][mlr3::Task] with all affected numeric features missing values imputed by (column-wise) histogram; see Description for details. +#' The output is the input [`Task`][mlr3::Task] with all affected numeric, integer, POSIXct or Date features missing values imputed by (column-wise) histogram; see Description for details. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpImpute`]. 
@@ -66,13 +66,27 @@ PipeOpImputeHist = R6Class("PipeOpImputeHist", inherit = PipeOpImpute, public = list( initialize = function(id = "imputehist", param_vals = list()) { - super$initialize(id, param_vals = param_vals, packages = "graphics", feature_types = c("integer", "numeric")) + super$initialize(id, param_vals = param_vals, packages = "graphics", feature_types = c("integer", "numeric", "Date", "POSIXct")) } ), private = list( .train_imputer = function(feature, type, context) { - graphics::hist(feature, plot = FALSE)[c("counts", "breaks")] + if (inherits(feature, c("POSIXct", "Date"))) { + # hist() for POSIXct/Date does not do "Sturges" breaks automatically, so we compute it explicitly + n_breaks = ceiling(log2(length(feature)) + 1) + # If we pass the number of breaks, hist() does some computation that results in integer overflow + if (inherits(feature, "POSIXct")) { + breaks = as.POSIXct(as.numeric(pretty(range(feature, na.rm = TRUE), n = n_breaks, min.n = 1))) + } else { + breaks = as.Date(as.numeric(pretty(range(feature, na.rm = TRUE), n = n_breaks, min.n = 1))) + } + # pretty() does not return values of length < 2, so the special case where `breaks` gets + # interpreted differently does not need to be handled here. + graphics::hist(feature, breaks = breaks, plot = FALSE)[c("counts", "breaks")] + } else { + graphics::hist(feature, plot = FALSE)[c("counts", "breaks")] + } }, .impute = function(feature, type, model, context) { diff --git a/R/PipeOpImputeLearner.R b/R/PipeOpImputeLearner.R index 50159924a..dafc32c46 100644 --- a/R/PipeOpImputeLearner.R +++ b/R/PipeOpImputeLearner.R @@ -10,7 +10,7 @@ #' Note this parameter is part of the [`PipeOpImpute`] base class and explained there. #' #' Additionally, only features supported by the learner can be imputed; i.e. 
learners of type -#' `regr` can only impute features of type `integer` and `numeric`, while `classif` can impute +#' `regr` can only impute features of type `integer`, `numeric`, `POSIXct` and `Date`, while `classif` can impute #' features of type `factor`, `ordered` and `logical`. #' #' The [`Learner`][mlr3::Learner] used for imputation is trained on all `context_columns`; if these contain missing values, @@ -105,7 +105,7 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner", private$.learner = as_learner(learner, clone = TRUE) id = id %??% private$.learner$id feature_types = switch(private$.learner$task_type, - regr = c("integer", "numeric"), + regr = c("integer", "numeric", "POSIXct", "Date"), classif = c("logical", "factor", "ordered"), stop("Only `classif` or `regr` Learners are currently supported by PipeOpImputeLearner.") # FIXME: at least ordinal should also be possible. When Moore's law catches up with us we could even do `character` @@ -183,6 +183,8 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner", # Convert non-factor imputation targets to a factor if (is.numeric(feature)) { feature + } else if (any(class(feature) %in% c("POSIXct", "Date"))) { + as.numeric(feature) } else { if (!is.null(levels(feature))) { factor(feature, levels = levels(feature), ordered = FALSE) @@ -198,6 +200,8 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner", feature = round(feature) } if (type == "logical") feature = as.logical(feature) # FIXME mlr-org/mlr3#475 + if (type == "POSIXct") feature = as.POSIXct(feature) + if (type == "Date") feature = as.Date(feature) auto_convert(feature, "feature to be imputed", type, levels = levels(feature)) }, .additional_phash_input = function() private$.learner$phash diff --git a/R/PipeOpImputeMean.R b/R/PipeOpImputeMean.R index 90274516a..2c3104719 100644 --- a/R/PipeOpImputeMean.R +++ b/R/PipeOpImputeMean.R @@ -1,11 +1,11 @@ -#' @title Impute Numerical Features by their Mean +#' @title Impute Numeric, Integer, POSIXct or Date 
Features by their Mean #' #' @usage NULL #' @name mlr_pipeops_imputemean #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpImpute`]/[`PipeOp`]. #' #' @description -#' Impute numerical features by their mean. +#' Impute numeric, integer, POSIXct or Date features by their mean. #' #' @section Construction: #' ``` @@ -20,12 +20,12 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpImpute`]. #' -#' The output is the input [`Task`][mlr3::Task] with all affected numeric features missing values imputed by (column-wise) mean. +#' The output is the input [`Task`][mlr3::Task] with all affected numeric, integer, POSIXct and Date features missing values imputed by (column-wise) mean. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpImpute`]. #' -#' The `$state$model` is a named `list` of `numeric(1)` indicating the mean of the respective feature. +#' The `$state$model` is a named `list` of either `numeric(1)`, `integer(1)`, `POSIXct(1)` or `Date(1)` indicating the mean of the respective feature. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpImpute`]. 
@@ -59,7 +59,7 @@ PipeOpImputeMean = R6Class("PipeOpImputeMean", inherit = PipeOpImpute, public = list( initialize = function(id = "imputemean", param_vals = list()) { - super$initialize(id, param_vals = param_vals, feature_types= c("numeric", "integer")) + super$initialize(id, param_vals = param_vals, feature_types= c("numeric", "integer", "POSIXct", "Date")) } ), private = list( diff --git a/R/PipeOpImputeMedian.R b/R/PipeOpImputeMedian.R index 0113b3565..5ce447289 100644 --- a/R/PipeOpImputeMedian.R +++ b/R/PipeOpImputeMedian.R @@ -1,11 +1,11 @@ -#' @title Impute Numerical Features by their Median +#' @title Impute Numeric, Integer, POSIXct or Date Features by their Median #' #' @usage NULL #' @name mlr_pipeops_imputemedian #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpImpute`]/[`PipeOp`]. #' #' @description -#' Impute numerical features by their median. +#' Impute numeric, integer, POSIXct or Date features by their median. #' #' @section Construction: #' ``` @@ -20,12 +20,12 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpImpute`]. #' -#' The output is the input [`Task`][mlr3::Task] with all affected numeric features missing values imputed by (column-wise) median. +#' The output is the input [`Task`][mlr3::Task] with all affected numeric, integer, POSIXct and Date features missing values imputed by (column-wise) median. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpImpute`]. #' -#' The `$state$model` is a named `list` of `numeric(1)` indicating the median of the respective feature. +#' The `$state$model` is a named `list` of `numeric(1)`, `integer(1)`, `POSIXct(1)` or `Date(1)` indicating the median of the respective feature. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpImpute`]. 
@@ -59,7 +59,7 @@ PipeOpImputeMedian = R6Class("PipeOpImputeMedian", inherit = PipeOpImpute, public = list( initialize = function(id = "imputemedian", param_vals = list()) { - super$initialize(id, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer")) + super$initialize(id, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer", "POSIXct", "Date")) } ), private = list( diff --git a/R/PipeOpImputeMode.R b/R/PipeOpImputeMode.R index bdef94cde..bbe3d198b 100644 --- a/R/PipeOpImputeMode.R +++ b/R/PipeOpImputeMode.R @@ -5,7 +5,7 @@ #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpImpute`]/[`PipeOp`]. #' #' @description -#' Impute features by their mode. Supports factors as well as logical and numerical features. +#' Impute features by their mode. Supports factors, logical, numerical, POSIXct and Date features. #' If multiple modes are present then imputed values are sampled randomly from them. #' #' @section Construction: @@ -66,7 +66,7 @@ PipeOpImputeMode = R6Class("PipeOpImputeMode", inherit = PipeOpImpute, public = list( initialize = function(id = "imputemode", param_vals = list()) { - super$initialize(id, param_vals = param_vals, feature_types = c("factor", "integer", "logical", "numeric", "ordered")) + super$initialize(id, param_vals = param_vals, feature_types = c("factor", "integer", "logical", "numeric", "ordered", "POSIXct", "Date")) } ), private = list( diff --git a/R/PipeOpImputeOOR.R b/R/PipeOpImputeOOR.R index f1037cc51..c1d01b3d4 100644 --- a/R/PipeOpImputeOOR.R +++ b/R/PipeOpImputeOOR.R @@ -7,7 +7,7 @@ #' @description #' Impute factorial features by adding a new level `".MISSING"`. 
#' -#' Impute numerical features by constant values shifted below the minimum or above the maximum by +#' Impute numeric, integer, POSIXct or Date features by constant values shifted below the minimum or above the maximum by #' using \eqn{min(x) - offset - multiplier * diff(range(x))} or #' \eqn{max(x) + offset + multiplier * diff(range(x))}. #' @@ -51,7 +51,7 @@ #' #' The `$state$model` contains either `".MISSING"` used for `character` and `factor` (also #' `ordered`) features or `numeric(1)` indicating the constant value used for imputation of -#' `integer` and `numeric` features. +#' `integer`, `numeric`, `POSIXct` or `Date` features. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpImpute`], as well as: @@ -59,17 +59,17 @@ #' Should `integer` and `numeric` features be shifted below the minimum? Initialized to `TRUE`. If `FALSE` #' they are shifted above the maximum. See also the description above. #' * `offset` :: `numeric(1)` \cr -#' Numerical non-negative offset as used in the description above for `integer` and `numeric` +#' Numerical non-negative offset as used in the description above for `integer`, `numeric`, `POSIXct` and `Date` #' features. Initialized to `1`. #' * `multiplier` :: `numeric(1)` \cr -#' Numerical non-negative multiplier as used in the description above for `integer` and `numeric` +#' Numerical non-negative multiplier as used in the description above for `integer`, `numeric`, `POSIXct` and `Date` #' features. Initialized to `1`. #' #' @section Internals: #' Adds an explicit new `level()` to `factor` and `ordered` features, but not to `character` features. #' For `integer` and `numeric` features uses the `min`, `max`, `diff` and `range` functions. #' `integer` and `numeric` features that are entirely `NA` are imputed as `0`. `factor` and `ordered` features that are -#' entirely `NA` are imputed as `".MISSING"`. +#' entirely `NA` are imputed as `".MISSING"`. 
For `POSIXct` and `Date` features the value `0` is transformed into the respective data type. #' #' @section Fields: #' Only fields inherited from [`PipeOp`]. @@ -119,7 +119,7 @@ PipeOpImputeOOR = R6Class("PipeOpImputeOOR", ) # this is one of the few imputers that handles 'character' features! super$initialize(id, param_set = ps, param_vals = param_vals, empty_level_control = "param", - feature_types = c("character", "factor", "integer", "numeric", "ordered")) + feature_types = c("character", "factor", "integer", "numeric", "ordered", "POSIXct", "Date")) } ), private = list( @@ -153,10 +153,13 @@ PipeOpImputeOOR = R6Class("PipeOpImputeOOR", logical = c(TRUE, FALSE), numeric = 0, ordered = ".MISSING", - character = "" + character = "", + POSIXct = as.POSIXct(0), + Date = as.Date(0) ) } ) ) mlr_pipeops$add("imputeoor", PipeOpImputeOOR) + diff --git a/R/PipeOpImputeSample.R b/R/PipeOpImputeSample.R index 214998728..b7e87464a 100644 --- a/R/PipeOpImputeSample.R +++ b/R/PipeOpImputeSample.R @@ -33,8 +33,8 @@ #' @section Internals: #' Uses the `sample()` function. Features that are entirely `NA` are imputed as #' the following: For `factor` or `ordered`, random levels are sampled uniformly at random. -#' For logicals, `TRUE` or `FALSE` are sampled uniformly at random. -#' Numerics and integers are imputed as `0`. +#' For `logical`, `TRUE` or `FALSE` are sampled uniformly at random. +#' `numeric` and `integer` are imputed as `0`. #' #' @section Fields: #' Only fields inherited from [`PipeOp`]. 
@@ -61,7 +61,7 @@ PipeOpImputeSample = R6Class("PipeOpImputeSample", inherit = PipeOpImpute, public = list( initialize = function(id = "imputesample", param_vals = list()) { - super$initialize(id, param_vals = param_vals, feature_types = c("factor", "integer", "logical", "numeric", "ordered")) + super$initialize(id, param_vals = param_vals, feature_types = c("factor", "integer", "logical", "numeric", "ordered", "POSIXct", "Date")) } ), private = list( diff --git a/R/PipeOpTaskPreproc.R b/R/PipeOpTaskPreproc.R index 8e6e0b584..3dbac0f70 100644 --- a/R/PipeOpTaskPreproc.R +++ b/R/PipeOpTaskPreproc.R @@ -59,7 +59,7 @@ #' `"TaskRegr"` (or another subclass introduced by other packages). Default is `"Task"`. #' * `tags` :: `character` | `NULL`\cr #' Tags of the resulting `PipeOp`. This is added to the tag `"data transform"`. Default `NULL`. -#'* `feature_types` :: `character`\cr +#' * `feature_types` :: `character`\cr #' Feature types affected by the `PipeOp`. See `private$.select_cols()` for more information. #' Defaults to all available feature types. 
#' diff --git a/man/mlr3pipelines-package.Rd b/man/mlr3pipelines-package.Rd index dfdffc8db..3561fabe8 100644 --- a/man/mlr3pipelines-package.Rd +++ b/man/mlr3pipelines-package.Rd @@ -37,6 +37,7 @@ Other contributors: \item Keno Mersmann \email{keno.mersmann@gmail.com} [contributor] \item Maximilian Mücke \email{muecke.maximilian@gmail.com} (\href{https://orcid.org/0009-0000-9432-9795}{ORCID}) [contributor] \item Lona Koers \email{lona.koers@gmail.com} [contributor] + \item Alexander Winterstetter \email{alexander.winterstetter@gmail.com} [contributor] } } diff --git a/man/mlr_pipeops_imputehist.Rd b/man/mlr_pipeops_imputehist.Rd index a53263e3e..0c63a400b 100644 --- a/man/mlr_pipeops_imputehist.Rd +++ b/man/mlr_pipeops_imputehist.Rd @@ -3,12 +3,12 @@ \name{mlr_pipeops_imputehist} \alias{mlr_pipeops_imputehist} \alias{PipeOpImputeHist} -\title{Impute Numerical Features by Histogram} +\title{Impute Numeric, Integer, POSIXct or Date Features by Histogram} \format{ \code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpImpute}}/\code{\link{PipeOp}}. } \description{ -Impute numerical features by histogram. +Impute numeric, integer, POSIXct or Date features by histogram. During training, a histogram is fitted on each column using R's \code{\link[graphics:hist]{hist()}} function. The fitted histogram is then sampled from for imputation. Sampling happens in a two-step process: @@ -34,7 +34,7 @@ List of hyperparameter settings, overwriting the hyperparameter settings that wo Input and output channels are inherited from \code{\link{PipeOpImpute}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric features missing values imputed by (column-wise) histogram; see Description for details. +The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric, integer, POSIXct or Date features missing values imputed by (column-wise) histogram; see Description for details. 
} \section{State}{ diff --git a/man/mlr_pipeops_imputelearner.Rd b/man/mlr_pipeops_imputelearner.Rd index fb7c920a9..2c0e21b74 100644 --- a/man/mlr_pipeops_imputelearner.Rd +++ b/man/mlr_pipeops_imputelearner.Rd @@ -13,7 +13,7 @@ Uses the features indicated by the \code{context_columns} parameter as features Note this parameter is part of the \code{\link{PipeOpImpute}} base class and explained there. Additionally, only features supported by the learner can be imputed; i.e. learners of type -\code{regr} can only impute features of type \code{integer} and \code{numeric}, while \code{classif} can impute +\code{regr} can only impute features of type \code{integer}, \code{numeric}, \code{POSIXct} and \code{Date}, while \code{classif} can impute features of type \code{factor}, \code{ordered} and \code{logical}. The \code{\link[mlr3:Learner]{Learner}} used for imputation is trained on all \code{context_columns}; if these contain missing values, diff --git a/man/mlr_pipeops_imputemean.Rd b/man/mlr_pipeops_imputemean.Rd index a4b33fa0f..3be09d554 100644 --- a/man/mlr_pipeops_imputemean.Rd +++ b/man/mlr_pipeops_imputemean.Rd @@ -3,12 +3,12 @@ \name{mlr_pipeops_imputemean} \alias{mlr_pipeops_imputemean} \alias{PipeOpImputeMean} -\title{Impute Numerical Features by their Mean} +\title{Impute Numeric, Integer, POSIXct or Date Features by their Mean} \format{ \code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpImpute}}/\code{\link{PipeOp}}. } \description{ -Impute numerical features by their mean. +Impute numeric, integer, POSIXct or Date features by their mean. } \section{Construction}{ @@ -27,14 +27,14 @@ List of hyperparameter settings, overwriting the hyperparameter settings that wo Input and output channels are inherited from \code{\link{PipeOpImpute}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric features missing values imputed by (column-wise) mean. 
+The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric, integer, POSIXct and Date features missing values imputed by (column-wise) mean. } \section{State}{ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpImpute}}. -The \verb{$state$model} is a named \code{list} of \code{numeric(1)} indicating the mean of the respective feature. +The \verb{$state$model} is a named \code{list} of either \code{numeric(1)}, \code{integer(1)}, \code{POSIXct(1)} or \code{Date(1)} indicating the mean of the respective feature. } \section{Parameters}{ diff --git a/man/mlr_pipeops_imputemedian.Rd b/man/mlr_pipeops_imputemedian.Rd index 8655e1ecc..d5eee5abc 100644 --- a/man/mlr_pipeops_imputemedian.Rd +++ b/man/mlr_pipeops_imputemedian.Rd @@ -3,12 +3,12 @@ \name{mlr_pipeops_imputemedian} \alias{mlr_pipeops_imputemedian} \alias{PipeOpImputeMedian} -\title{Impute Numerical Features by their Median} +\title{Impute Numeric, Integer, POSIXct or Date Features by their Median} \format{ \code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpImpute}}/\code{\link{PipeOp}}. } \description{ -Impute numerical features by their median. +Impute numeric, integer, POSIXct or Date features by their median. } \section{Construction}{ @@ -27,14 +27,14 @@ List of hyperparameter settings, overwriting the hyperparameter settings that wo Input and output channels are inherited from \code{\link{PipeOpImpute}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric features missing values imputed by (column-wise) median. +The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric, integer, POSIXct and Date features missing values imputed by (column-wise) median. } \section{State}{ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpImpute}}. 
-The \verb{$state$model} is a named \code{list} of \code{numeric(1)} indicating the median of the respective feature. +The \verb{$state$model} is a named \code{list} of \code{numeric(1)}, \code{integer(1)}, \code{POSIXct(1)} or \code{Date(1)} indicating the median of the respective feature. } \section{Parameters}{ diff --git a/man/mlr_pipeops_imputemode.Rd b/man/mlr_pipeops_imputemode.Rd index b35b83b13..b199fc7d6 100644 --- a/man/mlr_pipeops_imputemode.Rd +++ b/man/mlr_pipeops_imputemode.Rd @@ -8,7 +8,7 @@ \code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpImpute}}/\code{\link{PipeOp}}. } \description{ -Impute features by their mode. Supports factors as well as logical and numerical features. +Impute features by their mode. Supports factors, logical, numerical, POSIXct and Date features. If multiple modes are present then imputed values are sampled randomly from them. } \section{Construction}{ diff --git a/man/mlr_pipeops_imputeoor.Rd b/man/mlr_pipeops_imputeoor.Rd index 187c23acf..9585e3a6c 100644 --- a/man/mlr_pipeops_imputeoor.Rd +++ b/man/mlr_pipeops_imputeoor.Rd @@ -10,7 +10,7 @@ \description{ Impute factorial features by adding a new level \code{".MISSING"}. -Impute numerical features by constant values shifted below the minimum or above the maximum by +Impute numeric, integer, POSIXct or Date features by constant values shifted below the minimum or above the maximum by using \eqn{min(x) - offset - multiplier * diff(range(x))} or \eqn{max(x) + offset + multiplier * diff(range(x))}. @@ -60,7 +60,7 @@ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherit The \verb{$state$model} contains either \code{".MISSING"} used for \code{character} and \code{factor} (also \code{ordered}) features or \code{numeric(1)} indicating the constant value used for imputation of -\code{integer} and \code{numeric} features. +\code{integer}, \code{numeric}, \code{POSIXct} or \code{Date} features. 
} \section{Parameters}{ @@ -71,10 +71,10 @@ The parameters are the parameters inherited from \code{\link{PipeOpImpute}}, as Should \code{integer} and \code{numeric} features be shifted below the minimum? Initialized to \code{TRUE}. If \code{FALSE} they are shifted above the maximum. See also the description above. \item \code{offset} :: \code{numeric(1)} \cr -Numerical non-negative offset as used in the description above for \code{integer} and \code{numeric} +Numerical non-negative offset as used in the description above for \code{integer}, \code{numeric}, \code{POSIXct} and \code{Date} features. Initialized to \code{1}. \item \code{multiplier} :: \code{numeric(1)} \cr -Numerical non-negative multiplier as used in the description above for \code{integer} and \code{numeric} +Numerical non-negative multiplier as used in the description above for \code{integer}, \code{numeric}, \code{POSIXct} and \code{Date} features. Initialized to \code{1}. } } @@ -84,7 +84,7 @@ features. Initialized to \code{1}. Adds an explicit new \code{level()} to \code{factor} and \code{ordered} features, but not to \code{character} features. For \code{integer} and \code{numeric} features uses the \code{min}, \code{max}, \code{diff} and \code{range} functions. \code{integer} and \code{numeric} features that are entirely \code{NA} are imputed as \code{0}. \code{factor} and \code{ordered} features that are -entirely \code{NA} are imputed as \code{".MISSING"}. +entirely \code{NA} are imputed as \code{".MISSING"}. For \code{POSIXct} and \code{Date} features the value \code{0} is transformed into the respective data type. } \section{Fields}{ diff --git a/man/mlr_pipeops_imputesample.Rd b/man/mlr_pipeops_imputesample.Rd index e80476851..df73d77e8 100644 --- a/man/mlr_pipeops_imputesample.Rd +++ b/man/mlr_pipeops_imputesample.Rd @@ -46,8 +46,8 @@ The parameters are the parameters inherited from \code{\link{PipeOpImpute}}. Uses the \code{sample()} function. 
Features that are entirely \code{NA} are imputed as the following: For \code{factor} or \code{ordered}, random levels are sampled uniformly at random. -For logicals, \code{TRUE} or \code{FALSE} are sampled uniformly at random. -Numerics and integers are imputed as \code{0}. +For \code{logical}, \code{TRUE} or \code{FALSE} are sampled uniformly at random. +\code{numeric} and \code{integer} are imputed as \code{0}. } \section{Fields}{ diff --git a/tests/testthat/test_pipeop_impute.R b/tests/testthat/test_pipeop_impute.R index 95023a1bf..b6a046207 100644 --- a/tests/testthat/test_pipeop_impute.R +++ b/tests/testthat/test_pipeop_impute.R @@ -12,20 +12,22 @@ test_that("PipeOpImpute", { ps = ps( method_num = p_fct(c("median", "mean", "mode", "sample", "hist", "oor", "constant"), tags = c("train", "predict")), method_fct = p_fct(c("oor", "sample", "mode", "constant"), tags = c("train", "predict")), + method_pxc = p_fct(c("median", "mean", "mode", "sample", "hist", "oor", "constant"), tags = c("train", "predict")), + method_dte = p_fct(c("median", "mean", "mode", "sample", "hist", "oor", "constant"), tags = c("train", "predict")), add_dummy = p_fct(c("none", "missing_train", "all"), tags = c("train", "predict")), innum = p_uty(tags = c("train", "predict")) ) - ps$values = list(method_num = "median", method_fct = "oor", add_dummy = "missing_train") + ps$values = list(method_num = "median", method_fct = "oor", method_pxc = "median", method_dte = "median", add_dummy = "missing_train") super$initialize(id, ps, param_vals = param_vals) }, build_graph = function() { numimputer = switch(self$param_set$values$method_num, - median = po("imputemedian"), - mean = po("imputemean"), + median = po("imputemedian", id = "num_median"), + mean = po("imputemean", id = "num_mean"), mode = po("imputemode", id = "num_mode"), sample = po("imputesample", id = "num_sample"), - hist = po("imputehist"), + hist = po("imputehist", id = "num_hist"), constant = po("imputeconstant", id = "num_constant", 
param_vals = list(constant = -999)), oor = po("imputeoor", id = "num_oor")) fctimputer = switch(self$param_set$values$method_fct, @@ -33,6 +35,22 @@ test_that("PipeOpImpute", { sample = po("imputesample", id = "fct_sample"), mode = po("imputemode", id = "fct_mode"), constant = po("imputeconstant", id = "fct_constant", param_vals = list(constant = ".MISSING", check_levels = FALSE))) + pxcimputer = switch(self$param_set$values$method_pxc, + median = po("imputemedian", id = "pxc_median"), + mean = po("imputemean", id = "pxc_mean"), + mode = po("imputemode", id = "pxc_mode"), + sample = po("imputesample", id = "pxc_sample"), + hist = po("imputehist", id = "pxc_hist"), + constant = po("imputeconstant", id = "pxc_constant", param_vals = list(constant = as.POSIXct(0))), + oor = po("imputeoor", id = "pxc_oor")) + dteimputer = switch(self$param_set$values$method_dte, + median = po("imputemedian", id = "dte_median"), + mean = po("imputemean", id = "dte_mean"), + mode = po("imputemode", id = "dte_mode"), + sample = po("imputesample", id = "dte_sample"), + hist = po("imputehist", id = "dte_hist"), + constant = po("imputeconstant", id = "dte_constant", param_vals = list(constant = as.Date(0))), + oor = po("imputeoor", id = "dte_oor")) if (self$param_set$values$add_dummy == "none") { dummyselector = selector_none() @@ -45,6 +63,8 @@ test_that("PipeOpImpute", { graph = list( po("select", id = "num_select", selector = selector_type(c("integer", "numeric"))) %>>% numimputer, po("select", id = "fct_select", selector = selector_type(c("factor", "ordered"))) %>>% fctimputer, + po("select", id = "pxc_select", selector = selector_type(c("POSIXct"))) %>>% pxcimputer, + po("select", id = "dte_select", selector = selector_type(c("Date"))) %>>% dteimputer, po("select", id = "lgl_select", selector = selector_type("logical")) %>>% po("imputesample", id = "lgl_sample"), po("select", id = "chr_select", selector = selector_type("character")) %>>% po("imputeconstant", id = "chr_const", constant = 
".MISSING"), po("select", id = "dummyselector", selector = dummyselector) %>>% po("missind", type = "logical", affect_columns = NULL, @@ -78,12 +98,9 @@ test_that("PipeOpImpute", { task = mlr_tasks$get("pima") + expect_datapreproc_pipeop_class(PipeOpTestImpute, constargs = list(param_vals = list(innum = c("a", "b", "c", "d", "e", "f", "g"))), task = task) - expect_datapreproc_pipeop_class(PipeOpTestImpute, constargs = list(param_vals = list(innum = c("a", "b", "c", "d", "e"))), task = task) - - expect_datapreproc_pipeop_class(PipeOpTestImpute, constargs = list(param_vals = list(innum = c("a", "b", "c", "d", "e"))), task = mlr_tasks$get("iris")) - - + expect_datapreproc_pipeop_class(PipeOpTestImpute, constargs = list(param_vals = list(innum = c("a", "b", "c", "d", "e", "f", "g"))), task = mlr_tasks$get("iris")) mdata = data.frame(stringsAsFactors = FALSE, a = c(1, 2, 3, 4, 5, NA), @@ -98,7 +115,11 @@ test_that("PipeOpImpute", { j = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE), k = c(TRUE, FALSE, TRUE, FALSE, TRUE, NA), l = factor(letters[rep(1:2, 3)]), - m = c(-.Machine$integer.max, -10000000L, 0L, 10000000L, .Machine$integer.max, NA) + m = c(-.Machine$integer.max, -10000000L, 0L, 10000000L, .Machine$integer.max, NA), + n = as.POSIXct(1:6), + o = c(as.POSIXct(1:5), NA), + p = as.Date(1:6), + q = c(as.Date(1:5), NA) ) task = TaskClassif$new("mdata", as_data_backend(mdata), target = "l") @@ -109,6 +130,8 @@ test_that("PipeOpImpute", { expect_datapreproc_pipeop_class(PipeOpTestImpute, task = task_no_lgl, constargs = list(param_vals = list( method_num = "median", + method_pxc = "oor", + method_dte = "oor", method_fct = "oor", add_dummy = "none"))) @@ -188,11 +211,11 @@ test_that("PipeOpImpute", { task_predicted = po$predict(list(task))[[1]]$data() - expect_equal(task_trained[1, c("a", "c", "d", "f", "k", "m")], - task_trained[2, c("a", "c", "d", "f", "k", "m")]) + expect_equal(task_trained[1, c("a", "c", "d", "f", "k", "m", "o", "q")], + task_trained[2, c("a", "c", "d", 
"f", "k", "m", "o", "q")]) - expect_equal(task_predicted[c(5:6), c("a", "c", "d", "f", "k", "m")], - task_trained[c(1:2), c("a", "c", "d", "f", "k", "m")]) + expect_equal(task_predicted[c(5:6), c("a", "c", "d", "f", "k", "m", "o", "q")], + task_trained[c(1:2), c("a", "c", "d", "f", "k", "m", "o", "q")]) expect_equal(task_trained$missing_a, c(FALSE, TRUE)) expect_equal(task_trained$missing_c, c(FALSE, TRUE)) @@ -201,30 +224,34 @@ test_that("PipeOpImpute", { expect_equal(task_trained$missing_h, c(FALSE, TRUE)) expect_equal(task_trained$missing_k, c(FALSE, TRUE)) expect_equal(task_trained$missing_m, c(FALSE, TRUE)) + expect_equal(task_trained$missing_o, c(FALSE, TRUE)) + expect_equal(task_trained$missing_q, c(FALSE, TRUE)) expect_equal(task_trained$missing_b, c(FALSE, FALSE)) expect_equal(task_trained$missing_e, c(FALSE, FALSE)) expect_equal(task_trained$missing_g, c(FALSE, FALSE)) expect_equal(task_trained$missing_i, c(FALSE, FALSE)) expect_equal(task_trained$missing_j, c(FALSE, FALSE)) + expect_equal(task_trained$missing_n, c(FALSE, FALSE)) + expect_equal(task_trained$missing_p, c(FALSE, FALSE)) - expect_set_equal(colnames(task_trained), c(letters[1:13], paste0("missing_", letters[c(1:11, 13)]))) - expect_set_equal(colnames(task_predicted), c(letters[1:13], paste0("missing_", letters[c(1:11, 13)]))) + expect_set_equal(colnames(task_trained), c(letters[1:17], paste0("missing_", letters[c(1:11, 13:17)]))) + expect_set_equal(colnames(task_predicted), c(letters[1:17], paste0("missing_", letters[c(1:11, 13:17)]))) po = PipeOpTestImpute$new(param_vals = list( - method_num = "median", method_fct = "oor", add_dummy = "all")) + method_num = "median", method_pxc = "median", method_dte = "median", method_fct = "oor", add_dummy = "all")) task_trained = po$train(list(task$clone(deep = TRUE)$filter(5:6)))[[1]]$data() task_predicted = po$predict(list(task))[[1]]$data() - expect_equal(task_trained[1, c("a", "c", "k", "m")], - task_trained[2, c("a", "c", "k", "m")]) + 
expect_equal(task_trained[1, c("a", "c", "k", "m", "o", "q")], + task_trained[2, c("a", "c", "k", "m", "o", "q")]) expect_equal(task_predicted[5:6, ], task_trained[1:2]) - expect_set_equal(colnames(task_trained), c(letters[1:13], paste0("missing_", c("a", "b", "c", "j", "k", "m")))) - expect_set_equal(colnames(task_predicted), c(letters[1:13], paste0("missing_", c("a", "b", "c", "j", "k", "m")))) + expect_set_equal(colnames(task_trained), c(letters[1:17], paste0("missing_", c("a", "b", "c", "j", "k", "m", "n", "o", "p", "q")))) + expect_set_equal(colnames(task_predicted), c(letters[1:17], paste0("missing_", c("a", "b", "c", "j", "k", "m", "n", "o", "p", "q")))) expect_equal(task_trained$d[2], factor(".MISSING", levels = c(letters[1:6], ".MISSING"))) expect_equal(task_trained$h[2], ".MISSING") @@ -235,8 +262,8 @@ test_that("PipeOpImpute", { task_trained = po$train(list(task$clone(deep = TRUE)$filter(5:6)))[[1]]$data() task_predicted = po$predict(list(task$clone(deep = TRUE)$filter(1:3)))[[1]]$data() - expect_set_equal(colnames(task_trained), c(letters[1:13], paste0("missing_", c("a", "c", "k", "m")))) - expect_set_equal(colnames(task_predicted), c(letters[1:13], paste0("missing_", c("a", "c", "k", "m")))) + expect_set_equal(colnames(task_trained), c(letters[1:17], paste0("missing_", c("a", "c", "k", "m", "o", "q")))) + expect_set_equal(colnames(task_predicted), c(letters[1:17], paste0("missing_", c("a", "c", "k", "m", "o", "q")))) po = PipeOpTestImpute$new(param_vals = list( method_num = "median", method_fct = "oor", add_dummy = "none")) @@ -262,7 +289,6 @@ test_that("PipeOpImpute", { # impute full na columns: po = PipeOpTestImpute$new(param_vals = list(method_num = "median", method_fct = "oor")) - mdata = data.table( stringsAsFactors = FALSE, a = as.numeric(rep(NA, 3)), @@ -271,6 +297,8 @@ test_that("PipeOpImpute", { d = factor(rep(NA, 3), ordered = TRUE, levels = "a"), e = as.logical(rep(NA, 3)), f = as.character(rep(NA, 3)), + g = as.POSIXct(rep(NA, 3)), + h = 
as.Date(rep(NA, 3)), t = as.factor(letters[rep(1:2, 3)]) ) task = TaskClassif$new("mdata", as_data_backend(mdata), target = "t") @@ -283,7 +311,9 @@ test_that("PipeOpImpute", { logical = c(TRUE, FALSE), numeric = 0, ordered = ".MISSING", - character = ".MISSING" + character = ".MISSING", + POSIXct = as.POSIXct(0), + Date = as.Date(0) ) out1 = po$train(list(task))[[1]]$data() out2 = po$predict(list(task))[[1]]$data() @@ -298,7 +328,8 @@ test_that("PipeOpImpute", { test_that("More tests for PipeOpImputeMode", { set.seed(1) dat = data.frame(y = rnorm(10L), x1 = as.character(1L:10L), x2 = rnorm(10L), x3 = factor(rep(c(1L, 2L), each = 5L)), - x4 = ordered(rep(1L:5L, times = 2L)), x5 = 1L:10L, x6 = rep(c(TRUE, FALSE), times = 5L), stringsAsFactors = FALSE) + x4 = ordered(rep(1L:5L, times = 2L)), x5 = 1L:10L, x6 = rep(c(TRUE, FALSE), times = 5L), + x7 = as.POSIXct(1L:10L), x8 = as.Date(1L:10L), stringsAsFactors = FALSE) dat[c(1L, 10L), ] = NA task = TaskRegr$new("task", backend = dat, target = "y") @@ -314,7 +345,7 @@ test_that("More tests for PipeOpImputeMode", { expect_false(anyNA(task_NA_trained[[5L]])) expect_equivalent(sapply(po_NA$state$model, FUN = function(x) class(x)[1L]), - c("numeric", "character", "character", "integer", "logical")) + c("numeric", "character", "character", "integer", "logical", "POSIXct", "Date")) task_NA_predicted = po_NA$predict(list(task_NA))[[1L]]$data() expect_equal(levels(task_NA_predicted[[4L]]), as.character(1:2)) @@ -327,7 +358,7 @@ test_that("More tests for PipeOpImputeConstant", { set.seed(1) dat = data.frame(y = rnorm(10L), x1 = as.character(1L:10L), x2 = rnorm(10L), x3 = factor(rep(c(1L, 2L), each = 5L)), x4 = ordered(rep(1L:5L, times = 2L)), x5 = 1L:10L, x6 = rep(c(TRUE, FALSE), times = 5L), - x7 = as.POSIXct(1L:10L, origin = "1960-01-01", tz = "GMT"), stringsAsFactors = FALSE) + x7 = as.POSIXct(1L:10L), x8 = as.Date(1L:10L), stringsAsFactors = FALSE) dat[1L, ] = NA task = TaskRegr$new("task", backend = dat, target = "y") @@ 
-340,13 +371,13 @@ test_that("More tests for PipeOpImputeConstant", { train_out = po$train(list(task))[[1L]] expect_equal(train_out$feature_types, task$feature_types) - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(train_out$data(cols = "x1")[[1L]][1L], "test") po$param_set$values = list(constant = -999, check_levels = TRUE, affect_columns = selector_type("numeric")) train_out = po$train(list(task))[[1L]] expect_equal(train_out$feature_types, task$feature_types) - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(train_out$data(cols = "x2")[[1L]][1L], -999) po$param_set$values = list(constant = "test", check_levels = TRUE, affect_columns = selector_type("factor")) @@ -354,7 +385,7 @@ test_that("More tests for PipeOpImputeConstant", { po$param_set$values$check_levels = FALSE train_out = po$train(list(task))[[1L]] expect_equal(train_out$feature_types, task$feature_types) - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(po$train(list(task))[[1L]]$data(cols = "x3")[[1L]][1L], factor("test", levels = c("1", "2", "test"))) po$param_set$values$constant = factor("test", levels = c("test", "another")) expect_equal(po$train(list(task))[[1L]]$data(cols = "x3")[[1L]][1L], factor("test", levels = c("1", "2", "test"))) @@ -364,7 +395,7 @@ test_that("More tests for PipeOpImputeConstant", { po$param_set$values$check_levels = FALSE train_out = po$train(list(task))[[1L]] expect_equal(train_out$feature_types, task$feature_types) - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(po$train(list(task))[[1L]]$data(cols = "x4")[[1L]][1L], ordered("test", levels = c("1", "2", "3", "4", "5", "test"))) po$param_set$values$constant = factor("test", levels = c("test", "another")) expect_equal(po$train(list(task))[[1L]]$data(cols = "x4")[[1L]][1L], ordered("test", levels = 
c("1", "2", "3", "4", "5", "test"))) @@ -372,32 +403,42 @@ test_that("More tests for PipeOpImputeConstant", { po$param_set$values = list(constant = -999, check_levels = TRUE, affect_columns = selector_type("integer")) train_out = po$train(list(task))[[1L]] expect_equal(train_out$feature_types, task$feature_types) - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(train_out$data(cols = "x5")[[1L]][1L], -999) po$param_set$values = list(constant = TRUE, check_levels = TRUE, affect_columns = selector_type("logical")) train_out = po$train(list(task))[[1L]] expect_equal(train_out$feature_types, task$feature_types) - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(train_out$data(cols = "x6")[[1L]][1L], TRUE) - pos_impute = as.POSIXct(1000000, origin = "1960-01-01", tz = "GMT") + pos_impute = as.POSIXct(1000000) po$param_set$values = list(constant = pos_impute, check_levels = TRUE, affect_columns = selector_type("POSIXct")) train_out = po$train(list(task))[[1L]] - expect_equal(sum(train_out$missings()), 7L) + expect_equal(sum(train_out$missings()), 8L) expect_equal(train_out$data(cols = "x7")[[1L]][1L], pos_impute) + po$param_set$values = list(constant = "1970-01-11 10:39:22 CET", check_levels = TRUE, affect_columns = selector_type("POSIXct")) + expect_error(po$train(list(task))) + + pos_impute = as.Date(1000000) + po$param_set$values = list(constant = pos_impute, check_levels = TRUE, affect_columns = selector_type("Date")) + train_out = po$train(list(task))[[1L]] + expect_equal(sum(train_out$missings()), 8L) + expect_equal(train_out$data(cols = "x8")[[1L]][1L], pos_impute) + po$param_set$values = list(constant = "1999-12-31", check_levels = TRUE, affect_columns = selector_type("Date")) + expect_error(po$train(list(task))) }) test_that("More tests for Integers", { - data <- data.table(x = c(-.Machine$integer.max, -10000000L, 0L, 10000000L, .Machine$integer.max, 
rep(NA, 1001)), t = 1:1006) + data = data.table(x = c(-.Machine$integer.max, -10000000L, 0L, 10000000L, .Machine$integer.max, rep(NA, 1001)), t = 1:1006) task = TaskRegr$new("task", backend = data, target = "t") pos = list(PipeOpImputeHist$new(), PipeOpImputeMean$new(), PipeOpImputeSample$new(), PipeOpImputeMedian$new(), PipeOpImputeMode$new(), PipeOpImputeOOR$new()) for (po in pos) { - result <- po$train(list(task))[[1]] + result = po$train(list(task))[[1]] expect_integer(result$data()$x, info = po$id) expect_false(anyNA(result$data()$x), info = po$id) @@ -590,14 +631,16 @@ test_that("'empty_level_control' in POImputeOOR and POImputeConstant", { # PipeOpImputeConstant - # Add logical and POSIXct features + # Add logical, POSIXct and Date features task_train$cbind(data.table( lgl = c(TRUE, FALSE, NA), - pxc = as.POSIXct(c("2025/01/01", "2025/02/02", NA)) + pxc = as.POSIXct(c("2025/01/01", "2025/02/02", NA)), + dte = as.Date(c("2025/01/01", "2025/02/02", NA)) )) task_pred$cbind(data.table( lgl = c(TRUE, FALSE, NA), - pxc = as.POSIXct(c("2025/01/01", "2025/02/02", NA)) + pxc = as.POSIXct(c("2025/01/01", "2025/02/02", NA)), + dte = as.Date(c("2025/01/01", "2025/02/02", NA)) )) # Also test that other types still behave as expected @@ -653,6 +696,17 @@ test_that("'empty_level_control' in POImputeOOR and POImputeConstant", { predict_out = op$predict(list(task_pred))[[1L]] expect_identical(predict_out$data(cols = names(dt_out)), dt_out) + # Type: Date + op$param_set$set_values(constant = as.Date("2024/01/01"), affect_columns = selector_type("Date")) + + dt_out = data.table( + target = factor(c("a", "b", "a")), + dte = as.Date(c("2025/01/01", "2025/02/02", "2024/01/01")) + ) + train_out = op$train(list(task_train))[[1L]] + expect_identical(train_out$data(cols = names(dt_out)), dt_out) + predict_out = op$predict(list(task_pred))[[1L]] + expect_identical(predict_out$data(cols = names(dt_out)), dt_out) }) test_that("PipeOpImputeSample - impute missings for unseen factor 
levels", { @@ -680,3 +734,4 @@ test_that("PipeOpImputeSample - impute missings for unseen factor levels", { expect_no_error(glrn$predict(task_NA)) }) +