From b1b836b0ac2e5d0c39c105bd39f38c253cf1f2eb Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Sun, 28 Jul 2024 11:42:02 +0200 Subject: [PATCH 01/36] feat: add init umap implementation --- DESCRIPTION | 3 +- R/PipeOpUMAP.R | 155 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 R/PipeOpUMAP.R diff --git a/DESCRIPTION b/DESCRIPTION index 29f8348b2..afe29cf56 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -87,7 +87,8 @@ Suggests: methods, vtreat, future, - htmlwidgets + htmlwidgets, + uwot (>= 0.2.1) ByteCompile: true Encoding: UTF-8 Config/testthat/edition: 3 diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R new file mode 100644 index 000000000..bb793dac1 --- /dev/null +++ b/R/PipeOpUMAP.R @@ -0,0 +1,155 @@ +#' @title Uniform Manifold Approximation and Projection (UMAP) +#' +#' @usage NULL +#' @name mlr_pipeops_umap +#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @description +#' Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP). +#' See [uwot::umap2()] for details. +#' +#' @section Construction: +#' ``` +#' PipeOpUMAP$new(id = "umap", param_vals = list()) +#' ``` +#' +#' * `id` :: `character(1)`\cr +#' Identifier of resulting object, default `"umap"`. +#' * `param_vals` :: named `list`\cr +#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. +#' +#' @section Input and Output Channels: +#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. +#' +#' The output is the input [`Task`][mlr3::Task] with all affected numeric features replaced by their UMAP embedding coordinates. +#' +#' @section State: +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the model list +#' returned by [uwot::umap2()] when called with `ret_model = TRUE`. 
See [uwot::umap2()] for a description of these elements. +#' +#' @section Parameters: +#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as arguments of [uwot::umap2()]. The most commonly used ones are: +#' * `n_neighbors` :: `integer(1)`\cr +#' Size of the local neighborhood used for manifold approximation. Default is `15`. +#' * `n_components` :: `integer(1)`\cr +#' Dimension of the space to embed into. Default is `2`. +#' * `metric` :: `character(1)`\cr +#' Distance metric used to find nearest neighbors. Default is `"euclidean"`. +#' * `n_epochs` :: `integer(1)` | `NULL`\cr +#' Number of epochs used to optimize the embedding. Default is `NULL`. +#' * `min_dist` :: `numeric(1)`\cr +#' Effective minimum distance between embedded points. +#' * `verbose` :: `logical(1)`\cr +#' Whether to print progress messages. Initialized to `FALSE`. +#' +#' See the `$param_set` and [uwot::umap2()] for the remaining hyperparameters. +#' +#' @section Internals: +#' Uses the [`umap2()`][uwot::umap2] function for training and [`umap_transform()`][uwot::umap_transform] for prediction. +#' +#' @section Methods: +#' Only methods inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. 
+#' +#' @examples +#' library("mlr3") +#' +#' task = tsk("iris") +#' pop = po("umap") +#' +#' task$data() +#' pop$train(list(task))[[1]]$data() +#' +#' pop$state +#' @family PipeOps +#' @template seealso_pipeopslist +#' @include PipeOpTaskPreproc.R +#' @export +PipeOpUMAP = R6Class("PipeOpUMAP", + inherit = PipeOpTaskPreproc, + public = list( + initialize = function(id = "umap", param_vals = list()) { + ps = ps( + n_neighbors = p_int(2L, 100L, default = 15L, tags = c("train", "umap")), + n_components = p_int(1L, 100L, default = 2L, tags = c("train", "umap")), + metric = p_fct( + c("euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical"), + default = "euclidean", + tags = c("train", "umap"), + depends = quote(nn_method == "hnsw") + ), + n_epochs = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + learning_rate = p_dbl(0, default = 1, tags = c("train", "umap")), + scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), + init = p_uty( + default = "spectral", + tags = c("train", "umap"), + custom_check = crate(function(x) { + choices = c("spectral", "normlaplacian", "random", "lvrandom", "laplacian", "pca", "spca", "agspectral") + check_choice(x, choices) %check||% check_matrix(x) + }) + ), + init_sdev = p_uty(default = "range", tags = c("train", "umap")), + spread = p_dbl(default = 1, tags = c("train", "umap")), + min_dist = p_dbl(default = 0.01, tags = c("train", "umap")), + set_op_mix_ratio = p_dbl(0, 1, default = 1, tags = c("train", "umap")), + local_connectivity = p_dbl(1, default = 1L, tags = c("train", "umap")), + bandwidth = p_dbl(default = 1, tags = c("train", "umap")), + repulsion_strength = p_dbl(default = 1, tags = c("train", "umap")), + negative_sample_rate = p_dbl(default = 5L, tags = c("train", "umap")), + a = p_uty(default = NULL, tags = c("train", "umap")), + b = p_uty(default = NULL, tags = c("train", "umap")), + nn_method = 
p_uty( + default = NULL, + tags = c("train", "umap"), + custom_check = crate(function(x) { + check_choice(x, c("fnn", "annoy", "hnsw", "nndescent"), null.ok = TRUE) %check||% + (check_list(x, types = "matrix", len = 2L) %check&&% check_names(names(x), permutation.of = c("idx", "dist"))) %check||% + check_class(x, "dgCMatrix") + }) + ), + n_trees = p_int(10L, 100L, default = 50L, tags = c("train", "umap")), + search_k = p_int(tags = c("train", "umap")), + approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), + y = p_uty(default = NULL, tags = c("train", "umap")), + target_n_neighbors = p_int(tags = c("train", "umap")), + target_metric = p_fct(c("euclidean", "cosine", "correlation"), default = "euclidean", tags = c("train", "umap")), + target_weight = p_dbl(0, 1, default = 0.5, tags = c("train", "umap")), + pca = p_int(1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + pca_center = p_lgl(default = TRUE, tags = c("train", "umap")), + pca_rand = p_lgl(default = TRUE, tags = c("train", "umap")), + fast_sgd = p_lgl(default = FALSE, tags = c("train", "umap")), + n_threads = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + n_sgd_threads = p_int(0L, default = 0L, special_vals = list("auto"), tags = c("train", "umap")), + grain_size = p_int(1L, default = 1L, tags = c("train", "umap")), + verbose = p_lgl(default = TRUE, tags = c("train", "umap")), + batch = p_lgl(default = FALSE, tags = c("train", "umap")), + opt_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_list), + epoch_callback = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_function_or_null), + pca_method = p_fct(c("irlba", "rsvd", "bigstatsr", "svd", "auto"), default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + binary_edge_weights = p_lgl(default = FALSE, tags = c("train", "umap")), + dens_scale = p_dbl(0, 1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + seed = p_int(default = NULL, special_vals = 
list(NULL), tags = c("train", "umap")), + nn_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_list) + ) + ps$set_values(verbose = FALSE) # the PipeOp is quiet unless the user sets verbose = TRUE + + super$initialize(id, param_set = ps, param_vals = param_vals, feature_types = c("numeric", "integer")) + } + ), + private = list( + # Fit UMAP on the affected columns, store the fitted model as state and return the training embedding. + .train_dt = function(dt, levels, target) { + params = insert_named(self$param_set$get_values(tags = "umap"), list(ret_model = TRUE)) # ret_model = TRUE is forced so a model is available for prediction + umap = invoke(uwot::umap2, dt, .args = params) + self$state = umap + # The embedding stays in the state: uwot::umap_transform() reads model$embedding to place new points. + umap$embedding + }, + # Project new data into the embedding learned during training, using the stored model. + .predict_dt = function(dt, levels) { + invoke(uwot::umap_transform, dt, model = self$state) + } + ) +) + +mlr_pipeops$add("umap", PipeOpUMAP) From ffdf668bd65e6c1cb5847197f869e891f3ed3e85 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Sun, 28 Jul 2024 11:44:24 +0200 Subject: [PATCH 02/36] docs: run document --- DESCRIPTION | 1 + NAMESPACE | 1 + man/PipeOp.Rd | 1 + man/PipeOpEnsemble.Rd | 1 + man/PipeOpImpute.Rd | 1 + man/PipeOpTargetTrafo.Rd | 1 + man/PipeOpTaskPreproc.Rd | 1 + man/PipeOpTaskPreprocSimple.Rd | 1 + man/mlr_pipeops.Rd | 1 + man/mlr_pipeops_boxcox.Rd | 1 + man/mlr_pipeops_branch.Rd | 1 + man/mlr_pipeops_chunk.Rd | 1 + man/mlr_pipeops_classbalancing.Rd | 1 + man/mlr_pipeops_classifavg.Rd | 1 + man/mlr_pipeops_classweights.Rd | 1 + man/mlr_pipeops_colapply.Rd | 1 + man/mlr_pipeops_collapsefactors.Rd | 1 + man/mlr_pipeops_colroles.Rd | 1 + man/mlr_pipeops_copy.Rd | 1 + man/mlr_pipeops_datefeatures.Rd | 1 + man/mlr_pipeops_encode.Rd | 1 + man/mlr_pipeops_encodeimpact.Rd | 1 + man/mlr_pipeops_encodelmer.Rd | 1 + man/mlr_pipeops_featureunion.Rd | 1 + man/mlr_pipeops_filter.Rd | 1 + man/mlr_pipeops_fixfactors.Rd | 1 + man/mlr_pipeops_histbin.Rd | 1 + man/mlr_pipeops_ica.Rd | 1 + man/mlr_pipeops_imputeconstant.Rd | 1 + man/mlr_pipeops_imputehist.Rd | 1 + man/mlr_pipeops_imputelearner.Rd | 1 + man/mlr_pipeops_imputemean.Rd | 1 + man/mlr_pipeops_imputemedian.Rd | 1 + man/mlr_pipeops_imputemode.Rd | 1 + 
man/mlr_pipeops_imputeoor.Rd | 1 + man/mlr_pipeops_imputesample.Rd | 1 + man/mlr_pipeops_kernelpca.Rd | 1 + man/mlr_pipeops_learner.Rd | 1 + man/mlr_pipeops_missind.Rd | 1 + man/mlr_pipeops_modelmatrix.Rd | 1 + man/mlr_pipeops_multiplicityexply.Rd | 1 + man/mlr_pipeops_multiplicityimply.Rd | 1 + man/mlr_pipeops_mutate.Rd | 1 + man/mlr_pipeops_nmf.Rd | 1 + man/mlr_pipeops_nop.Rd | 1 + man/mlr_pipeops_ovrsplit.Rd | 1 + man/mlr_pipeops_ovrunite.Rd | 1 + man/mlr_pipeops_pca.Rd | 1 + man/mlr_pipeops_proxy.Rd | 1 + man/mlr_pipeops_quantilebin.Rd | 1 + man/mlr_pipeops_randomprojection.Rd | 1 + man/mlr_pipeops_randomresponse.Rd | 1 + man/mlr_pipeops_regravg.Rd | 1 + man/mlr_pipeops_removeconstants.Rd | 1 + man/mlr_pipeops_renamecolumns.Rd | 1 + man/mlr_pipeops_replicate.Rd | 1 + man/mlr_pipeops_scale.Rd | 1 + man/mlr_pipeops_scalemaxabs.Rd | 1 + man/mlr_pipeops_scalerange.Rd | 1 + man/mlr_pipeops_select.Rd | 1 + man/mlr_pipeops_smote.Rd | 1 + man/mlr_pipeops_spatialsign.Rd | 1 + man/mlr_pipeops_subsample.Rd | 1 + man/mlr_pipeops_targetinvert.Rd | 1 + man/mlr_pipeops_targetmutate.Rd | 1 + man/mlr_pipeops_targettrafoscalerange.Rd | 1 + man/mlr_pipeops_textvectorizer.Rd | 1 + man/mlr_pipeops_threshold.Rd | 1 + man/mlr_pipeops_tunethreshold.Rd | 1 + man/mlr_pipeops_umap.Rd | 160 +++++++++++++++++++++++ man/mlr_pipeops_unbranch.Rd | 1 + man/mlr_pipeops_updatetarget.Rd | 1 + man/mlr_pipeops_vtreat.Rd | 1 + man/mlr_pipeops_yeojohnson.Rd | 1 + 74 files changed, 233 insertions(+) create mode 100644 man/mlr_pipeops_umap.Rd diff --git a/DESCRIPTION b/DESCRIPTION index afe29cf56..5a74e9138 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -165,6 +165,7 @@ Collate: 'PipeOpThreshold.R' 'PipeOpTrafo.R' 'PipeOpTuneThreshold.R' + 'PipeOpUMAP.R' 'PipeOpUnbranch.R' 'PipeOpVtreat.R' 'PipeOpYeoJohnson.R' diff --git a/NAMESPACE b/NAMESPACE index d69d21c09..5b6783ac1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -108,6 +108,7 @@ export(PipeOpTaskPreprocSimple) export(PipeOpTextVectorizer) 
export(PipeOpThreshold) export(PipeOpTuneThreshold) +export(PipeOpUMAP) export(PipeOpUnbranch) export(PipeOpVtreat) export(PipeOpYeoJohnson) diff --git a/man/PipeOp.Rd b/man/PipeOp.Rd index 82d829e42..bf0debbd3 100644 --- a/man/PipeOp.Rd +++ b/man/PipeOp.Rd @@ -333,6 +333,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/PipeOpEnsemble.Rd b/man/PipeOpEnsemble.Rd index 61ac51bb9..ab016c9f9 100644 --- a/man/PipeOpEnsemble.Rd +++ b/man/PipeOpEnsemble.Rd @@ -165,6 +165,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/PipeOpImpute.Rd b/man/PipeOpImpute.Rd index e52256e79..7fc9e598d 100644 --- a/man/PipeOpImpute.Rd +++ b/man/PipeOpImpute.Rd @@ -195,6 +195,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/PipeOpTargetTrafo.Rd b/man/PipeOpTargetTrafo.Rd index 8d811ef60..6fc37dc5e 100644 --- a/man/PipeOpTargetTrafo.Rd +++ b/man/PipeOpTargetTrafo.Rd @@ -206,6 +206,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/PipeOpTaskPreproc.Rd b/man/PipeOpTaskPreproc.Rd index 817173680..25941823d 100644 
--- a/man/PipeOpTaskPreproc.Rd +++ b/man/PipeOpTaskPreproc.Rd @@ -261,6 +261,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/PipeOpTaskPreprocSimple.Rd b/man/PipeOpTaskPreprocSimple.Rd index 69ec70f72..2e0928fb8 100644 --- a/man/PipeOpTaskPreprocSimple.Rd +++ b/man/PipeOpTaskPreprocSimple.Rd @@ -198,6 +198,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops.Rd b/man/mlr_pipeops.Rd index bba536267..c531acb78 100644 --- a/man/mlr_pipeops.Rd +++ b/man/mlr_pipeops.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_boxcox.Rd b/man/mlr_pipeops_boxcox.Rd index 0d514ce8c..da206d61d 100644 --- a/man/mlr_pipeops_boxcox.Rd +++ b/man/mlr_pipeops_boxcox.Rd @@ -149,6 +149,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_branch.Rd b/man/mlr_pipeops_branch.Rd index e9b855e0a..1830f2c48 100644 --- a/man/mlr_pipeops_branch.Rd +++ b/man/mlr_pipeops_branch.Rd @@ -167,6 +167,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, 
\code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_chunk.Rd b/man/mlr_pipeops_chunk.Rd index 7603b5b0f..e35161348 100644 --- a/man/mlr_pipeops_chunk.Rd +++ b/man/mlr_pipeops_chunk.Rd @@ -146,6 +146,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_classbalancing.Rd b/man/mlr_pipeops_classbalancing.Rd index c734631a9..0b4cfccf6 100644 --- a/man/mlr_pipeops_classbalancing.Rd +++ b/man/mlr_pipeops_classbalancing.Rd @@ -187,6 +187,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_classifavg.Rd b/man/mlr_pipeops_classifavg.Rd index 381046572..cfd2e68e7 100644 --- a/man/mlr_pipeops_classifavg.Rd +++ b/man/mlr_pipeops_classifavg.Rd @@ -163,6 +163,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_classweights.Rd b/man/mlr_pipeops_classweights.Rd index ea3eef216..86b799874 100644 --- a/man/mlr_pipeops_classweights.Rd +++ b/man/mlr_pipeops_classweights.Rd @@ -155,6 +155,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, 
+\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_colapply.Rd b/man/mlr_pipeops_colapply.Rd index e2e8bbe54..23ba274c5 100644 --- a/man/mlr_pipeops_colapply.Rd +++ b/man/mlr_pipeops_colapply.Rd @@ -176,6 +176,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_collapsefactors.Rd b/man/mlr_pipeops_collapsefactors.Rd index 4dc6dc619..32e0c0f60 100644 --- a/man/mlr_pipeops_collapsefactors.Rd +++ b/man/mlr_pipeops_collapsefactors.Rd @@ -143,6 +143,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_colroles.Rd b/man/mlr_pipeops_colroles.Rd index 73a5ee723..5996de056 100644 --- a/man/mlr_pipeops_colroles.Rd +++ b/man/mlr_pipeops_colroles.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_copy.Rd b/man/mlr_pipeops_copy.Rd index c09aff0cf..5b9f6a03e 100644 --- a/man/mlr_pipeops_copy.Rd +++ b/man/mlr_pipeops_copy.Rd @@ -165,6 +165,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, 
\code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_datefeatures.Rd b/man/mlr_pipeops_datefeatures.Rd index eb881ec59..636a08f89 100644 --- a/man/mlr_pipeops_datefeatures.Rd +++ b/man/mlr_pipeops_datefeatures.Rd @@ -182,6 +182,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_encode.Rd b/man/mlr_pipeops_encode.Rd index 79f2e3a8c..390194822 100644 --- a/man/mlr_pipeops_encode.Rd +++ b/man/mlr_pipeops_encode.Rd @@ -178,6 +178,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index 8033735f0..6c3300407 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -160,6 +160,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_encodelmer.Rd b/man/mlr_pipeops_encodelmer.Rd index bfd1285ec..30e2c255d 100644 --- a/man/mlr_pipeops_encodelmer.Rd +++ b/man/mlr_pipeops_encodelmer.Rd @@ -175,6 +175,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff 
--git a/man/mlr_pipeops_featureunion.Rd b/man/mlr_pipeops_featureunion.Rd index e0dbf21b6..4fe2ce4d8 100644 --- a/man/mlr_pipeops_featureunion.Rd +++ b/man/mlr_pipeops_featureunion.Rd @@ -180,6 +180,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_filter.Rd b/man/mlr_pipeops_filter.Rd index 3d5d2fc53..6ba6170a4 100644 --- a/man/mlr_pipeops_filter.Rd +++ b/man/mlr_pipeops_filter.Rd @@ -211,6 +211,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_fixfactors.Rd b/man/mlr_pipeops_fixfactors.Rd index 5fd00abc2..e273af8c0 100644 --- a/man/mlr_pipeops_fixfactors.Rd +++ b/man/mlr_pipeops_fixfactors.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_histbin.Rd b/man/mlr_pipeops_histbin.Rd index 74d6c5e2f..7eacb0edd 100644 --- a/man/mlr_pipeops_histbin.Rd +++ b/man/mlr_pipeops_histbin.Rd @@ -147,6 +147,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_ica.Rd b/man/mlr_pipeops_ica.Rd index 1e6473928..28707a9e0 100644 --- 
a/man/mlr_pipeops_ica.Rd +++ b/man/mlr_pipeops_ica.Rd @@ -175,6 +175,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputeconstant.Rd b/man/mlr_pipeops_imputeconstant.Rd index 0c35c4a9a..58297123b 100644 --- a/man/mlr_pipeops_imputeconstant.Rd +++ b/man/mlr_pipeops_imputeconstant.Rd @@ -149,6 +149,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputehist.Rd b/man/mlr_pipeops_imputehist.Rd index 0fb6d8f1f..6a13b3f32 100644 --- a/man/mlr_pipeops_imputehist.Rd +++ b/man/mlr_pipeops_imputehist.Rd @@ -140,6 +140,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputelearner.Rd b/man/mlr_pipeops_imputelearner.Rd index a2f9ea073..710906408 100644 --- a/man/mlr_pipeops_imputelearner.Rd +++ b/man/mlr_pipeops_imputelearner.Rd @@ -186,6 +186,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputemean.Rd b/man/mlr_pipeops_imputemean.Rd index bd8d788a5..e529cbe04 100644 --- a/man/mlr_pipeops_imputemean.Rd +++ b/man/mlr_pipeops_imputemean.Rd 
@@ -134,6 +134,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputemedian.Rd b/man/mlr_pipeops_imputemedian.Rd index 00145e29d..baad75bb4 100644 --- a/man/mlr_pipeops_imputemedian.Rd +++ b/man/mlr_pipeops_imputemedian.Rd @@ -134,6 +134,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputemode.Rd b/man/mlr_pipeops_imputemode.Rd index 613970b73..0e1b1d78a 100644 --- a/man/mlr_pipeops_imputemode.Rd +++ b/man/mlr_pipeops_imputemode.Rd @@ -141,6 +141,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputeoor.Rd b/man/mlr_pipeops_imputeoor.Rd index c5766c7e6..259e0f3aa 100644 --- a/man/mlr_pipeops_imputeoor.Rd +++ b/man/mlr_pipeops_imputeoor.Rd @@ -163,6 +163,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_imputesample.Rd b/man/mlr_pipeops_imputesample.Rd index 2944213ce..625febcea 100644 --- a/man/mlr_pipeops_imputesample.Rd +++ b/man/mlr_pipeops_imputesample.Rd @@ -136,6 +136,7 @@ Other PipeOps: 
\code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_kernelpca.Rd b/man/mlr_pipeops_kernelpca.Rd index e2d92e746..40ffc3bce 100644 --- a/man/mlr_pipeops_kernelpca.Rd +++ b/man/mlr_pipeops_kernelpca.Rd @@ -150,6 +150,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_learner.Rd b/man/mlr_pipeops_learner.Rd index 023c5ca9c..1304d087f 100644 --- a/man/mlr_pipeops_learner.Rd +++ b/man/mlr_pipeops_learner.Rd @@ -181,6 +181,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_missind.Rd b/man/mlr_pipeops_missind.Rd index d1ac309e3..8de70bc0c 100644 --- a/man/mlr_pipeops_missind.Rd +++ b/man/mlr_pipeops_missind.Rd @@ -164,6 +164,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_modelmatrix.Rd b/man/mlr_pipeops_modelmatrix.Rd index d27063f12..6876b541d 100644 --- a/man/mlr_pipeops_modelmatrix.Rd +++ b/man/mlr_pipeops_modelmatrix.Rd @@ -140,6 +140,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, 
\code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_multiplicityexply.Rd b/man/mlr_pipeops_multiplicityexply.Rd index 01531c672..365516422 100644 --- a/man/mlr_pipeops_multiplicityexply.Rd +++ b/man/mlr_pipeops_multiplicityexply.Rd @@ -146,6 +146,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_multiplicityimply.Rd b/man/mlr_pipeops_multiplicityimply.Rd index b5a3400ab..4557895c6 100644 --- a/man/mlr_pipeops_multiplicityimply.Rd +++ b/man/mlr_pipeops_multiplicityimply.Rd @@ -152,6 +152,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_mutate.Rd b/man/mlr_pipeops_mutate.Rd index cff63d4b1..4ba9f9920 100644 --- a/man/mlr_pipeops_mutate.Rd +++ b/man/mlr_pipeops_mutate.Rd @@ -157,6 +157,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 148dfbcfd..1de3f5083 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -192,6 +192,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, 
+\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_nop.Rd b/man/mlr_pipeops_nop.Rd index eabc9e48f..b195e5648 100644 --- a/man/mlr_pipeops_nop.Rd +++ b/man/mlr_pipeops_nop.Rd @@ -142,6 +142,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_ovrsplit.Rd b/man/mlr_pipeops_ovrsplit.Rd index e0718678b..f1eb5a1b4 100644 --- a/man/mlr_pipeops_ovrsplit.Rd +++ b/man/mlr_pipeops_ovrsplit.Rd @@ -159,6 +159,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_ovrunite.Rd b/man/mlr_pipeops_ovrunite.Rd index 83f3c85c2..4010905ed 100644 --- a/man/mlr_pipeops_ovrunite.Rd +++ b/man/mlr_pipeops_ovrunite.Rd @@ -154,6 +154,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_pca.Rd b/man/mlr_pipeops_pca.Rd index ca5d14d59..262c0ce4c 100644 --- a/man/mlr_pipeops_pca.Rd +++ b/man/mlr_pipeops_pca.Rd @@ -151,6 +151,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git 
a/man/mlr_pipeops_proxy.Rd b/man/mlr_pipeops_proxy.Rd index 1e6e8f9c0..8f962f21a 100644 --- a/man/mlr_pipeops_proxy.Rd +++ b/man/mlr_pipeops_proxy.Rd @@ -165,6 +165,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_quantilebin.Rd b/man/mlr_pipeops_quantilebin.Rd index 8b416ee52..624d5f708 100644 --- a/man/mlr_pipeops_quantilebin.Rd +++ b/man/mlr_pipeops_quantilebin.Rd @@ -139,6 +139,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_randomprojection.Rd b/man/mlr_pipeops_randomprojection.Rd index e41d6ea42..16839ff3c 100644 --- a/man/mlr_pipeops_randomprojection.Rd +++ b/man/mlr_pipeops_randomprojection.Rd @@ -151,6 +151,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_randomresponse.Rd b/man/mlr_pipeops_randomresponse.Rd index 2f813a326..c3364c453 100644 --- a/man/mlr_pipeops_randomresponse.Rd +++ b/man/mlr_pipeops_randomresponse.Rd @@ -168,6 +168,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_regravg.Rd b/man/mlr_pipeops_regravg.Rd 
index 4b1603441..d56d1c5a5 100644 --- a/man/mlr_pipeops_regravg.Rd +++ b/man/mlr_pipeops_regravg.Rd @@ -154,6 +154,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_removeconstants.Rd b/man/mlr_pipeops_removeconstants.Rd index 4fe961f7c..042998703 100644 --- a/man/mlr_pipeops_removeconstants.Rd +++ b/man/mlr_pipeops_removeconstants.Rd @@ -144,6 +144,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_renamecolumns.Rd b/man/mlr_pipeops_renamecolumns.Rd index 768211b84..44e7c2209 100644 --- a/man/mlr_pipeops_renamecolumns.Rd +++ b/man/mlr_pipeops_renamecolumns.Rd @@ -143,6 +143,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_replicate.Rd b/man/mlr_pipeops_replicate.Rd index 7735a2586..e07d3538b 100644 --- a/man/mlr_pipeops_replicate.Rd +++ b/man/mlr_pipeops_replicate.Rd @@ -136,6 +136,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_scale.Rd b/man/mlr_pipeops_scale.Rd index 33d4e027e..5f397b2a2 100644 --- a/man/mlr_pipeops_scale.Rd +++ 
b/man/mlr_pipeops_scale.Rd @@ -158,6 +158,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_scalemaxabs.Rd b/man/mlr_pipeops_scalemaxabs.Rd index 279a2c7c1..29946719b 100644 --- a/man/mlr_pipeops_scalemaxabs.Rd +++ b/man/mlr_pipeops_scalemaxabs.Rd @@ -133,6 +133,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_scalerange.Rd b/man/mlr_pipeops_scalerange.Rd index 707ca661c..3a7ce39e3 100644 --- a/man/mlr_pipeops_scalerange.Rd +++ b/man/mlr_pipeops_scalerange.Rd @@ -138,6 +138,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_select.Rd b/man/mlr_pipeops_select.Rd index c3d8ec0f9..d4e25fb1b 100644 --- a/man/mlr_pipeops_select.Rd +++ b/man/mlr_pipeops_select.Rd @@ -154,6 +154,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_smote.Rd b/man/mlr_pipeops_smote.Rd index b92867d6e..7a526507c 100644 --- a/man/mlr_pipeops_smote.Rd +++ b/man/mlr_pipeops_smote.Rd @@ -157,6 +157,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, 
\code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_spatialsign.Rd b/man/mlr_pipeops_spatialsign.Rd index 4995632ca..12d25a921 100644 --- a/man/mlr_pipeops_spatialsign.Rd +++ b/man/mlr_pipeops_spatialsign.Rd @@ -133,6 +133,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_subsample.Rd b/man/mlr_pipeops_subsample.Rd index 5a0a3c9fc..720722479 100644 --- a/man/mlr_pipeops_subsample.Rd +++ b/man/mlr_pipeops_subsample.Rd @@ -148,6 +148,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_targetinvert.Rd b/man/mlr_pipeops_targetinvert.Rd index a63ea2feb..c95d0e7f3 100644 --- a/man/mlr_pipeops_targetinvert.Rd +++ b/man/mlr_pipeops_targetinvert.Rd @@ -133,6 +133,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_targetmutate.Rd b/man/mlr_pipeops_targetmutate.Rd index 5193c2db9..806c1854a 100644 --- a/man/mlr_pipeops_targetmutate.Rd +++ b/man/mlr_pipeops_targetmutate.Rd @@ -181,6 +181,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, 
\code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_targettrafoscalerange.Rd b/man/mlr_pipeops_targettrafoscalerange.Rd index 8441b3f1e..a6e38c132 100644 --- a/man/mlr_pipeops_targettrafoscalerange.Rd +++ b/man/mlr_pipeops_targettrafoscalerange.Rd @@ -147,6 +147,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_textvectorizer.Rd b/man/mlr_pipeops_textvectorizer.Rd index 6212e6ad6..775fd717c 100644 --- a/man/mlr_pipeops_textvectorizer.Rd +++ b/man/mlr_pipeops_textvectorizer.Rd @@ -247,6 +247,7 @@ Other PipeOps: \code{\link{mlr_pipeops_targettrafoscalerange}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_threshold.Rd b/man/mlr_pipeops_threshold.Rd index 98c3039c0..3eeee4561 100644 --- a/man/mlr_pipeops_threshold.Rd +++ b/man/mlr_pipeops_threshold.Rd @@ -140,6 +140,7 @@ Other PipeOps: \code{\link{mlr_pipeops_targettrafoscalerange}}, \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_tunethreshold.Rd b/man/mlr_pipeops_tunethreshold.Rd index 7ce2bc4ab..a21c76601 100644 --- a/man/mlr_pipeops_tunethreshold.Rd +++ b/man/mlr_pipeops_tunethreshold.Rd @@ -165,6 +165,7 @@ Other PipeOps: \code{\link{mlr_pipeops_targettrafoscalerange}}, 
\code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd new file mode 100644 index 000000000..86b37320a --- /dev/null +++ b/man/mlr_pipeops_umap.Rd @@ -0,0 +1,160 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpUMAP.R +\name{mlr_pipeops_umap} +\alias{mlr_pipeops_umap} +\alias{PipeOpUMAP} +\title{Uniform Manifold Approximation and Projection (UMAP)} +\format{ +\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +} +\description{ +Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP). +See \code{\link[uwot:umap2]{uwot::umap2()}} for details. +} +\section{Construction}{ + + +\if{html}{\out{
}}\preformatted{PipeOpUMAP$new(id = "umap", param_vals = list()) +}\if{html}{\out{
}} +\itemize{ +\item \code{id} :: \code{character(1)}\cr +Identifier of resulting object, default \code{"umap"}. +\item \code{param_vals} :: named \code{list}\cr +List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. +} +} + +\section{Input and Output Channels}{ + +Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. + +The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric features replaced by their principal components. +} + +\section{State}{ + +The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the class \link[stats:prcomp]{stats::prcomp}, +with the exception of the \verb{$x} slot. These are in particular: +\itemize{ +\item \code{sdev} :: \code{numeric}\cr +The standard deviations of the principal components. +\item \code{rotation} :: \code{matrix}\cr +The matrix of variable loadings. +\item \code{center} :: \code{numeric} | \code{logical(1)}\cr +The centering used, or \code{FALSE}. +\item \code{scale} :: \code{numeric} | \code{logical(1)}\cr +The scaling used, or \code{FALSE}. +} +} + +\section{Parameters}{ + +The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{center} :: \code{logical(1)}\cr +Indicating whether the features should be centered. Default is \code{TRUE}. See \code{\link[stats:prcomp]{prcomp()}}. +\item \code{scale.} :: \code{logical(1)}\cr +Whether to scale features to unit variance before analysis. Default is \code{FALSE}, but scaling is advisable. See \code{\link[stats:prcomp]{prcomp()}}. +\item \code{rank.} :: \code{integer(1)}\cr +Maximal number of principal components to be used. Default is \code{NULL}: use all components. See \code{\link[stats:prcomp]{prcomp()}}. 
+} +} + +\section{Internals}{ + +Uses the \code{\link[uwot:umap]{umap()}} function. +} + +\section{Methods}{ + +Only methods inherited from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +} + +\examples{ +library("mlr3") + +task = tsk("iris") +pop = po("umap") + +task$data() +pop$train(list(task))[[1]]$data() + +pop$state +} +\seealso{ +https://mlr-org.com/pipeops.html + +Other PipeOps: +\code{\link{PipeOp}}, +\code{\link{PipeOpEnsemble}}, +\code{\link{PipeOpImpute}}, +\code{\link{PipeOpTargetTrafo}}, +\code{\link{PipeOpTaskPreproc}}, +\code{\link{PipeOpTaskPreprocSimple}}, +\code{\link{mlr_pipeops}}, +\code{\link{mlr_pipeops_boxcox}}, +\code{\link{mlr_pipeops_branch}}, +\code{\link{mlr_pipeops_chunk}}, +\code{\link{mlr_pipeops_classbalancing}}, +\code{\link{mlr_pipeops_classifavg}}, +\code{\link{mlr_pipeops_classweights}}, +\code{\link{mlr_pipeops_colapply}}, +\code{\link{mlr_pipeops_collapsefactors}}, +\code{\link{mlr_pipeops_colroles}}, +\code{\link{mlr_pipeops_copy}}, +\code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_encode}}, +\code{\link{mlr_pipeops_encodeimpact}}, +\code{\link{mlr_pipeops_encodelmer}}, +\code{\link{mlr_pipeops_featureunion}}, +\code{\link{mlr_pipeops_filter}}, +\code{\link{mlr_pipeops_fixfactors}}, +\code{\link{mlr_pipeops_histbin}}, +\code{\link{mlr_pipeops_ica}}, +\code{\link{mlr_pipeops_imputeconstant}}, +\code{\link{mlr_pipeops_imputehist}}, +\code{\link{mlr_pipeops_imputelearner}}, +\code{\link{mlr_pipeops_imputemean}}, +\code{\link{mlr_pipeops_imputemedian}}, +\code{\link{mlr_pipeops_imputemode}}, +\code{\link{mlr_pipeops_imputeoor}}, +\code{\link{mlr_pipeops_imputesample}}, +\code{\link{mlr_pipeops_kernelpca}}, +\code{\link{mlr_pipeops_learner}}, +\code{\link{mlr_pipeops_missind}}, +\code{\link{mlr_pipeops_modelmatrix}}, +\code{\link{mlr_pipeops_multiplicityexply}}, +\code{\link{mlr_pipeops_multiplicityimply}}, +\code{\link{mlr_pipeops_mutate}}, +\code{\link{mlr_pipeops_nmf}}, +\code{\link{mlr_pipeops_nop}}, 
+\code{\link{mlr_pipeops_ovrsplit}}, +\code{\link{mlr_pipeops_ovrunite}}, +\code{\link{mlr_pipeops_pca}}, +\code{\link{mlr_pipeops_proxy}}, +\code{\link{mlr_pipeops_quantilebin}}, +\code{\link{mlr_pipeops_randomprojection}}, +\code{\link{mlr_pipeops_randomresponse}}, +\code{\link{mlr_pipeops_regravg}}, +\code{\link{mlr_pipeops_removeconstants}}, +\code{\link{mlr_pipeops_renamecolumns}}, +\code{\link{mlr_pipeops_replicate}}, +\code{\link{mlr_pipeops_scale}}, +\code{\link{mlr_pipeops_scalemaxabs}}, +\code{\link{mlr_pipeops_scalerange}}, +\code{\link{mlr_pipeops_select}}, +\code{\link{mlr_pipeops_smote}}, +\code{\link{mlr_pipeops_spatialsign}}, +\code{\link{mlr_pipeops_subsample}}, +\code{\link{mlr_pipeops_targetinvert}}, +\code{\link{mlr_pipeops_targetmutate}}, +\code{\link{mlr_pipeops_targettrafoscalerange}}, +\code{\link{mlr_pipeops_textvectorizer}}, +\code{\link{mlr_pipeops_threshold}}, +\code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_unbranch}}, +\code{\link{mlr_pipeops_updatetarget}}, +\code{\link{mlr_pipeops_vtreat}}, +\code{\link{mlr_pipeops_yeojohnson}} +} +\concept{PipeOps} diff --git a/man/mlr_pipeops_unbranch.Rd b/man/mlr_pipeops_unbranch.Rd index a6986b956..36cb69044 100644 --- a/man/mlr_pipeops_unbranch.Rd +++ b/man/mlr_pipeops_unbranch.Rd @@ -146,6 +146,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, \code{\link{mlr_pipeops_yeojohnson}} diff --git a/man/mlr_pipeops_updatetarget.Rd b/man/mlr_pipeops_updatetarget.Rd index 9e1ae3b06..b553d330a 100644 --- a/man/mlr_pipeops_updatetarget.Rd +++ b/man/mlr_pipeops_updatetarget.Rd @@ -161,6 +161,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, 
\code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_vtreat}}, \code{\link{mlr_pipeops_yeojohnson}} diff --git a/man/mlr_pipeops_vtreat.Rd b/man/mlr_pipeops_vtreat.Rd index 67f23519d..c9edef73a 100644 --- a/man/mlr_pipeops_vtreat.Rd +++ b/man/mlr_pipeops_vtreat.Rd @@ -214,6 +214,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_yeojohnson}} diff --git a/man/mlr_pipeops_yeojohnson.Rd b/man/mlr_pipeops_yeojohnson.Rd index 82284c0d1..912b316c8 100644 --- a/man/mlr_pipeops_yeojohnson.Rd +++ b/man/mlr_pipeops_yeojohnson.Rd @@ -151,6 +151,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}} From 0795faea6d8b8bea90021bf8ebe2e192fae1607e Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Sun, 28 Jul 2024 12:25:51 +0200 Subject: [PATCH 03/36] docs: more param docs --- R/PipeOpUMAP.R | 90 +++++++++++++++++++++++++++---- man/mlr_pipeops_umap.Rd | 84 ++++++++++++++++++++++++++--- tests/testthat/test_pipeop_umap.R | 7 +++ 3 files changed, 165 insertions(+), 16 deletions(-) create mode 100644 tests/testthat/test_pipeop_umap.R diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index bb793dac1..2151185dd 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -37,12 +37,84 @@ #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: -#' * `center` :: `logical(1)`\cr -#' Indicating whether the features should be centered. Default is `TRUE`. See [`prcomp()`][stats::prcomp]. -#' * `scale.` :: `logical(1)`\cr -#' Whether to scale features to unit variance before analysis. 
Default is `FALSE`, but scaling is advisable. See [`prcomp()`][stats::prcomp]. -#' * `rank.` :: `integer(1)`\cr -#' Maximal number of principal components to be used. Default is `NULL`: use all components. See [`prcomp()`][stats::prcomp]. +#' * `n_neighbors` :: `integer(1)`\cr +#' Blah +#' * `n_components` :: `integer(1)`\cr +#' Blah +#' * `metric` :: `character(1)`\cr +#' Blah +#' * `n_epochs` :: `integer(1)`\cr +#' Blah +#' * `learning_rate` :: `numeric(1)`\cr +#' Blah +#' * `init` :: `character(1)`\cr +#' Blah +#' * `init_sdev` :: `character(1)`\cr +#' Blah +#' * `spread` :: `character(1)`\cr +#' Blah +#' * `min_dist` :: `character(1)`\cr +#' Blah +#' * `set_op_mix_ratio` :: `character(1)`\cr +#' Blah +#' * `local_connectivity` :: `character(1)`\cr +#' Blah +#' * `bandwidth` :: `character(1)`\cr +#' Blah +#' * `repulsion_strength` :: `character(1)`\cr +#' Blah +#' * `a` :: `character(1)`\cr +#' Blah +#' * `b` :: `character(1)`\cr +#' Blah +#' * `nn_method` :: `character(1)`\cr +#' Blah +#' * `n_trees` :: `character(1)`\cr +#' Blah +#' * `search_k` :: `character(1)`\cr +#' Blah +#' * `approx_pow` :: `character(1)`\cr +#' Blah +#' * `y` :: `character(1)`\cr +#' Blah +#' * `target_n_neighbors` :: `character(1)`\cr +#' Blah +#' * `target_metric` :: `character(1)`\cr +#' Blah +#' * `target_weight` :: `character(1)`\cr +#' Blah +#' * `pca` :: `character(1)`\cr +#' Blah +#' * `pca_center` :: `character(1)`\cr +#' Blah +#' * `pca_rand` :: `character(1)`\cr +#' Blah +#' * `fast_sgd` :: `character(1)`\cr +#' Blah +#' * `n_threads` :: `character(1)`\cr +#' Blah +#' * `n_sgd_threads` :: `character(1)`\cr +#' Blah +#' * `grain_size` :: `character(1)`\cr +#' Blah +#' * `verbose` :: `character(1)`\cr +#' Blah +#' * `batch` :: `character(1)`\cr +#' Blah +#' * `opt_args` :: `character(1)`\cr +#' Blah +#' * `epoch_callback` :: `character(1)`\cr +#' Blah +#' * `pca_method` :: `character(1)`\cr +#' Blah +#' * `binary_edge_weights` :: `character(1)`\cr +#' Blah +#' * `dens_scale` :: 
`character(1)`\cr +#' Blah +#' * `seed` :: `character(1)`\cr +#' Blah +#' * `nn_args` :: `character(1)`\cr +#' Blah #' #' @section Internals: #' Uses the [`umap()`][uwot::umap] function. @@ -123,13 +195,13 @@ PipeOpUMAP = R6Class("PipeOpUMAP", grain_size = p_int(1L, default = 1L, tags = c("train", "umap")), verbose = p_lgl(default = TRUE, tags = c("train", "umap")), batch = p_lgl(default = FALSE, tags = c("train", "umap")), - opt_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_list), + opt_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = crate(function(x) check_list(x, null.ok = TRUE))), epoch_callback = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_function_or_null), pca_method = p_fct(c("irlba", "rsvd", "bigstatsr", "svd", "auto"), default = NULL, special_vals = list(NULL), tags = c("train", "umap")), binary_edge_weights = p_lgl(default = FALSE, tags = c("train", "umap")), dens_scale = p_dbl(0, 1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), seed = p_int(default = NULL, special_vals = list(NULL), tags = c("train", "umap")), - nn_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_list) + nn_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = crate(function(x) check_list(x, null.ok = TRUE))) ) ps$set_values(verbose = FALSE) @@ -137,12 +209,10 @@ PipeOpUMAP = R6Class("PipeOpUMAP", } ), private = list( - .train_dt = function(dt, levels, target) { params = insert_named(self$param_set$get_values(tags = "umap"), list(ret_model = TRUE)) umap = invoke(uwot::umap2, dt, .args = params) self$state = umap - self$state$embedding = NULL umap$embedding }, diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 86b37320a..1e9d052d2 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -51,12 +51,84 @@ The scaling used, or \code{FALSE}. 
The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ -\item \code{center} :: \code{logical(1)}\cr -Indicating whether the features should be centered. Default is \code{TRUE}. See \code{\link[stats:prcomp]{prcomp()}}. -\item \code{scale.} :: \code{logical(1)}\cr -Whether to scale features to unit variance before analysis. Default is \code{FALSE}, but scaling is advisable. See \code{\link[stats:prcomp]{prcomp()}}. -\item \code{rank.} :: \code{integer(1)}\cr -Maximal number of principal components to be used. Default is \code{NULL}: use all components. See \code{\link[stats:prcomp]{prcomp()}}. +\item \code{n_neighbors} :: \code{integer(1)}\cr +Blah +\item \code{n_components} :: \code{integer(1)}\cr +Blah +\item \code{metric} :: \code{character(1)}\cr +Blah +\item \code{n_epochs} :: \code{integer(1)}\cr +Blah +\item \code{learning_rate} :: \code{numeric(1)}\cr +Blah +\item \code{init} :: \code{character(1)}\cr +Blah +\item \code{init_sdev} :: \code{character(1)}\cr +Blah +\item \code{spread} :: \code{character(1)}\cr +Blah +\item \code{min_dist} :: \code{character(1)}\cr +Blah +\item \code{set_op_mix_ratio} :: \code{character(1)}\cr +Blah +\item \code{local_connectivity} :: \code{character(1)}\cr +Blah +\item \code{bandwidth} :: \code{character(1)}\cr +Blah +\item \code{repulsion_strength} :: \code{character(1)}\cr +Blah +\item \code{a} :: \code{character(1)}\cr +Blah +\item \code{b} :: \code{character(1)}\cr +Blah +\item \code{nn_method} :: \code{character(1)}\cr +Blah +\item \code{n_trees} :: \code{character(1)}\cr +Blah +\item \code{search_k} :: \code{character(1)}\cr +Blah +\item \code{approx_pow} :: \code{character(1)}\cr +Blah +\item \code{y} :: \code{character(1)}\cr +Blah +\item \code{target_n_neighbors} :: \code{character(1)}\cr +Blah +\item \code{target_metric} :: \code{character(1)}\cr +Blah +\item \code{target_weight} :: \code{character(1)}\cr +Blah +\item \code{pca} :: \code{character(1)}\cr +Blah +\item 
\code{pca_center} :: \code{character(1)}\cr +Blah +\item \code{pca_rand} :: \code{character(1)}\cr +Blah +\item \code{fast_sgd} :: \code{character(1)}\cr +Blah +\item \code{n_threads} :: \code{character(1)}\cr +Blah +\item \code{n_sgd_threads} :: \code{character(1)}\cr +Blah +\item \code{grain_size} :: \code{character(1)}\cr +Blah +\item \code{verbose} :: \code{character(1)}\cr +Blah +\item \code{batch} :: \code{character(1)}\cr +Blah +\item \code{opt_args} :: \code{character(1)}\cr +Blah +\item \code{epoch_callback} :: \code{character(1)}\cr +Blah +\item \code{pca_method} :: \code{character(1)}\cr +Blah +\item \code{binary_edge_weights} :: \code{character(1)}\cr +Blah +\item \code{dens_scale} :: \code{character(1)}\cr +Blah +\item \code{seed} :: \code{character(1)}\cr +Blah +\item \code{nn_args} :: \code{character(1)}\cr +Blah } } diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R new file mode 100644 index 000000000..0faf5e36c --- /dev/null +++ b/tests/testthat/test_pipeop_umap.R @@ -0,0 +1,7 @@ +context("PipeOpUMAP") + +test_that("PipeOpUMAP - basic properties", { + op = PipeOpUMAP$new() + task = mlr_tasks$get("iris") + expect_pipeop(op) +}) From d1fc20ef9190bc1b6530331113b420ef65d07345 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 11:00:05 +0200 Subject: [PATCH 04/36] docs: more param docs --- R/PipeOpUMAP.R | 165 ++++++++++++++++++++++------------------ man/mlr_pipeops_umap.Rd | 159 +++++++++++++++++++++----------------- 2 files changed, 181 insertions(+), 143 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 2151185dd..b029303a0 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -38,83 +38,102 @@ #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: #' * `n_neighbors` :: `integer(1)`\cr -#' Blah +#' The size of the neighborhood used for manifold approximation. Default is `15`. 
#' * `n_components` :: `integer(1)`\cr -#' Blah +#' The dimension of the space to embed into. Default is `2`. #' * `metric` :: `character(1)`\cr -#' Blah +#' Type of distance metric to use to find nearest neighbors. Default is `"euclidean"`. #' * `n_epochs` :: `integer(1)`\cr -#' Blah +#' Number of epochs to use during the optimization of the embedded coordinates. +#' By default, this value is set to 500 for datasets containing 10,000 vertices or less, +#' and 200 otherwise. If n_epochs = 0, then coordinates determined by "init" will be returned. #' * `learning_rate` :: `numeric(1)`\cr -#' Blah -#' * `init` :: `character(1)`\cr -#' Blah -#' * `init_sdev` :: `character(1)`\cr -#' Blah -#' * `spread` :: `character(1)`\cr -#' Blah -#' * `min_dist` :: `character(1)`\cr -#' Blah -#' * `set_op_mix_ratio` :: `character(1)`\cr -#' Blah -#' * `local_connectivity` :: `character(1)`\cr -#' Blah -#' * `bandwidth` :: `character(1)`\cr -#' Blah -#' * `repulsion_strength` :: `character(1)`\cr -#' Blah -#' * `a` :: `character(1)`\cr -#' Blah -#' * `b` :: `character(1)`\cr -#' Blah -#' * `nn_method` :: `character(1)`\cr -#' Blah -#' * `n_trees` :: `character(1)`\cr -#' Blah -#' * `search_k` :: `character(1)`\cr -#' Blah -#' * `approx_pow` :: `character(1)`\cr -#' Blah +#' Initial learning rate used in optimization of the coordinates. Default is `1`. +#' * `init` :: `character(1)` | `matrix`\cr +#' Type of initialization for the coordinates. Default is `"spectral"`. +#' * `init_sdev` :: `character(1)` | `numeric(1)`\cr +#' Scales each dimension of the initialized coordinates to this standard deviation. +#' Default is `"range"`. +#' * `spread` :: `numeric(1)`\cr +#' The effective scale of embedded points. In combination with `min_dist`, +#' this determines how clustered/clumped the embedded points are. Default is `1`. +#' * `min_dist` :: `numeric(1)`\cr +#' The effective minimum distance between embedded points. Default is `0.01`. 
+#' * `set_op_mix_ratio` :: `numeric(1)`\cr +#' Interpolate between (fuzzy) union and intersection as the set operation used to +#' combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set. Default is `1`. +#' * `local_connectivity` :: `numeric(1)`\cr +#' The local connectivity required – i.e. the number of nearest neighbors that should be +#' assumed to be connected at a local level. Default is `1`. +#' * `bandwidth` :: `numeric(1)`\cr +#' The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. +#' Default is `1`. +#' * `repulsion_strength` :: `numeric(1)`\cr +#' Weighting applied to negative samples in low dimensional embedding optimization. +#' Values higher than one will result in greater weight being given to negative samples. +#' Default is `1`. +#' * `negative_sample_rate` :: `numeric(1)`\cr +#' The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample +#' in optimizing the low dimensional embedding. Default is `5`. +#' * `a` :: `any`\cr +#' More specific parameters controlling the embedding. +#' If `NULL` these values are set automatically as determined by `min_dist` and `spread`. +#' Default is `NULL`. +#' * `b` :: `any`\cr +#' More specific parameters controlling the embedding. +#' If `NULL` these values are set automatically as determined by `min_dist` and `spread`. +#' Default is `NULL`. +#' * `nn_method` :: `character(1)` | named `list()` | matrix\cr +#' Method for finding nearest neighbors. Default is `NULL`. +#' * `n_trees` :: `integer(1)`\cr +#' Number of trees to build when constructing the nearest neighbor index. Default is `50`. +#' * `search_k` :: `integer(1)`\cr +#' Number of nodes to search during the neighbor retrieval. +#' * `approx_pow` :: `logical(1)`\cr +#' If `TRUE`, use an approximation to the power function in the UMAP gradient. +#' Ignored if `dens_scale` is non-NULL. Default is `FALSE`.
#' * `y` :: `character(1)`\cr -#' Blah -#' * `target_n_neighbors` :: `character(1)`\cr -#' Blah +#' Default is `NULL`. +#' * `target_n_neighbors` :: `integer(1)`\cr +#' Number of nearest neighbors to use to construct the target simplicial set. Default is `NULL`. #' * `target_metric` :: `character(1)`\cr -#' Blah -#' * `target_weight` :: `character(1)`\cr -#' Blah -#' * `pca` :: `character(1)`\cr -#' Blah -#' * `pca_center` :: `character(1)`\cr -#' Blah -#' * `pca_rand` :: `character(1)`\cr -#' Blah -#' * `fast_sgd` :: `character(1)`\cr -#' Blah -#' * `n_threads` :: `character(1)`\cr -#' Blah -#' * `n_sgd_threads` :: `character(1)`\cr -#' Blah -#' * `grain_size` :: `character(1)`\cr -#' Blah -#' * `verbose` :: `character(1)`\cr -#' Blah -#' * `batch` :: `character(1)`\cr -#' Blah -#' * `opt_args` :: `character(1)`\cr -#' Blah -#' * `epoch_callback` :: `character(1)`\cr -#' Blah +#' The metric used to measure distance for `y` if using supervised dimension reduction. +#' Used only if `y` is numeric. +#' * `target_weight` :: `numeric(1)`\cr +#' Weighting factor between data topology and target topology. Default is `0.5`. +#' * `pca` :: `integer(1)`\cr +#' Default is `NULL`. +#' * `pca_center` :: `logical(1)`\cr +#' If `TRUE`, center the columns of X before carrying out PCA. +#' For binary data, it's recommended to set this to `FALSE`. Default is `TRUE`. +#' * `pca_rand` :: `logical(1)`\cr +#' Default is `TRUE`. +#' * `fast_sgd` :: `logical(1)`\cr +#' Default is `FALSE`. +#' * `n_threads` :: `integer(1)`\cr +#' Default is `NULL`. +#' * `n_sgd_threads` :: `integer(1)`\cr +#' Default is `0`. +#' * `grain_size` :: `integer(1)`\cr +#' Default is `1`. +#' * `verbose` :: `logical(1)`\cr +#' Should details be logged to the console? Initialized to `FALSE`. +#' * `batch` :: `logical(1)`\cr +#' Default is `FALSE`. +#' * `opt_args` :: named `list()`\cr +#' Default is `NULL`. +#' * `epoch_callback` :: `function`\cr +#' Default is `NULL`.
#' * `pca_method` :: `character(1)`\cr -#' Blah -#' * `binary_edge_weights` :: `character(1)`\cr -#' Blah -#' * `dens_scale` :: `character(1)`\cr -#' Blah -#' * `seed` :: `character(1)`\cr -#' Blah -#' * `nn_args` :: `character(1)`\cr -#' Blah +#' Default is `NULL`. +#' * `binary_edge_weights` :: `logical(1)`\cr +#' Default is `FALSE`. +#' * `dens_scale` :: `numeric(1)`\cr +#' Default is `NULL`. +#' * `seed` :: `integer(1)`\cr +#' Default is `NULL`. +#' * `nn_args` :: named `list()`\cr +#' Default is `NULL`. #' #' @section Internals: #' Uses the [`umap()`][uwot::umap] function. @@ -164,10 +183,10 @@ PipeOpUMAP = R6Class("PipeOpUMAP", spread = p_dbl(default = 1, tags = c("train", "umap")), min_dist = p_dbl(default = 0.01, tags = c("train", "umap")), set_op_mix_ratio = p_dbl(0, 1, default = 1, tags = c("train", "umap")), - local_connectivity = p_dbl(1, default = 1L, tags = c("train", "umap")), + local_connectivity = p_dbl(1, default = 1, tags = c("train", "umap")), bandwidth = p_dbl(default = 1, tags = c("train", "umap")), repulsion_strength = p_dbl(default = 1, tags = c("train", "umap")), - negative_sample_rate = p_dbl(default = 5L, tags = c("train", "umap")), + negative_sample_rate = p_dbl(default = 5, tags = c("train", "umap")), a = p_uty(default = NULL, tags = c("train", "umap")), b = p_uty(default = NULL, tags = c("train", "umap")), nn_method = p_uty( @@ -186,7 +205,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", target_n_neighbors = p_int(tags = c("train", "umap")), target_metric = p_fct(c("euclidean", "cosine", "correlation"), default = "euclidean", tags = c("train", "umap")), target_weight = p_dbl(0, 1, default = 0.5, tags = c("train", "umap")), - pca = p_int(1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + pca = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), pca_center = p_lgl(default = TRUE, tags = c("train", "umap")), pca_rand = p_lgl(default = TRUE, tags = c("train", "umap")), fast_sgd = p_lgl(default = 
FALSE, tags = c("train", "umap")), diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 1e9d052d2..541f57159 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -52,83 +52,102 @@ The scaling used, or \code{FALSE}. The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ \item \code{n_neighbors} :: \code{integer(1)}\cr -Blah +The size of the neighborhood used for manifold approximation. Default is \code{15}. \item \code{n_components} :: \code{integer(1)}\cr -Blah +The dimension of the space to embed into. Default is \code{2}. \item \code{metric} :: \code{character(1)}\cr -Blah +Type of distance metric to use to find nearest neighbors. Default is \code{"euclidean"}. \item \code{n_epochs} :: \code{integer(1)}\cr -Blah +Number of epochs to use during the optimization of the embedded coordinates. +By default, this value is set to 500 for datasets containing 10,000 vertices or less, +and 200 otherwise. If n_epochs = 0, then coordinates determined by "init" will be returned. \item \code{learning_rate} :: \code{numeric(1)}\cr -Blah -\item \code{init} :: \code{character(1)}\cr -Blah -\item \code{init_sdev} :: \code{character(1)}\cr -Blah -\item \code{spread} :: \code{character(1)}\cr -Blah -\item \code{min_dist} :: \code{character(1)}\cr -Blah -\item \code{set_op_mix_ratio} :: \code{character(1)}\cr -Blah -\item \code{local_connectivity} :: \code{character(1)}\cr -Blah -\item \code{bandwidth} :: \code{character(1)}\cr -Blah -\item \code{repulsion_strength} :: \code{character(1)}\cr -Blah -\item \code{a} :: \code{character(1)}\cr -Blah -\item \code{b} :: \code{character(1)}\cr -Blah -\item \code{nn_method} :: \code{character(1)}\cr -Blah -\item \code{n_trees} :: \code{character(1)}\cr -Blah -\item \code{search_k} :: \code{character(1)}\cr -Blah -\item \code{approx_pow} :: \code{character(1)}\cr -Blah +Initial learning rate used in optimization of the coordinates. Default is \code{1}. 
+\item \code{init} :: \code{character(1)} | \code{matrix}\cr +Type of initialization for the coordinates. Default is \code{"spectral"}. +\item \code{init_sdev} :: \code{character(1)} | \code{numeric(1)}\cr +Scales each dimension of the initialized coordinates to this standard deviation. +Default is \code{"range"}. +\item \code{spread} :: \code{numeric(1)}\cr +The effective scale of embedded points. In combination with \code{min_dist}, +this determines how clustered/clumped the embedded points are. Default is \code{1}. +\item \code{min_dist} :: \code{numeric(1)}\cr +The effective minimum distance between embedded points. Default is \code{0.01}. +\item \code{set_op_mix_ratio} :: \code{numeric(1)}\cr +Interpolate between (fuzzy) union and intersection as the set operation used to +combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Default is \code{1}. +\item \code{local_connectivity} :: \code{numeric(1)}\cr +The local connectivity required – i.e. the number of nearest neighbors that should be +assumed to be connected at a local level. Default is \code{1}. +\item \code{bandwidth} :: \code{numeric(1)}\cr +The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. +Default is \code{1}. +\item \code{repulsion_strength} :: \code{numeric(1)}\cr +Weighting applied to negative samples in low dimensional embedding optimization. +Values higher than one will result in greater weight being given to negative samples. +Default is \code{1}. +\item \code{negative_sample_rate} :: \code{numeric(1)}\cr +The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample +in optimizing the low dimensional embedding. Default is \code{5}. +\item \code{a} :: \code{any}\cr +More specific parameters controlling the embedding. +If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}. +Default is \code{NULL}. 
+\item \code{b} :: \code{any}\cr +More specific parameters controlling the embedding. +If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}. +Default is \code{NULL}. +\item \code{nn_method} :: \code{character(1)} | named \code{list()} | matrix\cr +Method for finding nearest neighbors. Default is \code{NULL}. +\item \code{n_trees} :: \code{integer(1)}\cr +Number of trees to build when constructing the nearest neighbor index. Default is \code{50}. +\item \code{search_k} :: \code{integer(1)}\cr +Number of nodes to search during the neighbor retrieval. +\item \code{approx_pow} :: \code{logical(1)}\cr +If \code{TRUE}, use an approximation to the power function in the UMAP gradient. +Ignored if \code{dens_scale} is non-NULL. Default is \code{FALSE}. \item \code{y} :: \code{character(1)}\cr -Blah -\item \code{target_n_neighbors} :: \code{character(1)}\cr -Blah +Default is \code{NULL}. +\item \code{target_n_neighbors} :: \code{integer(1)}\cr +Number of nearest neighbors to use to construct the target simplicial set. Default is \code{NULL}. \item \code{target_metric} :: \code{character(1)}\cr -Blah -\item \code{target_weight} :: \code{character(1)}\cr -Blah -\item \code{pca} :: \code{character(1)}\cr -Blah -\item \code{pca_center} :: \code{character(1)}\cr -Blah -\item \code{pca_rand} :: \code{character(1)}\cr -Blah -\item \code{fast_sgd} :: \code{character(1)}\cr -Blah -\item \code{n_threads} :: \code{character(1)}\cr -Blah -\item \code{n_sgd_threads} :: \code{character(1)}\cr -Blah -\item \code{grain_size} :: \code{character(1)}\cr -Blah -\item \code{verbose} :: \code{character(1)}\cr -Blah -\item \code{batch} :: \code{character(1)}\cr -Blah -\item \code{opt_args} :: \code{character(1)}\cr -Blah -\item \code{epoch_callback} :: \code{character(1)}\cr -Blah +The metric used to measure distance for \code{y} if using supervised dimension reduction. +Used only if \code{y} is numeric. 
+\item \code{target_weight} :: \code{numeric(1)}\cr +Weighting factor between data topology and target topology. Default is \code{0.5}. +\item \code{pca} :: \code{integer(1)}\cr +Default is \code{NULL}. +\item \code{pca_center} :: \code{logical(1)}\cr +If \code{TRUE}, center the columns of X before carrying out PCA. +For binary data, it's recommended to set this to \code{FALSE}. Default is \code{TRUE}. +\item \code{pca_rand} :: \code{logical(1)}\cr +Default is \code{TRUE}. +\item \code{fast_sgd} :: \code{logical(1)}\cr +Default is \code{FALSE}. +\item \code{n_threads} :: \code{integer(1)}\cr +Default is \code{NULL}. +\item \code{n_sgd_threads} :: \code{integer(1)}\cr +Default is \code{0}. +\item \code{grain_size} :: \code{integer(1)}\cr +Default is \code{1}. +\item \code{verbose} :: \code{logical(1)}\cr +Should details be logged to the console? Initialized to \code{FALSE}. +\item \code{batch} :: \code{logical(1)}\cr +Default is \code{FALSE}. +\item \code{opt_args} :: named \code{list()}\cr +Default is \code{NULL}. +\item \code{epoch_callback} :: \code{function}\cr +Default is \code{NULL}. \item \code{pca_method} :: \code{character(1)}\cr -Blah -\item \code{binary_edge_weights} :: \code{character(1)}\cr -Blah -\item \code{dens_scale} :: \code{character(1)}\cr -Blah -\item \code{seed} :: \code{character(1)}\cr -Blah -\item \code{nn_args} :: \code{character(1)}\cr -Blah +Default is \code{NULL}. +\item \code{binary_edge_weights} :: \code{logical(1)}\cr +Default is \code{FALSE}. +\item \code{dens_scale} :: \code{numeric(1)}\cr +Default is \code{NULL}. +\item \code{seed} :: \code{integer(1)}\cr +Default is \code{NULL}. +\item \code{nn_args} :: named \code{list()}\cr +Default is \code{NULL}.
} } From 7fbd496653ea2df96cd280cdb8db5a4fd17e60d7 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 11:26:29 +0200 Subject: [PATCH 05/36] docs: only run the examples if uwot available --- R/PipeOpUMAP.R | 2 ++ man/mlr_pipeops_umap.Rd | 2 ++ tests/testthat/test_pipeop_umap.R | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index b029303a0..6c1a53f90 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -142,6 +142,7 @@ #' Only methods inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @examples +#' \dontshow{ if (requireNamespace("uwot")) \{ } #' library("mlr3") #' #' task = tsk("iris") @@ -151,6 +152,7 @@ #' pop$train(list(task))[[1]]$data() #' #' pop$state +#' \dontshow{ \} } #' @family PipeOps #' @template seealso_pipeopslist #' @include PipeOpTaskPreproc.R diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 541f57159..b82b642ad 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -162,6 +162,7 @@ Only methods inherited from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}} } \examples{ +\dontshow{ if (requireNamespace("uwot")) \{ } library("mlr3") task = tsk("iris") @@ -171,6 +172,7 @@ task$data() pop$train(list(task))[[1]]$data() pop$state +\dontshow{ \} } } \seealso{ https://mlr-org.com/pipeops.html diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R index 0faf5e36c..eef01e201 100644 --- a/tests/testthat/test_pipeop_umap.R +++ b/tests/testthat/test_pipeop_umap.R @@ -1,7 +1,7 @@ context("PipeOpUMAP") test_that("PipeOpUMAP - basic properties", { + skip_if_not_installed("uwot") op = PipeOpUMAP$new() - task = mlr_tasks$get("iris") expect_pipeop(op) }) From 0122c89a901b842b56e189a0a05b09cc37d35340 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 12:06:39 +0200 Subject: [PATCH 06/36] docs: more docs --- R/PipeOpUMAP.R | 72 +++++++++++++++++++++++++---------------- man/mlr_pipeops_umap.Rd | 72 
+++++++++++++++++++++++++---------------- 2 files changed, 88 insertions(+), 56 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 6c1a53f90..23f08b465 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -24,7 +24,7 @@ #' The output is the input [`Task`][mlr3::Task] with all affected numeric features replaced by their principal components. #' #' @section State: -#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the class [stats::prcomp], +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the class [uwot::umap2], #' with the exception of the `$x` slot. These are in particular: #' * `sdev` :: `numeric`\cr #' The standard deviations of the principal components. @@ -39,101 +39,117 @@ #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: #' * `n_neighbors` :: `integer(1)`\cr #' The size of the neighborhood used for manifold approximation. Default is `15`. +#' For details see [uwot::umap2()]. #' * `n_components` :: `integer(1)`\cr -#' The dimension of the space to embed into. Default is `2`. +#' The dimension of the space to embed into. Default is `2`. For details see [uwot::umap2()]. #' * `metric` :: `character(1)`\cr #' Type of distance metric to use to find nearest neighbors. Default is `"euclidean"`. +#' For details see [uwot::umap2()]. #' * `n_epochs` :: `integer(1)`\cr -#' Number of epochs to use during the optimization of the embedded coordinates. -#' By default, this value is set to 500 for datasets containing 10,000 vertices or less, -#' and 200 otherwise. If n_epochs = 0, then coordinates determined by "init" will be returned. +#' Number of epochs to use during the optimization of the embedded coordinates. Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `learning_rate` :: `numeric(1)`\cr #' Initial learning rate used in optimization of the coordinates. 
Default is `1`. +#' For details see [uwot::umap2()]. #' * `init` :: `character(1)` | `matrix`\cr #' Type of initialization for the coordinates. Default is `"spectral"`. +#' For details see [uwot::umap2()]. #' * `init_sdev` :: `character(1)` | `numeric(1)`\cr #' Scales each dimension of the initialized coordinates to this standard deviation. -#' Default is `"range"`. +#' Default is `"range"`. For details see [uwot::umap2()]. #' * `spread` :: `numeric(1)`\cr -#' The effective scale of embedded points. In combination with `min_dist`, -#' this determines how clustered/clumped the embedded points are. Default is `1`. +#' The effective scale of embedded points. Default is `1`. For details see [uwot::umap2()]. #' * `min_dist` :: `numeric(1)`\cr #' The effective minimum distance between embedded points. Default is `0.01`. +#' For details see [uwot::umap2()]. #' * `set_op_mix_ratio` :: `numeric(1)`\cr #' Interpolate between (fuzzy) union and intersection as the set operation used to #' combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Default is `1`. +#' For details see [uwot::umap2()]. #' * `local_connectivity` :: `numeric(1)`\cr #' The local connectivity required – i.e. the number of nearest neighbors that should be -#' assumed to be connected at a local level. Default is `1`. +#' assumed to be connected at a local level. Default is `1`. For details see [uwot::umap2()]. #' * `bandwidth` :: `numeric(1)`\cr #' The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. -#' Default is `1`. +#' Default is `1`. For details see [uwot::umap2()]. #' * `repulsion_strength` :: `numeric(1)`\cr #' Weighting applied to negative samples in low dimensional embedding optimization. #' Values higher than one will result in greater weight being given to negative samples. -#' Default is `1`. +#' Default is `1`. For details see [uwot::umap2()]. 
#' * `negative_sample_rate` :: `numeric(1)`\cr #' The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample -#' in optimizing the low dimensional embedding. Default is `5`. +#' in optimizing the low dimensional embedding. Default is `5`. For details see [uwot::umap2()]. #' * `a` :: `any`\cr -#' More specific parameters controlling the embedding. -#' If `NULL` these values are set automatically as determined by `min_dist` and `spread`. -#' Default is `NULL`. +#' More specific parameters controlling the embedding. Default is `NULL`. For details see [uwot::umap2()]. #' * `b` :: `any`\cr -#' More specific parameters controlling the embedding. -#' If `NULL` these values are set automatically as determined by `min_dist` and `spread`. -#' Default is `NULL`. +#' More specific parameters controlling the embedding. Default is `NULL`. For details see [uwot::umap2()]. #' * `nn_method` :: `character(1)` | named `list()` | matrix\cr -#' Method for finding nearest neighbors. Default is `NULL`. +#' Method for finding nearest neighbors. Default is `NULL`. For details see [uwot::umap2()]. #' * `n_trees` :: `integer(1)`\cr #' Number of trees to build when constructing the nearest neighbor index. Default is `50`. +#' For details see [uwot::umap2()]. #' * `search_k` :: `integer(1)`\cr -#' Number of nodes to search during the neighbor retrieval. +#' Number of nodes to search during the neighbor retrieval. For details see [uwot::umap2()]. #' * `approx_pow` :: `logical(1)`\cr -#' If `TRUE`, use an approximation to the power function in the UMAP gradient. -#' Ignored if `dens_scale` is non-NULL. Default is `FALSE`. +#' If `TRUE`, use an approximation to the power function in the UMAP gradient. Default is `FALSE`. +#' For details see [uwot::umap2()]. #' * `y` :: `character(1)`\cr -#' Default is `NULL`. +#' Default is `NULL`. For details see [uwot::umap2()]. 
#' * `target_n_neighbors` :: `integer(1)`\cr #' Number of nearest neighbors to use to construct the target simplicial set. Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `target_metric` :: `character(1)`\cr #' The metric used to measure distance for `y` if using supervised dimension reduction. -#' Used only if `y` is numeric. +#' For details see [uwot::umap2()]. #' * `target_weight` :: `numeric(1)`\cr #' Weighting factor between data topology and target topology. Default is `0.5`. +#' For details see [uwot::umap2()]. #' * `pca` :: `integer(1)`\cr -#' Default is `NULL`. +#' Default is `NULL`. For details see [uwot::umap2()]. #' * `pca_center` :: `logical(1)`\cr -#' If `TRUE`, center the columns of X before carrying out PCA. -#' For binary data, it's recommended to set this to `FALSE`. Default is `TRUE`. +#' If `TRUE`, center the columns of X before carrying out PCA. Default is `TRUE`. +#' For details see [uwot::umap2()]. #' * `pca_rand` :: `logical(1)`\cr #' Default is `TRUE`. +#' For details see [uwot::umap2()]. #' * `fast_sgd` :: `logical(1)`\cr #' Default is `FALSE`. +#' For details see [uwot::umap2()]. #' * `n_threads` :: `integer(1)`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `n_sgd_threads` :: `integer(1)`\cr #' Default is `0`. +#' For details see [uwot::umap2()]. #' * `grain_size` :: `integer(1)`\cr #' Default is `1`. +#' For details see [uwot::umap2()]. #' * `verbose` :: `logical(1)`\cr -#' Should details be logged to the console? Initialized to `FALSE`. +#' Should details be printed? Initialized to `FALSE`. For details see [uwot::umap2()]. #' * `batch` :: `logical(1)`\cr #' Default is `FALSE`. +#' For details see [uwot::umap2()]. #' * `opt_args` :: named `list()`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `epoch_callback` :: `function`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `pca_method` :: `character(1)`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()].
#' * `binary_edge_weights` :: `logical(1)`\cr #' Default is `FALSE`. +#' For details see [uwot::umap2()]. #' * `dens_scale` :: `numeric(1)`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `seed` :: `integer(1)`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `nn_args` :: named `list()`\cr #' Default is `NULL`. +#' For details see [uwot::umap2()]. #' #' @section Internals: #' Uses the [`umap()`][uwot::umap] function. diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index b82b642ad..78f79dfd1 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -33,7 +33,7 @@ The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric \section{State}{ -The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the class \link[stats:prcomp]{stats::prcomp}, +The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the class \link[uwot:umap2]{uwot::umap2}, with the exception of the \verb{$x} slot. These are in particular: \itemize{ \item \code{sdev} :: \code{numeric}\cr @@ -53,101 +53,117 @@ The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}} \itemize{ \item \code{n_neighbors} :: \code{integer(1)}\cr The size of the neighborhood used for manifold approximation. Default is \code{15}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_components} :: \code{integer(1)}\cr -The dimension of the space to embed into. Default is \code{2}. +The dimension of the space to embed into. Default is \code{2}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{metric} :: \code{character(1)}\cr Type of distance metric to use to find nearest neighbors. Default is \code{"euclidean"}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{n_epochs} :: \code{integer(1)}\cr -Number of epochs to use during the optimization of the embedded coordinates. -By default, this value is set to 500 for datasets containing 10,000 vertices or less, -and 200 otherwise. If n_epochs = 0, then coordinates determined by "init" will be returned. +Number of epochs to use during the optimization of the embedded coordinates. Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{learning_rate} :: \code{numeric(1)}\cr Initial learning rate used in optimization of the coordinates. Default is \code{1}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{init} :: \code{character(1)} | \code{matrix}\cr Type of initialization for the coordinates. Default is \code{"spectral"}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{init_sdev} :: \code{character(1)} | \code{numeric(1)}\cr Scales each dimension of the initialized coordinates to this standard deviation. -Default is \code{"range"}. +Default is \code{"range"}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{spread} :: \code{numeric(1)}\cr -The effective scale of embedded points. In combination with \code{min_dist}, -this determines how clustered/clumped the embedded points are. Default is \code{1}. +The effective scale of embedded points. Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{min_dist} :: \code{numeric(1)}\cr The effective minimum distance between embedded points. Default is \code{0.01}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{set_op_mix_ratio} :: \code{numeric(1)}\cr Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Default is \code{1}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{local_connectivity} :: \code{numeric(1)}\cr The local connectivity required – i.e. the number of nearest neighbors that should be -assumed to be connected at a local level. Default is \code{1}. +assumed to be connected at a local level. Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{bandwidth} :: \code{numeric(1)}\cr The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. -Default is \code{1}. +Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{repulsion_strength} :: \code{numeric(1)}\cr Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples. -Default is \code{1}. +Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{negative_sample_rate} :: \code{numeric(1)}\cr The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample -in optimizing the low dimensional embedding. Default is \code{5}. +in optimizing the low dimensional embedding. Default is \code{5}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{a} :: \code{any}\cr -More specific parameters controlling the embedding. -If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}. -Default is \code{NULL}. +More specific parameters controlling the embedding. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{b} :: \code{any}\cr -More specific parameters controlling the embedding. -If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}. -Default is \code{NULL}. +More specific parameters controlling the embedding. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{nn_method} :: \code{character(1)} | named \code{list()} | matrix\cr -Method for finding nearest neighbors. Default is \code{NULL}. +Method for finding nearest neighbors. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_trees} :: \code{integer(1)}\cr Number of trees to build when constructing the nearest neighbor index. Default is \code{50}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{search_k} :: \code{integer(1)}\cr -Number of nodes to search during the neighbor retrieval. +Number of nodes to search during the neighbor retrieval. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{approx_pow} :: \code{logical(1)}\cr -If \code{TRUE}, use an approximation to the power function in the UMAP gradient. -Ignored if \code{dens_scale} is non-NULL. Default is \code{FALSE}. +If \code{TRUE}, use an approximation to the power function in the UMAP gradient. Default is \code{FALSE}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{y} :: \code{character(1)}\cr -Default is \code{NULL}. +Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_n_neighbors} :: \code{integer(1)}\cr Number of nearest neighbors to use to construct the target simplicial set. Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_metric} :: \code{character(1)}\cr The metric used to measure distance for \code{y} if using supervised dimension reduction. -Used only if \code{y} is numeric. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_weight} :: \code{numeric(1)}\cr Weighting factor between data topology and target topology. Default is \code{0.5}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca} :: \code{integer(1)}\cr -Default is \code{NULL}. +Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{pca_center} :: \code{logical(1)}\cr -If \code{TRUE}, center the columns of X before carrying out PCA. -For binary data, it's recommended to set this to \code{FALSE}. Default is \code{TRUE}. +If \code{TRUE}, center the columns of X before carrying out PCA. Default is \code{TRUE}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_rand} :: \code{logical(1)}\cr Default is \code{TRUE}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{fast_sgd} :: \code{logical(1)}\cr Default is \code{FALSE}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_threads} :: \code{integer(1)}\cr Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_sgd_threads} :: \code{integer(1)}\cr Default is \code{0}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{grain_size} :: \code{integer(1)}\cr Default is \code{1}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{verbose} :: \code{logical(1)}\cr -Should details be logged to the console? Initialized to \code{FALSE}. +Should details be printed? Initialized to \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{batch} :: \code{logical(1)}\cr Default is \code{FALSE}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{opt_args} :: named \code{list()}\cr Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{epoch_callback} :: \code{function}\cr Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_method} :: \code{character(1)}\cr Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{binary_edge_weights} :: \code{logical(1)}\cr Default is \code{FALSE}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{dens_scale} :: \code{numeric(1)}\cr Default is \code{NULL}.
+For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{seed} :: \code{integer(1)}\cr Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_args} :: named \code{list()}\cr Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. } } From 0017c4e0918f675cb7e4ac2a34cfaf421710c6f4 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 12:20:42 +0200 Subject: [PATCH 07/36] docs: finish param docs --- R/PipeOpUMAP.R | 54 +++++++++++++++++++++------------------ man/mlr_pipeops_umap.Rd | 56 +++++++++++++++++++++++------------------ 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 23f08b465..a15e0774f 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -74,7 +74,6 @@ #' Default is `1`. For details see [uwot::umap2()]. #' * `repulsion_strength` :: `numeric(1)`\cr #' Weighting applied to negative samples in low dimensional embedding optimization. -#' Values higher than one will result in greater weight being given to negative samples. #' Default is `1`. For details see [uwot::umap2()]. #' * `negative_sample_rate` :: `numeric(1)`\cr #' The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample @@ -93,8 +92,9 @@ #' * `approx_pow` :: `logical(1)`\cr #' If `TRUE`, use an approximation to the power function in the UMAP gradient. Default is `FALSE`. #' For details see [uwot::umap2()]. -#' * `y` :: `character(1)`\cr -#' Default is `NULL`. For details see [uwot::umap2()]. +#' * `y` :: `any`\cr +#' Optional target data for supervised dimension reduction. Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `target_n_neighbors` :: `integer(1)`\cr #' Number of nearest neighbors to use to construct the target simplicial set. Default is `NULL`. #' For details see [uwot::umap2()]. @@ -105,51 +105,55 @@ #' Weighting factor between data topology and target topology. Default is `0.5`. 
#' For details see [uwot::umap2()]. #' * `pca` :: `integer(1)`\cr -#' Default is `NULL`. For details see [uwot::umap2()]. +#' Reduce data to this number of columns using PCA. Default is `NULL`. +#' For details see [uwot::umap2()]. #' * `pca_center` :: `logical(1)`\cr #' If `TRUE`, center the columns of X before carrying out PCA. Default is `TRUE`. #' For details see [uwot::umap2()]. #' * `pca_rand` :: `logical(1)`\cr -#' Default is `TRUE`. -#' For details see [uwot::umap2()]. +#' If `TRUE`, use the PCG random number generator (O'Neill, 2014) during optimization. +#' Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. +#' Default is `TRUE`. For details see [uwot::umap2()]. #' * `fast_sgd` :: `logical(1)`\cr -#' Default is `FALSE`. -#' For details see [uwot::umap2()]. +#' If `TRUE`, then the following combination of parameters is set: +#' * `pcg_rand = TRUE` +#' * `n_sgd_threads = "auto"` +#' * `approx_pow = TRUE` +#' Default is `FALSE`. For details see [uwot::umap2()]. #' * `n_threads` :: `integer(1)`\cr -#' Default is `NULL`. -#' For details see [uwot::umap2()]. +#' Number of threads to use. Default is `NULL`. For details see [uwot::umap2()]. #' * `n_sgd_threads` :: `integer(1)`\cr -#' Default is `0`. +#' Number of threads to use during stochastic gradient descent. Default is `0`. #' For details see [uwot::umap2()]. #' * `grain_size` :: `integer(1)`\cr -#' Default is `1`. +#' The minimum amount of work to do on each thread. Default is `1`. #' For details see [uwot::umap2()]. #' * `verbose` :: `logical(1)`\cr #' Should details be printed? Initialized to `FALSE`. For details see [uwot::umap2()]. #' * `batch` :: `logical(1)`\cr -#' Default is `FALSE`. -#' For details see [uwot::umap2()]. +#' If `TRUE`, then embedding coordinates are updated at the end of each epoch rather +#' than during the epoch. Default is `FALSE`. For details see [uwot::umap2()]. #' * `opt_args` :: named `list()`\cr -#' Default is `NULL`.
+#' A list of optimizer parameters, used when `batch = TRUE`. Default is `NULL`. #' For details see [uwot::umap2()]. #' * `epoch_callback` :: `function`\cr -#' Default is `NULL`. +#' A function which will be invoked at the end of every epoch. Default is `NULL`. #' For details see [uwot::umap2()]. #' * `pca_method` :: `character(1)`\cr -#' Default is `NULL`. -#' For details see [uwot::umap2()]. +#' Method to carry out any PCA dimensionality reduction when the `pca` is specified. +#' Default is `NULL`. For details see [uwot::umap2()]. #' * `binary_edge_weights` :: `logical(1)`\cr -#' Default is `FALSE`. -#' For details see [uwot::umap2()]. +#' If TRUE then edge weights in the input graph are treated as binary (0/1) rather than real valued. +#' Default is `FALSE`. For details see [uwot::umap2()]. #' * `dens_scale` :: `numeric(1)`\cr -#' Default is `NULL`. +#' A scaling factor to apply to the density of the input data. Default is `NULL`. #' For details see [uwot::umap2()]. #' * `seed` :: `integer(1)`\cr -#' Default is `NULL`. -#' For details see [uwot::umap2()]. +#' Integer seed to use to initialize the random number generator state. +#' Default is `NULL`. For details see [uwot::umap2()]. #' * `nn_args` :: named `list()`\cr -#' Default is `NULL`. -#' For details see [uwot::umap2()]. +#' A list containing additional arguments to pass to the nearest neighbor method. +#' Default is `NULL`. For details see [uwot::umap2()]. #' #' @section Internals: #' Uses the [`umap()`][uwot::umap] function. diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 78f79dfd1..8b87a30ac 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -88,7 +88,6 @@ The effective bandwidth of the kernel if we view the algorithm as similar to Lap Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{repulsion_strength} :: \code{numeric(1)}\cr Weighting applied to negative samples in low dimensional embedding optimization. 
-Values higher than one will result in greater weight being given to negative samples. Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{negative_sample_rate} :: \code{numeric(1)}\cr The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample @@ -107,8 +106,9 @@ Number of nodes to search during the neighbor retrieval. For details see \code{\ \item \code{approx_pow} :: \code{logical(1)}\cr If \code{TRUE}, use an approximation to the power function in the UMAP gradient. Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{y} :: \code{character(1)}\cr -Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +\item \code{y} :: \code{any}\cr +Optional target data for supervised dimension reduction. Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_n_neighbors} :: \code{integer(1)}\cr Number of nearest neighbors to use to construct the target simplicial set. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. @@ -119,51 +119,57 @@ For details see \code{\link[uwot:umap2]{uwot::umap2()}}. Weighting factor between data topology and target topology. Default is \code{0.5}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca} :: \code{integer(1)}\cr -Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Redude data to this number of columns using PCA. Default is \code{NULL}. +For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_center} :: \code{logical(1)}\cr If \code{TRUE}, center the columns of X before carrying out PCA. Default is \code{TRUE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_rand} :: \code{logical(1)}\cr -Default is \code{TRUE}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
+If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. +Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. +Default is \code{TRUE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{fast_sgd} :: \code{logical(1)}\cr -Default is \code{FALSE}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +If \code{TRUE}, then the following combination of parameters is set: +\itemize{ +\item \code{pcg_rand = TRUE} +\item \code{n_sgd_threads = "auto"} +\item \code{approx_pow = TRUE} +Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +} \item \code{n_threads} :: \code{integer(1)}\cr -Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Number of threads to use. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_sgd_threads} :: \code{integer(1)}\cr -Default is \code{0}. +Number of threads to use during stochastic gradient descent. Default is \code{0}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{grain_size} :: \code{integer(1)}\cr -Default is \code{1}. +The minimum amount of work to do on each thread. Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{verbose} :: \code{logical(1)}\cr Should details be printed? Initialzed to \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{batch} :: \code{logical(1)}\cr -Default is \code{FALSE}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather +than during the epoch. Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{opt_args} :: named \code{list()}\cr -Default is \code{NULL}. +A list of optimizer parameters, used when \code{batch = TRUE}. Default is \code{NULL}. 
For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{epoch_callback} :: \code{function}\cr -Default is \code{NULL}. +A function which will be invoked at the end of every epoch. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_method} :: \code{character(1)}\cr -Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Method to carry out any PCA dimensionality reduction when the \code{pca} is specified. +Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{binary_edge_weights} :: \code{logical(1)}\cr -Default is \code{FALSE}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +If TRUE then edge weights in the input graph are treated as binary (0/1) rather than real valued. +Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{dens_scale} :: \code{numeric(1)}\cr -Default is \code{NULL}. +A scaling factor to apply to the density of the input data. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{seed} :: \code{integer(1)}\cr -Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Integer seed to use to initialize the random number generator state. +Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_args} :: named \code{list()}\cr -Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +A list containing additional arguments to pass to the nearest neighbor method. +Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
} } From 7a17a3a733e33caed992ac83535ec7963f51224f Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 12:49:11 +0200 Subject: [PATCH 08/36] docs: init docs for state --- R/PipeOpUMAP.R | 67 +++++++++++++++++++++++++++++++++++------ man/mlr_pipeops_umap.Rd | 66 ++++++++++++++++++++++++++++++++++------ 2 files changed, 113 insertions(+), 20 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index a15e0774f..d23f1222a 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -24,16 +24,62 @@ #' The output is the input [`Task`][mlr3::Task] with all affected numeric features replaced by their principal components. #' #' @section State: -#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the class [uwot::umap2], -#' with the exception of the `$x` slot. These are in particular: -#' * `sdev` :: `numeric`\cr -#' The standard deviations of the principal components. -#' * `rotation` :: `matrix`\cr -#' The matrix of variable loadings. -#' * `center` :: `numeric` | `logical(1)`\cr -#' The centering used, or `FALSE`. -#' * `scale` :: `numeric` | `logical(1)`\cr -#' The scaling used, or `FALSE`. +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the class [uwot::umap2]. 
+#' These are in particular: +#' * `embedding` :: `matrix`\cr +#' Blah +#' * `scale_info` :: `any`\cr +#' Blah +#' * `search_k` :: `numeric(1)`\cr +#' Blah +#' * `local_connectivity` :: `numeric(1)`\cr +#' Blah +#' * `n_epochs` :: `numeric(1)`\cr +#' Blah +#' * `alpha` :: `numeric(1)`\cr +#' Blah +#' * `negative_sample_rate` :: `numeric(1)`\cr +#' Blah +#' * `method` :: `character(1)`\cr +#' Blah +#' * `a` :: named `numeric(1)`\cr +#' Blah +#' * `b` :: named `numeric(1)`\cr +#' Blah +#' * `gamma` :: `numeric(1)`\cr +#' Blah +#' * `approx_pow` :: `logical(1)`\cr +#' Blah +#' * `metric` :: named `list()`\cr +#' Blah +#' * `norig_col` :: `integer(1)`\cr +#' Blah +#' * `pcg_rand` :: `logical(1)`\cr +#' Blah +#' * `batch` :: `logical(1)`\cr +#' Blah +#' * `opt_args` :: named `list()`\cr +#' Blah +#' * `num_precomputed_nns` :: `numeric(1)`\cr +#' Blah +#' * `min_dist` :: `numeric(1)`\cr +#' Blah +#' * `spread` :: `numeric(1)`\cr +#' Blah +#' * `binary_edge_weights` :: `logical(1)`\cr +#' Blah +#' * `seed` :: `integer(1)`\cr +#' Blah +#' * `nn_method` :: `any`\cr +#' Blah +#' * `nn_args` :: `list()`\cr +#' Blah +#' * `n_neighbors` :: `numeric(1)`\cr +#' Blah +#' * `nn_index` :: named `list()`\cr +#' Blah +#' * `pca_models` :: `list()`\cr +#' Blah #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: @@ -253,6 +299,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", .train_dt = function(dt, levels, target) { params = insert_named(self$param_set$get_values(tags = "umap"), list(ret_model = TRUE)) umap = invoke(uwot::umap2, dt, .args = params) + browser() self$state = umap umap$embedding }, diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 8b87a30ac..134824f43 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -33,17 +33,63 @@ The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric \section{State}{ -The \verb{$state} is a named \code{list} with the \verb{$state} 
elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the class \link[uwot:umap2]{uwot::umap2}, -with the exception of the \verb{$x} slot. These are in particular: +The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the class \link[uwot:umap2]{uwot::umap2}. +These are in particular: \itemize{ -\item \code{sdev} :: \code{numeric}\cr -The standard deviations of the principal components. -\item \code{rotation} :: \code{matrix}\cr -The matrix of variable loadings. -\item \code{center} :: \code{numeric} | \code{logical(1)}\cr -The centering used, or \code{FALSE}. -\item \code{scale} :: \code{numeric} | \code{logical(1)}\cr -The scaling used, or \code{FALSE}. +\item \code{embedding} :: \code{matrix}\cr +Blah +\item \code{scale_info} :: \code{any}\cr +Blah +\item \code{search_k} :: \code{numeric(1)}\cr +Blah +\item \code{local_connectivity} :: \code{numeric(1)}\cr +Blah +\item \code{n_epochs} :: \code{numeric(1)}\cr +Blah +\item \code{alpha} :: \code{numeric(1)}\cr +Blah +\item \code{negative_sample_rate} :: \code{numeric(1)}\cr +Blah +\item \code{method} :: \code{character(1)}\cr +Blah +\item \code{a} :: named \code{numeric(1)}\cr +Blah +\item \code{b} :: named \code{numeric(1)}\cr +Blah +\item \code{gamma} :: \code{numeric(1)}\cr +Blah +\item \code{approx_pow} :: \code{logical(1)}\cr +Blah +\item \code{metric} :: named \code{list()}\cr +Blah +\item \code{norig_col} :: \code{integer(1)}\cr +Blah +\item \code{pcg_rand} :: \code{logical(1)}\cr +Blah +\item \code{batch} :: \code{logical(1)}\cr +Blah +\item \code{opt_args} :: named \code{list()}\cr +Blah +\item \code{num_precomputed_nns} :: \code{numeric(1)}\cr +Blah +\item \code{min_dist} :: \code{numeric(1)}\cr +Blah +\item \code{spread} :: \code{numeric(1)}\cr +Blah +\item \code{binary_edge_weights} :: \code{logical(1)}\cr +Blah +\item \code{seed} :: \code{integer(1)}\cr +Blah +\item 
\code{nn_method} :: \code{any}\cr +Blah +\item \code{nn_args} :: \code{list()}\cr +Blah +\item \code{n_neighbors} :: \code{numeric(1)}\cr +Blah +\item \code{nn_index} :: named \code{list()}\cr +Blah +\item \code{pca_models} :: \code{list()}\cr +Blah } } From 4188dcde17f3024345e377929af2f934f979fec0 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 17:00:21 +0200 Subject: [PATCH 09/36] fix: remove browser call --- R/PipeOpUMAP.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index d23f1222a..e6ff02c61 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -299,7 +299,6 @@ PipeOpUMAP = R6Class("PipeOpUMAP", .train_dt = function(dt, levels, target) { params = insert_named(self$param_set$get_values(tags = "umap"), list(ret_model = TRUE)) umap = invoke(uwot::umap2, dt, .args = params) - browser() self$state = umap umap$embedding }, From 56e43fe2fcae286c88e13d0be596cda22849adfb Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 20:00:25 +0200 Subject: [PATCH 10/36] feat(umap): allow more metrics and remove depend --- R/PipeOpUMAP.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index e6ff02c61..90107ad0f 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -231,10 +231,14 @@ PipeOpUMAP = R6Class("PipeOpUMAP", n_neighbors = p_int(2L, 100L, default = 15L, tags = c("train", "umap")), n_components = p_int(1L, 100L, default = 2L, tags = c("train", "umap")), metric = p_fct( - c("euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical"), + levels = c( + "euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical", + "braycurtis", "canberra", "chebyshev", "dice", "hamming", "hellinger", "jaccard", + "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", + "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" + ), default = "euclidean", - tags = c("train", "umap"), - depends = 
quote(nn_method == "hnsw") + tags = c("train", "umap") ), n_epochs = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), learning_rate = p_dbl(0, default = 1, tags = c("train", "umap")), From 9ae3780783916eb733918f116eab3bd67c118992 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 30 Jul 2024 20:11:41 +0200 Subject: [PATCH 11/36] docs: added most state parameters --- R/PipeOpUMAP.R | 157 +++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 71 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index d23f1222a..0ca74f894 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -6,7 +6,7 @@ #' #' @description #' Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP). -#' See [uwot::umap2()] for details. +#' For details, see [uwot::umap2()]. #' #' @section Construction: #' ``` @@ -24,58 +24,65 @@ #' The output is the input [`Task`][mlr3::Task] with all affected numeric features replaced by their principal components. #' #' @section State: -#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the class [uwot::umap2]. +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as the elements of the list +#' returned from [uwot::umap2]. #' These are in particular: #' * `embedding` :: `matrix`\cr -#' Blah +#' Matrix of embedded coordinates. #' * `scale_info` :: `any`\cr #' Blah #' * `search_k` :: `numeric(1)`\cr -#' Blah +#' Number of nodes searched during the neighbor retrieval. Only used if the `nn_method` is `"annoy"`. +#' For details, see [uwot::umap2()]. #' * `local_connectivity` :: `numeric(1)`\cr -#' Blah +#' Used local connectivity – i.e. the number of nearest neighbors that should be +#' assumed to be connected at a local level. For details, see [uwot::umap2()].
#' * `n_epochs` :: `numeric(1)`\cr -#' Blah +#' Number of epochs used during the optimization of the embedded coordinates. For details, see [uwot::umap2()]. #' * `alpha` :: `numeric(1)`\cr -#' Blah +#' Initial learning rate. For details, see [uwot::umap2()]. #' * `negative_sample_rate` :: `numeric(1)`\cr -#' Blah +#' The number of negative edge/1-simplex samples used per positive edge/1-simplex sample +#' in optimizing the low dimensional embedding. For details, see [uwot::umap2()]. #' * `method` :: `character(1)`\cr #' Blah #' * `a` :: named `numeric(1)`\cr -#' Blah +#' More specific parameters controlling the embedding. For details, see [uwot::umap2()]. #' * `b` :: named `numeric(1)`\cr -#' Blah +#' More specific parameters controlling the embedding. For details, see [uwot::umap2()]. #' * `gamma` :: `numeric(1)`\cr #' Blah #' * `approx_pow` :: `logical(1)`\cr -#' Blah +#' If `TRUE`, use an approximation to the power function in the UMAP gradient. For details, see [uwot::umap2()]. #' * `metric` :: named `list()`\cr -#' Blah +#' Type of distance metric used to find nearest neighbors. For details, see [uwot::umap2()]. #' * `norig_col` :: `integer(1)`\cr -#' Blah +#' Number of original columns. #' * `pcg_rand` :: `logical(1)`\cr -#' Blah +#' `TRUE`, if the PCG random number generator (O'Neill, 2014) was used during optimization. +#' Otherwise, Tausworthe "taus88" generator was used. For details, see [uwot::umap2()]. #' * `batch` :: `logical(1)`\cr -#' Blah +#' `TRUE`, if embedding coordinates were updated at the end of each epoch rather +#' than during the epoch. For details, see [uwot::umap2()]. #' * `opt_args` :: named `list()`\cr -#' Blah +#' Optimizer parameters, used when `batch = TRUE`. For details, see [uwot::umap2()]. #' * `num_precomputed_nns` :: `numeric(1)`\cr #' Blah #' * `min_dist` :: `numeric(1)`\cr -#' Blah +#' The effective minimum distance between embedded points. For details, see [uwot::umap2()]. 
#' * `spread` :: `numeric(1)`\cr -#' Blah +#' The effective scale of embedded points. For details, see [uwot::umap2()]. #' * `binary_edge_weights` :: `logical(1)`\cr -#' Blah +#' If `TRUE` then edge weights in the input graph were treated as binary (0/1) rather than real valued. +#' For details, see [uwot::umap2()]. #' * `seed` :: `integer(1)`\cr #' Blah #' * `nn_method` :: `any`\cr -#' Blah +#' Method for finding nearest neighbors. For details, see [uwot::umap2()]. #' * `nn_args` :: `list()`\cr #' Blah #' * `n_neighbors` :: `numeric(1)`\cr -#' Blah +#' The size of the neighborhood used for manifold approximation. For details, see [uwot::umap2()]. #' * `nn_index` :: named `list()`\cr #' Blah #' * `pca_models` :: `list()`\cr @@ -85,124 +92,125 @@ #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: #' * `n_neighbors` :: `integer(1)`\cr #' The size of the neighborhood used for manifold approximation. Default is `15`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `n_components` :: `integer(1)`\cr -#' The dimension of the space to embed into. Default is `2`. For details see [uwot::umap2()]. +#' The dimension of the space to embed into. Default is `2`. For details, see [uwot::umap2()]. #' * `metric` :: `character(1)`\cr #' Type of distance metric to use to find nearest neighbors. Default is `"euclidean"`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `n_epochs` :: `integer(1)`\cr #' Number of epochs to use during the optimization of the embedded coordinates. Default is `NULL`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `learning_rate` :: `numeric(1)`\cr #' Initial learning rate used in optimization of the coordinates. Default is `1`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `init` :: `character(1)` | `matrix`\cr #' Type of initialization for the coordinates. Default is `"spectral"`. 
-#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `init_sdev` :: `character(1)` | `numeric(1)`\cr #' Scales each dimension of the initialized coordinates to this standard deviation. -#' Default is `"range"`. For details see [uwot::umap2()]. +#' Default is `"range"`. For details, see [uwot::umap2()]. #' * `spread` :: `numeric(1)`\cr -#' The effective scale of embedded points. Default is `1`. For details see [uwot::umap2()]. +#' The effective scale of embedded points. Default is `1`. For details, see [uwot::umap2()]. #' * `min_dist` :: `numeric(1)`\cr #' The effective minimum distance between embedded points. Default is `0.01`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `set_op_mix_ratio` :: `numeric(1)`\cr #' Interpolate between (fuzzy) union and intersection as the set operation used to #' combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Default is `1`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `local_connectivity` :: `numeric(1)`\cr #' The local connectivity required – i.e. the number of nearest neighbors that should be -#' assumed to be connected at a local level. Default is `1`. For details see [uwot::umap2()]. +#' assumed to be connected at a local level. Default is `1`. For details, see [uwot::umap2()]. #' * `bandwidth` :: `numeric(1)`\cr #' The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. -#' Default is `1`. For details see [uwot::umap2()]. +#' Default is `1`. For details, see [uwot::umap2()]. #' * `repulsion_strength` :: `numeric(1)`\cr #' Weighting applied to negative samples in low dimensional embedding optimization. -#' Default is `1`. For details see [uwot::umap2()]. +#' Default is `1`. For details, see [uwot::umap2()]. 
#' * `negative_sample_rate` :: `numeric(1)`\cr #' The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample -#' in optimizing the low dimensional embedding. Default is `5`. For details see [uwot::umap2()]. +#' in optimizing the low dimensional embedding. Default is `5`. For details, see [uwot::umap2()]. #' * `a` :: `any`\cr -#' More specific parameters controlling the embedding. Default is `NULL`. For details see [uwot::umap2()]. +#' More specific parameters controlling the embedding. Default is `NULL`. For details, see [uwot::umap2()]. #' * `b` :: `any`\cr -#' More specific parameters controlling the embedding. Default is `NULL`. For details see [uwot::umap2()]. +#' More specific parameters controlling the embedding. Default is `NULL`. For details, see [uwot::umap2()]. #' * `nn_method` :: `character(1)` | named `list()` | matrix\cr -#' Method for finding nearest neighbors. Default is `NULL`. For details see [uwot::umap2()]. +#' Method for finding nearest neighbors. Default is `NULL`. For details, see [uwot::umap2()]. #' * `n_trees` :: `integer(1)`\cr #' Number of trees to build when constructing the nearest neighbor index. Default is `50`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `search_k` :: `integer(1)`\cr -#' Number of nodes to search during the neighbor retrieval. For details see [uwot::umap2()]. +#' Number of nodes to search during the neighbor retrieval. Only used if the `nn_method` is `"annoy"`. +#' For details, see [uwot::umap2()]. #' * `approx_pow` :: `logical(1)`\cr #' If `TRUE`, use an approximation to the power function in the UMAP gradient. Default is `FALSE`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `y` :: `any`\cr #' Optional target data for supervised dimension reduction. Default is `NULL`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. 
#' * `target_n_neighbors` :: `integer(1)`\cr -#' Number of nearest neighbors to use to construct the target simplicial set. Default is `NULL`. -#' For details see [uwot::umap2()]. +#' Number of nearest neighbors to use to construct the target simplicial set. Default is `n_neighbors`. +#' For details, see [uwot::umap2()]. #' * `target_metric` :: `character(1)`\cr #' The metric used to measure distance for `y` if using supervised dimension reduction. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `target_weight` :: `numeric(1)`\cr #' Weighting factor between data topology and target topology. Default is `0.5`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `pca` :: `integer(1)`\cr #' Redude data to this number of columns using PCA. Default is `NULL`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `pca_center` :: `logical(1)`\cr #' If `TRUE`, center the columns of X before carrying out PCA. Default is `TRUE`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `pca_rand` :: `logical(1)`\cr #' If `TRUE`, use the PCG random number generator (O'Neill, 2014) during optimization. #' Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. -#' Default is `TRUE`. For details see [uwot::umap2()]. +#' Default is `TRUE`. For details, see [uwot::umap2()]. #' * `fast_sgd` :: `logical(1)`\cr #' If `TRUE`, then the following combination of parameters is set: #' * `pcg_rand = TRUE` #' * `n_sgd_threads = "auto"` #' * `approx_pow = TRUE` -#' Default is `FALSE`. For details see [uwot::umap2()]. +#' Default is `FALSE`. For details, see [uwot::umap2()]. #' * `n_threads` :: `integer(1)`\cr -#' Number of threads to use. Default is `NULL`. For details see [uwot::umap2()]. +#' Number of threads to use. Default is `NULL`. For details, see [uwot::umap2()]. 
#' * `n_sgd_threads` :: `integer(1)`\cr #' Number of threads to use during stochastic gradient descent. Default is `0`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `grain_size` :: `integer(1)`\cr #' The minimum amount of work to do on each thread. Default is `1`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `verbose` :: `logical(1)`\cr -#' Should details be printed? Initialzed to `FALSE`. For details see [uwot::umap2()]. +#' Should details be printed? Initialized to `FALSE`. For details, see [uwot::umap2()]. #' * `batch` :: `logical(1)`\cr #' If `TRUE`, then embedding coordinates are updated at the end of each epoch rather -#' than during the epoch. Default is `FALSE`. For details see [uwot::umap2()]. +#' than during the epoch. Default is `FALSE`. For details, see [uwot::umap2()]. #' * `opt_args` :: named `list()`\cr #' A list of optimizer parameters, used when `batch = TRUE`. Default is `NULL`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `epoch_callback` :: `function`\cr #' A function which will be invoked at the end of every epoch. Default is `NULL`. -#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `pca_method` :: `character(1)`\cr #' Method to carry out any PCA dimensionality reduction when the `pca` is specified. -#' Default is `NULL`. For details see [uwot::umap2()]. +#' Default is `NULL`. For details, see [uwot::umap2()]. #' * `binary_edge_weights` :: `logical(1)`\cr -#' If TRUE then edge weights in the input graph are treated as binary (0/1) rather than real valued. -#' Default is `FALSE`. For details see [uwot::umap2()]. +#' If `TRUE` then edge weights in the input graph are treated as binary (0/1) rather than real valued. +#' Default is `FALSE`. For details, see [uwot::umap2()]. #' * `dens_scale` :: `numeric(1)`\cr #' A scaling factor to apply to the density of the input data. Default is `NULL`.
-#' For details see [uwot::umap2()]. +#' For details, see [uwot::umap2()]. #' * `seed` :: `integer(1)`\cr #' Integer seed to use to initialize the random number generator state. -#' Default is `NULL`. For details see [uwot::umap2()]. +#' Default is `NULL`. For details, see [uwot::umap2()]. #' * `nn_args` :: named `list()`\cr #' A list containing additional arguments to pass to the nearest neighbor method. -#' Default is `NULL`. For details see [uwot::umap2()]. +#' Default is `NULL`. For details, see [uwot::umap2()]. #' #' @section Internals: -#' Uses the [`umap()`][uwot::umap] function. +#' Uses the [`umap()`][uwot::umap2] function. #' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. @@ -231,11 +239,15 @@ PipeOpUMAP = R6Class("PipeOpUMAP", n_neighbors = p_int(2L, 100L, default = 15L, tags = c("train", "umap")), n_components = p_int(1L, 100L, default = 2L, tags = c("train", "umap")), metric = p_fct( - c("euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical"), + levels = c( + "euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical", + "braycurtis", "canberra", "chebyshev", "dice", "hamming", "hellinger", "jaccard", + "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", + "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" + ), default = "euclidean", - tags = c("train", "umap"), - depends = quote(nn_method == "hnsw") - ), + tags = c("train", "umap") + ), # why not all? 
n_epochs = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), learning_rate = p_dbl(0, default = 1, tags = c("train", "umap")), scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), @@ -270,7 +282,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", search_k = p_int(tags = c("train", "umap")), approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), y = p_uty(default = NULL, tags = c("train", "umap")), - target_n_neighbors = p_int(tags = c("train", "umap")), + target_n_neighbors = p_int(tags = c("train", "umap")), # default = n_neighbors target_metric = p_fct(c("euclidean", "cosine", "correlation"), default = "euclidean", tags = c("train", "umap")), target_weight = p_dbl(0, 1, default = 0.5, tags = c("train", "umap")), pca = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), @@ -283,7 +295,11 @@ PipeOpUMAP = R6Class("PipeOpUMAP", verbose = p_lgl(default = TRUE, tags = c("train", "umap")), batch = p_lgl(default = FALSE, tags = c("train", "umap")), opt_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = crate(function(x) check_list(x, null.ok = TRUE))), - epoch_callback = p_uty(default = NULL, tags = c("train", "umap"), custom_check = check_function_or_null), + epoch_callback = p_uty( + default = NULL, + tags = c("train", "umap"), + custom_check = crate(function(x) check_function(x, args = c("epochs", "n_epochs", "coords"), null.ok = TRUE)) + ), pca_method = p_fct(c("irlba", "rsvd", "bigstatsr", "svd", "auto"), default = NULL, special_vals = list(NULL), tags = c("train", "umap")), binary_edge_weights = p_lgl(default = FALSE, tags = c("train", "umap")), dens_scale = p_dbl(0, 1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), @@ -299,7 +315,6 @@ PipeOpUMAP = R6Class("PipeOpUMAP", .train_dt = function(dt, levels, target) { params = insert_named(self$param_set$get_values(tags = "umap"), list(ret_model 
= TRUE)) umap = invoke(uwot::umap2, dt, .args = params) - browser() self$state = umap umap$embedding }, From 8850d83af523a93d0a84c7f526e6ab1f83db0656 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 30 Jul 2024 20:24:21 +0200 Subject: [PATCH 12/36] fixed merge --- R/PipeOpUMAP.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index b3572eca6..947814fd3 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -247,11 +247,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", ), default = "euclidean", tags = c("train", "umap") -<<<<<<< HEAD - ), # why not all? -======= ), ->>>>>>> 56e43fe2fcae286c88e13d0be596cda22849adfb n_epochs = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), learning_rate = p_dbl(0, default = 1, tags = c("train", "umap")), scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), From 18000b30316bfc039701c89b63f1c5dc10bf6dc5 Mon Sep 17 00:00:00 2001 From: Maximilian Muecke Date: Tue, 30 Jul 2024 20:38:13 +0200 Subject: [PATCH 13/36] docs: redocument --- R/PipeOpUMAP.R | 2 +- man/mlr_pipeops_umap.Rd | 136 +++++++++++++++++++++------------------- 2 files changed, 73 insertions(+), 65 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 947814fd3..40afea686 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -210,7 +210,7 @@ #' Default is `NULL`. For details, see [uwot::umap2()]. #' #' @section Internals: -#' Uses the [`umap()`][uwot::umap2] function. +#' Uses the [umap2()][uwot::umap2] function. #' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 134824f43..1bf05c50a 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -9,7 +9,7 @@ } \description{ Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP). 
-See \code{\link[uwot:umap2]{uwot::umap2()}} for details. +See \code{\link[uwot:umap2]{uwot::umap2()}} For details,. } \section{Construction}{ @@ -33,59 +33,66 @@ The output is the input \code{\link[mlr3:Task]{Task}} with all affected numeric \section{State}{ -The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the class \link[uwot:umap2]{uwot::umap2}. +The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as the elements of the list +returned from \link[uwot:umap2]{uwot::umap2}. These are in particular: \itemize{ \item \code{embedding} :: \code{matrix}\cr -Blah +Matrix of embedded coordinates. \item \code{scale_info} :: \code{any}\cr Blah \item \code{search_k} :: \code{numeric(1)}\cr -Blah +Number of nodes searched during the neighbor retrieval. Only used if the \code{nn_method} is \code{"annoy"}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{local_connectivity} :: \code{numeric(1)}\cr -Blah +Used local connectivity – i.e. the number of nearest neighbors that should be +assumed to be connected at a local level. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_epochs} :: \code{numeric(1)}\cr -Blah +Number of epochs used during the optimization of the embedded coordinates. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{alpha} :: \code{numeric(1)}\cr -Blah +Initial learning rate. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{negative_sample_rate} :: \code{numeric(1)}\cr -Blah +The number of negative edge/1-simplex samples used per positive edge/1-simplex sample +in optimizing the low dimensional embedding. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{method} :: \code{character(1)}\cr Blah \item \code{a} :: named \code{numeric(1)}\cr -Blah +More specific parameters controlling the embedding. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{b} :: named \code{numeric(1)}\cr -Blah +More specific parameters controlling the embedding. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{gamma} :: \code{numeric(1)}\cr Blah \item \code{approx_pow} :: \code{logical(1)}\cr -Blah +If \code{TRUE}, use an approximation to the power function in the UMAP gradient. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{metric} :: named \code{list()}\cr -Blah +Type of distance metric used to find nearest neighbors. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{norig_col} :: \code{integer(1)}\cr -Blah +Number of original columns. \item \code{pcg_rand} :: \code{logical(1)}\cr -Blah +\code{TRUE}, if the PCG random number generator (O'Neill, 2014) was used during optimization. +Otherwise, Tausworthe "taus88" generator was used. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{batch} :: \code{logical(1)}\cr -Blah +\code{TRUE}, if embedding coordinates were updated at the end of each epoch rather +than during the epoch. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{opt_args} :: named \code{list()}\cr -Blah +Optimizer parameters, used when \code{batch = TRUE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{num_precomputed_nns} :: \code{numeric(1)}\cr Blah \item \code{min_dist} :: \code{numeric(1)}\cr -Blah +The effective minimum distance between embedded points. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{spread} :: \code{numeric(1)}\cr -Blah +The effective scale of embedded points. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{binary_edge_weights} :: \code{logical(1)}\cr -Blah +If \code{TRUE} then edge weights in the input graph were treated as binary (0/1) rather than real valued. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{seed} :: \code{integer(1)}\cr Blah \item \code{nn_method} :: \code{any}\cr -Blah +Method for finding nearest neighbors. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_args} :: \code{list()}\cr Blah \item \code{n_neighbors} :: \code{numeric(1)}\cr -Blah +The size of the neighborhood used for manifold approximation. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_index} :: named \code{list()}\cr Blah \item \code{pca_models} :: \code{list()}\cr @@ -99,129 +106,130 @@ The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}} \itemize{ \item \code{n_neighbors} :: \code{integer(1)}\cr The size of the neighborhood used for manifold approximation. Default is \code{15}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_components} :: \code{integer(1)}\cr -The dimension of the space to embed into. Default is \code{2}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +The dimension of the space to embed into. Default is \code{2}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{metric} :: \code{character(1)}\cr Type of distance metric to use to find nearest neighbors. Default is \code{"euclidean"}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_epochs} :: \code{integer(1)}\cr Number of epochs to use during the optimization of the embedded coordinates. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{learning_rate} :: \code{numeric(1)}\cr Initial learning rate used in optimization of the coordinates. Default is \code{1}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{init} :: \code{character(1)} | \code{matrix}\cr Type of initialization for the coordinates. Default is \code{"spectral"}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{init_sdev} :: \code{character(1)} | \code{numeric(1)}\cr Scales each dimension of the initialized coordinates to this standard deviation. -Default is \code{"range"}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{"range"}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{spread} :: \code{numeric(1)}\cr -The effective scale of embedded points. Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +The effective scale of embedded points. Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{min_dist} :: \code{numeric(1)}\cr The effective minimum distance between embedded points. Default is \code{0.01}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{set_op_mix_ratio} :: \code{numeric(1)}\cr Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Default is \code{1}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{local_connectivity} :: \code{numeric(1)}\cr The local connectivity required – i.e. the number of nearest neighbors that should be -assumed to be connected at a local level. Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
+assumed to be connected at a local level. Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{bandwidth} :: \code{numeric(1)}\cr The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. -Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{repulsion_strength} :: \code{numeric(1)}\cr Weighting applied to negative samples in low dimensional embedding optimization. -Default is \code{1}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{negative_sample_rate} :: \code{numeric(1)}\cr The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample -in optimizing the low dimensional embedding. Default is \code{5}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +in optimizing the low dimensional embedding. Default is \code{5}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{a} :: \code{any}\cr -More specific parameters controlling the embedding. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +More specific parameters controlling the embedding. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{b} :: \code{any}\cr -More specific parameters controlling the embedding. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +More specific parameters controlling the embedding. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_method} :: \code{character(1)} | named \code{list()} | matrix\cr -Method for finding nearest neighbors. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Method for finding nearest neighbors. Default is \code{NULL}. 
For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_trees} :: \code{integer(1)}\cr Number of trees to build when constructing the nearest neighbor index. Default is \code{50}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{search_k} :: \code{integer(1)}\cr -Number of nodes to search during the neighbor retrieval. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Number of nodes to search during the neighbor retrieval. Only used if the \code{nn_method} is \code{"annoy"}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{approx_pow} :: \code{logical(1)}\cr If \code{TRUE}, use an approximation to the power function in the UMAP gradient. Default is \code{FALSE}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{y} :: \code{any}\cr Optional target data for supervised dimension reduction. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_n_neighbors} :: \code{integer(1)}\cr -Number of nearest neighbors to use to construct the target simplicial set. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Number of nearest neighbors to use to construct the target simplicial set. Default is \code{n_neighbors}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_metric} :: \code{character(1)}\cr The metric used to measure distance for \code{y} if using supervised dimension reduction. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_weight} :: \code{numeric(1)}\cr Weighting factor between data topology and target topology. Default is \code{0.5}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. 
+For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca} :: \code{integer(1)}\cr Redude data to this number of columns using PCA. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_center} :: \code{logical(1)}\cr If \code{TRUE}, center the columns of X before carrying out PCA. Default is \code{TRUE}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_rand} :: \code{logical(1)}\cr If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. -Default is \code{TRUE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{TRUE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{fast_sgd} :: \code{logical(1)}\cr If \code{TRUE}, then the following combination of parameters is set: \itemize{ \item \code{pcg_rand = TRUE} \item \code{n_sgd_threads = "auto"} \item \code{approx_pow = TRUE} -Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. } \item \code{n_threads} :: \code{integer(1)}\cr -Number of threads to use. Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Number of threads to use. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_sgd_threads} :: \code{integer(1)}\cr Number of threads to use during stochastic gradient descent. Default is \code{0}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{grain_size} :: \code{integer(1)}\cr The minimum amount of work to do on each thread. Default is \code{1}. 
-For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{verbose} :: \code{logical(1)}\cr -Should details be printed? Initialzed to \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Should details be printed? Initialzed to \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{batch} :: \code{logical(1)}\cr If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather -than during the epoch. Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +than during the epoch. Default is \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{opt_args} :: named \code{list()}\cr A list of optimizer parameters, used when \code{batch = TRUE}. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{epoch_callback} :: \code{function}\cr A function which will be invoked at the end of every epoch. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_method} :: \code{character(1)}\cr Method to carry out any PCA dimensionality reduction when the \code{pca} is specified. -Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{binary_edge_weights} :: \code{logical(1)}\cr -If TRUE then edge weights in the input graph are treated as binary (0/1) rather than real valued. -Default is \code{FALSE}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +If \code{TRUE} then edge weights in the input graph are treated as binary (0/1) rather than real valued. +Default is \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{dens_scale} :: \code{numeric(1)}\cr A scaling factor to apply to the density of the input data. Default is \code{NULL}. -For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{seed} :: \code{integer(1)}\cr Integer seed to use to initialize the random number generator state. -Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_args} :: named \code{list()}\cr A list containing additional arguments to pass to the nearest neighbor method. -Default is \code{NULL}. For details see \code{\link[uwot:umap2]{uwot::umap2()}}. +Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. } } \section{Internals}{ -Uses the \code{\link[uwot:umap]{umap()}} function. +Uses the \link[uwot:umap2]{umap2()} function. } \section{Methods}{ From 341fd1ea06feb60cbab598e4040d8a754c92c7d5 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 30 Jul 2024 21:04:02 +0200 Subject: [PATCH 14/36] docs: added seed state + feat: rm double metric --- R/PipeOpUMAP.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 947814fd3..73c5187cc 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -45,7 +45,7 @@ #' The number of negative edge/1-simplex samples used per positive edge/1-simplex sample #' in optimizing the low dimensional embedding. For details, see [uwot::umap2()]. #' * `method` :: `character(1)`\cr -#' Blah +#' General method used for dimensionality reduction, is always `"umap"` for this PipeOp. #' * `a` :: named `numeric(1)`\cr #' More specific parameters controlling the embedding. For details, see [uwot::umap2()]. #' * `b` :: named `numeric(1)`\cr @@ -76,7 +76,7 @@ #' If `TRUE` then edge weights in the input graph were treated as binary (0/1) rather than real valued. 
#' For details, see [uwot::umap2()]. #' * `seed` :: `integer(1)`\cr -#' Blah +#' Integer seed to use to initialize the random number generator state. For details, see [uwot::umap2()]. #' * `nn_method` :: `any`\cr #' Method for finding nearest neighbors. For details, see [uwot::umap2()]. #' * `nn_args` :: `list()`\cr @@ -241,7 +241,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", metric = p_fct( levels = c( "euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical", - "braycurtis", "canberra", "chebyshev", "dice", "hamming", "hellinger", "jaccard", + "braycurtis", "canberra", "chebyshev", "dice", "hellinger", "jaccard", "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" ), @@ -282,7 +282,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", search_k = p_int(tags = c("train", "umap")), approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), y = p_uty(default = NULL, tags = c("train", "umap")), - target_n_neighbors = p_int(tags = c("train", "umap")), # default = n_neighbors + target_n_neighbors = p_int(tags = c("train", "umap")), target_metric = p_fct(c("euclidean", "cosine", "correlation"), default = "euclidean", tags = c("train", "umap")), target_weight = p_dbl(0, 1, default = 0.5, tags = c("train", "umap")), pca = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), From b6b14372748e372afbe7bd81e15e4a3af2d74981 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 30 Jul 2024 21:52:56 +0200 Subject: [PATCH 15/36] docs: remaining state params + document --- R/PipeOpUMAP.R | 15 ++++++++------- man/mlr_pipeops_nmf.Rd | 2 +- man/mlr_pipeops_umap.Rd | 19 ++++++++++--------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index d72e1cea9..7cb8fa69c 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -29,8 +29,8 @@ #' These are in particular: #' * `embedding` :: `matrix`\cr #' Matrix of embedded coordinates. 
-#' * `scale_info` :: `any`\cr -#' Blah +#' * `scale_info` :: named `list()`\cr +#' If `scale`is `TRUE`, this gives the scaling attributes (`center`, `scale`, `nzvcols`) of the scaled data. #' * `search_k` :: `numeric(1)`\cr #' Number of nodes searched during the neighbor retrieval. Only used if the `nn_method` is `"annoy"`. #' For details, see [uwot::umap2()]. @@ -51,7 +51,8 @@ #' * `b` :: named `numeric(1)`\cr #' More specific parameters controlling the embedding. For details, see [uwot::umap2()]. #' * `gamma` :: `numeric(1)`\cr -#' Blah +#' Repulsion strength. Weighting applied to negative samples in low dimensional embedding optimization. +#' For details, see [uwot::umap2()]. #' * `approx_pow` :: `logical(1)`\cr #' If `TRUE`, use an approximation to the power function in the UMAP gradient. For details, see [uwot::umap2()]. #' * `metric` :: named `list()`\cr @@ -67,7 +68,7 @@ #' * `opt_args` :: named `list()`\cr #' Optimizer parameters, used when `batch = TRUE`. For details, see [uwot::umap2()]. #' * `num_precomputed_nns` :: `numeric(1)`\cr -#' Blah +#' Number of precomputed nearest neighbors, via `nn_method`. #' * `min_dist` :: `numeric(1)`\cr #' The effective minimum distance between embedded points. For details, see [uwot::umap2()]. #' * `spread` :: `numeric(1)`\cr @@ -80,13 +81,13 @@ #' * `nn_method` :: `any`\cr #' Method for finding nearest neighbors. For details, see [uwot::umap2()]. #' * `nn_args` :: `list()`\cr -#' Blah +#' A list containing additional arguments to pass to the nearest neighbor method. For details, see [uwot::umap2()]. #' * `n_neighbors` :: `numeric(1)`\cr #' The size of the neighborhood used for manifold approximation. For details, see [uwot::umap2()]. #' * `nn_index` :: named `list()`\cr -#' Blah +#' Nearest neighbor index that can be used for transformation of new data points. #' * `pca_models` :: `list()`\cr -#' Blah +#' Used PCA models for initialization, `pca` is specified. For details, see [uwot::umap2()]. 
#' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 1de3f5083..6e74a4313 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -96,7 +96,7 @@ See \code{\link[NMF:nmf]{nmf()}}. \section{Internals}{ -Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis-coef-methods]{basis()}}, \code{\link[NMF:basis-coef-methods]{coef()}} and +Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis]{basis()}}, \code{\link[NMF:coef]{coef()}} and \code{\link[MASS:ginv]{ginv()}}. } diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 1bf05c50a..c4c4c12dd 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -39,8 +39,8 @@ These are in particular: \itemize{ \item \code{embedding} :: \code{matrix}\cr Matrix of embedded coordinates. -\item \code{scale_info} :: \code{any}\cr -Blah +\item \code{scale_info} :: named \code{list()}\cr +If \code{scale}is \code{TRUE}, this gives the scaling attributes (\code{center}, \code{scale}, \code{nzvcols}) of the scaled data. \item \code{search_k} :: \code{numeric(1)}\cr Number of nodes searched during the neighbor retrieval. Only used if the \code{nn_method} is \code{"annoy"}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. @@ -55,13 +55,14 @@ Initial learning rate. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. The number of negative edge/1-simplex samples used per positive edge/1-simplex sample in optimizing the low dimensional embedding. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{method} :: \code{character(1)}\cr -Blah +General method used for dimensionality reduction, is always \code{"umap"} for this PipeOp. \item \code{a} :: named \code{numeric(1)}\cr More specific parameters controlling the embedding. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{b} :: named \code{numeric(1)}\cr More specific parameters controlling the embedding. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{gamma} :: \code{numeric(1)}\cr -Blah +Repulsion strength. Weighting applied to negative samples in low dimensional embedding optimization. +For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{approx_pow} :: \code{logical(1)}\cr If \code{TRUE}, use an approximation to the power function in the UMAP gradient. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{metric} :: named \code{list()}\cr @@ -77,7 +78,7 @@ than during the epoch. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{opt_args} :: named \code{list()}\cr Optimizer parameters, used when \code{batch = TRUE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{num_precomputed_nns} :: \code{numeric(1)}\cr -Blah +Number of precomputed nearest neighbors, via \code{nn_method}. \item \code{min_dist} :: \code{numeric(1)}\cr The effective minimum distance between embedded points. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{spread} :: \code{numeric(1)}\cr @@ -86,17 +87,17 @@ The effective scale of embedded points. For details, see \code{\link[uwot:umap2] If \code{TRUE} then edge weights in the input graph were treated as binary (0/1) rather than real valued. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{seed} :: \code{integer(1)}\cr -Blah +Integer seed to use to initialize the random number generator state. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_method} :: \code{any}\cr Method for finding nearest neighbors. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_args} :: \code{list()}\cr -Blah +A list containing additional arguments to pass to the nearest neighbor method. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{n_neighbors} :: \code{numeric(1)}\cr The size of the neighborhood used for manifold approximation. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_index} :: named \code{list()}\cr -Blah +Nearest neighbor index that can be used for transformation of new data points. \item \code{pca_models} :: \code{list()}\cr -Blah +Used PCA models for initialization, \code{pca} is specified. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. } } From 28adeed326c1171838f4935282a88fde895049fd Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 2 Aug 2024 15:07:18 +0200 Subject: [PATCH 16/36] feat: predict takes computing params that are not taken from model by umap_transform by default --- R/PipeOpUMAP.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 7cb8fa69c..6a2ecd2f1 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -290,8 +290,8 @@ PipeOpUMAP = R6Class("PipeOpUMAP", pca_center = p_lgl(default = TRUE, tags = c("train", "umap")), pca_rand = p_lgl(default = TRUE, tags = c("train", "umap")), fast_sgd = p_lgl(default = FALSE, tags = c("train", "umap")), - n_threads = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), - n_sgd_threads = p_int(0L, default = 0L, special_vals = list("auto"), tags = c("train", "umap")), + n_threads = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "predict", "umap")), + n_sgd_threads = p_int(0L, default = 0L, special_vals = list("auto"), tags = c("train", "predict", "umap")), grain_size = p_int(1L, default = 1L, tags = c("train", "umap")), verbose = p_lgl(default = TRUE, tags = c("train", "umap")), batch = p_lgl(default = FALSE, tags = c("train", "umap")), @@ -314,14 +314,15 @@ PipeOpUMAP = R6Class("PipeOpUMAP", ), private = list( .train_dt = function(dt, levels, target) { - params = insert_named(self$param_set$get_values(tags = "umap"), list(ret_model = TRUE)) + params = 
insert_named(self$param_set$get_values(tags = c("umap", "train")), list(ret_model = TRUE)) umap = invoke(uwot::umap2, dt, .args = params) self$state = umap umap$embedding }, .predict_dt = function(dt, levels) { - invoke(uwot::umap_transform, dt, self$state) + params = self$param_set$get_values(tags = c("umap", "predict")) + invoke(uwot::umap_transform, dt, self$state, .args = params) } ) ) From 114e33cd6093fa02d8e626bc46351858541e19d9 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 2 Aug 2024 15:10:20 +0200 Subject: [PATCH 17/36] feat: target metrics same as metrics --- R/PipeOpUMAP.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 6a2ecd2f1..bc57005eb 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -284,7 +284,16 @@ PipeOpUMAP = R6Class("PipeOpUMAP", approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), y = p_uty(default = NULL, tags = c("train", "umap")), target_n_neighbors = p_int(tags = c("train", "umap")), - target_metric = p_fct(c("euclidean", "cosine", "correlation"), default = "euclidean", tags = c("train", "umap")), + target_metric = p_fct( + levels = c( + "euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical", + "braycurtis", "canberra", "chebyshev", "dice", "hellinger", "jaccard", + "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", + "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" + ), + default = "euclidean", + tags = c("train", "umap") + ), target_weight = p_dbl(0, 1, default = 0.5, tags = c("train", "umap")), pca = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), pca_center = p_lgl(default = TRUE, tags = c("train", "umap")), From 5e60f6f57e554427f5a7f403bb29e10328cb80e4 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 2 Aug 2024 15:42:04 +0200 Subject: [PATCH 18/36] feat: custom check for param --- R/PipeOpUMAP.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 
1 deletion(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index bc57005eb..3932b58e3 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -282,7 +282,17 @@ PipeOpUMAP = R6Class("PipeOpUMAP", n_trees = p_int(10L, 100L, default = 50L, tags = c("train", "umap")), search_k = p_int(tags = c("train", "umap")), approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), - y = p_uty(default = NULL, tags = c("train", "umap")), + y = p_uty( + default = NULL, + tags = c("train", "umap"), + custom_check = crate(function(x) { + check_atomic_vector(x) %check||% + check_matrix(x) %check||% + check_data_frame(x) %check||% + check_list(x) %check||% + check_null(x) + }) + ), target_n_neighbors = p_int(tags = c("train", "umap")), target_metric = p_fct( levels = c( From 5f869876cda1c10b309994f55565c2eeea895cb1 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 15:52:28 +0200 Subject: [PATCH 19/36] feat: removed param options that are incompatible with predict + test stub --- R/PipeOpUMAP.R | 10 +++------- tests/testthat/test_pipeop_umap.R | 14 +++++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 3932b58e3..331adbfe1 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -6,7 +6,7 @@ #' #' @description #' Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP). -#' See [uwot::umap2()] For details,. +#' See [uwot::umap2()] for details. 
#' #' @section Construction: #' ``` @@ -241,7 +241,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", n_components = p_int(1L, 100L, default = 2L, tags = c("train", "umap")), metric = p_fct( levels = c( - "euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical", + "euclidean", "cosine", "manhattan", "hamming", "correlation", "braycurtis", "canberra", "chebyshev", "dice", "hellinger", "jaccard", "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" @@ -273,11 +273,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", nn_method = p_uty( default = NULL, tags = c("train", "umap"), - custom_check = crate(function(x) { - check_choice(x, c("fnn", "annoy", "hnsw", "nndescent"), null.ok = TRUE) %check||% - check_list(x, types = "matrix", len = 2L, names = "idx", "dist") %check||% - check_class(x, "dgCMatrix") - }) + custom_check = crate(function(x) check_choice(x, c("annoy", "hnsw", "nndescent"), null.ok = TRUE)) ), n_trees = p_int(10L, 100L, default = 50L, tags = c("train", "umap")), search_k = p_int(tags = c("train", "umap")), diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R index eef01e201..fe770c347 100644 --- a/tests/testthat/test_pipeop_umap.R +++ b/tests/testthat/test_pipeop_umap.R @@ -3,5 +3,17 @@ context("PipeOpUMAP") test_that("PipeOpUMAP - basic properties", { skip_if_not_installed("uwot") op = PipeOpUMAP$new() - expect_pipeop(op) + task = mlr_tasks$get("iris") + expect_datapreproc_pipeop_class(op, task = task) +}) + +test_that("PipeOpUMAP - Compare to uwot::umap2", { + skip_if_not_installed("uwot") + op = PipeOpUMAP$new() + task = mlr_tasks$get("iris") + + # Default parameters + + # Some changed parameters + }) From b8fa65f60ecf9717815cfeab56e6e4f900a60288 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 16:00:12 +0200 Subject: [PATCH 20/36] small docs change and change to target_metric for compatibility with predict --- 
R/PipeOpUMAP.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 331adbfe1..9857118a0 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -136,8 +136,9 @@ #' More specific parameters controlling the embedding. Default is `NULL`. For details, see [uwot::umap2()]. #' * `b` :: `any`\cr #' More specific parameters controlling the embedding. Default is `NULL`. For details, see [uwot::umap2()]. -#' * `nn_method` :: `character(1)` | named `list()` | matrix\cr -#' Method for finding nearest neighbors. Default is `NULL`. For details, see [uwot::umap2()]. +#' * `nn_method` :: `character(1)`\cr +#' Method for finding nearest neighbors. Note that only values compatible with [uwot::umap_transform()] are allowed. +#' Default is `NULL`. For details, see [uwot::umap2()]. #' * `n_trees` :: `integer(1)`\cr #' Number of trees to build when constructing the nearest neighbor index. Default is `50`. #' For details, see [uwot::umap2()]. @@ -292,7 +293,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", target_n_neighbors = p_int(tags = c("train", "umap")), target_metric = p_fct( levels = c( - "euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical", + "euclidean", "cosine", "manhattan", "hamming", "correlation", "braycurtis", "canberra", "chebyshev", "dice", "hellinger", "jaccard", "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" From 7881990870d39b6aaaa5b3dd26197f49bf046431 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 20:14:03 +0200 Subject: [PATCH 21/36] docs: param scale + small corr for scale special vals --- R/PipeOpUMAP.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 9857118a0..6d0836e06 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -105,6 +105,8 @@ #' * `learning_rate` :: `numeric(1)`\cr #' Initial learning rate used in optimization of the 
coordinates. Default is `1`. #' For details, see [uwot::umap2()]. +#' * `scale` :: `logical(1)` / `character(1)`\cr +#' Scaling to apply to the data. If `TRUE`, data is standardized. Default is `FALSE`. For details, see [uwot::umap2()]. #' * `init` :: `character(1)` | `matrix`\cr #' Type of initialization for the coordinates. Default is `"spectral"`. #' For details, see [uwot::umap2()]. @@ -252,7 +254,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", ), n_epochs = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), learning_rate = p_dbl(0, default = 1, tags = c("train", "umap")), - scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), + scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "scale", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), init = p_uty( default = "spectral", tags = c("train", "umap"), From 79c7dbc1e98bb7a1840220dd1fa354c1ab14cc07 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 20:57:04 +0200 Subject: [PATCH 22/36] added tests --- tests/testthat/test_pipeop_umap.R | 73 ++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R index fe770c347..0a389f88b 100644 --- a/tests/testthat/test_pipeop_umap.R +++ b/tests/testthat/test_pipeop_umap.R @@ -3,17 +3,78 @@ context("PipeOpUMAP") test_that("PipeOpUMAP - basic properties", { skip_if_not_installed("uwot") op = PipeOpUMAP$new() - task = mlr_tasks$get("iris") - expect_datapreproc_pipeop_class(op, task = task) + task = mlr_tasks$get("iris")$filter(1:30) + + expect_pipeop(op) + + expect_task(op$train(list(task))[[1]]) + expect_task(op$predict(list(task))[[1]]) + }) -test_that("PipeOpUMAP - Compare to uwot::umap2", { +test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default Params", { skip_if_not_installed("uwot") + task = 
mlr_tasks$get("iris")$filter(1:30) + op = PipeOpUMAP$new() - task = mlr_tasks$get("iris") + pv = list(seed = 1234L) + op$param_set$set_values(.values = pv) + + train_out = train_pipeop(op, list(task))[[1L]] + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], ret_model = TRUE, .args = pv) + + state_names = c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", + "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", + "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") + expect_true(all(state_names %in% names(op$state))) + state_names_wo_pointers = setdiff(state_names, "nn_index") # since pointers in element 1 will not be equal + expect_identical(op$state[state_names_wo_pointers], umap_out[state_names_wo_pointers]) + expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) + + predict_out = predict_pipeop(op, list(task))[[1L]] + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) + +}) + + +test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed Params", { + skip_if_not_installed("uwot") + task = mlr_tasks$get("iris")$filter(1:30) + + op = PipeOpUMAP$new() + pv = list( + seed = 1234L, + nn_method = "annoy", + n_neighbors = 10L, + metric = "correlation", + n_epochs = 100L, + learning_rate = 0.5, + scale = FALSE, + init = "pca", + init_sdev = 1e-4, + set_op_mix_ratio = 0.5, + local_connectivity = 1.1, + bandwidth = 0.9, + repulsion_strength = 1.1, + negative_sample_rate = 6, + y = task$data()[, 1] + ) + op$param_set$set_values(.values = pv) + + train_out = train_pipeop(op, list(task))[[1L]] + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], ret_model = TRUE, .args = pv) - # Default parameters + state_names = 
c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", + "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", + "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") + expect_true(all(state_names %in% names(op$state))) + state_names = setdiff(state_names, "nn_index") # since pointers in state$nn_index$element1 will not be equal + expect_identical(op$state[state_names], umap_out[state_names]) + expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) - # Some changed parameters + predict_out = predict_pipeop(op, list(task))[[1L]] + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) }) From 9d29051c688934eab51be5687fa87ea498e58922 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 21:04:32 +0200 Subject: [PATCH 23/36] docs: run document --- man/mlr_pipeops_umap.Rd | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index c4c4c12dd..d5140a246 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -9,7 +9,7 @@ } \description{ Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP). -See \code{\link[uwot:umap2]{uwot::umap2()}} For details,. +See \code{\link[uwot:umap2]{uwot::umap2()}} for details. } \section{Construction}{ @@ -119,6 +119,8 @@ For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{learning_rate} :: \code{numeric(1)}\cr Initial learning rate used in optimization of the coordinates. Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +\item \code{scale} :: \code{logical(1)} / \code{character(1)}\cr +Scaling to apply to the data. 
If \code{TRUE}, data is standardized. Default is \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{init} :: \code{character(1)} | \code{matrix}\cr Type of initialization for the coordinates. Default is \code{"spectral"}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. @@ -150,8 +152,9 @@ in optimizing the low dimensional embedding. Default is \code{5}. For details, s More specific parameters controlling the embedding. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{b} :: \code{any}\cr More specific parameters controlling the embedding. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{nn_method} :: \code{character(1)} | named \code{list()} | matrix\cr -Method for finding nearest neighbors. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +\item \code{nn_method} :: \code{character(1)}\cr +Method for finding nearest neighbors. Note that only values compatible with \code{\link[uwot:umap_transform]{uwot::umap_transform()}} are allowed. +Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{n_trees} :: \code{integer(1)}\cr Number of trees to build when constructing the nearest neighbor index. Default is \code{50}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
From 8545c79485bab327262575b158811846b66bc7c5 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 21:19:31 +0200 Subject: [PATCH 24/36] updated NEWS.md --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index fc42cae16..4f583d8c2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # mlr3pipelines 0.6.1 * New PipeOp `PipeOpRowApply` / `po("rowapply")` +* New PipeOp `PipeOpUMAP` / `po("umap")` # mlr3pipelines 0.6.0 From 97a616afc0c0cd60a709ab5425cd8bb74227ed38 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 22:11:19 +0200 Subject: [PATCH 25/36] added packages to initialize --- R/PipeOpUMAP.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 6d0836e06..67b6cba16 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -327,7 +327,7 @@ PipeOpUMAP = R6Class("PipeOpUMAP", ) ps$set_values(verbose = FALSE) - super$initialize(id, param_set = ps, param_vals = param_vals, feature_types = c("numeric", "integer")) + super$initialize(id, param_set = ps, param_vals = param_vals, packages = "uwot", feature_types = c("numeric", "integer")) } ), private = list( From ccdb4ae643e97d3675fab833a27405474103a78f Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 23:08:35 +0200 Subject: [PATCH 26/36] docs: added ref to paper --- R/PipeOpUMAP.R | 2 ++ R/bibentries.R | 13 +++++++++++++ man/mlr_pipeops_umap.Rd | 7 +++++++ 3 files changed, 22 insertions(+) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 67b6cba16..b609dea07 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -231,6 +231,8 @@ #' #' pop$state #' \dontshow{ \} } +#' @references +#' `r format_bib("mcinnes_2018")` #' @family PipeOps #' @template seealso_pipeopslist #' @include PipeOpTaskPreproc.R diff --git a/R/bibentries.R b/R/bibentries.R index de55741d5..a78b6103a 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -52,5 +52,18 @@ bibentries = c( author = "Yujun Wu and Dennis D Boos and Leonard A 
Stefanski", title = "Controlling Variable Selection by the Addition of Pseudovariables", journal = "Journal of the American Statistical Association" + ), + + mcinnes_2018 = bibentry("article", + doi = "10.21105/joss.00861", + year = "2018", + month = "9", + publisher = "The Open Journal", + volume = "3", + number = "29", + author = "Leland McInnes and John Healy and James Melville and Lukas Großberger", + title = "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction", + journal = "Journal of Open Source Software" ) ) + diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index d5140a246..d57694cbb 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -254,6 +254,12 @@ pop$train(list(task))[[1]]$data() pop$state \dontshow{ \} } } +\references{ +McInnes L, Healy J, Melville J, Großberger L (2018). +\dQuote{UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.} +\emph{Journal of Open Source Software}, \bold{3}(29). +\doi{10.21105/joss.00861}. 
+} \seealso{ https://mlr-org.com/pipeops.html @@ -312,6 +318,7 @@ Other PipeOps: \code{\link{mlr_pipeops_removeconstants}}, \code{\link{mlr_pipeops_renamecolumns}}, \code{\link{mlr_pipeops_replicate}}, +\code{\link{mlr_pipeops_rowapply}}, \code{\link{mlr_pipeops_scale}}, \code{\link{mlr_pipeops_scalemaxabs}}, \code{\link{mlr_pipeops_scalerange}}, From 56ba86160873733eaa1392cb9b145c3900af43f8 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 6 Aug 2024 23:19:57 +0200 Subject: [PATCH 27/36] docs: fixed non-ASCII char + document --- R/bibentries.R | 2 +- man/mlr_pipeops_umap.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/bibentries.R b/R/bibentries.R index a78b6103a..09c994452 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -61,7 +61,7 @@ bibentries = c( publisher = "The Open Journal", volume = "3", number = "29", - author = "Leland McInnes and John Healy and James Melville and Lukas Großberger", + author = "Leland McInnes and John Healy and James Melville and Lukas Grossberger", title = "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction", journal = "Journal of Open Source Software" ) diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index d57694cbb..28d0f05c9 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -255,7 +255,7 @@ pop$state \dontshow{ \} } } \references{ -McInnes L, Healy J, Melville J, Großberger L (2018). +McInnes L, Healy J, Melville J, Grossberger L (2018). \dQuote{UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.} \emph{Journal of Open Source Software}, \bold{3}(29). \doi{10.21105/joss.00861}. 
From 903c4fc52276ef892cd49f56ef81a597e17847e1 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Wed, 7 Aug 2024 21:20:24 +0200 Subject: [PATCH 28/36] fix test failures due to crate --- R/PipeOpUMAP.R | 14 ++++++++------ man/mlr_pipeops_umap.Rd | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index b609dea07..9bd224fa1 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -219,6 +219,9 @@ #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. #' +#' @references +#' `r format_bib("mcinnes_2018")` +#' #' @examples #' \dontshow{ if (requireNamespace("uwot")) \{ } #' library("mlr3") @@ -231,8 +234,7 @@ #' #' pop$state #' \dontshow{ \} } -#' @references -#' `r format_bib("mcinnes_2018")` +#' #' @family PipeOps #' @template seealso_pipeopslist #' @include PipeOpTaskPreproc.R @@ -260,10 +262,10 @@ PipeOpUMAP = R6Class("PipeOpUMAP", init = p_uty( default = "spectral", tags = c("train", "umap"), - custom_check = crate(function(x) { + custom_check = function(x) { choices = c("spectral", "normlaplacian", "random", "lvrandom", "laplacian", "pca", "spca", "agspectral") check_choice(x, choices) %check||% check_matrix(x) - }) + } ), init_sdev = p_uty(default = "range", tags = c("train", "umap")), spread = p_dbl(default = 1, tags = c("train", "umap")), @@ -286,13 +288,13 @@ PipeOpUMAP = R6Class("PipeOpUMAP", y = p_uty( default = NULL, tags = c("train", "umap"), - custom_check = crate(function(x) { + custom_check = function(x) { check_atomic_vector(x) %check||% check_matrix(x) %check||% check_data_frame(x) %check||% check_list(x) %check||% check_null(x) - }) + } ), target_n_neighbors = p_int(tags = c("train", "umap")), target_metric = p_fct( diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 28d0f05c9..ea177bebb 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -253,6 +253,7 @@ pop$train(list(task))[[1]]$data() pop$state \dontshow{ \} } + } \references{ McInnes L, 
Healy J, Melville J, Grossberger L (2018). From 8be8cdddb3c4fc63b416a1b397dfdeb4ac004281 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Wed, 7 Aug 2024 21:32:55 +0200 Subject: [PATCH 29/36] docs: document + missing changes in master/man --- R/PipeOpUMAP.R | 1 - man/mlr_pipeops_nmf.Rd | 2 +- man/mlr_pipeops_rowapply.Rd | 6 +++--- man/mlr_pipeops_umap.Rd | 1 - 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 9bd224fa1..c0cf6c190 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -234,7 +234,6 @@ #' #' pop$state #' \dontshow{ \} } -#' #' @family PipeOps #' @template seealso_pipeopslist #' @include PipeOpTaskPreproc.R diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 36a726258..ce74b22e5 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -96,7 +96,7 @@ See \code{\link[NMF:nmf]{nmf()}}. \section{Internals}{ -Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis]{basis()}}, \code{\link[NMF:coef]{coef()}} and +Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis-coef-methods]{basis()}}, \code{\link[NMF:basis-coef-methods]{coef()}} and \code{\link[MASS:ginv]{ginv()}}. } diff --git a/man/mlr_pipeops_rowapply.Rd b/man/mlr_pipeops_rowapply.Rd index 85e0ac30e..e57437129 100644 --- a/man/mlr_pipeops_rowapply.Rd +++ b/man/mlr_pipeops_rowapply.Rd @@ -46,14 +46,13 @@ Function to apply to each row in the affected columns of the task. The return value should be a vector of the same length for every input. Initialized as \code{\link[base:identity]{identity()}}. \item \code{col_prefix} :: \code{character(1)}\cr -If specified, prefix to be prepended to the column names of affected columns, separated by a dot (\code{.}). Default is \code{""}. +If specified, prefix to be prepended to the column names of affected columns, separated by a dot (\code{.}). Initialized as \code{""}. 
} } \section{Internals}{ -Calls \code{\link{apply}} on the data, using the value of \code{applicator} as \code{FUN} and \code{simplify = TRUE}, then coerces the output via -\code{\link[data.table:as.data.table]{as.data.table()}}. +Calls \code{\link{apply}} on the data, using the value of \code{applicator} as \code{FUN}. } \section{Fields}{ @@ -144,6 +143,7 @@ Other PipeOps: \code{\link{mlr_pipeops_textvectorizer}}, \code{\link{mlr_pipeops_threshold}}, \code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_umap}}, \code{\link{mlr_pipeops_unbranch}}, \code{\link{mlr_pipeops_updatetarget}}, \code{\link{mlr_pipeops_vtreat}}, diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index ea177bebb..28d0f05c9 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -253,7 +253,6 @@ pop$train(list(task))[[1]]$data() pop$state \dontshow{ \} } - } \references{ McInnes L, Healy J, Melville J, Grossberger L (2018). From daf28df2a0dc2ab24dcea4f2d6ccc9e92017e5d1 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Thu, 8 Aug 2024 10:08:11 +0200 Subject: [PATCH 30/36] fix: crate workaround --- R/PipeOpUMAP.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index c0cf6c190..018098775 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -261,10 +261,10 @@ PipeOpUMAP = R6Class("PipeOpUMAP", init = p_uty( default = "spectral", tags = c("train", "umap"), - custom_check = function(x) { + custom_check = crate(function(x) { choices = c("spectral", "normlaplacian", "random", "lvrandom", "laplacian", "pca", "spca", "agspectral") check_choice(x, choices) %check||% check_matrix(x) - } + }, .parent = topenv()) ), init_sdev = p_uty(default = "range", tags = c("train", "umap")), spread = p_dbl(default = 1, tags = c("train", "umap")), @@ -287,13 +287,13 @@ PipeOpUMAP = R6Class("PipeOpUMAP", y = p_uty( default = NULL, tags = c("train", "umap"), - custom_check = function(x) { + custom_check = crate(function(x) { 
check_atomic_vector(x) %check||% check_matrix(x) %check||% check_data_frame(x) %check||% check_list(x) %check||% check_null(x) - } + }, .parent = topenv()) ), target_n_neighbors = p_int(tags = c("train", "umap")), target_metric = p_fct( From 394eb701a05b43f226cf43e4d7639663fe9f43a9 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Wed, 14 Aug 2024 17:00:01 +0200 Subject: [PATCH 31/36] Added depends to params + document --- R/PipeOpUMAP.R | 64 ++++++++++++++++++++++++++++------------- man/mlr_pipeops_umap.Rd | 2 +- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 018098775..99f04905b 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -168,7 +168,7 @@ #' * `pca_center` :: `logical(1)`\cr #' If `TRUE`, center the columns of X before carrying out PCA. Default is `TRUE`. #' For details, see [uwot::umap2()]. -#' * `pca_rand` :: `logical(1)`\cr +#' * `pcg_rand` :: `logical(1)`\cr #' If `TRUE`, use the PCG random number generator (O'Neill, 2014) during optimization. #' Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. #' Default is `TRUE`. For details, see [uwot::umap2()]. 
@@ -243,8 +243,8 @@ PipeOpUMAP = R6Class("PipeOpUMAP", public = list( initialize = function(id = "umap", param_vals = list()) { ps = ps( - n_neighbors = p_int(2L, 100L, default = 15L, tags = c("train", "umap")), - n_components = p_int(1L, 100L, default = 2L, tags = c("train", "umap")), + n_neighbors = p_int(lower = 1L, default = 15L, tags = c("train", "umap")), + n_components = p_int(lower = 1L, default = 2L, tags = c("train", "umap")), metric = p_fct( levels = c( "euclidean", "cosine", "manhattan", "hamming", "correlation", @@ -255,8 +255,8 @@ PipeOpUMAP = R6Class("PipeOpUMAP", default = "euclidean", tags = c("train", "umap") ), - n_epochs = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), - learning_rate = p_dbl(0, default = 1, tags = c("train", "umap")), + n_epochs = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + learning_rate = p_dbl(lower = 0, default = 1, tags = c("train", "umap")), scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "scale", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), init = p_uty( default = "spectral", @@ -266,11 +266,11 @@ PipeOpUMAP = R6Class("PipeOpUMAP", check_choice(x, choices) %check||% check_matrix(x) }, .parent = topenv()) ), - init_sdev = p_uty(default = "range", tags = c("train", "umap")), + init_sdev = p_dbl(default = "range", special_vals = list("range"), tags = c("train", "umap")), spread = p_dbl(default = 1, tags = c("train", "umap")), min_dist = p_dbl(default = 0.01, tags = c("train", "umap")), - set_op_mix_ratio = p_dbl(0, 1, default = 1, tags = c("train", "umap")), - local_connectivity = p_dbl(1, default = 1, tags = c("train", "umap")), + set_op_mix_ratio = p_dbl(lower = 0, upper = 1, default = 1, tags = c("train", "umap")), + local_connectivity = p_dbl(lower = 1, default = 1, tags = c("train", "umap")), bandwidth = p_dbl(default = 1, tags = c("train", "umap")), repulsion_strength = p_dbl(default = 1, tags = 
c("train", "umap")), negative_sample_rate = p_dbl(default = 5, tags = c("train", "umap")), @@ -281,8 +281,9 @@ PipeOpUMAP = R6Class("PipeOpUMAP", tags = c("train", "umap"), custom_check = crate(function(x) check_choice(x, c("annoy", "hnsw", "nndescent"), null.ok = TRUE)) ), - n_trees = p_int(10L, 100L, default = 50L, tags = c("train", "umap")), - search_k = p_int(tags = c("train", "umap")), + n_trees = p_int(lower = 1L, default = 50L, tags = c("train", "umap"), depends = quote(nn_method == "annoy")), + search_k = p_int(tags = c("train", "umap"), depends = quote(nn_method == "annoy")), + # approx_pow is only used if dens_scale is non-NULL approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), y = p_uty( default = NULL, @@ -291,11 +292,13 @@ PipeOpUMAP = R6Class("PipeOpUMAP", check_atomic_vector(x) %check||% check_matrix(x) %check||% check_data_frame(x) %check||% - check_list(x) %check||% + check_list(x, len = 2, names = "unique") %check||% check_null(x) }, .parent = topenv()) ), + # target_n_neighbors is only used if y is non-NULL and numeric target_n_neighbors = p_int(tags = c("train", "umap")), + # target_metric is only used if y is non-NULL and numeric target_metric = p_fct( levels = c( "euclidean", "cosine", "manhattan", "hamming", "correlation", @@ -306,27 +309,48 @@ PipeOpUMAP = R6Class("PipeOpUMAP", default = "euclidean", tags = c("train", "umap") ), - target_weight = p_dbl(0, 1, default = 0.5, tags = c("train", "umap")), - pca = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + # target_weight is only used if y is non-NULL + target_weight = p_dbl(lower = 0, upper = 1, default = 0.5, tags = c("train", "umap")), + # pca is ignored if metric is "hamming" + pca = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap"), + depends = quote(metric %in% c( + "euclidean", "cosine", "manhattan", "correlation", + "braycurtis", "canberra", "chebyshev", "dice", "hellinger", "jaccard", + 
"jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", + "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" + ))), + # pca_center might only be used if pca is specified (documentation unclear) pca_center = p_lgl(default = TRUE, tags = c("train", "umap")), - pca_rand = p_lgl(default = TRUE, tags = c("train", "umap")), + pcg_rand = p_lgl(default = TRUE, tags = c("train", "umap")), fast_sgd = p_lgl(default = FALSE, tags = c("train", "umap")), - n_threads = p_int(1L, default = NULL, special_vals = list(NULL), tags = c("train", "predict", "umap")), - n_sgd_threads = p_int(0L, default = 0L, special_vals = list("auto"), tags = c("train", "predict", "umap")), - grain_size = p_int(1L, default = 1L, tags = c("train", "umap")), + n_threads = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("train", "predict", "umap")), + n_sgd_threads = p_int(lower = 0L, default = 0L, special_vals = list("auto"), tags = c("train", "predict", "umap")), + grain_size = p_int(lower = 1L, default = 1L, tags = c("train", "umap")), verbose = p_lgl(default = TRUE, tags = c("train", "umap")), batch = p_lgl(default = FALSE, tags = c("train", "umap")), - opt_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = crate(function(x) check_list(x, null.ok = TRUE))), + opt_args = p_uty( + default = NULL, + tags = c("train", "umap"), + custom_check = crate(function(x) check_list(x, types = c("numeric", "character"), min.len = 1, max.len = 5, + names = "unique", null.ok = TRUE)), + depends = quote(batch == TRUE) + ), epoch_callback = p_uty( default = NULL, tags = c("train", "umap"), custom_check = crate(function(x) check_function(x, args = c("epochs", "n_epochs", "coords"), null.ok = TRUE)) ), + # pca_method is only used if pca is specified pca_method = p_fct(c("irlba", "rsvd", "bigstatsr", "svd", "auto"), default = NULL, special_vals = list(NULL), tags = c("train", "umap")), binary_edge_weights = p_lgl(default = FALSE, tags = c("train", 
"umap")), - dens_scale = p_dbl(0, 1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + dens_scale = p_dbl(lower = 0, upper = 1, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), seed = p_int(default = NULL, special_vals = list(NULL), tags = c("train", "umap")), - nn_args = p_uty(default = NULL, tags = c("train", "umap"), custom_check = crate(function(x) check_list(x, null.ok = TRUE))) + nn_args = p_uty( + default = NULL, + tags = c("train", "umap"), + custom_check = crate(function(x) check_list(x, types = c("integer", "numeric", "character"), + min.len = 1, max.len = 8, names = "unique", null.ok = TRUE)) + ) ) ps$set_values(verbose = FALSE) diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 28d0f05c9..69ebd3124 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -182,7 +182,7 @@ For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_center} :: \code{logical(1)}\cr If \code{TRUE}, center the columns of X before carrying out PCA. Default is \code{TRUE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{pca_rand} :: \code{logical(1)}\cr +\item \code{pcg_rand} :: \code{logical(1)}\cr If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. Default is \code{TRUE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
From 53ae9a7adac1b2bbb8923bf27a4e6b7493e7f467 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Wed, 14 Aug 2024 17:49:00 +0200 Subject: [PATCH 32/36] Added preproc tests + updated DESRIPTION --- DESCRIPTION | 3 +++ tests/testthat/test_pipeop_umap.R | 27 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8357e3b26..52c168942 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -89,6 +89,9 @@ Suggests: future, htmlwidgets, uwot (>= 0.2.1), + RcppAnnoy, + RcppHNSW, + rnndescent, ranger ByteCompile: true Encoding: UTF-8 diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R index 0a389f88b..f1883589e 100644 --- a/tests/testthat/test_pipeop_umap.R +++ b/tests/testthat/test_pipeop_umap.R @@ -2,17 +2,23 @@ context("PipeOpUMAP") test_that("PipeOpUMAP - basic properties", { skip_if_not_installed("uwot") - op = PipeOpUMAP$new() - task = mlr_tasks$get("iris")$filter(1:30) + skip_if_not_installed("RcppAnnoy") + skip_if_not_installed("RcppHNSW") + skip_if_not_installed("rnndescent") - expect_pipeop(op) + task = mlr_tasks$get("iris")$filter(1:30) - expect_task(op$train(list(task))[[1]]) - expect_task(op$predict(list(task))[[1]]) + # Test for different nn_methods since they are relying on different packages and deep clone is impleneted differently + expect_datapreproc_pipeop_class(PipeOpUMAP, constargs = list(param_vals = list(nn_method = "annoy")), + deterministic_train = FALSE, deterministic_predict = FALSE, task = task) + expect_datapreproc_pipeop_class(PipeOpUMAP, constargs = list(param_vals = list(nn_method = "hnsw")), + deterministic_train = FALSE, deterministic_predict = FALSE, task = task) + expect_datapreproc_pipeop_class(PipeOpUMAP, constargs = list(param_vals = list(nn_method = "nndescent")), + deterministic_train = FALSE, deterministic_predict = FALSE, task = task) }) -test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default Params", { 
+test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default Params, nn_method = annoy", { skip_if_not_installed("uwot") task = mlr_tasks$get("iris")$filter(1:30) @@ -27,7 +33,7 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") expect_true(all(state_names %in% names(op$state))) - state_names_wo_pointers = setdiff(state_names, "nn_index") # since pointers in element 1 will not be equal + state_names_wo_pointers = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal expect_identical(op$state[state_names_wo_pointers], umap_out[state_names_wo_pointers]) expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) @@ -38,7 +44,7 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default }) -test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed Params", { +test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed Params, nn_method = annoy", { skip_if_not_installed("uwot") task = mlr_tasks$get("iris")$filter(1:30) @@ -69,7 +75,7 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") expect_true(all(state_names %in% names(op$state))) - state_names = setdiff(state_names, "nn_index") # since pointers in state$nn_index$element1 will not be equal + state_names = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal expect_identical(op$state[state_names], umap_out[state_names]) 
expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) @@ -78,3 +84,6 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) }) + +# weitere tests für nn_methods +# for these use options that are specific to that method From 2eef42dc417b102ba9d8058c48c599f96ca8bc55 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Wed, 14 Aug 2024 17:49:33 +0200 Subject: [PATCH 33/36] Add deep_clone, currently WIP --- R/PipeOpUMAP.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index 99f04905b..ff945b913 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -368,6 +368,26 @@ PipeOpUMAP = R6Class("PipeOpUMAP", .predict_dt = function(dt, levels) { params = self$param_set$get_values(tags = c("umap", "predict")) invoke(uwot::umap_transform, dt, self$state, .args = params) + }, + + # We need to overload deep_clone since state$nn_index$ann is a C++ address if nn_method is "annoy" or "hnsw" + deep_clone = function(name, value) { + if (name == "state" && "NO_OP" %nin% class(value)) { + # TODO: Make sure these class names are correct for different options for nn_args + # attr(attr(value$nn_index, "class"), "package") might work otherwise + if (class(value$nn_index$ann) %in% c("RcppHNSWL2", "Rcpp_AnnoyEuclidean")) { + state = value + state$nn_index$ann = value$nn_index$ann$copy() + state$nn_index$type = value$nn_index$type + state$nn_index$metric = value$nn_index$metric + state$nn_index$ndim = value$nn_index$ndim + state + } else { + super$deep_clone(name, value) + } + } else { + super$deep_clone(name, value) + } } ) ) From d854eaefd805c6e57efd4496b5f8285f07da79a7 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 20 Aug 2024 19:11:35 +0200 Subject: [PATCH 34/36] added tests for different nn_methods --- tests/testthat/test_pipeop_umap.R | 93 ++++++++++++++++++++++++++++++- 1 file changed, 90 
insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R index f1883589e..94d766365 100644 --- a/tests/testthat/test_pipeop_umap.R +++ b/tests/testthat/test_pipeop_umap.R @@ -8,7 +8,7 @@ test_that("PipeOpUMAP - basic properties", { task = mlr_tasks$get("iris")$filter(1:30) - # Test for different nn_methods since they are relying on different packages and deep clone is impleneted differently + # Test for different nn_methods since they are relying on different packages and deep clone is implemented differently expect_datapreproc_pipeop_class(PipeOpUMAP, constargs = list(param_vals = list(nn_method = "annoy")), deterministic_train = FALSE, deterministic_predict = FALSE, task = task) expect_datapreproc_pipeop_class(PipeOpUMAP, constargs = list(param_vals = list(nn_method = "hnsw")), @@ -46,6 +46,7 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed Params, nn_method = annoy", { skip_if_not_installed("uwot") + skip_if_not_installed("RcppAnnoy") task = mlr_tasks$get("iris")$filter(1:30) op = PipeOpUMAP$new() @@ -85,5 +86,91 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed }) -# weitere tests für nn_methods -# for these use options that are specific to that method + +test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed Params, nn_method = hnsw", { + skip_if_not_installed("uwot") + skip_if_not_installed("RcppHNSW") + task = mlr_tasks$get("iris")$filter(1:30) + + op = PipeOpUMAP$new() + pv = list( + seed = 1234L, + nn_method = "hnsw", + n_neighbors = 10L, + metric = "correlation", + n_epochs = 100L, + learning_rate = 0.5, + scale = FALSE, + init = "pca", + init_sdev = 1e-4, + set_op_mix_ratio = 0.5, + local_connectivity = 1.1, + bandwidth = 0.9, + repulsion_strength = 1.1, + negative_sample_rate = 6, + y = task$data()[, 1], + nn_args = list(M = 
10L, ef_construction = 100L, ef = 20L) + ) + op$param_set$set_values(.values = pv) + + train_out = train_pipeop(op, list(task))[[1L]] + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], ret_model = TRUE, .args = pv) + + state_names = c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", + "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", + "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") + expect_true(all(state_names %in% names(op$state))) + state_names = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal + expect_identical(op$state[state_names], umap_out[state_names]) + expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) + + predict_out = predict_pipeop(op, list(task))[[1L]] + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) + +}) + + +test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed Params, nn_method = rnndescent", { + skip_if_not_installed("uwot") + skip_if_not_installed("rnndescent") + task = mlr_tasks$get("iris")$filter(1:30) + + op = PipeOpUMAP$new() + pv = list( + seed = 1234L, + nn_method = "nndescent", + n_neighbors = 10L, + metric = "symmetrickl", + n_epochs = 100L, + learning_rate = 0.5, + scale = FALSE, + init = "pca", + init_sdev = 1e-4, + set_op_mix_ratio = 0.5, + local_connectivity = 1.1, + bandwidth = 0.9, + repulsion_strength = 1.1, + negative_sample_rate = 6, + y = task$data()[, 1], + nn_args = list(n_trees = 15L, max_candidates = 15L, pruning_degree_multiplier = 1.4, epsilon = 0.05) + ) + op$param_set$set_values(.values = pv) + + train_out = train_pipeop(op, list(task))[[1L]] + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], 
ret_model = TRUE, .args = pv) + + state_names = c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", + "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", + "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") + expect_true(all(state_names %in% names(op$state))) + + state_names = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal + expect_identical(op$state[state_names], umap_out[state_names]) + expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) + + predict_out = predict_pipeop(op, list(task))[[1L]] + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) + +}) From e1b113eeb67cdbab8d17c9a492c3ebd676fec706 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 23 Aug 2024 16:49:43 +0200 Subject: [PATCH 35/36] feat: params for transform + better param defs + docs --- R/PipeOpUMAP.R | 175 +++++++++++++++++++++++++--------------- man/mlr_pipeops_umap.Rd | 58 +++++++++---- 2 files changed, 151 insertions(+), 82 deletions(-) diff --git a/R/PipeOpUMAP.R b/R/PipeOpUMAP.R index ff945b913..49798bbb2 100644 --- a/R/PipeOpUMAP.R +++ b/R/PipeOpUMAP.R @@ -107,9 +107,11 @@ #' For details, see [uwot::umap2()]. #' * `scale` :: `logical(1)` / `character(1)`\cr #' Scaling to apply to the data. If `TRUE`, data is standardized. Default is `FALSE`. For details, see [uwot::umap2()]. -#' * `init` :: `character(1)` | `matrix`\cr -#' Type of initialization for the coordinates. Default is `"spectral"`. -#' For details, see [uwot::umap2()]. +#' * `init` :: `character(1)`\cr +#' Type of initialization for the coordinates. 
May be set to `"custom"`, in which case the `matrix` of initial +#' coordinates passed to `init_custom` is used. Default is `"spectral"`. For details, see [uwot::umap2()]. +#' * `init_custom` :: `matrix`\cr +#' Matrix of initial coordinates. Only used, if `init` is `"custom"`. #' * `init_sdev` :: `character(1)` | `numeric(1)`\cr #' Scales each dimension of the initialized coordinates to this standard deviation. #' Default is `"range"`. For details, see [uwot::umap2()]. @@ -134,9 +136,9 @@ #' * `negative_sample_rate` :: `numeric(1)`\cr #' The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample #' in optimizing the low dimensional embedding. Default is `5`. For details, see [uwot::umap2()]. -#' * `a` :: `any`\cr +#' * `a` :: `numeric(1)`\cr #' More specific parameters controlling the embedding. Default is `NULL`. For details, see [uwot::umap2()]. -#' * `b` :: `any`\cr +#' * `b` :: `numeric(1)`\cr #' More specific parameters controlling the embedding. Default is `NULL`. For details, see [uwot::umap2()]. #' * `nn_method` :: `character(1)`\cr #' Method for finding nearest neighbors. Note that only values compatible with [uwot::umap_transform()] are allowed. @@ -150,20 +152,20 @@ #' * `approx_pow` :: `logical(1)`\cr #' If `TRUE`, use an approximation to the power function in the UMAP gradient. Default is `FALSE`. #' For details, see [uwot::umap2()]. -#' * `y` :: `any`\cr -#' Optional target data for supervised dimension reduction. Default is `NULL`. -#' For details, see [uwot::umap2()]. +#' * `use_supervised` :: `logical(1)`\cr +#' If `TRUE`, perform supervised dimension reduction. This is done by passing the task's target to [uwot::umap2()]'s `y` argument. +#' For details, see there. Initialized to `FALSE`. #' * `target_n_neighbors` :: `integer(1)`\cr -#' Number of nearest neighbors to use to construct the target simplicial set. Default is `n_neighbors`. -#' For details, see [uwot::umap2()]. 
+#' Number of nearest neighbors to use to construct the target simplicial set. Only used when performing supervised dimension reduction. +#' Default is `n_neighbors`. For details, see [uwot::umap2()]. #' * `target_metric` :: `character(1)`\cr -#' The metric used to measure distance for `y` if using supervised dimension reduction. +#' The metric used to measure distance for the task's target when performing supervised dimension reduction. #' For details, see [uwot::umap2()]. #' * `target_weight` :: `numeric(1)`\cr -#' Weighting factor between data topology and target topology. Default is `0.5`. -#' For details, see [uwot::umap2()]. +#' Weighting factor between data topology and target topology. Only used when performing supervised dimension reduction. +#' Default is `0.5`. For details, see [uwot::umap2()]. #' * `pca` :: `integer(1)`\cr -#' Redude data to this number of columns using PCA. Default is `NULL`. +#' Reduce data to this number of columns using PCA. Default is `NULL`. #' For details, see [uwot::umap2()]. #' * `pca_center` :: `logical(1)`\cr #' If `TRUE`, center the columns of X before carrying out PCA. Default is `TRUE`. @@ -187,10 +189,10 @@ #' The minimum amount of work to do on each thread. Default is `1`. #' For details, see [uwot::umap2()]. #' * `verbose` :: `logical(1)`\cr -#' Should details be printed? Initialzed to `FALSE`. For details, see [uwot::umap2()]. +#' Should details be printed? Initialized to `FALSE`. For details, see [uwot::umap2()]. #' * `batch` :: `logical(1)`\cr #' If `TRUE`, then embedding coordinates are updated at the end of each epoch rather -#' than during the epoch. Default is `FALSE`. For details, see [uwot::umap2()]. +#' than during the epoch. Default is `TRUE`. For details, see [uwot::umap2()]. #' * `opt_args` :: named `list()`\cr #' A list of optimizer parameters, used when `batch = TRUE`. Default is `NULL`. #' For details, see [uwot::umap2()]. 
@@ -213,6 +215,28 @@ #' A list containing additional arguments to pass to the nearest neighbor method. #' Default is `NULL`. For details, see [uwot::umap2()]. #' +#' Additionally, there are several parameters that may be used to overwrite parameter values for prediction: +#' * `search_k_transform` :: `integer(1)`\cr +#' Number of nodes to search during the neighbor retrieval when predicting. +#' Only used if `nn_method` is `"annoy"`. If `NULL`, `search_k` is used instead. Default is `NULL`. For details, see [uwot::umap_transform()]. +#' * `n_epochs_transform` :: `integer(1)`\cr +#' Number of epochs used during the optimization of the embedded coordinates when predicting. +#' If `NULL`, `n_epochs` is used instead. Default is `NULL`. For details, see [uwot::umap_transform()]. +#' * `init_transform` :: `character(1)`\cr +#' Type of initialization for the coordinates when predicting. May be set to `"custom"`, in which case the `matrix` of initial +#' coordinates passed to `init_transform_custom` is used. Default is `"weighted"`. For details, see [uwot::umap_transform()]. +#' * `init_transform_custom` :: `matrix`\cr +#' Matrix of initial coordinates when predicting. Only used, if `init_transform` is `"custom"`. +#' * `batch_transform` :: `logical(1)`\cr +#' If `TRUE`, embedding coordinates are updated at the end of each epoch rather than during the epoch when predicting. +#' If `NULL`, `batch` is used instead. Default is `FALSE`. For details, see [uwot::umap_transform()]. +#' * `learning_rate_transform` :: `numeric(1)`\cr +#' Initial learning rate used in optimization of the coordinates when predicting. +#' If `NULL`, `learning_rate` is used instead. Default is `NULL`. For details, see [uwot::umap_transform()]. +#' * `epoch_callback_transform` :: `function`\cr +#' A function which will be invoked at the end of every epoch when predicting. +#' Default is `NULL`. For details, see [uwot::umap_transform()]. 
+#' #' @section Internals: #' Uses the [umap2()][uwot::umap2] function. #' @@ -257,15 +281,19 @@ PipeOpUMAP = R6Class("PipeOpUMAP", ), n_epochs = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap")), learning_rate = p_dbl(lower = 0, default = 1, tags = c("train", "umap")), - scale = p_lgl(default = FALSE, special_vals = list("none", "Z", "scale", "maxabs", "range", "colrange", NULL), tags = c("train", "umap")), - init = p_uty( + scale = p_fct( + levels = c("none", "scale", "maxabs", "range", "colrange"), + special_vals = list(FALSE, NULL, "Z", TRUE), + default = FALSE, + tags = c("train", "umap") + ), + init = p_fct( + levels = c("spectral", "normlaplacian", "random", "lvrandom", "laplacian", "pca", "spca", "agspectral"), + special_vals = list("custom"), default = "spectral", - tags = c("train", "umap"), - custom_check = crate(function(x) { - choices = c("spectral", "normlaplacian", "random", "lvrandom", "laplacian", "pca", "spca", "agspectral") - check_choice(x, choices) %check||% check_matrix(x) - }, .parent = topenv()) + tags = c("train", "umap") ), + init_custom = p_uty(custom_check = check_matrix, tags = "train", depends = quote(init == "custom")), init_sdev = p_dbl(default = "range", special_vals = list("range"), tags = c("train", "umap")), spread = p_dbl(default = 1, tags = c("train", "umap")), min_dist = p_dbl(default = 0.01, tags = c("train", "umap")), @@ -274,31 +302,15 @@ PipeOpUMAP = R6Class("PipeOpUMAP", bandwidth = p_dbl(default = 1, tags = c("train", "umap")), repulsion_strength = p_dbl(default = 1, tags = c("train", "umap")), negative_sample_rate = p_dbl(default = 5, tags = c("train", "umap")), - a = p_uty(default = NULL, tags = c("train", "umap")), - b = p_uty(default = NULL, tags = c("train", "umap")), - nn_method = p_uty( - default = NULL, - tags = c("train", "umap"), - custom_check = crate(function(x) check_choice(x, c("annoy", "hnsw", "nndescent"), null.ok = TRUE)) - ), + a = p_dbl(default = NULL, special_vals 
= list(NULL), tags = c("train", "umap")), + b = p_dbl(default = NULL, special_vals = list(NULL), tags = c("train", "umap")), + nn_method = p_fct(levels = c("annoy", "hnsw", "nndescent"), default = NULL, special_vals = list(NULL), tags = c("train", "umap")), n_trees = p_int(lower = 1L, default = 50L, tags = c("train", "umap"), depends = quote(nn_method == "annoy")), search_k = p_int(tags = c("train", "umap"), depends = quote(nn_method == "annoy")), # approx_pow is only used if dens_scale is non-NULL approx_pow = p_lgl(default = FALSE, tags = c("train", "umap")), - y = p_uty( - default = NULL, - tags = c("train", "umap"), - custom_check = crate(function(x) { - check_atomic_vector(x) %check||% - check_matrix(x) %check||% - check_data_frame(x) %check||% - check_list(x, len = 2, names = "unique") %check||% - check_null(x) - }, .parent = topenv()) - ), - # target_n_neighbors is only used if y is non-NULL and numeric - target_n_neighbors = p_int(tags = c("train", "umap")), - # target_metric is only used if y is non-NULL and numeric + use_supervised = p_lgl(default = FALSE, tags = c("train")), + target_n_neighbors = p_int(tags = c("train", "umap"), depends = quote(use_supervised == TRUE)), target_metric = p_fct( levels = c( "euclidean", "cosine", "manhattan", "hamming", "correlation", @@ -307,10 +319,10 @@ PipeOpUMAP = R6Class("PipeOpUMAP", "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" ), default = "euclidean", - tags = c("train", "umap") + tags = c("train", "umap"), + depends = quote(use_supervised == TRUE) ), - # target_weight is only used if y is non-NULL - target_weight = p_dbl(lower = 0, upper = 1, default = 0.5, tags = c("train", "umap")), + target_weight = p_dbl(lower = 0, upper = 1, default = 0.5, tags = c("train", "umap"), depends = quote(use_supervised == TRUE)), # pca is ignored if metric is "hamming" pca = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("train", "umap"), depends = quote(metric %in% c( @@ -319,15 +331,15 @@ 
PipeOpUMAP = R6Class("PipeOpUMAP", "jensenshannon", "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "spearmanr", "symmetrickl", "tsss", "yule" ))), - # pca_center might only be used if pca is specified (documentation unclear) + # pca_center is only used if pca is specified pca_center = p_lgl(default = TRUE, tags = c("train", "umap")), pcg_rand = p_lgl(default = TRUE, tags = c("train", "umap")), fast_sgd = p_lgl(default = FALSE, tags = c("train", "umap")), n_threads = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("train", "predict", "umap")), n_sgd_threads = p_int(lower = 0L, default = 0L, special_vals = list("auto"), tags = c("train", "predict", "umap")), - grain_size = p_int(lower = 1L, default = 1L, tags = c("train", "umap")), - verbose = p_lgl(default = TRUE, tags = c("train", "umap")), - batch = p_lgl(default = FALSE, tags = c("train", "umap")), + grain_size = p_int(lower = 1L, default = 1L, tags = c("train", "predict", "umap")), + verbose = p_lgl(default = TRUE, tags = c("train", "predict", "umap")), + batch = p_lgl(default = TRUE, tags = c("train", "umap")), opt_args = p_uty( default = NULL, tags = c("train", "umap"), @@ -350,38 +362,69 @@ PipeOpUMAP = R6Class("PipeOpUMAP", tags = c("train", "umap"), custom_check = crate(function(x) check_list(x, types = c("integer", "numeric", "character"), min.len = 1, max.len = 8, names = "unique", null.ok = TRUE)) + ), + # Parameters that are passed to umap_transform to overwrite parameters from training for prediction + search_k_transform = p_int(default = NULL, special_vals = list(NULL), tags = c("predict", "overwrite"), depends = quote(nn_method == "annoy")), + n_epochs_transform = p_int(lower = 1L, default = NULL, special_vals = list(NULL), tags = c("predict", "overwrite")), + init_transform = p_fct(levels = c("weighted", "average"), special_vals = list("custom"), default = "weighted", tags = c("predict", "overwrite")), + init_transform_custom = p_uty(custom_check = 
check_matrix, tags = "predict", depends = quote(init_transform == "custom")), + batch_transform = p_lgl(default = FALSE, special_vals = list(NULL), tags = c("predict", "overwrite")), + learning_rate_transform = p_dbl(default = NULL, special_vals = list(NULL), tags = c("predict", "overwrite")), + epoch_callback_transform = p_uty( + default = NULL, + tags = c("predict", "overwrite"), + custom_check = crate(function(x) check_function(x, args = c("epochs", "n_epochs", "coords", "fixed_coords"), null.ok = TRUE)) ) ) - ps$set_values(verbose = FALSE) + ps$values = list(verbose = FALSE, use_supervised = FALSE) super$initialize(id, param_set = ps, param_vals = param_vals, packages = "uwot", feature_types = c("numeric", "integer")) } ), private = list( .train_dt = function(dt, levels, target) { - params = insert_named(self$param_set$get_values(tags = c("umap", "train")), list(ret_model = TRUE)) - umap = invoke(uwot::umap2, dt, .args = params) + pv = self$param_set$values + pv_args = self$param_set$get_values(tags = c("umap", "train")) + # Indicate that umap2() should return the full model which we need for prediction + pv_args = insert_named(pv_args, list(ret_model = TRUE)) + # Use target for supervised dimension reduction when specified + if (!is.null(pv$use_supervised) && pv$use_supervised) { + pv_args = insert_named(pv_args, list(y = target)) + } + # Use matrix passed to init_custom for initialization when specified + if (!is.null(pv$init) && pv$init == "custom") { + pv_args = insert_named(pv_args, list(init = pv$init_custom)) + } + umap = invoke(uwot::umap2, dt, .args = pv_args) self$state = umap umap$embedding }, .predict_dt = function(dt, levels) { - params = self$param_set$get_values(tags = c("umap", "predict")) - invoke(uwot::umap_transform, dt, self$state, .args = params) + pv = self$param_set$values + pv_args = self$param_set$get_values(tags = c("umap", "predict")) + # Get overwriting params and rename them to the correct argument names for uwot::umap_transform() + 
overwrite_pv_args = self$param_set$get_values(tags = c("overwrite", "predict")) + names(overwrite_pv_args) <- sub("_transform$", "", names(overwrite_pv_args)) + pv_args = insert_named(pv_args, overwrite_pv_args) + # Use matrix passed to init_transform_custom for initialization when specified + if (!is.null(pv$init_transform) && pv$init_transform == "custom") { + pv_args = insert_named(pv_args, list(init = pv$init_transform_custom)) + } + invoke(uwot::umap_transform, dt, self$state, .args = pv_args) }, - # We need to overload deep_clone since state$nn_index$ann is a C++ address if nn_method is "annoy" or "hnsw" + # We need to overload deep_clone since state$nn_index$ann is a RefClass if nn_method is "annoy" or "hnsw" deep_clone = function(name, value) { if (name == "state" && "NO_OP" %nin% class(value)) { - # TODO: Make sure these class names are correct for different options for nn_args - # attr(attr(value$nn_index, "class"), "package") might work otherwise - if (class(value$nn_index$ann) %in% c("RcppHNSWL2", "Rcpp_AnnoyEuclidean")) { - state = value - state$nn_index$ann = value$nn_index$ann$copy() - state$nn_index$type = value$nn_index$type - state$nn_index$metric = value$nn_index$metric - state$nn_index$ndim = value$nn_index$ndim - state + if (!is.null(value$nn_index)) { + if (methods::is(value$nn_index$ann, "envRefClass")) { + state = value + state$nn_index$ann = value$nn_index$ann$copy() + state + } else { + super$deep_clone(name, value) + } } else { super$deep_clone(name, value) } diff --git a/man/mlr_pipeops_umap.Rd b/man/mlr_pipeops_umap.Rd index 69ebd3124..adf871539 100644 --- a/man/mlr_pipeops_umap.Rd +++ b/man/mlr_pipeops_umap.Rd @@ -121,9 +121,11 @@ Initial learning rate used in optimization of the coordinates. Default is \code{ For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{scale} :: \code{logical(1)} / \code{character(1)}\cr Scaling to apply to the data. If \code{TRUE}, data is standardized. Default is \code{FALSE}. 
For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{init} :: \code{character(1)} | \code{matrix}\cr -Type of initialization for the coordinates. Default is \code{"spectral"}. -For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +\item \code{init} :: \code{character(1)}\cr +Type of initialization for the coordinates. May be set to \code{"custom"}, in which case the \code{matrix} of initial +coordinates passed to \code{init_custom} is used. Default is \code{"spectral"}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +\item \code{init_custom} :: \code{matrix}\cr +Matrix of initial coordinates. Only used, if \code{init} is \code{"custom"}. \item \code{init_sdev} :: \code{character(1)} | \code{numeric(1)}\cr Scales each dimension of the initialized coordinates to this standard deviation. Default is \code{"range"}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. @@ -148,9 +150,9 @@ Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{negative_sample_rate} :: \code{numeric(1)}\cr The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding. Default is \code{5}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{a} :: \code{any}\cr +\item \code{a} :: \code{numeric(1)}\cr More specific parameters controlling the embedding. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{b} :: \code{any}\cr +\item \code{b} :: \code{numeric(1)}\cr More specific parameters controlling the embedding. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{nn_method} :: \code{character(1)}\cr Method for finding nearest neighbors. Note that only values compatible with \code{\link[uwot:umap_transform]{uwot::umap_transform()}} are allowed. @@ -164,20 +166,20 @@ For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. 
\item \code{approx_pow} :: \code{logical(1)}\cr If \code{TRUE}, use an approximation to the power function in the UMAP gradient. Default is \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. -\item \code{y} :: \code{any}\cr -Optional target data for supervised dimension reduction. Default is \code{NULL}. -For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +\item \code{use_supervised} :: \code{logical(1)}\cr +If \code{TRUE}, perform supervised dimension reduction. This is done by passing the task's target to \code{\link[uwot:umap2]{uwot::umap2()}}'s \code{y} argument. +For details, see there. Initialized to \code{FALSE}. \item \code{target_n_neighbors} :: \code{integer(1)}\cr -Number of nearest neighbors to use to construct the target simplicial set. Default is \code{n_neighbors}. -For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +Number of nearest neighbors to use to construct the target simplicial set. Only used when performing supervised dimension reduction. +Default is \code{n_neighbors}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_metric} :: \code{character(1)}\cr -The metric used to measure distance for \code{y} if using supervised dimension reduction. +The metric used to measure distance for the task's target when performing supervised dimension reduction. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{target_weight} :: \code{numeric(1)}\cr -Weighting factor between data topology and target topology. Default is \code{0.5}. -For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +Weighting factor between data topology and target topology. Only used when performing supervised dimension reduction. +Default is \code{0.5}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca} :: \code{integer(1)}\cr -Redude data to this number of columns using PCA. Default is \code{NULL}. +Reduce data to this number of columns using PCA. Default is \code{NULL}. 
For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{pca_center} :: \code{logical(1)}\cr If \code{TRUE}, center the columns of X before carrying out PCA. Default is \code{TRUE}. @@ -203,10 +205,10 @@ For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. The minimum amount of work to do on each thread. Default is \code{1}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{verbose} :: \code{logical(1)}\cr -Should details be printed? Initialzed to \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +Should details be printed? Initialized to \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{batch} :: \code{logical(1)}\cr If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather -than during the epoch. Default is \code{FALSE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. +than during the epoch. Default is \code{TRUE}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. \item \code{opt_args} :: named \code{list()}\cr A list of optimizer parameters, used when \code{batch = TRUE}. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. @@ -229,6 +231,30 @@ Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}} A list containing additional arguments to pass to the nearest neighbor method. Default is \code{NULL}. For details, see \code{\link[uwot:umap2]{uwot::umap2()}}. } + +Additionally, there are several parameters that may be used to overwrite parameter values for prediction: +\itemize{ +\item \code{search_k_transform} :: \code{integer(1)}\cr +Number of nodes to search during the neighbor retrieval when predicting. +Only used if \code{nn_method} is \code{"annoy"}. If \code{NULL}, \code{search_k} is used instead. Default is \code{NULL}. For details, see \code{\link[uwot:umap_transform]{uwot::umap_transform()}}. 
+\item \code{n_epochs_transform} :: \code{integer(1)}\cr +Number of epochs used during the optimization of the embedded coordinates when predicting. +If \code{NULL}, \code{n_epochs} is used instead. Default is \code{NULL}. For details, see \code{\link[uwot:umap_transform]{uwot::umap_transform()}}. +\item \code{init_transform} :: \code{character(1)}\cr +Type of initialization for the coordinates when predicting. May be set to \code{"custom"}, in which case the \code{matrix} of initial +coordinates passed to \code{init_transform_custom} is used. Default is \code{"weighted"}. For details, see \code{\link[uwot:umap_transform]{uwot::umap_transform()}}. +\item \code{init_transform_custom} :: \code{matrix}\cr +Matrix of initial coordinates when predicting. Only used, if \code{init_transform} is \code{"custom"}. +\item \code{batch_transform} :: \code{logical(1)}\cr +If \code{TRUE}, embedding coordinates are updated at the end of each epoch rather than during the epoch when predicting. +If \code{NULL}, \code{batch} is used instead. Default is \code{FALSE}. For details, see \code{\link[uwot:umap_transform]{uwot::umap_transform()}}. +\item \code{learning_rate_transform} :: \code{numeric(1)}\cr +Initial learning rate used in optimization of the coordinates when predicting. +If \code{NULL}, \code{learning_rate} is used instead. Default is \code{NULL}. For details, see \code{\link[uwot:umap_transform]{uwot::umap_transform()}}. +\item \code{epoch_callback_transform} :: \code{function}\cr +A function which will be invoked at the end of every epoch when predicting. +Default is \code{NULL}. For details, see \code{\link[uwot:umap_transform]{uwot::umap_transform()}}. 
+} } \section{Internals}{ From 28677597b74e7eb96201af31e43455627edf9ea6 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 23 Aug 2024 16:50:11 +0200 Subject: [PATCH 36/36] changed tests to fit new param scheme --- tests/testthat/test_pipeop_umap.R | 54 +++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/tests/testthat/test_pipeop_umap.R b/tests/testthat/test_pipeop_umap.R index 94d766365..cbcde57d5 100644 --- a/tests/testthat/test_pipeop_umap.R +++ b/tests/testthat/test_pipeop_umap.R @@ -20,6 +20,7 @@ test_that("PipeOpUMAP - basic properties", { test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default Params, nn_method = annoy", { skip_if_not_installed("uwot") + skip_if_not_installed("RcppAnnoy") task = mlr_tasks$get("iris")$filter(1:30) op = PipeOpUMAP$new() @@ -33,7 +34,7 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Default "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") expect_true(all(state_names %in% names(op$state))) - state_names_wo_pointers = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal + state_names_wo_pointers = setdiff(state_names, "nn_index") # since RefClass in state$nn_index$ann will not be equal expect_identical(op$state[state_names_wo_pointers], umap_out[state_names_wo_pointers]) expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) @@ -50,6 +51,8 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed task = mlr_tasks$get("iris")$filter(1:30) op = PipeOpUMAP$new() + + # Build list of params with the same names for PipeOpUMAP and uwot::umap2() / uwot::umap_transform() pv = list( seed = 1234L, nn_method = "annoy", @@ -64,24 +67,31 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and 
uwot::umap_transform; Changed local_connectivity = 1.1, bandwidth = 0.9, repulsion_strength = 1.1, - negative_sample_rate = 6, - y = task$data()[, 1] + negative_sample_rate = 6 ) - op$param_set$set_values(.values = pv) + # Handle parameters that are differently named for PipeOpUMAP and uwot::umap2() / uwot::umap_transform() + pv_po = insert_named(pv, list(use_supervised = TRUE, + batch_transform = TRUE, + init_transform = "average", + search_k_transform = 1000L)) + op$param_set$set_values(.values = pv_po) + args_umap2 = insert_named(pv, list(ret_model = TRUE, y = task$data()[, 1])) + args_umap_transform = list(init = "average", search_k = 1000L, batch = TRUE) train_out = train_pipeop(op, list(task))[[1L]] - umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], ret_model = TRUE, .args = pv) + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], .args = args_umap2) state_names = c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") expect_true(all(state_names %in% names(op$state))) - state_names = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal + state_names = setdiff(state_names, "nn_index") # since RefClass in state$nn_index$ann will not be equal expect_identical(op$state[state_names], umap_out[state_names]) expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) predict_out = predict_pipeop(op, list(task))[[1L]] - umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out, .args = args_umap_transform) + expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) }) @@ 
-93,6 +103,8 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed task = mlr_tasks$get("iris")$filter(1:30) op = PipeOpUMAP$new() + + # Build list of params with same names for PipeOpUMAP and uwot::umap2() / uwot::umap_transform() pv = list( seed = 1234L, nn_method = "hnsw", @@ -108,24 +120,27 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed bandwidth = 0.9, repulsion_strength = 1.1, negative_sample_rate = 6, - y = task$data()[, 1], nn_args = list(M = 10L, ef_construction = 100L, ef = 20L) ) - op$param_set$set_values(.values = pv) + # Handle parameters that are differently named for PipeOpUMAP and uwot::umap2() / uwot::umap_transform() + pv_po = insert_named(pv, list(use_supervised = TRUE, init_transform = "average")) + op$param_set$set_values(.values = pv_po) + args_umap2 = insert_named(pv, list(ret_model = TRUE, y = task$data()[, 1])) + args_umap_transform = list(init = "average") train_out = train_pipeop(op, list(task))[[1L]] - umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], ret_model = TRUE, .args = pv) + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], .args = args_umap2) state_names = c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") expect_true(all(state_names %in% names(op$state))) - state_names = setdiff(state_names, "nn_index") # since address in state$nn_index$ann will not be equal + state_names = setdiff(state_names, "nn_index") # since RefClass in state$nn_index$ann will not be equal expect_identical(op$state[state_names], umap_out[state_names]) expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) predict_out = predict_pipeop(op, list(task))[[1L]] - 
umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out, .args = args_umap_transform) expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) }) @@ -137,6 +152,8 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed task = mlr_tasks$get("iris")$filter(1:30) op = PipeOpUMAP$new() + + # Build list of params with same names for PipeOpUMAP and uwot::umap2() / uwot::umap_transform() pv = list( seed = 1234L, nn_method = "nndescent", @@ -152,25 +169,28 @@ test_that("PipeOpUMAP - Compare to uwot::umap2 and uwot::umap_transform; Changed bandwidth = 0.9, repulsion_strength = 1.1, negative_sample_rate = 6, - y = task$data()[, 1], nn_args = list(n_trees = 15L, max_candidates = 15L, pruning_degree_multiplier = 1.4, epsilon = 0.05) ) - op$param_set$set_values(.values = pv) + # Handle parameters that are differently named for PipeOpUMAP and uwot::umap2() / uwot::umap_transform() + pv_po = insert_named(pv, list(use_supervised = TRUE, init_transform = "average")) + op$param_set$set_values(.values = pv_po) + args_umap2 = insert_named(pv, list(ret_model = TRUE, y = task$data()[, 1])) + args_umap_transform = list(init = "average") train_out = train_pipeop(op, list(task))[[1L]] - umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], ret_model = TRUE, .args = pv) + umap_out = invoke(uwot::umap2, X = task$data()[, 2:5], .args = args_umap2) state_names = c("embedding", "scale_info", "search_k", "local_connectivity", "n_epochs", "alpha", "negative_sample_rate", "method", "a", "b", "gamma", "approx_pow", "metric", "norig_col", "pcg_rand", "batch", "opt_args", "num_precomputed_nns", "min_dist", "spread", "binary_edge_weights", "seed", "nn_method", "nn_args", "n_neighbors", "nn_index", "pca_models") expect_true(all(state_names %in% names(op$state))) - state_names = setdiff(state_names, "nn_index") # since address in 
state$nn_index$ann will not be equal + state_names = setdiff(state_names, "nn_index") # since RefClass in state$nn_index$ann will not be equal expect_identical(op$state[state_names], umap_out[state_names]) expect_equal(train_out$data()[, 2:3], as.data.table(umap_out[["embedding"]])) predict_out = predict_pipeop(op, list(task))[[1L]] - umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out) + umap_transform_out = invoke(uwot::umap_transform, X = task$data()[, 2:5], model = umap_out, .args = args_umap_transform) expect_equal(predict_out$data()[, 2:3], as.data.table(umap_transform_out)) })