Commit ede46cf

Merge remote-tracking branch 'origin/master' into palantir-master

2 parents: 692e6f8 + b83b502

708 files changed: +17344 −5294 lines

Some content is hidden: large commits have parts of the diff collapsed by default, so only a subset of the 708 changed files is shown below.

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -47,6 +47,8 @@ dev/pr-deps/
 dist/
 docs/_site
 docs/api
+sql/docs
+sql/site
 lib_managed/
 lint-r-report.log
 log/

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions

@@ -286,6 +286,8 @@ exportMethods("%<=>%",
               "lower",
               "lpad",
               "ltrim",
+              "map_keys",
+              "map_values",
               "max",
               "md5",
               "mean",

R/pkg/R/DataFrame.R

Lines changed: 1 addition & 1 deletion

@@ -593,7 +593,7 @@ setMethod("cache",
 #'
 #' Persist this SparkDataFrame with the specified storage level. For details of the
 #' supported storage levels, refer to
-#' \url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}.
+#' \url{http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}.
 #'
 #' @param x the SparkDataFrame to persist.
 #' @param newLevel storage level chosen for the persistance. See available options in
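For context, a minimal SparkR sketch of the persist call this docstring describes (it assumes an active SparkR session; the storage level string is one of the options from the guide linked above):

df <- createDataFrame(mtcars)         # small local data frame for illustration
df <- persist(df, "MEMORY_AND_DISK")  # mark for caching at the chosen storage level
head(df)                              # the first action materializes the cached data
unpersist(df)                         # release the cached copy when done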

R/pkg/R/RDD.R

Lines changed: 1 addition & 1 deletion

@@ -227,7 +227,7 @@ setMethod("cacheRDD",
 #'
 #' Persist this RDD with the specified storage level. For details of the
 #' supported storage levels, refer to
-#'\url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}.
+#'\url{http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}.
 #'
 #' @param x The RDD to persist
 #' @param newLevel The new storage level to be assigned

R/pkg/R/functions.R

Lines changed: 32 additions & 1 deletion

@@ -195,7 +195,10 @@ NULL
 #' head(tmp2)
 #' head(select(tmp, posexplode(tmp$v1)))
 #' head(select(tmp, sort_array(tmp$v1)))
-#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))}
+#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
+#' tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
+#' head(select(tmp3, map_keys(tmp3$v3)))
+#' head(select(tmp3, map_values(tmp3$v3)))}
 NULL
 
 #' Window functions for Column operations

@@ -3055,6 +3058,34 @@
             column(jc)
           })
 
+#' @details
+#' \code{map_keys}: Returns an unordered array containing the keys of the map.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_keys map_keys,Column-method
+#' @export
+#' @note map_keys since 2.3.0
+setMethod("map_keys",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_keys", x@jc)
+            column(jc)
+          })
+
+#' @details
+#' \code{map_values}: Returns an unordered array containing the values of the map.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_values map_values,Column-method
+#' @export
+#' @note map_values since 2.3.0
+setMethod("map_values",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_values", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{explode}: Creates a new row for each element in the given array or map column.
 #'
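As a usage sketch, here is a runnable version of the roxygen example above (it assumes an active SparkR session; the mtcars row names stand in for a string key column):

df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
head(select(tmp3, map_keys(tmp3$v3)))    # per-row array of map keys (the model names)
head(select(tmp3, map_values(tmp3$v3)))  # per-row array of map values (the cyl counts)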

R/pkg/R/generics.R

Lines changed: 10 additions & 0 deletions

@@ -1213,6 +1213,16 @@ setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") })
 #' @name NULL
 setGeneric("ltrim", function(x) { standardGeneric("ltrim") })
 
+#' @rdname column_collection_functions
+#' @export
+#' @name NULL
+setGeneric("map_keys", function(x) { standardGeneric("map_keys") })
+
+#' @rdname column_collection_functions
+#' @export
+#' @name NULL
+setGeneric("map_values", function(x) { standardGeneric("map_values") })
+
 #' @rdname column_misc_functions
 #' @export
 #' @name NULL

R/pkg/R/mllib_classification.R

Lines changed: 40 additions & 9 deletions

@@ -69,6 +69,11 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.
 #'                         This is an expert parameter. Default value should be good for most cases.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.svmLinear} returns a fitted linear SVM model.
 #' @rdname spark.svmLinear

@@ -98,7 +103,8 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @note spark.svmLinear since 2.2.0
 setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, maxIter = 100, tol = 1E-6, standardization = TRUE,
-                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2) {
+                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
 
             if (!is.null(weightCol) && weightCol == "") {

@@ -107,10 +113,12 @@ setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formu
               weightCol <- as.character(weightCol)
             }
 
+            handleInvalid <- match.arg(handleInvalid)
+
             jobj <- callJStatic("org.apache.spark.ml.r.LinearSVCWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam), as.integer(maxIter),
                                 as.numeric(tol), as.logical(standardization), as.numeric(threshold),
-                                weightCol, as.integer(aggregationDepth))
+                                weightCol, as.integer(aggregationDepth), handleInvalid)
             new("LinearSVCModel", jobj = jobj)
           })
 

@@ -218,6 +226,11 @@ function(object, path, overwrite = FALSE) {
 #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
 #'                                The bound vector size must be equal to 1 for binomial regression, or the number
 #'                                of classes for multinomial regression.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit

@@ -257,7 +270,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                    tol = 1E-6, family = "auto", standardization = TRUE,
                    thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                    lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
-                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
+                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
             row <- 0
             col <- 0

@@ -304,6 +318,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
               upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
             }
 
+            handleInvalid <- match.arg(handleInvalid)
+
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),

@@ -312,7 +328,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                                 weightCol, as.integer(aggregationDepth),
                                 as.integer(row), as.integer(col),
                                 lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
-                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
+                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts,
+                                handleInvalid)
             new("LogisticRegressionModel", jobj = jobj)
           })
 

@@ -394,7 +411,12 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
 #' @param stepSize stepSize parameter.
 #' @param seed seed parameter for weights initialization.
 #' @param initialWeights initialWeights parameter for weights initialization, it should be a
-#'        numeric vector.
+#'                       numeric vector.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
 #' @rdname spark.mlp

@@ -426,7 +448,8 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
 #' @note spark.mlp since 2.1.0
 setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
-                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) {
+                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
             if (is.null(layers)) {
               stop ("layers must be a integer vector with length > 1.")

@@ -441,10 +464,11 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
             if (!is.null(initialWeights)) {
               initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
             }
+            handleInvalid <- match.arg(handleInvalid)
             jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
                                 "fit", data@sdf, formula, as.integer(blockSize), as.array(layers),
                                 as.character(solver), as.integer(maxIter), as.numeric(tol),
-                                as.numeric(stepSize), seed, initialWeights)
+                                as.numeric(stepSize), seed, initialWeights, handleInvalid)
             new("MultilayerPerceptronClassificationModel", jobj = jobj)
           })
 

@@ -514,6 +538,11 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', '.', ':', '+', and '-'.
 #' @param smoothing smoothing parameter.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}.
 #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model.
 #' @rdname spark.naiveBayes

@@ -543,10 +572,12 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
 #' }
 #' @note spark.naiveBayes since 2.0.0
 setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, smoothing = 1.0) {
+          function(data, formula, smoothing = 1.0,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
+            handleInvalid <- match.arg(handleInvalid)
             jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
-                                formula, data@sdf, smoothing)
+                                formula, data@sdf, smoothing, handleInvalid)
             new("NaiveBayesModel", jobj = jobj)
           })
 
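A hedged sketch of the new parameter in use, based on the Titanic data already used in these docs' examples (the choice of "keep" is illustrative, not a recommendation):

t <- as.data.frame(Titanic)
training <- createDataFrame(t)
# "keep" routes unseen string labels into an extra bucket at index numLabels
# instead of throwing an error at fit or prediction time
model <- spark.naiveBayes(training, Survived ~ Class + Sex + Age,
                          handleInvalid = "keep")
head(predict(model, training))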

R/pkg/R/mllib_regression.R

Lines changed: 18 additions & 4 deletions

@@ -76,6 +76,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
 #'                               The default value is "frequencyDesc". When the ordering is set to
 #'                               "alphabetDesc", this drops the same category as R when encoding strings.
+#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
+#'                  as 0.0. The feature specified as offset has a constant coefficient of 1.0.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.

@@ -127,7 +129,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
                    regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
                    stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
-                                              "alphabetDesc", "alphabetAsc")) {
+                                              "alphabetDesc", "alphabetAsc"),
+                   offsetCol = NULL) {
 
             stringIndexerOrderType <- match.arg(stringIndexerOrderType)
             if (is.character(family)) {

@@ -159,12 +162,19 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- as.character(weightCol)
             }
 
+            if (!is.null(offsetCol)) {
+              offsetCol <- as.character(offsetCol)
+              if (nchar(offsetCol) == 0) {
+                offsetCol <- NULL
+              }
+            }
+
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
                                 tol, as.integer(maxIter), weightCol, regParam,
                                 as.double(var.power), as.double(link.power),
-                                stringIndexerOrderType)
+                                stringIndexerOrderType, offsetCol)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 

@@ -192,6 +202,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
 #'                               The default value is "frequencyDesc". When the ordering is set to
 #'                               "alphabetDesc", this drops the same category as R when encoding strings.
+#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
+#'                  as 0.0. The feature specified as offset has a constant coefficient of 1.0.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export

@@ -209,10 +221,12 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
           function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
                    var.power = 0.0, link.power = 1.0 - var.power,
                    stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
-                                              "alphabetDesc", "alphabetAsc")) {
+                                              "alphabetDesc", "alphabetAsc"),
+                   offsetCol = NULL) {
             spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
                       var.power = var.power, link.power = link.power,
-                      stringIndexerOrderType = stringIndexerOrderType)
+                      stringIndexerOrderType = stringIndexerOrderType,
+                      offsetCol = offsetCol)
           })
 
 # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
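A sketch of offsetCol in the classic Poisson-with-exposure setting (the toy data frame and the logExposure column name are hypothetical, not from this commit):

events <- data.frame(counts = c(2, 5, 9, 14),
                     level = c("a", "b", "a", "b"),
                     logExposure = log(c(10, 20, 30, 40)))  # hypothetical exposures
df <- createDataFrame(events)
# the offset column enters the linear predictor with a fixed coefficient of 1.0
model <- spark.glm(df, counts ~ level, family = poisson(link = "log"),
                   offsetCol = "logExposure")
summary(model)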
