Skip to content

Commit a0345cb

Browse files
Peng Meng authored and srowen committed
[SPARK-21680][ML][MLLIB] optimize Vector compress
## What changes were proposed in this pull request? When use Vector.compressed to change a Vector to SparseVector, the performance is very low comparing with Vector.toSparse. This is because you have to scan the value three times using Vector.compressed, but you just need two times when use Vector.toSparse. When the length of the vector is large, there is significant performance difference between this two method. ## How was this patch tested? The existing UT Author: Peng Meng <[email protected]> Closes apache#18899 from mpjlu/optVectorCompress.
1 parent 7add4e9 commit a0345cb

File tree

5 files changed

+60
-14
lines changed

5 files changed

+60
-14
lines changed

mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,21 @@ sealed trait Vector extends Serializable {
136136
* Converts this vector to a sparse vector with all explicit zeros removed.
137137
*/
138138
@Since("2.0.0")
139-
def toSparse: SparseVector
139+
def toSparse: SparseVector = toSparseWithSize(numNonzeros)
140+
141+
/**
142+
* Converts this vector to a sparse vector with all explicit zeros removed when the size is known.
143+
* This method is used to avoid re-computing the number of non-zero elements when it is
144+
* already known. This method should only be called after computing the number of non-zero
145+
* elements via [[numNonzeros]]. e.g.
146+
* {{{
147+
* val nnz = numNonzeros
148+
* val sv = toSparseWithSize(nnz)
149+
* }}}
150+
*
151+
* If `nnz` is under-specified, a [[java.lang.ArrayIndexOutOfBoundsException]] is thrown.
152+
*/
153+
private[linalg] def toSparseWithSize(nnz: Int): SparseVector
140154

141155
/**
142156
* Converts this vector to a dense vector.
@@ -152,7 +166,7 @@ sealed trait Vector extends Serializable {
152166
val nnz = numNonzeros
153167
// A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
154168
if (1.5 * (nnz + 1.0) < size) {
155-
toSparse
169+
toSparseWithSize(nnz)
156170
} else {
157171
toDense
158172
}
@@ -495,8 +509,7 @@ class DenseVector @Since("2.0.0") ( @Since("2.0.0") val values: Array[Double]) e
495509
nnz
496510
}
497511

498-
override def toSparse: SparseVector = {
499-
val nnz = numNonzeros
512+
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
500513
val ii = new Array[Int](nnz)
501514
val vv = new Array[Double](nnz)
502515
var k = 0
@@ -635,8 +648,7 @@ class SparseVector @Since("2.0.0") (
635648
nnz
636649
}
637650

638-
override def toSparse: SparseVector = {
639-
val nnz = numNonzeros
651+
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
640652
if (nnz == numActives) {
641653
this
642654
} else {

mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,11 +318,21 @@ class VectorsSuite extends SparkMLFunSuite {
318318
assert(dv0s.numActives === 2)
319319
assert(dv0s === dv0)
320320

321+
assert(dv0.toSparseWithSize(dv0.numNonzeros) === dv0)
322+
val dv0s2 = dv0.toSparseWithSize(dv0.numNonzeros)
323+
assert(dv0s2.numActives === 2)
324+
assert(dv0s2 === dv0s)
325+
321326
val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
322327
assert(sv0.toDense === sv0)
323328
val sv0s = sv0.toSparse
324329
assert(sv0s.numActives === 2)
325330
assert(sv0s === sv0)
331+
332+
assert(sv0.toSparseWithSize(sv0.numNonzeros) === sv0)
333+
val sv0s2 = sv0.toSparseWithSize(sv0.numNonzeros)
334+
assert(sv0s2.numActives === 2)
335+
assert(sv0s2 === sv0s)
326336
}
327337

328338
test("Vector.compressed") {

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,21 @@ sealed trait Vector extends Serializable {
149149
* Converts this vector to a sparse vector with all explicit zeros removed.
150150
*/
151151
@Since("1.4.0")
152-
def toSparse: SparseVector
152+
def toSparse: SparseVector = toSparseWithSize(numNonzeros)
153+
154+
/**
155+
* Converts this vector to a sparse vector with all explicit zeros removed when the size is known.
156+
* This method is used to avoid re-computing the number of non-zero elements when it is
157+
* already known. This method should only be called after computing the number of non-zero
158+
* elements via [[numNonzeros]]. e.g.
159+
* {{{
160+
* val nnz = numNonzeros
161+
* val sv = toSparseWithSize(nnz)
162+
* }}}
163+
*
164+
* If `nnz` is under-specified, a [[java.lang.ArrayIndexOutOfBoundsException]] is thrown.
165+
*/
166+
private[linalg] def toSparseWithSize(nnz: Int): SparseVector
153167

154168
/**
155169
* Converts this vector to a dense vector.
@@ -165,7 +179,7 @@ sealed trait Vector extends Serializable {
165179
val nnz = numNonzeros
166180
// A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
167181
if (1.5 * (nnz + 1.0) < size) {
168-
toSparse
182+
toSparseWithSize(nnz)
169183
} else {
170184
toDense
171185
}
@@ -669,9 +683,7 @@ class DenseVector @Since("1.0.0") (
669683
nnz
670684
}
671685

672-
@Since("1.4.0")
673-
override def toSparse: SparseVector = {
674-
val nnz = numNonzeros
686+
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
675687
val ii = new Array[Int](nnz)
676688
val vv = new Array[Double](nnz)
677689
var k = 0
@@ -822,9 +834,7 @@ class SparseVector @Since("1.0.0") (
822834
nnz
823835
}
824836

825-
@Since("1.4.0")
826-
override def toSparse: SparseVector = {
827-
val nnz = numNonzeros
837+
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
828838
if (nnz == numActives) {
829839
this
830840
} else {

mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,11 +351,21 @@ class VectorsSuite extends SparkFunSuite with Logging {
351351
assert(dv0s.numActives === 2)
352352
assert(dv0s === dv0)
353353

354+
assert(dv0.toSparseWithSize(dv0.numNonzeros) === dv0)
355+
val dv0s2 = dv0.toSparseWithSize(dv0.numNonzeros)
356+
assert(dv0s2.numActives === 2)
357+
assert(dv0s2 === dv0s)
358+
354359
val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
355360
assert(sv0.toDense === sv0)
356361
val sv0s = sv0.toSparse
357362
assert(sv0s.numActives === 2)
358363
assert(sv0s === sv0)
364+
365+
assert(sv0.toSparseWithSize(sv0.numNonzeros) === sv0)
366+
val sv0s2 = sv0.toSparseWithSize(sv0.numNonzeros)
367+
assert(sv0s2.numActives === 2)
368+
assert(sv0s2 === sv0s)
359369
}
360370

361371
test("Vector.compressed") {

project/MimaExcludes.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,10 @@ object MimaExcludes {
10151015
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"),
10161016
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
10171017
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy")
1018+
) ++ Seq(
1019+
// [SPARK-21680][ML][MLLIB] optimize Vector compress
1020+
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"),
1021+
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize")
10181022
)
10191023
}
10201024

0 commit comments

Comments
 (0)