Skip to content

Commit a8af2d7

Browse files
committed
The test with the colon dataset now runs, although the results are still incorrect.
1 parent e1a12f8 commit a8af2d7

File tree

2 files changed

+17
-9
lines changed

2 files changed

+17
-9
lines changed

src/test/scala/org/apache/spark/ml/feature/ITSelectorSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
2626

2727
val df = readColonData(sqlContext)
2828
val cols = df.columns
29-
val model = getSelectorModel(df, df.columns.drop(1), df.columns.head, 10, 20)
29+
val model = getSelectorModel(sqlContext, df, df.columns.drop(1), df.columns.head, 10, 20)
3030

3131
assertResult("764, 1581, 1671, 512, 1670, 1324, 1381, 1971, 1422, 1411") {
3232
model.selectedFeatures.mkString(", ")

src/test/scala/org/apache/spark/ml/feature/TestHelper.scala

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import org.apache.spark.sql.{DataFrame, Row, SQLContext}
88
import org.apache.spark.sql.types._
99
import org.joda.time.format.DateTimeFormat
1010
import org.apache.spark.ml.linalg.Vectors
11+
import org.apache.spark.ml.linalg.Vector
1112
import org.apache.spark.ml.linalg.VectorUDT
13+
import org.apache.spark.sql.Dataset
14+
import org.apache.spark.ml.util._
1215

1316
/**
1417
* Loads various test datasets
@@ -28,21 +31,26 @@ object TestHelper {
2831
/**
2932
* @return the feature selector model fit to the data, given the specified input feature columns and the label column used as the target.
3033
*/
31-
def createSelectorModel(dataframe: DataFrame, inputCols: Array[String],
34+
35+
def createSelectorModel(sqlContext: SQLContext, dataframe: Dataset[_], inputCols: Array[String],
3236
labelColumn: String,
3337
nPartitions: Int = 100,
3438
numTopFeatures: Int = 20,
3539
allVectorsDense: Boolean = true): InfoThSelectorModel = {
3640
val featureAssembler = new VectorAssembler()
3741
.setInputCols(inputCols)
3842
.setOutputCol("features")
39-
val processedDf = featureAssembler.transform(dataframe)
40-
43+
val processedDf = featureAssembler.transform(dataframe).select(labelColumn + INDEX_SUFFIX, "features")
4144

42-
processedDf.map {
45+
/** InfoSelector requires all vectors to be of the same type (either all sparse or all dense) **/
46+
val rddData = processedDf.rdd.map {
4347
case Row(label: Double, features: Vector) =>
44-
OldLabeledPoint(label, OldVectors.fromML(features))
48+
val standardv = if(allVectorsDense) features.toDense else features.toSparse
49+
Row.fromSeq(Seq(label, standardv))
4550
}
51+
52+
val inputData = sqlContext.createDataFrame(rddData, processedDf.schema)
53+
4654
val selector = new InfoThSelector()
4755
.setSelectCriterion("mrmr")
4856
.setNPartitions(nPartitions)
@@ -51,20 +59,20 @@ object TestHelper {
5159
.setLabelCol(labelColumn + INDEX_SUFFIX)
5260
.setOutputCol("selectedFeatures")
5361

54-
selector.fit(processedDf)
62+
selector.fit(inputData)
5563
}
5664

5765

5866
/**
5967
* The label column will have null values replaced with MISSING values in this case.
6068
* @return the feature selector model fit to the data, given the specified input feature columns and the label column used as the target.
6169
*/
62-
def getSelectorModel(dataframe: DataFrame, inputCols: Array[String],
70+
def getSelectorModel(sqlContext: SQLContext, dataframe: DataFrame, inputCols: Array[String],
6371
labelColumn: String,
6472
nPartitions: Int = 100,
6573
numTopFeatures: Int = 20): InfoThSelectorModel = {
6674
val processedDf = cleanLabelCol(dataframe, labelColumn)
67-
createSelectorModel(processedDf, inputCols, labelColumn, nPartitions, numTopFeatures)
75+
createSelectorModel(sqlContext, processedDf, inputCols, labelColumn, nPartitions, numTopFeatures)
6876
}
6977

7078

0 commit comments

Comments
 (0)