Skip to content

Commit ed272db

Browse files
committed
First version to complete and pass successfully with the colon dataset.
1 parent a8af2d7 commit ed272db

File tree

3 files changed

+23
-11
lines changed

3 files changed

+23
-11
lines changed

src/main/scala/org/apache/spark/mllib/feature/InfoThSelector.scala

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -316,8 +316,7 @@ class InfoThSelector @Since("1.6.0") (
316316
case dv: DenseVector =>
317317
dv.values
318318
}
319-
val condition = (value: Double) => value <= Byte.MaxValue &&
320-
value >= Byte.MinValue && value % 1 == 0.0
319+
val condition = (value: Double) => value <= 255 && value >= 0.0 && value % 1 == 0.0
321320
if (!values.forall(condition(_))) {
322321
val str = values.mkString(",")
323322
throw new SparkException(
@@ -412,7 +411,7 @@ class InfoThSelector @Since("1.6.0") (
412411
case F(feat, rel) =>
413412
(feat + 1) + "\t" + "%.4f".format(rel)
414413
}.mkString("\n")
415-
logInfo("\n*** Selected features ***\nFeature\tScore\n" + out)
414+
println("\n*** Selected features ***\nFeature\tScore\n" + out)
416415
// Features must be sorted
417416
new InfoThSelectorModel(selected.map { case F(feat, rel) => feat }.sorted.toArray)
418417
}

src/test/scala/org/apache/spark/ml/feature/ITSelectorSuite.scala

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import TestHelper._
1313
* @author Sergio Ramirez
1414
*/
1515
@RunWith(classOf[JUnitRunner])
16-
class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
16+
class ITSelectorSuite extends FunSuite with BeforeAndAfterAll {
1717

1818
var sqlContext: SQLContext = null
1919

@@ -22,13 +22,15 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
2222
}
2323

2424
/** Run ITFS feature selection on the colon data and check the selected features. */
25-
test("Run ITFS on colon data (nPart = 20, nfeat = 20)") {
25+
test("Run ITFS on colon data (nPart = 10, nfeat = 10)") {
2626

2727
val df = readColonData(sqlContext)
2828
val cols = df.columns
29-
val model = getSelectorModel(sqlContext, df, df.columns.drop(1), df.columns.head, 10, 20)
29+
val pad = 2
30+
val allVectorsDense = true
31+
val model = getSelectorModel(sqlContext, df, cols.drop(1), cols.head, 10, 10, allVectorsDense, pad)
3032

31-
assertResult("764, 1581, 1671, 512, 1670, 1324, 1381, 1971, 1422, 1411") {
33+
assertResult("512, 764, 1324, 1380, 1411, 1422, 1581, 1670, 1671, 1971") {
3234
model.selectedFeatures.mkString(", ")
3335
}
3436
}

src/test/scala/org/apache/spark/ml/feature/TestHelper.scala

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ object TestHelper {
3636
labelColumn: String,
3737
nPartitions: Int = 100,
3838
numTopFeatures: Int = 20,
39-
allVectorsDense: Boolean = true): InfoThSelectorModel = {
39+
allVectorsDense: Boolean = true,
40+
padded: Int = 0): InfoThSelectorModel = {
4041
val featureAssembler = new VectorAssembler()
4142
.setInputCols(inputCols)
4243
.setOutputCol("features")
@@ -45,7 +46,14 @@ object TestHelper {
4546
/** InfoSelector requires all vectors to be of the same type (either sparse or dense) **/
4647
val rddData = processedDf.rdd.map {
4748
case Row(label: Double, features: Vector) =>
48-
val standardv = if(allVectorsDense) features.toDense else features.toSparse
49+
val standardv = if(allVectorsDense){
50+
Vectors.dense(features.toArray.map(_ + padded))
51+
} else {
52+
val sparseVec = features.toSparse
53+
val newValues: Array[Double] = sparseVec.values.map(_ + padded)
54+
Vectors.sparse(sparseVec.size, sparseVec.indices, newValues)
55+
}
56+
4957
Row.fromSeq(Seq(label, standardv))
5058
}
5159

@@ -70,9 +78,12 @@ object TestHelper {
7078
def getSelectorModel(sqlContext: SQLContext, dataframe: DataFrame, inputCols: Array[String],
7179
labelColumn: String,
7280
nPartitions: Int = 100,
73-
numTopFeatures: Int = 20): InfoThSelectorModel = {
81+
numTopFeatures: Int = 20,
82+
allVectorsDense: Boolean = true,
83+
padded: Int = 0): InfoThSelectorModel = {
7484
val processedDf = cleanLabelCol(dataframe, labelColumn)
75-
createSelectorModel(sqlContext, processedDf, inputCols, labelColumn, nPartitions, numTopFeatures)
85+
createSelectorModel(sqlContext, processedDf, inputCols, labelColumn,
86+
nPartitions, numTopFeatures, allVectorsDense, padded)
7687
}
7788

7889

0 commit comments

Comments
 (0)