
Commit 973e9da

ignore nullable in DeepImageFeaturizer.validateSchema (#143)
In Spark SQL, nullability is a hint used during optimization and code generation to skip null checks; it is not an enforcement mechanism, nor an indication that null values actually exist. The flag can also get dropped as a DataFrame moves through a pipeline. This PR switches the validateSchema check to DataType.equalsIgnoreNullability.
1 parent a44fcbb commit 973e9da
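
To make the comparison concrete, here is a minimal sketch (the field name and type are hypothetical, not the real ImageSchema; it assumes the DataTypeShim object added in this commit is on the classpath):

import org.apache.spark.sql.types._

// Two struct types that differ only in their nullable flags.
val strict  = StructType(StructField("data", BinaryType, nullable = false) :: Nil)
val relaxed = StructType(StructField("data", BinaryType, nullable = true) :: Nil)

strict == relaxed                                      // false: StructField equality includes the nullable flag
DataTypeShim.equalsIgnoreNullability(strict, relaxed)  // true: only field names and types are compared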

3 files changed, +42 -3 lines changed

src/main/scala/com/databricks/sparkdl/DeepImageFeaturizer.scala

Lines changed: 2 additions & 3 deletions
@@ -25,14 +25,13 @@ import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.param.{Param, ParamMap}
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
 import org.apache.spark.sql.functions.{col, udf}
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{DataTypeShim, StructType}
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.tensorflow.framework.GraphDef
 import org.tensorframes.impl.DebugRowOps
 import org.tensorframes.{Shape, ShapeDescription}


-
 class DeepImageFeaturizer(override val uid: String) extends Transformer with DefaultParamsWritable {

   def this() = this(Identifiable.randomUID("deepImageFeaturizer"))
@@ -65,7 +64,7 @@ class DeepImageFeaturizer(override val uid: String) extends Transformer with Def
     val fieldIndex = schema.fieldIndex(inputColumnName)
     val colType = schema.fields(fieldIndex).dataType
     require(
-      colType == ImageSchema.columnSchema,
+      DataTypeShim.equalsIgnoreNullability(colType, ImageSchema.columnSchema),
       s"inputCol must be an image column with schema ImageSchema.columnSchema, got ${colType}"
     )
   }
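
A hedged example of how such a mismatch arises in practice (the DataFrame name, path, and model name below are made up; imports of DeepImageFeaturizer and a SparkSession named spark are assumed): Spark marks every column as nullable when writing Parquet, so an image column that starts out matching ImageSchema.columnSchema exactly comes back with relaxed nullability after a round trip.

// imagesDf is a hypothetical DataFrame with an "image" column of type ImageSchema.columnSchema.
imagesDf.write.parquet("/tmp/images.parquet")
val reloaded = spark.read.parquet("/tmp/images.parquet")

// After the round trip the "image" struct differs from ImageSchema.columnSchema only in
// its nullable flags, so the old check colType == ImageSchema.columnSchema rejected it.
val featurizer = new DeepImageFeaturizer()
  .setModelName("InceptionV3")   // assumed to be one of the supported model names
  .setInputCol("image")
  .setOutputCol("features")
featurizer.transformSchema(reloaded.schema)   // passes with the relaxed check from this commit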
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2017 Databricks, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+object DataTypeShim {
+  /**
+   * Compares two types, ignoring nullability of ArrayType, MapType, StructType.
+   */
+  def equalsIgnoreNullability(left: DataType, right: DataType): Boolean = {
+    DataType.equalsIgnoreNullability(left, right)
+  }
+}
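
A note on placement: the shim is declared inside the org.apache.spark.sql.types package, which suggests Spark's DataType.equalsIgnoreNullability helper is not accessible from ordinary user packages; the public shim simply re-exposes it. As the doc comment says, nullability is ignored recursively for ArrayType, MapType, and StructType. A small sketch with illustrative types (not taken from sparkdl):

import org.apache.spark.sql.types._

// Array types whose element structs differ only in nullability flags.
val a = ArrayType(StructType(StructField("x", IntegerType, nullable = false) :: Nil), containsNull = false)
val b = ArrayType(StructType(StructField("x", IntegerType, nullable = true) :: Nil), containsNull = true)

a == b                                      // false: containsNull and field nullability differ
DataTypeShim.equalsIgnoreNullability(a, b)  // true: element and field types still match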

src/test/scala/com/databricks/sparkdl/DeepImageFeaturizerSuite.scala

Lines changed: 14 additions & 0 deletions
@@ -128,4 +128,18 @@ class DeepImageFeaturizerSuite extends FunSuite with TestSparkContext with Defau
       .setOutputCol("myOutput")
     testDefaultReadWrite(featurizer)
   }
+
+  test("DeepImageFeaturizer accepts nullable") {
+    val nullableImageSchema = StructType(
+      data.schema("image").dataType.asInstanceOf[StructType]
+        .fields.map(_.copy(nullable = true)))
+    val nullableSchema = StructType(StructField("image", nullableImageSchema, true) :: Nil)
+    val featurizer = new DeepImageFeaturizer()
+      .setModelName("_test")
+      .setInputCol("image")
+      .setOutputCol("features")
+    withClue("featurizer should accept nullable schemas") {
+      featurizer.transformSchema(nullableSchema)
+    }
+  }
 }
