
Commit 0cdcf10

Chang chen authored
[GLUTEN-11088][VL] Add GlutenTests for get_json_object and schema merging (#11276)
* Add GlutenTest for schema merging failure in GlutenParquetSchemaSuite
* Add GlutenTest for get_json_object function with GlutenPlan support
* Update TODO comments in VeloxTestSettings to reflect fix

---------

Co-authored-by: Chang chen <[email protected]>
Co-authored-by: Chang chen <[email protected]>
1 parent b0da0aa commit 0cdcf10

File tree

3 files changed: +63 -3 lines changed


gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala

Lines changed: 1 addition & 3 deletions
@@ -177,7 +177,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-42782: Hive compatibility check for get_json_object")
     // Velox does not support single quotes in get_json_object function.
     .exclude("function get_json_object - support single quotes")
-    // TODO: fix in Spark-4.0
     .exclude("function get_json_object - path is null")
     .exclude("function get_json_object - json is null")
     .exclude("function get_json_object - Codegen Support")
@@ -527,8 +526,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("schema mismatch failure error message for parquet vectorized reader")
     // https://github.com/apache/incubator-gluten/issues/11220
     .excludeByPrefix("SPARK-40819")
-    // TODO: fix in Spark-4.0
-    .excludeByPrefix("SPARK-46056")
+    .excludeByPrefix("SPARK-46056") // TODO: fix in Spark-4.0
     .exclude("CANNOT_MERGE_SCHEMAS: Failed merging schemas")
   enableSuite[GlutenParquetThriftCompatibilitySuite]
     // Rewrite for file locating.
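For readers skimming the settings diff: enableSuite registers a vanilla Spark suite to run against the Velox backend, while exclude and excludeByPrefix drop individual cases by exact name or by name prefix, usually with a comment recording why. The sketch below is a simplified, self-contained stand-in for that name-based filtering, not Gluten's actual BackendTestSettings; everything beyond the method names visible in the diff is an assumption.

import scala.collection.mutable

// Simplified model of a name-based test filter in the style of
// BackendTestSettings: cases are excluded by exact name or by prefix.
// Illustrative stand-in only, not Gluten's real implementation.
class SuiteSettings {
  private val excludedNames = mutable.Set[String]()
  private val excludedPrefixes = mutable.Set[String]()

  def exclude(name: String): SuiteSettings = { excludedNames += name; this }
  def excludeByPrefix(prefix: String): SuiteSettings = { excludedPrefixes += prefix; this }

  // A test runs only if it matches neither an exact exclusion nor a prefix.
  def shouldRun(name: String): Boolean =
    !excludedNames.contains(name) && !excludedPrefixes.exists(name.startsWith)
}

object SuiteSettingsDemo {
  def main(args: Array[String]): Unit = {
    val settings = new SuiteSettings()
      .exclude("function get_json_object - support single quotes")
      .excludeByPrefix("SPARK-46056")

    println(settings.shouldRun("function get_json_object - support single quotes")) // false
    println(settings.shouldRun("SPARK-46056: some schema-merging case"))            // false
    println(settings.shouldRun("some other json test"))                             // true
  }
}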

gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala

Lines changed: 28 additions & 0 deletions
@@ -16,6 +16,8 @@
  */
 package org.apache.spark.sql
 
+import org.apache.gluten.execution.GlutenPlan
+
 class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait {
   import testImplicits._
 
@@ -101,4 +103,30 @@ class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTra
       checkAnswer(Seq(json).toDF().selectExpr(s"get_json_object(value, '$path')"), Row(exp))
     }
   }
+
+  testGluten("function get_json_object - Codegen Support") {
+    withTempView("GetJsonObjectTable") {
+      val data = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring")
+      data.createOrReplaceTempView("GetJsonObjectTable")
+      val df = sql("SELECT key, get_json_object(jstring, '$.f1') FROM GetJsonObjectTable")
+      val plan = df.queryExecution.executedPlan
+      assert(plan.isInstanceOf[GlutenPlan])
+      checkAnswer(df, Seq(Row("1", "value1")))
+    }
+  }
+  testGluten("function get_json_object - path is null") {
+    val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+    val df = data.selectExpr("get_json_object(a, null)")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.isInstanceOf[GlutenPlan])
+    checkAnswer(df, Row(null))
+  }
+
+  testGluten("function get_json_object - json is null") {
+    val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+    val df = data.selectExpr("get_json_object(null, '$.name')")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.isInstanceOf[GlutenPlan])
+    checkAnswer(df, Row(null))
+  }
 }
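The two null-argument tests pin down get_json_object's null semantics while also asserting that the query is offloaded (the executedPlan is a GlutenPlan). The null semantics themselves hold in plain Spark; a minimal standalone sketch (session setup is illustrative, no Gluten required):

import org.apache.spark.sql.SparkSession

object GetJsonObjectNullSemantics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("demo").getOrCreate()
    import spark.implicits._

    val df = Seq("""{"name": "alice", "age": 5}""").toDF("j")

    // A valid path extracts the field as a string.
    df.selectExpr("get_json_object(j, '$.name')").show()    // alice
    // A null path yields null rather than an error.
    df.selectExpr("get_json_object(j, null)").show()        // null
    // A null JSON document likewise yields null.
    df.selectExpr("get_json_object(null, '$.name')").show() // null

    spark.stop()
  }
}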

gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala

Lines changed: 34 additions & 0 deletions
@@ -16,7 +16,11 @@
  */
 package org.apache.spark.sql.execution.datasources.parquet
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql.GlutenSQLTestsBaseTrait
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions.Cast.toSQLType
+import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
 
 class GlutenParquetSchemaInferenceSuite
   extends ParquetSchemaInferenceSuite
@@ -27,4 +31,34 @@ class GlutenParquetSchemaSuite extends ParquetSchemaSuite with GlutenSQLTestsBas
   override protected def testFile(fileName: String): String = {
     getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName
   }
+
+  testGluten("CANNOT_MERGE_SCHEMAS: Failed merging schemas") {
+    import testImplicits._
+
+    withTempPath {
+      dir =>
+        val path = dir.getCanonicalPath
+
+        // Note: Velox backend always generates Parquet files with nullable = true,
+        // regardless of whether nullable is set to false or true in the schema.
+        // Before https://github.com/apache/spark/pull/44644, `StructField.sql` would not
+        // return the `NOT NULL` qualifier. This is why this test succeeds in Spark 3.5.
+        val schema1 = StructType(Seq(StructField("id", LongType, nullable = true)))
+        val df1 = spark.createDataFrame(
+          spark.sparkContext.parallelize(Seq(Row(0L), Row(1L), Row(2L))),
+          schema1)
+        df1.write.parquet(s"$path/p=1")
+        val df2 = df1.select($"id".cast(IntegerType).as(Symbol("id")))
+        df2.write.parquet(s"$path/p=2")
+
+        checkError(
+          exception = intercept[SparkException] {
+            spark.read.option("mergeSchema", "true").parquet(path)
+          },
+          condition = "CANNOT_MERGE_SCHEMAS",
+          sqlState = "42KD9",
+          parameters = Map("left" -> toSQLType(df1.schema), "right" -> toSQLType(df2.schema))
+        )
+    }
+  }
 }
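The rewritten test intercepts the CANNOT_MERGE_SCHEMAS error (SQLSTATE 42KD9) that Spark raises when mergeSchema encounters Parquet files whose id column is bigint in one partition and int in another; Spark's schema merging does not widen between these types. The same failure can be reproduced outside the test harness with plain Spark; a standalone sketch (the temp path and session setup are illustrative):

import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

object MergeSchemasFailureDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("demo").getOrCreate()
    val path = "/tmp/merge_schemas_demo" // illustrative location

    // Partition p=1: id written as bigint.
    val df1 = spark.range(3).toDF("id")
    df1.write.mode("overwrite").parquet(s"$path/p=1")
    // Partition p=2: the same column written as int.
    df1.select(col("id").cast(IntegerType).as("id"))
      .write.mode("overwrite").parquet(s"$path/p=2")

    try {
      // Schema inference with mergeSchema fails eagerly here.
      spark.read.option("mergeSchema", "true").parquet(path)
    } catch {
      case e: SparkException =>
        println(e.getMessage) // expect a CANNOT_MERGE_SCHEMAS error
    }
    spark.stop()
  }
}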
