Commit ab7b961

[SPARK-23942][PYTHON][SQL] Makes collect in PySpark as action for a query executor listener
## What changes were proposed in this pull request?

This PR proposes to make `collect` in PySpark run as an action from the point of view of a query execution listener. Currently, `collect` (and `collect` with Arrow enabled) is not recognised as an action by `QueryExecutionListener`. For example, suppose we have a custom listener as below:

```scala
package org.apache.spark.sql

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener


class TestQueryExecutionListener extends QueryExecutionListener with Logging {
  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {
    logError("Look at me! I'm 'onSuccess'")
  }

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { }
}
```

and set `spark.sql.queryExecutionListeners` to `org.apache.spark.sql.TestQueryExecutionListener`.

Other operations, on both the PySpark and Scala sides, work fine:

```python
>>> sql("SELECT * FROM range(1)").show()
```
```
18/04/09 17:02:04 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess'
+---+
| id|
+---+
|  0|
+---+
```

```scala
scala> sql("SELECT * FROM range(1)").collect()
```
```
18/04/09 16:58:41 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess'
res1: Array[org.apache.spark.sql.Row] = Array([0])
```

but the PySpark `collect` and Arrow-based `toPandas` paths do not notify the listener:

**Before**

```python
>>> sql("SELECT * FROM range(1)").collect()
```
```
[Row(id=0)]
```

```python
>>> spark.conf.set("spark.sql.execution.arrow.enabled", "true")
>>> sql("SELECT * FROM range(1)").toPandas()
```
```
   id
0   0
```

**After**

```python
>>> sql("SELECT * FROM range(1)").collect()
```
```
18/04/09 16:57:58 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess'
[Row(id=0)]
```

```python
>>> spark.conf.set("spark.sql.execution.arrow.enabled", "true")
>>> sql("SELECT * FROM range(1)").toPandas()
```
```
18/04/09 17:53:26 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess'
   id
0   0
```

## How was this patch tested?

Manually tested as described above, and a unit test was added.

Author: hyukjinkwon <[email protected]>

Closes apache#21007 from HyukjinKwon/SPARK-23942.
1 parent: 14291b0 · commit: ab7b961
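For readers who want to see the listener behaviour end to end from the Scala side, here is a minimal sketch (the `ListenerDemo` object name and `local[2]` master are illustrative, not part of this PR) that registers a `QueryExecutionListener` programmatically via `spark.listenerManager` instead of the static `spark.sql.queryExecutionListeners` configuration, then runs a `collect()`:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener

// Illustrative only: register a listener programmatically rather than through the
// static spark.sql.queryExecutionListeners configuration used in this PR's test.
object ListenerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("ListenerDemo")
      .getOrCreate()

    spark.listenerManager.register(new QueryExecutionListener {
      override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit =
        // funcName is the action name, e.g. "collect" on the Scala side or, after this
        // change, "collectToPython" / "collectAsArrowToPython" when driven from PySpark.
        println(s"onSuccess: $funcName took ${durationNs / 1e6} ms")

      override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit =
        println(s"onFailure: $funcName failed with $exception")
    })

    // A Scala-side collect() already triggered onSuccess before this PR; the fix makes
    // the PySpark collect()/toPandas() code paths report to the listener as well.
    spark.sql("SELECT * FROM range(1)").collect()

    spark.stop()
  }
}
```

The PR's test drives everything from Python, where this registration API is not exposed, which is why it instead sets the static `spark.sql.queryExecutionListeners` configuration when building its test session.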

File tree: 3 files changed, +134 −17 lines

python/pyspark/sql/tests.py

Lines changed: 77 additions & 10 deletions
```diff
@@ -186,16 +186,12 @@ def __init__(self, key, value):
         self.value = value
 
 
-class ReusedSQLTestCase(ReusedPySparkTestCase):
-    @classmethod
-    def setUpClass(cls):
-        ReusedPySparkTestCase.setUpClass()
-        cls.spark = SparkSession(cls.sc)
-
-    @classmethod
-    def tearDownClass(cls):
-        ReusedPySparkTestCase.tearDownClass()
-        cls.spark.stop()
+class SQLTestUtils(object):
+    """
+    This util assumes the instance of this to have 'spark' attribute, having a spark session.
+    It is usually used with 'ReusedSQLTestCase' class but can be used if you feel sure the
+    the implementation of this class has 'spark' attribute.
+    """
 
     @contextmanager
     def sql_conf(self, pairs):
@@ -204,6 +200,7 @@ def sql_conf(self, pairs):
         `value` to the configuration `key` and then restores it back when it exits.
         """
         assert isinstance(pairs, dict), "pairs should be a dictionary."
+        assert hasattr(self, "spark"), "it should have 'spark' attribute, having a spark session."
 
         keys = pairs.keys()
         new_values = pairs.values()
@@ -219,6 +216,18 @@ def sql_conf(self, pairs):
             else:
                 self.spark.conf.set(key, old_value)
 
+
+class ReusedSQLTestCase(ReusedPySparkTestCase, SQLTestUtils):
+    @classmethod
+    def setUpClass(cls):
+        ReusedPySparkTestCase.setUpClass()
+        cls.spark = SparkSession(cls.sc)
+
+    @classmethod
+    def tearDownClass(cls):
+        ReusedPySparkTestCase.tearDownClass()
+        cls.spark.stop()
+
     def assertPandasEqual(self, expected, result):
         msg = ("DataFrames are not equal: " +
                "\n\nExpected:\n%s\n%s" % (expected, expected.dtypes) +
@@ -3066,6 +3075,64 @@ def test_sparksession_with_stopped_sparkcontext(self):
             sc.stop()
 
 
+class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils):
+    # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is
+    # static and immutable. This can't be set or unset, for example, via `spark.conf`.
+
+    @classmethod
+    def setUpClass(cls):
+        import glob
+        from pyspark.find_spark_home import _find_spark_home
+
+        SPARK_HOME = _find_spark_home()
+        filename_pattern = (
+            "sql/core/target/scala-*/test-classes/org/apache/spark/sql/"
+            "TestQueryExecutionListener.class")
+        if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)):
+            raise unittest.SkipTest(
+                "'org.apache.spark.sql.TestQueryExecutionListener' is not "
+                "available. Will skip the related tests.")
+
+        # Note that 'spark.sql.queryExecutionListeners' is a static immutable configuration.
+        cls.spark = SparkSession.builder \
+            .master("local[4]") \
+            .appName(cls.__name__) \
+            .config(
+                "spark.sql.queryExecutionListeners",
+                "org.apache.spark.sql.TestQueryExecutionListener") \
+            .getOrCreate()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.spark.stop()
+
+    def tearDown(self):
+        self.spark._jvm.OnSuccessCall.clear()
+
+    def test_query_execution_listener_on_collect(self):
+        self.assertFalse(
+            self.spark._jvm.OnSuccessCall.isCalled(),
+            "The callback from the query execution listener should not be called before 'collect'")
+        self.spark.sql("SELECT * FROM range(1)").collect()
+        self.assertTrue(
+            self.spark._jvm.OnSuccessCall.isCalled(),
+            "The callback from the query execution listener should be called after 'collect'")
+
+    @unittest.skipIf(
+        not _have_pandas or not _have_pyarrow,
+        _pandas_requirement_message or _pyarrow_requirement_message)
+    def test_query_execution_listener_on_collect_with_arrow(self):
+        with self.sql_conf({"spark.sql.execution.arrow.enabled": True}):
+            self.assertFalse(
+                self.spark._jvm.OnSuccessCall.isCalled(),
+                "The callback from the query execution listener should not be "
+                "called before 'toPandas'")
+            self.spark.sql("SELECT * FROM range(1)").toPandas()
+            self.assertTrue(
+                self.spark._jvm.OnSuccessCall.isCalled(),
+                "The callback from the query execution listener should be called after 'toPandas'")
+
+
 class SparkSessionTests(PySparkTestCase):
 
     # This test is separate because it's closely related with session's start and stop.
```

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 13 additions & 7 deletions
```diff
@@ -3189,10 +3189,10 @@ class Dataset[T] private[sql](
 
   private[sql] def collectToPython(): Int = {
     EvaluatePython.registerPicklers()
-    withNewExecutionId {
+    withAction("collectToPython", queryExecution) { plan =>
       val toJava: (Any) => Any = EvaluatePython.toJava(_, schema)
-      val iter = new SerDeUtil.AutoBatchedPickler(
-        queryExecution.executedPlan.executeCollect().iterator.map(toJava))
+      val iter: Iterator[Array[Byte]] = new SerDeUtil.AutoBatchedPickler(
+        plan.executeCollect().iterator.map(toJava))
       PythonRDD.serveIterator(iter, "serve-DataFrame")
     }
   }
@@ -3201,8 +3201,9 @@ class Dataset[T] private[sql](
    * Collect a Dataset as ArrowPayload byte arrays and serve to PySpark.
    */
   private[sql] def collectAsArrowToPython(): Int = {
-    withNewExecutionId {
-      val iter = toArrowPayload.collect().iterator.map(_.asPythonSerializable)
+    withAction("collectAsArrowToPython", queryExecution) { plan =>
+      val iter: Iterator[Array[Byte]] =
+        toArrowPayload(plan).collect().iterator.map(_.asPythonSerializable)
       PythonRDD.serveIterator(iter, "serve-Arrow")
     }
   }
@@ -3311,14 +3312,19 @@ class Dataset[T] private[sql](
   }
 
   /** Convert to an RDD of ArrowPayload byte arrays */
-  private[sql] def toArrowPayload: RDD[ArrowPayload] = {
+  private[sql] def toArrowPayload(plan: SparkPlan): RDD[ArrowPayload] = {
     val schemaCaptured = this.schema
     val maxRecordsPerBatch = sparkSession.sessionState.conf.arrowMaxRecordsPerBatch
     val timeZoneId = sparkSession.sessionState.conf.sessionLocalTimeZone
-    queryExecution.toRdd.mapPartitionsInternal { iter =>
+    plan.execute().mapPartitionsInternal { iter =>
       val context = TaskContext.get()
       ArrowConverters.toPayloadIterator(
         iter, schemaCaptured, maxRecordsPerBatch, timeZoneId, context)
     }
   }
+
+  // This is only used in tests, for now.
+  private[sql] def toArrowPayload: RDD[ArrowPayload] = {
+    toArrowPayload(queryExecution.executedPlan)
+  }
 }
```
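The key change above is routing `collectToPython` and `collectAsArrowToPython` through `withAction` instead of a bare `withNewExecutionId`. Roughly, `withAction` runs the physical plan under a new execution id, measures the duration, and notifies the session's `ExecutionListenerManager`, which is what makes `QueryExecutionListener.onSuccess` / `onFailure` fire. A simplified sketch of that wrapper is below; it is a paraphrase, not the exact Spark source, and it is not runnable on its own since it assumes the surrounding `Dataset.scala` context (`sparkSession`, `SQLExecution`, `QueryExecution`, `SparkPlan`):

```scala
// Simplified sketch of a withAction-style wrapper (paraphrased, not the exact Spark source).
private def withAction[U](name: String, qe: QueryExecution)(action: SparkPlan => U): U = {
  try {
    val start = System.nanoTime()
    val result = SQLExecution.withNewExecutionId(sparkSession, qe) {
      action(qe.executedPlan)  // run the physical plan, e.g. executeCollect()
    }
    val end = System.nanoTime()
    // This callback is what QueryExecutionListener.onSuccess ultimately receives.
    sparkSession.listenerManager.onSuccess(name, qe, end - start)
    result
  } catch {
    case e: Exception =>
      sparkSession.listenerManager.onFailure(name, qe, e)
      throw e
  }
}
```

Because both PySpark entry points now go through this wrapper, the listener observes `collect()` and Arrow-based `toPandas()` just as it already observed a Scala-side `collect()`.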
sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala (new file)

Lines changed: 44 additions & 0 deletions

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql

import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener


class TestQueryExecutionListener extends QueryExecutionListener {
  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {
    OnSuccessCall.isOnSuccessCalled.set(true)
  }

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { }
}

/**
 * This has a variable to check if `onSuccess` is actually called or not. Currently, this is for
 * the test case in PySpark. See SPARK-23942.
 */
object OnSuccessCall {
  val isOnSuccessCalled = new AtomicBoolean(false)

  def isCalled(): Boolean = isOnSuccessCalled.get()

  def clear(): Unit = isOnSuccessCalled.set(false)
}
```
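For reference, a minimal sketch of how this helper could be exercised from a Scala test; the PR's actual test drives the equivalent steps from Python through `spark._jvm.OnSuccessCall`. It assumes a `SparkSession` named `spark` that was built with `spark.sql.queryExecutionListeners` set to `org.apache.spark.sql.TestQueryExecutionListener`:

```scala
// Minimal sketch, assuming `spark` was created with
// .config("spark.sql.queryExecutionListeners", "org.apache.spark.sql.TestQueryExecutionListener").
assert(!OnSuccessCall.isCalled())              // nothing has run yet
spark.sql("SELECT * FROM range(1)").collect()  // collect is now reported as an action
assert(OnSuccessCall.isCalled())               // the listener's onSuccess flipped the flag
OnSuccessCall.clear()                          // reset the flag between tests
```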
