
Commit 1732178
[SPARK-26923][R][SQL][FOLLOW-UP] Show stderr in the exception whenever possible in RRunner
### What changes were proposed in this pull request?

This is a followup of apache#23977. I made a mistake related to this line: apache@3725b13#diff-71c2cad03f08cb5f6c70462aa4e28d3aL112

Previously:

1. The reader iterator for the R worker read some initial data eagerly during RDD materialization, so it read data before the actual execution. For some reason, in this case, it showed the standard error from the R worker.
2. After that, when an error happened during the actual execution, stderr wasn't shown: apache@3725b13#diff-71c2cad03f08cb5f6c70462aa4e28d3aL260

After my change apache@3725b13#diff-71c2cad03f08cb5f6c70462aa4e28d3aL112, case 1. no longer applies and only the path in 2. remains, because such eager execution is now avoided (which is consistent with the PySpark code path).

This PR proposes to always do what 1. did, namely show the stderr from the R worker in the exception, both before and after execution, because it is quite possible that the R worker failed during the actual execution, and it is best to show its stderr whenever possible.

### Why are the changes needed?

It currently swallows the standard error from the R worker, which makes debugging harder.

### Does this PR introduce any user-facing change?

Yes.

```R
df <- createDataFrame(list(list(n=1)))
collect(dapply(df, function(x) {
  stop("asdkjasdjkbadskjbsdajbk")
  x
}, structType("a double")))
```

**Before:**

```
Error in handleErrors(returnStatus, conn) :
  org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 13.0 failed 1 times, most recent failure: Lost task 0.0 in stage 13.0 (TID 13, 192.168.35.193, executor driver): org.apache.spark.SparkException: R worker exited unexpectedly (cranshed)
	at org.apache.spark.api.r.RRunner$$anon$1.read(RRunner.scala:130)
	at org.apache.spark.api.r.BaseRRunner$ReaderIterator.hasNext(BaseRRunner.scala:118)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:726)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:337)
	at org.apache.spark.
```

**After:**

```
Error in handleErrors(returnStatus, conn) :
  org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, 192.168.35.193, executor driver): org.apache.spark.SparkException: R unexpectedly exited.
R worker produced errors: Error in computeFunc(inputData) : asdkjasdjkbadskjbsdajbk

	at org.apache.spark.api.r.BaseRRunner$ReaderIterator$$anonfun$1.applyOrElse(BaseRRunner.scala:144)
	at org.apache.spark.api.r.BaseRRunner$ReaderIterator$$anonfun$1.applyOrElse(BaseRRunner.scala:137)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.r.RRunner$$anon$1.read(RRunner.scala:128)
	at org.apache.spark.api.r.BaseRRunner$ReaderIterator.hasNext(BaseRRunner.scala:113)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegen
```

### How was this patch tested?

Manually tested, and a unit test was added.

Closes apache#26517 from HyukjinKwon/SPARK-26923-followup.
Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
1 parent ab981f1 commit 1732178

File tree

4 files changed: +20 -15 lines changed

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 7 additions & 0 deletions
```diff
@@ -3238,6 +3238,13 @@ test_that("Histogram", {
   expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
 })
 
+test_that("dapply() should show error message from R worker", {
+  df <- createDataFrame(list(list(n = 1)))
+  expect_error({
+    collect(dapply(df, function(x) stop("custom error message"), structType("a double")))
+  }, "custom error message")
+})
+
 test_that("dapply() and dapplyCollect() on a DataFrame", {
   df <- createDataFrame(
     list(list(1L, 1, "1"), list(2L, 2, "2"), list(3L, 3, "3")),
```

core/src/main/scala/org/apache/spark/api/r/BaseRRunner.scala

Lines changed: 11 additions & 6 deletions
```diff
@@ -82,12 +82,7 @@ private[spark] abstract class BaseRRunner[IN, OUT](
       serverSocket.close()
     }
 
-    try {
-      newReaderIterator(dataStream, errThread)
-    } catch {
-      case e: Exception =>
-        throw new SparkException("R computation failed with\n " + errThread.getLines(), e)
-    }
+    newReaderIterator(dataStream, errThread)
   }
 
   /**
@@ -138,6 +133,16 @@ private[spark] abstract class BaseRRunner[IN, OUT](
      * and then returns null.
      */
     protected def read(): OUT
+
+    protected val handleException: PartialFunction[Throwable, OUT] = {
+      case e: Exception =>
+        var msg = "R unexpectedly exited."
+        val lines = errThread.getLines()
+        if (lines.trim().nonEmpty) {
+          msg += s"\nR worker produced errors: $lines\n"
+        }
+        throw new SparkException(msg, e)
+    }
   }
 
   /**
```
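A note on the Scala idiom this hunk introduces: a `catch` clause accepts any expression of type `PartialFunction[Throwable, T]`, which is why `handleException` can be defined once on the shared reader iterator and reused verbatim by the concrete runners below. The following is a minimal standalone sketch of the same pattern; `RuntimeException` and the stub `stderrLines()` are stand-ins for `SparkException` and `errThread.getLines()`, not the real APIs.

```scala
import java.io.EOFException

object CatchHandlerSketch {
  // Stub standing in for errThread.getLines(), which buffers the R
  // worker's stderr in the real code.
  def stderrLines(): String = "Error in computeFunc(inputData) : boom"

  // Same shape as BaseRRunner's handleException: wrap any Exception,
  // appending the captured stderr when it is non-empty.
  val handleException: PartialFunction[Throwable, Int] = {
    case e: Exception =>
      var msg = "R unexpectedly exited."
      val lines = stderrLines()
      if (lines.trim().nonEmpty) {
        msg += s"\nR worker produced errors: $lines\n"
      }
      throw new RuntimeException(msg, e) // SparkException in Spark itself
  }

  // A PartialFunction value can be used directly as the catch clause.
  def read(): Int = try {
    throw new EOFException("stream closed") // simulate a crashed worker
  } catch handleException

  def main(args: Array[String]): Unit = {
    try read() catch { case e: RuntimeException => println(e.getMessage) }
  }
}
```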

core/src/main/scala/org/apache/spark/api/r/RRunner.scala

Lines changed: 1 addition & 4 deletions
```diff
@@ -125,10 +125,7 @@ private[spark] class RRunner[IN, OUT](
               eos = true
               null.asInstanceOf[OUT]
           }
-        } catch {
-          case eof: EOFException =>
-            throw new SparkException("R worker exited unexpectedly (cranshed)", eof)
-        }
+        } catch handleException
       }
     }
   }
```
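One behavioral detail of this simplification: the removed `catch` here was specific to `EOFException`, while the shared `handleException` matches any `Exception`. Since `EOFException` extends `IOException`, which extends `Exception`, the old case is subsumed, and fatal JVM `Error`s still propagate unhandled. A small hypothetical check of that coverage (the object and names are mine, not from the patch):

```scala
object HandlerCoverage extends App {
  // Same guard as handleException: matches Exception, not every Throwable.
  val handlesIt: PartialFunction[Throwable, Unit] = { case _: Exception => () }

  assert(handlesIt.isDefinedAt(new java.io.EOFException("eof"))) // subsumed
  assert(!handlesIt.isDefinedAt(new OutOfMemoryError("fatal")))  // falls through
  println("coverage checks passed")
}
```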

sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala

Lines changed: 1 addition & 5 deletions
```diff
@@ -191,11 +191,7 @@ class ArrowRRunner(
             null
           }
         }
-      } catch {
-        case eof: EOFException =>
-          throw new SparkException(
-            "R worker exited unexpectedly (crashed)\n " + errThread.getLines(), eof)
-      }
+      } catch handleException
     }
   }
 }
```
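With this hunk, both concrete runners delegate stderr formatting to `handleException`, which in turn reads `errThread.getLines()`. For context, here is a simplified, hypothetical analogue of such a stderr-collecting thread; the actual helper behind `errThread` in Spark may differ in naming, buffering, and lifecycle.

```scala
import java.io.{BufferedReader, InputStreamReader}
import scala.collection.mutable

// Daemon thread that drains a child process's stderr into a bounded buffer
// so the last lines can be attached to an error message on failure.
class StderrBufferThread(proc: Process, keepLast: Int = 100) extends Thread {
  private val lines = mutable.Queue.empty[String]
  setDaemon(true)

  override def run(): Unit = {
    val reader = new BufferedReader(new InputStreamReader(proc.getErrorStream))
    var line = reader.readLine()
    while (line != null) {
      lines.synchronized {
        lines.enqueue(line)
        if (lines.size > keepLast) lines.dequeue() // keep only the tail
      }
      line = reader.readLine()
    }
  }

  // Mirrors the role of errThread.getLines(): everything buffered so far.
  def getLines(): String = lines.synchronized { lines.mkString("\n") }
}
```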
