Skip to content

Commit 9bdff0b

Browse files
kiszkcloud-fan
authored andcommitted
[SPARK-22550][SQL] Fix 64KB JVM bytecode limit problem with elt
## What changes were proposed in this pull request? This PR changes `elt` code generation to place generated code for expression for arguments into separated methods if these size could be large. This PR resolved the case of `elt` with a lot of argument ## How was this patch tested? Added new test cases into `StringExpressionsSuite` Author: Kazuaki Ishizaki <[email protected]> Closes #19778 from kiszk/SPARK-22550.
1 parent c957714 commit 9bdff0b

File tree

3 files changed

+73
-26
lines changed

3 files changed

+73
-26
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -790,23 +790,7 @@ class CodegenContext {
790790
returnType: String = "void",
791791
makeSplitFunction: String => String = identity,
792792
foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = {
793-
val blocks = new ArrayBuffer[String]()
794-
val blockBuilder = new StringBuilder()
795-
var length = 0
796-
for (code <- expressions) {
797-
// We can't know how many bytecode will be generated, so use the length of source code
798-
// as metric. A method should not go beyond 8K, otherwise it will not be JITted, should
799-
// also not be too small, or it will have many function calls (for wide table), see the
800-
// results in BenchmarkWideTable.
801-
if (length > 1024) {
802-
blocks += blockBuilder.toString()
803-
blockBuilder.clear()
804-
length = 0
805-
}
806-
blockBuilder.append(code)
807-
length += CodeFormatter.stripExtraNewLinesAndComments(code).length
808-
}
809-
blocks += blockBuilder.toString()
793+
val blocks = buildCodeBlocks(expressions)
810794

811795
if (blocks.length == 1) {
812796
// inline execution if only one block
@@ -841,6 +825,32 @@ class CodegenContext {
841825
}
842826
}
843827

828+
/**
829+
* Splits the generated code of expressions into multiple sequences of String
830+
* based on a threshold of length of a String
831+
*
832+
* @param expressions the codes to evaluate expressions.
833+
*/
834+
def buildCodeBlocks(expressions: Seq[String]): Seq[String] = {
835+
val blocks = new ArrayBuffer[String]()
836+
val blockBuilder = new StringBuilder()
837+
var length = 0
838+
for (code <- expressions) {
839+
// We can't know how many bytecode will be generated, so use the length of source code
840+
// as metric. A method should not go beyond 8K, otherwise it will not be JITted, should
841+
// also not be too small, or it will have many function calls (for wide table), see the
842+
// results in BenchmarkWideTable.
843+
if (length > 1024) {
844+
blocks += blockBuilder.toString()
845+
blockBuilder.clear()
846+
length = 0
847+
}
848+
blockBuilder.append(code)
849+
length += CodeFormatter.stripExtraNewLinesAndComments(code).length
850+
}
851+
blocks += blockBuilder.toString()
852+
}
853+
844854
/**
845855
* Here we handle all the methods which have been added to the inner classes and
846856
* not to the outer class.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -288,22 +288,52 @@ case class Elt(children: Seq[Expression])
288288
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
289289
val index = indexExpr.genCode(ctx)
290290
val strings = stringExprs.map(_.genCode(ctx))
291+
val indexVal = ctx.freshName("index")
292+
val stringVal = ctx.freshName("stringVal")
291293
val assignStringValue = strings.zipWithIndex.map { case (eval, index) =>
292294
s"""
293295
case ${index + 1}:
294-
${ev.value} = ${eval.isNull} ? null : ${eval.value};
296+
${eval.code}
297+
$stringVal = ${eval.isNull} ? null : ${eval.value};
295298
break;
296299
"""
297-
}.mkString("\n")
298-
val indexVal = ctx.freshName("index")
299-
val stringArray = ctx.freshName("strings");
300+
}
300301

301-
ev.copy(index.code + "\n" + strings.map(_.code).mkString("\n") + s"""
302-
final int $indexVal = ${index.value};
303-
UTF8String ${ev.value} = null;
304-
switch ($indexVal) {
305-
$assignStringValue
302+
val cases = ctx.buildCodeBlocks(assignStringValue)
303+
val codes = if (cases.length == 1) {
304+
s"""
305+
UTF8String $stringVal = null;
306+
switch ($indexVal) {
307+
${cases.head}
308+
}
309+
"""
310+
} else {
311+
var prevFunc = "null"
312+
for (c <- cases.reverse) {
313+
val funcName = ctx.freshName("eltFunc")
314+
val funcBody = s"""
315+
private UTF8String $funcName(InternalRow ${ctx.INPUT_ROW}, int $indexVal) {
316+
UTF8String $stringVal = null;
317+
switch ($indexVal) {
318+
$c
319+
default:
320+
return $prevFunc;
321+
}
322+
return $stringVal;
323+
}
324+
"""
325+
val fullFuncName = ctx.addNewFunction(funcName, funcBody)
326+
prevFunc = s"$fullFuncName(${ctx.INPUT_ROW}, $indexVal)"
306327
}
328+
s"UTF8String $stringVal = $prevFunc;"
329+
}
330+
331+
ev.copy(
332+
s"""
333+
${index.code}
334+
final int $indexVal = ${index.value};
335+
$codes
336+
UTF8String ${ev.value} = $stringVal;
307337
final boolean ${ev.isNull} = ${ev.value} == null;
308338
""")
309339
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
116116
assert(Elt(Seq(Literal(1), Literal(2))).checkInputDataTypes().isFailure)
117117
}
118118

119+
test("SPARK-22550: Elt should not generate codes beyond 64KB") {
120+
val N = 10000
121+
val strings = (1 to N).map(x => s"s$x")
122+
val args = Literal.create(N, IntegerType) +: strings.map(Literal.create(_, StringType))
123+
checkEvaluation(Elt(args), s"s$N")
124+
}
125+
119126
test("StringComparison") {
120127
val row = create_row("abc", null)
121128
val c1 = 'a.string.at(0)

0 commit comments

Comments
 (0)