Skip to content

Commit 8efc5ec

Browse files
s1ck authored and cloud-fan committed
[SPARK-27174][SQL] Add support for casting integer types to binary
Co-authored-by: Philip Stutz <philip.stutzgmail.com> ## What changes were proposed in this pull request? This PR adds support for casting * `ByteType` * `ShortType` * `IntegerType` * `LongType` to `BinaryType`. ## How was this patch tested? We added unit tests for casting instances of the above types. For validation, we used Java's `DataOutputStream` to compare the resulting byte array with the result of `Cast`. We state that the contribution is our original work and that we license the work to the project under the project’s open source license. cloud-fan we'd appreciate a review if you find the time, thx Closes apache#24107 from s1ck/cast_to_binary. Authored-by: Martin Junghanns <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 8204dc1 commit 8efc5ec

File tree

9 files changed

+234
-88
lines changed

9 files changed

+234
-88
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ object Cast {
4444
case (_, StringType) => true
4545

4646
case (StringType, BinaryType) => true
47+
case (_: IntegralType, BinaryType) => true
4748

4849
case (StringType, BooleanType) => true
4950
case (DateType, BooleanType) => true
@@ -326,6 +327,10 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
326327
// BinaryConverter
327328
/**
 * Selects the value-level conversion (`Any => Any`) used to cast a value of type `from`
 * to binary.
 *
 *  - `StringType`: the bytes returned by `UTF8String.getBytes`.
 *  - `ByteType` / `ShortType` / `IntegerType` / `LongType`: the matching
 *    `NumberConverter.toBinary` overload, chosen through the type argument given to
 *    `buildCast`.
 *
 * The match has no default case, so any other `from` fails with a `MatchError`.
 * NOTE(review): `Cast.canCast` admits every `IntegralType` as a source for `BinaryType`;
 * confirm that the four types listed here are the only `IntegralType` members.
 *
 * @param from the source data type of the cast
 * @return the conversion function produced by `buildCast` for that source type
 */
private[this] def castToBinary(from: DataType): Any => Any = from match {
328329
case StringType => buildCast[UTF8String](_, _.getBytes)
330+
case ByteType => buildCast[Byte](_, NumberConverter.toBinary)
331+
case ShortType => buildCast[Short](_, NumberConverter.toBinary)
332+
case IntegerType => buildCast[Int](_, NumberConverter.toBinary)
333+
case LongType => buildCast[Long](_, NumberConverter.toBinary)
329334
}
330335

331336
// UDFToBoolean
@@ -908,7 +913,11 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
908913

909914
/**
 * Selects the `CastFunction` that emits source code assigning the binary form of the
 * input `c` to `evPrim`.
 *
 *  - `StringType`: emits a `getBytes()` call on the input.
 *  - any `IntegralType`: emits a `toBinary(c)` call qualified with the class name of
 *    `NumberConverter`. `getClass.getName` on a Scala object ends in `$`; the suffix is
 *    stripped, presumably so the emitted code calls the static forwarder on the plain
 *    class name — TODO confirm.
 *
 * The match has no default case, so any other `from` fails with a `MatchError`.
 *
 * @param from the source data type of the cast
 * @return a function of `(c, evPrim, evNull)` producing the assignment code; `evNull`
 *         is not used by either case
 */
private[this] def castToBinaryCode(from: DataType): CastFunction = from match {
910915
case StringType =>
911-
(c, evPrim, evNull) => code"$evPrim = $c.getBytes();"
916+
(c, evPrim, evNull) =>
917+
code"$evPrim = $c.getBytes();"
918+
case _: IntegralType =>
919+
(c, evPrim, evNull) =>
920+
code"$evPrim = ${NumberConverter.getClass.getName.stripSuffix("$")}.toBinary($c);"
912921
}
913922

914923
private[this] def castToDateCode(

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,4 +169,39 @@ object NumberConverter {
169169
}
170170
UTF8String.fromBytes(java.util.Arrays.copyOfRange(temp, resultStartPos, temp.length))
171171
}
172+
173+
/**
 * Encodes a `Long` as 8 bytes, most significant byte first (big-endian):
 * `result(0)` holds bits 56-63 and `result(7)` holds bits 0-7.
 *
 * In each expression `>>>` binds tighter than `&`, so `l >>> n & 0xFF` reads as
 * `(l >>> n) & 0xFF`.
 *
 * @param l the value to encode
 * @return a newly allocated array of length 8
 */
def toBinary(l: Long): Array[Byte] = {
174+
val result = new Array[Byte](8)
175+
result(0) = (l >>> 56 & 0xFF).toByte
176+
result(1) = (l >>> 48 & 0xFF).toByte
177+
result(2) = (l >>> 40 & 0xFF).toByte
178+
result(3) = (l >>> 32 & 0xFF).toByte
179+
result(4) = (l >>> 24 & 0xFF).toByte
180+
result(5) = (l >>> 16 & 0xFF).toByte
181+
result(6) = (l >>> 8 & 0xFF).toByte
182+
result(7) = (l & 0xFF).toByte
183+
result
184+
}
185+
186+
/**
 * Encodes an `Int` as 4 bytes, most significant byte first (big-endian):
 * `result(0)` holds bits 24-31 and `result(3)` holds bits 0-7.
 *
 * @param i the value to encode
 * @return a newly allocated array of length 4
 */
def toBinary(i: Int): Array[Byte] = {
187+
val result = new Array[Byte](4)
188+
result(0) = (i >>> 24 & 0xFF).toByte
189+
result(1) = (i >>> 16 & 0xFF).toByte
190+
result(2) = (i >>> 8 & 0xFF).toByte
191+
result(3) = (i & 0xFF).toByte
192+
result
193+
}
194+
195+
/**
 * Encodes a `Short` as 2 bytes, high byte first (big-endian).
 *
 * `s` is widened to `Int` (with sign extension) before the shift; the `& 0xFF` mask
 * discards the extended sign bits so only the intended 8 bits reach each element.
 *
 * @param s the value to encode
 * @return a newly allocated array of length 2
 */
def toBinary(s: Short): Array[Byte] = {
196+
val result = new Array[Byte](2)
197+
result(0) = (s >>> 8 & 0xFF).toByte
198+
result(1) = (s & 0xFF).toByte
199+
result
200+
}
201+
202+
/**
 * Wraps a single `Byte` in a one-element array, unchanged.
 *
 * @param s the byte to wrap
 * @return a newly allocated array of length 1 whose only element is `s`
 */
def toBinary(s: Byte): Array[Byte] = {
203+
val result = new Array[Byte](1)
204+
result(0) = s
205+
result
206+
}
172207
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -215,11 +215,6 @@ class AnalysisErrorSuite extends AnalysisTest {
215215
testRelation2.groupBy('a)(sum(UnresolvedStar(None))),
216216
"Invalid usage of '*'" :: "in expression 'sum'" :: Nil)
217217

218-
errorTest(
219-
"bad casts",
220-
testRelation.select(Literal(1).cast(BinaryType).as('badCast)),
221-
"cannot cast" :: Literal(1).dataType.simpleString :: BinaryType.simpleString :: Nil)
222-
223218
errorTest(
224219
"sorting by unsupported column types",
225220
mapRelation.orderBy('map.asc),

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,11 @@
1717

1818
package org.apache.spark.sql.catalyst.util
1919

20+
import java.nio.ByteBuffer
21+
import java.nio.ByteOrder.BIG_ENDIAN
22+
2023
import org.apache.spark.SparkFunSuite
21-
import org.apache.spark.sql.catalyst.util.NumberConverter.convert
24+
import org.apache.spark.sql.catalyst.util.NumberConverter.{convert, toBinary}
2225
import org.apache.spark.unsafe.types.UTF8String
2326

2427
class NumberConverterSuite extends SparkFunSuite {
@@ -37,4 +40,47 @@ class NumberConverterSuite extends SparkFunSuite {
3740
checkConv("11abc", 10, 16, "B")
3841
}
3942

43+
// Checks the Byte overload on zero, one, minus one and both ends of the Byte range.
test("byte to binary") {
44+
checkToBinary(0.toByte)
45+
checkToBinary(1.toByte)
46+
checkToBinary(-1.toByte)
47+
checkToBinary(Byte.MaxValue)
48+
checkToBinary(Byte.MinValue)
49+
}
50+
51+
// Checks the Short overload on zero, one, minus one and both ends of the Short range.
test("short to binary") {
52+
checkToBinary(0.toShort)
53+
checkToBinary(1.toShort)
54+
checkToBinary(-1.toShort)
55+
checkToBinary(Short.MaxValue)
56+
checkToBinary(Short.MinValue)
57+
}
58+
59+
// Checks the Int overload on zero, one, minus one and both ends of the Int range.
test("integer to binary") {
60+
checkToBinary(0)
61+
checkToBinary(1)
62+
checkToBinary(-1)
63+
checkToBinary(Int.MaxValue)
64+
checkToBinary(Int.MinValue)
65+
}
66+
67+
// Checks the Long overload on zero, one, minus one and both ends of the Long range.
test("long to binary") {
68+
checkToBinary(0L)
69+
checkToBinary(1L)
70+
checkToBinary(-1L)
71+
checkToBinary(Long.MaxValue)
72+
checkToBinary(Long.MinValue)
73+
}
74+
75+
/**
 * Asserts that `toBinary(in)` equals the bytes produced by writing `in` into a
 * big-endian `ByteBuffer` of the matching width (1, 2, 4 or 8 bytes).
 *
 * The overload under test is picked by matching on the runtime type of `in`. Only
 * `Byte`, `Short`, `Int` and `Long` are handled; there is no default case, so any
 * other argument fails with a `MatchError`.
 *
 * @param in the integral value to encode and compare
 */
def checkToBinary[T](in: T): Unit = in match {
76+
case b: Byte =>
77+
assert(toBinary(b) === ByteBuffer.allocate(1).order(BIG_ENDIAN).put(b).array())
78+
case s: Short =>
79+
assert(toBinary(s) === ByteBuffer.allocate(2).order(BIG_ENDIAN).putShort(s).array())
80+
case i: Int =>
81+
assert(toBinary(i) === ByteBuffer.allocate(4).order(BIG_ENDIAN).putInt(i).array())
82+
case l: Long =>
83+
assert(toBinary(l) === ByteBuffer.allocate(8).order(BIG_ENDIAN).putLong(l).array())
84+
}
85+
4086
}

sql/core/src/test/resources/sql-tests/inputs/cast.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ SELECT CAST('-9223372036854775809' AS long);
4040
SELECT CAST('9223372036854775807' AS long);
4141
SELECT CAST('9223372036854775808' AS long);
4242

43+
-- cast string to its binary representation
44+
SELECT HEX(CAST('abc' AS binary));
45+
46+
-- cast integral values to their corresponding binary representation
47+
SELECT HEX(CAST(CAST(123 AS byte) AS binary));
48+
SELECT HEX(CAST(CAST(-123 AS byte) AS binary));
49+
SELECT HEX(CAST(123S AS binary));
50+
SELECT HEX(CAST(-123S AS binary));
51+
SELECT HEX(CAST(123 AS binary));
52+
SELECT HEX(CAST(-123 AS binary));
53+
SELECT HEX(CAST(123L AS binary));
54+
SELECT HEX(CAST(-123L AS binary));
55+
4356
DESC FUNCTION boolean;
4457
DESC FUNCTION EXTENDED boolean;
4558
-- TODO: migrate all cast tests here.

sql/core/src/test/resources/sql-tests/results/cast.sql.out

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-- Automatically generated by SQLQueryTestSuite
2-
-- Number of queries: 24
2+
-- Number of queries: 33
33

44

55
-- !query 0
@@ -179,20 +179,92 @@ NULL
179179

180180

181181
-- !query 22
182-
DESC FUNCTION boolean
182+
SELECT HEX(CAST('abc' AS binary))
183183
-- !query 22 schema
184-
struct<function_desc:string>
184+
struct<hex(CAST(abc AS BINARY)):string>
185185
-- !query 22 output
186+
616263
187+
188+
189+
-- !query 23
190+
SELECT HEX(CAST(CAST(123 AS byte) AS binary))
191+
-- !query 23 schema
192+
struct<hex(CAST(CAST(123 AS TINYINT) AS BINARY)):string>
193+
-- !query 23 output
194+
7B
195+
196+
197+
-- !query 24
198+
SELECT HEX(CAST(CAST(-123 AS byte) AS binary))
199+
-- !query 24 schema
200+
struct<hex(CAST(CAST(-123 AS TINYINT) AS BINARY)):string>
201+
-- !query 24 output
202+
85
203+
204+
205+
-- !query 25
206+
SELECT HEX(CAST(123S AS binary))
207+
-- !query 25 schema
208+
struct<hex(CAST(123 AS BINARY)):string>
209+
-- !query 25 output
210+
007B
211+
212+
213+
-- !query 26
214+
SELECT HEX(CAST(-123S AS binary))
215+
-- !query 26 schema
216+
struct<hex(CAST(-123 AS BINARY)):string>
217+
-- !query 26 output
218+
FF85
219+
220+
221+
-- !query 27
222+
SELECT HEX(CAST(123 AS binary))
223+
-- !query 27 schema
224+
struct<hex(CAST(123 AS BINARY)):string>
225+
-- !query 27 output
226+
0000007B
227+
228+
229+
-- !query 28
230+
SELECT HEX(CAST(-123 AS binary))
231+
-- !query 28 schema
232+
struct<hex(CAST(-123 AS BINARY)):string>
233+
-- !query 28 output
234+
FFFFFF85
235+
236+
237+
-- !query 29
238+
SELECT HEX(CAST(123L AS binary))
239+
-- !query 29 schema
240+
struct<hex(CAST(123 AS BINARY)):string>
241+
-- !query 29 output
242+
000000000000007B
243+
244+
245+
-- !query 30
246+
SELECT HEX(CAST(-123L AS binary))
247+
-- !query 30 schema
248+
struct<hex(CAST(-123 AS BINARY)):string>
249+
-- !query 30 output
250+
FFFFFFFFFFFFFF85
251+
252+
253+
-- !query 31
254+
DESC FUNCTION boolean
255+
-- !query 31 schema
256+
struct<function_desc:string>
257+
-- !query 31 output
186258
Class: org.apache.spark.sql.catalyst.expressions.Cast
187259
Function: boolean
188260
Usage: boolean(expr) - Casts the value `expr` to the target data type `boolean`.
189261

190262

191-
-- !query 23
263+
-- !query 32
192264
DESC FUNCTION EXTENDED boolean
193-
-- !query 23 schema
265+
-- !query 32 schema
194266
struct<function_desc:string>
195-
-- !query 23 output
267+
-- !query 32 output
196268
Class: org.apache.spark.sql.catalyst.expressions.Cast
197269
Extended Usage:
198270
No example/argument for boolean.

0 commit comments

Comments
 (0)