Skip to content

Commit 32c848a

Browse files
committed
int_to_binary
1 parent c7aad67 commit 32c848a

File tree

4 files changed

+47
-10
lines changed

4 files changed

+47
-10
lines changed

docs/source/user-guide/latest/compatibility.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ The following cast operations are generally compatible with Spark except for the
106106
| byte | double | |
107107
| byte | decimal | |
108108
| byte | string | |
109+
| byte | binary | |
109110
| short | boolean | |
110111
| short | byte | |
111112
| short | integer | |
@@ -114,6 +115,7 @@ The following cast operations are generally compatible with Spark except for the
114115
| short | double | |
115116
| short | decimal | |
116117
| short | string | |
118+
| short | binary | |
117119
| integer | boolean | |
118120
| integer | byte | |
119121
| integer | short | |
@@ -122,6 +124,7 @@ The following cast operations are generally compatible with Spark except for the
122124
| integer | double | |
123125
| integer | decimal | |
124126
| integer | string | |
127+
| integer | binary | |
125128
| long | boolean | |
126129
| long | byte | |
127130
| long | short | |
@@ -130,6 +133,7 @@ The following cast operations are generally compatible with Spark except for the
130133
| long | double | |
131134
| long | decimal | |
132135
| long | string | |
136+
| long | binary | |
133137
| float | boolean | |
134138
| float | byte | |
135139
| float | short | |
@@ -160,7 +164,6 @@ The following cast operations are generally compatible with Spark except for the
160164
| string | long | |
161165
| string | float | |
162166
| string | double | |
163-
| string | binary | |
164167
| string | date | Only supports years between 262143 BC and 262142 AD |
165168
| binary | string | |
166169
| date | string | |

native/spark-expr/src/conversion_funcs/cast.rs

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@ use crate::{timezone, BinaryOutputStyle};
2020
use crate::{EvalMode, SparkError, SparkResult};
2121
use arrow::array::builder::StringBuilder;
2222
use arrow::array::{
23-
BooleanBuilder, Decimal128Builder, DictionaryArray, GenericByteArray, ListArray,
23+
BinaryBuilder, BooleanBuilder, Decimal128Builder, DictionaryArray, GenericByteArray, ListArray,
2424
PrimitiveBuilder, StringArray, StructArray,
2525
};
2626
use arrow::compute::can_cast_types;
2727
use arrow::datatypes::{
28-
i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type, GenericBinaryType,
29-
Schema,
28+
i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type,
29+
GenericBinaryType, Schema,
3030
};
3131
use arrow::{
3232
array::{
@@ -311,22 +311,22 @@ fn can_cast_from_byte(to_type: &DataType, _: &SparkCastOptions) -> bool {
311311
use DataType::*;
312312
matches!(
313313
to_type,
314-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _)
314+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) | Binary
315315
)
316316
}
317317

318318
fn can_cast_from_short(to_type: &DataType, _: &SparkCastOptions) -> bool {
319319
use DataType::*;
320320
matches!(
321321
to_type,
322-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _)
322+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) | Binary
323323
)
324324
}
325325

326326
fn can_cast_from_int(to_type: &DataType, options: &SparkCastOptions) -> bool {
327327
use DataType::*;
328328
match to_type {
329-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Utf8 => true,
329+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Utf8 | Binary => true,
330330
Decimal128(_, _) => {
331331
// incompatible: no overflow check
332332
options.allow_incompat
@@ -338,7 +338,7 @@ fn can_cast_from_int(to_type: &DataType, options: &SparkCastOptions) -> bool {
338338
fn can_cast_from_long(to_type: &DataType, options: &SparkCastOptions) -> bool {
339339
use DataType::*;
340340
match to_type {
341-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => true,
341+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Binary => true,
342342
Decimal128(_, _) => {
343343
// incompatible: no overflow check
344344
options.allow_incompat
@@ -501,6 +501,29 @@ macro_rules! cast_float_to_string {
501501
}};
502502
}
503503

504+
// eval mode is not needed since all ints can be implemented in binary format
505+
macro_rules! cast_whole_num_to_binary {
506+
($array:expr, $primitive_type:ty, $byte_size:expr) => {{
507+
let input_arr = $array
508+
.as_any()
509+
.downcast_ref::<$primitive_type>()
510+
.ok_or_else(|| SparkError::Internal("Expected numeric array".to_string()))?;
511+
512+
let len = input_arr.len();
513+
let mut builder = BinaryBuilder::with_capacity(len, len * $byte_size);
514+
515+
for i in 0..input_arr.len() {
516+
if input_arr.is_null(i) {
517+
builder.append_null();
518+
} else {
519+
builder.append_value(input_arr.value(i).to_be_bytes());
520+
}
521+
}
522+
523+
Ok(Arc::new(builder.finish()) as ArrayRef)
524+
}};
525+
}
526+
504527
macro_rules! cast_int_to_int_macro {
505528
(
506529
$array: expr,
@@ -1096,6 +1119,10 @@ fn cast_array(
10961119
Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?)
10971120
}
10981121
(Binary, Utf8) => Ok(cast_binary_to_string::<i32>(&array, cast_options)?),
1122+
(Int8, Binary) => cast_whole_num_to_binary!(&array, Int8Array, 1),
1123+
(Int16, Binary) => cast_whole_num_to_binary!(&array, Int16Array, 2),
1124+
(Int32, Binary) => cast_whole_num_to_binary!(&array, Int32Array, 4),
1125+
(Int64, Binary) => cast_whole_num_to_binary!(&array, Int64Array, 8),
10991126
_ if cast_options.is_adapting_schema
11001127
|| is_datafusion_spark_compatible(from_type, to_type) =>
11011128
{

spark/src/main/scala/org/apache/comet/expressions/CometCast.scala

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.comet.expressions
2121

2222
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal}
2323
import org.apache.spark.sql.internal.SQLConf
24-
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType}
24+
import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, DataTypes, DecimalType, NullType, StructType}
2525

2626
import org.apache.comet.CometConf
2727
import org.apache.comet.CometSparkSessionExtensions.withInfo
@@ -126,6 +126,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
126126
isSupported(dt.elementType, DataTypes.StringType, timeZoneId, evalMode)
127127
case (dt: ArrayType, dt1: ArrayType) =>
128128
isSupported(dt.elementType, dt1.elementType, timeZoneId, evalMode)
129+
case (from: DataType, _: BinaryType) => canCastToBinary(from)
129130
case (dt: DataType, _) if dt.typeName == "timestamp_ntz" =>
130131
// https://github.com/apache/datafusion-comet/issues/378
131132
toType match {
@@ -344,6 +345,12 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
344345
case _ => Unsupported(Some(s"Cast from DecimalType to $toType is not supported"))
345346
}
346347

348+
private def canCastToBinary(fromType: DataType): SupportLevel = fromType match {
349+
case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType =>
350+
Compatible()
351+
case _ => Unsupported(Some(s"Cast from BinaryType to $fromType is not supported"))
352+
}
353+
347354
private def unsupported(fromType: DataType, toType: DataType): Unsupported = {
348355
Unsupported(Some(s"Cast from $fromType to $toType is not supported"))
349356
}

spark/src/test/scala/org/apache/comet/CometCastSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
281281
hasIncompatibleType = usingParquetExecWithIncompatTypes)
282282
}
283283

284-
ignore("cast ShortType to BinaryType") {
284+
test("cast ShortType to BinaryType") {
285285
castTest(
286286
generateShorts(),
287287
DataTypes.BinaryType,

0 commit comments

Comments
 (0)