|
18 | 18 | //! "crypto" DataFusion functions |
19 | 19 |
|
20 | 20 | use arrow::array::{ |
21 | | - Array, ArrayRef, AsArray, BinaryArray, BinaryArrayType, |
22 | | - StringViewBuilder, |
| 21 | + Array, ArrayRef, AsArray, BinaryArray, BinaryArrayType, StringViewArray, |
23 | 22 | }; |
24 | | -use arrow::compute::StringArrayType; |
25 | 23 | use arrow::datatypes::DataType; |
26 | 24 | use blake2::{Blake2b512, Blake2s256, Digest}; |
27 | 25 | use blake3::Hasher as Blake3; |
| 26 | +use datafusion_common::cast::as_binary_array; |
| 27 | + |
| 28 | +use arrow::compute::StringArrayType; |
28 | 29 | use datafusion_common::{ |
29 | | - DataFusionError, Result, ScalarValue, exec_err, plan_err, utils::take_function_args, |
| 30 | + DataFusionError, Result, ScalarValue, exec_err, internal_err, plan_err, |
| 31 | + utils::take_function_args, |
30 | 32 | }; |
31 | 33 | use datafusion_expr::ColumnarValue; |
32 | 34 | use md5::Md5; |
@@ -136,77 +138,23 @@ impl fmt::Display for DigestAlgorithm { |
136 | 138 | /// computes md5 hash digest of the given input |
137 | 139 | pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> { |
138 | 140 | let [data] = take_function_args("md5", args)?; |
| 141 | + let value = digest_process(data, DigestAlgorithm::Md5)?; |
139 | 142 |
|
140 | | - // MD5 returns Utf8View (hex-encoded), so we use optimized fused digest+hex functions |
141 | | - // that avoid creating an intermediate BinaryArray |
142 | | - match data { |
143 | | - ColumnarValue::Array(a) => { |
144 | | - let array = match a.data_type() { |
145 | | - DataType::Utf8View => md5_hex_string_array(&a.as_string_view()), |
146 | | - DataType::Utf8 => md5_hex_string_array(&a.as_string::<i32>()), |
147 | | - DataType::LargeUtf8 => md5_hex_string_array(&a.as_string::<i64>()), |
148 | | - DataType::Binary => md5_hex_binary_array(&a.as_binary::<i32>()), |
149 | | - DataType::LargeBinary => md5_hex_binary_array(&a.as_binary::<i64>()), |
150 | | - DataType::BinaryView => md5_hex_binary_array(&a.as_binary_view()), |
151 | | - other => { |
152 | | - return exec_err!("Unsupported data type {other:?} for function md5"); |
153 | | - } |
154 | | - }; |
155 | | - Ok(ColumnarValue::Array(array)) |
156 | | - } |
157 | | - ColumnarValue::Scalar(scalar) => { |
158 | | - let hex_string = match scalar { |
159 | | - ScalarValue::Utf8View(a) |
160 | | - | ScalarValue::Utf8(a) |
161 | | - | ScalarValue::LargeUtf8(a) => { |
162 | | - a.as_ref().map(|s| hex_encode(Md5::digest(s.as_bytes()))) |
163 | | - } |
164 | | - ScalarValue::Binary(a) |
165 | | - | ScalarValue::LargeBinary(a) |
166 | | - | ScalarValue::BinaryView(a) => { |
167 | | - a.as_ref().map(|v| hex_encode(Md5::digest(v.as_slice()))) |
168 | | - } |
169 | | - other => { |
170 | | - return exec_err!("Unsupported data type {other:?} for function md5"); |
171 | | - } |
172 | | - }; |
173 | | - Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(hex_string))) |
174 | | - } |
175 | | - } |
176 | | -} |
177 | | - |
178 | | -/// Computes MD5 hash and hex-encodes in a single pass for string arrays. |
179 | | -/// Uses StringViewBuilder for efficient StringViewArray construction. |
180 | | -#[inline] |
181 | | -fn md5_hex_string_array<'a, T: StringArrayType<'a>>(input: &T) -> ArrayRef { |
182 | | - let mut builder = StringViewBuilder::with_capacity(input.len()); |
183 | | - for val in input.iter() { |
184 | | - match val { |
185 | | - Some(s) => { |
186 | | - let hash = Md5::digest(s.as_bytes()); |
187 | | - builder.append_value(hex_encode(hash)); |
188 | | - } |
189 | | - None => builder.append_null(), |
| 143 | + // md5 needs special handling: the binary digest is hex-encoded and returned as Utf8View |
| 144 | + Ok(match value { |
| 145 | + ColumnarValue::Array(array) => { |
| 146 | + let binary_array = as_binary_array(&array)?; |
| 147 | + let string_array: StringViewArray = binary_array |
| 148 | + .iter() |
| 149 | + .map(|opt| opt.map(hex_encode::<_>)) |
| 150 | + .collect(); |
| 151 | + ColumnarValue::Array(Arc::new(string_array)) |
190 | 152 | } |
191 | | - } |
192 | | - Arc::new(builder.finish()) |
193 | | -} |
194 | | - |
195 | | -/// Computes MD5 hash and hex-encodes in a single pass for binary arrays. |
196 | | -/// Uses StringViewBuilder for efficient StringViewArray construction. |
197 | | -#[inline] |
198 | | -fn md5_hex_binary_array<'a, T: BinaryArrayType<'a>>(input: &T) -> ArrayRef { |
199 | | - let mut builder = StringViewBuilder::with_capacity(input.len()); |
200 | | - for val in input.iter() { |
201 | | - match val { |
202 | | - Some(bytes) => { |
203 | | - let hash = Md5::digest(bytes); |
204 | | - builder.append_value(hex_encode(hash)); |
205 | | - } |
206 | | - None => builder.append_null(), |
| 153 | + ColumnarValue::Scalar(ScalarValue::Binary(opt)) => { |
| 154 | + ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode::<_>))) |
207 | 155 | } |
208 | | - } |
209 | | - Arc::new(builder.finish()) |
| 156 | + _ => return internal_err!("Impossibly got invalid results from digest"), |
| 157 | + }) |
210 | 158 | } |
211 | 159 |
|
212 | 160 | /// Hex encoding lookup table for fast byte-to-hex conversion |
|
0 commit comments