Skip to content

Commit 5b90bc9

Browse files
committed
Add ILIKE, refactor NOT LIKE operators
ILIKE is a case-insensitive version of LIKE. Also remove code duplication for the NOT LIKE operators.
1 parent 2630309 commit 5b90bc9

File tree

1 file changed

+102
-104
lines changed

1 file changed

+102
-104
lines changed

arrow/src/compute/kernels/comparison.rs

Lines changed: 102 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation
2323
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.
2424
25-
use regex::Regex;
25+
use regex::RegexBuilder;
2626
use std::collections::HashMap;
2727

2828
use crate::array::*;
@@ -225,6 +225,23 @@ where
225225
pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
226226
left: &GenericStringArray<OffsetSize>,
227227
right: &GenericStringArray<OffsetSize>,
228+
) -> Result<BooleanArray> {
229+
like_utf8_impl(left, right, true, false)
230+
}
231+
232+
/// Case-insensitive version of [like_utf8]
233+
pub fn ilike_utf8<OffsetSize: StringOffsetSizeTrait>(
234+
left: &GenericStringArray<OffsetSize>,
235+
right: &GenericStringArray<OffsetSize>,
236+
) -> Result<BooleanArray> {
237+
like_utf8_impl(left, right, false, false)
238+
}
239+
240+
fn like_utf8_impl<OffsetSize: StringOffsetSizeTrait>(
241+
left: &GenericStringArray<OffsetSize>,
242+
right: &GenericStringArray<OffsetSize>,
243+
case_sensitive: bool,
244+
reverse_results: bool,
228245
) -> Result<BooleanArray> {
229246
let mut map = HashMap::new();
230247
if left.len() != right.len() {
@@ -245,17 +262,24 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
245262
regex
246263
} else {
247264
let re_pattern = pat.replace("%", ".*").replace("_", ".");
248-
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
249-
ArrowError::ComputeError(format!(
250-
"Unable to build regex from LIKE pattern: {}",
251-
e
252-
))
253-
})?;
265+
let re = RegexBuilder::new(&format!("^{}$", re_pattern))
266+
.case_insensitive(!case_sensitive)
267+
.build()
268+
.map_err(|e| {
269+
ArrowError::ComputeError(format!(
270+
"Unable to build regex from LIKE pattern: {}",
271+
e
272+
))
273+
})?;
254274
map.insert(pat, re);
255275
map.get(pat).unwrap()
256276
};
257277

258-
result.append(re.is_match(haystack));
278+
let mut r = re.is_match(haystack);
279+
if reverse_results {
280+
r = !r;
281+
}
282+
result.append(r);
259283
}
260284

261285
let data = ArrayData::new(
@@ -281,48 +305,89 @@ fn is_like_pattern(c: char) -> bool {
281305
pub fn like_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
282306
left: &GenericStringArray<OffsetSize>,
283307
right: &str,
308+
) -> Result<BooleanArray> {
309+
like_utf8_scalar_impl(left, right, true, false)
310+
}
311+
312+
/// Case-insensitive version of [like_utf8_scalar]
313+
pub fn ilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
314+
left: &GenericStringArray<OffsetSize>,
315+
right: &str,
316+
) -> Result<BooleanArray> {
317+
like_utf8_scalar_impl(left, right, false, false)
318+
}
319+
320+
fn like_utf8_scalar_impl<OffsetSize: StringOffsetSizeTrait>(
321+
left: &GenericStringArray<OffsetSize>,
322+
right: &str,
323+
case_sensitive: bool,
324+
reverse_results: bool,
284325
) -> Result<BooleanArray> {
285326
let null_bit_buffer = left.data().null_buffer().cloned();
286327
let bytes = bit_util::ceil(left.len(), 8);
287328
let mut bool_buf = MutableBuffer::from_len_zeroed(bytes);
288329
let bool_slice = bool_buf.as_slice_mut();
289330

290-
if !right.contains(is_like_pattern) {
331+
if case_sensitive && !right.contains(is_like_pattern) {
291332
// fast path, can use equals
292333
for i in 0..left.len() {
293-
if left.value(i) == right {
334+
let mut r = left.value(i) == right;
335+
if reverse_results {
336+
r = !r;
337+
}
338+
if r {
294339
bit_util::set_bit(bool_slice, i);
295340
}
296341
}
297-
} else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
342+
} else if case_sensitive
343+
&& right.ends_with('%')
344+
&& !right[..right.len() - 1].contains(is_like_pattern)
298345
{
299346
// fast path, can use starts_with
300347
let starts_with = &right[..right.len() - 1];
301348
for i in 0..left.len() {
302-
if left.value(i).starts_with(starts_with) {
349+
let mut r = left.value(i).starts_with(starts_with);
350+
if reverse_results {
351+
r = !r;
352+
}
353+
if r {
303354
bit_util::set_bit(bool_slice, i);
304355
}
305356
}
306-
} else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
357+
} else if case_sensitive
358+
&& right.starts_with('%')
359+
&& !right[1..].contains(is_like_pattern)
360+
{
307361
// fast path, can use ends_with
308362
let ends_with = &right[1..];
309363
for i in 0..left.len() {
310-
if left.value(i).ends_with(ends_with) {
364+
let mut r = left.value(i).ends_with(ends_with);
365+
if reverse_results {
366+
r = !r;
367+
}
368+
if r {
311369
bit_util::set_bit(bool_slice, i);
312370
}
313371
}
314372
} else {
315373
let re_pattern = right.replace("%", ".*").replace("_", ".");
316-
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
317-
ArrowError::ComputeError(format!(
318-
"Unable to build regex from LIKE pattern: {}",
319-
e
320-
))
321-
})?;
374+
let re = RegexBuilder::new(&format!("^{}$", re_pattern))
375+
.case_insensitive(!case_sensitive)
376+
.build()
377+
.map_err(|e| {
378+
ArrowError::ComputeError(format!(
379+
"Unable to build regex from LIKE pattern: {}",
380+
e
381+
))
382+
})?;
322383

323384
for i in 0..left.len() {
324385
let haystack = left.value(i);
325-
if re.is_match(haystack) {
386+
let mut r = re.is_match(haystack);
387+
if reverse_results {
388+
r = !r;
389+
}
390+
if r {
326391
bit_util::set_bit(bool_slice, i);
327392
}
328393
}
@@ -348,48 +413,15 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>(
348413
left: &GenericStringArray<OffsetSize>,
349414
right: &GenericStringArray<OffsetSize>,
350415
) -> Result<BooleanArray> {
351-
let mut map = HashMap::new();
352-
if left.len() != right.len() {
353-
return Err(ArrowError::ComputeError(
354-
"Cannot perform comparison operation on arrays of different length"
355-
.to_string(),
356-
));
357-
}
358-
359-
let null_bit_buffer =
360-
combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
361-
362-
let mut result = BooleanBufferBuilder::new(left.len());
363-
for i in 0..left.len() {
364-
let haystack = left.value(i);
365-
let pat = right.value(i);
366-
let re = if let Some(ref regex) = map.get(pat) {
367-
regex
368-
} else {
369-
let re_pattern = pat.replace("%", ".*").replace("_", ".");
370-
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
371-
ArrowError::ComputeError(format!(
372-
"Unable to build regex from LIKE pattern: {}",
373-
e
374-
))
375-
})?;
376-
map.insert(pat, re);
377-
map.get(pat).unwrap()
378-
};
379-
380-
result.append(!re.is_match(haystack));
381-
}
416+
like_utf8_impl(left, right, true, true)
417+
}
382418

383-
let data = ArrayData::new(
384-
DataType::Boolean,
385-
left.len(),
386-
None,
387-
null_bit_buffer,
388-
0,
389-
vec![result.finish()],
390-
vec![],
391-
);
392-
Ok(BooleanArray::from(data))
419+
/// Case-insensitive `NOT ILIKE` operator.
420+
pub fn nilike_utf8<OffsetSize: StringOffsetSizeTrait>(
421+
left: &GenericStringArray<OffsetSize>,
422+
right: &GenericStringArray<OffsetSize>,
423+
) -> Result<BooleanArray> {
424+
like_utf8_impl(left, right, false, true)
393425
}
394426

395427
/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
@@ -400,49 +432,15 @@ pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
400432
left: &GenericStringArray<OffsetSize>,
401433
right: &str,
402434
) -> Result<BooleanArray> {
403-
let null_bit_buffer = left.data().null_buffer().cloned();
404-
let mut result = BooleanBufferBuilder::new(left.len());
405-
406-
if !right.contains(is_like_pattern) {
407-
// fast path, can use equals
408-
for i in 0..left.len() {
409-
result.append(left.value(i) != right);
410-
}
411-
} else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
412-
{
413-
// fast path, can use ends_with
414-
for i in 0..left.len() {
415-
result.append(!left.value(i).starts_with(&right[..right.len() - 1]));
416-
}
417-
} else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
418-
// fast path, can use starts_with
419-
for i in 0..left.len() {
420-
result.append(!left.value(i).ends_with(&right[1..]));
421-
}
422-
} else {
423-
let re_pattern = right.replace("%", ".*").replace("_", ".");
424-
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
425-
ArrowError::ComputeError(format!(
426-
"Unable to build regex from LIKE pattern: {}",
427-
e
428-
))
429-
})?;
430-
for i in 0..left.len() {
431-
let haystack = left.value(i);
432-
result.append(!re.is_match(haystack));
433-
}
434-
}
435+
like_utf8_scalar_impl(left, right, true, true)
436+
}
435437

436-
let data = ArrayData::new(
437-
DataType::Boolean,
438-
left.len(),
439-
None,
440-
null_bit_buffer,
441-
0,
442-
vec![result.finish()],
443-
vec![],
444-
);
445-
Ok(BooleanArray::from(data))
438+
/// Case-insensitive `NOT ILIKE` operator.
439+
pub fn nilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
440+
left: &GenericStringArray<OffsetSize>,
441+
right: &str,
442+
) -> Result<BooleanArray> {
443+
like_utf8_scalar_impl(left, right, false, true)
446444
}
447445

448446
pub fn eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray> {

0 commit comments

Comments
 (0)