Skip to content

Commit 11e41b2

Browse files
authored
chore(query): Implement Oracle-compatible INSTR Function in Databend (#18084)
This PR implements Oracle-compatible `INSTR` string search functionality in Databend, supporting both three-parameter and four-parameter variants to enhance SQL compatibility with Oracle databases.

```sql
INSTR(string, substring [, pos] [, occurrence])
```

**Three-parameter version:** finds the position of a substring within a string, starting from a specified position.

**Parameters:**
- `string`: The input string to search within (can be NULL)
- `substring`: The substring to locate (can be NULL)
- `pos`: The starting position for the search (1-based index)
  - Positive number: Search forward from left to right
  - Negative number: Search backward from right to left
  - Zero: Treated as 1 (start from beginning)

**Returns:**
- The 1-based position where the substring is found
- Returns 0 if:
  - Either string or substring is NULL
  - Substring is not found
  - `pos` is 0 (after being converted to 1) and substring isn't at start

**Four-parameter version:** finds the nth occurrence of a substring within a string, starting from a specified position.

**Additional Parameter:**
- `occurrence`: Which occurrence of the substring to find (must be a positive integer)
  - 1 = first occurrence (default behavior)
  - 2 = second occurrence, etc.

**Returns:**
- Same return rules as the three-parameter version
- For occurrence > 1, continues searching after previous matches
- If occurrence exceeds the actual number of matches, returns 0
1 parent 912f6d5 commit 11e41b2

File tree

3 files changed

+153
-0
lines changed

3 files changed

+153
-0
lines changed

src/query/functions/src/scalars/string.rs

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,105 @@ pub fn register(registry: &mut FunctionRegistry) {
324324
}
325325
};
326326

327+
/// Oracle-style `INSTR`: returns the 1-based *character* position of the
/// `occurrence`-th match of `sub` inside `s`, or 0 when no such match exists.
///
/// All positions are counted in Unicode scalar values (`char`s), not bytes.
///
/// # Parameters
/// * `s` - the string to search in.
/// * `sub` - the substring to look for.
/// * `position` - where the search starts:
///   * `> 0`: scan forward, starting at that 1-based character position;
///   * `< 0`: scan backward; the first candidate start is the `|position|`-th
///     character counted from the end of `s`, capped so that `sub` still fits;
///   * `0`: treated as 1, i.e. a forward scan from the beginning.
/// * `occurrence` - which match to report (1 = first match met by the scan).
///   `0` never matches and yields 0.
///
/// # Returns
/// The 1-based character position of the requested match, or 0 if `sub` is
/// longer than `s`, `occurrence` is 0, or fewer than `occurrence` matches are
/// met by the scan.
///
/// Candidate positions advance one character at a time, so matches may
/// overlap (`instr("aaa", "aa", 1, 2)` is 2).
///
/// An empty `sub` matches at the requested position for a forward search as
/// long as it lies in `1..=len + 1` (otherwise 0), and at `len + 1` for a
/// backward search; `occurrence` is not consulted in that case.
fn instr(s: &str, sub: &str, position: i64, occurrence: u64) -> u64 {
    if occurrence == 0 {
        return 0;
    }

    // Zero is documented as "treated as 1". Without this normalisation it
    // would fall into the backward-search branch and behave like -1.
    let position = if position == 0 { 1 } else { position };

    let s_chars = s.chars().count();
    let sub_chars = sub.chars().count();
    if sub_chars > s_chars {
        return 0;
    }

    if sub.is_empty() {
        return if position > 0 {
            // Forward search: the empty string matches at the requested position,
            // e.g. INSTR('abc', '', 1) -> 1 and INSTR('abc', '', 4) -> 4.
            // The valid range is 1 to s_chars + 1.
            match usize::try_from(position) {
                Ok(p) if p <= s_chars + 1 => position as u64,
                _ => 0,
            }
        } else {
            // Backward search: the empty string matches just past the end,
            // e.g. INSTR('abc', '', -1) -> 4 (s_chars + 1).
            (s_chars + 1) as u64
        };
    }

    // From here on `sub` is non-empty and fits into `s`, so `s` is non-empty too.

    // Byte offset of every character, so a character index maps to a valid
    // UTF-8 boundary in O(1).
    let offsets: Vec<usize> = s.char_indices().map(|(byte_idx, _)| byte_idx).collect();
    // Comparing a `sub_chars`-long window with `sub` is the same as asking
    // whether the tail starting at that character begins with `sub`.
    let matches_at = |char_idx: usize| s[offsets[char_idx]..].starts_with(sub);
    // Largest 0-based character index at which `sub` can still fit.
    let last_start = s_chars - sub_chars;

    // Number of matches to skip before the one we want. A value that does not
    // fit in `usize` can never be reached.
    let skip = match usize::try_from(occurrence - 1) {
        Ok(n) => n,
        Err(_) => return 0,
    };

    let hit = if position > 0 {
        let first = usize::try_from(position - 1).unwrap_or(usize::MAX);
        // An out-of-range `first` yields an empty range, hence no match.
        (first..=last_start).filter(|&idx| matches_at(idx)).nth(skip)
    } else {
        let from_end = usize::try_from(position.unsigned_abs()).unwrap_or(usize::MAX);
        let first = s_chars.saturating_sub(from_end).min(last_start);
        (0..=first).rev().filter(|&idx| matches_at(idx)).nth(skip)
    };

    hit.map_or(0, |idx| (idx + 1) as u64)
}
407+
327408
// Two-argument overload `instr(String, String) :: UInt64`.
// Delegates to `find_at`, always passing 1 as its third argument.
registry.register_2_arg::<StringType, StringType, NumberType<u64>, _, _>(
    "instr",
    |_, _, _| FunctionDomain::Full,
    move |haystack: &str, needle: &str, _| find_at(haystack, needle, 1),
);
332413

414+
// Three-argument overload `instr(String, String, Int64) :: UInt64`.
// Forwards to the local `instr` helper with the occurrence fixed at 1.
registry.register_3_arg::<StringType, StringType, NumberType<i64>, NumberType<u64>, _, _>(
    "instr",
    |_, _, _, _| FunctionDomain::Full,
    move |haystack: &str, needle: &str, start, _| instr(haystack, needle, start, 1),
);
419+
420+
// Four-argument overload `instr(String, String, Int64, UInt64) :: UInt64`.
// Forwards the start position and the requested occurrence to the local
// `instr` helper unchanged.
registry.register_4_arg::<
    StringType,
    StringType,
    NumberType<i64>,
    NumberType<u64>,
    NumberType<u64>,
    _,
    _,
>(
    "instr",
    |_, _, _, _, _| FunctionDomain::Full,
    move |haystack: &str, needle: &str, start, nth, _| instr(haystack, needle, start, nth),
);
425+
333426
registry.register_2_arg::<StringType, StringType, NumberType<u64>, _, _>(
334427
"position",
335428
|_, _, _| FunctionDomain::Full,

src/query/functions/tests/it/scalars/testdata/function_list.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2144,6 +2144,10 @@ Functions overloads:
21442144
1 insert(String NULL, Int64 NULL, Int64 NULL, String NULL) :: String NULL
21452145
0 instr(String, String) :: UInt64
21462146
1 instr(String NULL, String NULL) :: UInt64 NULL
2147+
2 instr(String, String, Int64) :: UInt64
2148+
3 instr(String NULL, String NULL, Int64 NULL) :: UInt64 NULL
2149+
4 instr(String, String, Int64, UInt64) :: UInt64
2150+
5 instr(String NULL, String NULL, Int64 NULL, UInt64 NULL) :: UInt64 NULL
21472151
0 is_array(Variant) :: Boolean
21482152
1 is_array(Variant NULL) :: Boolean NULL
21492153
0 is_binary(Variant) :: Boolean

tests/sqllogictests/suites/query/functions/02_0025_function_strings_locate.test

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,59 @@ query I
6767
SELECT INSTR('', '')
6868
----
6969
1
70+
71+
query T
72+
SELECT INSTR('你你好你', '你', 1);
73+
----
74+
1
75+
76+
query T
77+
SELECT INSTR('你你好你', '你', 1, 1);
78+
----
79+
1
80+
81+
query T
82+
SELECT INSTR('你你好你', '你', 1, 2);
83+
----
84+
2
85+
86+
query T
87+
SELECT INSTR('你你好你', '你', -1, 1);
88+
----
89+
4
90+
91+
query T
92+
SELECT INSTR('你你好你', '你', -1, 2);
93+
----
94+
2
95+
96+
query T
97+
SELECT INSTR('CORPORATE FLOOR','OR', 3, 2);
98+
----
99+
14
100+
101+
query T
102+
SELECT INSTR('CORPORATE FLOOR','OR', -3, 2)
103+
----
104+
2
105+
106+
statement ok
107+
create or replace table t(c1 string);
108+
109+
statement ok
110+
insert into t values('CORPORATE FLOOR'),('你你好你');
111+
112+
query T
113+
select instr(c1,'OR', -3, 2) from t;
114+
----
115+
2
116+
0
117+
118+
query T
119+
select instr(c1,'你', -1) from t;
120+
----
121+
0
122+
4
123+
124+
statement ok
125+
drop table if exists t;

0 commit comments

Comments
 (0)