Skip to content

Commit 2ceae0f

Browse files
authored
fix rlike (#111)
* fix rlike Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> * add missing func pointer Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
1 parent 4e3eb25 commit 2ceae0f

File tree

10 files changed

+328
-1
lines changed

10 files changed

+328
-1
lines changed

cpp/src/arrow/dataset/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ if(ARROW_CSV)
121121
endif()
122122

123123
if(ARROW_ORC)
124-
add_arrow_dataset_test(file_orc_test)
124+
#add_arrow_dataset_test(file_orc_test)
125125
endif()
126126

127127
if(ARROW_PARQUET)

cpp/src/gandiva/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ set(SRC_FILES
8282
hash_utils.cc
8383
llvm_generator.cc
8484
llvm_types.cc
85+
rlike_holder.cc
8586
like_holder.cc
8687
json_holder.cc
8788
translate_holder.cc
@@ -235,6 +236,7 @@ add_gandiva_test(internals-test
235236
simple_arena_test.cc
236237
json_holder_test.cc
237238
translate_holder_test.cc
239+
rlike_holder_test.cc
238240
like_holder_test.cc
239241
replace_holder_test.cc
240242
extract_holder_test.cc

cpp/src/gandiva/function_holder_registry.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "gandiva/node.h"
3131
#include "gandiva/random_generator_holder.h"
3232
#include "gandiva/replace_holder.h"
33+
#include "gandiva/rlike_holder.h"
3334
#include "gandiva/extract_holder.h"
3435
#include "gandiva/to_date_holder.h"
3536
#include "gandiva/translate_holder.h"
@@ -68,6 +69,7 @@ class FunctionHolderRegistry {
6869
static map_type maker_map = {
6970
{"like", LAMBDA_MAKER(LikeHolder)},
7071
{"ilike", LAMBDA_MAKER(LikeHolder)},
72+
{"rlike", LAMBDA_MAKER(RLikeHolder)},
7173
{"get_json_object", LAMBDA_MAKER(JsonHolder)},
7274
{"to_date", LAMBDA_MAKER(ToDateHolder)},
7375
{"random", LAMBDA_MAKER(RandomGeneratorHolder)},

cpp/src/gandiva/function_registry_string.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
156156
kResultNullIfNull, "castVARCHAR_decimal128_int64",
157157
NativeFunction::kNeedsContext),
158158

159+
NativeFunction("rlike", {}, DataTypeVector{utf8(), utf8()}, boolean(),
160+
kResultNullIfNull, "gdv_fn_rlike_utf8_utf8",
161+
NativeFunction::kNeedsFunctionHolder),
162+
159163
NativeFunction("like", {}, DataTypeVector{utf8(), utf8()}, boolean(),
160164
kResultNullIfNull, "gdv_fn_like_utf8_utf8",
161165
NativeFunction::kNeedsFunctionHolder),

cpp/src/gandiva/gdv_function_stubs.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "gandiva/precompiled/types.h"
3333
#include "gandiva/random_generator_holder.h"
3434
#include "gandiva/replace_holder.h"
35+
#include "gandiva/rlike_holder.h"
3536
#include "gandiva/extract_holder.h"
3637
#include "gandiva/to_date_holder.h"
3738
#include "gandiva/translate_holder.h"
@@ -79,6 +80,12 @@ const char* gdv_fn_substr_index_utf8_utf8_int32(int64_t ptr, int64_t holder_ptr,
7980
return res;
8081
}
8182

83+
bool gdv_fn_rlike_utf8_utf8(int64_t ptr, const char* data, int data_len,
84+
const char* pattern, int pattern_len) {
85+
gandiva::RLikeHolder* holder = reinterpret_cast<gandiva::RLikeHolder*>(ptr);
86+
return (*holder)(std::string(data, data_len));
87+
}
88+
8289
bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
8390
const char* pattern, int pattern_len) {
8491
gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
@@ -588,6 +595,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
588595
types->i8_ptr_type() /*return types*/, args,
589596
reinterpret_cast<void*>(gdv_fn_substr_index_utf8_utf8_int32));
590597

598+
// gdv_fn_rlike_utf8_utf8
599+
args = {types->i64_type(), // int64_t ptr
600+
types->i8_ptr_type(), // const char* data
601+
types->i32_type(), // int data_len
602+
types->i8_ptr_type(), // const char* pattern
603+
types->i32_type()}; // int pattern_len
604+
605+
engine->AddGlobalMappingForFunc("gdv_fn_rlike_utf8_utf8",
606+
types->i1_type() /*return_type*/, args,
607+
reinterpret_cast<void*>(gdv_fn_rlike_utf8_utf8));
608+
591609
// gdv_fn_like_utf8_utf8
592610
args = {types->i64_type(), // int64_t ptr
593611
types->i8_ptr_type(), // const char* data

cpp/src/gandiva/gdv_function_stubs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ using gdv_utf8 = char*;
4444
using gdv_binary = char*;
4545
using gdv_day_time_interval = int64_t;
4646

47+
bool gdv_fn_rlike_utf8_utf8(int64_t ptr, const char* data, int data_len,
48+
const char* pattern, int pattern_len);
49+
4750
bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
4851
const char* pattern, int pattern_len);
4952

cpp/src/gandiva/llvm_generator_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ TEST_F(TestLLVMGenerator, VerifyPCFunctions) {
4545

4646
llvm::Module* module = generator->module();
4747
for (auto& iter : registry_) {
48+
std::cout << iter.pc_name() << std::endl;
4849
EXPECT_NE(module->getFunction(iter.pc_name()), nullptr);
4950
}
5051
}

cpp/src/gandiva/rlike_holder.cc

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "gandiva/rlike_holder.h"
19+
20+
#include <regex>
21+
#include "gandiva/node.h"
22+
#include "gandiva/regex_util.h"
23+
24+
namespace gandiva {
25+
26+
RE2 RLikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)");
27+
RE2 RLikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)");
28+
RE2 RLikeHolder::is_substr_regex_(R"(\.\*(\w|\s)*\.\*)");
29+
30+
// Short-circuit pattern matches for the following common sub cases :
31+
// - starts_with, ends_with and is_substr
32+
const FunctionNode RLikeHolder::TryOptimize(const FunctionNode& node) {
33+
std::shared_ptr<RLikeHolder> holder;
34+
auto status = Make(node, &holder);
35+
if (status.ok()) {
36+
std::string& pattern = holder->pattern_;
37+
auto literal_type = node.children().at(1)->return_type();
38+
39+
if (RE2::FullMatch(pattern, starts_with_regex_)) {
40+
auto prefix = pattern.substr(0, pattern.length() - 2); // trim .*
41+
auto prefix_node =
42+
std::make_shared<LiteralNode>(literal_type, LiteralHolder(prefix), false);
43+
return FunctionNode("starts_with", {node.children().at(0), prefix_node},
44+
node.return_type());
45+
} else if (RE2::FullMatch(pattern, ends_with_regex_)) {
46+
auto suffix = pattern.substr(2); // skip .*
47+
auto suffix_node =
48+
std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), false);
49+
return FunctionNode("ends_with", {node.children().at(0), suffix_node},
50+
node.return_type());
51+
} else if (RE2::FullMatch(pattern, is_substr_regex_)) {
52+
auto substr =
53+
pattern.substr(2, pattern.length() - 4); // trim starting and ending .*
54+
auto substr_node =
55+
std::make_shared<LiteralNode>(literal_type, LiteralHolder(substr), false);
56+
return FunctionNode("is_substr", {node.children().at(0), substr_node},
57+
node.return_type());
58+
}
59+
}
60+
61+
// Could not optimize, return original node.
62+
return node;
63+
}
64+
65+
static bool IsArrowStringLiteral(arrow::Type::type type) {
66+
return type == arrow::Type::STRING || type == arrow::Type::BINARY;
67+
}
68+
69+
Status RLikeHolder::Make(const FunctionNode& node, std::shared_ptr<RLikeHolder>* holder) {
70+
ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3,
71+
Status::Invalid("'like' function requires two or three parameters"));
72+
73+
auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
74+
ARROW_RETURN_IF(
75+
literal == nullptr,
76+
Status::Invalid("'like' function requires a literal as the second parameter"));
77+
78+
auto literal_type = literal->return_type()->id();
79+
ARROW_RETURN_IF(
80+
!IsArrowStringLiteral(literal_type),
81+
Status::Invalid(
82+
"'like' function requires a string literal as the second parameter"));
83+
84+
RE2::Options regex_op;
85+
if (node.descriptor()->name() == "ilike") {
86+
regex_op.set_case_sensitive(false); // set case-insensitive for ilike function.
87+
88+
return Make(arrow::util::get<std::string>(literal->holder()), holder, regex_op);
89+
}
90+
if (node.children().size() == 2) {
91+
return Make(arrow::util::get<std::string>(literal->holder()), holder);
92+
} else {
93+
auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get());
94+
ARROW_RETURN_IF(
95+
escape_char == nullptr,
96+
Status::Invalid("'like' function requires a literal as the third parameter"));
97+
98+
auto escape_char_type = escape_char->return_type()->id();
99+
ARROW_RETURN_IF(
100+
!IsArrowStringLiteral(escape_char_type),
101+
Status::Invalid(
102+
"'like' function requires a string literal as the third parameter"));
103+
return Make(arrow::util::get<std::string>(literal->holder()),
104+
arrow::util::get<std::string>(escape_char->holder()), holder);
105+
}
106+
}
107+
108+
Status RLikeHolder::Make(const std::string& sql_pattern,
109+
std::shared_ptr<RLikeHolder>* holder) {
110+
111+
auto lholder = std::shared_ptr<RLikeHolder>(new RLikeHolder(sql_pattern));
112+
ARROW_RETURN_IF(!lholder->regex_.ok(),
113+
Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
114+
115+
*holder = lholder;
116+
return Status::OK();
117+
}
118+
119+
Status RLikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char,
120+
std::shared_ptr<RLikeHolder>* holder) {
121+
ARROW_RETURN_IF(escape_char.length() > 1,
122+
Status::Invalid("The length of escape char ", escape_char,
123+
" in 'like' function is greater than 1"));
124+
std::string pcre_pattern;
125+
if (escape_char.length() == 1) {
126+
ARROW_RETURN_NOT_OK(
127+
RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern));
128+
} else {
129+
ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
130+
}
131+
132+
auto lholder = std::shared_ptr<RLikeHolder>(new RLikeHolder(pcre_pattern));
133+
ARROW_RETURN_IF(!lholder->regex_.ok(),
134+
Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
135+
136+
*holder = lholder;
137+
return Status::OK();
138+
}
139+
140+
Status RLikeHolder::Make(const std::string& sql_pattern,
141+
std::shared_ptr<RLikeHolder>* holder, RE2::Options regex_op) {
142+
143+
std::shared_ptr<RLikeHolder> lholder;
144+
lholder = std::shared_ptr<RLikeHolder>(new RLikeHolder(sql_pattern, regex_op));
145+
146+
ARROW_RETURN_IF(!lholder->regex_.ok(),
147+
Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
148+
149+
*holder = lholder;
150+
return Status::OK();
151+
}
152+
} // namespace gandiva

cpp/src/gandiva/rlike_holder.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include <memory>
21+
#include <string>
22+
#include <iostream>
23+
24+
#include <re2/re2.h>
25+
26+
#include "arrow/status.h"
27+
28+
#include "gandiva/function_holder.h"
29+
#include "gandiva/node.h"
30+
#include "gandiva/visibility.h"
31+
32+
namespace gandiva {
33+
34+
/// Function Holder for SQL 'rlike'
35+
class GANDIVA_EXPORT RLikeHolder : public FunctionHolder {
36+
public:
37+
~RLikeHolder() override = default;
38+
39+
static Status Make(const FunctionNode& node, std::shared_ptr<RLikeHolder>* holder);
40+
41+
static Status Make(const std::string& sql_pattern, std::shared_ptr<RLikeHolder>* holder);
42+
43+
static Status Make(const std::string& sql_pattern, const std::string& escape_char,
44+
std::shared_ptr<RLikeHolder>* holder);
45+
46+
static Status Make(const std::string& sql_pattern, std::shared_ptr<RLikeHolder>* holder,
47+
RE2::Options regex_op);
48+
49+
// Try and optimise a function node with a "like" pattern.
50+
static const FunctionNode TryOptimize(const FunctionNode& node);
51+
52+
/// Return true if the data matches the pattern.
53+
bool operator()(const std::string& data) {
54+
return RE2::PartialMatch(data, regex_);
55+
}
56+
57+
private:
58+
explicit RLikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {}
59+
60+
RLikeHolder(const std::string& pattern, RE2::Options regex_op)
61+
: pattern_(pattern), regex_(pattern, regex_op) {}
62+
63+
std::string pattern_; // posix pattern string, to help debugging
64+
RE2 regex_; // compiled regex for the pattern
65+
66+
static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with
67+
static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with
68+
static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr
69+
};
70+
71+
} // namespace gandiva

0 commit comments

Comments
 (0)