|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +#include "gandiva/rlike_holder.h" |
| 19 | + |
| 20 | +#include <regex> |
| 21 | +#include "gandiva/node.h" |
| 22 | +#include "gandiva/regex_util.h" |
| 23 | + |
| 24 | +namespace gandiva { |
| 25 | + |
| 26 | +RE2 RLikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)"); |
| 27 | +RE2 RLikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)"); |
| 28 | +RE2 RLikeHolder::is_substr_regex_(R"(\.\*(\w|\s)*\.\*)"); |
| 29 | + |
| 30 | +// Short-circuit pattern matches for the following common sub cases : |
| 31 | +// - starts_with, ends_with and is_substr |
| 32 | +const FunctionNode RLikeHolder::TryOptimize(const FunctionNode& node) { |
| 33 | + std::shared_ptr<RLikeHolder> holder; |
| 34 | + auto status = Make(node, &holder); |
| 35 | + if (status.ok()) { |
| 36 | + std::string& pattern = holder->pattern_; |
| 37 | + auto literal_type = node.children().at(1)->return_type(); |
| 38 | + |
| 39 | + if (RE2::FullMatch(pattern, starts_with_regex_)) { |
| 40 | + auto prefix = pattern.substr(0, pattern.length() - 2); // trim .* |
| 41 | + auto prefix_node = |
| 42 | + std::make_shared<LiteralNode>(literal_type, LiteralHolder(prefix), false); |
| 43 | + return FunctionNode("starts_with", {node.children().at(0), prefix_node}, |
| 44 | + node.return_type()); |
| 45 | + } else if (RE2::FullMatch(pattern, ends_with_regex_)) { |
| 46 | + auto suffix = pattern.substr(2); // skip .* |
| 47 | + auto suffix_node = |
| 48 | + std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), false); |
| 49 | + return FunctionNode("ends_with", {node.children().at(0), suffix_node}, |
| 50 | + node.return_type()); |
| 51 | + } else if (RE2::FullMatch(pattern, is_substr_regex_)) { |
| 52 | + auto substr = |
| 53 | + pattern.substr(2, pattern.length() - 4); // trim starting and ending .* |
| 54 | + auto substr_node = |
| 55 | + std::make_shared<LiteralNode>(literal_type, LiteralHolder(substr), false); |
| 56 | + return FunctionNode("is_substr", {node.children().at(0), substr_node}, |
| 57 | + node.return_type()); |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | + // Could not optimize, return original node. |
| 62 | + return node; |
| 63 | +} |
| 64 | + |
| 65 | +static bool IsArrowStringLiteral(arrow::Type::type type) { |
| 66 | + return type == arrow::Type::STRING || type == arrow::Type::BINARY; |
| 67 | +} |
| 68 | + |
| 69 | +Status RLikeHolder::Make(const FunctionNode& node, std::shared_ptr<RLikeHolder>* holder) { |
| 70 | + ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3, |
| 71 | + Status::Invalid("'like' function requires two or three parameters")); |
| 72 | + |
| 73 | + auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get()); |
| 74 | + ARROW_RETURN_IF( |
| 75 | + literal == nullptr, |
| 76 | + Status::Invalid("'like' function requires a literal as the second parameter")); |
| 77 | + |
| 78 | + auto literal_type = literal->return_type()->id(); |
| 79 | + ARROW_RETURN_IF( |
| 80 | + !IsArrowStringLiteral(literal_type), |
| 81 | + Status::Invalid( |
| 82 | + "'like' function requires a string literal as the second parameter")); |
| 83 | + |
| 84 | + RE2::Options regex_op; |
| 85 | + if (node.descriptor()->name() == "ilike") { |
| 86 | + regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. |
| 87 | + |
| 88 | + return Make(arrow::util::get<std::string>(literal->holder()), holder, regex_op); |
| 89 | + } |
| 90 | + if (node.children().size() == 2) { |
| 91 | + return Make(arrow::util::get<std::string>(literal->holder()), holder); |
| 92 | + } else { |
| 93 | + auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get()); |
| 94 | + ARROW_RETURN_IF( |
| 95 | + escape_char == nullptr, |
| 96 | + Status::Invalid("'like' function requires a literal as the third parameter")); |
| 97 | + |
| 98 | + auto escape_char_type = escape_char->return_type()->id(); |
| 99 | + ARROW_RETURN_IF( |
| 100 | + !IsArrowStringLiteral(escape_char_type), |
| 101 | + Status::Invalid( |
| 102 | + "'like' function requires a string literal as the third parameter")); |
| 103 | + return Make(arrow::util::get<std::string>(literal->holder()), |
| 104 | + arrow::util::get<std::string>(escape_char->holder()), holder); |
| 105 | + } |
| 106 | +} |
| 107 | + |
| 108 | +Status RLikeHolder::Make(const std::string& sql_pattern, |
| 109 | + std::shared_ptr<RLikeHolder>* holder) { |
| 110 | + |
| 111 | + auto lholder = std::shared_ptr<RLikeHolder>(new RLikeHolder(sql_pattern)); |
| 112 | + ARROW_RETURN_IF(!lholder->regex_.ok(), |
| 113 | + Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed")); |
| 114 | + |
| 115 | + *holder = lholder; |
| 116 | + return Status::OK(); |
| 117 | +} |
| 118 | + |
| 119 | +Status RLikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, |
| 120 | + std::shared_ptr<RLikeHolder>* holder) { |
| 121 | + ARROW_RETURN_IF(escape_char.length() > 1, |
| 122 | + Status::Invalid("The length of escape char ", escape_char, |
| 123 | + " in 'like' function is greater than 1")); |
| 124 | + std::string pcre_pattern; |
| 125 | + if (escape_char.length() == 1) { |
| 126 | + ARROW_RETURN_NOT_OK( |
| 127 | + RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern)); |
| 128 | + } else { |
| 129 | + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); |
| 130 | + } |
| 131 | + |
| 132 | + auto lholder = std::shared_ptr<RLikeHolder>(new RLikeHolder(pcre_pattern)); |
| 133 | + ARROW_RETURN_IF(!lholder->regex_.ok(), |
| 134 | + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); |
| 135 | + |
| 136 | + *holder = lholder; |
| 137 | + return Status::OK(); |
| 138 | +} |
| 139 | + |
| 140 | +Status RLikeHolder::Make(const std::string& sql_pattern, |
| 141 | + std::shared_ptr<RLikeHolder>* holder, RE2::Options regex_op) { |
| 142 | + |
| 143 | + std::shared_ptr<RLikeHolder> lholder; |
| 144 | + lholder = std::shared_ptr<RLikeHolder>(new RLikeHolder(sql_pattern, regex_op)); |
| 145 | + |
| 146 | + ARROW_RETURN_IF(!lholder->regex_.ok(), |
| 147 | + Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed")); |
| 148 | + |
| 149 | + *holder = lholder; |
| 150 | + return Status::OK(); |
| 151 | +} |
| 152 | +} // namespace gandiva |
0 commit comments