Skip to content

Commit 46dfc2a

Browse files
committed
[fix][store] Fixup document regex search
1 parent 9cfe0bc commit 46dfc2a

File tree

3 files changed

+77
-4
lines changed

3 files changed

+77
-4
lines changed

src/common/helper.cc

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1911,8 +1911,8 @@ void Helper::PrintHtmlTable(std::ostream& os, bool use_html, const std::vector<s
19111911
if (line[i].size() <= 64) {
19121912
os << brpc::min_width(line[i], min_widths[i]);
19131913
} else {
1914-
os << "<div class=\"part\">" << line[i].substr(0, 64) << "..." << "<span class=\"full\">" << line[i]
1915-
<< "</span></div>";
1914+
os << "<div class=\"part\">" << line[i].substr(0, 64) << "..."
1915+
<< "<span class=\"full\">" << line[i] << "</span></div>";
19161916
}
19171917
}
19181918
} else {
@@ -2396,4 +2396,72 @@ void Helper::HandleBoolControlConfigVariable(const pb::common::ControlConfigVari
23962396
config.set_is_error_occurred(false);
23972397
}
23982398

2399+
// Reports whether `input` is syntactically well-formed standard base64:
//   * its length is a multiple of 4 (the empty string is accepted),
//   * it ends with at most two '=' padding characters,
//   * every character before the padding is one of [A-Za-z0-9+/].
//
// This is a purely syntactic check. A plain string that happens to satisfy the
// rules above (for example "test" or "abcd1234") is also reported as encoded,
// so callers cannot use this function to prove that a value was produced by a
// base64 encoder.
bool Helper::IsBase64Encoded(const std::string& input) {
  if (input.length() % 4 != 0) {
    return false;
  }

  // Strip the trailing padding first so that '=' is only ever accepted at the
  // end of the string and never in the middle of the payload.
  size_t data_length = input.length();
  size_t padding_count = 0;
  while (data_length > 0 && input[data_length - 1] == '=') {
    --data_length;
    ++padding_count;
  }
  if (padding_count > 2) {
    return false;
  }

  // Explicit range checks instead of isalnum(): passing a negative `char`
  // (any non-ASCII byte, e.g. UTF-8 text inside a regex) to isalnum() is
  // undefined behaviour, and its result depends on the current locale.
  for (size_t i = 0; i < data_length; ++i) {
    const char c = input[i];
    const bool is_base64_char = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
                                c == '+' || c == '/';
    if (!is_base64_char) {
      return false;
    }
  }

  return true;
}
2420+
2421+
// Encodes the bytes of `input` as standard base64 (alphabet A-Z, a-z, 0-9,
// '+', '/'), padding the result with '=' to a multiple of four characters.
// An empty input yields an empty string.
std::string Helper::Base64Encode(const std::string& input) {
  static const char base64_chars[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
      "0123456789+/";

  std::string encoded;
  // Every 3 input bytes become 4 output characters (rounded up for padding).
  encoded.reserve(((input.size() + 2) / 3) * 4);

  // The accumulator is unsigned on purpose: it is shifted left on every byte
  // and never masked, so a signed int would overflow (undefined behaviour
  // before C++20) once the input is four bytes or longer. Only the low 12 bits
  // are ever read, so unsigned wrap-around of the high bits is harmless.
  unsigned int bit_buffer = 0;
  // Position of the next 6-bit group to emit, relative to bit 0 of bit_buffer;
  // negative means fewer than 6 unread bits are buffered.
  int pending_bits = -6;

  for (unsigned char c : input) {
    bit_buffer = (bit_buffer << 8) | c;
    pending_bits += 8;
    while (pending_bits >= 0) {
      encoded.push_back(base64_chars[(bit_buffer >> pending_bits) & 0x3F]);
      pending_bits -= 6;
    }
  }

  // Flush the remaining 2 or 4 bits, zero-extended on the right to a full
  // 6-bit group.
  if (pending_bits > -6) {
    encoded.push_back(base64_chars[((bit_buffer << 8) >> (pending_bits + 8)) & 0x3F]);
  }

  while (encoded.size() % 4 != 0) {
    encoded.push_back('=');
  }
  return encoded;
}
2444+
2445+
std::string Helper::EncodeREContent(const std::string& input) {
2446+
std::regex re_pattern(R"(RE\s*\[((?:[^\[\]]|\[.*?\])*)\])");
2447+
std::smatch matches;
2448+
std::string result = input;
2449+
std::string::const_iterator search_start(input.cbegin());
2450+
2451+
while (std::regex_search(search_start, input.cend(), matches, re_pattern)) {
2452+
std::string matched_text = matches[0];
2453+
std::string content = matches[1];
2454+
2455+
if (!IsBase64Encoded(content)) {
2456+
std::string encoded_content = Base64Encode(content);
2457+
std::string replacement = "RE [" + encoded_content + "]";
2458+
result.replace(matches.position(0), matched_text.length(), replacement);
2459+
}
2460+
2461+
search_start = matches.suffix().first;
2462+
}
2463+
2464+
return result;
2465+
}
2466+
23992467
} // namespace dingodb

src/common/helper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,10 @@ class Helper {
424424
static bool StringConvertFalse(const std::string& str);
425425
static void HandleBoolControlConfigVariable(const pb::common::ControlConfigVariable& variable,
426426
pb::common::ControlConfigVariable& config, bool& gflags_var);
427+
428+
static bool IsBase64Encoded(const std::string& input);
429+
static std::string Base64Encode(const std::string& input);
430+
static std::string EncodeREContent(const std::string& input);
427431
};
428432

429433
} // namespace dingodb

src/document/document_index.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,10 +392,11 @@ butil::Status DocumentIndex::Search(uint32_t topk, const std::string& query_stri
392392
// }
393393
// }
394394
// }
395+
std::string query_string_encode = Helper::EncodeREContent(query_string);
395396

396397
auto search_result =
397-
ffi_bm25_search_with_column_names(index_path_, query_string, topk, alive_ids, use_id_filter, use_range_filter,
398-
start_id, end_id, column_names, query_unlimited);
398+
ffi_bm25_search_with_column_names(index_path_, query_string_encode, topk, alive_ids, use_id_filter,
399+
use_range_filter, start_id, end_id, column_names, query_unlimited);
399400

400401
if (search_result.error_code == 0) {
401402
for (const auto& row_id_with_score : search_result.result) {

0 commit comments

Comments
 (0)