Skip to content

Commit c1ef21b

Browse files
authored
refactor(clp-s): Replace clp_s::StringUtils function calls with clp::string_utils equivalents and remove redundant implementations. (#1103)
1 parent 46aa1e1 commit c1ef21b

File tree

8 files changed

+33
-463
lines changed

8 files changed

+33
-463
lines changed

components/core/cmake/Options/options.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ endfunction()
198198

199199
function(validate_clp_s_archivereader_dependencies)
200200
validate_clp_dependencies_for_target(CLP_BUILD_CLP_S_ARCHIVEREADER
201+
CLP_BUILD_CLP_STRING_UTILS
201202
CLP_BUILD_CLP_S_CLP_DEPENDENCIES
202203
CLP_BUILD_CLP_S_IO
203204
CLP_BUILD_CLP_S_TIMESTAMPPATTERN
@@ -300,6 +301,7 @@ endfunction()
300301

301302
function(validate_clp_s_search_dependencies)
302303
validate_clp_dependencies_for_target(CLP_BUILD_CLP_S_SEARCH
304+
CLP_BUILD_CLP_STRING_UTILS
303305
CLP_BUILD_CLP_S_ARCHIVEREADER
304306
CLP_BUILD_CLP_S_CLP_DEPENDENCIES
305307
CLP_BUILD_CLP_S_SEARCH_AST

components/core/src/clp_s/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ if(CLP_BUILD_CLP_S_ARCHIVEREADER)
302302
clp_s_archive_reader
303303
PUBLIC
304304
absl::flat_hash_map
305+
clp::string_utils
305306
clp_s::io
306307
msgpack-cxx
307308
nlohmann_json::nlohmann_json

components/core/src/clp_s/DictionaryReader.hpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <unordered_set>
77

88
#include <boost/algorithm/string/case_conv.hpp>
9+
#include <string_utils/string_utils.hpp>
910

1011
#include "ArchiveReaderAdaptor.hpp"
1112
#include "DictionaryEntry.hpp"
@@ -191,7 +192,12 @@ void DictionaryReader<DictionaryIdType, EntryType>::get_entries_matching_wildcar
191192
std::unordered_set<EntryType const*>& entries
192193
) const {
193194
for (auto const& entry : m_entries) {
194-
if (StringUtils::wildcard_match_unsafe(entry.get_value(), wildcard_string, !ignore_case)) {
195+
if (clp::string_utils::wildcard_match_unsafe(
196+
entry.get_value(),
197+
wildcard_string,
198+
!ignore_case
199+
))
200+
{
195201
entries.insert(&entry);
196202
}
197203
}

components/core/src/clp_s/Utils.cpp

Lines changed: 3 additions & 304 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <boost/url.hpp>
1010
#include <fmt/core.h>
1111
#include <spdlog/spdlog.h>
12+
#include <string_utils/string_utils.hpp>
1213

1314
#include "archive_constants.hpp"
1415

@@ -188,9 +189,9 @@ bool StringUtils::get_bounds_of_next_var(string const& msg, size_t& begin_pos, s
188189
end_pos = begin_pos;
189190
for (; end_pos < msg_length; ++end_pos) {
190191
char c = msg[end_pos];
191-
if (is_decimal_digit(c)) {
192+
if (clp::string_utils::is_decimal_digit(c)) {
192193
contains_decimal_digit = true;
193-
} else if (is_alphabet(c)) {
194+
} else if (clp::string_utils::is_alphabet(c)) {
194195
contains_alphabet = true;
195196
} else if (is_delim(c)) {
196197
break;
@@ -212,308 +213,6 @@ bool StringUtils::get_bounds_of_next_var(string const& msg, size_t& begin_pos, s
212213
return (msg_length != begin_pos);
213214
}
214215

215-
size_t StringUtils::find_first_of(
216-
string const& haystack,
217-
char const* needles,
218-
size_t search_start_pos,
219-
size_t& needle_ix
220-
) {
221-
size_t haystack_length = haystack.length();
222-
size_t needles_length = strlen(needles);
223-
for (size_t i = search_start_pos; i < haystack_length; ++i) {
224-
for (needle_ix = 0; needle_ix < needles_length; ++needle_ix) {
225-
if (haystack[i] == needles[needle_ix]) {
226-
return i;
227-
}
228-
}
229-
}
230-
231-
return string::npos;
232-
}
233-
234-
string StringUtils::replace_characters(
235-
char const* characters_to_escape,
236-
char const* replacement_characters,
237-
string const& value,
238-
bool escape
239-
) {
240-
string new_value;
241-
size_t search_start_pos = 0;
242-
while (true) {
243-
size_t replace_char_ix;
244-
size_t char_to_replace_pos
245-
= find_first_of(value, characters_to_escape, search_start_pos, replace_char_ix);
246-
if (string::npos == char_to_replace_pos) {
247-
new_value.append(value, search_start_pos, string::npos);
248-
break;
249-
} else {
250-
new_value.append(value, search_start_pos, char_to_replace_pos - search_start_pos);
251-
if (escape) {
252-
new_value += "\\";
253-
}
254-
new_value += replacement_characters[replace_char_ix];
255-
search_start_pos = char_to_replace_pos + 1;
256-
}
257-
}
258-
return new_value;
259-
}
260-
261-
void StringUtils::to_lower(string& str) {
262-
std::transform(str.cbegin(), str.cend(), str.begin(), [](unsigned char c) {
263-
return std::tolower(c);
264-
});
265-
}
266-
267-
bool StringUtils::is_wildcard(char c) {
268-
static constexpr char cWildcards[] = "?*";
269-
for (size_t i = 0; i < strlen(cWildcards); ++i) {
270-
if (cWildcards[i] == c) {
271-
return true;
272-
}
273-
}
274-
return false;
275-
}
276-
277-
string StringUtils::clean_up_wildcard_search_string(string_view str) {
278-
string cleaned_str;
279-
280-
bool is_escaped = false;
281-
auto str_end = str.cend();
282-
for (auto current = str.cbegin(); current != str_end;) {
283-
auto c = *current;
284-
if (is_escaped) {
285-
is_escaped = false;
286-
287-
if (is_wildcard(c) || '\\' == c) {
288-
// Keep escaping if c is a wildcard character or an escape character
289-
cleaned_str += '\\';
290-
}
291-
cleaned_str += c;
292-
++current;
293-
} else if ('*' == c) {
294-
cleaned_str += c;
295-
296-
// Skip over all '*' to find the next non-'*'
297-
do {
298-
++current;
299-
} while (current != str_end && '*' == *current);
300-
} else {
301-
if ('\\' == c) {
302-
is_escaped = true;
303-
} else {
304-
cleaned_str += c;
305-
}
306-
++current;
307-
}
308-
}
309-
310-
return cleaned_str;
311-
}
312-
313-
bool StringUtils::advance_tame_to_next_match(
314-
char const*& tame_current,
315-
char const*& tame_bookmark,
316-
char const* tame_end,
317-
char const*& wild_current,
318-
char const*& wild_bookmark
319-
) {
320-
auto w = *wild_current;
321-
if ('?' != w) {
322-
// No need to check for '*' since the caller ensures wild doesn't
323-
// contain consecutive '*'
324-
325-
// Handle escaped characters
326-
if ('\\' == w) {
327-
++wild_current;
328-
// This is safe without a bounds check since the caller
329-
// ensures there are no dangling escape characters
330-
w = *wild_current;
331-
}
332-
333-
// Advance tame_current until it matches wild_current
334-
while (true) {
335-
if (tame_end == tame_current) {
336-
// Wild group is longer than last group in tame, so
337-
// can't match
338-
// e.g. "*abc" doesn't match "zab"
339-
return false;
340-
}
341-
auto t = *tame_current;
342-
if (t == w) {
343-
break;
344-
}
345-
++tame_current;
346-
}
347-
}
348-
349-
tame_bookmark = tame_current;
350-
351-
return true;
352-
}
353-
354-
bool
355-
StringUtils::wildcard_match_unsafe(string_view tame, string_view wild, bool case_sensitive_match) {
356-
if (case_sensitive_match) {
357-
return wildcard_match_unsafe_case_sensitive(tame, wild);
358-
} else {
359-
// We convert to lowercase (rather than uppercase) anticipating that
360-
// callers use lowercase more frequently, so little will need to change.
361-
string lowercase_tame(tame);
362-
to_lower(lowercase_tame);
363-
string lowercase_wild(wild);
364-
to_lower(lowercase_wild);
365-
return wildcard_match_unsafe_case_sensitive(lowercase_tame, lowercase_wild);
366-
}
367-
}
368-
369-
/**
370-
* The algorithm basically works as follows:
371-
* Given a wild string "*abc*def*ghi*", it can be broken into groups of
372-
* characters delimited by one or more '*' characters. The goal of the
373-
* algorithm is then to determine whether the tame string contains each of
374-
* those groups in the same order.
375-
*
376-
* Thus, the algorithm:
377-
* 1. searches for the start of one of these groups in wild,
378-
* 2. searches for a group in tame starting with the same character, and then
379-
* 3. checks if the two match. If not, the search repeats with the next group in
380-
* tame.
381-
*/
382-
bool StringUtils::wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) {
383-
auto const tame_length = tame.length();
384-
auto const wild_length = wild.length();
385-
char const* tame_current = tame.data();
386-
char const* wild_current = wild.data();
387-
char const* tame_bookmark = nullptr;
388-
char const* wild_bookmark = nullptr;
389-
char const* tame_end = tame_current + tame_length;
390-
char const* wild_end = wild_current + wild_length;
391-
392-
// Handle wild or tame being empty
393-
if (0 == wild_length) {
394-
return 0 == tame_length;
395-
} else {
396-
if (0 == tame_length) {
397-
return "*" == wild;
398-
}
399-
}
400-
401-
char w;
402-
char t;
403-
bool is_escaped = false;
404-
while (true) {
405-
w = *wild_current;
406-
if ('*' == w) {
407-
++wild_current;
408-
if (wild_end == wild_current) {
409-
// Trailing '*' means everything remaining in tame will match
410-
return true;
411-
}
412-
413-
// Set wild and tame bookmarks
414-
wild_bookmark = wild_current;
415-
if (!advance_tame_to_next_match(
416-
tame_current,
417-
tame_bookmark,
418-
tame_end,
419-
wild_current,
420-
wild_bookmark
421-
))
422-
{
423-
return false;
424-
}
425-
} else {
426-
// Handle escaped characters
427-
if ('\\' == w) {
428-
is_escaped = true;
429-
++wild_current;
430-
// This is safe without a bounds check since the caller
431-
// ensures there are no dangling escape characters
432-
w = *wild_current;
433-
}
434-
435-
// Handle a mismatch
436-
t = *tame_current;
437-
if (false == ((false == is_escaped && '?' == w) || t == w)) {
438-
if (nullptr == wild_bookmark) {
439-
// No bookmark to return to
440-
return false;
441-
}
442-
443-
wild_current = wild_bookmark;
444-
tame_current = tame_bookmark + 1;
445-
if (!advance_tame_to_next_match(
446-
tame_current,
447-
tame_bookmark,
448-
tame_end,
449-
wild_current,
450-
wild_bookmark
451-
))
452-
{
453-
return false;
454-
}
455-
}
456-
}
457-
458-
++tame_current;
459-
++wild_current;
460-
461-
// Handle reaching the end of tame or wild
462-
if (tame_end == tame_current) {
463-
return (wild_end == wild_current
464-
|| ('*' == *wild_current && (wild_current + 1) == wild_end));
465-
} else {
466-
if (wild_end == wild_current) {
467-
if (nullptr == wild_bookmark) {
468-
// No bookmark to return to
469-
return false;
470-
} else {
471-
wild_current = wild_bookmark;
472-
tame_current = tame_bookmark + 1;
473-
if (!advance_tame_to_next_match(
474-
tame_current,
475-
tame_bookmark,
476-
tame_end,
477-
wild_current,
478-
wild_bookmark
479-
))
480-
{
481-
return false;
482-
}
483-
}
484-
}
485-
}
486-
}
487-
}
488-
489-
bool StringUtils::convert_string_to_int64(std::string_view raw, int64_t& converted) {
490-
auto raw_end = raw.cend();
491-
auto result = std::from_chars(raw.cbegin(), raw_end, converted);
492-
if (raw_end != result.ptr) {
493-
return false;
494-
} else {
495-
return result.ec == std::errc();
496-
}
497-
}
498-
499-
bool StringUtils::convert_string_to_double(std::string const& raw, double& converted) {
500-
if (raw.empty()) {
501-
// Can't convert an empty string
502-
return false;
503-
}
504-
505-
char const* c_str = raw.c_str();
506-
char* end_ptr;
507-
// Reset errno so we can detect a new error
508-
errno = 0;
509-
double raw_as_double = strtod(c_str, &end_ptr);
510-
if (ERANGE == errno || (end_ptr - c_str) < raw.length()) {
511-
return false;
512-
}
513-
converted = raw_as_double;
514-
return true;
515-
}
516-
517216
void StringUtils::escape_json_string(std::string& destination, std::string_view const source) {
518217
// Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path
519218
// when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers

0 commit comments

Comments
 (0)