Skip to content

Commit d8f5c4d

Browse files
committed
feat: make file search patterns understand Unicode
File search patterns were previously split up and escaped as individual UTF-8 code units instead of being matched on the actual codepoints. This change transcodes the pattern to UTF-32 codepoints and emits non-special codepoints as curly hex escapes like `\x{10FFFF}`.
1 parent 222574d commit d8f5c4d

File tree

1 file changed

+21
-14
lines changed

1 file changed

+21
-14
lines changed

engine/system/win/sys_main.cpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -140,42 +140,49 @@ std::optional<std::string> BuildGlobPattern(std::filesystem::path const& glob)
140140
{
141141
using namespace std::literals::string_view_literals;
142142
auto globStr = glob.generic_u8string();
143+
auto globView = std::string_view(globStr);
143144

144145
// Deal with traditional "everything" wildcards.
145-
if (glob == "*" || glob == "*.*") {
146+
if (globView == "*" || globView == "*.*") {
146147
return {};
147148
}
148149

150+
auto u32Str = IndexUTF8ToUTF32(globStr);
151+
auto& offsets = u32Str.sourceCodeUnitOffsets;
152+
149153
fmt::memory_buffer buf;
154+
buf.reserve(globStr.size() * 3); // Decent estimate of final pattern size.
150155

151156
// If no wildcards are present, test file path verbatim.
152157
// We use a regex rather than string comparisons to make it case-insensitive.
153-
if (globStr.find_first_of("?*") == std::string::npos) {
154-
buf.reserve(globStr.size() * 3); // Decent estimate of final pattern size.
155-
156-
for (char ch : globStr) {
157-
fmt::format_to(fmt::appender(buf), "[{}]", ch);
158+
if (u32Str.text.find_first_of(U"?*") == std::u32string::npos) {
159+
for (size_t offIdx = 0; offIdx < offsets.size(); ++offIdx) {
160+
int byteOffset = offsets[offIdx];
161+
int nextOffset = (offIdx + 1 < offsets.size()) ? offsets[offIdx + 1] : globStr.size();
162+
fmt::format_to(fmt::appender(buf), "[{}]", globView.substr(byteOffset, nextOffset - byteOffset));
158163
}
159164
}
160165
else {
161166
// Otherwise build a regular expression from the glob and use that to match files.
162167
auto it = fmt::appender(buf);
163-
for (char ch : globStr) {
164-
if (ch == '*') {
168+
for (size_t offIdx = 0; offIdx < offsets.size(); ++offIdx) {
169+
char32_t ch = u32Str.text[offIdx];
170+
if (ch == U'*') {
165171
it = fmt::format_to(it, ".*");
166172
}
167-
else if (ch == '?') {
173+
else if (ch == U'?') {
168174
*it++ = '.';
169175
}
170-
else if ("+[]{}+()|"sv.find(ch) != std::string_view::npos) {
176+
else if (U".+[]{}+()|"sv.find(ch) != std::u32string::npos) {
171177
// Escape metacharacters
172-
it = fmt::format_to(it, "\\{}", ch);
178+
it = fmt::format_to(it, "\\{}", (char)ch);
173179
}
174-
else if (std::isalnum((unsigned char)ch)) {
175-
*it++ = ch;
180+
else if (ch < 0x80 && std::isalnum((unsigned char)ch)) {
181+
*it++ = (char)ch;
176182
}
177183
else {
178-
it = fmt::format_to(it, "[{}]", ch);
184+
// Emit as \x{10FFFF}.
185+
it = fmt::format_to(it, "\\x{{{:X}}}", (uint32_t)ch);
179186
}
180187
}
181188
}

0 commit comments

Comments
 (0)