Skip to content

Commit edc145b

Browse files
committed
Simplify UTF-16 silliness in canonicalize_path (Windows)
On Windows, replace a UTF-16-to-UTF-8 conversion with a UTF-8 code unit count. This removes several allocations in canonicalize_path's constructor. I did not measure the performance impact of this change. Holistically, the algorithm used for canonicalize_path::parent and canonicalize_path_result::drop_missing_components is pretty bad in terms of performance. I don't plan on fixing the algorithm any time soon though.
1 parent d94ee30 commit edc145b

File tree

4 files changed

+66
-3
lines changed

4 files changed

+66
-3
lines changed

src/file-canonical.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,7 @@ canonical_path::canonical_path(std::string &&path) : path_(std::move(path)) {
122122
HRESULT result = ::PathCchRemoveFileSpec(wpath->data(), wpath->size() + 1);
123123
switch (result) {
124124
case S_OK:
125-
// TODO(strager): Avoid allocating just to count UTF-8 code units.
126-
this->path_lengths_.push_back(
127-
std::filesystem::path(wpath->c_str()).u8string().size());
125+
this->path_lengths_.push_back(count_utf_8_code_units(wpath->c_str()));
128126
break;
129127
case S_FALSE:
130128
// Path is a root path already.

src/quick-lint-js/utf-16.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <optional>
88
#include <string>
9+
#include <string_view>
910
#include <vector>
1011

1112
namespace quick_lint_js {
@@ -29,6 +30,11 @@ class mbargv {
2930

3031
std::optional<std::wstring> mbstring_to_wstring(const char *mbstring);
3132
#endif
33+
34+
std::size_t count_utf_8_code_units(std::u16string_view) noexcept;
35+
#if defined(_WIN32)
36+
std::size_t count_utf_8_code_units(std::wstring_view) noexcept;
37+
#endif
3238
}
3339
#endif
3440

src/utf-16.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <quick-lint-js/narrow-cast.h>
1010
#include <quick-lint-js/utf-16.h>
1111
#include <string>
12+
#include <string_view>
1213
#include <vector>
1314

1415
#if defined(_WIN32)
@@ -101,6 +102,31 @@ std::optional<std::wstring> mbstring_to_wstring(const char *mbstring) {
101102
return wstring;
102103
}
103104
#endif
105+
106+
std::size_t count_utf_8_code_units(std::u16string_view utf_16) noexcept {
107+
std::size_t count = 0;
108+
for (char16_t c : utf_16) {
109+
if (c < 0x0080) {
110+
count += 1;
111+
} else if (c < 0x0800) {
112+
count += 2;
113+
} else if (0xd800 <= c && c <= 0xdfff) {
114+
// Part of a surrogate pair.
115+
count += 2;
116+
} else {
117+
count += 3;
118+
}
119+
}
120+
return count;
121+
}
122+
123+
#if defined(_WIN32)
124+
std::size_t count_utf_8_code_units(std::wstring_view utf_16) noexcept {
125+
static_assert(sizeof(char16_t) == sizeof(wchar_t));
126+
return count_utf_8_code_units(std::u16string_view(
127+
reinterpret_cast<const char16_t *>(utf_16.data()), utf_16.size()));
128+
}
129+
#endif
104130
}
105131

106132
// quick-lint-js finds bugs in JavaScript programs.

test/test-utf-16.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
#include <gtest/gtest.h>
55
#include <quick-lint-js/narrow-cast.h>
66
#include <quick-lint-js/utf-16.h>
7+
#include <string_view>
8+
9+
using namespace std::literals::string_view_literals;
710

811
namespace quick_lint_js {
12+
namespace {
913
#if defined(_WIN32)
1014
TEST(test_utf_16_windows, mbargv) {
1115
std::vector<wchar_t*> argv;
@@ -34,6 +38,35 @@ TEST(test_utf_16_windows, mbstring_to_wstring) {
3438
}
3539
}
3640
#endif
41+
42+
TEST(test_count_utf_8_code_units_in_utf_16, empty_string) {
43+
EXPECT_EQ(count_utf_8_code_units(u""sv), 0);
44+
}
45+
46+
TEST(test_count_utf_8_code_units_in_utf_16, ascii) {
47+
EXPECT_EQ(count_utf_8_code_units(u"abc 123"sv), 7);
48+
EXPECT_EQ(count_utf_8_code_units(u"\u007f"sv), 1);
49+
EXPECT_EQ(count_utf_8_code_units(u"\u0000"sv), 1);
50+
}
51+
52+
TEST(test_count_utf_8_code_units_in_utf_16, 2_byte_utf_8) {
53+
EXPECT_EQ(count_utf_8_code_units(u"\u0080"sv), 2);
54+
EXPECT_EQ(count_utf_8_code_units(u"\u07ff"sv), 2);
55+
}
56+
57+
TEST(test_count_utf_8_code_units_in_utf_16, 3_byte_utf_8) {
58+
EXPECT_EQ(count_utf_8_code_units(u"\u0800"sv), 3);
59+
EXPECT_EQ(count_utf_8_code_units(u"\uffff"sv), 3);
60+
}
61+
62+
TEST(test_count_utf_8_code_units_in_utf_16, surrogate_pair) {
63+
EXPECT_EQ(count_utf_8_code_units(u"\U00010437"sv), 4) << "D801 DC37";
64+
EXPECT_EQ(count_utf_8_code_units(u"\U00024B62"sv), 4) << "D852 DF62";
65+
}
66+
67+
// TODO(strager): How should count_utf_8_code_units behave with invalid UTF-16
68+
// (e.g. incomplete surrogate pair)?
69+
}
3770
}
3871

3972
// quick-lint-js finds bugs in JavaScript programs.

0 commit comments

Comments
 (0)