Skip to content

Commit 5adb489

Browse files
committed
Fix UTF-8 handling in ConvertUtf8ToUtf16 and improve error handling in Utf16Reader and Utf16Writer
1 parent 3a1d7e0 commit 5adb489

File tree

2 files changed

+37
-24
lines changed

2 files changed

+37
-24
lines changed

src/Explorer/UTF16Stream.cpp

Lines changed: 37 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ std::wstring ConvertUtf8ToUtf16(std::string_view utf8_str)
1515

1616
while (i < utf8_str.length()) {
1717
uint32_t codepoint = 0;
18-
int bytes = 0;
18+
size_t bytes = 0;
1919

2020
unsigned char c = static_cast<unsigned char>(utf8_str[i]);
2121

@@ -36,21 +36,29 @@ std::wstring ConvertUtf8ToUtf16(std::string_view utf8_str)
3636
bytes = 4;
3737
}
3838
else {
39-
// Invalid UTF-8 sequence
39+
// Invalid UTF-8 sequence - skip this byte
4040
++i;
4141
continue;
4242
}
4343

4444
// Process remaining bytes
45-
for (int j = 1; j < bytes && (i + j) < utf8_str.length(); ++j) {
45+
bool valid = true;
46+
for (size_t j = 1; j < bytes && (i + j) < utf8_str.length(); ++j) {
4647
unsigned char next = static_cast<unsigned char>(utf8_str[i + j]);
4748
if ((next & 0xC0) != 0x80) {
48-
// Invalid UTF-8 sequence
49+
// Invalid continuation byte
50+
valid = false;
4951
break;
5052
}
5153
codepoint = (codepoint << 6) | (next & 0x3F);
5254
}
5355

56+
if (!valid || (i + bytes) > utf8_str.length()) {
57+
// Skip invalid sequence
58+
++i;
59+
continue;
60+
}
61+
5462
// Convert to UTF-16
5563
if (codepoint <= 0xFFFF) {
5664
// Character in BMP
@@ -107,30 +115,33 @@ bool Utf16Reader::getline(std::wstring& line)
107115
wchar_t next;
108116
if (ReadChar(next)) {
109117
if (next == L'\n') {
118+
// Found \r\n - line ending
110119
break;
111120
}
112-
line += ch;
121+
// \r followed by something else - include only the next char
113122
line += next;
114123
}
115-
} else {
124+
// Single \r as line ending
125+
break;
126+
}
127+
else {
116128
line += ch;
117129
}
118130
}
119131
return found_data;
120132
}
121133

122-
bool Utf16Reader::eof() const {
134+
bool Utf16Reader::eof() const
135+
{
123136
return file_.eof();
124137
}
125138

126-
void Utf16Reader::close() {
127-
file_.close();
128-
}
139+
void Utf16Reader::close() { file_.close(); }
129140

130141
bool Utf16Reader::ReadChar(wchar_t& ch)
131142
{
132143
file_.read(reinterpret_cast<char*>(&ch), sizeof(ch));
133-
if (file_.gcount() != sizeof(ch)) {
144+
if (file_.gcount() != static_cast<std::streamsize>(sizeof(ch))) {
134145
file_.setstate(std::ios::eofbit);
135146
return false;
136147
}
@@ -142,30 +153,30 @@ bool Utf16Reader::ReadChar(wchar_t& ch)
142153
Utf16Writer::Utf16Writer(const std::filesystem::path& filename)
143154
: file_(filename, std::ios::binary)
144155
{
145-
if (!file_.is_open()) {
156+
if (!file_.is_open())
157+
{
146158
throw std::runtime_error("Failed to open file: " + filename.string());
147159
}
148160
file_.write(reinterpret_cast<const char*>(&UTF16LE_BOM), sizeof(UTF16LE_BOM));
161+
if (!file_.good()) {
162+
throw std::runtime_error("Failed to write BOM to file: " + filename.string());
163+
}
149164
}
150165

151166
Utf16Writer::~Utf16Writer()
152167
{
153168
file_.close();
154169
}
155170

156-
bool Utf16Writer::is_open() const {
171+
bool Utf16Writer::is_open() const
172+
{
157173
return file_.is_open();
158174
}
159175

160176
Utf16Writer& Utf16Writer::operator<<(std::wstring_view str)
161177
{
162-
file_.write(reinterpret_cast<const char*>(str.data()), str.length() * sizeof(wchar_t));
163-
return *this;
164-
}
165-
166-
Utf16Writer& Utf16Writer::operator<<(const std::wstring& str)
167-
{
168-
file_.write(reinterpret_cast<const char*>(str.data()), str.length() * sizeof(wchar_t));
178+
file_.write(reinterpret_cast<const char*>(str.data()),
179+
static_cast<std::streamsize>(str.length() * sizeof(wchar_t)));
169180
return *this;
170181
}
171182

@@ -178,21 +189,24 @@ Utf16Writer& Utf16Writer::operator<<(wchar_t ch)
178189
Utf16Writer& Utf16Writer::operator<<(const wchar_t* str)
179190
{
180191
const std::wstring_view view(str);
181-
file_.write(reinterpret_cast<const char*>(view.data()), view.length() * sizeof(wchar_t));
192+
file_.write(reinterpret_cast<const char*>(view.data()),
193+
static_cast<std::streamsize>(view.length() * sizeof(wchar_t)));
182194
return *this;
183195
}
184196

185197
Utf16Writer& Utf16Writer::operator<<(uint32_t value)
186198
{
187199
const std::wstring str = std::to_wstring(value);
188-
file_.write(reinterpret_cast<const char*>(str.data()), str.length() * sizeof(wchar_t));
200+
file_.write(reinterpret_cast<const char*>(str.data()),
201+
static_cast<std::streamsize>(str.length() * sizeof(wchar_t)));
189202
return *this;
190203
}
191204

192205
Utf16Writer& Utf16Writer::operator<<(std::string_view str)
193206
{
194207
// Convert to UTF-16 from UTF-8
195208
std::wstring converted = ConvertUtf8ToUtf16(str);
196-
file_.write(reinterpret_cast<const char*>(converted.data()), converted.length() * sizeof(wchar_t));
209+
file_.write(reinterpret_cast<const char*>(converted.data()),
210+
static_cast<std::streamsize>(converted.length() * sizeof(wchar_t)));
197211
return *this;
198212
}

src/Explorer/UTF16Stream.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,6 @@ class Utf16Writer
2424

2525
bool is_open() const;
2626

27-
Utf16Writer& operator<<(const std::wstring& str);
2827
Utf16Writer& operator<<(std::wstring_view str);
2928
Utf16Writer& operator<<(std::string_view str);
3029
Utf16Writer& operator<<(const wchar_t* str);

0 commit comments

Comments
 (0)