Skip to content

Commit 731cdb5

Browse files
KKiiimatc-github
authored and committed
fix(frontend): handle conversion between utf8 and utf16 strings based on llvm::ConvertUTF (#116)
1 parent 0b386cc commit 731cdb5

File tree

6 files changed

+2154
-2131
lines changed

6 files changed

+2154
-2131
lines changed

frontend/CompilerImpl.cpp

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#include <map>
1010
#include <optional>
1111
#include <regex>
12-
#include <sstream>
1312
#include <string>
1413
#include <string_view>
1514
#include <utility>
@@ -18,6 +17,7 @@
1817
#include "ASC/ASC.hpp"
1918
#include "CompilerImpl.hpp"
2019
#include "LinkedAPI.hpp"
20+
#include "llvm/Support/ConvertUTF.h"
2121
#include "warpo/frontend/Compiler.hpp"
2222
#include "warpo/support/Debug.hpp"
2323
#include "warpo/support/FileSystem.hpp"
@@ -42,15 +42,14 @@ enum WasmFFIBool : uint32_t { WASM_FALSE = 0, WASM_TRUE = 1 };
4242

4343
} // namespace
4444
int32_t FrontendCompiler::allocString(std::string_view str) {
45-
// FIXME: convert utf8 to utf16 need library
46-
int32_t const ptr = m.callExportedFunctionWithName<1>(stackTop, "__new", static_cast<int32_t>(str.size() * 2U),
45+
std::u16string utf16Str = utf8ToUtf16(std::string(str));
46+
int32_t const ptr = m.callExportedFunctionWithName<1>(stackTop, "__new", static_cast<int32_t>(utf16Str.size() * 2U),
4747
static_cast<int32_t>(2))[0]
4848
.i32;
4949
m.callExportedFunctionWithName<1>(stackTop, "__pin", ptr);
50-
uint8_t *const stringBegin = m.getLinearMemoryRegion(static_cast<uint32_t>(ptr), str.size() * 2U);
51-
for (size_t i = 0; i < str.size(); i++) {
52-
stringBegin[i * 2U] = str[i];
53-
}
50+
uint8_t *const stringBegin = m.getLinearMemoryRegion(static_cast<uint32_t>(ptr), utf16Str.size());
51+
std::memcpy(stringBegin, utf16Str.data(), utf16Str.size() * sizeof(char16_t));
52+
5453
return ptr;
5554
}
5655

@@ -70,14 +69,47 @@ std::string FrontendCompiler::getAsString(int32_t ptr) {
7069
uint32_t size = 0;
7170
std::memcpy(&size, header + 16, sizeof(size));
7271
uint8_t const *content = m.getLinearMemoryRegion(ptr, size);
73-
size /= 2U;
74-
std::stringstream ss{};
75-
for (uint32_t i = 0; i < size; ++i) {
76-
ss << content[i * 2U];
77-
}
78-
return std::move(ss).str();
72+
73+
std::u16string utf16Str;
74+
utf16Str.resize(size / 2);
75+
std::memcpy(utf16Str.data(), content, size);
76+
return utf16ToUtf8(utf16Str);
7977
};
8078

79+
std::u16string FrontendCompiler::utf8ToUtf16(std::string const &utf8Str) {
80+
if (utf8Str.empty())
81+
return std::u16string();
82+
const llvm::UTF8 *src = reinterpret_cast<const llvm::UTF8 *>(utf8Str.data());
83+
const llvm::UTF8 *srcEnd = src + utf8Str.size();
84+
std::u16string utf16Str;
85+
utf16Str.resize(utf8Str.size());
86+
llvm::UTF16 *dst = reinterpret_cast<llvm::UTF16 *>(utf16Str.data());
87+
llvm::UTF16 *dstEnd = dst + utf16Str.size();
88+
89+
if (llvm::ConvertUTF8toUTF16(&src, srcEnd, &dst, dstEnd, llvm::strictConversion) != llvm::conversionOK)
90+
throw std::runtime_error("UTF8 to UTF16 conversion failed");
91+
// Resize the string to the actual number of UTF-16 code units written
92+
utf16Str.resize(dst - reinterpret_cast<llvm::UTF16 *>(utf16Str.data()));
93+
return utf16Str;
94+
}
95+
96+
std::string FrontendCompiler::utf16ToUtf8(std::u16string const &utf16Str) {
97+
if (utf16Str.empty())
98+
return std::string();
99+
const llvm::UTF16 *src = reinterpret_cast<const llvm::UTF16 *>(utf16Str.data());
100+
const llvm::UTF16 *srcEnd = src + utf16Str.size();
101+
std::string utf8Str;
102+
utf8Str.resize(utf16Str.size() * 4); // UTF-8 can be up to 4 bytes per Unicode code point
103+
llvm::UTF8 *dst = reinterpret_cast<llvm::UTF8 *>(utf8Str.data());
104+
llvm::UTF8 *dstEnd = dst + utf8Str.size();
105+
106+
if (llvm::ConvertUTF16toUTF8(&src, srcEnd, &dst, dstEnd, llvm::strictConversion) != llvm::conversionOK)
107+
throw std::runtime_error("UTF16 to UTF8 conversion failed");
108+
// Resize the string to the actual number of UTF-8 bytes written
109+
utf8Str.resize(dst - reinterpret_cast<llvm::UTF8 *>(utf8Str.data()));
110+
return utf8Str;
111+
}
112+
81113
using PackageResolveResult = std::optional<std::pair<std::string, std::optional<std::string>>>;
82114

83115
static PackageResolveResult getPackageName(std::string const &fileInternalPath) {

frontend/CompilerImpl.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class FrontendCompiler {
3131
std::string errorMessage_;
3232

3333
int32_t allocString(std::string_view str);
34+
std::u16string utf8ToUtf16(std::string const &utf8Str);
35+
std::string utf16ToUtf8(std::u16string const &utf16Str);
3436

3537
void parseFile(int32_t const program, std::optional<std::string> const &code, std::string_view path, IsEntry isEntry);
3638

@@ -60,7 +62,6 @@ class FrontendCompiler {
6062
FrontendCompiler(Config const &config);
6163
~FrontendCompiler();
6264

63-
6465
CompilationResult compile(std::vector<std::string> const &entryFilePaths, Config const &config);
6566

6667
std::set<void *> allocedPtrs_;

0 commit comments

Comments
 (0)