Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 27 additions & 29 deletions libc/src/__support/wchar/string_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,57 +27,55 @@ template <typename T> class StringConverter {
size_t src_len;
size_t src_idx;

// # of src elements pushed to cr needed to represent the current character
size_t num_pushed;

// # of pops we are allowed to perform (essentially size of the dest buffer)
size_t num_to_write;

int pushFullCharacter() {
// On the very first pop we must always call pushFullCharacter(), because a
// previous StringConverter may have pushed part of a character to the
// mbstate.
bool first_pop;

ErrorOr<size_t> pushFullCharacter() {
size_t num_pushed;
for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
++num_pushed) {
int err = cr.push(src[src_idx + num_pushed]);
if (err != 0)
return err;
return Error(err);
}

// if we aren't able to read a full character from the source string
if (src_idx + num_pushed == src_len && !cr.isFull()) {
src_idx += num_pushed;
return -1;
return Error(-1);
}

return 0;
return num_pushed;
}

public:
StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
: cr(ps), src(s), src_len(srclen), src_idx(0), num_pushed(0),
num_to_write(dstlen) {
pushFullCharacter();
}

StringConverter(const T *s, size_t dstlen, mbstate *ps)
: StringConverter(s, SIZE_MAX, dstlen, ps) {}
StringConverter(const T *s, mbstate *ps, size_t dstlen, size_t srclen=SIZE_MAX)
: cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen),
first_pop(true) {}

// TODO: the following functions (popUTF32 and popUTF8) are almost identical;
// look into templating the CharacterConverter pop functions.
ErrorOr<char32_t> popUTF32() {
if (cr.isEmpty()) {
int err = pushFullCharacter();
if (err != 0)
return Error(err);
if (cr.isEmpty() || first_pop) {
first_pop = false;
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
return Error(src_elements_read.error());

if (cr.sizeAsUTF32() > num_to_write) {
cr.clear();
return Error(-1);
}

src_idx += src_elements_read.value();
}

auto out = cr.pop_utf32();
if (cr.isEmpty())
src_idx += num_pushed;

if (out.has_value() && out.value() == L'\0')
src_len = src_idx;

Expand All @@ -87,21 +85,21 @@ template <typename T> class StringConverter {
}

ErrorOr<char8_t> popUTF8() {
if (cr.isEmpty()) {
int err = pushFullCharacter();
if (err != 0)
return Error(err);
if (cr.isEmpty() || first_pop) {
first_pop = false;
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
return Error(src_elements_read.error());

if (cr.sizeAsUTF8() > num_to_write) {
cr.clear();
return Error(-1);
}

src_idx += src_elements_read.value();
}

auto out = cr.pop_utf8();
if (cr.isEmpty())
src_idx += num_pushed;

if (out.has_value() && out.value() == '\0')
src_len = src_idx;

Expand Down
66 changes: 33 additions & 33 deletions libc/test/src/__support/wchar/string_converter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);

auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
Expand Down Expand Up @@ -60,22 +60,22 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
const wchar_t *src = L"\x1f921\x2211\xff\x41";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);

auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
Expand All @@ -86,12 +86,12 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x88);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
Expand All @@ -102,7 +102,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
Expand Down Expand Up @@ -131,22 +131,22 @@ TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);

auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
Expand All @@ -164,7 +164,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
reinterpret_cast<const char8_t *>(src), 5, SIZE_MAX, &state);
reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 5);

auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
Expand All @@ -181,22 +181,22 @@ TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);

auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
Expand All @@ -215,7 +215,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);

auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
Expand All @@ -237,22 +237,22 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
const wchar_t *src = L"\x1f921\xff"; // clown emoji, y with diaeresis (ÿ)
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);

auto res = sc1.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);

res = sc1.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);

res = sc1.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);

res = sc1.popUTF8();
ASSERT_TRUE(res.has_value());
Expand All @@ -261,13 +261,13 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {

// sc2 should pick up where sc1 left off and continue the conversion
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), 1,
SIZE_MAX, &state);
reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), &state,
SIZE_MAX, 1);

res = sc2.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);

res = sc2.popUTF8();
ASSERT_TRUE(res.has_value());
Expand All @@ -279,7 +279,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
reinterpret_cast<const char8_t *>(src), 2, SIZE_MAX, &state);
reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 2);

auto res = sc1.popUTF32();
ASSERT_FALSE(res.has_value());
Expand All @@ -288,8 +288,8 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {

// sc2 should pick up where sc1 left off and continue the conversion
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), 3,
SIZE_MAX, &state);
reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), &state,
SIZE_MAX, 3);

res = sc2.popUTF32();
ASSERT_TRUE(res.has_value());
Expand All @@ -306,7 +306,7 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
reinterpret_cast<const char8_t *>(src), SIZE_MAX, 1, &state);
reinterpret_cast<const char8_t *>(src), &state, 1, SIZE_MAX);

auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
Expand All @@ -320,19 +320,19 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), SIZE_MAX, 5, &state);
reinterpret_cast<const char32_t *>(src), &state, 5, SIZE_MAX);

auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);

res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
Expand Down
Loading