Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions libc/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ add_header_library(
libc.hdr.types.char32_t
)

# Header-only StringConverter. DEPENDS mirrors every header that
# string_converter.h includes, so the size_t and common.h targets are listed
# explicitly instead of being picked up transitively.
add_header_library(
  string_converter
  HDRS
    string_converter.h
  DEPENDS
    libc.hdr.types.char8_t
    libc.hdr.types.char32_t
    libc.hdr.types.size_t
    libc.src.__support.common
    libc.src.__support.error_or
    .mbstate
    .character_converter
)

add_object_library(
character_converter
HDRS
Expand Down
87 changes: 87 additions & 0 deletions libc/src/__support/wchar/string_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"

namespace LIBC_NAMESPACE_DECL {
namespace internal {

// Walks a source string made of code units of type T, pushes one whole
// character at a time into a CharacterConverter, and hands the converted
// result back one output code unit per pop call.
template <typename T> class StringConverter {
private:
  CharacterConverter cr;
  const T *src;
  size_t src_len; // number of source code units that may be read
  size_t src_idx; // index of the next source code unit to push

  // Pushes source code units into the converter until it holds a full
  // character.
  //
  // Returns 0 when the converter already holds pending output or a full
  // character was pushed, the error code reported by CharacterConverter::push
  // when a unit is rejected, and -1 when the readable source ends before the
  // character is complete. On both failure paths src_idx is rewound to the
  // first code unit of the offending character.
  //
  // NOTE(review): the rewind only restores src_idx; whether the converter
  // discards the code units it already accepted depends on CharacterConverter
  // and is not visible here -- confirm.
  int pushFullCharacter() {
    // Output of the previous character has not been fully popped yet.
    if (!cr.isEmpty())
      return 0;

    // Must be size_t: storing the index in an int truncates offsets above
    // INT_MAX and would rewind src_idx to the wrong position.
    const size_t original_idx = src_idx;
    while (!cr.isFull() && src_idx < src_len) {
      int err = cr.push(src[src_idx++]);
      if (err != 0) {
        // point to the beginning of the invalid sequence
        src_idx = original_idx;
        return err;
      }
    }

    if (src_idx == src_len && !cr.isFull()) {
      // src points to the beginning of the character
      src_idx = original_idx;
      return -1;
    }

    return 0;
  }

public:
  // Converts a null-terminated string. The length starts at the largest
  // size_t value (spelled as a cast so no SIZE_MAX macro is required) and is
  // clamped to the consumed length once the terminator has been popped.
  StringConverter(const T *s, mbstate *ps)
      : cr(ps), src(s), src_len(static_cast<size_t>(-1)), src_idx(0) {}

  // Converts at most len source code units starting at s.
  StringConverter(const T *s, size_t len, mbstate *ps)
      : cr(ps), src(s), src_len(len), src_idx(0) {}

  // Pops the next UTF-32 code unit, pushing a new source character first if
  // the converter is empty. Returns the error from pushFullCharacter() when
  // no complete character could be pushed.
  ErrorOr<char32_t> popUTF32() {
    int err = pushFullCharacter();
    if (err != 0)
      return Error(err);

    auto out = cr.pop_utf32();
    // After the null terminator is produced, clamp the length so every later
    // pop reports end of input (-1) instead of reading past the terminator.
    if (out.has_value() && out.value() == U'\0')
      src_len = src_idx;

    return out;
  }

  // Pops the next UTF-8 code unit, pushing a new source character first if
  // the converter is empty. Returns the error from pushFullCharacter() when
  // no complete character could be pushed.
  ErrorOr<char8_t> popUTF8() {
    int err = pushFullCharacter();
    if (err != 0)
      return Error(err);

    auto out = cr.pop_utf8();
    // After the null terminator is produced, clamp the length so every later
    // pop reports end of input (-1) instead of reading past the terminator.
    if (out.has_value() && out.value() == '\0')
      src_len = src_idx;

    return out;
  }
};

} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
10 changes: 10 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,13 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)

# Unit tests for StringConverter. DEPENDS lists a target for every header the
# test source includes directly, not only the library under test.
add_libc_test(
  string_converter_test.cpp
  SUITE
    libc-support-tests
  SRCS
    string_converter_test.cpp
  DEPENDS
    libc.hdr.errno_macros
    libc.hdr.types.char32_t
    libc.hdr.types.char8_t
    libc.src.__support.error_or
    libc.src.__support.wchar.mbstate
    libc.src.__support.wchar.string_converter
)
167 changes: 167 additions & 0 deletions libc/test/src/__support/wchar/string_converter_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
//===-- Unittests for StringConverter class -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "hdr/errno_macros.h"
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/string_converter.h"
#include "test/UnitTest/Test.h"

// A null-terminated UTF-8 string pops as its code points, then the
// terminator, then -1 on every read past the end.
TEST(LlvmLibcStringConverterTest, UTF8To32) {
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::StringConverter;

  // U+1F921 (clown emoji, 4 bytes) followed by U+2211 (sigma symbol, 3 bytes)
  const char *utf8 = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
  mbstate state;
  StringConverter<char8_t> converter(reinterpret_cast<const char8_t *>(utf8),
                                     &state);

  // The trailing 0 is the literal's null terminator.
  const int expected[] = {0x1f921, 0x2211, 0};
  for (int code_point : expected) {
    auto popped = converter.popUTF32();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<int>(popped.value()), code_point);
  }

  auto past_end = converter.popUTF32();
  ASSERT_FALSE(past_end.has_value());
  ASSERT_EQ(past_end.error(), -1);
}

// A null-terminated UTF-32 string pops as its UTF-8 bytes, then the
// terminator, then -1 on every read past the end.
TEST(LlvmLibcStringConverterTest, UTF32To8) {
  // U+1F921 (clown emoji) followed by U+2211 (sigma symbol). A char32_t
  // literal is used so the converter reads real char32_t storage; casting a
  // wchar_t array breaks wherever wchar_t is not 32 bits wide.
  const char32_t *src = U"\x1f921\x2211";
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(src, &state);

  // UTF-8 encoding of both code points, then the null terminator.
  const int expected[] = {0xF0, 0x9F, 0xA4, 0xA1, 0xE2, 0x88, 0x91, 0};
  for (int byte : expected) {
    auto res = sc.popUTF8();
    ASSERT_TRUE(res.has_value());
    ASSERT_EQ(static_cast<int>(res.value()), byte);
  }

  auto res = sc.popUTF8();
  ASSERT_FALSE(res.has_value());
  ASSERT_EQ(res.error(), -1);
}

// With a source length of 1, only the first UTF-32 character is converted;
// the next pop reports -1 even though more input follows in memory.
TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
  // U+1F921 (clown emoji) followed by U+2211 (sigma symbol). A char32_t
  // literal is used so the converter reads real char32_t storage; casting a
  // wchar_t array breaks wherever wchar_t is not 32 bits wide.
  const char32_t *src = U"\x1f921\x2211";
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(src, 1, &state);

  // UTF-8 encoding of the single character inside the length limit.
  const int expected[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int byte : expected) {
    auto res = sc.popUTF8();
    ASSERT_TRUE(res.has_value());
    ASSERT_EQ(static_cast<int>(res.value()), byte);
  }

  auto res = sc.popUTF8();
  ASSERT_FALSE(res.has_value());
  ASSERT_EQ(res.error(), -1);
}

// A length limit that ends inside a multi-byte UTF-8 sequence: the complete
// first character converts and the truncated second one reports -1.
TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::StringConverter;

  // U+1F921 (clown emoji, 4 bytes) followed by U+2211 (sigma symbol, 3 bytes);
  // a limit of 5 exposes only the first byte of the second character.
  const char *utf8 = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
  mbstate state;
  StringConverter<char8_t> converter(reinterpret_cast<const char8_t *>(utf8), 5,
                                     &state);

  auto first = converter.popUTF32();
  ASSERT_TRUE(first.has_value());
  ASSERT_EQ(static_cast<int>(first.value()), 0x1f921);

  auto truncated = converter.popUTF32();
  ASSERT_FALSE(truncated.has_value());
  ASSERT_EQ(static_cast<int>(truncated.error()), -1);
}

// A valid UTF-32 character followed by an out-of-range value: the first
// converts normally and the second is rejected with EILSEQ.
TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
  // U+1F921 (clown emoji) followed by 0xFFFFFF, which the test expects the
  // converter to reject. A char32_t literal is used so the converter reads
  // real char32_t storage; casting a wchar_t array breaks wherever wchar_t is
  // not 32 bits wide.
  const char32_t *src = U"\x1f921\xffffff";
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(src, &state);

  // UTF-8 encoding of the valid first character.
  const int expected[] = {0xF0, 0x9F, 0xA4, 0xA1};
  for (int byte : expected) {
    auto res = sc.popUTF8();
    ASSERT_TRUE(res.has_value());
    ASSERT_EQ(static_cast<int>(res.value()), byte);
  }

  auto res = sc.popUTF8();
  ASSERT_FALSE(res.has_value());
  ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
}

// A null-terminated UTF-8 string whose last sequence is cut short by the
// terminator: the complete first character converts, the rest gives EILSEQ.
TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::StringConverter;

  // U+1F921 (4 bytes) followed by only the first two bytes of a 3-byte
  // sequence; the literal's null terminator comes next.
  const char *utf8 = "\xF0\x9F\xA4\xA1\xE2\x88";
  mbstate state;
  StringConverter<char8_t> converter(reinterpret_cast<const char8_t *>(utf8),
                                     &state);

  auto first = converter.popUTF32();
  ASSERT_TRUE(first.has_value());
  ASSERT_EQ(static_cast<int>(first.value()), 0x1f921);

  auto invalid = converter.popUTF32();
  ASSERT_FALSE(invalid.has_value());
  ASSERT_EQ(static_cast<int>(invalid.error()), EILSEQ);
}
Loading