Skip to content

Commit 56a4f8d

Browse files
authored
[libc] Wchar Stringconverter (#146388)
Implemented a string converter class to encapsulate the logic of converting between utf8 <-> utf32
1 parent 00dacf8 commit 56a4f8d

File tree

7 files changed

+521
-1
lines changed

7 files changed

+521
-1
lines changed

libc/src/__support/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ add_subdirectory(time)
406406

407407
# Requires access to uchar header which is not on macos
408408
# Therefore, cannot currently build this on macos in overlay mode
409-
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
409+
if(NOT (LIBC_TARGET_OS_IS_DARWIN))
410410
add_subdirectory(wchar)
411411
endif()
412412

libc/src/__support/wchar/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,19 @@ add_header_library(
66
libc.hdr.types.char32_t
77
)
88

9+
# Header-only library target exposing string_converter.h.
# Its dependencies are the char8_t, char32_t and size_t type headers, the
# error_or support target, and the sibling .mbstate and .character_converter
# targets declared in this same directory.
add_header_library(
10+
string_converter
11+
HDRS
12+
string_converter.h
13+
DEPENDS
14+
libc.hdr.types.char8_t
15+
libc.hdr.types.char32_t
16+
libc.hdr.types.size_t
17+
libc.src.__support.error_or
18+
.mbstate
19+
.character_converter
20+
)
21+
922
add_object_library(
1023
character_converter
1124
HDRS
@@ -16,6 +29,7 @@ add_object_library(
1629
libc.hdr.errno_macros
1730
libc.hdr.types.char8_t
1831
libc.hdr.types.char32_t
32+
libc.hdr.types.size_t
1933
libc.src.__support.error_or
2034
libc.src.__support.math_extras
2135
.mbstate

libc/src/__support/wchar/character_converter.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "hdr/errno_macros.h"
1010
#include "hdr/types/char32_t.h"
1111
#include "hdr/types/char8_t.h"
12+
#include "hdr/types/size_t.h"
1213
#include "src/__support/CPP/bit.h"
1314
#include "src/__support/common.h"
1415
#include "src/__support/error_or.h"
@@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
9293
state->bytes_stored++;
9394
return 0;
9495
}
96+
9597
// Invalid byte -> reset the state
9698
clear();
9799
return EILSEQ;
@@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
130132
return utf32;
131133
}
132134

135+
// Number of UTF-32 elements needed to hold the current character. This is
// always one, because a single UTF-32 value can hold an entire character.
size_t CharacterConverter::sizeAsUTF32() {
  constexpr size_t UTF32_ELEMENTS_PER_CHARACTER = 1;
  return UTF32_ELEMENTS_PER_CHARACTER;
}
138+
139+
// Size of the current character when written as UTF-8, read directly from the
// total_bytes field of the conversion state.
size_t CharacterConverter::sizeAsUTF8() {
  const size_t utf8_size = state->total_bytes;
  return utf8_size;
}
140+
133141
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
134142
if (isEmpty())
135143
return Error(-1);
@@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
156164
}
157165

158166
state->bytes_stored--;
167+
if (state->bytes_stored == 0)
168+
clear();
169+
159170
return static_cast<char8_t>(output);
160171
}
161172

libc/src/__support/wchar/character_converter.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "hdr/types/char32_t.h"
1313
#include "hdr/types/char8_t.h"
14+
#include "hdr/types/size_t.h"
1415
#include "src/__support/common.h"
1516
#include "src/__support/error_or.h"
1617
#include "src/__support/wchar/mbstate.h"
@@ -30,6 +31,9 @@ class CharacterConverter {
3031
bool isEmpty();
3132
bool isValidState();
3233

34+
size_t sizeAsUTF32();
35+
size_t sizeAsUTF8();
36+
3337
int push(char8_t utf8_byte);
3438
int push(char32_t utf32);
3539

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
//===-- Definition of a class for string conversion ------------*-- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
10+
#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
11+
12+
#include "hdr/types/char32_t.h"
13+
#include "hdr/types/char8_t.h"
14+
#include "hdr/types/size_t.h"
15+
#include "src/__support/common.h"
16+
#include "src/__support/error_or.h"
17+
#include "src/__support/wchar/character_converter.h"
18+
#include "src/__support/wchar/mbstate.h"
19+
20+
namespace LIBC_NAMESPACE_DECL {
21+
namespace internal {
22+
23+
template <typename T> class StringConverter {
24+
private:
25+
CharacterConverter cr;
26+
const T *src;
27+
size_t src_len;
28+
size_t src_idx;
29+
30+
// # of pops we are allowed to perform (essentially size of the dest buffer)
31+
size_t num_to_write;
32+
33+
ErrorOr<size_t> pushFullCharacter() {
34+
size_t num_pushed;
35+
for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
36+
++num_pushed) {
37+
int err = cr.push(src[src_idx + num_pushed]);
38+
if (err != 0)
39+
return Error(err);
40+
}
41+
42+
// if we aren't able to read a full character from the source string
43+
if (src_idx + num_pushed == src_len && !cr.isFull()) {
44+
src_idx += num_pushed;
45+
return Error(-1);
46+
}
47+
48+
return num_pushed;
49+
}
50+
51+
public:
52+
StringConverter(const T *s, mbstate *ps, size_t dstlen,
53+
size_t srclen = SIZE_MAX)
54+
: cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen) {}
55+
56+
// TODO: following functions are almost identical
57+
// look into templating CharacterConverter pop functions
58+
ErrorOr<char32_t> popUTF32() {
59+
if (cr.isEmpty() || src_idx == 0) {
60+
auto src_elements_read = pushFullCharacter();
61+
if (!src_elements_read.has_value())
62+
return Error(src_elements_read.error());
63+
64+
if (cr.sizeAsUTF32() > num_to_write) {
65+
cr.clear();
66+
return Error(-1);
67+
}
68+
69+
src_idx += src_elements_read.value();
70+
}
71+
72+
auto out = cr.pop_utf32();
73+
if (out.has_value() && out.value() == L'\0')
74+
src_len = src_idx;
75+
76+
num_to_write--;
77+
78+
return out;
79+
}
80+
81+
ErrorOr<char8_t> popUTF8() {
82+
if (cr.isEmpty() || src_idx == 0) {
83+
auto src_elements_read = pushFullCharacter();
84+
if (!src_elements_read.has_value())
85+
return Error(src_elements_read.error());
86+
87+
if (cr.sizeAsUTF8() > num_to_write) {
88+
cr.clear();
89+
return Error(-1);
90+
}
91+
92+
src_idx += src_elements_read.value();
93+
}
94+
95+
auto out = cr.pop_utf8();
96+
if (out.has_value() && out.value() == '\0')
97+
src_len = src_idx;
98+
99+
num_to_write--;
100+
101+
return out;
102+
}
103+
104+
size_t getSourceIndex() { return src_idx; }
105+
};
106+
107+
} // namespace internal
108+
} // namespace LIBC_NAMESPACE_DECL
109+
110+
#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H

libc/test/src/__support/wchar/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,18 @@ add_libc_test(
1919
DEPENDS
2020
libc.src.__support.wchar.character_converter
2121
)
22+
23+
24+
# Unit test target for the string converter, registered in the
# libc-support-tests suite and built from string_converter_test.cpp.
add_libc_test(
25+
string_converter_test
26+
SUITE
27+
libc-support-tests
28+
SRCS
29+
string_converter_test.cpp
30+
DEPENDS
31+
libc.src.__support.wchar.string_converter
32+
libc.src.__support.wchar.mbstate
33+
libc.src.__support.error_or
34+
libc.hdr.errno_macros
35+
libc.hdr.types.char32_t
36+
)

0 commit comments

Comments
 (0)