diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index aa2079faed409..8bf6c402b0395 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1244,6 +1244,9 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.sys.socket.recv
     libc.src.sys.socket.recvfrom
     libc.src.sys.socket.recvmsg
+
+    # wchar.h entrypoints
+    libc.src.wchar.mbrtowc
   )
 endif()

diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index c88c357009072..e4b3cb0faa820 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -20,6 +20,14 @@ add_proxy_header_library(
     libc.include.uchar
 )

+add_proxy_header_library(
+  mbstate_t
+  HDRS
+    mbstate_t.h
+  DEPENDS
+    libc.include.llvm-libc-types.mbstate_t
+)
+
 add_proxy_header_library(
   div_t
   HDRS
diff --git a/libc/hdr/types/mbstate_t.h b/libc/hdr/types/mbstate_t.h
new file mode 100644
index 0000000000000..367c6af7a3ffb
--- /dev/null
+++ b/libc/hdr/types/mbstate_t.h
@@ -0,0 +1,22 @@
+//===-- Proxy for mbstate_t -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
+#define LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/mbstate_t.h"
+
+#else // Overlay mode
+
+#error "Cannot overlay mbstate_t"
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
diff --git a/libc/include/llvm-libc-types/mbstate_t.h b/libc/include/llvm-libc-types/mbstate_t.h
index 540d50975a264..009fe57da50e2 100644
--- a/libc/include/llvm-libc-types/mbstate_t.h
+++ b/libc/include/llvm-libc-types/mbstate_t.h
@@ -9,8 +9,12 @@
 #ifndef LLVM_LIBC_TYPES_MBSTATE_T_H
 #define LLVM_LIBC_TYPES_MBSTATE_T_H

-// TODO: Complete this once we implement functions that operate on this type.
+#include "../llvm-libc-macros/stdint-macros.h"
+
 typedef struct {
+  uint32_t __field1;
+  uint8_t __field2;
+  uint8_t __field3;
 } mbstate_t;

 #endif // LLVM_LIBC_TYPES_MBSTATE_T_H
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 84db73d8f01ea..c036636e12c32 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -29,6 +29,15 @@ functions:
     return_type: wint_t
     arguments:
       - type: int
+  - name: mbrtowc
+    standards:
+      - stdc
+    return_type: size_t
+    arguments:
+      - type: wchar_t *__restrict
+      - type: const char *__restrict
+      - type: size_t
+      - type: mbstate_t *__restrict
   - name: wmemset
     standards:
       - stdc
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 6715e354e23e5..479c1dff2c6e0 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,20 @@ add_object_library(
     libc.src.__support.math_extras
     .mbstate
 )
+
+add_object_library(
+  mbrtowc
+  HDRS
+    mbrtowc.h
+  SRCS
+    mbrtowc.cpp
+  DEPENDS
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+    libc.hdr.types.size_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.macros.config
+    .character_converter
+    .mbstate
+)
diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp
new file mode 100644
index 0000000000000..954c7458f4dfb
--- /dev/null
+++ b/libc/src/__support/wchar/mbrtowc.cpp
@@ -0,0 +1,58 @@
+//===-- Implementation for mbrtowc function ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/mbrtowc.h"
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Converts the next multibyte character in s to a wide character.
+// At most n bytes are read; the conversion runs through a CharacterConverter
+// bound to *ps, so a later call with the same state can continue it. Returns:
+//   - 0 if s is null or the decoded character is the null wide character,
+//   - the number of bytes read by this call once a character is complete,
+//   - -2 (converted to size_t) if the bytes read did not complete a character,
+//   - Error(-1) if the converter rejects a byte.
+// pwc may be null, in which case the decoded character is not stored.
+ErrorOr<size_t> mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
+                        size_t n, mbstate *__restrict ps) {
+  CharacterConverter char_conv(ps);
+  if (s == nullptr)
+    return 0;
+  size_t i = 0;
+  // Reading in bytes until we have a complete wc or error
+  for (; i < n && !char_conv.isFull(); ++i) {
+    int err = char_conv.push(static_cast<char8_t>(s[i]));
+    // Encoding error
+    if (err == -1)
+      return Error(-1);
+  }
+  auto wc = char_conv.pop_utf32();
+  if (wc.has_value()) {
+    if (pwc != nullptr)
+      *pwc = wc.value();
+    // null terminator -> return 0
+    if (wc.value() == L'\0')
+      return 0;
+    return i;
+  }
+  // Incomplete but potentially valid
+  return -2;
+}
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/mbrtowc.h b/libc/src/__support/wchar/mbrtowc.h
new file mode 100644
index 0000000000000..37329ee61beac
--- /dev/null
+++ b/libc/src/__support/wchar/mbrtowc.h
@@ -0,0 +1,29 @@
+//===-- Implementation header for mbrtowc function --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC_H
+#define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+ErrorOr<size_t> mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
+                        size_t n, mbstate *__restrict ps);
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC_H
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index fea693f73c3b5..32304a5215241 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -18,17 +18,17 @@ namespace internal {

 struct mbstate {
   // store a partial codepoint (in UTF-32)
-  char32_t partial;
+  char32_t partial = 0;

   /*
   Progress towards a conversion
     Increases with each push(...) until it reaches total_bytes
     Decreases with each pop(...) until it reaches 0
   */
-  uint8_t bytes_stored;
+  uint8_t bytes_stored = 0;

   // Total number of bytes that will be needed to represent this character
-  uint8_t total_bytes;
+  uint8_t total_bytes = 0;
 };

 } // namespace internal
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 491dd5b34340a..163c29847e6a2 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -34,6 +34,23 @@ add_entrypoint_object(
     libc.src.__support.wctype_utils
 )

+add_entrypoint_object(
+  mbrtowc
+  SRCS
+    mbrtowc.cpp
+  HDRS
+    mbrtowc.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.wchar.mbrtowc
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+)
+
 add_entrypoint_object(
   wmemset
   SRCS
diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp
new file mode 100644
index 0000000000000..cd429ab8d30e2
--- /dev/null
+++ b/libc/src/wchar/mbrtowc.cpp
@@ -0,0 +1,38 @@
+//===-- Implementation of mbrtowc -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbrtowc.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(size_t, mbrtowc,
+                   (wchar_t *__restrict pwc, const char *__restrict s, size_t n,
+                    mbstate_t *__restrict ps)) {
+  static internal::mbstate internal_mbstate;
+  auto ret = internal::mbrtowc(pwc, s, n,
+                               ps == nullptr
+                                   ? &internal_mbstate
+                                   : reinterpret_cast<internal::mbstate *>(ps));
+  if (!ret.has_value()) {
+    // Encoding failure: report EILSEQ and return (size_t)-1
+    libc_errno = EILSEQ;
+    return static_cast<size_t>(-1);
+  }
+  return ret.value();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbrtowc.h b/libc/src/wchar/mbrtowc.h
new file mode 100644
index 0000000000000..e2e3d3ebd2853
--- /dev/null
+++ b/libc/src/wchar/mbrtowc.h
@@ -0,0 +1,28 @@
+//===-- Implementation header for mbrtowc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
+#define LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Public mbrtowc entry point: converts the multibyte character at s (reading
+// at most n bytes) into *pwc. Conversion state is kept in *ps, or in a
+// function-local static state object when ps is null. On an encoding error
+// it sets errno to EILSEQ and returns (size_t)-1.
+size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n,
+               mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 4990b6953348b..d4cae1f6228bd 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -23,6 +23,20 @@ add_libc_test(
     libc.src.wchar.btowc
 )

+add_libc_test(
+  mbrtowc_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mbrtowc_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno
+    libc.src.string.memset
+    libc.src.wchar.mbrtowc
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+)
+
 add_libc_test(
   wctob_test
   SUITE
diff --git a/libc/test/src/wchar/mbrtowc_test.cpp b/libc/test/src/wchar/mbrtowc_test.cpp
new file mode 100644
index 0000000000000..69dcf00fde207
--- /dev/null
+++ b/libc/test/src/wchar/mbrtowc_test.cpp
@@ -0,0 +1,172 @@
+//===-- Unittests for mbrtowc ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/string/memset.h"
+#include "src/wchar/mbrtowc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMBRToWC, OneByte) {
+  const char *ch = "A";
+  wchar_t dest[2];
+  // Testing if it works with nullptr mbstate_t
+  mbstate_t *mb = nullptr;
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<char>(*dest), 'A');
+  ASSERT_EQ(static_cast<int>(n), 1);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 0, mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+}
+
+TEST(LlvmLibcMBRToWC, TwoByte) {
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; // U+008E (decimal 142)
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  ASSERT_EQ(static_cast<int>(n), 2);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading one more byte
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), 1);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+TEST(LlvmLibcMBRToWC, ThreeByte) {
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, &mb);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+  ASSERT_EQ(static_cast<int>(n), 3);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading two more bytes
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+}
+
+TEST(LlvmLibcMBRToWC, FourByte) {
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+  ASSERT_EQ(static_cast<int>(n), 4);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading two more bytes
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 2, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+}
+
+TEST(LlvmLibcMBRToWC, InvalidByte) {
+  const char ch[1] = {static_cast<char>(0x80)};
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}
+
+TEST(LlvmLibcMBRToWC, InvalidMultiByte) {
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+  // Trying to push just the first one should error
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+  // Trying to push the second and third should correspond to null wc
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  ASSERT_TRUE(*dest == L'\0');
+}
+
+TEST(LlvmLibcMBRToWC, InvalidLastByte) {
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}
+
+TEST(LlvmLibcMBRToWC, ValidTwoByteWithExtraRead) {
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 3 should return valid 2 byte
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+TEST(LlvmLibcMBRToWC, TwoValidTwoBytes) {
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  wchar_t dest[2];
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // mbstate should reset after reading first one
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  n = LIBC_NAMESPACE::mbrtowc(dest + 1, ch + 2, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*(dest + 1)), 460);
+}
+
+TEST(LlvmLibcMBRToWC, NullString) {
+  wchar_t dest[2] = {L'O', L'K'};
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // reading on nullptr should return 0
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, nullptr, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  ASSERT_TRUE(dest[0] == L'O');
+  // reading a null terminator should return 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+}