From 1ac2bff191b894ee64c4595443b55f117ce16909 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Tue, 17 Jun 2025 20:19:30 +0000 Subject: [PATCH 1/8] created initial files and internal function --- libc/src/__support/wchar/CMakeLists.txt | 15 +++++++++ libc/src/__support/wchar/mbrtowc.cpp | 44 +++++++++++++++++++++++++ libc/src/__support/wchar/mbrtowc.h | 28 ++++++++++++++++ libc/src/wchar/mbrtowc.cpp | 31 +++++++++++++++++ libc/src/wchar/mbrtowc.h | 23 +++++++++++++ 5 files changed, 141 insertions(+) create mode 100644 libc/src/__support/wchar/mbrtowc.cpp create mode 100644 libc/src/__support/wchar/mbrtowc.h create mode 100644 libc/src/wchar/mbrtowc.cpp create mode 100644 libc/src/wchar/mbrtowc.h diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt index 6715e354e23e5..03703cda9f430 100644 --- a/libc/src/__support/wchar/CMakeLists.txt +++ b/libc/src/__support/wchar/CMakeLists.txt @@ -19,3 +19,18 @@ add_object_library( libc.src.__support.math_extras .mbstate ) + +add_object_library( + mbrtowc + HDRS + mbrtowc.h + SRCS + mbrtowc.cpp + DEPENDS + libc.hdr.types.wchar_t + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + .character_converter + .mbstate +) diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp new file mode 100644 index 0000000000000..5108e3779d527 --- /dev/null +++ b/libc/src/__support/wchar/mbrtowc.cpp @@ -0,0 +1,44 @@ +//===-- Implementation for mbrtowc function ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/wchar/mbrtowc.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, + size_t n, mbstate_t *__restrict ps) { + CharacterConverter char_conv((internal::mbstate *)ps); + if (s == nullptr) + return 0; + size_t i = 0; + // Reading in bytes until we have a complete wc or error + for (; i < n & !char_conv.isComplete(); ++i) { + int err = char_conv.push(static_cast(s[i])); + // Encoding error + if (err == -1) + return Error(-1); + } + auto wc = char_conv.pop_utf32(); + if (wc.has_value()) { + *pwc = wc.value(); + return i; + } + // Incomplete but potentially valid + return Error(-2); +} + +} // namespace internal + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wchar/mbrtowc.h b/libc/src/__support/wchar/mbrtowc.h new file mode 100644 index 0000000000000..40ef524d9b9e7 --- /dev/null +++ b/libc/src/__support/wchar/mbrtowc.h @@ -0,0 +1,28 @@ +//===-- Implementation header for mbrtowc function --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC +#define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC + +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, + size_t n, mbstate_t *__restrict ps); + +} // namespace internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp new file mode 100644 index 0000000000000..c39e768e78039 --- /dev/null +++ b/libc/src/wchar/mbrtowc.cpp @@ -0,0 +1,31 @@ +//===-- Implementation of mbrtowc -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/mbrtowc.h" + +#include "hdr/types/size_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbrtowc.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(size_t, mbrtowc, + (wchar_t *__restrict pwc, const char *__restrict s, size_t n, + mbstate_t *__restrict ps)) { + auto ret = internal::mbrtowc(pwc, s, n, ps); + if (!ret.has_value()) { + if (ret.error() == -1) { + + } + } + return ret.value(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/mbrtowc.h b/libc/src/wchar/mbrtowc.h new file mode 100644 index 0000000000000..6883e26ebcb62 --- /dev/null +++ b/libc/src/wchar/mbrtowc.h @@ -0,0 +1,23 @@ +//===-- Implementation header for mbrtowc ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_MBRTOWC_H +#define LLVM_LIBC_SRC_WCHAR_MBRTOWC_H + +#include "hdr/types/size_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n, + mbstate_t *__restrict ps); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_MBRTOWC_H From 98ea7aebb251127d7b91950b4d1288d7da475b46 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Wed, 18 Jun 2025 17:26:56 +0000 Subject: [PATCH 2/8] [libc] mbrtowc implementation implemented the internal and public mbrtowc as well as tests for the public function. 
--- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/hdr/types/CMakeLists.txt | 8 ++ libc/hdr/types/mbstate_t.h | 22 +++ libc/include/llvm-libc-types/mbstate_t.h | 6 +- libc/include/wchar.yaml | 9 ++ libc/src/__support/wchar/CMakeLists.txt | 1 + libc/src/__support/wchar/mbrtowc.cpp | 14 +- libc/src/__support/wchar/mbrtowc.h | 3 +- libc/src/wchar/CMakeLists.txt | 17 +++ libc/src/wchar/mbrtowc.cpp | 13 +- libc/src/wchar/mbrtowc.h | 3 +- libc/test/src/wchar/CMakeLists.txt | 14 ++ libc/test/src/wchar/mbrtowc_test.cpp | 170 +++++++++++++++++++++++ 13 files changed, 272 insertions(+), 9 deletions(-) create mode 100644 libc/hdr/types/mbstate_t.h create mode 100644 libc/test/src/wchar/mbrtowc_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index aa2079faed409..10509a0c25835 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -365,6 +365,7 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.btowc + libc.src.wchar.mbrtowc libc.src.wchar.wcslen libc.src.wchar.wctob libc.src.wchar.wmemmove diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index c88c357009072..e4b3cb0faa820 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -20,6 +20,14 @@ add_proxy_header_library( libc.include.uchar ) +add_proxy_header_library( + mbstate_t + HDRS + mbstate_t.h + DEPENDS + libc.include.llvm-libc-types.mbstate_t +) + add_proxy_header_library( div_t HDRS diff --git a/libc/hdr/types/mbstate_t.h b/libc/hdr/types/mbstate_t.h new file mode 100644 index 0000000000000..15b2614341d7d --- /dev/null +++ b/libc/hdr/types/mbstate_t.h @@ -0,0 +1,22 @@ +//===-- Proxy header for mbstate_t ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_MBSTATE_T_H +#define LLVM_LIBC_HDR_TYPES_MBSTATE_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/mbstate_t.h" + +#else // Overlay mode + +#include "hdr/wchar_overlay.h" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_MBSTATE_T_H diff --git a/libc/include/llvm-libc-types/mbstate_t.h b/libc/include/llvm-libc-types/mbstate_t.h index 540d50975a264..009fe57da50e2 100644 --- a/libc/include/llvm-libc-types/mbstate_t.h +++ b/libc/include/llvm-libc-types/mbstate_t.h @@ -9,8 +9,12 @@ #ifndef LLVM_LIBC_TYPES_MBSTATE_T_H #define LLVM_LIBC_TYPES_MBSTATE_T_H -// TODO: Complete this once we implement functions that operate on this type. +#include "../llvm-libc-macros/stdint-macros.h" + typedef struct { + uint32_t __field1; + uint8_t __field2; + uint8_t __field3; } mbstate_t; #endif // LLVM_LIBC_TYPES_MBSTATE_T_H diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 84db73d8f01ea..06c621f59b462 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -29,6 +29,15 @@ functions: return_type: wint_t arguments: - type: int + - name: mbrtowc + standards: + - stdc + return_type: size_t + arguments: + - type: wchar_t * __restrict + - type: const char * __restrict + - type: size_t + - type: mbstate_t * __restrict - name: wmemset standards: - stdc diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt index 03703cda9f430..479c1dff2c6e0 100644 --- a/libc/src/__support/wchar/CMakeLists.txt +++ b/libc/src/__support/wchar/CMakeLists.txt @@ -28,6 +28,7 @@ add_object_library( mbrtowc.cpp DEPENDS libc.hdr.types.wchar_t + libc.hdr.types.size_t libc.src.__support.common libc.src.__support.error_or libc.src.__support.macros.config diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp 
index 5108e3779d527..969448ee60e81 100644 --- a/libc/src/__support/wchar/mbrtowc.cpp +++ b/libc/src/__support/wchar/mbrtowc.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/__support/wchar/mbrtowc.h" +#include "hdr/types/mbstate_t.h" +#include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/error_or.h" @@ -18,21 +20,25 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, - size_t n, mbstate_t *__restrict ps) { - CharacterConverter char_conv((internal::mbstate *)ps); + size_t n, mbstate *__restrict ps) { + CharacterConverter char_conv(ps); if (s == nullptr) return 0; size_t i = 0; + auto wc = char_conv.pop_utf32(); // Reading in bytes until we have a complete wc or error - for (; i < n & !char_conv.isComplete(); ++i) { + for (; i < n && !wc.has_value(); ++i) { int err = char_conv.push(static_cast(s[i])); // Encoding error if (err == -1) return Error(-1); + wc = char_conv.pop_utf32(); } - auto wc = char_conv.pop_utf32(); if (wc.has_value()) { *pwc = wc.value(); + // null terminator -> return 0 + if (wc.value() == L'\0') + return 0; return i; } // Incomplete but potentially valid diff --git a/libc/src/__support/wchar/mbrtowc.h b/libc/src/__support/wchar/mbrtowc.h index 40ef524d9b9e7..37329ee61beac 100644 --- a/libc/src/__support/wchar/mbrtowc.h +++ b/libc/src/__support/wchar/mbrtowc.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC #define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC +#include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/error_or.h" @@ -19,7 +20,7 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, - size_t n, mbstate_t *__restrict ps); + size_t n, mbstate *__restrict ps); } // namespace internal diff --git 
a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 491dd5b34340a..163c29847e6a2 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -34,6 +34,23 @@ add_entrypoint_object( libc.src.__support.wctype_utils ) +add_entrypoint_object( + mbrtowc + SRCS + mbrtowc.cpp + HDRS + mbrtowc.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.types.mbstate_t + libc.hdr.types.wchar_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.wchar.mbrtowc + libc.src.__support.libc_errno + libc.src.__support.wchar.mbstate +) + add_entrypoint_object( wmemset SRCS diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp index c39e768e78039..c29c5ee161e32 100644 --- a/libc/src/wchar/mbrtowc.cpp +++ b/libc/src/wchar/mbrtowc.cpp @@ -8,22 +8,31 @@ #include "src/wchar/mbrtowc.h" +#include "hdr/types/mbstate_t.h" #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/wchar/mbrtowc.h" +#include "src/__support/wchar/mbstate.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(size_t, mbrtowc, (wchar_t *__restrict pwc, const char *__restrict s, size_t n, mbstate_t *__restrict ps)) { - auto ret = internal::mbrtowc(pwc, s, n, ps); + static mbstate_t internal_mbstate{0, 0, 0}; + auto ret = internal::mbrtowc( + pwc, s, n, (internal::mbstate *)(ps == nullptr ? &internal_mbstate : ps)); if (!ret.has_value()) { + // Encoding failure if (ret.error() == -1) { - + libc_errno = EILSEQ; + return -1; } + // Could potentially read a valid wide character. 
+ return -2; } return ret.value(); } diff --git a/libc/src/wchar/mbrtowc.h b/libc/src/wchar/mbrtowc.h index 6883e26ebcb62..e2e3d3ebd2853 100644 --- a/libc/src/wchar/mbrtowc.h +++ b/libc/src/wchar/mbrtowc.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC_WCHAR_MBRTOWC_H #define LLVM_LIBC_SRC_WCHAR_MBRTOWC_H +#include "hdr/types/mbstate_t.h" #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/macros/config.h" @@ -16,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n, - mbstate_t *__restrict ps); + mbstate_t *__restrict ps); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 4990b6953348b..d4cae1f6228bd 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -23,6 +23,20 @@ add_libc_test( libc.src.wchar.btowc ) +add_libc_test( + mbrtowc_test + SUITE + libc_wchar_unittests + SRCS + mbrtowc_test.cpp + DEPENDS + libc.src.__support.libc_errno + libc.src.string.memset + libc.src.wchar.mbrtowc + libc.hdr.types.mbstate_t + libc.hdr.types.wchar_t +) + add_libc_test( wctob_test SUITE diff --git a/libc/test/src/wchar/mbrtowc_test.cpp b/libc/test/src/wchar/mbrtowc_test.cpp new file mode 100644 index 0000000000000..6e96e7ac31f49 --- /dev/null +++ b/libc/test/src/wchar/mbrtowc_test.cpp @@ -0,0 +1,170 @@ +//===-- Unittests for mbrtowc ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/mbstate_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/libc_errno.h" +#include "src/string/memset.h" +#include "src/wchar/mbrtowc.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcMBRToWC, OneByte) { + const char *ch = "A"; + wchar_t dest[2]; + // Testing if it works with nullptr mbstate_t + mbstate_t *mb = nullptr; + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); + ASSERT_EQ(static_cast(*dest), 'A'); + ASSERT_EQ(static_cast(n), 1); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrtowc(dest, ch, 0, mb); + ASSERT_EQ(static_cast(n), -2); +} + +TEST(LlvmLibcMBRToWC, TwoByte) { + const char ch[2] = {static_cast(0xC2), + static_cast(0x8E)}; // U+008E, a C1 control (142) + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, mb); + ASSERT_EQ(static_cast(*dest), 142); + ASSERT_EQ(static_cast(n), 2); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); + ASSERT_EQ(static_cast(n), -2); + // Should pass after reading one more byte + n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 1, mb); + ASSERT_EQ(static_cast(n), 1); + ASSERT_EQ(static_cast(*dest), 142); +} + +TEST(LlvmLibcMBRToWC, ThreeByte) { + const char ch[3] = {static_cast(0xE2), static_cast(0x88), + static_cast(0x91)}; // ∑ sigma symbol + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, mb); + ASSERT_EQ(static_cast(*dest), 8721); + ASSERT_EQ(static_cast(n), 3); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); + ASSERT_EQ(static_cast(n), -2); + // Should pass after reading two more bytes + n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, mb); + ASSERT_EQ(static_cast(n), 
2); + ASSERT_EQ(static_cast(*dest), 8721); +} + +TEST(LlvmLibcMBRToWC, FourByte) { + const char ch[4] = {static_cast(0xF0), static_cast(0x9F), + static_cast(0xA4), + static_cast(0xA1)}; // 🤡 clown emoji + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, mb); + ASSERT_EQ(static_cast(*dest), 129313); + ASSERT_EQ(static_cast(n), 4); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, mb); + ASSERT_EQ(static_cast(n), -2); + // Should pass after reading two more bytes + n = LIBC_NAMESPACE::mbrtowc(dest, ch + 2, 2, mb); + ASSERT_EQ(static_cast(n), 2); + ASSERT_EQ(static_cast(*dest), 129313); +} + +TEST(LlvmLibcMBRToWC, InvalidByte) { + const char ch[1] = {static_cast(0x80)}; + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); + ASSERT_EQ(static_cast(n), -1); + ASSERT_EQ(static_cast(libc_errno), EILSEQ); +} + +TEST(LlvmLibcMBRToWC, InvalidMultiByte) { + const char ch[4] = {static_cast(0x80), static_cast(0x00), + static_cast(0x80), + static_cast(0x00)}; // invalid sequence of bytes + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // Trying to push all 4 should error + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, mb); + ASSERT_EQ(static_cast(n), -1); + ASSERT_EQ(static_cast(libc_errno), EILSEQ); + // Trying to push just the first one should error + n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); + ASSERT_EQ(static_cast(n), -1); + ASSERT_EQ(static_cast(libc_errno), EILSEQ); + // Trying to push the second and third should correspond to null wc + n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, mb); + ASSERT_EQ(static_cast(n), 0); +} + +TEST(LlvmLibcMBRToWC, InvalidLastByte) { + // Last byte is invalid since it does not have correct starting sequence. 
+ // 0xC0 --> 11000000 starting sequence should be 10xxxxxx + const char ch[4] = {static_cast(0xF1), static_cast(0x80), + static_cast(0x80), static_cast(0xC0)}; + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // Trying to push all 4 should error + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, mb); + ASSERT_EQ(static_cast(n), -1); + ASSERT_EQ(static_cast(libc_errno), EILSEQ); +} + +TEST(LlvmLibcMBRToWC, ValidTwoByteWithExtraRead) { + const char ch[3] = {static_cast(0xC2), static_cast(0x8E), + static_cast(0x80)}; + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // Trying to push all 3 should return valid 2 byte + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, mb); + ASSERT_EQ(static_cast(n), 2); + ASSERT_EQ(static_cast(*dest), 142); +} + +TEST(LlvmLibcMBRToWC, TwoValidTwoBytes) { + const char ch[4] = {static_cast(0xC2), static_cast(0x8E), + static_cast(0xC7), static_cast(0x8C)}; + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // mbstate should reset after reading first one + size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, mb); + ASSERT_EQ(static_cast(n), 2); + ASSERT_EQ(static_cast(*dest), 142); + n = LIBC_NAMESPACE::mbrtowc(dest + 1, ch + 2, 2, mb); + ASSERT_EQ(static_cast(n), 2); + ASSERT_EQ(static_cast(*(dest + 1)), 460); +} + +TEST(LlvmLibcMBRToWC, NullString) { + wchar_t dest[2]; + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // reading on nullptr should return 0 + size_t n = LIBC_NAMESPACE::mbrtowc(dest, nullptr, 2, mb); + ASSERT_EQ(static_cast(n), 0); + // reading a null terminator should return 0 + const char *ch = "\0"; + n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); + ASSERT_EQ(static_cast(n), 0); +} From a34d1cac7b456fc0c42f2f67dd0326970c9bd538 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Wed, 18 Jun 2025 20:35:34 +0000 Subject: [PATCH 3/8] mbstate struct change for consistency --- 
libc/src/__support/wchar/mbstate.h | 6 +++--- libc/src/wchar/mbrtowc.cpp | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h index fb08fb4eaa188..087d3c0fbc4f0 100644 --- a/libc/src/__support/wchar/mbstate.h +++ b/libc/src/__support/wchar/mbstate.h @@ -18,17 +18,17 @@ namespace internal { struct mbstate { // store a partial codepoint (in UTF-32) - char32_t partial; + char32_t partial = 0; /* Progress towards a conversion For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte) For utf32 -> utf8, increases with each CharacterConverter::pop_utf8() */ - uint8_t bytes_processed; + uint8_t bytes_processed = 0; // Total number of bytes that will be needed to represent this character - uint8_t total_bytes; + uint8_t total_bytes = 0; }; } // namespace internal diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp index c29c5ee161e32..0a393890cf700 100644 --- a/libc/src/wchar/mbrtowc.cpp +++ b/libc/src/wchar/mbrtowc.cpp @@ -22,9 +22,11 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(size_t, mbrtowc, (wchar_t *__restrict pwc, const char *__restrict s, size_t n, mbstate_t *__restrict ps)) { - static mbstate_t internal_mbstate{0, 0, 0}; - auto ret = internal::mbrtowc( - pwc, s, n, (internal::mbstate *)(ps == nullptr ? &internal_mbstate : ps)); + static internal::mbstate internal_mbstate; + auto ret = internal::mbrtowc(pwc, s, n, + ps == nullptr + ? 
&internal_mbstate + : reinterpret_cast(ps)); if (!ret.has_value()) { // Encoding failure if (ret.error() == -1) { From 9464170a67939c634db4210f4923ede02e642ebc Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Wed, 18 Jun 2025 21:17:48 +0000 Subject: [PATCH 4/8] logic simplification --- libc/src/__support/wchar/character_converter.cpp | 9 +++++++-- libc/src/__support/wchar/character_converter.h | 1 + libc/src/__support/wchar/mbrtowc.cpp | 7 +++---- libc/src/wchar/mbrtowc.cpp | 8 ++------ libc/test/src/wchar/mbrtowc_test.cpp | 4 +++- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 3b9046dfb9a76..e521a4a0e380e 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -31,6 +31,11 @@ bool CharacterConverter::isComplete() { return state->bytes_processed == state->total_bytes; } +bool CharacterConverter::isFull() { + return (state->bytes_processed == state->total_bytes) && + (state->total_bytes != 0); +} + int CharacterConverter::push(char8_t utf8_byte) { uint8_t num_ones = static_cast(cpp::countl_one(utf8_byte)); // Checking the first byte if first push @@ -62,7 +67,7 @@ int CharacterConverter::push(char8_t utf8_byte) { // Any subsequent push // Adding 6 more bits so need to left shift constexpr size_t ENCODED_BITS_PER_UTF8 = 6; - if (num_ones == 1 && !isComplete()) { + if (num_ones == 1 && !isFull()) { char32_t byte = utf8_byte & mask_trailing_ones(); state->partial = state->partial << ENCODED_BITS_PER_UTF8; @@ -102,7 +107,7 @@ int CharacterConverter::push(char32_t utf32) { ErrorOr CharacterConverter::pop_utf32() { // If pop is called too early, do not reset the state, use error to determine // whether enough bytes have been pushed - if (!isComplete() || state->bytes_processed == 0) + if (!isFull() || state->bytes_processed == 0) return Error(-1); char32_t utf32 = state->partial; // 
reset if successful pop diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index c4ba7cf6b689f..5c7b56ea906bc 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -27,6 +27,7 @@ class CharacterConverter { void clear(); bool isComplete(); + bool isFull(); int push(char8_t utf8_byte); int push(char32_t utf32); diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp index 969448ee60e81..954c7458f4dfb 100644 --- a/libc/src/__support/wchar/mbrtowc.cpp +++ b/libc/src/__support/wchar/mbrtowc.cpp @@ -25,15 +25,14 @@ ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, if (s == nullptr) return 0; size_t i = 0; - auto wc = char_conv.pop_utf32(); // Reading in bytes until we have a complete wc or error - for (; i < n && !wc.has_value(); ++i) { + for (; i < n && !char_conv.isFull(); ++i) { int err = char_conv.push(static_cast(s[i])); // Encoding error if (err == -1) return Error(-1); - wc = char_conv.pop_utf32(); } + auto wc = char_conv.pop_utf32(); if (wc.has_value()) { *pwc = wc.value(); // null terminator -> return 0 @@ -42,7 +41,7 @@ ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, return i; } // Incomplete but potentially valid - return Error(-2); + return -2; } } // namespace internal diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp index 0a393890cf700..cd429ab8d30e2 100644 --- a/libc/src/wchar/mbrtowc.cpp +++ b/libc/src/wchar/mbrtowc.cpp @@ -29,12 +29,8 @@ LLVM_LIBC_FUNCTION(size_t, mbrtowc, : reinterpret_cast(ps)); if (!ret.has_value()) { // Encoding failure - if (ret.error() == -1) { - libc_errno = EILSEQ; - return -1; - } - // Could potentially read a valid wide character. 
- return -2; + libc_errno = EILSEQ; + return -1; } return ret.value(); } diff --git a/libc/test/src/wchar/mbrtowc_test.cpp b/libc/test/src/wchar/mbrtowc_test.cpp index 6e96e7ac31f49..69dcf00fde207 100644 --- a/libc/test/src/wchar/mbrtowc_test.cpp +++ b/libc/test/src/wchar/mbrtowc_test.cpp @@ -113,6 +113,7 @@ TEST(LlvmLibcMBRToWC, InvalidMultiByte) { // Trying to push the second and third should correspond to null wc n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, mb); ASSERT_EQ(static_cast(n), 0); + ASSERT_TRUE(*dest == L'\0'); } TEST(LlvmLibcMBRToWC, InvalidLastByte) { @@ -157,12 +158,13 @@ TEST(LlvmLibcMBRToWC, TwoValidTwoBytes) { } TEST(LlvmLibcMBRToWC, NullString) { - wchar_t dest[2]; + wchar_t dest[2] = {L'O', L'K'}; mbstate_t *mb; LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // reading on nullptr should return 0 size_t n = LIBC_NAMESPACE::mbrtowc(dest, nullptr, 2, mb); ASSERT_EQ(static_cast(n), 0); + ASSERT_TRUE(dest[0] == L'O'); // reading a null terminator should return 0 const char *ch = "\0"; n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb); From e9d7cdc413f16c0e60c5ada4ce12959ace4b3263 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Wed, 18 Jun 2025 21:48:04 +0000 Subject: [PATCH 5/8] removed unnecessary check --- libc/src/__support/wchar/character_converter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 6f83cc7b17d9b..17075f282869a 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -114,7 +114,7 @@ int CharacterConverter::push(char32_t utf32) { ErrorOr CharacterConverter::pop_utf32() { // If pop is called too early, do not reset the state, use error to determine // whether enough bytes have been pushed - if (!isFull() || state->bytes_processed == 0) + if (!isFull()) return Error(-1); char32_t utf32 = state->partial; // reset if successful pop From 
5bcab104174c7fd0d3ce6f4d7a44d740e1de3a97 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Fri, 20 Jun 2025 17:16:18 +0000 Subject: [PATCH 6/8] build files fixes --- libc/config/linux/x86_64/entrypoints.txt | 4 +++- libc/hdr/types/mbstate_t.h | 2 +- libc/include/wchar.yaml | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 10509a0c25835..bc76db32faaa7 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -365,7 +365,6 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.btowc - libc.src.wchar.mbrtowc libc.src.wchar.wcslen libc.src.wchar.wctob libc.src.wchar.wmemmove @@ -990,6 +989,9 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.strings.strcasecmp_l libc.src.strings.strncasecmp_l + # wchar.h entrypoints + libc.src.wchar.mbrtowc + # assert.h entrypoints libc.src.assert.__assert_fail diff --git a/libc/hdr/types/mbstate_t.h b/libc/hdr/types/mbstate_t.h index 15b2614341d7d..367c6af7a3ffb 100644 --- a/libc/hdr/types/mbstate_t.h +++ b/libc/hdr/types/mbstate_t.h @@ -15,7 +15,7 @@ #else // Overlay mode -#include "hdr/wchar_overlay.h" +#error "Cannot overlay mbstate_t" #endif // LLVM_LIBC_FULL_BUILD diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 06c621f59b462..c036636e12c32 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -34,10 +34,10 @@ functions: - stdc return_type: size_t arguments: - - type: wchar_t * __restrict - - type: const char * __restrict + - type: wchar_t *__restrict + - type: const char *__restrict - type: size_t - - type: mbstate_t * __restrict + - type: mbstate_t *__restrict - name: wmemset standards: - stdc From db18034d33e5650814d9c19ead749deb14b36912 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Fri, 20 Jun 2025 17:45:12 +0000 Subject: [PATCH 7/8] moved wchar to the bottom of entrypoints --- libc/config/linux/x86_64/entrypoints.txt | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index bc76db32faaa7..8bf6c402b0395 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -989,9 +989,6 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.strings.strcasecmp_l libc.src.strings.strncasecmp_l - # wchar.h entrypoints - libc.src.wchar.mbrtowc - # assert.h entrypoints libc.src.assert.__assert_fail @@ -1247,6 +1244,9 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.sys.socket.recv libc.src.sys.socket.recvfrom libc.src.sys.socket.recvmsg + + # wchar.h entrypoints + libc.src.wchar.mbrtowc ) endif() From c8eae21b0dc93c3f5bba0b0169519b10337b47d0 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Fri, 20 Jun 2025 17:52:07 +0000 Subject: [PATCH 8/8] removed iscomplete from header --- libc/src/__support/wchar/character_converter.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index 265627665146e..be0e6129df236 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -26,7 +26,6 @@ class CharacterConverter { CharacterConverter(mbstate *mbstate); void clear(); - bool isComplete(); bool isFull(); bool isEmpty();