From 69768eb6dbc63755abf8140d2584edc4f91928f5 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Tue, 22 Jul 2025 21:43:27 +0000 Subject: [PATCH 1/5] implemented mblen --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/include/wchar.yaml | 7 ++ libc/src/wchar/CMakeLists.txt | 16 ++++ libc/src/wchar/mblen.cpp | 38 ++++++++ libc/src/wchar/mblen.h | 21 +++++ libc/src/wchar/mbrlen.cpp | 38 ++++++++ libc/src/wchar/mbrlen.h | 22 +++++ libc/test/src/wchar/CMakeLists.txt | 13 +++ libc/test/src/wchar/mblen_test.cpp | 105 +++++++++++++++++++++++ 9 files changed, 261 insertions(+) create mode 100644 libc/src/wchar/mblen.cpp create mode 100644 libc/src/wchar/mblen.h create mode 100644 libc/src/wchar/mbrlen.cpp create mode 100644 libc/src/wchar/mbrlen.h create mode 100644 libc/test/src/wchar/mblen_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 381359cec6f1d..5610c946a90fb 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1261,6 +1261,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.sys.socket.recvmsg # wchar.h entrypoints + libc.src.wchar.mblen libc.src.wchar.mbrtowc libc.src.wchar.mbtowc libc.src.wchar.wcrtomb diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 123d3440aeec3..605e505dc3bc7 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -53,6 +53,13 @@ functions: - type: wchar_t *__restrict - type: const char *__restrict - type: size_t + - name: mblen + standards: + - stdc + return_type: int + arguments: + - type: const char *__restrict + - type: size_t - name: wmemset standards: - stdc diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 7ace1a6ca66ba..5bc294eeec587 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -159,6 +159,22 @@ add_entrypoint_object( libc.src.__support.wchar.mbstate ) +add_entrypoint_object( + mblen + SRCS + mblen.cpp + HDRS + mblen.h + 
DEPENDS + libc.hdr.types.size_t + libc.hdr.types.wchar_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.libc_errno + libc.src.__support.wchar.mbrtowc + libc.src.__support.wchar.mbstate +) + add_entrypoint_object( wmemset SRCS diff --git a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp new file mode 100644 index 0000000000000..0f7b94dcf7ca5 --- /dev/null +++ b/libc/src/wchar/mblen.cpp @@ -0,0 +1,38 @@ +//===-- Implementation of mblen -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/mblen.h" + +#include "hdr/types/size_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbrtowc.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, mblen, (const char *__restrict s, size_t n)) { + // returns 0 since UTF-8 encoding is not state-dependent + if (s == nullptr) + return 0; + internal::mbstate internal_mbstate; + // temp ptr to use for internal function + wchar_t buf[1]; + auto ret = internal::mbrtowc(buf, s, n, &internal_mbstate); + if (!ret.has_value() || static_cast<int>(ret.value()) == -2) { + // Encoding failure + if (!ret.has_value()) + libc_errno = EILSEQ; + return -1; + } + return static_cast<int>(ret.value()); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/mblen.h b/libc/src/wchar/mblen.h new file mode 100644 index 0000000000000..982081270cf9a --- /dev/null +++ b/libc/src/wchar/mblen.h @@ -0,0 +1,21 @@ +//===-- Implementation header for mblen -----------------------------------===// +// +// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_MBLEN_H +#define LLVM_LIBC_SRC_WCHAR_MBLEN_H + +#include "hdr/types/size_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int mblen(const char *__restrict s, size_t n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_MBLEN_H diff --git a/libc/src/wchar/mbrlen.cpp b/libc/src/wchar/mbrlen.cpp new file mode 100644 index 0000000000000..0a256f8d50102 --- /dev/null +++ b/libc/src/wchar/mbrlen.cpp @@ -0,0 +1,38 @@ +//===-- Implementation of mbrlen ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/mbrlen.h" + +#include "hdr/types/mbstate_t.h" +#include "hdr/types/size_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbrtowc.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(size_t, mbrlen, + (const char *__restrict s, size_t n, + mbstate_t *__restrict ps)) { + static internal::mbstate internal_mbstate; + auto ret = internal::mbrtowc(pwc, s, n, + ps == nullptr + ? 
&internal_mbstate + : reinterpret_cast<internal::mbstate *>(ps)); + if (!ret.has_value()) { + // Encoding failure + libc_errno = ret.error(); + return -1; + } + return ret.value(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/mbrlen.h b/libc/src/wchar/mbrlen.h new file mode 100644 index 0000000000000..08b59cfc8651c --- /dev/null +++ b/libc/src/wchar/mbrlen.h @@ -0,0 +1,22 @@ +//===-- Implementation header for mbrlen ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_MBRLEN_H +#define LLVM_LIBC_SRC_WCHAR_MBRLEN_H + +#include "hdr/types/mbstate_t.h" +#include "hdr/types/size_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_MBRLEN_H diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 176cf7c3487cd..de0cd697b3a02 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -64,6 +64,19 @@ add_libc_test( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_test( + mblen_test + SUITE + libc_wchar_unittests + SRCS + mblen_test.cpp + DEPENDS + libc.src.__support.libc_errno + libc.src.wchar.mblen + libc.hdr.types.wchar_t + libc.test.UnitTest.ErrnoCheckingTest +) + add_libc_test( wctob_test SUITE diff --git a/libc/test/src/wchar/mblen_test.cpp b/libc/test/src/wchar/mblen_test.cpp new file mode 100644 index 0000000000000..f7a9dc55bdab6 --- /dev/null +++ b/libc/test/src/wchar/mblen_test.cpp @@ -0,0 +1,105 @@ +//===-- Unittests for mblen -----------------------------------------------===// +// +// Part of the LLVM 
Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/wchar_t.h" +#include "src/__support/libc_errno.h" +#include "src/wchar/mblen.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcMBLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcMBLenTest, OneByte) { + const char *ch = "A"; + int n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, 1); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, -1); +} + +TEST_F(LlvmLibcMBLenTest, TwoByte) { + const char ch[2] = {static_cast<char>(0xC2), + static_cast<char>(0x8E)}; // Ž car symbol + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, 2); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); + // Should fail after trying to read next byte too + n = LIBC_NAMESPACE::mblen(ch + 1, 1); + ASSERT_EQ(n, -1); + // This one should be an invalid starting byte so should set errno + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBLenTest, ThreeByte) { + const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88), + static_cast<char>(0x91)}; // ∑ sigma symbol + int n = LIBC_NAMESPACE::mblen(ch, 3); + ASSERT_EQ(n, 3); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 2); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, FourByte) { + const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F), + static_cast<char>(0xA4), + static_cast<char>(0xA1)}; // 🤡 clown emoji + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_EQ(n, 4); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not 
read enough + n = LIBC_NAMESPACE::mblen(ch, 2); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, InvalidByte) { + const char ch[1] = {static_cast<char>(0x80)}; + int n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBLenTest, InvalidMultiByte) { + const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00), + static_cast<char>(0x80), + static_cast<char>(0x00)}; // invalid sequence of bytes + // Trying to push all 4 should error + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_EQ(EILSEQ); + + // Trying to push the second and third should correspond to null wc + n = LIBC_NAMESPACE::mblen(ch + 1, 2); + ASSERT_EQ(n, 0); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, NullString) { + // reading on nullptr should return 0 + int n = LIBC_NAMESPACE::mblen(nullptr, 2); + ASSERT_EQ(n, 0); + ASSERT_ERRNO_SUCCESS(); + // reading a null terminator should return 0 + const char *ch = "\0"; + n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, 0); +} From 692505f894ce6c631ecf69f44c40abe9829a5c15 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Tue, 22 Jul 2025 22:48:43 +0000 Subject: [PATCH 2/5] implemented mbrlen --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/include/wchar.yaml | 8 ++ libc/src/wchar/CMakeLists.txt | 17 +++- libc/src/wchar/mblen.cpp | 5 +- libc/src/wchar/mbrlen.cpp | 3 +- libc/test/src/wchar/CMakeLists.txt | 16 +++- libc/test/src/wchar/mblen_test.cpp | 1 - libc/test/src/wchar/mbrlen_test.cpp | 105 +++++++++++++++++++++++ 8 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 libc/test/src/wchar/mbrlen_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 5610c946a90fb..3cb1c483cea9e 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1262,6 +1262,7 @@ if(LLVM_LIBC_FULL_BUILD) # wchar.h entrypoints libc.src.wchar.mblen + 
libc.src.wchar.mbrlen libc.src.wchar.mbrtowc libc.src.wchar.mbtowc libc.src.wchar.wcrtomb diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 605e505dc3bc7..9b2dcf0943eed 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -60,6 +60,14 @@ functions: arguments: - type: const char *__restrict - type: size_t + - name: mbrlen + standards: + - stdc + return_type: size_t + arguments: + - type: const char *__restrict + - type: size_t + - type: mbstate_t *__restrict - name: wmemset standards: - stdc diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 5bc294eeec587..4748177d9e6f1 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -167,7 +167,6 @@ add_entrypoint_object( mblen.h DEPENDS libc.hdr.types.size_t - libc.hdr.types.wchar_t libc.src.__support.common libc.src.__support.macros.config libc.src.__support.libc_errno @@ -175,6 +174,22 @@ add_entrypoint_object( libc.src.__support.wchar.mbstate ) +add_entrypoint_object( + mbrlen + SRCS + mbrlen.cpp + HDRS + mbrlen.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.types.mbstate_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.wchar.mbrtowc + libc.src.__support.libc_errno + libc.src.__support.wchar.mbstate +) + add_entrypoint_object( wmemset SRCS diff --git a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp index 0f7b94dcf7ca5..2b8f529325d25 100644 --- a/libc/src/wchar/mblen.cpp +++ b/libc/src/wchar/mblen.cpp @@ -9,7 +9,6 @@ #include "src/wchar/mblen.h" #include "hdr/types/size_t.h" -#include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" @@ -23,9 +22,7 @@ LLVM_LIBC_FUNCTION(int, mblen, (const char *__restrict s, size_t n)) { if (s == nullptr) return 0; internal::mbstate internal_mbstate; - // temp ptr to use for internal function - wchar_t buf[1]; - auto ret = internal::mbrtowc(buf, s, n, &internal_mbstate); + 
auto ret = internal::mbrtowc(nullptr, s, n, &internal_mbstate); if (!ret.has_value() || static_cast<int>(ret.value()) == -2) { // Encoding failure if (!ret.has_value()) diff --git a/libc/src/wchar/mbrlen.cpp b/libc/src/wchar/mbrlen.cpp index 0a256f8d50102..8de78e099566b 100644 --- a/libc/src/wchar/mbrlen.cpp +++ b/libc/src/wchar/mbrlen.cpp @@ -10,7 +10,6 @@ #include "hdr/types/mbstate_t.h" #include "hdr/types/size_t.h" -#include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" @@ -23,7 +22,7 @@ LLVM_LIBC_FUNCTION(size_t, mbrlen, (const char *__restrict s, size_t n, mbstate_t *__restrict ps)) { static internal::mbstate internal_mbstate; - auto ret = internal::mbrtowc(pwc, s, n, + auto ret = internal::mbrtowc(nullptr, s, n, ps == nullptr ? &internal_mbstate : reinterpret_cast<internal::mbstate *>(ps)); diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index de0cd697b3a02..5251823b44d13 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -73,7 +73,21 @@ add_libc_test( DEPENDS libc.src.__support.libc_errno libc.src.wchar.mblen - libc.hdr.types.wchar_t + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + mbrtowc_test + SUITE + libc_wchar_unittests + SRCS + mbrtowc_test.cpp + DEPENDS + libc.src.__support.libc_errno + libc.src.__support.wchar.mbstate + libc.src.string.memset + libc.src.wchar.mbrtowc + libc.hdr.types.mbstate_t libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/wchar/mblen_test.cpp b/libc/test/src/wchar/mblen_test.cpp index f7a9dc55bdab6..efd4df7020741 100644 --- a/libc/test/src/wchar/mblen_test.cpp +++ b/libc/test/src/wchar/mblen_test.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "hdr/types/wchar_t.h" #include "src/__support/libc_errno.h" #include "src/wchar/mblen.h" #include "test/UnitTest/ErrnoCheckingTest.h" diff --git 
a/libc/test/src/wchar/mbrlen_test.cpp b/libc/test/src/wchar/mbrlen_test.cpp new file mode 100644 index 0000000000000..f7a9dc55bdab6 --- /dev/null +++ b/libc/test/src/wchar/mbrlen_test.cpp @@ -0,0 +1,105 @@ +//===-- Unittests for mblen -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/wchar_t.h" +#include "src/__support/libc_errno.h" +#include "src/wchar/mblen.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcMBLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcMBLenTest, OneByte) { + const char *ch = "A"; + int n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, 1); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, -1); +} + +TEST_F(LlvmLibcMBLenTest, TwoByte) { + const char ch[2] = {static_cast<char>(0xC2), + static_cast<char>(0x8E)}; // Ž car symbol + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, 2); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); + // Should fail after trying to read next byte too + n = LIBC_NAMESPACE::mblen(ch + 1, 1); + ASSERT_EQ(n, -1); + // This one should be an invalid starting byte so should set errno + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBLenTest, ThreeByte) { + const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88), + static_cast<char>(0x91)}; // ∑ sigma symbol + int n = LIBC_NAMESPACE::mblen(ch, 3); + ASSERT_EQ(n, 3); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 2); + ASSERT_EQ(n, -1); + 
ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, FourByte) { + const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F), + static_cast<char>(0xA4), + static_cast<char>(0xA1)}; // 🤡 clown emoji + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_EQ(n, 4); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 2); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, InvalidByte) { + const char ch[1] = {static_cast<char>(0x80)}; + int n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBLenTest, InvalidMultiByte) { + const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00), + static_cast<char>(0x80), + static_cast<char>(0x00)}; // invalid sequence of bytes + // Trying to push all 4 should error + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_EQ(EILSEQ); + + // Trying to push the second and third should correspond to null wc + n = LIBC_NAMESPACE::mblen(ch + 1, 2); + ASSERT_EQ(n, 0); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, NullString) { + // reading on nullptr should return 0 + int n = LIBC_NAMESPACE::mblen(nullptr, 2); + ASSERT_EQ(n, 0); + ASSERT_ERRNO_SUCCESS(); + // reading a null terminator should return 0 + const char *ch = "\0"; + n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, 0); +} From 1eab8def2725d32b7a67e40fedb8bcdbef7058b4 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Tue, 22 Jul 2025 23:18:42 +0000 Subject: [PATCH 3/5] [libc] Implemented mblen functions Implemented mblen and mbrlen as well as tests --- libc/test/src/wchar/CMakeLists.txt | 6 +- libc/test/src/wchar/mbrlen_test.cpp | 116 ++++++++++++++++++---------- 2 files changed, 78 insertions(+), 44 deletions(-) diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 5251823b44d13..baa52b74c3d97 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -77,16 +77,16 @@ add_libc_test( ) 
add_libc_test( - mbrtowc_test + mbrlen_test SUITE libc_wchar_unittests SRCS - mbrtowc_test.cpp + mbrlen_test.cpp DEPENDS libc.src.__support.libc_errno libc.src.__support.wchar.mbstate libc.src.string.memset - libc.src.wchar.mbrtowc + libc.src.wchar.mbrlen libc.hdr.types.mbstate_t libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/wchar/mbrlen_test.cpp b/libc/test/src/wchar/mbrlen_test.cpp index f7a9dc55bdab6..e1452bf416054 100644 --- a/libc/test/src/wchar/mbrlen_test.cpp +++ b/libc/test/src/wchar/mbrlen_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for mblen -----------------------------------------------===// +//===-- Unittests for mbrlen ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,98 +8,132 @@ #include "hdr/types/wchar_t.h" #include "src/__support/libc_errno.h" -#include "src/wchar/mblen.h" +#include "src/__support/wchar/mbstate.h" +#include "src/string/memset.h" +#include "src/wchar/mbrlen.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -using LlvmLibcMBLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +using LlvmLibcMBRLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -TEST_F(LlvmLibcMBLenTest, OneByte) { +TEST_F(LlvmLibcMBRLenTest, OneByte) { const char *ch = "A"; - int n = LIBC_NAMESPACE::mblen(ch, 1); + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb); ASSERT_ERRNO_SUCCESS(); - ASSERT_EQ(n, 1); + ASSERT_EQ(n, static_cast<size_t>(1)); // Should fail since we have not read enough - n = LIBC_NAMESPACE::mblen(ch, 0); + n = LIBC_NAMESPACE::mbrlen(ch, 0, &mb); ASSERT_ERRNO_SUCCESS(); - ASSERT_EQ(n, -1); + ASSERT_EQ(n, static_cast<size_t>(-2)); } -TEST_F(LlvmLibcMBLenTest, TwoByte) { +TEST_F(LlvmLibcMBRLenTest, TwoByte) { const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)}; // Ž car symbol - int n = 
LIBC_NAMESPACE::mblen(ch, 4); + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, nullptr); ASSERT_ERRNO_SUCCESS(); - ASSERT_EQ(n, 2); + ASSERT_EQ(static_cast<int>(n), 2); // Should fail since we have not read enough - n = LIBC_NAMESPACE::mblen(ch, 1); - ASSERT_EQ(n, -1); + n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb); + ASSERT_EQ(static_cast<int>(n), -2); + ASSERT_ERRNO_SUCCESS(); + // Should pass after trying to read next byte + n = LIBC_NAMESPACE::mbrlen(ch + 1, 1, &mb); + ASSERT_EQ(static_cast<int>(n), 1); ASSERT_ERRNO_SUCCESS(); - // Should fail after trying to read next byte too - n = LIBC_NAMESPACE::mblen(ch + 1, 1); - ASSERT_EQ(n, -1); - // This one should be an invalid starting byte so should set errno - ASSERT_ERRNO_EQ(EILSEQ); } -TEST_F(LlvmLibcMBLenTest, ThreeByte) { +TEST_F(LlvmLibcMBRLenTest, ThreeByte) { const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88), static_cast<char>(0x91)}; // ∑ sigma symbol - int n = LIBC_NAMESPACE::mblen(ch, 3); - ASSERT_EQ(n, 3); + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 3, &mb); + ASSERT_EQ(static_cast<int>(n), 3); ASSERT_ERRNO_SUCCESS(); // Should fail since we have not read enough - n = LIBC_NAMESPACE::mblen(ch, 2); - ASSERT_EQ(n, -1); + n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb); + ASSERT_EQ(static_cast<int>(n), -2); ASSERT_ERRNO_SUCCESS(); } -TEST_F(LlvmLibcMBLenTest, FourByte) { +TEST_F(LlvmLibcMBRLenTest, FourByte) { const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F), static_cast<char>(0xA4), static_cast<char>(0xA1)}; // 🤡 clown emoji - int n = LIBC_NAMESPACE::mblen(ch, 4); - ASSERT_EQ(n, 4); + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb); + ASSERT_EQ(static_cast<int>(n), 4); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb); + ASSERT_EQ(static_cast<int>(n), -2); ASSERT_ERRNO_SUCCESS(); // Should fail since 
we have not read enough - n = LIBC_NAMESPACE::mblen(ch, 2); - ASSERT_EQ(n, -1); + n = LIBC_NAMESPACE::mbrlen(ch + 2, 1, &mb); + ASSERT_EQ(static_cast<int>(n), -2); + ASSERT_ERRNO_SUCCESS(); + + // Should pass after reading final byte + n = LIBC_NAMESPACE::mbrlen(ch + 3, 5, &mb); + ASSERT_EQ(static_cast<int>(n), 1); ASSERT_ERRNO_SUCCESS(); } -TEST_F(LlvmLibcMBLenTest, InvalidByte) { +TEST_F(LlvmLibcMBRLenTest, InvalidByte) { const char ch[1] = {static_cast<char>(0x80)}; - int n = LIBC_NAMESPACE::mblen(ch, 1); - ASSERT_EQ(n, -1); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr); + ASSERT_EQ(static_cast<int>(n), -1); ASSERT_ERRNO_EQ(EILSEQ); } -TEST_F(LlvmLibcMBLenTest, InvalidMultiByte) { +TEST_F(LlvmLibcMBRLenTest, InvalidMultiByte) { const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80), static_cast<char>(0x00)}; // invalid sequence of bytes + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // Trying to push all 4 should error - int n = LIBC_NAMESPACE::mblen(ch, 4); - ASSERT_EQ(n, -1); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb); + ASSERT_EQ(static_cast<int>(n), -1); ASSERT_ERRNO_EQ(EILSEQ); // Trying to push the second and third should correspond to null wc - n = LIBC_NAMESPACE::mblen(ch + 1, 2); - ASSERT_EQ(n, 0); + n = LIBC_NAMESPACE::mbrlen(ch + 1, 2, &mb); + ASSERT_EQ(static_cast<int>(n), 0); ASSERT_ERRNO_SUCCESS(); } -TEST_F(LlvmLibcMBLenTest, NullString) { +TEST_F(LlvmLibcMBRLenTest, NullString) { // reading on nullptr should return 0 - int n = LIBC_NAMESPACE::mblen(nullptr, 2); - ASSERT_EQ(n, 0); + size_t n = LIBC_NAMESPACE::mbrlen(nullptr, 2, nullptr); + ASSERT_EQ(static_cast<int>(n), 0); ASSERT_ERRNO_SUCCESS(); // reading a null terminator should return 0 const char *ch = "\0"; - n = LIBC_NAMESPACE::mblen(ch, 1); - ASSERT_EQ(n, 0); + n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr); + ASSERT_EQ(static_cast<int>(n), 0); +} + +TEST_F(LlvmLibcMBRLenTest, InvalidMBState) { + const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E), + static_cast<char>(0xC7), 
static_cast<char>(0x8C)}; + mbstate_t *mb; + LIBC_NAMESPACE::internal::mbstate inv; + inv.total_bytes = 6; + mb = reinterpret_cast<mbstate_t *>(&inv); + // invalid mbstate should error + size_t n = LIBC_NAMESPACE::mbrlen(ch, 2, mb); + ASSERT_EQ(static_cast<int>(n), -1); + ASSERT_ERRNO_EQ(EINVAL); } From 4c591a2f664a15c370e5cbc662eb3d156ec7f580 Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Thu, 24 Jul 2025 17:09:47 +0000 Subject: [PATCH 4/5] removed restrict from parameter --- libc/include/wchar.yaml | 2 +- libc/src/wchar/mblen.cpp | 2 +- libc/src/wchar/mblen.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 9b2dcf0943eed..4adf596abe650 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -58,7 +58,7 @@ functions: - stdc return_type: int arguments: - - type: const char *__restrict + - type: const char * - type: size_t - name: mbrlen standards: diff --git a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp index 2b8f529325d25..d6694a6558dc6 100644 --- a/libc/src/wchar/mblen.cpp +++ b/libc/src/wchar/mblen.cpp @@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, mblen, (const char *__restrict s, size_t n)) { +LLVM_LIBC_FUNCTION(int, mblen, (const char * s, size_t n)) { // returns 0 since UTF-8 encoding is not state-dependent if (s == nullptr) return 0; diff --git a/libc/src/wchar/mblen.h b/libc/src/wchar/mblen.h index 982081270cf9a..03f158711250a 100644 --- a/libc/src/wchar/mblen.h +++ b/libc/src/wchar/mblen.h @@ -14,7 +14,7 @@ namespace LIBC_NAMESPACE_DECL { -int mblen(const char *__restrict s, size_t n); +int mblen(const char * s, size_t n); } // namespace LIBC_NAMESPACE_DECL From e52ee8956616fb22f332ea95fb40c4ffe4548a1b Mon Sep 17 00:00:00 2001 From: Sriya Pratipati Date: Thu, 24 Jul 2025 17:16:24 +0000 Subject: [PATCH 5/5] fixed formatting --- libc/src/wchar/mblen.cpp | 2 +- libc/src/wchar/mblen.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp index d6694a6558dc6..2d15b3e0e5648 100644 --- a/libc/src/wchar/mblen.cpp +++ b/libc/src/wchar/mblen.cpp @@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, mblen, (const char * s, size_t n)) { +LLVM_LIBC_FUNCTION(int, mblen, (const char *s, size_t n)) { // returns 0 since UTF-8 encoding is not state-dependent if (s == nullptr) return 0; diff --git a/libc/src/wchar/mblen.h b/libc/src/wchar/mblen.h index 03f158711250a..a315a2f12f6a1 100644 --- a/libc/src/wchar/mblen.h +++ b/libc/src/wchar/mblen.h @@ -14,7 +14,7 @@ namespace LIBC_NAMESPACE_DECL { -int mblen(const char * s, size_t n); +int mblen(const char *s, size_t n); } // namespace LIBC_NAMESPACE_DECL