-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[libc] Implemented mblen functions #150141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Implemented mblen and mbrlen as well as tests
|
@llvm/pr-subscribers-libc Author: None (sribee8) Changes: Implemented mblen and mbrlen as well as tests. Full diff: https://github.com/llvm/llvm-project/pull/150141.diff 10 Files Affected:
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 381359cec6f1d..3cb1c483cea9e 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1261,6 +1261,8 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.sys.socket.recvmsg
# wchar.h entrypoints
+ libc.src.wchar.mblen
+ libc.src.wchar.mbrlen
libc.src.wchar.mbrtowc
libc.src.wchar.mbtowc
libc.src.wchar.wcrtomb
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 123d3440aeec3..9b2dcf0943eed 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -53,6 +53,21 @@ functions:
- type: wchar_t *__restrict
- type: const char *__restrict
- type: size_t
+ - name: mblen # NOTE(review): ISO C declares mblen in <stdlib.h>, not <wchar.h> - confirm this is the intended header
+ standards:
+ - stdc
+ return_type: int # per mblen.cpp: 0 for a null pointer or a null character, -1 on failure, else the byte count
+ arguments:
+ - type: const char *__restrict # s: bytes to examine; NOTE(review): the ISO C prototype of mblen has no restrict qualifier
+ - type: size_t # n: maximum number of bytes to examine
+ - name: mbrlen # variant of mblen that takes an explicit conversion state
+ standards:
+ - stdc
+ return_type: size_t # the unit tests expect (size_t)-1 on error and (size_t)-2 for an incomplete sequence
+ arguments:
+ - type: const char *__restrict # s: bytes to examine
+ - type: size_t # n: maximum number of bytes to examine
+ - type: mbstate_t *__restrict # ps: conversion state; mbrlen.cpp substitutes its own static state when this is null
- name: wmemset
standards:
- stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 159778df6acca..2b95d94e4230a 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -169,6 +169,37 @@ add_entrypoint_object(
libc.src.__support.wchar.mbstate
)
+add_entrypoint_object(
+ mblen # entrypoint target for mblen, built from mblen.cpp / mblen.h
+ SRCS
+ mblen.cpp
+ HDRS
+ mblen.h
+ DEPENDS
+ libc.hdr.types.size_t
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.libc_errno # mblen.cpp sets libc_errno to EILSEQ on a decoding error
+ libc.src.__support.wchar.mbrtowc # internal::mbrtowc performs the actual decoding
+ libc.src.__support.wchar.mbstate # internal::mbstate is the per-call conversion state
+)
+
+add_entrypoint_object(
+ mbrlen # entrypoint target for mbrlen, built from mbrlen.cpp / mbrlen.h
+ SRCS
+ mbrlen.cpp
+ HDRS
+ mbrlen.h
+ DEPENDS
+ libc.hdr.types.size_t
+ libc.hdr.types.mbstate_t # public state type taken by the ps parameter
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.wchar.mbrtowc # internal::mbrtowc performs the actual decoding
+ libc.src.__support.libc_errno # mbrlen.cpp forwards the decoder's error code to libc_errno
+ libc.src.__support.wchar.mbstate # internal::mbstate, which ps is reinterpreted as
+)
+
add_entrypoint_object(
wmemset
SRCS
diff --git a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp
new file mode 100644
index 0000000000000..2b8f529325d25
--- /dev/null
+++ b/libc/src/wchar/mblen.cpp
@@ -0,0 +1,35 @@
+//===-- Implementation of mblen -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mblen.h"
+
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Reports how many bytes make up the multibyte character at the start of `s`,
+// examining at most `n` bytes. Returns -1 when no complete, valid character
+// can be decoded; errno is set to EILSEQ only when the decoder itself reports
+// an error.
+LLVM_LIBC_FUNCTION(int, mblen, (const char *__restrict s, size_t n)) {
+  // A null `s` yields 0 because UTF-8 encoding is not state-dependent.
+  if (s == nullptr)
+    return 0;
+
+  // Every call decodes from a fresh, function-local conversion state.
+  internal::mbstate fresh_state;
+  auto decoded = internal::mbrtowc(nullptr, s, n, &fresh_state);
+
+  // The decoder reported an error: flag it through errno.
+  if (!decoded.has_value()) {
+    libc_errno = EILSEQ;
+    return -1;
+  }
+
+  // A result of -2 also fails, but without touching errno. The unit tests
+  // hit this path when fewer bytes than a full character are supplied.
+  const int length = static_cast<int>(decoded.value());
+  if (length == -2)
+    return -1;
+  return length;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mblen.h b/libc/src/wchar/mblen.h
new file mode 100644
index 0000000000000..982081270cf9a
--- /dev/null
+++ b/libc/src/wchar/mblen.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for mblen -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBLEN_H
+#define LLVM_LIBC_SRC_WCHAR_MBLEN_H
+
+#include "hdr/types/size_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Byte length of the multibyte character at s (at most n bytes examined): 0 for a null s or a null character, -1 on failure.
+int mblen(const char *__restrict s, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBLEN_H
diff --git a/libc/src/wchar/mbrlen.cpp b/libc/src/wchar/mbrlen.cpp
new file mode 100644
index 0000000000000..8de78e099566b
--- /dev/null
+++ b/libc/src/wchar/mbrlen.cpp
@@ -0,0 +1,37 @@
+//===-- Implementation of mbrlen ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbrlen.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Variant of mblen that decodes through a caller-supplied conversion state
+// `ps`, falling back to a function-private one when `ps` is null. On a
+// decoding error it stores the decoder's error code in errno and returns
+// (size_t)-1; otherwise it returns the decoder's result unchanged.
+LLVM_LIBC_FUNCTION(size_t, mbrlen,
+                   (const char *__restrict s, size_t n,
+                    mbstate_t *__restrict ps)) {
+  // Fallback state for callers that pass a null `ps`. It is static, so it
+  // persists from one call to the next.
+  static internal::mbstate fallback_state;
+  internal::mbstate *state = &fallback_state;
+  if (ps != nullptr)
+    state = reinterpret_cast<internal::mbstate *>(ps);
+
+  auto decoded = internal::mbrtowc(nullptr, s, n, state);
+  if (decoded.has_value())
+    return decoded.value();
+
+  // Decoding failed: forward the decoder's error code to errno.
+  libc_errno = decoded.error();
+  return static_cast<size_t>(-1);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbrlen.h b/libc/src/wchar/mbrlen.h
new file mode 100644
index 0000000000000..08b59cfc8651c
--- /dev/null
+++ b/libc/src/wchar/mbrlen.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for mbrlen ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBRLEN_H
+#define LLVM_LIBC_SRC_WCHAR_MBRLEN_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Like mblen, but decodes through the conversion state ps (an internal state is used when ps is null); (size_t)-1 on error.
+size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBRLEN_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 176cf7c3487cd..baa52b74c3d97 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -64,6 +64,33 @@ add_libc_test(
libc.test.UnitTest.ErrnoCheckingTest
)
+add_libc_test(
+ mblen_test # unit tests for mblen, part of the wchar unittest suite
+ SUITE
+ libc_wchar_unittests
+ SRCS
+ mblen_test.cpp
+ DEPENDS
+ libc.src.__support.libc_errno
+ libc.src.wchar.mblen # function under test
+ libc.test.UnitTest.ErrnoCheckingTest # fixture the test cases are based on
+)
+
+add_libc_test(
+ mbrlen_test # unit tests for mbrlen, part of the wchar unittest suite
+ SUITE
+ libc_wchar_unittests
+ SRCS
+ mbrlen_test.cpp
+ DEPENDS
+ libc.src.__support.libc_errno
+ libc.src.__support.wchar.mbstate # the InvalidMBState case builds an internal::mbstate directly
+ libc.src.string.memset # used to zero mbstate_t objects before each sequence
+ libc.src.wchar.mbrlen # function under test
+ libc.hdr.types.mbstate_t
+ libc.test.UnitTest.ErrnoCheckingTest # fixture the test cases are based on
+)
+
add_libc_test(
wctob_test
SUITE
diff --git a/libc/test/src/wchar/mblen_test.cpp b/libc/test/src/wchar/mblen_test.cpp
new file mode 100644
index 0000000000000..efd4df7020741
--- /dev/null
+++ b/libc/test/src/wchar/mblen_test.cpp
@@ -0,0 +1,104 @@
+//===-- Unittests for mblen -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/libc_errno.h"
+#include "src/wchar/mblen.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcMBLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcMBLenTest, OneByte) {
+ const char *ch = "A"; // a single ASCII byte is a complete character
+ int n = LIBC_NAMESPACE::mblen(ch, 1);
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_EQ(n, 1);
+
+ // With n == 0 no byte may be examined: expect -1 with errno left clear
+ n = LIBC_NAMESPACE::mblen(ch, 0);
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_EQ(n, -1);
+}
+
+TEST_F(LlvmLibcMBLenTest, TwoByte) {
+ const char ch[2] = {static_cast<char>(0xC2),
+ static_cast<char>(0x8E)}; // U+008E encoded as two UTF-8 bytes
+ int n = LIBC_NAMESPACE::mblen(ch, 4); // n (4) exceeds the 2-byte array; the test relies on decoding stopping after the character
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_EQ(n, 2);
+
+ // Only the lead byte is available: incomplete, so -1 with errno left clear
+ n = LIBC_NAMESPACE::mblen(ch, 1);
+ ASSERT_EQ(n, -1);
+ ASSERT_ERRNO_SUCCESS();
+ // mblen keeps no state between calls, so the second byte alone fails too
+ n = LIBC_NAMESPACE::mblen(ch + 1, 1);
+ ASSERT_EQ(n, -1);
+ // A continuation byte is an invalid starting byte, so errno is set this time
+ ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBLenTest, ThreeByte) {
+ const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+ static_cast<char>(0x91)}; // U+2211 (∑, n-ary summation sign)
+ int n = LIBC_NAMESPACE::mblen(ch, 3);
+ ASSERT_EQ(n, 3);
+ ASSERT_ERRNO_SUCCESS();
+
+ // Two of the three bytes: incomplete, so -1 with errno left clear
+ n = LIBC_NAMESPACE::mblen(ch, 2);
+ ASSERT_EQ(n, -1);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBLenTest, FourByte) {
+ const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+ static_cast<char>(0xA4),
+ static_cast<char>(0xA1)}; // U+1F921 (🤡, clown face)
+ int n = LIBC_NAMESPACE::mblen(ch, 4);
+ ASSERT_EQ(n, 4);
+ ASSERT_ERRNO_SUCCESS();
+
+ // Two of the four bytes: incomplete, so -1 with errno left clear
+ n = LIBC_NAMESPACE::mblen(ch, 2);
+ ASSERT_EQ(n, -1);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBLenTest, InvalidByte) {
+ const char ch[1] = {static_cast<char>(0x80)}; // 0x80 is a continuation byte, not a valid first byte
+ int n = LIBC_NAMESPACE::mblen(ch, 1);
+ ASSERT_EQ(n, -1);
+ ASSERT_ERRNO_EQ(EILSEQ); // invalid input, unlike the incomplete case, sets errno
+}
+
+TEST_F(LlvmLibcMBLenTest, InvalidMultiByte) {
+ const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+ static_cast<char>(0x80),
+ static_cast<char>(0x00)}; // starts with a continuation byte, so invalid from the first byte
+ // Decoding from the start fails with EILSEQ however many bytes are offered
+ int n = LIBC_NAMESPACE::mblen(ch, 4);
+ ASSERT_EQ(n, -1);
+ ASSERT_ERRNO_EQ(EILSEQ);
+
+ // Starting at the second byte (0x00) decodes the null character, so 0 is returned
+ n = LIBC_NAMESPACE::mblen(ch + 1, 2);
+ ASSERT_EQ(n, 0);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBLenTest, NullString) {
+ // A null pointer returns 0 (mblen.cpp: the encoding is not state-dependent)
+ int n = LIBC_NAMESPACE::mblen(nullptr, 2);
+ ASSERT_EQ(n, 0);
+ ASSERT_ERRNO_SUCCESS();
+ // A string whose first byte is the null terminator also returns 0
+ const char *ch = "\0";
+ n = LIBC_NAMESPACE::mblen(ch, 1);
+ ASSERT_EQ(n, 0);
+}
diff --git a/libc/test/src/wchar/mbrlen_test.cpp b/libc/test/src/wchar/mbrlen_test.cpp
new file mode 100644
index 0000000000000..e1452bf416054
--- /dev/null
+++ b/libc/test/src/wchar/mbrlen_test.cpp
@@ -0,0 +1,139 @@
+//===-- Unittests for mbrlen ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/string/memset.h"
+#include "src/wchar/mbrlen.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcMBRLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcMBRLenTest, OneByte) {
+ const char *ch = "A"; // a single ASCII byte is a complete character
+ mbstate_t mb;
+ LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // start from a zeroed conversion state
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb);
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_EQ(n, static_cast<size_t>(1));
+
+ // With n == 0 nothing can be decoded: expect (size_t)-2 with errno left clear
+ n = LIBC_NAMESPACE::mbrlen(ch, 0, &mb);
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_EQ(n, static_cast<size_t>(-2));
+}
+
+TEST_F(LlvmLibcMBRLenTest, TwoByte) {
+ const char ch[2] = {static_cast<char>(0xC2),
+ static_cast<char>(0x8E)}; // U+008E encoded as two UTF-8 bytes
+ mbstate_t mb;
+ LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // start from a zeroed conversion state
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, nullptr); // null ps: mbrlen uses its own internal state
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_EQ(static_cast<int>(n), 2);
+
+ // Only the lead byte is supplied: expect (size_t)-2, with the partial character kept in mb
+ n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb);
+ ASSERT_EQ(static_cast<int>(n), -2);
+ ASSERT_ERRNO_SUCCESS();
+ // Feeding the second byte with the same state completes the character; 1 byte is consumed by this call
+ n = LIBC_NAMESPACE::mbrlen(ch + 1, 1, &mb);
+ ASSERT_EQ(static_cast<int>(n), 1);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, ThreeByte) {
+ const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+ static_cast<char>(0x91)}; // U+2211 (∑, n-ary summation sign)
+ mbstate_t mb;
+ LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // start from a zeroed conversion state
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 3, &mb);
+ ASSERT_EQ(static_cast<int>(n), 3);
+ ASSERT_ERRNO_SUCCESS();
+
+ // Two of the three bytes: expect (size_t)-2 with errno left clear
+ n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb);
+ ASSERT_EQ(static_cast<int>(n), -2);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, FourByte) {
+ const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+ static_cast<char>(0xA4),
+ static_cast<char>(0xA1)}; // U+1F921 (🤡, clown face)
+ mbstate_t mb;
+ LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // start from a zeroed conversion state
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb);
+ ASSERT_EQ(static_cast<int>(n), 4);
+ ASSERT_ERRNO_SUCCESS();
+
+ // First two of four bytes: expect (size_t)-2, with the partial character kept in mb
+ n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb);
+ ASSERT_EQ(static_cast<int>(n), -2);
+ ASSERT_ERRNO_SUCCESS();
+
+ // The third byte alone still does not complete the character: (size_t)-2 again
+ n = LIBC_NAMESPACE::mbrlen(ch + 2, 1, &mb);
+ ASSERT_EQ(static_cast<int>(n), -2);
+ ASSERT_ERRNO_SUCCESS();
+
+ // The final byte completes it; only 1 byte is consumed although n is 5
+ n = LIBC_NAMESPACE::mbrlen(ch + 3, 5, &mb);
+ ASSERT_EQ(static_cast<int>(n), 1);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, InvalidByte) {
+ const char ch[1] = {static_cast<char>(0x80)}; // 0x80 is a continuation byte, not a valid first byte
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr); // null ps: mbrlen uses its own internal state
+ ASSERT_EQ(static_cast<int>(n), -1);
+ ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBRLenTest, InvalidMultiByte) {
+ const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+ static_cast<char>(0x80),
+ static_cast<char>(0x00)}; // starts with a continuation byte, so invalid from the first byte
+ mbstate_t mb;
+ LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); // start from a zeroed conversion state
+ // Decoding from the start fails with EILSEQ however many bytes are offered
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb);
+ ASSERT_EQ(static_cast<int>(n), -1);
+ ASSERT_ERRNO_EQ(EILSEQ);
+
+ // Starting at the second byte (0x00) decodes the null character. NOTE(review): mb is reused after an encoding error, where ISO C leaves the state unspecified - confirm this is intended
+ n = LIBC_NAMESPACE::mbrlen(ch + 1, 2, &mb);
+ ASSERT_EQ(static_cast<int>(n), 0);
+ ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, NullString) {
+ // A null string pointer (with a null state as well) is expected to return 0
+ size_t n = LIBC_NAMESPACE::mbrlen(nullptr, 2, nullptr);
+ ASSERT_EQ(static_cast<int>(n), 0);
+ ASSERT_ERRNO_SUCCESS();
+ // A string whose first byte is the null terminator also returns 0
+ const char *ch = "\0";
+ n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr);
+ ASSERT_EQ(static_cast<int>(n), 0);
+}
+
+TEST_F(LlvmLibcMBRLenTest, InvalidMBState) {
+ const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+ static_cast<char>(0xC7), static_cast<char>(0x8C)}; // only the first two bytes are passed below
+ mbstate_t *mb;
+ LIBC_NAMESPACE::internal::mbstate inv;
+ inv.total_bytes = 6; // deliberately corrupt the state; the test expects this value to be rejected
+ mb = reinterpret_cast<mbstate_t *>(&inv); // hand the internal state to mbrlen through the public type
+ // A corrupt state is reported as (size_t)-1 with errno set to EINVAL, not EILSEQ
+ size_t n = LIBC_NAMESPACE::mbrlen(ch, 2, mb);
+ ASSERT_EQ(static_cast<int>(n), -1);
+ ASSERT_ERRNO_EQ(EINVAL);
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with 1 small fix
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/39249 Here is the relevant piece of the build log for reference. |
Implemented mblen and mbrlen as well as tests --------- Co-authored-by: Sriya Pratipati <[email protected]>
Implemented mblen and mbrlen as well as tests