From de6f206cc4ff538a9bf19eb9f27636946a3416e5 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 9 Jul 2025 23:48:15 +0000 Subject: [PATCH 1/4] duplicate str_to_integer to create wcs_to_integer internal function --- libc/src/__support/CMakeLists.txt | 13 + libc/src/__support/wcs_to_integer.h | 156 ++++++++++++ libc/test/src/__support/CMakeLists.txt | 11 + .../src/__support/wcs_to_integer_test.cpp | 240 ++++++++++++++++++ 4 files changed, 420 insertions(+) create mode 100644 libc/src/__support/wcs_to_integer.h create mode 100644 libc/test/src/__support/wcs_to_integer_test.cpp diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 7e85136c08851..294d68474bd53 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -180,6 +180,19 @@ add_header_library( libc.src.__support.common ) +add_header_library( + wcs_to_integer + HDRS + wcs_to_integer.h + DEPENDS + .wctype_utils + .str_to_num_result + libc.hdr.errno_macros + libc.src.__support.CPP.limits + libc.src.__support.CPP.type_traits + libc.src.__support.common +) + add_header_library( integer_to_string HDRS diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h new file mode 100644 index 0000000000000..40fa28a98030d --- /dev/null +++ b/libc/src/__support/wcs_to_integer.h @@ -0,0 +1,156 @@ +//===-- String to integer conversion utils ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H +#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H + +#include "hdr/errno_macros.h" // For ERANGE +#include "src/__support/CPP/limits.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/CPP/type_traits/make_unsigned.h" +#include "src/__support/big_int.h" +#include "src/__support/common.h" +#include "src/__support/wctype_utils.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_num_result.h" +#include "src/__support/uint128.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// Returns a pointer to the first character in src that is not a whitespace +// character (as determined by iswspace()) +// TODO: Change from returning a pointer to returning a length. +LIBC_INLINE const wchar_t * +first_non_whitespace(const wchar_t *__restrict src, + size_t src_len = cpp::numeric_limits::max()) { + size_t src_cur = 0; + while (src_cur < src_len && internal::iswspace(src[src_cur])) { + ++src_cur; + } + return src + src_cur; +} + +// checks if the next 3 characters of the string pointer are the start of a +// hexadecimal number. Does not advance the string pointer. +LIBC_INLINE bool +is_hex_start(const wchar_t *__restrict src, + size_t src_len = cpp::numeric_limits::max()) { + if (src_len < 3) + return false; + return *src == L'0' && towlower(*(src + 1)) == L'x' && iswalnum(*(src + 2)) && + b36_wchar_to_int(*(src + 2)) < 16; +} + +// Takes the address of the string pointer and parses the base from the start of +// it. +LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) { + // A hexadecimal number is defined as "the prefix 0x or 0X followed by a + // sequence of the decimal digits and the letters a (or A) through f (or F) + // with values 10 through 15 respectively." (C standard 6.4.4.1) + if (is_hex_start(src, src_len)) + return 16; + // An octal number is defined as "the prefix 0 optionally followed by a + // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any + // number that starts with 0, including just 0, is an octal number. + if (src_len > 0 && src[0] == L'0') + return 8; + // A decimal number is defined as beginning "with a nonzero digit and + // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) + return 10; +} + +template +LIBC_INLINE StrToNumResult +wcstointeger(const wchar_t *__restrict src, int base, + const size_t src_len = cpp::numeric_limits::max()) { + using ResultType = make_integral_or_big_int_unsigned_t; + + ResultType result = 0; + + bool is_number = false; + size_t src_cur = 0; + int error_val = 0; + + if (src_len == 0) + return {0, 0, 0}; + + if (base < 0 || base == 1 || base > 36) + return {0, 0, EINVAL}; + + src_cur = static_cast(first_non_whitespace(src, src_len) - src); + + wchar_t result_sign = L'+'; + if (src[src_cur] == L'+' || src[src_cur] == L'-') { + result_sign = src[src_cur]; + ++src_cur; + } + + if (base == 0) + base = infer_base(src + src_cur, src_len - src_cur); + + if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur)) + src_cur = src_cur + 2; + + constexpr bool IS_UNSIGNED = cpp::is_unsigned_v; + const bool is_positive = (result_sign == L'+'); + + ResultType constexpr NEGATIVE_MAX = + !IS_UNSIGNED ? static_cast(cpp::numeric_limits::max()) + 1 + : cpp::numeric_limits::max(); + ResultType const abs_max = + (is_positive ? cpp::numeric_limits::max() : NEGATIVE_MAX); + ResultType const abs_max_div_by_base = + abs_max / static_cast(base); + + while (src_cur < src_len && iswalnum(src[src_cur])) { + wint_t cur_digit = b36_wchar_to_int(src[src_cur]); + if (cur_digit >= base) + break; + + is_number = true; + ++src_cur; + + // If the number has already hit the maximum value for the current type then + // the result cannot change, but we still need to advance src to the end of + // the number. + if (result == abs_max) { + error_val = ERANGE; + continue; + } + + if (result > abs_max_div_by_base) { + result = abs_max; + error_val = ERANGE; + } else { + result = result * static_cast(base); + } + if (result > abs_max - static_cast(cur_digit)) { + result = abs_max; + error_val = ERANGE; + } else { + result = result + static_cast(cur_digit); + } + } + + ptrdiff_t str_len = is_number ? static_cast(src_cur) : 0; + + if (error_val == ERANGE) { + if (is_positive || IS_UNSIGNED) + return {cpp::numeric_limits::max(), str_len, error_val}; + else // T is signed and there is a negative overflow + return {cpp::numeric_limits::min(), str_len, error_val}; + } + + return {static_cast(is_positive ? result : -result), str_len, error_val}; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 9f626ed31cc07..e54d7a5c9638b 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -141,6 +141,17 @@ add_libc_test( libc.src.__support.str_to_integer ) +add_libc_test( + wcs_to_integer_test + SUITE + libc-support-tests + SRCS + wcs_to_integer_test.cpp + DEPENDS + libc.src.__support.integer_literals + libc.src.__support.wcs_to_integer +) + add_libc_test( integer_to_string_test SUITE diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp new file mode 100644 index 0000000000000..69a784391e196 --- /dev/null +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -0,0 +1,240 @@ +//===-- Unittests for wcs_to_integer --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/libc_errno.h" +#include "src/__support/wcs_to_integer.h" +#include + +#include "test/UnitTest/Test.h" + +// This file is for testing the src_len argument and other internal interface +// features. Primary testing is done in stdlib/StrolTest.cpp through the public +// interface. + +TEST(LlvmLibcStrToIntegerTest, SimpleLength) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 2); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(2)); + ASSERT_EQ(result.value, 12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { + auto result = + LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 15); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 5); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, LeadingSign) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, -12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, -12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 3); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); + ASSERT_EQ(result.value, 12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 3); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); + ASSERT_EQ(result.value, -12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 5); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); + ASSERT_EQ(result.value, 0x123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 2); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 5); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); + ASSERT_EQ(result.value, 0x123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 2); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 4); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); + ASSERT_EQ(result.value, 0123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 4); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); + ASSERT_EQ(result.value, 0123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, CombinedTests) { + auto result = + LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); + ASSERT_EQ(result.value, -0x123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 8); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(8)); + ASSERT_EQ(result.value, -0x1); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 0); +} From 2b2e04f8053b709e1baa1df28698eda49f0fcb1e Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 9 Jul 2025 23:51:58 +0000 Subject: [PATCH 2/4] updated header comment --- libc/src/__support/wcs_to_integer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h index 40fa28a98030d..e16ee5008d0b2 100644 --- a/libc/src/__support/wcs_to_integer.h +++ b/libc/src/__support/wcs_to_integer.h @@ -1,4 +1,4 @@ -//===-- String to integer conversion utils ----------------------*- C++ -*-===// +//===-- Widechar string to integer conversion utils -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,10 +15,10 @@ #include "src/__support/CPP/type_traits/make_unsigned.h" #include "src/__support/big_int.h" #include "src/__support/common.h" -#include "src/__support/wctype_utils.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" +#include "src/__support/wctype_utils.h" namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -109,7 +109,7 @@ wcstointeger(const wchar_t *__restrict src, int base, abs_max / static_cast(base); while (src_cur < src_len && iswalnum(src[src_cur])) { - wint_t cur_digit = b36_wchar_to_int(src[src_cur]); + int cur_digit = b36_wchar_to_int(src[src_cur]); if (cur_digit >= base) break; From 01b3c02d1f6974664d00fcd4782a85e66a15f134 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 9 Jul 2025 23:55:51 +0000 Subject: [PATCH 3/4] Updated tests --- libc/test/src/__support/wcs_to_integer_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp index 69a784391e196..e4107929c15fc 100644 --- a/libc/test/src/__support/wcs_to_integer_test.cpp +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -13,8 +13,7 @@ #include "test/UnitTest/Test.h" // This file is for testing the src_len argument and other internal interface -// features. Primary testing is done in stdlib/StrolTest.cpp through the public -// interface. +// features. Primary testing is done through the public interface. TEST(LlvmLibcStrToIntegerTest, SimpleLength) { auto result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 10); From 1c5771a8bd940861f2139d7656fe31441ae6e113 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Thu, 10 Jul 2025 16:51:21 +0000 Subject: [PATCH 4/4] modified first_non_whitespace to return an idx instead of ptr --- libc/src/__support/wcs_to_integer.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h index e16ee5008d0b2..4254bd860f77a 100644 --- a/libc/src/__support/wcs_to_integer.h +++ b/libc/src/__support/wcs_to_integer.h @@ -23,17 +23,16 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -// Returns a pointer to the first character in src that is not a whitespace +// Returns the idx of the first character in src that is not a whitespace // character (as determined by iswspace()) -// TODO: Change from returning a pointer to returning a length. -LIBC_INLINE const wchar_t * +LIBC_INLINE size_t first_non_whitespace(const wchar_t *__restrict src, size_t src_len = cpp::numeric_limits::max()) { size_t src_cur = 0; while (src_cur < src_len && internal::iswspace(src[src_cur])) { ++src_cur; } - return src + src_cur; + return src_cur; } // checks if the next 3 characters of the string pointer are the start of a @@ -83,7 +82,7 @@ wcstointeger(const wchar_t *__restrict src, int base, if (base < 0 || base == 1 || base > 36) return {0, 0, EINVAL}; - src_cur = static_cast(first_non_whitespace(src, src_len) - src); + src_cur = first_non_whitespace(src, src_len); wchar_t result_sign = L'+'; if (src[src_cur] == L'+' || src[src_cur] == L'-') {