Skip to content

Conversation

@uzairnawaz
Copy link
Contributor

Duplicated str_to_integer.h and modified it to work with widechars.
A future patch will implement the public functions (wcstol, wcstoll, etc) by calling this internal function.

@llvmbot
Copy link
Member

llvmbot commented Jul 9, 2025

@llvm/pr-subscribers-libc

Author: Uzair Nawaz (uzairnawaz)

Changes

Duplicated str_to_integer.h and modified it to work with widechars.
A future patch will implement the public functions (wcstol, wcstoll, etc) by calling this internal function.


Full diff: https://github.com/llvm/llvm-project/pull/147857.diff

4 Files Affected:

  • (modified) libc/src/__support/CMakeLists.txt (+13)
  • (added) libc/src/__support/wcs_to_integer.h (+156)
  • (modified) libc/test/src/__support/CMakeLists.txt (+11)
  • (added) libc/test/src/__support/wcs_to_integer_test.cpp (+239)
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 7e85136c08851..294d68474bd53 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -180,6 +180,19 @@ add_header_library(
     libc.src.__support.common
 )
 
+add_header_library(
+  wcs_to_integer # Header-only wchar_t string-to-integer conversion utilities.
+  HDRS
+    wcs_to_integer.h
+  DEPENDS # NOTE(review): header also includes big_int.h/uint128.h - confirm.
+    .wctype_utils
+    .str_to_num_result
+    libc.hdr.errno_macros
+    libc.src.__support.CPP.limits
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.common
+)
+
 add_header_library(
   integer_to_string
   HDRS
diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h
new file mode 100644
index 0000000000000..e16ee5008d0b2
--- /dev/null
+++ b/libc/src/__support/wcs_to_integer.h
@@ -0,0 +1,156 @@
+//===-- Widechar string to integer conversion utils -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H
+#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H
+
+#include "hdr/errno_macros.h" // For ERANGE
+#include "src/__support/CPP/limits.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/CPP/type_traits/make_unsigned.h"
+#include "src/__support/big_int.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/str_to_num_result.h"
+#include "src/__support/uint128.h"
+#include "src/__support/wctype_utils.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Returns a pointer to the first of the leading src_len wide characters of src
+// that internal::iswspace() rejects, or src + src_len if it accepts them all.
+// TODO: Change from returning a pointer to returning a length.
+LIBC_INLINE const wchar_t *
+first_non_whitespace(const wchar_t *__restrict src,
+                     size_t src_len = cpp::numeric_limits<size_t>::max()) {
+  size_t skipped = 0;
+  for (; skipped < src_len; ++skipped)
+    if (!internal::iswspace(src[skipped]))
+      break;
+  return src + skipped;
+}
+
+// Reports whether src starts a hexadecimal number: L'0', then L'x' or L'X',
+// then a character whose base-36 value is below 16. Needs src_len >= 3.
+LIBC_INLINE bool
+is_hex_start(const wchar_t *__restrict src,
+             size_t src_len = cpp::numeric_limits<size_t>::max()) {
+  if (src_len < 3 || src[0] != L'0' || towlower(src[1]) != L'x')
+    return false;
+  const wchar_t first_digit = src[2];
+  return iswalnum(first_digit) && b36_wchar_to_int(first_digit) < 16;
+}
+
+// Picks the base implied by the prefix of src, examining at most src_len wide
+// characters and leaving src unchanged. The result is always 16, 8 or 10.
+LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) {
+  // A hexadecimal number is defined as "the prefix 0x or 0X followed by a
+  // sequence of the decimal digits and the letters a (or A) through f (or F)
+  // with values 10 through 15 respectively." (C standard 6.4.4.1)
+  const bool hex_prefixed = is_hex_start(src, src_len);
+  if (hex_prefixed)
+    return 16;
+  // An octal number is defined as "the prefix 0 optionally followed by a
+  // sequence of the digits 0 through 7 only" (C standard 6.4.4.1), so any
+  // number that starts with 0, including just 0, is octal.
+  // A decimal number is defined as beginning "with a nonzero digit and
+  // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1)
+  const bool zero_prefixed = src_len > 0 && src[0] == L'0';
+  return zero_prefixed ? 8 : 10;
+}
+
+// Parses an integer of type T from at most src_len wide characters of src:
+// leading whitespace, an optional sign, an optional 0x/0X prefix (base 16
+// only), then digits of the given base. A base of 0 is inferred from the
+// prefix; a negative base, 1, or a base above 36 gives EINVAL. Yields the
+// value, the characters consumed (0 if no digit), and ERANGE when clamped.
+template <class T>
+LIBC_INLINE StrToNumResult<T>
+wcstointeger(const wchar_t *__restrict src, int base,
+             const size_t src_len = cpp::numeric_limits<size_t>::max()) {
+  using ResultType = make_integral_or_big_int_unsigned_t<T>;
+
+  if (src_len == 0)
+    return {0, 0, 0};
+  if (base < 0 || base == 1 || base > 36)
+    return {0, 0, EINVAL};
+
+  size_t src_cur =
+      static_cast<size_t>(first_non_whitespace(src, src_len) - src);
+
+  // Look for a sign only while still inside the buffer: if all src_len
+  // characters are whitespace, src[src_cur] is past the end, and a sign read
+  // from there would make src_len - src_cur wrap around below.
+  wchar_t result_sign = L'+';
+  if (src_cur < src_len && (src[src_cur] == L'+' || src[src_cur] == L'-')) {
+    result_sign = src[src_cur];
+    ++src_cur;
+  }
+
+  if (base == 0)
+    base = infer_base(src + src_cur, src_len - src_cur);
+  if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur))
+    src_cur = src_cur + 2;
+
+  constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>;
+  const bool is_positive = (result_sign == L'+');
+
+  // Magnitudes are unsigned; a negative signed value may reach max() + 1.
+  ResultType constexpr NEGATIVE_MAX =
+      !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1
+                   : cpp::numeric_limits<T>::max();
+  ResultType const abs_max =
+      (is_positive ? cpp::numeric_limits<T>::max() : NEGATIVE_MAX);
+  ResultType const abs_max_div_by_base =
+      abs_max / static_cast<ResultType>(base);
+
+  ResultType result = 0;
+  bool is_number = false;
+  int error_val = 0;
+  while (src_cur < src_len && iswalnum(src[src_cur])) {
+    int cur_digit = b36_wchar_to_int(src[src_cur]);
+    if (cur_digit >= base)
+      break;
+    is_number = true;
+    ++src_cur;
+
+    // Once the result has saturated at abs_max it cannot change, but the
+    // remaining digits of the number still have to be consumed.
+    if (result == abs_max) {
+      error_val = ERANGE;
+      continue;
+    }
+    if (result > abs_max_div_by_base) {
+      result = abs_max;
+      error_val = ERANGE;
+    } else {
+      result = result * static_cast<ResultType>(base);
+    }
+    if (result > abs_max - static_cast<ResultType>(cur_digit)) {
+      result = abs_max;
+      error_val = ERANGE;
+    } else {
+      result = result + static_cast<ResultType>(cur_digit);
+    }
+  }
+
+  ptrdiff_t str_len = is_number ? static_cast<ptrdiff_t>(src_cur) : 0;
+  if (error_val == ERANGE) {
+    if (is_positive || IS_UNSIGNED)
+      return {cpp::numeric_limits<T>::max(), str_len, error_val};
+    else // T is signed and there is a negative overflow
+      return {cpp::numeric_limits<T>::min(), str_len, error_val};
+  }
+  return {static_cast<T>(is_positive ? result : -result), str_len, error_val};
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 9f626ed31cc07..e54d7a5c9638b 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -141,6 +141,17 @@ add_libc_test(
     libc.src.__support.str_to_integer
 )
 
+add_libc_test(
+  wcs_to_integer_test # Unit tests for the internal wcstointeger() helper.
+  SUITE
+    libc-support-tests
+  SRCS
+    wcs_to_integer_test.cpp
+  DEPENDS # NOTE(review): test includes libc_errno.h, not integer_literals.h.
+    libc.src.__support.integer_literals
+    libc.src.__support.wcs_to_integer
+)
+
 add_libc_test(
   integer_to_string_test
   SUITE
diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp
new file mode 100644
index 0000000000000..e4107929c15fc
--- /dev/null
+++ b/libc/test/src/__support/wcs_to_integer_test.cpp
@@ -0,0 +1,239 @@
+//===-- Unittests for wcs_to_integer --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/libc_errno.h"
+#include "src/__support/wcs_to_integer.h"
+#include <stddef.h>
+
+#include "test/UnitTest/Test.h"
+
+// This file is for testing the src_len argument and other internal interface
+// features. Primary testing is done through the public interface.
+
+TEST(LlvmLibcStrToIntegerTest, SimpleLength) {
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(5));
+  ASSERT_EQ(res.value, 12345);
+  // A src_len of 2 limits the parse to the first two digits.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 2);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(2));
+  ASSERT_EQ(res.value, 12);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) {
+  // src_len may exceed the string: the parse ends after the last digit.
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 15);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(10));
+  ASSERT_EQ(res.value, 12345);
+  // src_len covers exactly the five spaces and the five digits.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(10));
+  ASSERT_EQ(res.value, 12345);
+  // src_len covers the spaces and the first two digits.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 7);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(7));
+  ASSERT_EQ(res.value, 12);
+  // src_len covers only the spaces, so no digit is parsed.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 5);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, LeadingSign) {
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 12345);
+  // A leading minus sign negates the value.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, -12345);
+  // src_len equal to the string length, sign included.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 6);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 12345);
+  // Same length, negative number.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 6);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, -12345);
+  // src_len covers the sign and the first two digits.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 3);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(3));
+  ASSERT_EQ(res.value, 12);
+  // Same length, negative number.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 3);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(3));
+  ASSERT_EQ(res.value, -12);
+  // src_len covers only the sign, so nothing is parsed.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 1);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+  // Same length, minus sign.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 1);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+  // Same length, negative number.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) {
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(7));
+  ASSERT_EQ(res.value, 0x12345);
+  // src_len equal to the string length.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 7);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(7));
+  ASSERT_EQ(res.value, 0x12345);
+  // src_len covers the prefix and the first three hex digits.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 5);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(5));
+  ASSERT_EQ(res.value, 0x123);
+  // Only "0x" is in range, so just the L'0' is consumed.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 2);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(1));
+  ASSERT_EQ(res.value, 0);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) {
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(7));
+  ASSERT_EQ(res.value, 0x12345);
+  // src_len equal to the string length.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 7);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(7));
+  ASSERT_EQ(res.value, 0x12345);
+  // src_len covers the prefix and the first three hex digits.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 5);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(5));
+  ASSERT_EQ(res.value, 0x123);
+  // Only "0x" is in range, so just the L'0' is consumed.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 2);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(1));
+  ASSERT_EQ(res.value, 0);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) {
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 012345);
+  // src_len equal to the string length.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 6);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 012345);
+  // src_len covers the first four characters.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 4);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(4));
+  ASSERT_EQ(res.value, 0123);
+  // src_len covers only the leading zero.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 1);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(1));
+  ASSERT_EQ(res.value, 0);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) {
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 012345);
+  // src_len equal to the string length.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 6);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 012345);
+  // src_len covers the first four characters.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 4);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(4));
+  ASSERT_EQ(res.value, 0123);
+  // src_len covers only the leading zero.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 1);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(1));
+  ASSERT_EQ(res.value, 0);
+  // A src_len of 0 parses nothing.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 0);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(0));
+  ASSERT_EQ(res.value, 0);
+}
+
+TEST(LlvmLibcStrToIntegerTest, CombinedTests) {
+  // Whitespace, sign, hex prefix and digits, with the base auto-detected.
+  auto res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"    -0x123", 0, 10);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(10));
+  ASSERT_EQ(res.value, -0x123);
+  // src_len cuts the number off after the first hex digit.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"    -0x123", 0, 8);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(8));
+  ASSERT_EQ(res.value, -0x1);
+  // src_len ends right after "0x", so only the L'0' counts as a digit.
+  res = LIBC_NAMESPACE::internal::wcstointeger<int>(L"    -0x123", 0, 7);
+  EXPECT_FALSE(res.has_error());
+  EXPECT_EQ(res.parsed_len, static_cast<ptrdiff_t>(6));
+  ASSERT_EQ(res.value, 0);
+}

Copy link
Contributor

@michaelrj-google michaelrj-google left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM with one change

@uzairnawaz uzairnawaz merged commit d93cc7a into llvm:main Jul 10, 2025
19 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants