diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b9bc904471df9..ce5d0660ff81f 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -117,8 +117,8 @@ add_entrypoint_object( sscanf.h DEPENDS libc.src.__support.arg_list - libc.src.stdio.scanf_core.reader libc.src.stdio.scanf_core.scanf_main + libc.src.stdio.scanf_core.string_reader ) add_entrypoint_object( @@ -129,8 +129,8 @@ add_entrypoint_object( vsscanf.h DEPENDS libc.src.__support.arg_list - libc.src.stdio.scanf_core.reader libc.src.stdio.scanf_core.scanf_main + libc.src.stdio.scanf_core.string_reader ) add_entrypoint_object( diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt index 014413ccaa8da..d0d922ac823fe 100644 --- a/libc/src/stdio/scanf_core/CMakeLists.txt +++ b/libc/src/stdio/scanf_core/CMakeLists.txt @@ -61,10 +61,8 @@ if(NOT(TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD AND return() endif() -add_object_library( +add_header_library( scanf_main - SRCS - scanf_main.cpp HDRS scanf_main.h DEPENDS @@ -83,18 +81,19 @@ add_header_library( reader.h DEPENDS libc.src.__support.macros.attributes - ${file_deps} - ${use_system_file} ) -add_object_library( +add_header_library( + string_reader + HDRS + string_reader.h + DEPENDS + .reader + libc.src.__support.macros.attributes +) + +add_header_library( converter - SRCS - converter.cpp - string_converter.cpp - int_converter.cpp - float_converter.cpp - ptr_converter.cpp HDRS converter.h converter_utils.h diff --git a/libc/src/stdio/scanf_core/converter.cpp b/libc/src/stdio/scanf_core/converter.cpp deleted file mode 100644 index b1ee8cd1e74bb..0000000000000 --- a/libc/src/stdio/scanf_core/converter.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===-- Format specifier converter implmentation for scanf -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/scanf_core/converter.h" - -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/reader.h" - -#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT -#include "src/stdio/scanf_core/float_converter.h" -#endif // LIBC_COPT_SCANF_DISABLE_FLOAT -#include "src/stdio/scanf_core/current_pos_converter.h" -#include "src/stdio/scanf_core/int_converter.h" -#include "src/stdio/scanf_core/ptr_converter.h" -#include "src/stdio/scanf_core/string_converter.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace scanf_core { - -int convert(Reader *reader, const FormatSection &to_conv) { - int ret_val = 0; - switch (to_conv.conv_name) { - case '%': - return raw_match(reader, "%"); - case 's': - ret_val = raw_match(reader, " "); - if (ret_val != READ_OK) - return ret_val; - return convert_string(reader, to_conv); - case 'c': - case '[': - return convert_string(reader, to_conv); - case 'd': - case 'i': - case 'u': - case 'o': - case 'x': - case 'X': - ret_val = raw_match(reader, " "); - if (ret_val != READ_OK) - return ret_val; - return convert_int(reader, to_conv); -#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT - case 'f': - case 'F': - case 'e': - case 'E': - case 'a': - case 'A': - case 'g': - case 'G': - ret_val = raw_match(reader, " "); - if (ret_val != READ_OK) - return ret_val; - return convert_float(reader, to_conv); -#endif // LIBC_COPT_SCANF_DISABLE_FLOAT - case 'n': - return convert_current_pos(reader, to_conv); - case 'p': - ret_val = raw_match(reader, " "); - if (ret_val != READ_OK) - return ret_val; - return convert_pointer(reader, to_conv); - default: - return raw_match(reader, to_conv.raw_string); - } - return -1; -} - -// raw_string is assumed to have a positive size. -int raw_match(Reader *reader, cpp::string_view raw_string) { - char cur_char = reader->getc(); - int ret_val = READ_OK; - for (size_t i = 0; i < raw_string.size(); ++i) { - // Any space character matches any number of space characters. - if (internal::isspace(raw_string[i])) { - while (internal::isspace(cur_char)) { - cur_char = reader->getc(); - } - } else { - if (raw_string[i] == cur_char) { - cur_char = reader->getc(); - } else { - ret_val = MATCHING_FAILURE; - break; - } - } - } - reader->ungetc(cur_char); - return ret_val; -} - -} // namespace scanf_core -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/converter.h b/libc/src/stdio/scanf_core/converter.h index 3f514eeb75bdf..3df43e9de1e25 100644 --- a/libc/src/stdio/scanf_core/converter.h +++ b/libc/src/stdio/scanf_core/converter.h @@ -10,10 +10,19 @@ #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CONVERTER_H #include "src/__support/CPP/string_view.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/reader.h" +#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT +#include "src/stdio/scanf_core/float_converter.h" +#endif // LIBC_COPT_SCANF_DISABLE_FLOAT +#include "src/stdio/scanf_core/current_pos_converter.h" +#include "src/stdio/scanf_core/int_converter.h" +#include "src/stdio/scanf_core/ptr_converter.h" +#include "src/stdio/scanf_core/string_converter.h" + #include namespace LIBC_NAMESPACE_DECL { @@ -22,11 +31,81 @@ namespace scanf_core { // convert will call a conversion function to convert the FormatSection into // its string representation, and then that will write the result to the // reader. -int convert(Reader *reader, const FormatSection &to_conv); +template +int convert(Reader *reader, const FormatSection &to_conv) { + int ret_val = 0; + switch (to_conv.conv_name) { + case '%': + return raw_match(reader, "%"); + case 's': + ret_val = raw_match(reader, " "); + if (ret_val != READ_OK) + return ret_val; + return convert_string(reader, to_conv); + case 'c': + case '[': + return convert_string(reader, to_conv); + case 'd': + case 'i': + case 'u': + case 'o': + case 'x': + case 'X': + ret_val = raw_match(reader, " "); + if (ret_val != READ_OK) + return ret_val; + return convert_int(reader, to_conv); +#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT + case 'f': + case 'F': + case 'e': + case 'E': + case 'a': + case 'A': + case 'g': + case 'G': + ret_val = raw_match(reader, " "); + if (ret_val != READ_OK) + return ret_val; + return convert_float(reader, to_conv); +#endif // LIBC_COPT_SCANF_DISABLE_FLOAT + case 'n': + return convert_current_pos(reader, to_conv); + case 'p': + ret_val = raw_match(reader, " "); + if (ret_val != READ_OK) + return ret_val; + return convert_pointer(reader, to_conv); + default: + return raw_match(reader, to_conv.raw_string); + } + return -1; +} // raw_match takes a raw string and matches it to the characters obtained from // the reader. -int raw_match(Reader *reader, cpp::string_view raw_string); +template +int raw_match(Reader *reader, cpp::string_view raw_string) { + char cur_char = reader->getc(); + int ret_val = READ_OK; + for (size_t i = 0; i < raw_string.size(); ++i) { + // Any space character matches any number of space characters. + if (internal::isspace(raw_string[i])) { + while (internal::isspace(cur_char)) { + cur_char = reader->getc(); + } + } else { + if (raw_string[i] == cur_char) { + cur_char = reader->getc(); + } else { + ret_val = MATCHING_FAILURE; + break; + } + } + } + reader->ungetc(cur_char); + return ret_val; +} } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/current_pos_converter.h b/libc/src/stdio/scanf_core/current_pos_converter.h index 8af1cc0ca0c27..8708490c82d3e 100644 --- a/libc/src/stdio/scanf_core/current_pos_converter.h +++ b/libc/src/stdio/scanf_core/current_pos_converter.h @@ -19,7 +19,8 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -LIBC_INLINE int convert_current_pos(Reader *reader, +template +LIBC_INLINE int convert_current_pos(Reader *reader, const FormatSection &to_conv) { write_int_with_length(reader->chars_read(), to_conv); return READ_OK; diff --git a/libc/src/stdio/scanf_core/float_converter.cpp b/libc/src/stdio/scanf_core/float_converter.cpp deleted file mode 100644 index 9c714d0727214..0000000000000 --- a/libc/src/stdio/scanf_core/float_converter.cpp +++ /dev/null @@ -1,229 +0,0 @@ -//===-- Int type specifier converters for scanf -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/scanf_core/float_converter.h" - -#include "src/__support/CPP/limits.h" -#include "src/__support/char_vector.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/converter_utils.h" -#include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/reader.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace scanf_core { - -// All of the floating point conversions are the same for scanf, every name will -// accept every style. -int convert_float(Reader *reader, const FormatSection &to_conv) { - // %a/A/e/E/f/F/g/G "Matches an optionally signed floating-point number, - // infinity, or NaN, whose format is the same as expected for the subject - // sequence of the strtod function. The corresponding argument shall be a - // pointer to floating." - - CharVector out_str = CharVector(); - bool is_number = false; - - size_t max_width = cpp::numeric_limits::max(); - if (to_conv.max_width > 0) { - max_width = to_conv.max_width; - } - - char cur_char = reader->getc(); - // Handle the sign. - if (cur_char == '+' || cur_char == '-') { - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - if (out_str.length() == max_width) { - return MATCHING_FAILURE; - } else { - cur_char = reader->getc(); - } - } - - static constexpr char DECIMAL_POINT = '.'; - static const char inf_string[] = "infinity"; - - // Handle inf - - if (internal::tolower(cur_char) == inf_string[0]) { - size_t inf_index = 0; - - for (; - inf_index < (sizeof(inf_string) - 1) && out_str.length() < max_width && - internal::tolower(cur_char) == inf_string[inf_index]; - ++inf_index) { - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - cur_char = reader->getc(); - } - - if (inf_index == 3 || inf_index == sizeof(inf_string) - 1) { - write_float_with_length(out_str.c_str(), to_conv); - return READ_OK; - } else { - return MATCHING_FAILURE; - } - } - - static const char nan_string[] = "nan"; - - // Handle nan - if (internal::tolower(cur_char) == nan_string[0]) { - size_t nan_index = 0; - - for (; - nan_index < (sizeof(nan_string) - 1) && out_str.length() < max_width && - internal::tolower(cur_char) == nan_string[nan_index]; - ++nan_index) { - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - cur_char = reader->getc(); - } - - if (nan_index == sizeof(nan_string) - 1) { - write_float_with_length(out_str.c_str(), to_conv); - return READ_OK; - } else { - return MATCHING_FAILURE; - } - } - - // Assume base of 10 by default but check if it is actually base 16. - int base = 10; - - // If the string starts with 0 it might be in hex. - if (cur_char == '0') { - is_number = true; - // Read the next character to check. - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - // If we've hit the end, then this is "0", which is valid. - if (out_str.length() == max_width) { - write_float_with_length(out_str.c_str(), to_conv); - return READ_OK; - } else { - cur_char = reader->getc(); - } - - // If that next character is an 'x' then this is a hexadecimal number. - if (internal::tolower(cur_char) == 'x') { - base = 16; - - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - // If we've hit the end here, we have "0x" which is a valid prefix to a - // floating point number, and will be evaluated to 0. - if (out_str.length() == max_width) { - write_float_with_length(out_str.c_str(), to_conv); - return READ_OK; - } else { - cur_char = reader->getc(); - } - } - } - - const char exponent_mark = ((base == 10) ? 'e' : 'p'); - bool after_decimal = false; - - // The format for the remaining characters at this point is DD.DDe+/-DD for - // base 10 and XX.XXp+/-DD for base 16 - - // This handles the digits before and after the decimal point, but not the - // exponent. - while (out_str.length() < max_width) { - if (internal::isalnum(cur_char) && - internal::b36_char_to_int(cur_char) < base) { - is_number = true; - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - cur_char = reader->getc(); - } else if (cur_char == DECIMAL_POINT && !after_decimal) { - after_decimal = true; - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - cur_char = reader->getc(); - } else { - break; - } - } - - // Handle the exponent, which has an exponent mark, an optional sign, and - // decimal digits. - if (internal::tolower(cur_char) == exponent_mark) { - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - if (out_str.length() == max_width) { - // This is laid out in the standard as being a matching error (100e is not - // a valid float) but may conflict with existing implementations. - return MATCHING_FAILURE; - } else { - cur_char = reader->getc(); - } - - if (cur_char == '+' || cur_char == '-') { - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - if (out_str.length() == max_width) { - return MATCHING_FAILURE; - } else { - cur_char = reader->getc(); - } - } - - // It is specified by the standard that "100er" is a matching failure since - // the longest prefix of a possibly valid floating-point number (which is - // "100e") is not a valid floating-point number. If there is an exponent - // mark then there must be a digit after it else the number is not valid. - // Some implementations will roll back two characters (to just "100") and - // accept that since the prefix is not valid, and some will interpret an - // exponent mark followed by no digits as an additional exponent of 0 - // (accepting "100e" and returning 100.0). Both of these behaviors are wrong - // by the standard, but they may be used in real code, see Hyrum's law. This - // code follows the standard, but may be incompatible due to code expecting - // these bugs. - if (!internal::isdigit(cur_char)) { - return MATCHING_FAILURE; - } - - while (internal::isdigit(cur_char) && out_str.length() < max_width) { - if (!out_str.append(cur_char)) { - return ALLOCATION_FAILURE; - } - cur_char = reader->getc(); - } - } - - // We always read one more character than will be used, so we have to put the - // last one back. - reader->ungetc(cur_char); - - // If we haven't actually found any digits, this is a matching failure (this - // catches cases like "+.") - if (!is_number) { - return MATCHING_FAILURE; - } - write_float_with_length(out_str.c_str(), to_conv); - - return READ_OK; -} - -} // namespace scanf_core -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/float_converter.h b/libc/src/stdio/scanf_core/float_converter.h index bd44847830fd1..6bbba379e13b6 100644 --- a/libc/src/stdio/scanf_core/float_converter.h +++ b/libc/src/stdio/scanf_core/float_converter.h @@ -9,7 +9,11 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_FLOAT_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_FLOAT_CONVERTER_H +#include "src/__support/CPP/limits.h" +#include "src/__support/char_vector.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" +#include "src/stdio/scanf_core/converter_utils.h" #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/reader.h" @@ -18,7 +22,210 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -int convert_float(Reader *reader, const FormatSection &to_conv); +// All of the floating point conversions are the same for scanf, every name will +// accept every style. +template +int convert_float(Reader *reader, const FormatSection &to_conv) { + // %a/A/e/E/f/F/g/G "Matches an optionally signed floating-point number, + // infinity, or NaN, whose format is the same as expected for the subject + // sequence of the strtod function. The corresponding argument shall be a + // pointer to floating." + + CharVector out_str = CharVector(); + bool is_number = false; + + size_t max_width = cpp::numeric_limits::max(); + if (to_conv.max_width > 0) { + max_width = to_conv.max_width; + } + + char cur_char = reader->getc(); + // Handle the sign. + if (cur_char == '+' || cur_char == '-') { + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + if (out_str.length() == max_width) { + return MATCHING_FAILURE; + } else { + cur_char = reader->getc(); + } + } + + static constexpr char DECIMAL_POINT = '.'; + static const char inf_string[] = "infinity"; + + // Handle inf + + if (internal::tolower(cur_char) == inf_string[0]) { + size_t inf_index = 0; + + for (; + inf_index < (sizeof(inf_string) - 1) && out_str.length() < max_width && + internal::tolower(cur_char) == inf_string[inf_index]; + ++inf_index) { + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + cur_char = reader->getc(); + } + + if (inf_index == 3 || inf_index == sizeof(inf_string) - 1) { + write_float_with_length(out_str.c_str(), to_conv); + return READ_OK; + } else { + return MATCHING_FAILURE; + } + } + + static const char nan_string[] = "nan"; + + // Handle nan + if (internal::tolower(cur_char) == nan_string[0]) { + size_t nan_index = 0; + + for (; + nan_index < (sizeof(nan_string) - 1) && out_str.length() < max_width && + internal::tolower(cur_char) == nan_string[nan_index]; + ++nan_index) { + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + cur_char = reader->getc(); + } + + if (nan_index == sizeof(nan_string) - 1) { + write_float_with_length(out_str.c_str(), to_conv); + return READ_OK; + } else { + return MATCHING_FAILURE; + } + } + + // Assume base of 10 by default but check if it is actually base 16. + int base = 10; + + // If the string starts with 0 it might be in hex. + if (cur_char == '0') { + is_number = true; + // Read the next character to check. + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + // If we've hit the end, then this is "0", which is valid. + if (out_str.length() == max_width) { + write_float_with_length(out_str.c_str(), to_conv); + return READ_OK; + } else { + cur_char = reader->getc(); + } + + // If that next character is an 'x' then this is a hexadecimal number. + if (internal::tolower(cur_char) == 'x') { + base = 16; + + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + // If we've hit the end here, we have "0x" which is a valid prefix to a + // floating point number, and will be evaluated to 0. + if (out_str.length() == max_width) { + write_float_with_length(out_str.c_str(), to_conv); + return READ_OK; + } else { + cur_char = reader->getc(); + } + } + } + + const char exponent_mark = ((base == 10) ? 'e' : 'p'); + bool after_decimal = false; + + // The format for the remaining characters at this point is DD.DDe+/-DD for + // base 10 and XX.XXp+/-DD for base 16 + + // This handles the digits before and after the decimal point, but not the + // exponent. + while (out_str.length() < max_width) { + if (internal::isalnum(cur_char) && + internal::b36_char_to_int(cur_char) < base) { + is_number = true; + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + cur_char = reader->getc(); + } else if (cur_char == DECIMAL_POINT && !after_decimal) { + after_decimal = true; + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + cur_char = reader->getc(); + } else { + break; + } + } + + // Handle the exponent, which has an exponent mark, an optional sign, and + // decimal digits. + if (internal::tolower(cur_char) == exponent_mark) { + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + if (out_str.length() == max_width) { + // This is laid out in the standard as being a matching error (100e is not + // a valid float) but may conflict with existing implementations. + return MATCHING_FAILURE; + } else { + cur_char = reader->getc(); + } + + if (cur_char == '+' || cur_char == '-') { + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + if (out_str.length() == max_width) { + return MATCHING_FAILURE; + } else { + cur_char = reader->getc(); + } + } + + // It is specified by the standard that "100er" is a matching failure since + // the longest prefix of a possibly valid floating-point number (which is + // "100e") is not a valid floating-point number. If there is an exponent + // mark then there must be a digit after it else the number is not valid. + // Some implementations will roll back two characters (to just "100") and + // accept that since the prefix is not valid, and some will interpret an + // exponent mark followed by no digits as an additional exponent of 0 + // (accepting "100e" and returning 100.0). Both of these behaviors are wrong + // by the standard, but they may be used in real code, see Hyrum's law. This + // code follows the standard, but may be incompatible due to code expecting + // these bugs. + if (!internal::isdigit(cur_char)) { + return MATCHING_FAILURE; + } + + while (internal::isdigit(cur_char) && out_str.length() < max_width) { + if (!out_str.append(cur_char)) { + return ALLOCATION_FAILURE; + } + cur_char = reader->getc(); + } + } + + // We always read one more character than will be used, so we have to put the + // last one back. + reader->ungetc(cur_char); + + // If we haven't actually found any digits, this is a matching failure (this + // catches cases like "+.") + if (!is_number) { + return MATCHING_FAILURE; + } + write_float_with_length(out_str.c_str(), to_conv); + + return READ_OK; +} } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/int_converter.cpp b/libc/src/stdio/scanf_core/int_converter.cpp deleted file mode 100644 index fce817245c010..0000000000000 --- a/libc/src/stdio/scanf_core/int_converter.cpp +++ /dev/null @@ -1,230 +0,0 @@ -//===-- Int type specifier converters for scanf -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/scanf_core/int_converter.h" - -#include "src/__support/CPP/limits.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/converter_utils.h" -#include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/reader.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace scanf_core { - -// This code is very similar to the code in __support/str_to_integer.h but is -// not quite the same. Here is the list of differences and why they exist: -// 1) This takes a reader and a format section instead of a char* and the base. -// This should be fairly self explanatory. While the char* could be adapted -// to a reader and the base could be calculated ahead of time, the -// semantics are slightly different, specifically a char* can be indexed -// freely (I can read str[2] and then str[0]) whereas a File (which the -// reader may contain) cannot. -// 2) Because this uses a Reader, this function can only unget once. -// This is relevant because scanf specifies it reads the "longest sequence -// of input characters which does not exceed any specified field width and -// which is, or is a prefix of, a matching input sequence." Whereas the -// strtol function accepts "the longest initial subsequence of the input -// string (...) that is of the expected form." This is demonstrated by the -// differences in how they deal with the string "0xZZZ" when parsing as -// hexadecimal. Scanf will read the "0x" as a valid prefix and return 0, -// since it reads the first 'Z', sees that it's not a valid hex digit, and -// reverses one character. The strtol function on the other hand only -// accepts the "0" since that's the longest valid hexadecimal sequence. It -// sees the 'Z' after the "0x" and determines that this is not the prefix -// to a valid hex string. -// 3) This conversion may have a maximum width. -// If a maximum width is specified, this conversion is only allowed to -// accept a certain number of characters. Strtol doesn't have any such -// limitation. -int convert_int(Reader *reader, const FormatSection &to_conv) { - // %d "Matches an optionally signed decimal integer [...] with the value 10 - // for the base argument. The corresponding argument shall be a pointer to - // signed integer." - - // %i "Matches an optionally signed integer [...] with the value 0 for the - // base argument. The corresponding argument shall be a pointer to signed - // integer." - - // %u "Matches an optionally signed decimal integer [...] with the value 10 - // for the base argument. The corresponding argument shall be a pointer to - // unsigned integer" - - // %o "Matches an optionally signed octal integer [...] with the value 8 for - // the base argument. The corresponding argument shall be a pointer to - // unsigned integer" - - // %x/X "Matches an optionally signed hexadecimal integer [...] with the value - // 16 for the base argument. The corresponding argument shall be a pointer to - // unsigned integer" - - size_t max_width = cpp::numeric_limits::max(); - if (to_conv.max_width > 0) { - max_width = to_conv.max_width; - } - - uintmax_t result = 0; - bool is_number = false; - bool is_signed = false; - int base = 0; - if (to_conv.conv_name == 'i') { - base = 0; - is_signed = true; - } else if (to_conv.conv_name == 'o') { - base = 8; - } else if (internal::tolower(to_conv.conv_name) == 'x' || - to_conv.conv_name == 'p') { - base = 16; - } else if (to_conv.conv_name == 'd') { - base = 10; - is_signed = true; - } else { // conv_name must be 'u' - base = 10; - } - - char cur_char = reader->getc(); - - char result_sign = '+'; - if (cur_char == '+' || cur_char == '-') { - result_sign = cur_char; - if (max_width > 1) { - --max_width; - cur_char = reader->getc(); - } else { - // If the max width has been hit already, then the return value must be 0 - // since no actual digits of the number have been parsed yet. - write_int_with_length(0, to_conv); - return MATCHING_FAILURE; - } - } - const bool is_negative = result_sign == '-'; - - // Base of 0 means automatically determine the base. Base of 16 may have a - // prefix of "0x" - if (base == 0 || base == 16) { - // If the first character is 0, then it could be octal or hex. - if (cur_char == '0') { - is_number = true; - - // Read the next character to check. - if (max_width > 1) { - --max_width; - cur_char = reader->getc(); - } else { - write_int_with_length(0, to_conv); - return READ_OK; - } - - if (internal::tolower(cur_char) == 'x') { - // This is a valid hex prefix. - - is_number = false; - // A valid hex prefix is not necessarily a valid number. For the - // conversion to be valid it needs to use all of the characters it - // consumes. From the standard: - // 7.23.6.2 paragraph 9: "An input item is defined as the longest - // sequence of input characters which does not exceed any specified - // field width and which is, or is a prefix of, a matching input - // sequence." - // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence, - // the execution of the directive fails: this condition is a matching - // failure" - base = 16; - if (max_width > 1) { - --max_width; - cur_char = reader->getc(); - } else { - return MATCHING_FAILURE; - } - - } else { - if (base == 0) { - base = 8; - } - } - } else if (base == 0) { - if (internal::isdigit(cur_char)) { - // If the first character is a different number, then it's 10. - base = 10; - } else { - // If the first character isn't a valid digit, then there are no valid - // digits at all. The number is 0. - reader->ungetc(cur_char); - write_int_with_length(0, to_conv); - return MATCHING_FAILURE; - } - } - } - - constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits::max(); - constexpr uintmax_t SIGNED_MAX = - static_cast(cpp::numeric_limits::max()); - constexpr uintmax_t NEGATIVE_SIGNED_MAX = - static_cast(cpp::numeric_limits::max()) + 1; - - const uintmax_t MAX = - (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) - : UNSIGNED_MAX); - - const uintmax_t max_div_by_base = MAX / base; - - if (internal::isalnum(cur_char) && - internal::b36_char_to_int(cur_char) < base) { - is_number = true; - } - - bool has_overflow = false; - size_t i = 0; - for (; i < max_width && internal::isalnum(cur_char) && - internal::b36_char_to_int(cur_char) < base; - ++i, cur_char = reader->getc()) { - - uintmax_t cur_digit = internal::b36_char_to_int(cur_char); - - if (result == MAX) { - has_overflow = true; - continue; - } else if (result > max_div_by_base) { - result = MAX; - has_overflow = true; - } else { - result = result * base; - } - - if (result > MAX - cur_digit) { - result = MAX; - has_overflow = true; - } else { - result = result + cur_digit; - } - } - - // We always read one more character than will be used, so we have to put the - // last one back. - reader->ungetc(cur_char); - - if (!is_number) - return MATCHING_FAILURE; - - if (has_overflow) { - write_int_with_length(MAX, to_conv); - } else { - if (is_negative) - result = -result; - - write_int_with_length(result, to_conv); - } - - return READ_OK; -} - -} // namespace scanf_core -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/int_converter.h b/libc/src/stdio/scanf_core/int_converter.h index 5fc27ad0faafc..35f11d67d4701 100644 --- a/libc/src/stdio/scanf_core/int_converter.h +++ b/libc/src/stdio/scanf_core/int_converter.h @@ -9,7 +9,10 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H +#include "src/__support/CPP/limits.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" +#include "src/stdio/scanf_core/converter_utils.h" #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/reader.h" @@ -18,7 +21,212 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -int convert_int(Reader *reader, const FormatSection &to_conv); +// This code is very similar to the code in __support/str_to_integer.h but is +// not quite the same. Here is the list of differences and why they exist: +// 1) This takes a reader and a format section instead of a char* and the base. +// This should be fairly self explanatory. While the char* could be adapted +// to a reader and the base could be calculated ahead of time, the +// semantics are slightly different, specifically a char* can be indexed +// freely (I can read str[2] and then str[0]) whereas a File (which the +// reader may contain) cannot. +// 2) Because this uses a Reader, this function can only unget once. +// This is relevant because scanf specifies it reads the "longest sequence +// of input characters which does not exceed any specified field width and +// which is, or is a prefix of, a matching input sequence." Whereas the +// strtol function accepts "the longest initial subsequence of the input +// string (...) that is of the expected form." This is demonstrated by the +// differences in how they deal with the string "0xZZZ" when parsing as +// hexadecimal. Scanf will read the "0x" as a valid prefix and return 0, +// since it reads the first 'Z', sees that it's not a valid hex digit, and +// reverses one character. The strtol function on the other hand only +// accepts the "0" since that's the longest valid hexadecimal sequence. It +// sees the 'Z' after the "0x" and determines that this is not the prefix +// to a valid hex string. +// 3) This conversion may have a maximum width. +// If a maximum width is specified, this conversion is only allowed to +// accept a certain number of characters. Strtol doesn't have any such +// limitation. +template +int convert_int(Reader *reader, const FormatSection &to_conv) { + // %d "Matches an optionally signed decimal integer [...] with the value 10 + // for the base argument. The corresponding argument shall be a pointer to + // signed integer." + + // %i "Matches an optionally signed integer [...] with the value 0 for the + // base argument. The corresponding argument shall be a pointer to signed + // integer." + + // %u "Matches an optionally signed decimal integer [...] with the value 10 + // for the base argument. The corresponding argument shall be a pointer to + // unsigned integer" + + // %o "Matches an optionally signed octal integer [...] with the value 8 for + // the base argument. The corresponding argument shall be a pointer to + // unsigned integer" + + // %x/X "Matches an optionally signed hexadecimal integer [...] with the value + // 16 for the base argument. The corresponding argument shall be a pointer to + // unsigned integer" + + size_t max_width = cpp::numeric_limits::max(); + if (to_conv.max_width > 0) { + max_width = to_conv.max_width; + } + + uintmax_t result = 0; + bool is_number = false; + bool is_signed = false; + int base = 0; + if (to_conv.conv_name == 'i') { + base = 0; + is_signed = true; + } else if (to_conv.conv_name == 'o') { + base = 8; + } else if (internal::tolower(to_conv.conv_name) == 'x' || + to_conv.conv_name == 'p') { + base = 16; + } else if (to_conv.conv_name == 'd') { + base = 10; + is_signed = true; + } else { // conv_name must be 'u' + base = 10; + } + + char cur_char = reader->getc(); + + char result_sign = '+'; + if (cur_char == '+' || cur_char == '-') { + result_sign = cur_char; + if (max_width > 1) { + --max_width; + cur_char = reader->getc(); + } else { + // If the max width has been hit already, then the return value must be 0 + // since no actual digits of the number have been parsed yet. + write_int_with_length(0, to_conv); + return MATCHING_FAILURE; + } + } + const bool is_negative = result_sign == '-'; + + // Base of 0 means automatically determine the base. Base of 16 may have a + // prefix of "0x" + if (base == 0 || base == 16) { + // If the first character is 0, then it could be octal or hex. + if (cur_char == '0') { + is_number = true; + + // Read the next character to check. + if (max_width > 1) { + --max_width; + cur_char = reader->getc(); + } else { + write_int_with_length(0, to_conv); + return READ_OK; + } + + if (internal::tolower(cur_char) == 'x') { + // This is a valid hex prefix. + + is_number = false; + // A valid hex prefix is not necessarily a valid number. For the + // conversion to be valid it needs to use all of the characters it + // consumes. From the standard: + // 7.23.6.2 paragraph 9: "An input item is defined as the longest + // sequence of input characters which does not exceed any specified + // field width and which is, or is a prefix of, a matching input + // sequence." + // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence, + // the execution of the directive fails: this condition is a matching + // failure" + base = 16; + if (max_width > 1) { + --max_width; + cur_char = reader->getc(); + } else { + return MATCHING_FAILURE; + } + + } else { + if (base == 0) { + base = 8; + } + } + } else if (base == 0) { + if (internal::isdigit(cur_char)) { + // If the first character is a different number, then it's 10. + base = 10; + } else { + // If the first character isn't a valid digit, then there are no valid + // digits at all. The number is 0. + reader->ungetc(cur_char); + write_int_with_length(0, to_conv); + return MATCHING_FAILURE; + } + } + } + + constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits::max(); + constexpr uintmax_t SIGNED_MAX = + static_cast(cpp::numeric_limits::max()); + constexpr uintmax_t NEGATIVE_SIGNED_MAX = + static_cast(cpp::numeric_limits::max()) + 1; + + const uintmax_t MAX = + (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) + : UNSIGNED_MAX); + + const uintmax_t max_div_by_base = MAX / base; + + if (internal::isalnum(cur_char) && + internal::b36_char_to_int(cur_char) < base) { + is_number = true; + } + + bool has_overflow = false; + size_t i = 0; + for (; i < max_width && internal::isalnum(cur_char) && + internal::b36_char_to_int(cur_char) < base; + ++i, cur_char = reader->getc()) { + + uintmax_t cur_digit = internal::b36_char_to_int(cur_char); + + if (result == MAX) { + has_overflow = true; + continue; + } else if (result > max_div_by_base) { + result = MAX; + has_overflow = true; + } else { + result = result * base; + } + + if (result > MAX - cur_digit) { + result = MAX; + has_overflow = true; + } else { + result = result + cur_digit; + } + } + + // We always read one more character than will be used, so we have to put the + // last one back. + reader->ungetc(cur_char); + + if (!is_number) + return MATCHING_FAILURE; + + if (has_overflow) { + write_int_with_length(MAX, to_conv); + } else { + if (is_negative) + result = -result; + + write_int_with_length(result, to_conv); + } + + return READ_OK; +} } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/ptr_converter.cpp b/libc/src/stdio/scanf_core/ptr_converter.cpp deleted file mode 100644 index 37f002d3da698..0000000000000 --- a/libc/src/stdio/scanf_core/ptr_converter.cpp +++ /dev/null @@ -1,46 +0,0 @@ -//===-- Int type specifier converters for scanf -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/scanf_core/ptr_converter.h" - -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/converter_utils.h" -#include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/int_converter.h" -#include "src/stdio/scanf_core/reader.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace scanf_core { -int convert_pointer(Reader *reader, const FormatSection &to_conv) { - static const char nullptr_string[] = "(nullptr)"; - - // Check if it's exactly the nullptr string, if so then it's a nullptr. - char cur_char = reader->getc(); - size_t i = 0; - for (; i < (sizeof(nullptr_string) - 1) && - internal::tolower(cur_char) == nullptr_string[i]; - ++i) { - cur_char = reader->getc(); - } - if (i == (sizeof(nullptr_string) - 1)) { - *reinterpret_cast(to_conv.output_ptr) = nullptr; - return READ_OK; - } else if (i > 0) { - return MATCHING_FAILURE; - } - - reader->ungetc(cur_char); - - // Else treat it as a hex int - return convert_int(reader, to_conv); -} -} // namespace scanf_core -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/ptr_converter.h b/libc/src/stdio/scanf_core/ptr_converter.h index 0732c1c0e3284..e74a17eaac4cf 100644 --- a/libc/src/stdio/scanf_core/ptr_converter.h +++ b/libc/src/stdio/scanf_core/ptr_converter.h @@ -9,8 +9,10 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PTR_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PTR_CONVERTER_H +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/stdio/scanf_core/core_structs.h" +#include "src/stdio/scanf_core/int_converter.h" #include "src/stdio/scanf_core/reader.h" #include @@ -18,7 +20,30 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -int convert_pointer(Reader *reader, const FormatSection &to_conv); +template +int convert_pointer(Reader *reader, const FormatSection &to_conv) { + static const char nullptr_string[] = "(nullptr)"; + + // Check if it's exactly the nullptr string, if so then it's a nullptr. + char cur_char = reader->getc(); + size_t i = 0; + for (; i < (sizeof(nullptr_string) - 1) && + internal::tolower(cur_char) == nullptr_string[i]; + ++i) { + cur_char = reader->getc(); + } + if (i == (sizeof(nullptr_string) - 1)) { + *reinterpret_cast(to_conv.output_ptr) = nullptr; + return READ_OK; + } else if (i > 0) { + return MATCHING_FAILURE; + } + + reader->ungetc(cur_char); + + // Else treat it as a hex int + return convert_int(reader, to_conv); +} } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/reader.h b/libc/src/stdio/scanf_core/reader.h index 1f8ec9695a314..c71446ea0abed 100644 --- a/libc/src/stdio/scanf_core/reader.h +++ b/libc/src/stdio/scanf_core/reader.h @@ -9,17 +9,6 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_READER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_READER_H -#include "hdr/types/FILE.h" - -#ifndef LIBC_COPT_STDIO_USE_SYSTEM_FILE -#include "src/__support/File/file.h" -#endif - -#if defined(LIBC_TARGET_ARCH_IS_GPU) -#include "src/stdio/getc.h" -#include "src/stdio/ungetc.h" -#endif - #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" @@ -27,103 +16,24 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -// We use the name "reader_internal" over "internal" because -// "internal" causes name lookups in files that include the current header to be -// ambigious i.e. `internal::foo` in those files, will try to lookup in -// `LIBC_NAMESPACE::scanf_core::internal` over `LIBC_NAMESPACE::internal` for -// e.g., `internal::ArgList` in `libc/src/stdio/scanf_core/scanf_main.h` -namespace reader_internal { - -#if defined(LIBC_TARGET_ARCH_IS_GPU) -// The GPU build provides FILE access through the host operating system's -// library. So here we simply use the public entrypoints like in the SYSTEM_FILE -// interface. Entrypoints should normally not call others, this is an exception. -// FIXME: We do not acquire any locks here, so this is not thread safe. -LIBC_INLINE int getc(void *f) { - return LIBC_NAMESPACE::getc(reinterpret_cast<::FILE *>(f)); -} - -LIBC_INLINE void ungetc(int c, void *f) { - LIBC_NAMESPACE::ungetc(c, reinterpret_cast<::FILE *>(f)); -} - -#elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE) - -LIBC_INLINE int getc(void *f) { - unsigned char c; - auto result = - reinterpret_cast(f)->read_unlocked(&c, 1); - size_t r = result.value; - if (result.has_error() || r != 1) - return '\0'; - - return c; -} - -LIBC_INLINE void ungetc(int c, void *f) { - reinterpret_cast(f)->ungetc_unlocked(c); -} -#else // defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE) - -// Since ungetc_unlocked isn't always available, we don't acquire the lock for -// system files. -LIBC_INLINE int getc(void *f) { return ::getc(reinterpret_cast<::FILE *>(f)); } - -LIBC_INLINE void ungetc(int c, void *f) { - ::ungetc(c, reinterpret_cast<::FILE *>(f)); -} -#endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE - -} // namespace reader_internal - -// This is intended to be either a raw string or a buffer syncronized with the -// file's internal buffer. -struct ReadBuffer { - const char *buffer; - size_t buff_len; - size_t buff_cur = 0; -}; - -class Reader { - ReadBuffer *rb; - void *input_stream = nullptr; +template class Reader { size_t cur_chars_read = 0; public: - // TODO: Set buff_len with a proper constant - LIBC_INLINE Reader(ReadBuffer *string_buffer) : rb(string_buffer) {} - - LIBC_INLINE Reader(void *stream, ReadBuffer *stream_buffer = nullptr) - : rb(stream_buffer), input_stream(stream) {} - // This returns the next character from the input and advances it by one // character. When it hits the end of the string or file it returns '\0' to // signal to stop parsing. LIBC_INLINE char getc() { ++cur_chars_read; - if (rb != nullptr) { - char output = rb->buffer[rb->buff_cur]; - ++(rb->buff_cur); - return output; - } - // This should reset the buffer if applicable. - return static_cast(reader_internal::getc(input_stream)); + return static_cast(this)->getc(); } // This moves the input back by one character, placing c into the buffer if // this is a file reader, else c is ignored. - LIBC_INLINE void ungetc(char c) { + LIBC_INLINE void ungetc(int c) { --cur_chars_read; - if (rb != nullptr && rb->buff_cur > 0) { - // While technically c should be written back to the buffer, in scanf we - // always write the character that was already there. Additionally, the - // buffer is most likely to contain a string that isn't part of a file, - // which may not be writable. - --(rb->buff_cur); - return; - } - reader_internal::ungetc(static_cast(c), input_stream); + static_cast(this)->ungetc(c); } LIBC_INLINE size_t chars_read() { return cur_chars_read; } diff --git a/libc/src/stdio/scanf_core/scanf_main.cpp b/libc/src/stdio/scanf_core/scanf_main.cpp deleted file mode 100644 index eb480943aeeda..0000000000000 --- a/libc/src/stdio/scanf_core/scanf_main.cpp +++ /dev/null @@ -1,46 +0,0 @@ -//===-- Starting point for scanf --------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/scanf_core/scanf_main.h" - -#include "src/__support/arg_list.h" -#include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/converter.h" -#include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/parser.h" -#include "src/stdio/scanf_core/reader.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace scanf_core { - -int scanf_main(Reader *reader, const char *__restrict str, - internal::ArgList &args) { - Parser parser(str, args); - int ret_val = READ_OK; - int conversions = 0; - for (FormatSection cur_section = parser.get_next_section(); - !cur_section.raw_string.empty() && ret_val == READ_OK; - cur_section = parser.get_next_section()) { - if (cur_section.has_conv) { - ret_val = convert(reader, cur_section); - // The %n (current position) conversion doesn't increment the number of - // assignments. - if (cur_section.conv_name != 'n') - conversions += ret_val == READ_OK ? 1 : 0; - } else { - ret_val = raw_match(reader, cur_section.raw_string); - } - } - - return conversions; -} - -} // namespace scanf_core -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/scanf_main.h b/libc/src/stdio/scanf_core/scanf_main.h index 27c246933dceb..f975d85c16f8a 100644 --- a/libc/src/stdio/scanf_core/scanf_main.h +++ b/libc/src/stdio/scanf_core/scanf_main.h @@ -11,6 +11,9 @@ #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/scanf_core/converter.h" +#include "src/stdio/scanf_core/core_structs.h" +#include "src/stdio/scanf_core/parser.h" #include "src/stdio/scanf_core/reader.h" #include @@ -18,8 +21,28 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -int scanf_main(Reader *reader, const char *__restrict str, - internal::ArgList &args); +template +int scanf_main(Reader *reader, const char *__restrict str, + internal::ArgList &args) { + Parser parser(str, args); + int ret_val = READ_OK; + int conversions = 0; + for (FormatSection cur_section = parser.get_next_section(); + !cur_section.raw_string.empty() && ret_val == READ_OK; + cur_section = parser.get_next_section()) { + if (cur_section.has_conv) { + ret_val = convert(reader, cur_section); + // The %n (current position) conversion doesn't increment the number of + // assignments. + if (cur_section.conv_name != 'n') + conversions += ret_val == READ_OK ? 1 : 0; + } else { + ret_val = raw_match(reader, cur_section.raw_string); + } + } + + return conversions; +} } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/string_converter.cpp b/libc/src/stdio/scanf_core/string_converter.cpp deleted file mode 100644 index 0de2eeed2f5e4..0000000000000 --- a/libc/src/stdio/scanf_core/string_converter.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===-- String type specifier converters for scanf --------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/scanf_core/string_converter.h" - -#include "src/__support/CPP/limits.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/reader.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace scanf_core { - -int convert_string(Reader *reader, const FormatSection &to_conv) { - // %s "Matches a sequence of non-white-space characters" - - // %c "Matches a sequence of characters of exactly the number specified by the - // field width (1 if no field width is present in the directive)" - - // %[ "Matches a nonempty sequence of characters from a set of expected - // characters (the scanset)." - size_t max_width = 0; - if (to_conv.max_width > 0) { - max_width = to_conv.max_width; - } else { - if (to_conv.conv_name == 'c') { - max_width = 1; - } else { - max_width = cpp::numeric_limits::max(); - } - } - - char *output = reinterpret_cast(to_conv.output_ptr); - - char cur_char = reader->getc(); - size_t i = 0; - for (; i < max_width && cur_char != '\0'; ++i) { - // If this is %s and we've hit a space, or if this is %[] and we've found - // something not in the scanset. - if ((to_conv.conv_name == 's' && internal::isspace(cur_char)) || - (to_conv.conv_name == '[' && !to_conv.scan_set.test(cur_char))) { - break; - } - // if the NO_WRITE flag is not set, write to the output. - if ((to_conv.flags & NO_WRITE) == 0) - output[i] = cur_char; - cur_char = reader->getc(); - } - - // We always read one more character than will be used, so we have to put the - // last one back. - reader->ungetc(cur_char); - - // If this is %s or %[] - if (to_conv.conv_name != 'c' && (to_conv.flags & NO_WRITE) == 0) { - // Always null terminate the string. This may cause a write to the - // (max_width + 1) byte, which is correct. The max width describes the max - // number of characters read from the input string, and doesn't necessarily - // correspond to the output. - output[i] = '\0'; - } - - if (i == 0) - return MATCHING_FAILURE; - return READ_OK; -} - -} // namespace scanf_core -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/string_converter.h b/libc/src/stdio/scanf_core/string_converter.h index 552dc22a502f5..3879f8c995899 100644 --- a/libc/src/stdio/scanf_core/string_converter.h +++ b/libc/src/stdio/scanf_core/string_converter.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_CONVERTER_H +#include "src/__support/CPP/limits.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/reader.h" @@ -18,7 +20,60 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -int convert_string(Reader *reader, const FormatSection &to_conv); +template +int convert_string(Reader *reader, const FormatSection &to_conv) { + // %s "Matches a sequence of non-white-space characters" + + // %c "Matches a sequence of characters of exactly the number specified by the + // field width (1 if no field width is present in the directive)" + + // %[ "Matches a nonempty sequence of characters from a set of expected + // characters (the scanset)." + size_t max_width = 0; + if (to_conv.max_width > 0) { + max_width = to_conv.max_width; + } else { + if (to_conv.conv_name == 'c') { + max_width = 1; + } else { + max_width = cpp::numeric_limits::max(); + } + } + + char *output = reinterpret_cast(to_conv.output_ptr); + + char cur_char = reader->getc(); + size_t i = 0; + for (; i < max_width && cur_char != '\0'; ++i) { + // If this is %s and we've hit a space, or if this is %[] and we've found + // something not in the scanset. + if ((to_conv.conv_name == 's' && internal::isspace(cur_char)) || + (to_conv.conv_name == '[' && !to_conv.scan_set.test(cur_char))) { + break; + } + // if the NO_WRITE flag is not set, write to the output. + if ((to_conv.flags & NO_WRITE) == 0) + output[i] = cur_char; + cur_char = reader->getc(); + } + + // We always read one more character than will be used, so we have to put the + // last one back. + reader->ungetc(cur_char); + + // If this is %s or %[] + if (to_conv.conv_name != 'c' && (to_conv.flags & NO_WRITE) == 0) { + // Always null terminate the string. This may cause a write to the + // (max_width + 1) byte, which is correct. The max width describes the max + // number of characters read from the input string, and doesn't necessarily + // correspond to the output. + output[i] = '\0'; + } + + if (i == 0) + return MATCHING_FAILURE; + return READ_OK; +} } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/scanf_core/string_reader.h b/libc/src/stdio/scanf_core/string_reader.h new file mode 100644 index 0000000000000..95ca22d956b7d --- /dev/null +++ b/libc/src/stdio/scanf_core/string_reader.h @@ -0,0 +1,49 @@ +//===-- Reader definition for scanf -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_READER_H +#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_READER_H + +#include "src/__support/macros/attributes.h" // For LIBC_INLINE +#include "src/__support/macros/config.h" +#include "src/stdio/scanf_core/reader.h" + +#include + +namespace LIBC_NAMESPACE_DECL { +namespace scanf_core { + +class StringReader : public Reader { + const char *buffer; + [[maybe_unused]] size_t buff_len; + size_t buff_cur = 0; + +public: + LIBC_INLINE StringReader(const char *buffer, size_t buff_len) + : buffer(buffer), buff_len(buff_len) {} + + LIBC_INLINE char getc() { + char output = buffer[buff_cur]; + ++buff_cur; + return output; + } + LIBC_INLINE void ungetc(int) { + if (buff_cur > 0) { + // While technically c should be written back to the buffer, in scanf we + // always write the character that was already there. Additionally, the + // buffer is most likely to contain a string that isn't part of a file, + // which may not be writable. + --buff_cur; + } + } +}; + +} // namespace scanf_core +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_READER_H diff --git a/libc/src/stdio/scanf_core/vfscanf_internal.h b/libc/src/stdio/scanf_core/vfscanf_internal.h index 4e20fa3b93091..1ff722b3bc8ac 100644 --- a/libc/src/stdio/scanf_core/vfscanf_internal.h +++ b/libc/src/stdio/scanf_core/vfscanf_internal.h @@ -38,6 +38,10 @@ LIBC_INLINE void funlockfile(::FILE *) { return; } LIBC_INLINE int ferror_unlocked(::FILE *f) { return LIBC_NAMESPACE::ferror(f); } +LIBC_INLINE int getc(::FILE *f) { return LIBC_NAMESPACE::getc(f); } + +LIBC_INLINE void ungetc(int c, ::FILE *f) { LIBC_NAMESPACE::ungetc(c, f); } + #elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE) LIBC_INLINE void flockfile(FILE *f) { @@ -52,6 +56,21 @@ LIBC_INLINE int ferror_unlocked(FILE *f) { return reinterpret_cast(f)->error_unlocked(); } +LIBC_INLINE int getc(FILE *f) { + unsigned char c; + auto result = + reinterpret_cast(f)->read_unlocked(&c, 1); + size_t r = result.value; + if (result.has_error() || r != 1) + return '\0'; + + return c; +} + +LIBC_INLINE void ungetc(int c, FILE *f) { + reinterpret_cast(f)->ungetc_unlocked(c); +} + #else // defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE) // Since ungetc_unlocked isn't always available, we don't acquire the lock for @@ -62,17 +81,35 @@ LIBC_INLINE void funlockfile(::FILE *) { return; } LIBC_INLINE int ferror_unlocked(::FILE *f) { return ::ferror(f); } +LIBC_INLINE int getc(::FILE *f) { return ::getc(f); } + +LIBC_INLINE void ungetc(int c, ::FILE *f) { ::ungetc(c, f); } + #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal namespace scanf_core { +class StreamReader : public Reader { + ::FILE *stream; + +public: + LIBC_INLINE StreamReader(::FILE *stream) : stream(stream) {} + + LIBC_INLINE char getc() { + return static_cast(internal::getc(static_cast(stream))); + } + LIBC_INLINE void ungetc(int c) { + internal::ungetc(c, static_cast(stream)); + } +}; + LIBC_INLINE int vfscanf_internal(::FILE *__restrict stream, const char *__restrict format, internal::ArgList &args) { internal::flockfile(stream); - scanf_core::Reader reader(stream); + scanf_core::StreamReader reader(stream); int retval = scanf_core::scanf_main(&reader, format, args); if (retval == 0 && internal::ferror_unlocked(stream)) retval = EOF; diff --git a/libc/src/stdio/sscanf.cpp b/libc/src/stdio/sscanf.cpp index 82de8a29f6ad1..9fa2ede461595 100644 --- a/libc/src/stdio/sscanf.cpp +++ b/libc/src/stdio/sscanf.cpp @@ -11,8 +11,8 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/scanf_core/reader.h" #include "src/stdio/scanf_core/scanf_main.h" +#include "src/stdio/scanf_core/string_reader.h" #include "hdr/stdio_macros.h" #include "hdr/types/FILE.h" @@ -29,8 +29,7 @@ LLVM_LIBC_FUNCTION(int, sscanf, // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - scanf_core::ReadBuffer rb{buffer, cpp::numeric_limits::max()}; - scanf_core::Reader reader(&rb); + scanf_core::StringReader reader(buffer, cpp::numeric_limits::max()); int ret_val = scanf_core::scanf_main(&reader, format, args); // This is done to avoid including stdio.h in the internals. On most systems // EOF is -1, so this will be transformed into just "return ret_val". diff --git a/libc/src/stdio/vsscanf.cpp b/libc/src/stdio/vsscanf.cpp index f3f56bce64292..7c7240a102b5a 100644 --- a/libc/src/stdio/vsscanf.cpp +++ b/libc/src/stdio/vsscanf.cpp @@ -11,8 +11,8 @@ #include "hdr/stdio_macros.h" #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/stdio/scanf_core/reader.h" #include "src/stdio/scanf_core/scanf_main.h" +#include "src/stdio/scanf_core/string_reader.h" #include @@ -21,9 +21,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, vsscanf, (const char *buffer, const char *format, va_list vlist)) { internal::ArgList args(vlist); - scanf_core::ReadBuffer rb{const_cast(buffer), - cpp::numeric_limits::max()}; - scanf_core::Reader reader(&rb); + scanf_core::StringReader reader(buffer, cpp::numeric_limits::max()); int ret_val = scanf_core::scanf_main(&reader, format, args); // This is done to avoid including stdio.h in the internals. On most systems // EOF is -1, so this will be transformed into just "return ret_val". diff --git a/libc/test/src/stdio/scanf_core/CMakeLists.txt b/libc/test/src/stdio/scanf_core/CMakeLists.txt index 058f665e42930..64ff7d324c6fd 100644 --- a/libc/test/src/stdio/scanf_core/CMakeLists.txt +++ b/libc/test/src/stdio/scanf_core/CMakeLists.txt @@ -32,7 +32,7 @@ add_libc_unittest( SRCS reader_test.cpp DEPENDS - libc.src.stdio.scanf_core.reader + libc.src.stdio.scanf_core.string_reader libc.src.__support.CPP.string_view COMPILE_OPTIONS ${use_system_file} @@ -45,8 +45,8 @@ add_libc_unittest( SRCS converter_test.cpp DEPENDS - libc.src.stdio.scanf_core.reader libc.src.stdio.scanf_core.converter + libc.src.stdio.scanf_core.string_reader libc.src.__support.CPP.string_view COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/scanf_core/converter_test.cpp b/libc/test/src/stdio/scanf_core/converter_test.cpp index d1aecd4c6ba06..ff0ce9200e51e 100644 --- a/libc/test/src/stdio/scanf_core/converter_test.cpp +++ b/libc/test/src/stdio/scanf_core/converter_test.cpp @@ -9,14 +9,13 @@ #include "src/__support/CPP/string_view.h" #include "src/stdio/scanf_core/converter.h" #include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/reader.h" +#include "src/stdio/scanf_core/string_reader.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcScanfConverterTest, RawMatchBasic) { const char *str = "abcdef"; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); // Reading "abc" should succeed. ASSERT_EQ(LIBC_NAMESPACE::scanf_core::raw_match(&reader, "abc"), @@ -51,8 +50,7 @@ TEST(LlvmLibcScanfConverterTest, RawMatchBasic) { TEST(LlvmLibcScanfConverterTest, RawMatchSpaces) { const char *str = " a \t\n b cd"; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); // Reading "a" should fail and not advance. // Since there's nothing in the format string (the second argument to @@ -98,8 +96,7 @@ TEST(LlvmLibcScanfConverterTest, RawMatchSpaces) { TEST(LlvmLibcScanfConverterTest, StringConvSimple) { const char *str = "abcDEF123 654LKJihg"; char result[20]; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); LIBC_NAMESPACE::scanf_core::FormatSection conv; conv.has_conv = true; @@ -120,8 +117,7 @@ TEST(LlvmLibcScanfConverterTest, StringConvSimple) { TEST(LlvmLibcScanfConverterTest, StringConvNoWrite) { const char *str = "abcDEF123 654LKJihg"; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); LIBC_NAMESPACE::scanf_core::FormatSection conv; conv.has_conv = true; @@ -141,8 +137,7 @@ TEST(LlvmLibcScanfConverterTest, StringConvNoWrite) { TEST(LlvmLibcScanfConverterTest, StringConvWidth) { const char *str = "abcDEF123 654LKJihg"; char result[6]; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); LIBC_NAMESPACE::scanf_core::FormatSection conv; conv.has_conv = true; @@ -175,8 +170,7 @@ TEST(LlvmLibcScanfConverterTest, StringConvWidth) { TEST(LlvmLibcScanfConverterTest, CharsConv) { const char *str = "abcDEF123 654LKJihg MNOpqr&*("; char result[20]; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); LIBC_NAMESPACE::scanf_core::FormatSection conv; conv.has_conv = true; @@ -230,8 +224,7 @@ TEST(LlvmLibcScanfConverterTest, CharsConv) { TEST(LlvmLibcScanfConverterTest, ScansetConv) { const char *str = "abcDEF[123] 654LKJihg"; char result[20]; - LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)}; - LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader); + LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str)); LIBC_NAMESPACE::scanf_core::FormatSection conv; conv.has_conv = true; diff --git a/libc/test/src/stdio/scanf_core/reader_test.cpp b/libc/test/src/stdio/scanf_core/reader_test.cpp index 43a14184c7650..4cafc81251f0b 100644 --- a/libc/test/src/stdio/scanf_core/reader_test.cpp +++ b/libc/test/src/stdio/scanf_core/reader_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/stdio/scanf_core/reader.h" +#include "src/stdio/scanf_core/string_reader.h" #include "test/UnitTest/Test.h" @@ -15,14 +15,14 @@ TEST(LlvmLibcScanfStringReaderTest, Constructor) { char str[10]; // buff_len justneeds to be a big number. The specific value isn't important // in the real world. - LIBC_NAMESPACE::scanf_core::ReadBuffer rb{const_cast(str), 1000000}; - LIBC_NAMESPACE::scanf_core::Reader reader(&rb); + LIBC_NAMESPACE::scanf_core::StringReader reader(const_cast(str), + 1000000); } TEST(LlvmLibcScanfStringReaderTest, SimpleRead) { const char *str = "abc"; - LIBC_NAMESPACE::scanf_core::ReadBuffer rb{const_cast(str), 1000000}; - LIBC_NAMESPACE::scanf_core::Reader reader(&rb); + LIBC_NAMESPACE::scanf_core::StringReader reader(const_cast(str), + 1000000); for (size_t i = 0; i < sizeof("abc"); ++i) { ASSERT_EQ(str[i], reader.getc()); @@ -31,8 +31,8 @@ TEST(LlvmLibcScanfStringReaderTest, SimpleRead) { TEST(LlvmLibcScanfStringReaderTest, ReadAndReverse) { const char *str = "abcDEF123"; - LIBC_NAMESPACE::scanf_core::ReadBuffer rb{const_cast(str), 1000000}; - LIBC_NAMESPACE::scanf_core::Reader reader(&rb); + LIBC_NAMESPACE::scanf_core::StringReader reader(const_cast(str), + 1000000); for (size_t i = 0; i < 5; ++i) { ASSERT_EQ(str[i], reader.getc());