From ac2c33f8311975c7e11acb8f1805b6ad8d464f9e Mon Sep 17 00:00:00 2001 From: firewave Date: Tue, 12 Sep 2023 15:25:31 +0200 Subject: [PATCH 1/4] added `FileStreamBuffered` to reduce the amount of `fgetc()` calls --- simplecpp.cpp | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/simplecpp.cpp b/simplecpp.cpp index d5b51080..2e23591b 100644 --- a/simplecpp.cpp +++ b/simplecpp.cpp @@ -467,6 +467,74 @@ class FileStream : public simplecpp::TokenList::Stream { int lastStatus{}; }; +class FileStreamBuffered : public simplecpp::TokenList::Stream { +public: + FileStreamBuffered(const std::string &filename, std::vector &files) + : file(fopen(filename.c_str(), "rb")) + { + if (!file) { + files.push_back(filename); + throw simplecpp::Output(files, simplecpp::Output::FILE_NOT_FOUND, "File is missing: " + filename); + } + init(); + } + + ~FileStreamBuffered() override { + fclose(file); + file = nullptr; + } + + int get() override { + read_internal(); + return buf[buf_idx++]; + } + int peek() override { + read_internal(); + return buf[buf_idx]; + } + void unget() override { + --buf_idx; + } + bool good() override { + return lastStatus != EOF; + } + +private: + void read_internal() { + // check if we are in the last chunk + if (buf_idx >= buf_len) { + if (buf_len != sizeof(buf)) { + lastStatus = EOF; + return; + } + } + + if (buf_idx == -1 || buf_idx == buf_len) + { + buf_idx = 0; + buf_len = fread(buf, 1, sizeof(buf), file); + if (buf_len == 0) { + lastStatus = EOF; + } + else if (buf_len != sizeof(buf)) { + if (ferror(file)) { + // TODO: is this correct? + lastStatus = EOF; + } + } + } + } + + FileStreamBuffered(const FileStreamBuffered&); + FileStreamBuffered &operator=(const FileStreamBuffered&); + + FILE *file; + int lastStatus{}; + unsigned char buf[8192]; + int buf_len{}; + int buf_idx{-1}; +}; + simplecpp::TokenList::TokenList(std::vector &filenames) : frontToken(nullptr), backToken(nullptr), files(filenames) {} simplecpp::TokenList::TokenList(std::istream &istr, std::vector &filenames, const std::string &filename, OutputList *outputList) @@ -487,7 +555,7 @@ simplecpp::TokenList::TokenList(const std::string &filename, std::vectorpush_back(e); From 9fd2842efc85363c474e425b5e2dfa6e569a036b Mon Sep 17 00:00:00 2001 From: firewave Date: Sat, 6 Sep 2025 00:56:21 +0200 Subject: [PATCH 2/4] unlikely --- simplecpp.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/simplecpp.cpp b/simplecpp.cpp index 2e23591b..2c95e598 100644 --- a/simplecpp.cpp +++ b/simplecpp.cpp @@ -58,6 +58,12 @@ # include #endif +#ifdef __GNUC__ +# define unlikely(x) __builtin_expect(!!(x), 0) +#else +# define unlikely(x) (x) +#endif + static bool isHex(const std::string &s) { return s.size()>2 && (s.compare(0,2,"0x")==0 || s.compare(0,2,"0X")==0); @@ -502,14 +508,14 @@ class FileStreamBuffered : public simplecpp::TokenList::Stream { private: void read_internal() { // check if we are in the last chunk - if (buf_idx >= buf_len) { + if (unlikely(buf_idx >= buf_len)) { if (buf_len != sizeof(buf)) { lastStatus = EOF; return; } } - if (buf_idx == -1 || buf_idx == buf_len) + if (unlikely(buf_idx == -1 || buf_idx == buf_len)) { buf_idx = 0; buf_len = fread(buf, 1, sizeof(buf), file); From b1a8c291808c752036b6634cfd42ae57d65c17db Mon Sep 17 00:00:00 2001 From: firewave Date: Sat, 11 Oct 2025 20:09:00 +0200 Subject: [PATCH 3/4] dbg --- .github/workflows/CI-unixish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI-unixish.yml b/.github/workflows/CI-unixish.yml index b4089ee1..bef957e6 100644 --- a/.github/workflows/CI-unixish.yml +++ b/.github/workflows/CI-unixish.yml @@ -100,7 +100,7 @@ jobs: if: matrix.os == 'ubuntu-24.04' run: | make clean - make -j$(nproc) CXXOPTS="-O1" + make -j$(nproc) CXXOPTS="-O1 -fno-omit-frame-pointer -fno-optimize-sibling-calls" valgrind --leak-check=full --num-callers=50 --show-reachable=yes --track-origins=yes --gen-suppressions=all --error-exitcode=42 ./testrunner # TODO: run Python tests with valgrind VALGRIND_TOOL=memcheck ./selfcheck.sh From 563457c54699270aa28a3983b6e96a3276022581 Mon Sep 17 00:00:00 2001 From: firewave Date: Sat, 11 Oct 2025 20:30:19 +0200 Subject: [PATCH 4/4] test --- simplecpp.cpp | 8 ++++---- test.cpp | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/simplecpp.cpp b/simplecpp.cpp index 2c95e598..dcedfa1c 100644 --- a/simplecpp.cpp +++ b/simplecpp.cpp @@ -259,12 +259,12 @@ class simplecpp::TokenList::Stream { virtual bool good() = 0; unsigned char readChar() { - unsigned char ch = static_cast(get()); + unsigned char ch = static_cast(get()); // TODO: check EOF? // For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the // character is non-ASCII character then replace it with 0xff if (isUtf16) { - const unsigned char ch2 = static_cast(get()); + const unsigned char ch2 = static_cast(get()); // TODO: check EOF? const int ch16 = makeUtf16Char(ch, ch2); ch = static_cast(((ch16 >= 0x80) ? 0xff : ch16)); } @@ -287,13 +287,13 @@ class simplecpp::TokenList::Stream { } unsigned char peekChar() { - unsigned char ch = static_cast(peek()); + unsigned char ch = static_cast(peek()); // TODO: check EOF? // For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the // character is non-ASCII character then replace it with 0xff if (isUtf16) { (void)get(); - const unsigned char ch2 = static_cast(peek()); + const unsigned char ch2 = static_cast(peek()); // TODO: check EOF? unget(); const int ch16 = makeUtf16Char(ch, ch2); ch = static_cast(((ch16 >= 0x80) ? 0xff : ch16)); diff --git a/test.cpp b/test.cpp index 26e3b95b..0261582f 100644 --- a/test.cpp +++ b/test.cpp @@ -2504,7 +2504,7 @@ static void readfile_nullbyte() const char code[] = "ab\0cd"; simplecpp::OutputList outputList; ASSERT_EQUALS("ab cd", readfile(code,sizeof(code), &outputList)); - ASSERT_EQUALS(true, outputList.empty()); // should warning be written? + ASSERT_EQUALS(true, outputList.empty()); // TODO: should warning be written? } static void readfile_char() @@ -2654,6 +2654,41 @@ static void readfile_file_not_found() ASSERT_EQUALS("file0,1,file_not_found,File is missing: NotAFile\n", toString(outputList)); } +static void readfile_empty() +{ + const char code[] = ""; + simplecpp::OutputList outputList; + ASSERT_EQUALS("", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS(true, outputList.empty()); +} + +// the BOM/UTF-16 detection reads two bytes +static void readfile_onebyte() +{ + const char code[] = "."; + simplecpp::OutputList outputList; + ASSERT_EQUALS(".", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS(true, outputList.empty()); +} + +static void readfile_utf16_unsupported() +{ + const char code[] = "\xfe\xff\xd8\x3d\xde\x42"; // smiley emoji + simplecpp::OutputList outputList; + ASSERT_EQUALS("", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS("file0,1,unhandled_char_error,The code contains unhandled character(s) (character code=255). Neither unicode nor extended ascii is supported.\n", toString(outputList)); +} + +static void readfile_utf16_incomplete() +{ + const char code[] = "\xfe\xff\x00\x31\x00\x32\x00"; // the last UTF16 char is incomplete + simplecpp::OutputList outputList; + ASSERT_EQUALS("12", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS(true, outputList.empty()); +} + +// TODO: test with incomplete BOMs + static void stringify1() { const char code_c[] = "#include \"A.h\"\n" @@ -3532,6 +3567,10 @@ int main(int argc, char **argv) TEST_CASE(readfile_unhandled_chars); TEST_CASE(readfile_error); TEST_CASE(readfile_file_not_found); + TEST_CASE(readfile_empty); + TEST_CASE(readfile_onebyte); + TEST_CASE(readfile_utf16_unsupported); + TEST_CASE(readfile_utf16_incomplete); TEST_CASE(stringify1);