diff --git a/.github/workflows/CI-unixish.yml b/.github/workflows/CI-unixish.yml index b4089ee1..bef957e6 100644 --- a/.github/workflows/CI-unixish.yml +++ b/.github/workflows/CI-unixish.yml @@ -100,7 +100,7 @@ jobs: if: matrix.os == 'ubuntu-24.04' run: | make clean - make -j$(nproc) CXXOPTS="-O1" + make -j$(nproc) CXXOPTS="-O1 -fno-omit-frame-pointer -fno-optimize-sibling-calls" valgrind --leak-check=full --num-callers=50 --show-reachable=yes --track-origins=yes --gen-suppressions=all --error-exitcode=42 ./testrunner # TODO: run Python tests with valgrind VALGRIND_TOOL=memcheck ./selfcheck.sh diff --git a/simplecpp.cpp b/simplecpp.cpp index d5b51080..dcedfa1c 100644 --- a/simplecpp.cpp +++ b/simplecpp.cpp @@ -58,6 +58,12 @@ # include #endif +#ifdef __GNUC__ +# define unlikely(x) __builtin_expect(!!(x), 0) +#else +# define unlikely(x) (x) +#endif + static bool isHex(const std::string &s) { return s.size()>2 && (s.compare(0,2,"0x")==0 || s.compare(0,2,"0X")==0); @@ -253,12 +259,12 @@ class simplecpp::TokenList::Stream { virtual bool good() = 0; unsigned char readChar() { - unsigned char ch = static_cast(get()); + unsigned char ch = static_cast(get()); // TODO: check EOF? // For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the // character is non-ASCII character then replace it with 0xff if (isUtf16) { - const unsigned char ch2 = static_cast(get()); + const unsigned char ch2 = static_cast(get()); // TODO: check EOF? const int ch16 = makeUtf16Char(ch, ch2); ch = static_cast(((ch16 >= 0x80) ? 0xff : ch16)); } @@ -281,13 +287,13 @@ class simplecpp::TokenList::Stream { } unsigned char peekChar() { - unsigned char ch = static_cast(peek()); + unsigned char ch = static_cast(peek()); // TODO: check EOF? // For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the // character is non-ASCII character then replace it with 0xff if (isUtf16) { (void)get(); - const unsigned char ch2 = static_cast(peek()); + const unsigned char ch2 = static_cast(peek()); // TODO: check EOF? unget(); const int ch16 = makeUtf16Char(ch, ch2); ch = static_cast(((ch16 >= 0x80) ? 0xff : ch16)); @@ -467,6 +473,74 @@ class FileStream : public simplecpp::TokenList::Stream { int lastStatus{}; }; +class FileStreamBuffered : public simplecpp::TokenList::Stream { +public: + FileStreamBuffered(const std::string &filename, std::vector &files) + : file(fopen(filename.c_str(), "rb")) + { + if (!file) { + files.push_back(filename); + throw simplecpp::Output(files, simplecpp::Output::FILE_NOT_FOUND, "File is missing: " + filename); + } + init(); + } + + ~FileStreamBuffered() override { + fclose(file); + file = nullptr; + } + + int get() override { + read_internal(); + return buf[buf_idx++]; + } + int peek() override { + read_internal(); + return buf[buf_idx]; + } + void unget() override { + --buf_idx; + } + bool good() override { + return lastStatus != EOF; + } + +private: + void read_internal() { + // check if we are in the last chunk + if (unlikely(buf_idx >= buf_len)) { + if (buf_len != sizeof(buf)) { + lastStatus = EOF; + return; + } + } + + if (unlikely(buf_idx == -1 || buf_idx == buf_len)) + { + buf_idx = 0; + buf_len = fread(buf, 1, sizeof(buf), file); + if (buf_len == 0) { + lastStatus = EOF; + } + else if (buf_len != sizeof(buf)) { + if (ferror(file)) { + // TODO: is this correct? + lastStatus = EOF; + } + } + } + } + + FileStreamBuffered(const FileStreamBuffered&); + FileStreamBuffered &operator=(const FileStreamBuffered&); + + FILE *file; + int lastStatus{}; + unsigned char buf[8192]; + int buf_len{}; + int buf_idx{-1}; +}; + simplecpp::TokenList::TokenList(std::vector &filenames) : frontToken(nullptr), backToken(nullptr), files(filenames) {} simplecpp::TokenList::TokenList(std::istream &istr, std::vector &filenames, const std::string &filename, OutputList *outputList) @@ -487,7 +561,7 @@ simplecpp::TokenList::TokenList(const std::string &filename, std::vectorpush_back(e); diff --git a/test.cpp b/test.cpp index 26e3b95b..0261582f 100644 --- a/test.cpp +++ b/test.cpp @@ -2504,7 +2504,7 @@ static void readfile_nullbyte() const char code[] = "ab\0cd"; simplecpp::OutputList outputList; ASSERT_EQUALS("ab cd", readfile(code,sizeof(code), &outputList)); - ASSERT_EQUALS(true, outputList.empty()); // should warning be written? + ASSERT_EQUALS(true, outputList.empty()); // TODO: should warning be written? } static void readfile_char() @@ -2654,6 +2654,41 @@ static void readfile_file_not_found() ASSERT_EQUALS("file0,1,file_not_found,File is missing: NotAFile\n", toString(outputList)); } +static void readfile_empty() +{ + const char code[] = ""; + simplecpp::OutputList outputList; + ASSERT_EQUALS("", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS(true, outputList.empty()); +} + +// the BOM/UTF-16 detection reads two bytes +static void readfile_onebyte() +{ + const char code[] = "."; + simplecpp::OutputList outputList; + ASSERT_EQUALS(".", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS(true, outputList.empty()); +} + +static void readfile_utf16_unsupported() +{ + const char code[] = "\xfe\xff\xd8\x3d\xde\x42"; // smiley emoji + simplecpp::OutputList outputList; + ASSERT_EQUALS("", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS("file0,1,unhandled_char_error,The code contains unhandled character(s) (character code=255). Neither unicode nor extended ascii is supported.\n", toString(outputList)); +} + +static void readfile_utf16_incomplete() +{ + const char code[] = "\xfe\xff\x00\x31\x00\x32\x00"; // the last UTF16 char is incomplete + simplecpp::OutputList outputList; + ASSERT_EQUALS("12", readfile(code,sizeof(code), &outputList)); + ASSERT_EQUALS(true, outputList.empty()); +} + +// TODO: test with incomplete BOMs + static void stringify1() { const char code_c[] = "#include \"A.h\"\n" @@ -3532,6 +3567,10 @@ int main(int argc, char **argv) TEST_CASE(readfile_unhandled_chars); TEST_CASE(readfile_error); TEST_CASE(readfile_file_not_found); + TEST_CASE(readfile_empty); + TEST_CASE(readfile_onebyte); + TEST_CASE(readfile_utf16_unsupported); + TEST_CASE(readfile_utf16_incomplete); TEST_CASE(stringify1);