diff --git a/include/kf/TextDetector.h b/include/kf/TextDetector.h
index d278187..3e16a74 100644
--- a/include/kf/TextDetector.h
+++ b/include/kf/TextDetector.h
@@ -9,6 +9,10 @@ namespace kf
 {
     using namespace std;
 
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    // TextDetector class provides a utility to determine whether a given buffer contains textual data.
+    // It filters out control characters (except for \t, \n, \r) to verify that the content represents valid text.
+    // Supported encodings include ANSI, UTF-8, UTF-16 (LE/BE), and UTF-32 (LE/BE).
     class TextDetector
     {
     public:
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fdd5175..124c3d3 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -44,6 +44,7 @@ wdk_add_driver(kf-test WINVER NTDDI_WIN10 STL
     HexTest.cpp
     MapTest.cpp
     Vector.cpp
+    TextDetectorTest.cpp
     ScopeExitTest.cpp
     SingletonTest.cpp
     DoubleLinkedListTest.cpp
diff --git a/test/TextDetectorTest.cpp b/test/TextDetectorTest.cpp
new file mode 100644
index 0000000..4c53616
--- /dev/null
+++ b/test/TextDetectorTest.cpp
@@ -0,0 +1,226 @@
+#include "pch.h"
+#include <kf/TextDetector.h>
+
+// Exercises kf::TextDetector::isText() on small byte buffers: UTF-8 and UTF-16 (LE/BE) with and
+// without a BOM, and UTF-32 (LE/BE) with a BOM. Every "valid" buffer spells "Test"; every
+// "invalid" buffer contains the control character 0x01 and is required to be rejected.
+SCENARIO("TextDetector::isText")
+{
+    // --- Buffers that start with a byte order mark ---
+    GIVEN("Valid UTF-8 text with BOM")
+    {
+        constexpr uint8_t kData[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-8 text with BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = { 0xEF, 0xBB, 0xBF, 0x01, 'a', 'b' };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("Valid UTF-16LE text with BOM")
+    {
+        constexpr uint8_t kData[] = {
+            0xFF, 0xFE,
+            'T', 0x00,
+            'e', 0x00,
+            's', 0x00,
+            't', 0x00
+        };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-16LE text with BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = {
+            0xFF, 0xFE,
+            0x01, 0x00, // invalid char
+            'a', 0x00
+        };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-16BE text with BOM")
+    {
+        constexpr uint8_t kData[] = {
+            0xFE, 0xFF,
+            0x00, 'T',
+            0x00, 'e',
+            0x00, 's',
+            0x00, 't'
+        };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-16BE text with BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = {
+            0xFE, 0xFF,
+            0x00, 0x01, // invalid char
+            0x00, 'a'
+        };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("Valid UTF-32LE text with BOM")
+    {
+        constexpr uint8_t kData[] = {
+            0xFF, 0xFE, 0x00, 0x00,
+            'T', 0x00, 0x00, 0x00,
+            'e', 0x00, 0x00, 0x00,
+            's', 0x00, 0x00, 0x00,
+            't', 0x00, 0x00, 0x00
+        };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-32LE text with BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = {
+            0xFF, 0xFE, 0x00, 0x00,
+            0x01, 0x00, 0x00, 0x00, // invalid char
+            'a', 0x00, 0x00, 0x00
+        };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("Valid UTF-32BE text with BOM")
+    {
+        constexpr uint8_t kData[] = {
+            0x00, 0x00, 0xFE, 0xFF,
+            0x00, 0x00, 0x00, 'T',
+            0x00, 0x00, 0x00, 'e',
+            0x00, 0x00, 0x00, 's',
+            0x00, 0x00, 0x00, 't'
+        };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-32BE text with BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = {
+            0x00, 0x00, 0xFE, 0xFF,
+            0x00, 0x00, 0x00, 0x01, // invalid char
+            0x00, 0x00, 0x00, 'a'
+        };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    // --- Buffers without a byte order mark ---
+    GIVEN("Valid UTF-16BE text without BOM")
+    {
+        constexpr uint8_t kData[] = {
+            0x00, 'T',
+            0x00, 'e',
+            0x00, 's',
+            0x00, 't'
+        };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-16BE text without BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = {
+            0x00, 0x01, // invalid char
+            0x00, 'a'
+        };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("Valid UTF-16LE text without BOM")
+    {
+        constexpr uint8_t kData[] = {
+            'T', 0x00,
+            'e', 0x00,
+            's', 0x00,
+            't', 0x00
+        };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-16LE text without BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = {
+            0x01, 0x00, // invalid char
+            'a', 0x00
+        };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("Valid UTF-8 text without BOM")
+    {
+        constexpr uint8_t kData[] = { 'T', 'e', 's', 't' };
+
+        THEN("Text is detected")
+        {
+            REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+
+    GIVEN("UTF-8 text without BOM and invalid char")
+    {
+        constexpr uint8_t kData[] = { 0x01, 'a', 'b' };
+
+        THEN("Not a text is detected")
+        {
+            REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData })));
+        }
+    }
+}