From ccb76c8e35a1a44992c28a6fd1ab9535e9413d5e Mon Sep 17 00:00:00 2001 From: Vlada Kanivets Date: Mon, 11 Aug 2025 14:16:24 +0200 Subject: [PATCH 1/3] add tests for TextDetector --- include/kf/TextDetector.h | 4 ++ test/CMakeLists.txt | 1 + test/TextDetectorTest.cpp | 148 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 test/TextDetectorTest.cpp diff --git a/include/kf/TextDetector.h b/include/kf/TextDetector.h index d278187..3e16a74 100644 --- a/include/kf/TextDetector.h +++ b/include/kf/TextDetector.h @@ -9,6 +9,10 @@ namespace kf { using namespace std; + /////////////////////////////////////////////////////////////////////////////////////////////////// + // TextDetector class provides a utility to determine whether a given buffer contains textual data. + // It filters out control characters (except for \t, \n, \r) to verify that the content represents valid text. + // Supported encodings include ANSI, UTF-8, UTF-16 (LE/BE), and UTF-32 (LE/BE). class TextDetector { public: diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0c2103b..43bfadf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,6 +44,7 @@ wdk_add_driver(kf-test WINVER NTDDI_WIN10 STL HexTest.cpp MapTest.cpp Vector.cpp + TextDetectorTest.cpp ) target_link_libraries(kf-test kf::kf kmtest::kmtest) diff --git a/test/TextDetectorTest.cpp b/test/TextDetectorTest.cpp new file mode 100644 index 0000000..f358466 --- /dev/null +++ b/test/TextDetectorTest.cpp @@ -0,0 +1,148 @@ +#include "pch.h" +#include + +SCENARIO("TextDetector::isText") +{ + GIVEN("Valid UTF-8 text with BOM") + { + std::byte data[] = { + std::byte(0xEF), std::byte(0xBB), std::byte(0xBF), + std::byte('T'), std::byte('e'), std::byte('s'), std::byte('t') + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(data)); + } + }; + + GIVEN("UTF-8 text with BOM and invalid char") + { + std::byte data[] = { + std::byte(0xEF), std::byte(0xBB), std::byte(0xBF), + std::byte(0x01), // invalid char + std::byte('a'), std::byte('b') + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(data)); + } + }; + + GIVEN("Valid UTF-16LE text with BOM") + { + std::byte data[] = { + std::byte(0xFF), std::byte(0xFE), + std::byte('T'), std::byte(0x00), + std::byte('e'), std::byte(0x00), + std::byte('s'), std::byte(0x00), + std::byte('t'), std::byte(0x00) + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(data)); + } + } + + GIVEN("UTF-16LE text with BOM and invalid char") + { + std::byte data[] = { + std::byte(0xFF), std::byte(0xFE), + std::byte(0x01), std::byte(0x00), // invalid char + std::byte('a'), std::byte(0x00) + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(data)); + } + }; + + GIVEN("UTF-16BE text with BOM") + { + std::byte data[] = { + std::byte(0xFE), std::byte(0xFF), + std::byte(0x00), std::byte('T'), + std::byte(0x00), std::byte('e'), + std::byte(0x00), std::byte('s'), + std::byte(0x00), std::byte('t') + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(data)); + } + }; + + GIVEN("UTF-16BE text with BOM and invalid char") + { + std::byte data[] = { + std::byte(0xFE), std::byte(0xFF), + std::byte(0x00), std::byte(0x01), // invalid char + std::byte(0x00), std::byte('a') + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(data)); + } + }; + + GIVEN("Valid UTF-32LE text with BOM") + { + std::byte data[] = { + std::byte(0xFF), std::byte(0xFE), std::byte(0x00), std::byte(0x00), + std::byte('T'), std::byte(0x00), std::byte(0x00), std::byte(0x00), + std::byte('e'), std::byte(0x00), std::byte(0x00), std::byte(0x00) + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(data)); + } + } + + GIVEN("UTF-32LE text with BOM and invalid char") + { + std::byte data[] = { + std::byte(0xFF), std::byte(0xFE), std::byte(0x00), std::byte(0x00), + std::byte(0x01), std::byte(0x00), std::byte(0x00), std::byte(0x00), // invalid char + std::byte('a'), std::byte(0x00), std::byte(0x00), std::byte(0x00) + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(data)); + } + } + + GIVEN("Valid UTF-32BE text with BOM") + { + std::byte data[] = { + std::byte(0x00), std::byte(0x00), std::byte(0xFE), std::byte(0xFF), + std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte('T'), + std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte('e') + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(data)); + } + } + + GIVEN("UTF-32BE text with BOM and invalid char") + { + std::byte data[] = { + std::byte(0x00), std::byte(0x00), std::byte(0xFE), std::byte(0xFF), + std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte(0x01), // invalid char + std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte('a') + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(data)); + } + } +} From 21e8b3cb237577a0d59de35bb4d2c1898ec12314 Mon Sep 17 00:00:00 2001 From: Vlada Kanivets Date: Tue, 12 Aug 2025 22:33:11 +0200 Subject: [PATCH 2/3] refactoring --- test/TextDetectorTest.cpp | 107 ++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 55 deletions(-) diff --git a/test/TextDetectorTest.cpp b/test/TextDetectorTest.cpp index f358466..7252d44 100644 --- a/test/TextDetectorTest.cpp +++ b/test/TextDetectorTest.cpp @@ -5,144 +5,141 @@ SCENARIO("TextDetector::isText") { GIVEN("Valid UTF-8 text with BOM") { - std::byte data[] = { - std::byte(0xEF), std::byte(0xBB), std::byte(0xBF), - std::byte('T'), std::byte('e'), std::byte('s'), std::byte('t') - }; + constexpr uint8_t kData[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' }; THEN("Text is detected") { - REQUIRE(kf::TextDetector::isText(data)); + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } }; GIVEN("UTF-8 text with BOM and invalid char") { - std::byte data[] = { - std::byte(0xEF), std::byte(0xBB), std::byte(0xBF), - std::byte(0x01), // invalid char - std::byte('a'), std::byte('b') - }; + constexpr uint8_t kData[] = { 0xEF, 0xBB, 0xBF, 0x01, 'a', 'b' }; THEN("Not a text is detected") { - REQUIRE(!kf::TextDetector::isText(data)); + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } }; GIVEN("Valid UTF-16LE text with BOM") { - std::byte data[] = { - std::byte(0xFF), std::byte(0xFE), - std::byte('T'), std::byte(0x00), - std::byte('e'), std::byte(0x00), - std::byte('s'), std::byte(0x00), - std::byte('t'), std::byte(0x00) + constexpr uint8_t kData[] = { + 0xFF, 0xFE, + 'T', 0x00, + 'e', 0x00, + 's', 0x00, + 't', 0x00 }; THEN("Text is detected") { - REQUIRE(kf::TextDetector::isText(data)); + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } } GIVEN("UTF-16LE text with BOM and invalid char") { - std::byte data[] = { - std::byte(0xFF), std::byte(0xFE), - std::byte(0x01), std::byte(0x00), // invalid char - std::byte('a'), std::byte(0x00) + constexpr uint8_t kData[] = { + 0xFF, 0xFE, + 0x01, 0x00, // invalid char + 'a', 0x00 }; THEN("Not a text is detected") { - REQUIRE(!kf::TextDetector::isText(data)); + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } }; GIVEN("UTF-16BE text with BOM") { - std::byte data[] = { - std::byte(0xFE), std::byte(0xFF), - std::byte(0x00), std::byte('T'), - std::byte(0x00), std::byte('e'), - std::byte(0x00), std::byte('s'), - std::byte(0x00), std::byte('t') + constexpr uint8_t kData[] = { + 0xFE, 0xFF, + 0x00, 'T', + 0x00, 'e', + 0x00, 's', + 0x00, 't' }; THEN("Text is detected") { - REQUIRE(kf::TextDetector::isText(data)); + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } }; GIVEN("UTF-16BE text with BOM and invalid char") { - std::byte data[] = { - std::byte(0xFE), std::byte(0xFF), - std::byte(0x00), std::byte(0x01), // invalid char - std::byte(0x00), std::byte('a') + constexpr uint8_t kData[] = { + 0xFE, 0xFF, + 0x00, 0x01, // invalid char + 0x00, 'a' }; THEN("Not a text is detected") { - REQUIRE(!kf::TextDetector::isText(data)); + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } }; GIVEN("Valid UTF-32LE text with BOM") { - std::byte data[] = { - std::byte(0xFF), std::byte(0xFE), std::byte(0x00), std::byte(0x00), - std::byte('T'), std::byte(0x00), std::byte(0x00), std::byte(0x00), - std::byte('e'), std::byte(0x00), std::byte(0x00), std::byte(0x00) + constexpr uint8_t kData[] = { + 0xFF, 0xFE, 0x00, 0x00, + 'T', 0x00, 0x00, 0x00, + 'e', 0x00, 0x00, 0x00, + 's', 0x00, 0x00, 0x00, + 't', 0x00, 0x00, 0x00 }; THEN("Text is detected") { - REQUIRE(kf::TextDetector::isText(data)); + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } } GIVEN("UTF-32LE text with BOM and invalid char") { - std::byte data[] = { - std::byte(0xFF), std::byte(0xFE), std::byte(0x00), std::byte(0x00), - std::byte(0x01), std::byte(0x00), std::byte(0x00), std::byte(0x00), // invalid char - std::byte('a'), std::byte(0x00), std::byte(0x00), std::byte(0x00) + constexpr uint8_t kData[] = { + 0xFF, 0xFE, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, // invalid char + 'a', 0x00, 0x00, 0x00 }; THEN("Not a text is detected") { - REQUIRE(!kf::TextDetector::isText(data)); + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } } GIVEN("Valid UTF-32BE text with BOM") { - std::byte data[] = { - std::byte(0x00), std::byte(0x00), std::byte(0xFE), std::byte(0xFF), - std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte('T'), - std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte('e') + constexpr uint8_t kData[] = { + 0x00, 0x00, 0xFE, 0xFF, + 0x00, 0x00, 0x00, 'T', + 0x00, 0x00, 0x00, 'e', + 0x00, 0x00, 0x00, 's', + 0x00, 0x00, 0x00, 't' }; THEN("Text is detected") { - REQUIRE(kf::TextDetector::isText(data)); + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } } GIVEN("UTF-32BE text with BOM and invalid char") { - std::byte data[] = { - std::byte(0x00), std::byte(0x00), std::byte(0xFE), std::byte(0xFF), - std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte(0x01), // invalid char - std::byte(0x00), std::byte(0x00), std::byte(0x00), std::byte('a') + constexpr uint8_t kData[] = { + 0x00, 0x00, 0xFE, 0xFF, + 0x00, 0x00, 0x00, 0x01, // invalid char + 0x00, 0x00, 0x00, 'a' }; THEN("Not a text is detected") { - REQUIRE(!kf::TextDetector::isText(data)); + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } } } From dd8cfc9fa28a953c68e1ef7c90d5a8e5e2c90322 Mon Sep 17 00:00:00 2001 From: Vlada Kanivets Date: Wed, 13 Aug 2025 17:06:28 +0200 Subject: [PATCH 3/3] add more tests --- test/TextDetectorTest.cpp | 76 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/test/TextDetectorTest.cpp b/test/TextDetectorTest.cpp index 7252d44..4c53616 100644 --- a/test/TextDetectorTest.cpp +++ b/test/TextDetectorTest.cpp @@ -142,4 +142,80 @@ SCENARIO("TextDetector::isText") REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); } } + + GIVEN("Valid UTF-16BE text without BOM") + { + constexpr uint8_t kData[] = { + 0x00, 'T', + 0x00, 'e', + 0x00, 's', + 0x00, 't' + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); + } + }; + + GIVEN("UTF-16BE text without BOM and invalid char") + { + constexpr uint8_t kData[] = { + 0x00, 0x01, // invalid char + 0x00, 'a' + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); + } + }; + + GIVEN("Valid UTF-16LE text without BOM") + { + constexpr uint8_t kData[] = { + 'T', 0x00, + 'e', 0x00, + 's', 0x00, + 't', 0x00 + }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); + } + } + + GIVEN("UTF-16LE text without BOM and invalid char") + { + constexpr uint8_t kData[] = { + 0x01, 0x00, // invalid char + 'a', 0x00 + }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); + } + }; + + GIVEN("Valid UTF-8 text without BOM") + { + constexpr uint8_t kData[] = { 'T', 'e', 's', 't' }; + + THEN("Text is detected") + { + REQUIRE(kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); + } + }; + + GIVEN("UTF-8 text without BOM and invalid char") + { + constexpr uint8_t kData[] = { 0x01, 'a', 'b' }; + + THEN("Not a text is detected") + { + REQUIRE(!kf::TextDetector::isText(std::as_bytes(std::span{ kData }))); + } + }; }