Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions include/kf/TextDetector.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ namespace kf
{
using namespace std;

///////////////////////////////////////////////////////////////////////////////////////////////////
// TextDetector class provides a utility to determine whether a given buffer contains textual data.
// A buffer that contains control characters (other than \t, \n, \r) is not considered valid text.
// Supported encodings include ANSI, UTF-8, UTF-16 (LE/BE), and UTF-32 (LE/BE).
class TextDetector
{
public:
Expand Down
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ wdk_add_driver(kf-test WINVER NTDDI_WIN10 STL
HexTest.cpp
MapTest.cpp
Vector.cpp
TextDetectorTest.cpp
)

target_link_libraries(kf-test kf::kf kmtest::kmtest)
Expand Down
148 changes: 148 additions & 0 deletions test/TextDetectorTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include "pch.h"
#include <kf/TextDetector.h>

// Checks kf::TextDetector::isText against BOM-prefixed buffers in UTF-8,
// UTF-16 (LE/BE) and UTF-32 (LE/BE).
//
// Each encoding is covered by a pair of cases:
//   * a buffer holding only printable characters after the BOM, which must be
//     reported as text;
//   * a buffer whose first code unit after the BOM is the control character
//     U+0001, which must be reported as not text.
SCENARIO("TextDetector::isText")
{
    GIVEN("Valid UTF-8 text with BOM")
    {
        std::byte buffer[] = {
            std::byte{0xEF}, std::byte{0xBB}, std::byte{0xBF}, // BOM
            std::byte{'T'},
            std::byte{'e'},
            std::byte{'s'},
            std::byte{'t'}
        };

        THEN("Text is detected")
        {
            REQUIRE(kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("UTF-8 text with BOM and invalid char")
    {
        std::byte buffer[] = {
            std::byte{0xEF}, std::byte{0xBB}, std::byte{0xBF}, // BOM
            std::byte{0x01},                                   // control character
            std::byte{'a'},
            std::byte{'b'}
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("Valid UTF-16LE text with BOM")
    {
        // Two bytes per code unit, least significant byte first.
        std::byte buffer[] = {
            std::byte{0xFF}, std::byte{0xFE}, // BOM
            std::byte{'T'}, std::byte{0x00},
            std::byte{'e'}, std::byte{0x00},
            std::byte{'s'}, std::byte{0x00},
            std::byte{'t'}, std::byte{0x00}
        };

        THEN("Text is detected")
        {
            REQUIRE(kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("UTF-16LE text with BOM and invalid char")
    {
        std::byte buffer[] = {
            std::byte{0xFF}, std::byte{0xFE}, // BOM
            std::byte{0x01}, std::byte{0x00}, // control character
            std::byte{'a'}, std::byte{0x00}
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("UTF-16BE text with BOM")
    {
        // Two bytes per code unit, most significant byte first.
        std::byte buffer[] = {
            std::byte{0xFE}, std::byte{0xFF}, // BOM
            std::byte{0x00}, std::byte{'T'},
            std::byte{0x00}, std::byte{'e'},
            std::byte{0x00}, std::byte{'s'},
            std::byte{0x00}, std::byte{'t'}
        };

        THEN("Text is detected")
        {
            REQUIRE(kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("UTF-16BE text with BOM and invalid char")
    {
        std::byte buffer[] = {
            std::byte{0xFE}, std::byte{0xFF}, // BOM
            std::byte{0x00}, std::byte{0x01}, // control character
            std::byte{0x00}, std::byte{'a'}
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("Valid UTF-32LE text with BOM")
    {
        // Four bytes per code unit, least significant byte first.
        std::byte buffer[] = {
            std::byte{0xFF}, std::byte{0xFE}, std::byte{0x00}, std::byte{0x00}, // BOM
            std::byte{'T'}, std::byte{0x00}, std::byte{0x00}, std::byte{0x00},
            std::byte{'e'}, std::byte{0x00}, std::byte{0x00}, std::byte{0x00}
        };

        THEN("Text is detected")
        {
            REQUIRE(kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("UTF-32LE text with BOM and invalid char")
    {
        std::byte buffer[] = {
            std::byte{0xFF}, std::byte{0xFE}, std::byte{0x00}, std::byte{0x00}, // BOM
            std::byte{0x01}, std::byte{0x00}, std::byte{0x00}, std::byte{0x00}, // control character
            std::byte{'a'}, std::byte{0x00}, std::byte{0x00}, std::byte{0x00}
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("Valid UTF-32BE text with BOM")
    {
        // Four bytes per code unit, most significant byte first.
        std::byte buffer[] = {
            std::byte{0x00}, std::byte{0x00}, std::byte{0xFE}, std::byte{0xFF}, // BOM
            std::byte{0x00}, std::byte{0x00}, std::byte{0x00}, std::byte{'T'},
            std::byte{0x00}, std::byte{0x00}, std::byte{0x00}, std::byte{'e'}
        };

        THEN("Text is detected")
        {
            REQUIRE(kf::TextDetector::isText(buffer));
        }
    }

    GIVEN("UTF-32BE text with BOM and invalid char")
    {
        std::byte buffer[] = {
            std::byte{0x00}, std::byte{0x00}, std::byte{0xFE}, std::byte{0xFF}, // BOM
            std::byte{0x00}, std::byte{0x00}, std::byte{0x00}, std::byte{0x01}, // control character
            std::byte{0x00}, std::byte{0x00}, std::byte{0x00}, std::byte{'a'}
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!kf::TextDetector::isText(buffer));
        }
    }
}