Skip to content

Commit c50da26

Browse files
Kellesi, Vlada Kanivets, SergiusTheBest
authored
Add tests for TextDetector (KF-25, #42)
* add tests for TextDetector * refactoring * add more tests --------- Co-authored-by: Vlada Kanivets <vlada.kanivets@apriorit.com> Co-authored-by: Sergey Podobry <sergius@apriorit.com>
1 parent 88e65f8 commit c50da26

File tree

3 files changed

+226
-0
lines changed

3 files changed

+226
-0
lines changed

include/kf/TextDetector.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ namespace kf
99
{
1010
using namespace std;
1111

12+
///////////////////////////////////////////////////////////////////////////////////////////////////
13+
// TextDetector class provides a utility to determine whether a given buffer contains textual data.
14+
// It filters out control characters (except for \t, \n, \r) to verify that the content represents valid text.
15+
// Supported encodings include ANSI, UTF-8, UTF-16 (LE/BE), and UTF-32 (LE/BE).
1216
class TextDetector
1317
{
1418
public:

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ wdk_add_driver(kf-test WINVER NTDDI_WIN10 STL
4444
HexTest.cpp
4545
MapTest.cpp
4646
Vector.cpp
47+
TextDetectorTest.cpp
4748
ScopeExitTest.cpp
4849
SingletonTest.cpp
4950
DoubleLinkedListTest.cpp

test/TextDetectorTest.cpp

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
#include "pch.h"
2+
#include <kf/TextDetector.h>
3+
4+
// Exercises kf::TextDetector::isText over UTF-8, UTF-16 (LE/BE) and UTF-32 (LE/BE) buffers.
// Each encoding is checked twice: once with plain "Test" content that must be accepted, and
// once with a 0x01 control character in the payload that must be rejected.
SCENARIO("TextDetector::isText")
{
    // Views a raw uint8_t array as bytes and feeds it to the detector.
    const auto detectsText = [](const auto& bytes)
    {
        return kf::TextDetector::isText(std::as_bytes(std::span{ bytes }));
    };

    // ---- UTF-8, BOM present ----

    GIVEN("Valid UTF-8 text with BOM")
    {
        constexpr uint8_t kBytes[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-8 text with BOM and invalid char")
    {
        // 0x01 directly follows the BOM
        constexpr uint8_t kBytes[] = { 0xEF, 0xBB, 0xBF, 0x01, 'a', 'b' };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    // ---- UTF-16, BOM present ----

    GIVEN("Valid UTF-16LE text with BOM")
    {
        constexpr uint8_t kBytes[] = {
            0xFF, 0xFE, // BOM
            'T', 0x00,
            'e', 0x00,
            's', 0x00,
            't', 0x00
        };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-16LE text with BOM and invalid char")
    {
        constexpr uint8_t kBytes[] = {
            0xFF, 0xFE, // BOM
            0x01, 0x00, // control character
            'a', 0x00
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    GIVEN("UTF-16BE text with BOM")
    {
        constexpr uint8_t kBytes[] = {
            0xFE, 0xFF, // BOM
            0x00, 'T',
            0x00, 'e',
            0x00, 's',
            0x00, 't'
        };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-16BE text with BOM and invalid char")
    {
        constexpr uint8_t kBytes[] = {
            0xFE, 0xFF, // BOM
            0x00, 0x01, // control character
            0x00, 'a'
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    // ---- UTF-32, BOM present ----

    GIVEN("Valid UTF-32LE text with BOM")
    {
        constexpr uint8_t kBytes[] = {
            0xFF, 0xFE, 0x00, 0x00, // BOM
            'T', 0x00, 0x00, 0x00,
            'e', 0x00, 0x00, 0x00,
            's', 0x00, 0x00, 0x00,
            't', 0x00, 0x00, 0x00
        };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-32LE text with BOM and invalid char")
    {
        constexpr uint8_t kBytes[] = {
            0xFF, 0xFE, 0x00, 0x00, // BOM
            0x01, 0x00, 0x00, 0x00, // control character
            'a', 0x00, 0x00, 0x00
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    GIVEN("Valid UTF-32BE text with BOM")
    {
        constexpr uint8_t kBytes[] = {
            0x00, 0x00, 0xFE, 0xFF, // BOM
            0x00, 0x00, 0x00, 'T',
            0x00, 0x00, 0x00, 'e',
            0x00, 0x00, 0x00, 's',
            0x00, 0x00, 0x00, 't'
        };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-32BE text with BOM and invalid char")
    {
        constexpr uint8_t kBytes[] = {
            0x00, 0x00, 0xFE, 0xFF, // BOM
            0x00, 0x00, 0x00, 0x01, // control character
            0x00, 0x00, 0x00, 'a'
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    // ---- UTF-16, no BOM ----

    GIVEN("Valid UTF-16BE text without BOM")
    {
        constexpr uint8_t kBytes[] = {
            0x00, 'T',
            0x00, 'e',
            0x00, 's',
            0x00, 't'
        };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-16BE text without BOM and invalid char")
    {
        constexpr uint8_t kBytes[] = {
            0x00, 0x01, // control character
            0x00, 'a'
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    GIVEN("Valid UTF-16LE text without BOM")
    {
        constexpr uint8_t kBytes[] = {
            'T', 0x00,
            'e', 0x00,
            's', 0x00,
            't', 0x00
        };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-16LE text without BOM and invalid char")
    {
        constexpr uint8_t kBytes[] = {
            0x01, 0x00, // control character
            'a', 0x00
        };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }

    // ---- UTF-8, no BOM ----

    GIVEN("Valid UTF-8 text without BOM")
    {
        constexpr uint8_t kBytes[] = { 'T', 'e', 's', 't' };

        THEN("Text is detected")
        {
            REQUIRE(detectsText(kBytes));
        }
    }

    GIVEN("UTF-8 text without BOM and invalid char")
    {
        // 0x01 is the very first byte
        constexpr uint8_t kBytes[] = { 0x01, 'a', 'b' };

        THEN("Not a text is detected")
        {
            REQUIRE(!detectsText(kBytes));
        }
    }
}

0 commit comments

Comments
 (0)