Skip to content

Commit c6d46a4

Browse files
committed
utf8 tests
1 parent 0b03bb9 commit c6d46a4

File tree

4 files changed

+184
-4
lines changed

4 files changed

+184
-4
lines changed

include/boost/json/detail/sse2.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ count_valid<false>(
131131
continue;
132132
}
133133
// validate utf-8
134-
uint16_t first = classify_utf8(c & 0x7F);
134+
uint16_t first = classify_utf8(c);
135135
uint8_t len = first & 0xFF;
136136
if(BOOST_JSON_UNLIKELY(end - p < len))
137137
break;
@@ -180,7 +180,7 @@ count_valid<false>(
180180
continue;
181181
}
182182
// validate utf-8
183-
uint16_t first = classify_utf8(c & 0x7F);
183+
uint16_t first = classify_utf8(c);
184184
uint8_t len = first & 0xFF;
185185
if(BOOST_JSON_UNLIKELY(end - p < len))
186186
break;

include/boost/json/detail/utf8.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#ifndef BOOST_JSON_DETAIL_UTF8_HPP
1111
#define BOOST_JSON_DETAIL_UTF8_HPP
1212

13+
#include <boost/json/detail/config.hpp>
14+
1315
#include <cstddef>
1416
#include <cstring>
1517
#include <cstdint>
@@ -65,7 +67,7 @@ classify_utf8(char c)
6567
0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
6668
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
6769
};
68-
return first[static_cast<unsigned char>(c)];
70+
return first[static_cast<unsigned char>(c & 0x7F)];
6971
}
7072

7173
inline
@@ -128,7 +130,7 @@ class utf8_sequence
128130
const char* p,
129131
std::size_t remain) noexcept
130132
{
131-
first_ = classify_utf8(*p & 0x7F);
133+
first_ = classify_utf8(*p );
132134
if(remain >= length())
133135
size_ = length();
134136
else

test/Jamfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ local SOURCES =
4848
value_stack.cpp
4949
value_to.cpp
5050
value_ref.cpp
51+
utf8.cpp
5152
visit.cpp
5253
ryu/d2s_intrinsics_test.cpp
5354
ryu/d2s_table_test.cpp

test/utf8.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
//
2+
// Copyright (c) 2022 Dmitry Arkhipov ([email protected])
3+
//
4+
// Distributed under the Boost Software License, Version 1.0. (See accompanying
5+
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6+
//
7+
// Official repository: https://github.com/boostorg/json
8+
//
9+
10+
#include <boost/json/detail/utf8.hpp>
11+
#include <boost/json/string_view.hpp>
12+
13+
#include "test_suite.hpp"
14+
15+
BOOST_JSON_NS_BEGIN
16+
17+
class utf8_test
18+
{
19+
public:
20+
void
21+
testLoadLittleEndian()
22+
{
23+
BOOST_TEST(
24+
detail::load_little_endian<4>("\x01\x02\x03\x04\xFF")
25+
== 0x04030201);
26+
27+
BOOST_TEST(
28+
detail::load_little_endian<4>("\x12\x34\x56\x78\xFF")
29+
== 0x78563412);
30+
31+
BOOST_TEST(
32+
detail::load_little_endian<4>("\xFE\xDC\xBA\x98\xFF")
33+
== 0x98BADCFE);
34+
35+
BOOST_TEST(
36+
detail::load_little_endian<3>("\x12\x45\xFE\xFF")
37+
== 0x00FE4512);
38+
39+
BOOST_TEST(
40+
detail::load_little_endian<3>("\xE0\xA0\x80\xFF")
41+
== 0x0080A0E0);
42+
43+
BOOST_TEST(
44+
detail::load_little_endian<2>("\x37\xFC\xFF")
45+
== 0x0000FC37);
46+
47+
BOOST_TEST(detail::load_little_endian<1>("\xF1\xFF") == 0x000000F1);
48+
}
49+
50+
void
51+
testClassifyUtf8()
52+
{
53+
BOOST_TEST((detail::classify_utf8('\x00') & 0xFF) == 0);
54+
// from code point U+0080 (0xC280 in UTF-8)
55+
BOOST_TEST((detail::classify_utf8('\xC2') & 0xFF) == 2);
56+
// from code point U+07FF (0xDFBF in UTF-8)
57+
BOOST_TEST((detail::classify_utf8('\xDF') & 0xFF) == 2);
58+
// from code point U+0800 (0xE0A080 in UTF-8)
59+
BOOST_TEST((detail::classify_utf8('\xE0') & 0xFF) == 3);
60+
// from code point U+FFFF (0xEFBFBF in UTF-8)
61+
BOOST_TEST((detail::classify_utf8('\xEF') & 0xFF) == 3);
62+
// from code point U+010000 (0xF0908080 in UTF-8)
63+
BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4);
64+
// from code point U+010000 (0xF0908080 in UTF-8)
65+
BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4);
66+
// from code point U+10FFFF (0xF48FBFBF in UTF-8)
67+
BOOST_TEST((detail::classify_utf8('\xF4') & 0xFF) == 4);
68+
}
69+
70+
void
71+
testIsValidUtf8()
72+
{
73+
auto is_valid_utf8 = [](char const* str) {
74+
std::uint16_t first = detail::classify_utf8(*str);
75+
return detail::is_valid_utf8(str, first);
76+
};
77+
78+
BOOST_TEST(is_valid_utf8("\xC2\x80")); // code point U+0080
79+
BOOST_TEST(is_valid_utf8("\xDF\xBF")); // code point U+07FF
80+
BOOST_TEST(is_valid_utf8("\xE0\xA0\x80")); // code point U+0800
81+
BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // code point U+FFFF
82+
BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+010000
83+
BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+10FFFF
84+
85+
BOOST_TEST(! is_valid_utf8("\x80"));
86+
BOOST_TEST(! is_valid_utf8("\xBF"));
87+
88+
BOOST_TEST(! is_valid_utf8("\xDF\x00"));
89+
BOOST_TEST(! is_valid_utf8("\xDF\x7F"));
90+
BOOST_TEST(! is_valid_utf8("\xDF\xFF"));
91+
92+
BOOST_TEST(! is_valid_utf8("\xE0\x00\x80"));
93+
BOOST_TEST(! is_valid_utf8("\xE1\x7F\x80"));
94+
BOOST_TEST(! is_valid_utf8("\xE4\xFF\x80"));
95+
BOOST_TEST(! is_valid_utf8("\xE8\x80\x00"));
96+
BOOST_TEST(! is_valid_utf8("\xEC\x80\x7F"));
97+
BOOST_TEST(! is_valid_utf8("\xEF\x80\xFF"));
98+
99+
BOOST_TEST(! is_valid_utf8("\xF0\x00\x80\x80"));
100+
BOOST_TEST(! is_valid_utf8("\xF1\x7F\x80\x80"));
101+
BOOST_TEST(! is_valid_utf8("\xF2\xFF\x80\x80"));
102+
BOOST_TEST(! is_valid_utf8("\xF3\x80\x00\x80"));
103+
BOOST_TEST(! is_valid_utf8("\xF4\x80\x7F\x80"));
104+
BOOST_TEST(! is_valid_utf8("\xF0\x80\xFF\x80"));
105+
BOOST_TEST(! is_valid_utf8("\xF1\x80\x80\x00"));
106+
BOOST_TEST(! is_valid_utf8("\xF2\x80\x80\x7F"));
107+
BOOST_TEST(! is_valid_utf8("\xF3\x80\x80\xFF"));
108+
}
109+
110+
void
111+
testUtf8Sequence()
112+
{
113+
char const* str = "\xE0\xA0\x80\00\00";
114+
detail::utf8_sequence seq;
115+
seq.save(str, std::strlen(str));
116+
BOOST_TEST(seq.complete());
117+
BOOST_TEST(seq.length() == 3);
118+
BOOST_TEST(seq.needed() == 0);
119+
BOOST_TEST(string_view(seq.data(), 3) == str);
120+
BOOST_TEST(seq.valid());
121+
122+
seq.save(str, 1);
123+
BOOST_TEST(!seq.complete());
124+
BOOST_TEST(seq.length() == 3);
125+
BOOST_TEST(seq.needed() == 2);
126+
BOOST_TEST(string_view(seq.data(), 1) == string_view(str, 1));
127+
128+
seq.append(str + 1, 1);
129+
BOOST_TEST(!seq.complete());
130+
BOOST_TEST(seq.length() == 3);
131+
BOOST_TEST(seq.needed() == 1);
132+
BOOST_TEST(string_view(seq.data(), 2) == string_view(str, 2));
133+
134+
seq.append(str + 2, 2);
135+
BOOST_TEST(seq.complete());
136+
BOOST_TEST(seq.length() == 3);
137+
BOOST_TEST(seq.needed() == 0);
138+
BOOST_TEST(string_view(seq.data(), 3) == str);
139+
BOOST_TEST(seq.valid());
140+
141+
seq.append(str + 3, 1);
142+
BOOST_TEST(seq.complete());
143+
BOOST_TEST(seq.length() == 3);
144+
BOOST_TEST(seq.needed() == 0);
145+
BOOST_TEST(string_view(seq.data(), 3) == str);
146+
BOOST_TEST(seq.valid());
147+
148+
str = "\xF0\x90\x80\x80";
149+
seq.save(str, std::strlen(str));
150+
BOOST_TEST(seq.complete());
151+
BOOST_TEST(seq.length() == 4);
152+
BOOST_TEST(seq.needed() == 0);
153+
BOOST_TEST(string_view(seq.data(), 4) == str);
154+
BOOST_TEST(seq.valid());
155+
156+
str = "\xF0\x90\x80\xC0";
157+
seq.save(str, std::strlen(str));
158+
BOOST_TEST(seq.complete());
159+
BOOST_TEST(seq.length() == 4);
160+
BOOST_TEST(seq.needed() == 0);
161+
BOOST_TEST(string_view(seq.data(), 4) == str);
162+
BOOST_TEST(!seq.valid());
163+
}
164+
165+
void
166+
run()
167+
{
168+
testLoadLittleEndian();
169+
testClassifyUtf8();
170+
testIsValidUtf8();
171+
testUtf8Sequence();
172+
}
173+
};
174+
175+
TEST_SUITE(utf8_test, "boost.json.utf8");
176+
177+
BOOST_JSON_NS_END

0 commit comments

Comments
 (0)