Skip to content

Commit 17d42a1

Browse files
Validation: add UTF-8 validation support
This implements: - CborValidateUtf8 Signed-off-by: Thiago Macieira <[email protected]>
1 parent c85bcf5 commit 17d42a1

File tree

4 files changed

+160
-62
lines changed

4 files changed

+160
-62
lines changed

src/cborpretty.c

Lines changed: 9 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "cbor.h"
3232
#include "cborinternal_p.h"
3333
#include "compilersupport_p.h"
34+
#include "utf8_p.h"
3435

3536
#include <float.h>
3637
#include <inttypes.h>
@@ -161,10 +162,13 @@ static CborError hexDump(FILE *out, const void *ptr, size_t n)
161162
* On UTF-8 decoding error, it returns CborErrorInvalidUtf8TextString */
162163
static CborError utf8EscapedDump(FILE *out, const void *ptr, size_t n)
163164
{
164-
const char *buffer = (const char *)ptr;
165-
uint32_t uc;
166-
while (n--) {
167-
uc = (uint8_t)*buffer++;
165+
const uint8_t *buffer = (const uint8_t *)ptr;
166+
const uint8_t * const end = buffer + n;
167+
while (buffer < end) {
168+
uint32_t uc = get_utf8(&buffer, end);
169+
if (uc == ~0U)
170+
return CborErrorInvalidUtf8TextString;
171+
168172
if (uc < 0x80) {
169173
/* single-byte UTF-8 */
170174
if (uc < 0x7f && uc >= 0x20 && uc != '\\' && uc != '"') {
@@ -202,65 +206,8 @@ static CborError utf8EscapedDump(FILE *out, const void *ptr, size_t n)
202206
continue;
203207
}
204208

205-
/* multi-byte UTF-8, decode it */
206-
unsigned charsNeeded;
207-
uint32_t min_uc;
208-
if (unlikely(uc <= 0xC1))
209-
return CborErrorInvalidUtf8TextString;
210-
if (uc < 0xE0) {
211-
/* two-byte UTF-8 */
212-
charsNeeded = 2;
213-
min_uc = 0x80;
214-
uc &= 0x1f;
215-
} else if (uc < 0xF0) {
216-
/* three-byte UTF-8 */
217-
charsNeeded = 3;
218-
min_uc = 0x800;
219-
uc &= 0x0f;
220-
} else if (uc < 0xF5) {
221-
/* four-byte UTF-8 */
222-
charsNeeded = 4;
223-
min_uc = 0x10000;
224-
uc &= 0x07;
225-
} else {
226-
return CborErrorInvalidUtf8TextString;
227-
}
228-
229-
if (n < charsNeeded - 1)
230-
return CborErrorInvalidUtf8TextString;
231-
n -= charsNeeded - 1;
232-
233-
/* first continuation character */
234-
uint8_t b = (uint8_t)*buffer++;
235-
if ((b & 0xc0) != 0x80)
236-
return CborErrorInvalidUtf8TextString;
237-
uc <<= 6;
238-
uc |= b & 0x3f;
239-
240-
if (charsNeeded > 2) {
241-
/* second continuation character */
242-
b = (uint8_t)*buffer++;
243-
if ((b & 0xc0) != 0x80)
244-
return CborErrorInvalidUtf8TextString;
245-
uc <<= 6;
246-
uc |= b & 0x3f;
247-
248-
if (charsNeeded > 3) {
249-
/* third continuation character */
250-
b = (uint8_t)*buffer++;
251-
if ((b & 0xc0) != 0x80)
252-
return CborErrorInvalidUtf8TextString;
253-
uc <<= 6;
254-
uc |= b & 0x3f;
255-
}
256-
}
257-
258-
/* overlong sequence? surrogate pair? out of range? */
259-
if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
260-
return CborErrorInvalidUtf8TextString;
261-
262209
/* now print the sequence */
263-
if (charsNeeded > 3) {
210+
if (uc > 0xffffU) {
264211
/* needs surrogate pairs */
265212
if (fprintf(out, "\\u%04" PRIX32 "\\u%04" PRIX32,
266213
(uc >> 10) + 0xd7c0, /* high surrogate */

src/cborvalidation.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "cbor.h"
3232
#include "cborinternal_p.h"
3333
#include "compilersupport_p.h"
34+
#include "utf8_p.h"
3435

3536
#include <string.h>
3637

@@ -227,6 +228,18 @@ static const struct KnownTagData knownTagData[] = {
227228

228229
static CborError validate_value(CborValue *it, int flags, int recursionLeft);
229230

231+
static inline CborError validate_utf8_string(const void *ptr, size_t n)
232+
{
233+
const uint8_t *buffer = (const uint8_t *)ptr;
234+
const uint8_t * const end = buffer + n;
235+
while (buffer < end) {
236+
uint32_t uc = get_utf8(&buffer, end);
237+
if (uc == ~0U)
238+
return CborErrorInvalidUtf8TextString;
239+
}
240+
return CborNoError;
241+
}
242+
230243
static inline CborError validate_simple_type(uint8_t simple_type, int flags)
231244
{
232245
/* At current time, all known simple types are those from RFC 7049,
@@ -421,6 +434,12 @@ static CborError validate_value(CborValue *it, int flags, int recursionLeft)
421434
return err;
422435
if (!ptr)
423436
break;
437+
438+
if (type == CborTextStringType && flags & CborValidateUtf8) {
439+
err = validate_utf8_string(ptr, n);
440+
if (err)
441+
return err;
442+
}
424443
}
425444

426445
return CborNoError;

src/utf8_p.h

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/****************************************************************************
2+
**
3+
** Copyright (C) 2017 Intel Corporation
4+
**
5+
** Permission is hereby granted, free of charge, to any person obtaining a copy
6+
** of this software and associated documentation files (the "Software"), to deal
7+
** in the Software without restriction, including without limitation the rights
8+
** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
** copies of the Software, and to permit persons to whom the Software is
10+
** furnished to do so, subject to the following conditions:
11+
**
12+
** The above copyright notice and this permission notice shall be included in
13+
** all copies or substantial portions of the Software.
14+
**
15+
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21+
** THE SOFTWARE.
22+
**
23+
****************************************************************************/
24+
25+
#include "compilersupport_p.h"
26+
27+
#include <stdint.h>
28+
29+
/*
 * Decodes a single UTF-8 encoded code point from the byte range
 * [*buffer, end).
 *
 *  buffer  in/out read cursor; on success it is advanced past every byte of
 *          the decoded sequence.
 *  end     one past the last readable byte.
 *
 * Returns the decoded code point, or ~0U when:
 *  - the range is empty,
 *  - the lead byte is a stray continuation byte or can never start a valid
 *    sequence (0x80..0xC1, 0xF5..0xFF),
 *  - the sequence is truncated by `end`,
 *  - a continuation byte is not of the form 10xxxxxx,
 *  - the result is overlong, a surrogate (U+D800..U+DFFF) or above U+10FFFF.
 *
 * On failure *buffer may already have been advanced past part of the
 * rejected sequence.
 */
static inline uint32_t get_utf8(const uint8_t **buffer, const uint8_t *end)
{
    uint32_t uc;
    ptrdiff_t n = end - *buffer;    /* bytes available, lead byte included */
    if (n <= 0)
        return ~0U;

    uc = *(*buffer)++;
    if (uc < 0x80) {
        /* single-byte UTF-8 */
        return uc;
    }

    /* multi-byte UTF-8, decode it */
    int charsNeeded;
    uint32_t min_uc;
    if (uc < 0xC2) {
        /* 0x80..0xBF: stray continuation; 0xC0, 0xC1: always overlong */
        return ~0U;
    }
    if (uc < 0xE0) {
        /* two-byte UTF-8 */
        charsNeeded = 2;
        min_uc = 0x80;
        uc &= 0x1f;
    } else if (uc < 0xF0) {
        /* three-byte UTF-8 */
        charsNeeded = 3;
        min_uc = 0x800;
        uc &= 0x0f;
    } else if (uc < 0xF5) {
        /* four-byte UTF-8 */
        charsNeeded = 4;
        min_uc = 0x10000;
        uc &= 0x07;
    } else {
        return ~0U;
    }

    /* n counts the lead byte as well, so the complete sequence must fit in
     * it; comparing against charsNeeded - 1 would let the last continuation
     * byte be read from beyond `end`. */
    if (n < charsNeeded)
        return ~0U;

    /* continuation characters: each one contributes its low six bits */
    for (int i = 1; i < charsNeeded; ++i) {
        uint8_t b = *(*buffer)++;
        if ((b & 0xc0) != 0x80)
            return ~0U;
        uc <<= 6;
        uc |= b & 0x3f;
    }

    /* overlong sequence? surrogate code point? out of range? */
    if (uc < min_uc || uc - 0xd800U < 2048U || uc > 0x10ffff)
        return ~0U;

    return uc;
}

tests/parser/tst_parser.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,39 @@ void tst_Parser::strictValidation_data()
16171617
QTest::newRow("tag-4294967296") << raw("\xdb\0\0\0\1\0\0\0\0\x60") << int(CborValidateCanonicalFormat) << CborNoError;
16181618

16191619
// strict mode
1620+
QTest::newRow("invalid-utf8-1char") << raw("\x61\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1621+
QTest::newRow("invalid-utf8-2chars-1") << raw("\x62\xc2\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1622+
QTest::newRow("invalid-utf8-2chars-2") << raw("\x62\xc3\xdf") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1623+
QTest::newRow("invalid-utf8-2chars-3") << raw("\x62\xc7\xf0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1624+
QTest::newRow("invalid-utf8-3chars-1") << raw("\x63\xe0\xa0\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1625+
QTest::newRow("invalid-utf8-3chars-2") << raw("\x63\xe0\xc0\xa0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1626+
QTest::newRow("invalid-utf8-4chars-1") << raw("\x64\xf0\x90\x80\xc0") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1627+
QTest::newRow("invalid-utf8-4chars-2") << raw("\x64\xf0\x90\xc0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1628+
QTest::newRow("invalid-utf8-4chars-3") << raw("\x64\xf0\xc0\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1629+
QTest::newRow("invalid-utf8-hi-surrogate") << raw("\x63\xed\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1630+
QTest::newRow("invalid-utf8-lo-surrogate") << raw("\x63\xed\xb0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1631+
QTest::newRow("invalid-utf8-surrogate-pair") << raw("\x66\xed\xa0\x80\xed\xb0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1632+
QTest::newRow("invalid-utf8-non-unicode-1") << raw("\x64\xf4\x90\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1633+
QTest::newRow("invalid-utf8-non-unicode-2") << raw("\x65\xf8\x88\x80\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1634+
QTest::newRow("invalid-utf8-non-unicode-3") << raw("\x66\xfc\x84\x80\x80\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1635+
QTest::newRow("invalid-utf8-non-unicode-4") << raw("\x66\xfd\xbf\xbf\xbf\xbf\xbf") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1636+
QTest::newRow("invalid-utf8-fe") << raw("\x61\xfe") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1637+
QTest::newRow("invalid-utf8-ff") << raw("\x61\xff") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1638+
QTest::newRow("invalid-utf8-overlong-1-2") << raw("\x62\xc1\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1639+
QTest::newRow("invalid-utf8-overlong-1-3") << raw("\x63\xe0\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1640+
QTest::newRow("invalid-utf8-overlong-1-4") << raw("\x64\xf0\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1641+
QTest::newRow("invalid-utf8-overlong-1-5") << raw("\x65\xf8\x80\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1642+
QTest::newRow("invalid-utf8-overlong-1-6") << raw("\x66\xfc\x80\x80\x80\x81\x81") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1643+
QTest::newRow("invalid-utf8-overlong-2-3") << raw("\x63\xe0\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1644+
QTest::newRow("invalid-utf8-overlong-2-4") << raw("\x64\xf0\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1645+
QTest::newRow("invalid-utf8-overlong-2-5") << raw("\x65\xf8\x80\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1646+
QTest::newRow("invalid-utf8-overlong-2-6") << raw("\x66\xfc\x80\x80\x80\x82\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1647+
QTest::newRow("invalid-utf8-overlong-3-4") << raw("\x64\xf0\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1648+
QTest::newRow("invalid-utf8-overlong-3-5") << raw("\x65\xf8\x80\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1649+
QTest::newRow("invalid-utf8-overlong-3-6") << raw("\x66\xfc\x80\x80\x80\xa0\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1650+
QTest::newRow("invalid-utf8-overlong-4-5") << raw("\x65\xf8\x80\x84\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1651+
QTest::newRow("invalid-utf8-overlong-4-6") << raw("\x66\xfc\x80\x80\x84\x80\x80") << int(CborValidateStrictMode) << CborErrorInvalidUtf8TextString;
1652+
16201653
QTest::newRow("tag-0-unsigned") << raw("\xc0\x00") << int(CborValidateStrictMode) << CborErrorInappropriateTagForType;
16211654
QTest::newRow("tag-0-bytearray") << raw("\xc0\x40") << int(CborValidateStrictMode) << CborErrorInappropriateTagForType;
16221655
QTest::newRow("tag-0-string") << raw("\xc0\x60") << int(CborValidateStrictMode) << CborNoError;

0 commit comments

Comments
 (0)