Skip to content

Commit 10ecb82

Browse files
larsxschneidergitster
authored and committed
utf8: add function to detect prohibited UTF-16/32 BOM
Whenever a data stream is declared to be UTF-16BE, UTF-16LE, UTF-32BE or UTF-32LE, a BOM must not be used [1]. The new function returns true if data declared with one of these encodings nevertheless starts with a BOM. This function is used in a subsequent commit. [1] http://unicode.org/faq/utf_bom.html#bom10 Signed-off-by: Lars Schneider <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 2f0c4a3 commit 10ecb82

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

utf8.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,32 @@ char *reencode_string_len(const char *in, int insz,
560560
}
561561
#endif
562562

563+
/*
 * Report whether the `len`-byte buffer `data` begins with the
 * `bom_len`-byte sequence `bom`.
 *
 * Returns 1 on a match and 0 otherwise. A NULL `data` or `bom`, or a
 * buffer shorter than `bom_len`, never matches.
 */
static int has_bom_prefix(const char *data, size_t len,
			  const char *bom, size_t bom_len)
{
	if (!data || !bom)
		return 0;

	/* Too short to hold the whole marker: do not read past the end. */
	if (len < bom_len)
		return 0;

	return memcmp(data, bom, bom_len) == 0;
}
568+
569+
/*
 * Byte order marks for the UTF-16 and UTF-32 encodings, in big-endian
 * and little-endian byte order.
 *
 * The bytes are written as character constants rather than integer
 * constants such as 0xFE: those values do not fit in a `char` on
 * platforms where `char` is signed, so initializing from them relies
 * on an implementation-defined conversion and triggers overflow
 * warnings. The stored byte patterns are the same either way.
 */
static const char utf16_be_bom[] = {'\xFE', '\xFF'};
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};
573+
574+
int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
575+
{
576+
return (
577+
(same_utf_encoding("UTF-16BE", enc) ||
578+
same_utf_encoding("UTF-16LE", enc)) &&
579+
(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) ||
580+
has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom)))
581+
) || (
582+
(same_utf_encoding("UTF-32BE", enc) ||
583+
same_utf_encoding("UTF-32LE", enc)) &&
584+
(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) ||
585+
has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom)))
586+
);
587+
}
588+
563589
/*
564590
* Returns first character length in bytes for multi-byte `text` according to
565591
* `encoding`.

utf8.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,13 @@ typedef enum {
7070
void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int width,
7171
const char *s);
7272

73+
/*
74+
* If a data stream is declared as UTF-16BE or UTF-16LE, then a UTF-16
75+
* BOM must not be used [1]. The same applies for the UTF-32 equivalents.
76+
* The function returns true if this rule is violated.
77+
*
78+
* [1] http://unicode.org/faq/utf_bom.html#bom10
79+
*/
80+
/* `enc` is the declared encoding name; `data` and `len` describe the buffer checked for a leading BOM. */
int has_prohibited_utf_bom(const char *enc, const char *data, size_t len);
81+
7382
#endif

0 commit comments

Comments
 (0)