Skip to content

Commit 79444c9

Browse files
bk2204 authored and gitster committed
utf8: handle systems that don't write BOM for UTF-16
When serializing UTF-16 (and UTF-32), there are three possible ways to write the stream. One can write the data with a BOM in either big-endian or little-endian format, or one can write the data without a BOM in big-endian format.

Most systems' iconv implementations choose to write it with a BOM in some endianness, since this is the most foolproof, and it is resistant to misinterpretation on Windows, where UTF-16 and the little-endian serialization are very common. For compatibility with Windows and to avoid accidental misuse there, Git always wants to write UTF-16 with a BOM, and will refuse to read UTF-16 without it.

However, musl's iconv implementation writes UTF-16 without a BOM, relying on the user to interpret it as big-endian. This causes t0028 and the related functionality to fail, since Git won't read the file without a BOM.

Add a Makefile and #define knob, ICONV_OMITS_BOM, that can be set if the iconv implementation has this behavior. When set, Git will write a BOM manually for UTF-16 and UTF-32 and then force the data to be written in UTF-16BE or UTF-32BE. We choose big-endian behavior here because the tests use the raw "UTF-16" encoding, which will be big-endian when the implementation requires this knob to be set.

Update the tests to detect this case and write test data with an added BOM if necessary. Always write the BOM in the tests in big-endian format, since all iconv implementations that omit a BOM must use big-endian serialization according to the Unicode standard.

Preserve the existing behavior for systems which do not have this knob enabled, since they may use optimized implementations, including defaulting to the native endianness, which may improve performance.

Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 11ad41d commit 79444c9

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

Makefile

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -259,6 +259,10 @@ all::
259259
# Define OLD_ICONV if your library has an old iconv(), where the second
260260
# (input buffer pointer) parameter is declared with type (const char **).
261261
#
262+
# Define ICONV_OMITS_BOM if your iconv implementation does not write a
263+
# byte-order mark (BOM) when writing UTF-16 or UTF-32 and always writes in
264+
# big-endian format.
265+
#
262266
# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
263267
#
264268
# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
@@ -1415,6 +1419,9 @@ ifndef NO_ICONV
14151419
EXTLIBS += $(ICONV_LINK) -liconv
14161420
endif
14171421
endif
1422+
ifdef ICONV_OMITS_BOM
1423+
BASIC_CFLAGS += -DICONV_OMITS_BOM
1424+
endif
14181425
ifdef NEEDS_LIBGEN
14191426
EXTLIBS += -lgen
14201427
endif

t/t0028-working-tree-encoding.sh

Lines changed: 29 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,15 +6,39 @@ test_description='working-tree-encoding conversion via gitattributes'
66

77
GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING
88

9+
# Prerequisite: satisfied when the system iconv encodes "abc" to UTF-16 as
# exactly 6 bytes (three 2-byte code units), i.e. without a leading BOM.
test_lazy_prereq NO_UTF16_BOM '
10+
test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
11+
'
12+
13+
# Prerequisite: satisfied when the system iconv encodes "abc" to UTF-32 as
# exactly 12 bytes (three 4-byte code units), i.e. without a leading BOM.
test_lazy_prereq NO_UTF32_BOM '
14+
test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
15+
'
16+
17+
# Convert UTF-8 on stdin to UTF-16 on stdout, guaranteeing a leading BOM.
#
# When the NO_UTF16_BOM prerequisite holds (iconv writes no BOM), a
# big-endian BOM (FE FF) is emitted before the converted data.
#
# The BOM bytes are spelled as octal escapes: POSIX printf does not define
# "\x" hex escapes, and some shells (e.g. dash) print them literally.
write_utf16 () {
	if test_have_prereq NO_UTF16_BOM
	then
		printf '\376\377'
	fi &&
	iconv -f UTF-8 -t UTF-16
}
24+
25+
# Convert UTF-8 on stdin to UTF-32 on stdout, guaranteeing a leading BOM.
#
# When the NO_UTF32_BOM prerequisite holds (iconv writes no BOM), a
# big-endian BOM (00 00 FE FF) is emitted before the converted data.
#
# The BOM bytes are spelled as octal escapes: POSIX printf does not define
# "\x" hex escapes, and some shells (e.g. dash) print them literally.
write_utf32 () {
	if test_have_prereq NO_UTF32_BOM
	then
		printf '\0\0\376\377'
	fi &&
	iconv -f UTF-8 -t UTF-32
}
32+
933
test_expect_success 'setup test files' '
1034
git config core.eol lf &&
1135
1236
text="hallo there!\ncan you read me?" &&
1337
echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
1438
echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
1539
printf "$text" >test.utf8.raw &&
16-
printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
17-
printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
40+
printf "$text" | write_utf16 >test.utf16.raw &&
41+
printf "$text" | write_utf32 >test.utf32.raw &&
1842
printf "\377\376" >test.utf16lebom.raw &&
1943
printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
2044
@@ -124,8 +148,8 @@ do
124148
test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
125149
test_when_finished "git reset --hard HEAD^" &&
126150
127-
cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw &&
128-
cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw &&
151+
cat lf.utf8.raw | write_utf${i} >lf.utf${i}.raw &&
152+
cat crlf.utf8.raw | write_utf${i} >crlf.utf${i}.raw &&
129153
cp crlf.utf${i}.raw eol.utf${i} &&
130154
131155
cat >expectIndexLF <<-EOF &&
@@ -223,7 +247,7 @@ test_expect_success ICONV_SHIFT_JIS 'check roundtrip encoding' '
223247
224248
text="hallo there!\nroundtrip test here!" &&
225249
printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
226-
printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
250+
printf "$text" | write_utf16 >roundtrip.utf16 &&
227251
echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
228252
229253
# SHIFT-JIS encoded files are round-trip checked by default...

utf8.c

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -559,6 +559,10 @@ char *reencode_string_len(const char *in, size_t insz,
559559
/*
560560
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
561561
* Some users under Windows want the little endian version
562+
*
563+
* We handle UTF-16 and UTF-32 ourselves only if the platform does not
564+
* provide a BOM (which we require), since we want to match the behavior
565+
* of the system tools and libc as much as possible.
562566
*/
563567
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
564568
bom_str = utf16_le_bom;
@@ -568,6 +572,16 @@ char *reencode_string_len(const char *in, size_t insz,
568572
bom_str = utf16_be_bom;
569573
bom_len = sizeof(utf16_be_bom);
570574
out_encoding = "UTF-16BE";
575+
#ifdef ICONV_OMITS_BOM
576+
} else if (same_utf_encoding("UTF-16", out_encoding)) {
577+
bom_str = utf16_be_bom;
578+
bom_len = sizeof(utf16_be_bom);
579+
out_encoding = "UTF-16BE";
580+
} else if (same_utf_encoding("UTF-32", out_encoding)) {
581+
bom_str = utf32_be_bom;
582+
bom_len = sizeof(utf32_be_bom);
583+
out_encoding = "UTF-32BE";
584+
#endif
571585
}
572586

573587
conv = iconv_open(out_encoding, in_encoding);

0 commit comments

Comments
 (0)