Skip to content

Commit 18f9fb6

Browse files
committed
Merge branch 'bc/utf16-portability-fix'
The code and tests assumed that the system-supplied iconv() would always write a BOM in its output when asked to encode to UTF-16 (or UTF-32), but apparently some implementations output big-endian without a BOM. A compile-time knob has been added so that on such systems (e.g. NonStop) Git adds the BOM to the output itself, to increase portability.

* bc/utf16-portability-fix:
  utf8: handle systems that don't write BOM for UTF-16
2 parents 1db999c + 79444c9 commit 18f9fb6

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

Makefile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ all::
259259
# Define OLD_ICONV if your library has an old iconv(), where the second
260260
# (input buffer pointer) parameter is declared with type (const char **).
261261
#
262+
# Define ICONV_OMITS_BOM if your iconv implementation does not write a
263+
# byte-order mark (BOM) when writing UTF-16 or UTF-32 and always writes in
264+
# big-endian format.
265+
#
262266
# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
263267
#
264268
# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
@@ -1417,6 +1421,9 @@ ifndef NO_ICONV
14171421
EXTLIBS += $(ICONV_LINK) -liconv
14181422
endif
14191423
endif
1424+
# When the builder defines ICONV_OMITS_BOM, pass it on to the C compiler as a
# preprocessor define so the C sources can test it with #ifdef.
ifdef ICONV_OMITS_BOM
1425+
BASIC_CFLAGS += -DICONV_OMITS_BOM
1426+
endif
14201427
ifdef NEEDS_LIBGEN
14211428
EXTLIBS += -lgen
14221429
endif

t/t0028-working-tree-encoding.sh

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,39 @@ test_description='working-tree-encoding conversion via gitattributes'
66

77
GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING
88

9+
# Prerequisite NO_UTF16_BOM: holds when the iconv command encodes the three
# ASCII characters "abc" to UTF-16 as exactly 6 bytes, i.e. 2 bytes per
# character with no 2-byte byte-order mark in front of them.
test_lazy_prereq NO_UTF16_BOM '
10+
test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
11+
'
12+
13+
# Prerequisite NO_UTF32_BOM: holds when the iconv command encodes the three
# ASCII characters "abc" to UTF-32 as exactly 12 bytes, i.e. 4 bytes per
# character with no 4-byte byte-order mark in front of them.
test_lazy_prereq NO_UTF32_BOM '
14+
test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
15+
'
16+
17+
# Convert UTF-8 text on stdin to UTF-16 on stdout, making sure the output
# starts with a byte-order mark.
#
# When the NO_UTF16_BOM prerequisite holds, the iconv command does not emit
# a BOM on its own, so a big-endian BOM (bytes FE FF) is written first.
# The bytes are spelled as octal escapes because POSIX printf only defines
# "\ddd" octal escapes; "\xHH" is an extension that shells such as dash
# print literally.
write_utf16 () {
	if test_have_prereq NO_UTF16_BOM
	then
		printf '\376\377'
	fi &&
	iconv -f UTF-8 -t UTF-16
}
24+
25+
# Convert UTF-8 text on stdin to UTF-32 on stdout, making sure the output
# starts with a byte-order mark.
#
# When the NO_UTF32_BOM prerequisite holds, the iconv command does not emit
# a BOM on its own, so a big-endian BOM (bytes 00 00 FE FF) is written first.
# The bytes are spelled as octal escapes because POSIX printf only defines
# "\ddd" octal escapes; "\xHH" is an extension that shells such as dash
# print literally.
write_utf32 () {
	if test_have_prereq NO_UTF32_BOM
	then
		printf '\0\0\376\377'
	fi &&
	iconv -f UTF-8 -t UTF-32
}
32+
933
test_expect_success 'setup test files' '
1034
git config core.eol lf &&
1135
1236
text="hallo there!\ncan you read me?" &&
1337
echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
1438
echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
1539
printf "$text" >test.utf8.raw &&
16-
printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
17-
printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
40+
printf "$text" | write_utf16 >test.utf16.raw &&
41+
printf "$text" | write_utf32 >test.utf32.raw &&
1842
printf "\377\376" >test.utf16lebom.raw &&
1943
printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
2044
@@ -124,8 +148,8 @@ do
124148
test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
125149
test_when_finished "git reset --hard HEAD^" &&
126150
127-
cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw &&
128-
cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw &&
151+
cat lf.utf8.raw | write_utf${i} >lf.utf${i}.raw &&
152+
cat crlf.utf8.raw | write_utf${i} >crlf.utf${i}.raw &&
129153
cp crlf.utf${i}.raw eol.utf${i} &&
130154
131155
cat >expectIndexLF <<-EOF &&
@@ -223,7 +247,7 @@ test_expect_success ICONV_SHIFT_JIS 'check roundtrip encoding' '
223247
224248
text="hallo there!\nroundtrip test here!" &&
225249
printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
226-
printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
250+
printf "$text" | write_utf16 >roundtrip.utf16 &&
227251
echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
228252
229253
# SHIFT-JIS encoded files are round-trip checked by default...

utf8.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,10 @@ char *reencode_string_len(const char *in, size_t insz,
559559
/*
560560
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
561561
* Some users under Windows want the little endian version
562+
*
563+
* We handle UTF-16 and UTF-32 ourselves only if the platform does not
564+
* provide a BOM (which we require), since we want to match the behavior
565+
* of the system tools and libc as much as possible.
562566
*/
563567
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
564568
bom_str = utf16_le_bom;
@@ -568,6 +572,16 @@ char *reencode_string_len(const char *in, size_t insz,
568572
bom_str = utf16_be_bom;
569573
bom_len = sizeof(utf16_be_bom);
570574
out_encoding = "UTF-16BE";
575+
#ifdef ICONV_OMITS_BOM
576+
} else if (same_utf_encoding("UTF-16", out_encoding)) {
577+
bom_str = utf16_be_bom;
578+
bom_len = sizeof(utf16_be_bom);
579+
out_encoding = "UTF-16BE";
580+
} else if (same_utf_encoding("UTF-32", out_encoding)) {
581+
bom_str = utf32_be_bom;
582+
bom_len = sizeof(utf32_be_bom);
583+
out_encoding = "UTF-32BE";
584+
#endif
571585
}
572586

573587
conv = iconv_open(out_encoding, in_encoding);

0 commit comments

Comments
 (0)