Skip to content

Commit 7a17918

Browse files
larsxschneider authored and gitster committed
convert: check for detectable errors in UTF encodings
Check that new content is valid with respect to the user-defined 'working-tree-encoding' attribute.

Signed-off-by: Lars Schneider <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 107642f commit 7a17918

File tree

2 files changed

+123
-0
lines changed

2 files changed

+123
-0
lines changed

convert.c

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,64 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
266266

267267
}
268268

269+
/*
 * Look for BOM-related problems in `data` (`len` bytes) that can be
 * detected up front, given the working-tree-encoding name `enc`.
 *
 * Only encoding names that start with "UTF" (compared case-insensitively)
 * are examined; any other encoding is accepted without inspection.
 *
 * Two problems are reported for `path`:
 *   - the content carries a BOM although `enc` prohibits one, or
 *   - the content lacks a BOM although `enc` requires one.
 * In both cases a hint naming a more suitable encoding is printed with
 * advise() first. Then, if `die_on_error` is set, die() is called;
 * otherwise the value of error() is returned (the caller treats any
 * non-zero result as "validation failed").
 *
 * Returns 0 when no problem was detected.
 */
static int validate_encoding(const char *path, const char *enc,
		      const char *data, size_t len, int die_on_error)
{
	/* We only check for UTF here as UTF?? can be an alias for UTF-?? */
	if (istarts_with(enc, "UTF")) {
		/*
		 * Check for detectable errors in UTF encodings
		 */
		if (has_prohibited_utf_bom(enc, data, len)) {
			const char *error_msg = _(
				"BOM is prohibited in '%s' if encoded as %s");
			/*
			 * This advice is shown for UTF-??BE and UTF-??LE encodings.
			 * We cut off the last two characters of the encoding name
			 * to generate the encoding name suitable for BOMs.
			 */
			const char *advise_msg = _(
				"The file '%s' contains a byte order "
				"mark (BOM). Please use UTF-%s as "
				"working-tree-encoding.");
			const char *stripped = NULL;
			char *upper = xstrdup_toupper(enc);
			upper[strlen(upper)-2] = '\0';
			/*
			 * Strip "UTF-" or, for the dashless alias, "UTF".
			 * Both attempts must start from `upper`: `stripped`
			 * is still NULL when the first attempt did not match.
			 */
			if (!skip_prefix(upper, "UTF-", &stripped))
				skip_prefix(upper, "UTF", &stripped);
			advise(advise_msg, path, stripped);
			free(upper);
			if (die_on_error)
				die(error_msg, path, enc);
			else {
				return error(error_msg, path, enc);
			}

		} else if (is_missing_required_utf_bom(enc, data, len)) {
			const char *error_msg = _(
				"BOM is required in '%s' if encoded as %s");
			const char *advise_msg = _(
				"The file '%s' is missing a byte order "
				"mark (BOM). Please use UTF-%sBE or UTF-%sLE "
				"(depending on the byte order) as "
				"working-tree-encoding.");
			const char *stripped = NULL;
			char *upper = xstrdup_toupper(enc);
			/*
			 * Same prefix handling as above: fall back to the
			 * dashless "UTF" prefix of `upper`, never of the
			 * still-NULL `stripped`.
			 */
			if (!skip_prefix(upper, "UTF-", &stripped))
				skip_prefix(upper, "UTF", &stripped);
			advise(advise_msg, path, stripped, stripped);
			free(upper);
			if (die_on_error)
				die(error_msg, path, enc);
			else {
				return error(error_msg, path, enc);
			}
		}

	}
	return 0;
}
326+
269327
static const char *default_encoding = "UTF-8";
270328

271329
static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +349,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
291349
if (!buf && !src)
292350
return 1;
293351

352+
if (validate_encoding(path, enc, src, src_len, die_on_error))
353+
return 0;
354+
294355
dst = reencode_string_len(src, src_len, default_encoding, enc,
295356
&dst_len);
296357
if (!dst) {

t/t0028-working-tree-encoding.sh

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,52 @@ test_expect_success 'check $GIT_DIR/info/attributes support' '
6262

6363
for i in 16 32
6464
do
65+
# Verify that a BOM is rejected when the configured encoding names an
# explicit byte order. *.utf${i}be and *.utf${i}le are mapped to the
# big- and little-endian working-tree-encoding, then four BOM-carrying
# files (BE and LE BOM under each of the two attributes) are passed to
# "git add". Every add must fail with "fatal: BOM is prohibited" naming
# the encoding exactly as spelled in .gitattributes (note the mixed-case
# "utf-${i}LE"), and the advice must suggest the upper-cased, endian-less
# "UTF-${i}".
# NOTE(review): ${i} comes from the enclosing "for i in 16 32" loop; the
# *.raw fixtures are created outside this view - presumably their
# contents match their names; confirm against the test setup.
test_expect_success "check prohibited UTF-${i} BOM" '
66+
test_when_finished "git reset --hard HEAD" &&
67+
68+
echo "*.utf${i}be text working-tree-encoding=utf-${i}be" >>.gitattributes &&
69+
echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" >>.gitattributes &&
70+
71+
# Here we add a UTF-16 (resp. UTF-32) files with BOM (big/little-endian)
72+
# but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. UTF-32).
73+
# In these cases the BOM is prohibited.
74+
cp bebom.utf${i}be.raw bebom.utf${i}be &&
75+
test_must_fail git add bebom.utf${i}be 2>err.out &&
76+
test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out &&
77+
test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out &&
78+
79+
cp lebom.utf${i}le.raw lebom.utf${i}be &&
80+
test_must_fail git add lebom.utf${i}be 2>err.out &&
81+
test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out &&
82+
test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out &&
83+
84+
cp bebom.utf${i}be.raw bebom.utf${i}le &&
85+
test_must_fail git add bebom.utf${i}le 2>err.out &&
86+
test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out &&
87+
test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out &&
88+
89+
cp lebom.utf${i}le.raw lebom.utf${i}le &&
90+
test_must_fail git add lebom.utf${i}le 2>err.out &&
91+
test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out &&
92+
test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out
93+
'
94+
95+
# Verify that a BOM is demanded when the configured encoding does not
# name a byte order. *.utf${i} is mapped to working-tree-encoding=utf-${i},
# then two BOM-less fixtures (taken from the BE and the LE raw file) are
# passed to "git add" under that name. Each add must fail with
# "fatal: BOM is required" and the advice must offer both
# "UTF-${i}BE" and "UTF-${i}LE".
# NOTE(review): ${i} comes from the enclosing "for i in 16 32" loop; the
# nobom.*.raw fixtures are created outside this view - confirm they
# really carry no BOM.
test_expect_success "check required UTF-${i} BOM" '
96+
test_when_finished "git reset --hard HEAD" &&
97+
98+
echo "*.utf${i} text working-tree-encoding=utf-${i}" >>.gitattributes &&
99+
100+
cp nobom.utf${i}be.raw nobom.utf${i} &&
101+
test_must_fail git add nobom.utf${i} 2>err.out &&
102+
test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out &&
103+
test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out &&
104+
105+
cp nobom.utf${i}le.raw nobom.utf${i} &&
106+
test_must_fail git add nobom.utf${i} 2>err.out &&
107+
test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out &&
108+
test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out
109+
'
110+
65111
test_expect_success "eol conversion for UTF-${i} encoded files on checkout" '
66112
test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
67113
test_when_finished "git reset --hard HEAD^" &&
@@ -139,4 +185,20 @@ test_expect_success 'error if encoding round trip is not the same during refresh
139185
test_i18ngrep "error: .* overwritten by checkout:" err.out
140186
'
141187

188+
# Verify the non-fatal path: content that entered the repository without
# passing through the encoding filter must be reported, not crash Git.
# A BOM-less UTF-16BE fixture is stored as-is with
# "git hash-object --no-filters", placed in the index under the name
# nonsense.utf16, and committed via plumbing (commit-tree / update-ref).
# The following "git diff" is part of the && chain, so it must exit
# successfully, while its stderr must contain "error: BOM is required"
# (an "error:", not a "fatal:").
# NOTE(review): this relies on a working-tree-encoding attribute for
# *.utf16 that is configured outside this view - confirm.
test_expect_success 'error if encoding garbage is already in Git' '
189+
BEFORE_STATE=$(git rev-parse HEAD) &&
190+
test_when_finished "git reset --hard $BEFORE_STATE" &&
191+
192+
# Skip the UTF-16 filter for the added file
193+
# This simulates a Git version that has no checkoutEncoding support
194+
cp nobom.utf16be.raw nonsense.utf16 &&
195+
TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16) &&
196+
git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16 &&
197+
COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" $(git write-tree)) &&
198+
git update-ref refs/heads/master $COMMIT &&
199+
200+
git diff 2>err.out &&
201+
test_i18ngrep "error: BOM is required" err.out
202+
'
203+
142204
test_done

0 commit comments

Comments
 (0)