Skip to content

Commit 08a94a1

Browse files
torvalds authored and gitster committed
commit/commit-tree: correct latin1 to utf-8
When a line in the message is not a valid utf-8, "git mailinfo" attempts to convert it to utf-8 assuming the input is latin1 (and punt if it does not convert cleanly). Using the same heuristics in "git commit" and "git commit-tree" lets the editor output be in latin1 to make the overall system more consistent. Signed-off-by: Junio C Hamano <[email protected]>
1 parent 4c8a9db commit 08a94a1

File tree

2 files changed

+88
-28
lines changed

2 files changed

+88
-28
lines changed

builtin/mailinfo.c

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -481,36 +481,12 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
481481
return out;
482482
}
483483

484-
/*
485-
* When there is no known charset, guess.
486-
*
487-
* Right now we assume that if the target is UTF-8 (the default),
488-
* and it already looks like UTF-8 (which includes US-ASCII as its
489-
* subset, of course) then that is what it is and there is nothing
490-
* to do.
491-
*
492-
* Otherwise, we default to assuming it is Latin1 for historical
493-
* reasons.
494-
*/
495-
static const char *guess_charset(const struct strbuf *line, const char *target_charset)
496-
{
497-
if (is_encoding_utf8(target_charset)) {
498-
if (is_utf8(line->buf))
499-
return NULL;
500-
}
501-
return "ISO8859-1";
502-
}
503-
504484
static void convert_to_utf8(struct strbuf *line, const char *charset)
505485
{
506486
char *out;
507487

508-
if (!charset || !*charset) {
509-
charset = guess_charset(line, metainfo_charset);
510-
if (!charset)
511-
return;
512-
}
513-
488+
if (!charset || !*charset)
489+
return;
514490
if (!strcasecmp(metainfo_charset, charset))
515491
return;
516492
out = reencode_string(line->buf, metainfo_charset, charset);

commit.c

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
11121112
return result;
11131113
}
11141114

1115+
/*
 * Scan the first 'len' bytes of 'buf' for a sequence that is not
 * well-formed UTF-8.
 *
 * Returns the offset (relative to 'buf') of the lead byte of the first
 * offending sequence, or -1 when the whole range is well-formed.
 *
 * A sequence is rejected when:
 *  - it starts with a stray continuation byte or an impossible lead byte,
 *  - it is cut short by the end of the buffer,
 *  - one of its trailing bytes is not a continuation byte (10xxxxxx),
 *  - it is an overlong encoding (the value would fit in fewer bytes),
 *  - it encodes a UTF-16 surrogate (U+D800..U+DFFF), or
 *  - it encodes a value above U+10FFFF.
 */
static int find_invalid_utf8(const char *buf, int len)
{
	/* Largest code point expressible with 0, 1, 2 and 3 trailing bytes. */
	static const unsigned int max_codepoint[] = {
		0x7f, 0x7ff, 0xffff, 0x10ffff
	};
	int offset = 0;

	while (len > 0) {
		unsigned char c = *buf++;
		int bytes, bad_offset;
		unsigned int codepoint, min_val, max_val;

		len--;
		offset++;

		/* Simple US-ASCII? No worries. */
		if (c < 0x80)
			continue;

		bad_offset = offset - 1;

		/*
		 * Count how many more high bits are set: that's how
		 * many more bytes this sequence should have.  'c' is
		 * shifted left once per counted bit.
		 */
		bytes = 0;
		while (c & 0x40) {
			c <<= 1;
			bytes++;
		}

		/*
		 * Zero means a stray continuation byte.  More than three
		 * trailing bytes can only encode values beyond U+10FFFF.
		 */
		if (bytes < 1 || bytes > 3)
			return bad_offset;

		/* Do we *have* that many bytes? */
		if (len < bytes)
			return bad_offset;

		/*
		 * Undo the shifting above to recover the payload bits of
		 * the lead byte, and work out which values legitimately
		 * need this many bytes.
		 */
		codepoint = (c & 0x7f) >> bytes;
		min_val = max_codepoint[bytes - 1] + 1;
		max_val = max_codepoint[bytes];

		offset += bytes;
		len -= bytes;

		/* Verify the continuation bytes while accumulating the value. */
		do {
			unsigned char next = *buf++;

			if ((next & 0xc0) != 0x80)
				return bad_offset;
			codepoint = (codepoint << 6) | (next & 0x3f);
		} while (--bytes);

		/* Overlong form, or beyond the last Unicode code point. */
		if (codepoint < min_val || codepoint > max_val)
			return bad_offset;

		/* Surrogates belong to UTF-16 and must not appear in UTF-8. */
		if (codepoint >= 0xd800 && codepoint <= 0xdfff)
			return bad_offset;
	}
	return -1;
}
1163+
1164+
/*
1165+
* This verifies that the buffer is in proper utf8 format.
1166+
*
1167+
* If it isn't, it assumes any non-utf8 characters are Latin1,
1168+
* and does the conversion.
1169+
*
1170+
* Fixme: we should probably also disallow overlong forms and
1171+
* invalid characters. But we don't do that currently.
1172+
*/
1173+
static int verify_utf8(struct strbuf *buf)
1174+
{
1175+
int ok = 1;
1176+
long pos = 0;
1177+
1178+
for (;;) {
1179+
int bad;
1180+
unsigned char c;
1181+
unsigned char replace[2];
1182+
1183+
bad = find_invalid_utf8(buf->buf + pos, buf->len - pos);
1184+
if (bad < 0)
1185+
return ok;
1186+
pos += bad;
1187+
ok = 0;
1188+
c = buf->buf[pos];
1189+
strbuf_remove(buf, pos, 1);
1190+
1191+
/* We know 'c' must be in the range 128-255 */
1192+
replace[0] = 0xc0 + (c >> 6);
1193+
replace[1] = 0x80 + (c & 0x3f);
1194+
strbuf_insert(buf, pos, replace, 2);
1195+
pos += 2;
1196+
}
1197+
}
1198+
11151199
static const char commit_utf8_warn[] =
1116-
"Warning: commit message does not conform to UTF-8.\n"
1200+
"Warning: commit message did not conform to UTF-8.\n"
11171201
"You may want to amend it after fixing the message, or set the config\n"
11181202
"variable i18n.commitencoding to the encoding your project uses.\n";
11191203

@@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
11701254
strbuf_addbuf(&buffer, msg);
11711255

11721256
/* And check the encoding */
1173-
if (encoding_is_utf8 && !is_utf8(buffer.buf))
1257+
if (encoding_is_utf8 && !verify_utf8(&buffer))
11741258
fprintf(stderr, commit_utf8_warn);
11751259

11761260
if (sign_commit && do_sign_commit(&buffer, sign_commit))

0 commit comments

Comments
 (0)