Skip to content

Commit 04deccd

Browse files
peffgitster
authored and committed
log: re-encode commit messages before grepping
If you run "git log --grep=foo", we will run your regex on the literal bytes of the commit message. This can provide confusing results if the commit message is not in the same encoding as your grep expression (or worse, you have commits in multiple encodings, in which case your regex would need to be written to match either encoding). On top of this, we might also be grepping in the commit's notes, which are already re-encoded, potentially leading to grepping in a buffer with mixed encodings concatenated. This is insanity, but most people never noticed, because their terminal and their commit encodings all match.

Instead, let's massage the to-be-grepped commit into a standardized encoding. There is not much point in adding a flag for "this is the encoding I expect my grep pattern to match"; the only sane choice is for it to use the log output encoding. That is presumably what the user's terminal is using, and it means that the patterns found by the grep will match the output produced by git.

As a bonus, this fixes a potential segfault in commit_match when commit->buffer is NULL, as we now build on logmsg_reencode, which handles reading the commit buffer from disk if necessary. The segfault can be triggered with:

    git commit -m 'text1' --allow-empty
    git commit -m 'text2' --allow-empty
    git log --graph --no-walk --grep 'text2'

which arguably does not make any sense (--graph inherently wants a connected history, and by --no-walk the command line is telling us to show discrete points in history without connectivity), and we probably should forbid the combination, but that is a separate issue.

Signed-off-by: Jeff King <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent be5c9fb commit 04deccd

File tree

2 files changed

+78
-7
lines changed

2 files changed

+78
-7
lines changed

revision.c

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2268,7 +2268,10 @@ static int commit_rewrite_person(struct strbuf *buf, const char *what, struct st
22682268
static int commit_match(struct commit *commit, struct rev_info *opt)
22692269
{
22702270
int retval;
2271+
const char *encoding;
2272+
char *message;
22712273
struct strbuf buf = STRBUF_INIT;
2274+
22722275
if (!opt->grep_filter.pattern_list && !opt->grep_filter.header_list)
22732276
return 1;
22742277

@@ -2279,13 +2282,23 @@ static int commit_match(struct commit *commit, struct rev_info *opt)
22792282
strbuf_addch(&buf, '\n');
22802283
}
22812284

2285+
/*
2286+
* We grep in the user's output encoding, under the assumption that it
2287+
* is the encoding they are most likely to write their grep pattern
2288+
* for. In addition, it means we will match the "notes" encoding below,
2289+
* so we will not end up with a buffer that has two different encodings
2290+
* in it.
2291+
*/
2292+
encoding = get_log_output_encoding();
2293+
message = logmsg_reencode(commit, encoding);
2294+
22822295
/* Copy the commit to temporary if we are using "fake" headers */
22832296
if (buf.len)
2284-
strbuf_addstr(&buf, commit->buffer);
2297+
strbuf_addstr(&buf, message);
22852298

22862299
if (opt->grep_filter.header_list && opt->mailmap) {
22872300
if (!buf.len)
2288-
strbuf_addstr(&buf, commit->buffer);
2301+
strbuf_addstr(&buf, message);
22892302

22902303
commit_rewrite_person(&buf, "\nauthor ", opt->mailmap);
22912304
commit_rewrite_person(&buf, "\ncommitter ", opt->mailmap);
@@ -2294,18 +2307,18 @@ static int commit_match(struct commit *commit, struct rev_info *opt)
22942307
/* Append "fake" message parts as needed */
22952308
if (opt->show_notes) {
22962309
if (!buf.len)
2297-
strbuf_addstr(&buf, commit->buffer);
2298-
format_display_notes(commit->object.sha1, &buf,
2299-
get_log_output_encoding(), 1);
2310+
strbuf_addstr(&buf, message);
2311+
format_display_notes(commit->object.sha1, &buf, encoding, 1);
23002312
}
23012313

2302-
/* Find either in the commit object, or in the temporary */
2314+
/* Find either in the original commit message, or in the temporary */
23032315
if (buf.len)
23042316
retval = grep_buffer(&opt->grep_filter, buf.buf, buf.len);
23052317
else
23062318
retval = grep_buffer(&opt->grep_filter,
2307-
commit->buffer, strlen(commit->buffer));
2319+
message, strlen(message));
23082320
strbuf_release(&buf);
2321+
logmsg_free(message, commit);
23092322
return retval;
23102323
}
23112324

t/t4210-log-i18n.sh

Lines changed: 58 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,58 @@
1+
#!/bin/sh
2+
3+
test_description='test log with i18n features'
4+
. ./test-lib.sh
5+
6+
# two forms of é
7+
utf8_e=$(printf '\303\251')
8+
latin1_e=$(printf '\351')
9+
10+
test_expect_success 'create commits in different encodings' '
11+
test_tick &&
12+
cat >msg <<-EOF &&
13+
utf8
14+
15+
t${utf8_e}st
16+
EOF
17+
git add msg &&
18+
git -c i18n.commitencoding=utf8 commit -F msg &&
19+
cat >msg <<-EOF &&
20+
latin1
21+
22+
t${latin1_e}st
23+
EOF
24+
git add msg &&
25+
git -c i18n.commitencoding=ISO-8859-1 commit -F msg
26+
'
27+
28+
test_expect_success 'log --grep searches in log output encoding (utf8)' '
29+
cat >expect <<-\EOF &&
30+
latin1
31+
utf8
32+
EOF
33+
git log --encoding=utf8 --format=%s --grep=$utf8_e >actual &&
34+
test_cmp expect actual
35+
'
36+
37+
test_expect_success 'log --grep searches in log output encoding (latin1)' '
38+
cat >expect <<-\EOF &&
39+
latin1
40+
utf8
41+
EOF
42+
git log --encoding=ISO-8859-1 --format=%s --grep=$latin1_e >actual &&
43+
test_cmp expect actual
44+
'
45+
46+
test_expect_success 'log --grep does not find non-reencoded values (utf8)' '
47+
>expect &&
48+
git log --encoding=utf8 --format=%s --grep=$latin1_e >actual &&
49+
test_cmp expect actual
50+
'
51+
52+
test_expect_success 'log --grep does not find non-reencoded values (latin1)' '
53+
>expect &&
54+
git log --encoding=ISO-8859-1 --format=%s --grep=$utf8_e >actual &&
55+
test_cmp expect actual
56+
'
57+
58+
test_done

0 commit comments

Comments
 (0)