Skip to content

Commit ae39ba4

Browse files
effective-light authored and gitster committed
grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
If we attempt to grep non-ascii log message text with an ascii pattern, we run into the following issue:

    $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
    grep: (standard input): binary file matches

So, to fix this, teach the grep code to use PCRE2_UTF, as long as the log output is encoded in UTF-8.

Signed-off-by: Ævar Arnfjörð Bjarmason <[email protected]>
Signed-off-by: Hamza Mahfooz <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 6a5c337 commit ae39ba4

File tree

2 files changed

+52
-2
lines changed

2 files changed

+52
-2
lines changed

grep.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
382382
}
383383
options |= PCRE2_CASELESS;
384384
}
385-
if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
386-
!(!opt->ignore_case && (p->fixed || p->is_fixed)))
385+
if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
386+
(!opt->ignore_locale && is_utf8_locale() &&
387+
has_non_ascii(p->pattern) && !(!opt->ignore_case &&
388+
(p->fixed || p->is_fixed))))
387389
options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
388390

389391
#ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER

t/t7812-grep-icase-non-ascii.sh

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
5353
test_cmp expected actual
5454
'
5555

56+
test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
57+
cat >expected <<-\EOF &&
58+
Author: <BOLD;RED>À Ú Thor<RESET> <[email protected]>
59+
EOF
60+
test_write_lines "forth" >file4 &&
61+
git add file4 &&
62+
git commit --author="À Ú Thor <[email protected]>" -m sécond &&
63+
git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
64+
grep Author log >actual.raw &&
65+
test_decode_color <actual.raw >actual &&
66+
test_cmp expected actual
67+
'
68+
69+
test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
70+
cat >expected <<-\EOF &&
71+
Commit: Ç<BOLD;RED> O Mîtter <[email protected]><RESET>
72+
EOF
73+
test_write_lines "fifth" >file5 &&
74+
git add file5 &&
75+
GIT_COMMITTER_NAME="Ç O Mîtter" &&
76+
GIT_COMMITTER_EMAIL="[email protected]" &&
77+
git -c i18n.commitEncoding=latin1 commit -m thïrd &&
78+
git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
79+
grep Commit: log >actual.raw &&
80+
test_decode_color <actual.raw >actual &&
81+
test_cmp expected actual
82+
'
83+
84+
test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
85+
cat >expected <<-\EOF &&
86+
sé<BOLD;RED>con<RESET>d
87+
EOF
88+
git log -1 --color=always --perl-regexp --grep="con" >log &&
89+
grep con log >actual.raw &&
90+
test_decode_color <actual.raw >actual &&
91+
test_cmp expected actual
92+
'
93+
94+
test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
95+
cat >expected <<-\EOF &&
96+
<BOLD;RED>thïrd<RESET>
97+
EOF
98+
git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
99+
grep "th.*rd" log >actual.raw &&
100+
test_decode_color <actual.raw >actual &&
101+
test_cmp expected actual
102+
'
103+
56104
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
57105
printf "\\200\\n" >invalid-0x80 &&
58106
echo "ævar" >expected &&

0 commit comments

Comments
 (0)