Skip to content

Commit be39144

Browse files
rscharfe authored and
gitster committed
userdiff: support regexec(3) with multi-byte support
Since 1819ad3 (grep: fix multibyte regex handling under macOS, 2022-08-26) we use the system library for all regular expression matching on macOS, not just for git grep. It supports multi-byte strings and rejects invalid multi-byte characters.

This broke all built-in userdiff word regexes in UTF-8 locales because they all include such invalid bytes in expressions that are intended to match multi-byte characters without explicit support for that from the regex engine.

"|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+" is added to all built-in word regexes to match a single non-space or multi-byte character. The \xNN characters are invalid if interpreted as UTF-8 because they have their high bit set, which indicates they are part of a multi-byte character, but they are surrounded by single-byte characters.

Replace that expression with "|[^[:space:]]" if the regex engine supports multi-byte matching, as there is no need to have an explicit range for multi-byte characters then. Check for that capability at runtime, because it depends on the locale and thus on environment variables. Construct the full replacement expression at build time and just switch it in if necessary to avoid string manipulation and allocations at runtime.

Additionally the word regex for tex contains the expression "[a-zA-Z0-9\x80-\xff]+" with a similarly invalid range. The best replacement with only valid characters that I can come up with is "([a-zA-Z0-9]|[^\x01-\x7f])+". Unlike the original it matches NUL characters, though. Assuming that tex files usually don't contain NUL this should be acceptable.

Reported-by: D. Ben Knoble <[email protected]>
Reported-by: Eric Sunshine <[email protected]>
Helped-by: Junio C Hamano <[email protected]>
Signed-off-by: René Scharfe <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 768bb23 commit be39144

File tree

3 files changed

+34
-2
lines changed

3 files changed

+34
-2
lines changed

t/t4034-diff-words.sh

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,10 @@ test_language_driver () {
6969
echo "* diff='"$lang"'" >.gitattributes &&
7070
word_diff --color-words
7171
'
72+
test_expect_success "diff driver '$lang' in Islandic" '
73+
LANG=is_IS.UTF-8 LANGUAGE=is LC_ALL="$is_IS_locale" \
74+
word_diff --color-words
75+
'
7276
}
7377

7478
test_expect_success setup '

userdiff.c

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ static int drivers_alloc;
1515
.cflags = REG_EXTENDED, \
1616
}, \
1717
.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
18+
.word_regex_multi_byte = wrx "|[^[:space:]]", \
1819
}
1920
#define IPATTERN(lang, rx, wrx) { \
2021
.name = lang, \
@@ -24,6 +25,7 @@ static int drivers_alloc;
2425
.cflags = REG_EXTENDED | REG_ICASE, \
2526
}, \
2627
.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
28+
.word_regex_multi_byte = wrx "|[^[:space:]]", \
2729
}
2830

2931
/*
@@ -292,7 +294,7 @@ PATTERNS("scheme",
292294
/* All other words should be delimited by spaces or parentheses */
293295
"|([^][)(}{[ \t])+"),
294296
PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
295-
"\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
297+
"\\\\[a-zA-Z@]+|\\\\.|([a-zA-Z0-9]|[^\x01-\x7f])+"),
296298
{ "default", NULL, -1, { NULL, 0 } },
297299
};
298300
#undef PATTERNS
@@ -328,6 +330,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
328330
return 0;
329331
}
330332

333+
static int regexec_supports_multi_byte_chars(void)
334+
{
335+
static const char not_space[] = "[^[:space:]]";
336+
static const char utf8_multi_byte_char[] = "\xc2\xa3";
337+
regex_t re;
338+
regmatch_t match;
339+
static int result = -1;
340+
341+
if (result != -1)
342+
return result;
343+
if (regcomp(&re, not_space, REG_EXTENDED))
344+
BUG("invalid regular expression: %s", not_space);
345+
result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
346+
match.rm_so == 0 &&
347+
match.rm_eo == strlen(utf8_multi_byte_char);
348+
regfree(&re);
349+
return result;
350+
}
351+
331352
static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
332353
{
333354
struct find_by_namelen_data udcbdata = {
@@ -401,7 +422,13 @@ int userdiff_config(const char *k, const char *v)
401422
struct userdiff_driver *userdiff_find_by_name(const char *name)
402423
{
403424
int len = strlen(name);
404-
return userdiff_find_by_namelen(name, len);
425+
struct userdiff_driver *driver = userdiff_find_by_namelen(name, len);
426+
if (driver && driver->word_regex_multi_byte) {
427+
if (regexec_supports_multi_byte_chars())
428+
driver->word_regex = driver->word_regex_multi_byte;
429+
driver->word_regex_multi_byte = NULL;
430+
}
431+
return driver;
405432
}
406433

407434
struct userdiff_driver *userdiff_find_by_path(struct index_state *istate,

userdiff.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ struct userdiff_driver {
1717
int binary;
1818
struct userdiff_funcname funcname;
1919
const char *word_regex;
20+
const char *word_regex_multi_byte;
2021
const char *textconv;
2122
struct notes_cache *textconv_cache;
2223
int textconv_want_cache;

0 commit comments

Comments
 (0)