Skip to content

Commit cbfe844

Browse files
committed
Merge branch 'rs/userdiff-multibyte-regex'
The userdiff regexp patterns for various filetypes that are built into the system have been updated to avoid triggering regexp errors from UTF-8 aware regex engines. * rs/userdiff-multibyte-regex: userdiff: support regexec(3) with multi-byte support
2 parents 667fcf4 + be39144 commit cbfe844

File tree

3 files changed

+34
-2
lines changed

3 files changed

+34
-2
lines changed

t/t4034-diff-words.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ test_language_driver () {
6969
echo "* diff='"$lang"'" >.gitattributes &&
7070
word_diff --color-words
7171
'
72+
test_expect_success "diff driver '$lang' in Islandic" '
73+
LANG=is_IS.UTF-8 LANGUAGE=is LC_ALL="$is_IS_locale" \
74+
word_diff --color-words
75+
'
7276
}
7377

7478
test_expect_success setup '

userdiff.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ static int drivers_alloc;
1717
.cflags = REG_EXTENDED, \
1818
}, \
1919
.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
20+
.word_regex_multi_byte = wrx "|[^[:space:]]", \
2021
}
2122
#define IPATTERN(lang, rx, wrx) { \
2223
.name = lang, \
@@ -26,6 +27,7 @@ static int drivers_alloc;
2627
.cflags = REG_EXTENDED | REG_ICASE, \
2728
}, \
2829
.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
30+
.word_regex_multi_byte = wrx "|[^[:space:]]", \
2931
}
3032

3133
/*
@@ -294,7 +296,7 @@ PATTERNS("scheme",
294296
/* All other words should be delimited by spaces or parentheses */
295297
"|([^][)(}{[ \t])+"),
296298
PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
297-
"\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
299+
"\\\\[a-zA-Z@]+|\\\\.|([a-zA-Z0-9]|[^\x01-\x7f])+"),
298300
{ "default", NULL, NULL, -1, { NULL, 0 } },
299301
};
300302
#undef PATTERNS
@@ -330,6 +332,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
330332
return 0;
331333
}
332334

335+
/*
 * Report whether regexec(3) treats a UTF-8 multi-byte sequence as a
 * single character.
 *
 * The probe compiles the one-character pattern "[^[:space:]]" and runs
 * it against the two-byte string "\xc2\xa3". The answer is 1 only if
 * the match starts at offset 0 and covers the whole string; otherwise
 * it is 0. The answer is kept in a function-local static, so the probe
 * itself runs at most once and later calls return the stored value.
 *
 * A failure to compile the probe pattern is reported through BUG().
 */
static int regexec_supports_multi_byte_chars(void)
336+
{
337+
static const char not_space[] = "[^[:space:]]";
338+
static const char utf8_multi_byte_char[] = "\xc2\xa3";
339+
regex_t re;
340+
regmatch_t match;
341+
static int result = -1;
342+
343+
/* -1 means "not probed yet"; anything else is the stored answer. */
if (result != -1)
344+
return result;
345+
if (regcomp(&re, not_space, REG_EXTENDED))
346+
BUG("invalid regular expression: %s", not_space);
347+
/*
 * The pattern matches exactly one character, so a match spanning
 * both bytes means the two bytes were consumed as one character.
 */
result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
348+
match.rm_so == 0 &&
349+
match.rm_eo == strlen(utf8_multi_byte_char);
350+
regfree(&re);
351+
return result;
352+
}
353+
333354
static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
334355
{
335356
struct find_by_namelen_data udcbdata = {
@@ -405,7 +426,13 @@ int userdiff_config(const char *k, const char *v)
405426
/*
 * Look up a userdiff driver by its NUL-terminated name.
 *
 * The lookup itself is delegated to userdiff_find_by_namelen(). If the
 * driver found carries a non-NULL word_regex_multi_byte, this function
 * picks the word regex to use: when regexec_supports_multi_byte_chars()
 * returns true, word_regex is overwritten with word_regex_multi_byte.
 * In either case word_regex_multi_byte is then set to NULL, so the
 * selection is not repeated for that driver on later lookups.
 *
 * Returns whatever userdiff_find_by_namelen() returned; the result is
 * checked against NULL before use, so presumably it can be NULL when
 * no driver matches (confirm against userdiff_find_by_namelen()).
 */
struct userdiff_driver *userdiff_find_by_name(const char *name)
406427
{
407428
int len = strlen(name);
408-
return userdiff_find_by_namelen(name, len);
429+
struct userdiff_driver *driver = userdiff_find_by_namelen(name, len);
430+
if (driver && driver->word_regex_multi_byte) {
431+
if (regexec_supports_multi_byte_chars())
432+
driver->word_regex = driver->word_regex_multi_byte;
433+
/* Clear the alternative so this selection happens only once per driver. */
driver->word_regex_multi_byte = NULL;
434+
}
435+
return driver;
409436
}
410437

411438
struct userdiff_driver *userdiff_find_by_path(struct index_state *istate,

userdiff.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct userdiff_driver {
1818
int binary;
1919
struct userdiff_funcname funcname;
2020
const char *word_regex;
21+
const char *word_regex_multi_byte;
2122
const char *textconv;
2223
struct notes_cache *textconv_cache;
2324
int textconv_want_cache;

0 commit comments

Comments
 (0)