Skip to content

Commit 664d44e

Browse files
jrn authored and gitster committed
userdiff: simplify word-diff safeguard
git's diff-words support has a detail that can be a little dangerous: any text not matched by a given language's tokenization pattern is treated as whitespace and changes in such text would go unnoticed. Therefore each of the built-in regexes allows a special token type consisting of a single non-whitespace character [^[:space:]].

To make sure UTF-8 sequences remain human readable, the builtin regexes also have a special token type for runs of bytes with the high bit set. In English, non-ASCII characters are usually isolated so this is analogous to the [^[:space:]] pattern, except it matches a single _multibyte_ character despite use of the C locale.

Unfortunately it is easy to make typos or forget entirely to include these catch-all token types when adding support for new languages (see v1.7.3.5~16, userdiff: fix typo in ruby and python word regexes, 2010-12-18). Avoid this by including them automatically within the PATTERNS and IPATTERN macros.

While at it, change the UTF-8 sequence token type to match exactly one non-ASCII multi-byte character, rather than an arbitrary run of them.

Suggested-by: Thomas Rast <trast@student.ethz.ch>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 8d96e72 commit 664d44e

File tree

1 file changed

+16
-24
lines changed

1 file changed

+16
-24
lines changed

userdiff.c

Lines changed: 16 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,11 @@ static int ndrivers;
88
static int drivers_alloc;
99

1010
#define PATTERNS(name, pattern, word_regex) \
11-
{ name, NULL, -1, { pattern, REG_EXTENDED }, word_regex }
11+
{ name, NULL, -1, { pattern, REG_EXTENDED }, \
12+
word_regex "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+" }
1213
#define IPATTERN(name, pattern, word_regex) \
13-
{ name, NULL, -1, { pattern, REG_EXTENDED | REG_ICASE }, word_regex }
14+
{ name, NULL, -1, { pattern, REG_EXTENDED | REG_ICASE }, \
15+
word_regex "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+" }
1416
static struct userdiff_driver builtin_drivers[] = {
1517
IPATTERN("fortran",
1618
"!^([C*]|[ \t]*!)\n"
@@ -24,19 +26,17 @@ IPATTERN("fortran",
2426
* Don't worry about format statements without leading digits since
2527
* they would have been matched above as a variable anyway. */
2628
"|[-+]?[0-9.]+([AaIiDdEeFfLlTtXx][Ss]?[-+]?[0-9.]*)?(_[a-zA-Z0-9][a-zA-Z0-9_]*)?"
27-
"|//|\\*\\*|::|[/<>=]="
28-
"|[^[:space:]]|[\x80-\xff]+"),
29+
"|//|\\*\\*|::|[/<>=]="),
2930
PATTERNS("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$",
30-
"[^<>= \t]+|[^[:space:]]|[\x80-\xff]+"),
31+
"[^<>= \t]+"),
3132
PATTERNS("java",
3233
"!^[ \t]*(catch|do|for|if|instanceof|new|return|switch|throw|while)\n"
3334
"^[ \t]*(([A-Za-z_][A-Za-z_0-9]*[ \t]+)+[A-Za-z_][A-Za-z_0-9]*[ \t]*\\([^;]*)$",
3435
/* -- */
3536
"[a-zA-Z_][a-zA-Z0-9_]*"
3637
"|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?"
3738
"|[-+*/<>%&^|=!]="
38-
"|--|\\+\\+|<<=?|>>>?=?|&&|\\|\\|"
39-
"|[^[:space:]]|[\x80-\xff]+"),
39+
"|--|\\+\\+|<<=?|>>>?=?|&&|\\|\\|"),
4040
PATTERNS("objc",
4141
/* Negate C statements that can look like functions */
4242
"!^[ \t]*(do|for|if|else|return|switch|while)\n"
@@ -49,8 +49,7 @@ PATTERNS("objc",
4949
/* -- */
5050
"[a-zA-Z_][a-zA-Z0-9_]*"
5151
"|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?"
52-
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->"
53-
"|[^[:space:]]|[\x80-\xff]+"),
52+
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->"),
5453
PATTERNS("pascal",
5554
"^((procedure|function|constructor|destructor|interface|"
5655
"implementation|initialization|finalization)[ \t]*.*)$"
@@ -59,8 +58,7 @@ PATTERNS("pascal",
5958
/* -- */
6059
"[a-zA-Z_][a-zA-Z0-9_]*"
6160
"|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+"
62-
"|<>|<=|>=|:=|\\.\\."
63-
"|[^[:space:]]|[\x80-\xff]+"),
61+
"|<>|<=|>=|:=|\\.\\."),
6462
PATTERNS("perl",
6563
"^[ \t]*package .*;\n"
6664
"^[ \t]*sub .* \\{\n"
@@ -76,33 +74,29 @@ PATTERNS("perl",
7674
"|&&|\\|\\||//|\\+\\+|--|\\*\\*|\\.\\.\\.?"
7775
"|[-+*/%.^&<>=!|]="
7876
"|=~|!~"
79-
"|<<|<>|<=>|>>"
80-
"|[^[:space:]]"),
77+
"|<<|<>|<=>|>>"),
8178
PATTERNS("php",
8279
"^[\t ]*(((public|protected|private|static)[\t ]+)*function.*)$\n"
8380
"^[\t ]*(class.*)$",
8481
/* -- */
8582
"[a-zA-Z_][a-zA-Z0-9_]*"
8683
"|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+"
87-
"|[-+*/<>%&^|=!.]=|--|\\+\\+|<<=?|>>=?|===|&&|\\|\\||::|->"
88-
"|[^[:space:]]|[\x80-\xff]+"),
84+
"|[-+*/<>%&^|=!.]=|--|\\+\\+|<<=?|>>=?|===|&&|\\|\\||::|->"),
8985
PATTERNS("python", "^[ \t]*((class|def)[ \t].*)$",
9086
/* -- */
9187
"[a-zA-Z_][a-zA-Z0-9_]*"
9288
"|[-+0-9.e]+[jJlL]?|0[xX]?[0-9a-fA-F]+[lL]?"
93-
"|[-+*/<>%&^|=!]=|//=?|<<=?|>>=?|\\*\\*=?"
94-
"|[^[:space:]]|[\x80-\xff]+"),
89+
"|[-+*/<>%&^|=!]=|//=?|<<=?|>>=?|\\*\\*=?"),
9590
/* -- */
9691
PATTERNS("ruby", "^[ \t]*((class|module|def)[ \t].*)$",
9792
/* -- */
9893
"(@|@@|\\$)?[a-zA-Z_][a-zA-Z0-9_]*"
9994
"|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+|\\?(\\\\C-)?(\\\\M-)?."
100-
"|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~"
101-
"|[^[:space:]]|[\x80-\xff]+"),
95+
"|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~"),
10296
PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$",
10397
"[={}\"]|[^={}\" \t]+"),
10498
PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
105-
"\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+|[^[:space:]]"),
99+
"\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
106100
PATTERNS("cpp",
107101
/* Jump targets or access declarations */
108102
"!^[ \t]*[A-Za-z_][A-Za-z_0-9]*:.*$\n"
@@ -113,8 +107,7 @@ PATTERNS("cpp",
113107
/* -- */
114108
"[a-zA-Z_][a-zA-Z0-9_]*"
115109
"|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?"
116-
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->"
117-
"|[^[:space:]]|[\x80-\xff]+"),
110+
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->"),
118111
PATTERNS("csharp",
119112
/* Keywords */
120113
"!^[ \t]*(do|while|for|if|else|instanceof|new|return|switch|case|throw|catch|using)\n"
@@ -129,8 +122,7 @@ PATTERNS("csharp",
129122
/* -- */
130123
"[a-zA-Z_][a-zA-Z0-9_]*"
131124
"|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?"
132-
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->"
133-
"|[^[:space:]]|[\x80-\xff]+"),
125+
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->"),
134126
{ "default", NULL, -1, { NULL, 0 } },
135127
};
136128
#undef PATTERNS

0 commit comments

Comments
 (0)