Skip to content

Commit 5df0c03

Browse files
committed
regsub -all: don't loop forever when matching everywhere
Although both "" and "x*" match the empty string, regsub -all exited correctly for the former but looped forever for the latter. Match Tcl by advancing one char after a zero-length match in both cases; the difference is that "x*" also matches at the end of the string, while "" does not. Also prevent either case from slicing a utf-8 char into bytes. Fixes: #353 Signed-off-by: Steve Bennett <[email protected]>
1 parent cd31c05 commit 5df0c03

File tree

3 files changed

+35
-16
lines changed

3 files changed

+35
-16
lines changed

jim-regexp.c

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,11 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
485485

486486
n = source_len - offset;
487487
p = source_str + offset;
488-
do {
488+
489+
/* To match Tcl, an empty pattern does not match at the end
490+
* of the string.
491+
*/
492+
while (n || pattern[0]) {
489493
int match = jim_regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags);
490494

491495
if (match >= REG_BADPAT) {
@@ -584,23 +588,22 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
584588
break;
585589
}
586590

587-
/* If the pattern is empty, need to step forwards */
588-
if (pattern[0] == '\0' && n) {
589-
/* Need to copy the char we are moving over */
590-
Jim_AppendString(interp, resultObj, p, 1);
591-
p++;
592-
n--;
593-
}
594-
591+
regexec_flags = 0;
595592
if (pmatch[0].rm_eo == pmatch[0].rm_so) {
596-
/* The match did not advance the string, so set REG_NOTBOL to force the next match */
597-
regexec_flags = REG_NOTBOL;
598-
}
599-
else {
600-
regexec_flags = 0;
593+
/* Matched a zero length string. Need to avoid matching the same position again */
594+
if (pattern[0] == '^') {
595+
/* An anchored search sets REG_BOL */
596+
regexec_flags = REG_NOTBOL;
597+
}
598+
else {
599+
/* A non-anchored search advances by one char */
600+
int charlen = utf8_charlen(p[0]);
601+
Jim_AppendString(interp, resultObj, p, charlen);
602+
p += charlen;
603+
n -= charlen;
604+
}
601605
}
602-
603-
} while (n);
606+
}
604607

605608
/*
606609
* Copy the portion of the string after the last match to the

tests/regexp.test

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,16 @@ test regexp-21.15 {Replace literal backslash} {
666666
set value
667667
} "\\abc\\def"
668668

669+
test regexp-21.16 {Replace nothing} {
670+
regsub -all {x*} anything !
671+
} {!a!n!y!t!h!i!n!g!}
672+
673+
test regexp-21.17 {Replace nothing via empty pattern} {
674+
# Interestingly in this case Tcl does not match
675+
# at end of string while the previous case does
676+
regsub -all {} anything !
677+
} {!a!n!y!t!h!i!n!g}
678+
669679
test regexp-22.1 {char range} {
670680
regexp -all -inline {[a-c]+} "defaaghbcadfbaacccd"
671681
} {aa bca baaccc}

tests/regexp2.test

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,12 @@ test regexpComp-21.10 {regexp command compiling tests} {
810810
# list [regsub -all "" "" bar str] $str
811811
# }
812812
#} {0 {}}
813+
test regexpComp-21.12 {regexp empty pattern with utf8} utf8 {
814+
# Make sure the second char isn't sliced up
815+
evalInProc {
816+
regsub -all "" a\u0442bc !
817+
}
818+
} "!a!\u0442!b!c"
813819

814820
# We can forgive the underlying regexp engine for not supporting this.
815821
# Why not use this instead? "((^X)*|\$)"

0 commit comments

Comments
 (0)