Skip to content

Commit 981ee02

Browse files
k-takata and nobu
authored and committed
Fix performance problem with /k/i and /s/i (Close k-takata/Onigmo#97)
E.g. For the pattern `/----k/i`, optimization was totally turned off. Make it possible to use the characters before `k` (i.e. `----`) for optimization. k-takata/Onigmo@9c13de8
1 parent a637903 commit 981ee02

File tree

1 file changed

+43
-24
lines changed

1 file changed

+43
-24
lines changed

regcomp.c

Lines changed: 43 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4217,7 +4217,7 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg,
42174217
{
42184218
OnigDistance i, len;
42194219
int clen, flen, n, j, k;
4220-
UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
4220+
UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
42214221
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
42224222
OnigEncoding enc = reg->enc;
42234223

@@ -4299,14 +4299,42 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg,
42994299
{
43004300
OnigDistance i, len;
43014301
int clen, flen, n, j, k;
4302-
UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
4302+
UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
43034303
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
43044304
OnigEncoding enc = reg->enc;
43054305

43064306
len = end - s;
43074307
if (len < ONIG_CHAR_TABLE_SIZE) {
43084308
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1);
43094309

4310+
if (ignore_case) {
4311+
for (i = 0; i < len; i += clen) {
4312+
p = s + i;
4313+
n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4314+
p, end, items);
4315+
clen = enclen(enc, p, end);
4316+
if (p + clen > end)
4317+
clen = (int )(end - p);
4318+
4319+
for (j = 0; j < n; j++) {
4320+
if ((items[j].code_len != 1) || (items[j].byte_len != clen)) {
4321+
/* Different length isn't supported. Stop optimization at here. */
4322+
end = p;
4323+
goto endcheck;
4324+
}
4325+
flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf);
4326+
if (flen != clen) {
4327+
/* Different length isn't supported. Stop optimization at here. */
4328+
end = p;
4329+
goto endcheck;
4330+
}
4331+
}
4332+
}
4333+
endcheck:
4334+
;
4335+
}
4336+
4337+
len = end - s;
43104338
n = 0;
43114339
for (i = 0; i < len; i += clen) {
43124340
p = s + i;
@@ -4317,17 +4345,11 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg,
43174345
if (p + clen > end)
43184346
clen = (int )(end - p);
43194347

4320-
for (j = 0; j < n; j++) {
4321-
if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4322-
return 1; /* different length isn't supported. */
4323-
flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4324-
if (flen != clen)
4325-
return 1; /* different length isn't supported. */
4326-
}
43274348
for (j = 0; j < clen; j++) {
43284349
skip[s[i + j]] = (UChar )(len - i - j);
43294350
for (k = 0; k < n; k++) {
4330-
skip[buf[k][j]] = (UChar )(len - i - j);
4351+
ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
4352+
skip[buf[j]] = (UChar )(len - i - j);
43314353
}
43324354
}
43334355
}
@@ -4369,7 +4391,7 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg,
43694391
}
43704392
# endif
43714393
}
4372-
return 0;
4394+
return (int)len;
43734395
}
43744396
#endif /* USE_SUNDAY_QUICK_SEARCH */
43754397

@@ -5342,7 +5364,6 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
53425364
static int
53435365
set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
53445366
{
5345-
int r;
53465367
int allow_reverse;
53475368

53485369
if (e->len == 0) return 0;
@@ -5357,31 +5378,29 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
53575378

53585379
if (e->ignore_case > 0) {
53595380
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5360-
r = set_bm_skip(reg->exact, reg->exact_end, reg,
5381+
e->len = set_bm_skip(reg->exact, reg->exact_end, reg,
53615382
reg->map, &(reg->int_map), 1);
5362-
if (r == 0) {
5383+
reg->exact_end = reg->exact + e->len;
5384+
if (e->len >= 3) {
53635385
reg->optimize = (allow_reverse != 0
53645386
? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
53655387
}
5366-
else {
5388+
else if (e->len > 0) {
53675389
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
53685390
}
5391+
else
5392+
return 0;
53695393
}
53705394
else {
53715395
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
53725396
}
53735397
}
53745398
else {
53755399
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5376-
r = set_bm_skip(reg->exact, reg->exact_end, reg,
5377-
reg->map, &(reg->int_map), 0);
5378-
if (r == 0) {
5379-
reg->optimize = (allow_reverse != 0
5380-
? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
5381-
}
5382-
else {
5383-
reg->optimize = ONIG_OPTIMIZE_EXACT;
5384-
}
5400+
set_bm_skip(reg->exact, reg->exact_end, reg,
5401+
reg->map, &(reg->int_map), 0);
5402+
reg->optimize = (allow_reverse != 0
5403+
? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
53855404
}
53865405
else {
53875406
reg->optimize = ONIG_OPTIMIZE_EXACT;

0 commit comments

Comments
 (0)