Skip to content

Commit a5cf78b

Browse files
committed
Recognize existing Lucene escapes in QueryBuilder
1 parent a984fba commit a5cf78b

File tree

18 files changed

+497
-224
lines changed

18 files changed

+497
-224
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/search/QueryBuilder.java

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
package org.opengrok.indexer.search;
2626

2727
import java.io.File;
28+
import java.io.StringReader;
2829
import java.nio.charset.StandardCharsets;
2930
import java.security.MessageDigest;
3031
import java.security.NoSuchAlgorithmException;
@@ -413,23 +414,29 @@ private String getQueryText(String field) {
413414
* @param query the query string to escape
414415
* @return the escaped query string
415416
*/
416-
@SuppressWarnings("fallthrough")
417417
private String escapeQueryString(String field, String query) {
418+
StringReader reader = new StringReader(query);
419+
StringBuilder res = new StringBuilder();
418420
switch (field) {
419421
case FULL:
420-
// The free text field may contain terms qualified with other
421-
// field names, so we don't escape single colons.
422-
return query.replace("::", "\\:\\:");
422+
FullQueryEscaper fesc = new FullQueryEscaper(reader);
423+
fesc.setOut(res);
424+
fesc.consume();
425+
break;
423426
case PATH:
424-
// workaround for replacing / with escaped / - needed since lucene 4.x
425427
if (!(query.startsWith("/") && query.endsWith("/"))) {
426-
return (query.replace(":", "\\:")).replace("/", "\\/");
428+
PathQueryEscaper pesc = new PathQueryEscaper(reader);
429+
pesc.setOut(res);
430+
pesc.consume();
431+
break;
427432
}
428-
// Other fields shouldn't use qualified terms, so escape colons
429-
// so that we can search for them.
433+
// FALLTHROUGH
430434
default:
431-
return query.replace(":", "\\:");
435+
DefaultQueryEscaper desc = new DefaultQueryEscaper(reader);
436+
desc.setOut(res);
437+
desc.consume();
432438
}
439+
return res.toString();
433440
}
434441

435442
/**
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2018, Chris Fraire <[email protected]>.
22+
*/
23+
24+
package org.opengrok.indexer.search;
25+
26+
import java.io.IOException;
27+
28+
/**
 * Represents an abstract base class for OpenGrok query-building term-
 * transformers.
 * <p>A subclass supplies the scanner through {@link #yylex()} and emits its
 * transformed text through {@link #appendOut(char)} or
 * {@link #appendOut(String)}. A target must be set with
 * {@link #setOut(StringBuilder)} before anything is appended.
 */
abstract class TermEscaperBase {

    /** Target of all appended output; unset until {@link #setOut} is called. */
    private StringBuilder out;

    /**
     * "Runs the scanner [as documented by JFlex].
     * <p>[The method] can be used to get the next token from the input."
     * <p>"Consume[s] input until one of the expressions in the specification
     * is matched or an error occurs."
     * @return {@code true} if {@link #consume()} should call again, or
     * {@code false} to make {@link #consume()} stop
     * @throws IOException if an error occurs reading the input
     */
    abstract boolean yylex() throws IOException;

    /**
     * Sets the buffer that receives the transformed text.
     * @param out the target to append
     */
    void setOut(StringBuilder out) {
        this.out = out;
    }

    /**
     * Appends a single character to the target.
     * @param c the character to append
     */
    void appendOut(char c) {
        out.append(c);
    }

    /**
     * Appends a string to the target.
     * @param s the text to append
     */
    void appendOut(String s) {
        out.append(s);
    }

    /**
     * Call {@link #yylex()} until {@code false}, which consumes all input so
     * that the argument to {@link #setOut(StringBuilder)} contains the entire
     * transformation.
     * @throws java.io.UncheckedIOException if {@link #yylex()} fails to read
     * its input; the target would otherwise hold a silently truncated result
     */
    void consume() {
        try {
            while (yylex()) {
                // Each call appends its own output; nothing else to do here.
            }
        } catch (IOException ex) {
            // Appending to a StringBuilder cannot raise this, so it can only
            // originate from the scanner's input. Surface it instead of
            // handing back a partial transformation.
            throw new java.io.UncheckedIOException(
                    "Failed to read the term text being escaped", ex);
        }
    }
}

opengrok-indexer/src/main/java/org/opengrok/indexer/web/Util.java

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import java.util.regex.Pattern;
6262
import java.util.zip.GZIPInputStream;
6363
import javax.servlet.http.HttpServletRequest;
64+
import org.apache.lucene.queryparser.classic.QueryParser;
6465
import org.opengrok.indexer.configuration.RuntimeEnvironment;
6566
import org.opengrok.indexer.history.Annotation;
6667
import org.opengrok.indexer.history.HistoryException;
@@ -77,18 +78,13 @@ public final class Util {
7778

7879
private static final int BOLD_COUNT_THRESHOLD = 1000;
7980

80-
/**
81-
* Matches a character that is not ASCII alpha-numeric or underscore.
82-
*/
83-
private static final Pattern NON_ASCII_ALPHA_NUM = Pattern.compile("[^A-Za-z0-9_]");
84-
8581
private static final String anchorLinkStart = "<a href=\"";
8682
private static final String anchorClassStart = "<a class=\"";
8783
private static final String anchorEnd = "</a>";
8884
private static final String closeQuotedTag = "\">";
8985

86+
/** Private to enforce static. */
9087
private Util() {
91-
// private to ensure static
9288
}
9389

9490
/**
@@ -142,24 +138,15 @@ public static String prehtmlize(CharSequence q) {
142138
}
143139

144140
/**
145-
* Append to {@code dest} the UTF-8 URL-encoded representation of
146-
* {@code str}, within explicit quotes (%22) to accommodate Lucene querying
147-
* if {@code str} contains any character that is not ASCII-alphanumeric or
148-
* underscore.
141+
* Append to {@code dest} the UTF-8 URL-encoded representation of the
142+
* Lucene-escaped version of {@code str}.
149143
* @param str a defined instance
150144
* @param dest a defined target
151145
* @throws IOException I/O exception
152146
*/
153147
public static void qurlencode(String str, Appendable dest)
154148
throws IOException {
155-
if (NON_ASCII_ALPHA_NUM.matcher(str).find()) {
156-
final String UQUOTE = "%22";
157-
dest.append(UQUOTE);
158-
URIEncode(str, dest);
159-
dest.append(UQUOTE);
160-
} else {
161-
URIEncode(str, dest);
162-
}
149+
URIEncode(QueryParser.escape(str), dest);
163150
}
164151

165152
/**
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2018, Chris Fraire <[email protected]>.
22+
*/
23+
24+
package org.opengrok.indexer.search;
25+
26+
%%
27+
%public
28+
%class DefaultQueryEscaper
29+
%extends TermEscaperBase
30+
%unicode
31+
%type boolean
32+
%eofval{
33+
return false;
34+
%eofval}
35+
36+
%include QueryEscaper.lexh
37+
%%
38+
39+
/*
 * Other fields shouldn't use qualified terms, so escape colons so that we can
 * search for them. This rule is listed ahead of the pass-through rule so that
 * it wins the equal-length tie for a lone colon.
 */
":"    {
    appendOut("\\:");
}

/*
 * Pass-through: a sequence that is already a Lucene escape (which, being
 * longer, is preferred over any single-character match) and any other input
 * are both copied to the output unchanged.
 */
{LuceneSpecialEscape} | [^]    {
    final int matched = yylength();
    for (int pos = 0; pos < matched; ++pos) {
        appendOut(yycharat(pos)); // faster than yytext()
    }
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2018, Chris Fraire <[email protected]>.
22+
*/
23+
24+
package org.opengrok.indexer.search;
25+
26+
%%
27+
%public
28+
%class FullQueryEscaper
29+
%extends TermEscaperBase
30+
%unicode
31+
%type boolean
32+
%eofval{
33+
return false;
34+
%eofval}
35+
36+
%include QueryEscaper.lexh
37+
%%
38+
39+
/*
 * The free text field may contain terms qualified with other field names, so
 * don't escape single colons. Only a double colon is escaped.
 */
"::"    {
    appendOut("\\:\\:");
}

/*
 * Pass-through: a sequence that is already a Lucene escape and any other
 * input (including a single colon) are both copied to the output unchanged.
 */
{LuceneSpecialEscape} | [^]    {
    final int matched = yylength();
    for (int pos = 0; pos < matched; ++pos) {
        appendOut(yycharat(pos)); // faster than yytext()
    }
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2018, Chris Fraire <[email protected]>.
22+
*/
23+
24+
package org.opengrok.indexer.search;
25+
26+
%%
27+
%public
28+
%class PathQueryEscaper
29+
%extends TermEscaperBase
30+
%unicode
31+
%type boolean
32+
%eofval{
33+
return false;
34+
%eofval}
35+
36+
%include QueryEscaper.lexh
37+
%%
38+
39+
/*
 * Colons are escaped. This rule and the next are listed ahead of the
 * pass-through rule so that they win the equal-length tie for a lone
 * character.
 */
":"    {
    appendOut("\\:");
}

/*
 * Workaround for replacing / with escaped / -- needed since lucene 4.x.
 */
"/"    {
    appendOut("\\/");
}

/*
 * Pass-through: a sequence that is already a Lucene escape (which, being
 * longer, is preferred over any single-character match) and any other input
 * are both copied to the output unchanged.
 */
{LuceneSpecialEscape} | [^]    {
    final int matched = yylength();
    for (int pos = 0; pos < matched; ++pos) {
        appendOut(yycharat(pos)); // faster than yytext()
    }
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2018, Chris Fraire <[email protected]>.
22+
*/
23+
24+
/*
 * Matches text that is already escaped for Lucene: a backslash followed by
 * one character that is special in Lucene query syntax. The forward slash is
 * part of the set because Lucene treats it as special (it delimits regular
 * expressions) and QueryParser.escape() emits it as \/ -- without it an
 * existing \/ would not be recognized as an escape, and an escaper with its
 * own "/" rule would escape the slash a second time.
 */
LuceneSpecialEscape = \\[\+\-\&\|\!\(\)\{\}\[\]\^\"\~\*\?\:\\\/]

opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ public void truncatedUuencodedFile() throws IOException {
423423
assertLinesEqual("UuencodeXref truncated",
424424
"<a class=\"l\" name=\"1\" href=\"#1\">1</a>"
425425
+ "<strong>begin</strong> <em>644</em> "
426-
+ "<a href=\"/source/s?full=%22test.txt%22\">test.txt</a>"
426+
+ "<a href=\"/source/s?full=test.txt\">test.txt</a>"
427427
+ "<span class=\"c\">\n"
428428
+ "<a class=\"l\" name=\"2\" href=\"#2\">2</a></span>",
429429
out.toString());

0 commit comments

Comments
 (0)