Skip to content

Commit b96be48

Browse files
authored
Merge pull request #2683 from tulinkry/OpenGrok-2670
detecting regexp literals in javascript
2 parents 312f4f6 + 0cf0fee commit b96be48

File tree

11 files changed

+234
-26
lines changed

11 files changed

+234
-26
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/javascript/JavaScriptAnalyzer.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved.
2222
* Portions Copyright (c) 2017-2018, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis.javascript;
@@ -50,11 +50,11 @@ protected JavaScriptAnalyzer(AnalyzerFactory factory) {
5050
* Gets a version number to be used to tag processed documents so that
5151
* re-analysis can be re-done later if a stored version number is different
5252
* from the current implementation.
53-
* @return 20180208_00
53+
* @return 20190217_00
5454
*/
5555
@Override
5656
protected int getSpecializedVersionNo() {
57-
return 20180208_00; // Edit comment above too!
57+
return 20190217_00; // Edit comment above too!
5858
}
5959

6060
/**

opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved.
2222
* Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
2323
*/
2424

@@ -42,7 +42,7 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
4242
%include CommonLexer.lexh
4343
%char
4444

45-
%state STRING COMMENT SCOMMENT QSTRING
45+
%state STRING REGEXP_START REGEXP COMMENT SCOMMENT QSTRING
4646

4747
%include JavaScript.lexh
4848
%%
@@ -56,6 +56,13 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
5656
{Number} {}
5757
\" { yybegin(STRING); }
5858
\' { yybegin(QSTRING); }
59+
/*
60+
* Regexp literals are ambiguous with the division operator "/" and are
61+
* told apart in JavaScript by context; when the context is ambiguous,
62+
* division takes precedence. We do best-effort context matching on a
63+
* preceding "=" (assignment), "(" (function call) or ":" (object property).
64+
*/
65+
[:=(][ \t\r\n]*/\/ { yybegin(REGEXP_START); }
5966
"/*" { yybegin(COMMENT); }
6067
"//" { yybegin(SCOMMENT); }
6168
}
@@ -65,6 +72,15 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
6572
\" { yybegin(YYINITIAL); }
6673
}
6774

75+
<REGEXP_START> {
76+
\/ { yybegin(REGEXP); }
77+
}
78+
79+
<REGEXP> {
80+
\\[/] {}
81+
\/[gimsuy]* { yybegin(YYINITIAL); }
82+
}
83+
6884
<QSTRING> {
6985
\\[\'\\] {}
7086
\' { yybegin(YYINITIAL); }
@@ -78,6 +94,6 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
7894
\n { yybegin(YYINITIAL);}
7995
}
8096

81-
<YYINITIAL, STRING, COMMENT, SCOMMENT, QSTRING> {
97+
<YYINITIAL, STRING, REGEXP_START, REGEXP, COMMENT, SCOMMENT, QSTRING> {
8298
[^] {}
8399
}

opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptXref.lex

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved.
2222
* Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
2323
*/
2424

@@ -68,7 +68,7 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] |
6868
[Xx][Mm][Ll] | [Cc][Oo][Nn][Ff] | [Tt][Xx][Tt] | [Hh][Tt][Mm][Ll]? |
6969
[Ii][Nn][Ii] | [Dd][Ii][Ff][Ff] | [Pp][Aa][Tt][Cc][Hh])
7070

71-
%state STRING COMMENT SCOMMENT QSTRING
71+
%state STRING REGEXP_START REGEXP COMMENT SCOMMENT QSTRING
7272

7373
%include Common.lexh
7474
%include CommonURI.lexh
@@ -98,7 +98,6 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] |
9898
onNonSymbolMatched(yytext(), yychar);
9999
onDisjointSpanChanged(null, yychar);
100100
}
101-
102101
\" {
103102
chkLOC();
104103
yypush(STRING);
@@ -121,6 +120,16 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] |
121120
onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
122121
onNonSymbolMatched(yytext(), yychar);
123122
}
123+
/*
124+
* Regexp literals are ambiguous with the division operator "/" and are
125+
* told apart in JavaScript by context; when the context is ambiguous,
126+
* division takes precedence. We do best-effort context matching on a
127+
* preceding "=" (assignment), "(" (function call) or ":" (object property).
128+
*/
129+
[:=(]{WhspChar}*/\/ {
130+
yypush(REGEXP_START);
131+
onNonSymbolMatched(yytext(), yychar);
132+
}
124133
}
125134

126135
<STRING> {
@@ -133,6 +142,19 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] |
133142
}
134143
}
135144

145+
<REGEXP_START> {
146+
\/ {
147+
onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
148+
onNonSymbolMatched(yytext(), yychar);
149+
yybegin(REGEXP);
150+
}
151+
}
152+
153+
<REGEXP> {
154+
\\[/] { onNonSymbolMatched(yytext(), yychar); }
155+
\/[gimsuy]* { chkLOC(); onNonSymbolMatched(yytext(), yychar); yypop(); }
156+
}
157+
136158
<QSTRING> {
137159
\\[\'\\] |
138160
\' {WhspChar}+ \' { chkLOC(); onNonSymbolMatched(yytext(), yychar); }
@@ -154,7 +176,7 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] |
154176
}
155177
}
156178

157-
<YYINITIAL, STRING, COMMENT, SCOMMENT, QSTRING> {
179+
<YYINITIAL, STRING, REGEXP_START, REGEXP, COMMENT, SCOMMENT, QSTRING> {
158180
{WhspChar}*{EOL} { onEndOfLineMatched(yytext(), yychar); }
159181
[[\s]--[\n]] { onNonSymbolMatched(yytext(), yychar); }
160182
[^\n] { chkLOC(); onNonSymbolMatched(yytext(), yychar); }

opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptSymbolTokenizerTest.java

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,21 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2010, 2018, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved.
2222
* Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
2323
*/
2424

2525
package org.opengrok.indexer.analysis.javascript;
2626

27+
import static org.junit.Assert.assertNotNull;
28+
import static org.opengrok.indexer.util.CustomAssertions.assertSymbolStream;
29+
2730
import java.io.BufferedReader;
2831
import java.io.InputStream;
2932
import java.io.InputStreamReader;
3033
import java.util.ArrayList;
3134
import java.util.List;
32-
import static org.junit.Assert.assertNotNull;
3335
import org.junit.Test;
34-
import static org.opengrok.indexer.util.CustomAssertions.assertSymbolStream;
3536

3637
/**
3738
* Tests the {@link JavaScriptSymbolTokenizer} class.
@@ -40,29 +41,44 @@ public class JavaScriptSymbolTokenizerTest {
4041

4142
/**
4243
* Test sample.js v. samplesymbols.txt
44+
*
4345
* @throws java.lang.Exception thrown on error
4446
*/
4547
@Test
4648
public void testJavaScriptSymbolStream() throws Exception {
49+
testSymbols("analysis/javascript/sample.js", "analysis/javascript/samplesymbols.txt");
50+
}
51+
52+
@Test
53+
public void testRegexpWithModifiersSymbols() throws Exception {
54+
testSymbols("analysis/javascript/regexp_modifiers.js", "analysis/javascript/regexp_modifiers_symbols.txt");
55+
}
56+
57+
@Test
58+
public void testRegexpSymbols() throws Exception {
59+
testSymbols("analysis/javascript/regexp_plain.js", "analysis/javascript/regexp_plain_symbols.txt");
60+
}
61+
62+
private void testSymbols(String codeResource, String symbolsResource) throws Exception {
4763
InputStream jsres = getClass().getClassLoader().getResourceAsStream(
48-
"analysis/javascript/sample.js");
49-
assertNotNull("despite sample.js as resource,", jsres);
64+
codeResource);
65+
assertNotNull(String.format("Unable to find %s as a resource", codeResource), jsres);
5066
InputStream symres = getClass().getClassLoader().getResourceAsStream(
51-
"analysis/javascript/samplesymbols.txt");
52-
assertNotNull("despite samplesymbols.txt as resource,", symres);
67+
symbolsResource);
68+
assertNotNull(String.format("Unable to find %s as a resource", symbolsResource), symres);
5369

5470
List<String> expectedSymbols = new ArrayList<>();
55-
try (BufferedReader wdsr = new BufferedReader(new InputStreamReader(
56-
symres, "UTF-8"))) {
71+
try (BufferedReader wdsr = new BufferedReader(new InputStreamReader(symres, "UTF-8"))) {
5772
String line;
5873
while ((line = wdsr.readLine()) != null) {
5974
int hasho = line.indexOf('#');
60-
if (hasho != -1) line = line.substring(0, hasho);
75+
if (hasho != -1) {
76+
line = line.substring(0, hasho);
77+
}
6178
expectedSymbols.add(line.trim());
6279
}
6380
}
6481

65-
assertSymbolStream(JavaScriptSymbolTokenizer.class, jsres,
66-
expectedSymbols);
82+
assertSymbolStream(JavaScriptSymbolTokenizer.class, jsres, expectedSymbols);
6783
}
6884
}

opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptXrefTest.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,17 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2012, 2018, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2012, 2019, Oracle and/or its affiliates. All rights reserved.
2222
* Portions Copyright (c) 2017, 2019, Chris Fraire <[email protected]>.
2323
*/
2424

2525
package org.opengrok.indexer.analysis.javascript;
2626

27+
import static org.opengrok.indexer.util.StreamUtils.readTagsFromResource;
28+
29+
import java.io.IOException;
2730
import org.junit.Test;
2831
import org.opengrok.indexer.analysis.XrefTestBase;
29-
import java.io.IOException;
30-
31-
import static org.opengrok.indexer.util.StreamUtils.readTagsFromResource;
3232

3333
/**
3434
* Tests the {@link JavaScriptXref} class.
@@ -49,4 +49,18 @@ public void shouldCloseTruncatedStringSpan() throws IOException {
4949
"analysis/javascript/truncated.js",
5050
"analysis/javascript/truncated_xref.html", null, 1);
5151
}
52+
53+
@Test
54+
public void shouldDetectRegularExpressionWithoutModifiers() throws IOException {
55+
writeAndCompare(new JavaScriptAnalyzerFactory(),
56+
"analysis/javascript/regexp_plain.js",
57+
"analysis/javascript/regexp_plain_xref.html", null, 14);
58+
}
59+
60+
@Test
61+
public void shouldDetectRegularExpressionWithModifiers() throws IOException {
62+
writeAndCompare(new JavaScriptAnalyzerFactory(),
63+
"analysis/javascript/regexp_modifiers.js",
64+
"analysis/javascript/regexp_modifiers_xref.html", null, 14);
65+
}
5266
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
function escapeLuceneCharacters1(term) {
2+
// must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
3+
var pattern = /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms;
4+
5+
return term.replace(pattern, "\\$1");
6+
}
7+
8+
function escapeLuceneCharacters2(term) {
9+
// must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
10+
var pattern = {
11+
pattern: /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms
12+
};
13+
14+
return term.replace(pattern, "\\$1");
15+
}
16+
17+
function escapeLuceneCharacters3(term) {
18+
// must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
19+
var pattern = new RegExp(/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms);
20+
21+
return term.replace(pattern, "\\$1");
22+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
escapeLuceneCharacters1
2+
term
3+
pattern
4+
term
5+
replace
6+
pattern
7+
escapeLuceneCharacters2
8+
term
9+
pattern
10+
pattern
11+
term
12+
replace
13+
pattern
14+
escapeLuceneCharacters3
15+
term
16+
pattern
17+
term
18+
replace
19+
pattern
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>sampleFile - OpenGrok cross reference for /sampleFile</title></head><body>
6+
<a class="l" name="1" href="#1">1</a><b>function</b> <a href="/source/s?defs=escapeLuceneCharacters1" class="intelliWindow-symbol" data-definition-place="undefined-in-file">escapeLuceneCharacters1</a>(<a href="/source/s?defs=term" class="intelliWindow-symbol" data-definition-place="undefined-in-file">term</a>) {
7+
<a class="l" name="2" href="#2">2</a> <span class="c">// must escape: + - &amp;&amp; || ! ( ) { } [ ] ^ &quot; ~ * ? : \</span>
8+
<a class="l" name="3" href="#3">3</a> <b>var</b> <a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a> = <span class="s">/([\+\-\!\(\)\{\}\[\]\^\&quot;\~\*\?\:\\]|&amp;&amp;|\|\|)/gms</span>;
9+
<a class="l" name="4" href="#4">4</a>
10+
<a class="l" name="5" href="#5">5</a> <b>return</b> <a href="/source/s?defs=term" class="intelliWindow-symbol" data-definition-place="undefined-in-file">term</a>.<a href="/source/s?defs=replace" class="intelliWindow-symbol" data-definition-place="undefined-in-file">replace</a>(<a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a>, <span class="s">&quot;\\$1&quot;</span>);
11+
<a class="l" name="6" href="#6">6</a>}
12+
<a class="l" name="7" href="#7">7</a>
13+
<a class="l" name="8" href="#8">8</a><b>function</b> <a href="/source/s?defs=escapeLuceneCharacters2" class="intelliWindow-symbol" data-definition-place="undefined-in-file">escapeLuceneCharacters2</a>(<a href="/source/s?defs=term" class="intelliWindow-symbol" data-definition-place="undefined-in-file">term</a>) {
14+
<a class="l" name="9" href="#9">9</a> <span class="c">// must escape: + - &amp;&amp; || ! ( ) { } [ ] ^ &quot; ~ * ? : \</span>
15+
<a class="hl" name="10" href="#10">10</a> <b>var</b> <a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a> = {
16+
<a class="l" name="11" href="#11">11</a> <a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a>: <span class="s">/([\+\-\!\(\)\{\}\[\]\^\&quot;\~\*\?\:\\]|&amp;&amp;|\|\|)/gms</span>
17+
<a class="l" name="12" href="#12">12</a> };
18+
<a class="l" name="13" href="#13">13</a>
19+
<a class="l" name="14" href="#14">14</a> <b>return</b> <a href="/source/s?defs=term" class="intelliWindow-symbol" data-definition-place="undefined-in-file">term</a>.<a href="/source/s?defs=replace" class="intelliWindow-symbol" data-definition-place="undefined-in-file">replace</a>(<a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a>, <span class="s">&quot;\\$1&quot;</span>);
20+
<a class="l" name="15" href="#15">15</a>}
21+
<a class="l" name="16" href="#16">16</a>
22+
<a class="l" name="17" href="#17">17</a><b>function</b> <a href="/source/s?defs=escapeLuceneCharacters3" class="intelliWindow-symbol" data-definition-place="undefined-in-file">escapeLuceneCharacters3</a>(<a href="/source/s?defs=term" class="intelliWindow-symbol" data-definition-place="undefined-in-file">term</a>) {
23+
<a class="l" name="18" href="#18">18</a> <span class="c">// must escape: + - &amp;&amp; || ! ( ) { } [ ] ^ &quot; ~ * ? : \</span>
24+
<a class="l" name="19" href="#19">19</a> <b>var</b> <a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a> = <b>new</b> <b>RegExp</b>(<span class="s">/([\+\-\!\(\)\{\}\[\]\^\&quot;\~\*\?\:\\]|&amp;&amp;|\|\|)/gms</span>);
25+
<a class="hl" name="20" href="#20">20</a>
26+
<a class="l" name="21" href="#21">21</a> <b>return</b> <a href="/source/s?defs=term" class="intelliWindow-symbol" data-definition-place="undefined-in-file">term</a>.<a href="/source/s?defs=replace" class="intelliWindow-symbol" data-definition-place="undefined-in-file">replace</a>(<a href="/source/s?defs=pattern" class="intelliWindow-symbol" data-definition-place="undefined-in-file">pattern</a>, <span class="s">&quot;\\$1&quot;</span>);
27+
<a class="l" name="22" href="#22">22</a>}
28+
<a class="l" name="23" href="#23">23</a></body>
29+
</html>
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
function escapeLuceneCharacters1(term) {
2+
// must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
3+
var pattern = /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/;
4+
5+
return term.replace(pattern, "\\$1");
6+
}
7+
8+
function escapeLuceneCharacters2(term) {
9+
// must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
10+
var pattern = {
11+
pattern: /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/
12+
};
13+
14+
return term.replace(pattern, "\\$1");
15+
}
16+
17+
function escapeLuceneCharacters3(term) {
18+
// must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
19+
var pattern = new RegExp(/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/);
20+
21+
return term.replace(pattern, "\\$1");
22+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
escapeLuceneCharacters1
2+
term
3+
pattern
4+
term
5+
replace
6+
pattern
7+
escapeLuceneCharacters2
8+
term
9+
pattern
10+
pattern
11+
term
12+
replace
13+
pattern
14+
escapeLuceneCharacters3
15+
term
16+
pattern
17+
term
18+
replace
19+
pattern

0 commit comments

Comments
 (0)