Skip to content

Commit 9cfa166

Browse files
inponomarev, eddumelendez, kiview
authored
Enhance ScriptUtils: fix parser (#7646)
Co-authored-by: Eddú Meléndez <[email protected]> Co-authored-by: Kevin Wittek <[email protected]>
1 parent 3422ebb commit 9cfa166

File tree

4 files changed

+511
-202
lines changed

4 files changed

+511
-202
lines changed
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
package org.testcontainers.ext;
2+
3+
import lombok.Getter;
4+
import lombok.RequiredArgsConstructor;
5+
6+
import java.util.regex.Matcher;
7+
import java.util.regex.Pattern;
8+
9+
/**
10+
* Rough lexical parser for SQL scripts.
11+
*/
12+
@RequiredArgsConstructor
13+
class ScriptScanner {
14+
15+
private final String resource;
16+
17+
private final String script;
18+
19+
private final String separator;
20+
21+
private final String commentPrefix;
22+
23+
private final String blockCommentStartDelimiter;
24+
25+
private final String blockCommentEndDelimiter;
26+
27+
private final Pattern eol = Pattern.compile("[\n\r]+");
28+
29+
private final Pattern whitespace = Pattern.compile("\\s+");
30+
31+
private final Pattern identifier = Pattern.compile("[a-z][a-z0-9_]*", Pattern.CASE_INSENSITIVE);
32+
33+
private final Pattern singleQuotedString = Pattern.compile("'(\\\\'|[^'])*'");
34+
35+
private final Pattern ansiQuotedString = Pattern.compile("\"(\\\\\"|[^\"])*\"");
36+
37+
private final Pattern dollarQuotedStringDelimiter = Pattern.compile("\\$\\w*\\$");
38+
39+
private int offset;
40+
41+
@Getter
42+
private String currentMatch;
43+
44+
private boolean matches(String substring) {
45+
if (script.startsWith(substring, offset)) {
46+
currentMatch = substring;
47+
offset += currentMatch.length();
48+
return true;
49+
} else {
50+
currentMatch = "";
51+
return false;
52+
}
53+
}
54+
55+
private boolean matches(Pattern regexp) {
56+
Matcher m = regexp.matcher(script);
57+
if (m.find(offset) && m.start() == offset) {
58+
currentMatch = m.group();
59+
offset = m.end();
60+
return true;
61+
} else {
62+
currentMatch = "";
63+
return false;
64+
}
65+
}
66+
67+
private boolean matchesSingleLineComment() {
68+
/* Matches from commentPrefix to the EOL or end of script */
69+
if (matches(commentPrefix)) {
70+
Matcher m = eol.matcher(script);
71+
if (m.find(offset)) {
72+
currentMatch = commentPrefix + script.substring(offset, m.end());
73+
offset = m.end();
74+
} else {
75+
currentMatch = commentPrefix + script.substring(offset);
76+
offset = script.length();
77+
}
78+
return true;
79+
}
80+
return false;
81+
}
82+
83+
private boolean matchesMultilineComment() {
84+
/* Matches from blockCommentStartDelimiter to the next blockCommentEndDelimiter.
85+
* Error, if blockCommentEndDelimiter is not found. */
86+
if (matches(blockCommentStartDelimiter)) {
87+
int end = script.indexOf(blockCommentEndDelimiter, offset);
88+
if (end < 0) {
89+
throw new ScriptUtils.ScriptParseException(
90+
String.format("Missing block comment end delimiter [%s].", blockCommentEndDelimiter),
91+
resource
92+
);
93+
}
94+
end += blockCommentEndDelimiter.length();
95+
currentMatch = blockCommentStartDelimiter + script.substring(offset, end);
96+
offset = end;
97+
return true;
98+
}
99+
return false;
100+
}
101+
102+
private boolean matchesDollarQuotedString() {
103+
//Matches $<tag>$ .... $<tag>$
104+
if (matches(dollarQuotedStringDelimiter)) {
105+
String delimiter = currentMatch;
106+
int end = script.indexOf(delimiter, offset);
107+
if (end < 0) {
108+
throw new ScriptUtils.ScriptParseException(
109+
String.format("Unclosed dollar quoted string [%s].", delimiter),
110+
resource
111+
);
112+
}
113+
end += delimiter.length();
114+
currentMatch = delimiter + script.substring(offset, end);
115+
offset = end;
116+
return true;
117+
}
118+
return false;
119+
}
120+
121+
Lexem next() {
122+
if (offset < script.length()) {
123+
if (matches(separator)) {
124+
return Lexem.SEPARATOR;
125+
} else if (matchesSingleLineComment() || matchesMultilineComment()) {
126+
return Lexem.COMMENT;
127+
} else if (matches(singleQuotedString) || matches(ansiQuotedString) || matchesDollarQuotedString()) {
128+
return Lexem.QUOTED_STRING;
129+
} else if (matches(identifier)) {
130+
return Lexem.IDENTIFIER;
131+
} else if (matches(whitespace)) {
132+
return Lexem.WHITESPACE;
133+
} else {
134+
currentMatch = String.valueOf(script.charAt(offset++));
135+
return Lexem.OTHER;
136+
}
137+
} else {
138+
return Lexem.EOF;
139+
}
140+
}
141+
142+
enum Lexem {
143+
SEPARATOR,
144+
COMMENT,
145+
QUOTED_STRING,
146+
WHITESPACE,
147+
IDENTIFIER,
148+
OTHER,
149+
EOF,
150+
}
151+
}
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
package org.testcontainers.ext;
2+
3+
import lombok.RequiredArgsConstructor;
4+
import org.apache.commons.lang3.StringUtils;
5+
import org.testcontainers.ext.ScriptScanner.Lexem;
6+
7+
import java.util.List;
8+
9+
/**
10+
* Performs splitting of an SQL script into statements including
11+
* basic clean-up.
12+
*/
13+
@RequiredArgsConstructor
14+
class ScriptSplitter {
15+
16+
private final ScriptScanner scanner;
17+
18+
private final List<String> statements;
19+
20+
private final StringBuilder sb = new StringBuilder();
21+
22+
/**
23+
* Standard parsing:
24+
* 1. Remove comments
25+
* 2. Shrink whitespace and eols
26+
* 3. Split on separator
27+
*/
28+
void split() {
29+
Lexem l;
30+
while ((l = scanner.next()) != Lexem.EOF) {
31+
switch (l) {
32+
case SEPARATOR:
33+
flushStringBuilder();
34+
break;
35+
case COMMENT:
36+
//skip
37+
break;
38+
case WHITESPACE:
39+
if (!sb.toString().endsWith(" ")) {
40+
sb.append(' ');
41+
}
42+
break;
43+
case IDENTIFIER:
44+
appendMatch();
45+
if ("begin".equalsIgnoreCase(scanner.getCurrentMatch())) {
46+
compoundStatement(false);
47+
flushStringBuilder();
48+
}
49+
break;
50+
default:
51+
appendMatch();
52+
}
53+
}
54+
flushStringBuilder();
55+
}
56+
57+
/**
58+
* Compound statement ('create procedure') mode:
59+
* 1. Do not remove comments
60+
* 2. Do not shrink whitespace
61+
* 3. Do not split on separators
62+
* 3. This mode can be recursive
63+
*/
64+
private void compoundStatement(boolean recursive) {
65+
Lexem l;
66+
while ((l = scanner.next()) != Lexem.EOF) {
67+
appendMatch();
68+
if (Lexem.IDENTIFIER.equals(l)) {
69+
if ("begin".equalsIgnoreCase(scanner.getCurrentMatch())) {
70+
compoundStatement(true);
71+
} else if ("end".equalsIgnoreCase(scanner.getCurrentMatch())) {
72+
if (endOfBlock(recursive)) {
73+
return;
74+
}
75+
}
76+
}
77+
}
78+
flushStringBuilder();
79+
}
80+
81+
private boolean endOfBlock(boolean recursive) {
82+
Lexem l;
83+
StringBuilder temporary = new StringBuilder();
84+
while ((l = scanner.next()) != Lexem.EOF) {
85+
switch (l) {
86+
case COMMENT:
87+
case WHITESPACE:
88+
temporary.append(scanner.getCurrentMatch());
89+
break;
90+
case SEPARATOR:
91+
//Only whitespace and comments preceded the separator: true end of block
92+
//If it's an internal block, append everything
93+
if (recursive) {
94+
sb.append(temporary);
95+
appendMatch();
96+
}
97+
return true;
98+
default:
99+
// Semicolon is not recognized as separator: this means that a custom
100+
// separator is used. Still, 'END;' should be a valid end of block
101+
if (";".equals(scanner.getCurrentMatch())) {
102+
if (recursive) {
103+
sb.append(temporary);
104+
appendMatch();
105+
}
106+
return true;
107+
}
108+
sb.append(temporary);
109+
appendMatch();
110+
return false;
111+
}
112+
}
113+
return true;
114+
}
115+
116+
private void appendMatch() {
117+
sb.append(scanner.getCurrentMatch());
118+
}
119+
120+
private void flushStringBuilder() {
121+
final String s = sb.toString().trim();
122+
if (StringUtils.isNotEmpty(s)) {
123+
statements.add(s);
124+
}
125+
sb.setLength(0);
126+
}
127+
}

0 commit comments

Comments
 (0)