Skip to content

Commit a7cd097

Browse files
authored
Merge pull request #6756 from erik-krogh/extractBigReg
JS: extract regexp literals for string concatenations
2 parents 3a8e2db + 0023b88 commit a7cd097

File tree

21 files changed

+4723
-696
lines changed

21 files changed

+4723
-696
lines changed

javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java

Lines changed: 87 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import java.nio.file.Path;
44
import java.util.ArrayList;
55
import java.util.Collections;
6+
import java.util.HashSet;
7+
import java.util.Arrays;
68
import java.util.List;
79
import java.util.Set;
810
import java.util.Stack;
@@ -332,17 +334,28 @@ private static class Context {
332334
private final Label parent;
333335
private final int childIndex;
334336
private final IdContext idcontext;
337+
private final boolean binopOperand;
335338

336339
/**
 * Creates a context for a node that is not an operand of a binary operation.
 *
 * @param parent label of the parent node
 * @param childIndex index of the visited node among its parent's children
 * @param idcontext identifier context in which the node is visited
 */
public Context(Label parent, int childIndex, IdContext idcontext) {
  // Delegate to the full constructor with the binop-operand flag switched off.
  this(parent, childIndex, idcontext, false);
}
342+
343+
/**
 * Creates a context, explicitly recording whether the visited node is an operand of a binary
 * operation.
 *
 * @param parent label of the parent node
 * @param childIndex index of the visited node among its parent's children
 * @param idcontext identifier context in which the node is visited
 * @param binopOperand value later reported by {@link #isBinopOperand()}
 */
public Context(Label parent, int childIndex, IdContext idcontext, boolean binopOperand) {
  this.binopOperand = binopOperand;
  this.idcontext = idcontext;
  this.childIndex = childIndex;
  this.parent = parent;
}
341349

342350
/** True if the visited AST node occurs as part of a type annotation. */
343351
public boolean isInsideType() {
344352
return idcontext.isInsideType();
345353
}
354+
355+
/** True if the visited AST node occurs as one of the operands of a binary operation. */
356+
public boolean isBinopOperand() {
357+
return binopOperand;
358+
}
346359
}
347360

348361
private class V extends DefaultVisitor<Context, Label> {
@@ -358,16 +371,24 @@ public V(Platform platform, SourceType sourceType) {
358371
}
359372

360373
private Label visit(INode child, Label parent, int childIndex) {
361-
return visit(child, parent, childIndex, IdContext.VAR_BIND);
374+
return visit(child, parent, childIndex, IdContext.VAR_BIND, false);
362375
}
363376

364377
private Label visitAll(List<? extends INode> children, Label parent) {
365378
return visitAll(children, parent, IdContext.VAR_BIND, 0);
366379
}
367380

368381
private Label visit(INode child, Label parent, int childIndex, IdContext idContext) {
382+
return visit(child, parent, childIndex, idContext, false);
383+
}
384+
385+
private Label visit(INode child, Label parent, int childIndex, boolean binopOperand) {
386+
return visit(child, parent, childIndex, IdContext.VAR_BIND, binopOperand);
387+
}
388+
389+
private Label visit(INode child, Label parent, int childIndex, IdContext idContext, boolean binopOperand) {
369390
if (child == null) return null;
370-
return child.accept(this, new Context(parent, childIndex, idContext));
391+
return child.accept(this, new Context(parent, childIndex, idContext, binopOperand));
371392
}
372393

373394
private Label visitAll(
@@ -379,7 +400,7 @@ private Label visitAll(
379400
List<? extends INode> children, Label parent, IdContext idContext, int index, int step) {
380401
Label res = null;
381402
for (INode child : children) {
382-
res = visit(child, parent, index, idContext);
403+
res = visit(child, parent, index, idContext, false);
383404
index += step;
384405
}
385406
return res;
@@ -567,12 +588,17 @@ public Label visit(Literal nd, Context c) {
567588
String valueString = nd.getStringValue();
568589

569590
trapwriter.addTuple("literals", valueString, source, key);
591+
Position start = nd.getLoc().getStart();
592+
com.semmle.util.locations.Position startPos = new com.semmle.util.locations.Position(start.getLine(), start.getColumn() + 1 /* Convert from 0-based to 1-based. */, start.getOffset());
593+
570594
if (nd.isRegExp()) {
571595
OffsetTranslation offsets = new OffsetTranslation();
572596
offsets.set(0, 1); // skip the initial '/'
573-
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), offsets, nd, false);
574-
} else if (nd.isStringLiteral() && !c.isInsideType() && nd.getRaw().length() < 1000) {
575-
regexpExtractor.extract(valueString, makeStringLiteralOffsets(nd.getRaw()), nd, true);
597+
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getRaw()).offsetBy(0, offsets), startPos);
598+
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), sourceMap, nd, false);
599+
} else if (nd.isStringLiteral() && !c.isInsideType() && nd.getRaw().length() < 1000 && !c.isBinopOperand()) {
600+
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getRaw()).offsetBy(0, makeStringLiteralOffsets(nd.getRaw())), startPos);
601+
regexpExtractor.extract(valueString, sourceMap, nd, true);
576602

577603
// Scan the string for template tags, if we're in a context where such tags are relevant.
578604
if (scopeManager.isInTemplateFile()) {
@@ -593,6 +619,38 @@ private boolean isOctalDigit(char ch) {
593619
return '0' <= ch && ch <= '7';
594620
}
595621

622+
/**
623+
* Constant-folds simple string concatenations in `exp` while keeping an offset translation
624+
* that tracks back to the original source.
625+
*/
626+
private Pair<String, OffsetTranslation> getStringConcatResult(Expression exp) {
627+
if (exp instanceof BinaryExpression) {
628+
BinaryExpression be = (BinaryExpression) exp;
629+
if (be.getOperator().equals("+")) {
630+
Pair<String, OffsetTranslation> left = getStringConcatResult(be.getLeft());
631+
Pair<String, OffsetTranslation> right = getStringConcatResult(be.getRight());
632+
if (left == null || right == null) {
633+
return null;
634+
}
635+
String str = left.fst() + right.fst();
636+
if (str.length() > 1000) {
637+
return null;
638+
}
639+
640+
int delta = be.getRight().getLoc().getStart().getOffset() - be.getLeft().getLoc().getStart().getOffset();
641+
int offset = left.fst().length();
642+
return Pair.make(str, left.snd().append(right.snd(), offset, delta));
643+
}
644+
} else if (exp instanceof Literal) {
645+
Literal lit = (Literal) exp;
646+
if (!lit.isStringLiteral()) {
647+
return null;
648+
}
649+
return Pair.make(lit.getStringValue(), makeStringLiteralOffsets(lit.getRaw()));
650+
}
651+
return null;
652+
}
653+
596654
/**
597655
* Builds a translation from offsets in a string value back to its original raw literal text
598656
* (including quotes).
@@ -789,11 +847,32 @@ public Label visit(AssignmentExpression nd, Context c) {
789847
@Override
790848
public Label visit(BinaryExpression nd, Context c) {
791849
Label key = super.visit(nd, c);
792-
visit(nd.getLeft(), key, 0);
793-
visit(nd.getRight(), key, 1);
850+
visit(nd.getLeft(), key, 0, true);
851+
visit(nd.getRight(), key, 1, true);
852+
extractRegxpFromBinop(nd, c);
794853
return key;
795854
}
796855

856+
private void extractRegxpFromBinop(BinaryExpression nd, Context c) {
857+
if (c.isBinopOperand()) {
858+
return;
859+
}
860+
Pair<String, OffsetTranslation> concatResult = getStringConcatResult(nd);
861+
if (concatResult == null) {
862+
return;
863+
}
864+
String foldedString = concatResult.fst();
865+
if (foldedString.length() > 1000 && !foldedString.trim().isEmpty()) {
866+
return;
867+
}
868+
OffsetTranslation offsets = concatResult.snd();
869+
Position start = nd.getLoc().getStart();
870+
com.semmle.util.locations.Position startPos = new com.semmle.util.locations.Position(start.getLine(), start.getColumn() + 1 /* Convert from 0-based to 1-based. */, start.getOffset());
871+
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getLoc().getSource()).offsetBy(0, offsets), startPos);
872+
regexpExtractor.extract(foldedString, sourceMap, nd, true);
873+
return;
874+
}
875+
797876
@Override
798877
public Label visit(ComprehensionBlock nd, Context c) {
799878
Label key = super.visit(nd, c);

javascript/extractor/src/com/semmle/js/extractor/Main.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public class Main {
4343
* A version identifier that should be updated every time the extractor changes in such a way that
4444
* it may produce different tuples for the same file under the same {@link ExtractorConfig}.
4545
*/
46-
public static final String EXTRACTOR_VERSION = "2021-10-25";
46+
public static final String EXTRACTOR_VERSION = "2021-10-28";
4747

4848
public static final Pattern NEWLINE = Pattern.compile("\n");
4949

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
4444
import com.semmle.js.parser.RegExpParser;
4545
import com.semmle.js.parser.RegExpParser.Result;
46-
import com.semmle.util.locations.OffsetTranslation;
46+
import com.semmle.util.locations.SourceMap;
4747
import com.semmle.util.trap.TrapWriter;
4848
import com.semmle.util.trap.TrapWriter.Label;
4949

@@ -52,8 +52,7 @@ public class RegExpExtractor {
5252
private final TrapWriter trapwriter;
5353
private final LocationManager locationManager;
5454
private final RegExpParser parser = new RegExpParser();
55-
private Position literalStart;
56-
private OffsetTranslation offsets;
55+
private SourceMap sourceMap;
5756

5857
public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
5958
this.trapwriter = trapwriter;
@@ -122,17 +121,16 @@ private Label extractTerm(RegExpTerm term, Label parent, int idx) {
122121
}
123122

124123
public void emitLocation(SourceElement term, Label lbl) {
125-
int col = literalStart.getColumn();
126-
int sl, sc, el, ec;
127-
sl = el = literalStart.getLine();
128-
sc = col + offsets.get(term.getLoc().getStart().getColumn());
129-
ec = col + offsets.get(term.getLoc().getEnd().getColumn());
130-
sc += 1; // convert to 1-based
131-
ec += 1; // convert to 1-based
132-
ec -= 1; // convert to inclusive
124+
int start = term.getLoc().getStart().getColumn();
125+
int sl = sourceMap.getStart(start).getLine();
126+
int sc = sourceMap.getStart(start).getColumn();
127+
int end = term.getLoc().getEnd().getColumn();
128+
int el = sourceMap.getStart(end).getLine();
129+
int ec = sourceMap.getStart(end).getColumn() - 1; // convert to inclusive
133130
locationManager.emitSnippetLocation(lbl, sl, sc, el, ec);
134131
}
135132

133+
136134
private class V implements Visitor {
137135
private Label parent;
138136
private int idx;
@@ -348,16 +346,13 @@ public void visit(CharacterClassRange nd) {
348346
}
349347
}
350348

351-
public void extract(
352-
String src, OffsetTranslation offsets, Node parent, boolean isSpeculativeParsing) {
349+
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing) {
353350
Result res = parser.parse(src);
354-
355351
if (isSpeculativeParsing && res.getErrors().size() > 0) {
356352
return;
357353
}
358354

359-
this.literalStart = parent.getLoc().getStart();
360-
this.offsets = offsets;
355+
this.sourceMap = sourceMap;
361356
RegExpTerm ast = res.getAST();
362357
new V().visit(ast, trapwriter.localID(parent), 0);
363358

0 commit comments

Comments
 (0)