Skip to content

Commit 12305aa

Browse files
committed
extract regexp literals from string concatenations
1 parent 9478faf commit 12305aa

File tree

16 files changed

+3460
-149
lines changed

16 files changed

+3460
-149
lines changed

javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import java.nio.file.Path;
44
import java.util.ArrayList;
55
import java.util.Collections;
6+
import java.util.HashSet;
7+
import java.util.Arrays;
68
import java.util.List;
79
import java.util.Set;
810
import java.util.Stack;
@@ -164,6 +166,9 @@
164166
import com.semmle.util.trap.TrapWriter;
165167
import com.semmle.util.trap.TrapWriter.Label;
166168

169+
import com.semmle.util.files.FileLineOffsetCache;
170+
171+
167172
/** Extractor for AST-based information; invoked by the {@link JSExtractor}. */
168173
public class ASTExtractor {
169174
private final TrapWriter trapwriter;
@@ -567,12 +572,17 @@ public Label visit(Literal nd, Context c) {
567572
String valueString = nd.getStringValue();
568573

569574
trapwriter.addTuple("literals", valueString, source, key);
575+
Position start = nd.getLoc().getStart();
576+
com.semmle.util.locations.Position startPos = new com.semmle.util.locations.Position(start.getLine(), start.getColumn(), start.getOffset());
577+
570578
if (nd.isRegExp()) {
571579
OffsetTranslation offsets = new OffsetTranslation();
572580
offsets.set(0, 1); // skip the initial '/'
573-
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), offsets, nd, false);
581+
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getRaw()).offsetBy(0, offsets), startPos);
582+
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), sourceMap, nd, false);
574583
} else if (nd.isStringLiteral() && !c.isInsideType() && nd.getRaw().length() < 1000) {
575-
regexpExtractor.extract(valueString, makeStringLiteralOffsets(nd.getRaw()), nd, true);
584+
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getRaw()).offsetBy(0, makeStringLiteralOffsets(nd.getRaw())), startPos);
585+
regexpExtractor.extract(valueString, sourceMap, nd, true);
576586

577587
// Scan the string for template tags, if we're in a context where such tags are relevant.
578588
if (scopeManager.isInTemplateFile()) {
@@ -593,6 +603,48 @@ private boolean isOctalDigit(char ch) {
593603
return '0' <= ch && ch <= '7';
594604
}
595605

606+
private String getStringConcatResult(Expression exp) {
607+
if (exp instanceof BinaryExpression) {
608+
BinaryExpression be = (BinaryExpression) exp;
609+
if (be.getOperator().equals("+")) {
610+
String left = getStringConcatResult(be.getLeft());
611+
String right = getStringConcatResult(be.getRight());
612+
if (left != null && right != null) {
613+
return left + right;
614+
}
615+
}
616+
} else if (exp instanceof Literal) {
617+
Literal lit = (Literal) exp;
618+
if (!lit.isStringLiteral()) {
619+
return null;
620+
}
621+
return lit.getStringValue();
622+
}
623+
return null;
624+
}
625+
626+
private OffsetTranslation computeStringConcatOffset(Expression exp) {
627+
if (exp instanceof Literal && ((Literal)exp).isStringLiteral()) {
628+
String raw = ((Literal) exp).getRaw();
629+
return makeStringLiteralOffsets(raw);
630+
}
631+
632+
if (exp instanceof BinaryExpression) {
633+
BinaryExpression be = (BinaryExpression) exp;
634+
OffsetTranslation left = computeStringConcatOffset(be.getLeft());
635+
OffsetTranslation right = computeStringConcatOffset(be.getRight());
636+
637+
if (left == null || right == null) {
638+
return null;
639+
}
640+
int delta = be.getRight().getLoc().getStart().getOffset() - be.getLeft().getLoc().getStart().getOffset();
641+
int offset = getStringConcatResult(be.getLeft()).length();
642+
return left.append(right, offset, delta);
643+
}
644+
645+
return null;
646+
}
647+
596648
/**
597649
* Builds a translation from offsets in a string value back to its original raw literal text
598650
* (including quotes).
@@ -786,11 +838,31 @@ public Label visit(AssignmentExpression nd, Context c) {
786838
return key;
787839
}
788840

841+
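// Operands of the binary expressions visited so far. A binary expression found in this set is
// nested inside another binary expression and is not considered for regexp extraction itself.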
842+
private Set<Expression> extractedAsRegexp = new HashSet<>();
843+
789844
@Override
790845
public Label visit(BinaryExpression nd, Context c) {
791846
Label key = super.visit(nd, c);
847+
extractedAsRegexp.add(nd.getLeft());
848+
extractedAsRegexp.add(nd.getRight());
792849
visit(nd.getLeft(), key, 0);
793850
visit(nd.getRight(), key, 1);
851+
if (extractedAsRegexp.contains(nd)) {
852+
return key;
853+
}
854+
String rawString = getStringConcatResult(nd);
855+
if (rawString == null) {
856+
return key;
857+
}
858+
if (rawString.length() > 1000 && !rawString.trim().isEmpty()) {
859+
return key;
860+
}
861+
OffsetTranslation offsets = computeStringConcatOffset(nd);
862+
Position start = nd.getLoc().getStart();
863+
com.semmle.util.locations.Position startPos = new com.semmle.util.locations.Position(start.getLine(), start.getColumn(), start.getOffset());
864+
SourceMap sourceMap = SourceMap.legacyWithStartPos(SourceMap.fromString(nd.getLoc().getSource()).offsetBy(0, offsets), startPos);
865+
regexpExtractor.extract(rawString, sourceMap, nd, true);
794866
return key;
795867
}
796868

javascript/extractor/src/com/semmle/js/extractor/Main.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public class Main {
4343
* A version identifier that should be updated every time the extractor changes in such a way that
4444
* it may produce different tuples for the same file under the same {@link ExtractorConfig}.
4545
*/
46-
public static final String EXTRACTOR_VERSION = "2021-10-25";
46+
public static final String EXTRACTOR_VERSION = "2021-10-28";
4747

4848
public static final Pattern NEWLINE = Pattern.compile("\n");
4949

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
4444
import com.semmle.js.parser.RegExpParser;
4545
import com.semmle.js.parser.RegExpParser.Result;
46-
import com.semmle.util.locations.OffsetTranslation;
46+
import com.semmle.util.locations.SourceMap;
4747
import com.semmle.util.trap.TrapWriter;
4848
import com.semmle.util.trap.TrapWriter.Label;
4949

@@ -52,8 +52,7 @@ public class RegExpExtractor {
5252
private final TrapWriter trapwriter;
5353
private final LocationManager locationManager;
5454
private final RegExpParser parser = new RegExpParser();
55-
private Position literalStart;
56-
private OffsetTranslation offsets;
55+
private SourceMap sourceMap;
5756

5857
public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
5958
this.trapwriter = trapwriter;
@@ -122,17 +121,14 @@ private Label extractTerm(RegExpTerm term, Label parent, int idx) {
122121
}
123122

124123
public void emitLocation(SourceElement term, Label lbl) {
125-
int col = literalStart.getColumn();
126-
int sl, sc, el, ec;
127-
sl = el = literalStart.getLine();
128-
sc = col + offsets.get(term.getLoc().getStart().getColumn());
129-
ec = col + offsets.get(term.getLoc().getEnd().getColumn());
130-
sc += 1; // convert to 1-based
131-
ec += 1; // convert to 1-based
132-
ec -= 1; // convert to inclusive
124+
int sl = sourceMap.getStart(term.getLoc().getStart().getColumn()).getLine();
125+
int sc = sourceMap.getStart(term.getLoc().getStart().getColumn()).getColumn() + 1; // convert to 1-based
126+
int el = sourceMap.getEnd(term.getLoc().getEnd().getColumn()).getLine();
127+
int ec = sourceMap.getEnd(term.getLoc().getEnd().getColumn()).getColumn() - 1; // convert to inclusive
133128
locationManager.emitSnippetLocation(lbl, sl, sc, el, ec);
134129
}
135130

131+
136132
private class V implements Visitor {
137133
private Label parent;
138134
private int idx;
@@ -348,16 +344,13 @@ public void visit(CharacterClassRange nd) {
348344
}
349345
}
350346

351-
public void extract(
352-
String src, OffsetTranslation offsets, Node parent, boolean isSpeculativeParsing) {
347+
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing) {
353348
Result res = parser.parse(src);
354-
355349
if (isSpeculativeParsing && res.getErrors().size() > 0) {
356350
return;
357351
}
358352

359-
this.literalStart = parent.getLoc().getStart();
360-
this.offsets = offsets;
353+
this.sourceMap = sourceMap;
361354
RegExpTerm ast = res.getAST();
362355
new V().visit(ast, trapwriter.localID(parent), 0);
363356

0 commit comments

Comments
 (0)