cklin
diff --git a/java/ql/lib/semmle/code/java/security/performance/ReDoSUtil.qll b/java/ql/lib/semmle/code/java/security/performance/ReDoSUtil.qll
Lines changed: 53 additions & 13 deletions
diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
Lines changed: 53 additions & 13 deletions
@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
 /**
  * Holds if `a` and `b` are input symbols from the same regexp.
  */
-private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
+private predicate sharesRoot(InputSymbol a, InputSymbol b) {
   exists(RegExpRoot root |
     belongsTo(a, root) and
     belongsTo(b, root)
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
 /**
  * Holds if the `a` is an input symbol from a regexp that has root `root`.
  */
-private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
+private predicate belongsTo(InputSymbol a, RegExpRoot root) {
   exists(State s | getRoot(s.getRepr()) = root |
     delta(s, a, _)
     or
@@ -378,6 +378,13 @@ private module CharacterClasses {
     )
   }
 
+  /**
+   * Gets the form of `char` to use for the regexp containing `cc`:
+   * `char.toLowerCase()` when `RegExpFlags::isIgnoreCase` holds for the root term of `cc`,
+   * and `char` itself otherwise.
+   */
+  bindingset[char, cc]
+  private string caseNormalize(string char, RegExpTerm cc) {
+    RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+    result = char.toLowerCase()
+    or
+    not RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+    result = char
+  }
+
   /**
    * An implementation of `CharacterClass` for positive (non inverted) character classes.
    */
@@ -386,7 +393,7 @@ private module CharacterClasses {
 
     PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
 
-    override string getARelevantChar() { result = getAMentionedChar(cc) }
+    override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
 
     override predicate matches(string char) { hasChildThatMatches(cc, char) }
   }
@@ -400,8 +407,8 @@ private module CharacterClasses {
     InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
 
     override string getARelevantChar() {
-      result = nextChar(getAMentionedChar(cc)) or
-      nextChar(result) = getAMentionedChar(cc)
+      result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
+      nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
     }
 
     bindingset[char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
    */
   private class PositiveCharacterClassEscape extends CharacterClass {
     string charClass;
+    RegExpTerm cc;
 
     PositiveCharacterClassEscape() {
-      exists(RegExpTerm cc |
-        isEscapeClass(cc, charClass) and
-        this = getCanonicalCharClass(cc) and
-        charClass = ["d", "s", "w"]
-      )
+      isEscapeClass(cc, charClass) and
+      this = getCanonicalCharClass(cc) and
+      charClass = ["d", "s", "w"]
     }
 
     override string getARelevantChar() {
@@ -445,7 +451,9 @@ private module CharacterClasses {
       result = " "
       or
       charClass = "w" and
-      result = ["a", "Z", "_", "0", "9"]
+      if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      then result = ["a", "z", "_", "0", "9"]
+      else result = ["a", "Z", "_", "0", "9"]
     }
 
     override predicate matches(string char) { classEscapeMatches(charClass, char) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
       not classEscapeMatches(charClass.toLowerCase(), char)
     }
   }
+
+  /**
+   * Gets a representative for all char classes that match the same chars as `c`.
+   *
+   * Two classes are treated as equivalent when `getNormalizationString` yields the same
+   * string for both. The representative is the equivalent class `CharClass(raw)` with the
+   * smallest `raw` value.
+   */
+  CharacterClass normalize(CharacterClass c) {
+    exists(string normalization |
+      normalization = getNormalizationString(c) and
+      result =
+        min(CharacterClass cc, string raw |
+          getNormalizationString(cc) = normalization and cc = CharClass(raw)
+        |
+          cc order by raw
+        )
+    )
+  }
+
+  /**
+   * Gets a string representing all the chars matched by `c`.
+   *
+   * For positive classes this is the concatenation of the relevant chars that `c` matches.
+   * For inverted/negative classes it is `"nn:"` followed by the concatenation of the
+   * relevant chars that `c` does not match.
+   */
+  private string getNormalizationString(CharacterClass c) {
+    (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
+    result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
+    or
+    (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
+    // The string produced by the concat cannot contain repeated chars,
+    // so by starting the string below with "nn:" we can guarantee that
+    // it will not overlap with the positive case above.
+    // A negative char class can never match the same chars as a positive one,
+    // so keeping the two cases apart does not miss any results.
+    result =
+      "nn:" +
+        concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
+  }
 }
 
 private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
+    lbl =
+      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
+            getCanonicalizationFlags(cc.getRootTerm()))) and
     q2 = after(cc)
   )
   or
   exists(RegExpTerm cc | isEscapeClass(cc, _) |
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
+    lbl =
+      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
+            getCanonicalizationFlags(cc.getRootTerm()))) and
     q2 = after(cc)
   )
   or
 
@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
 /**
  * Holds if `a` and `b` are input symbols from the same regexp.
  */
-private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
+private predicate sharesRoot(InputSymbol a, InputSymbol b) {
   exists(RegExpRoot root |
     belongsTo(a, root) and
     belongsTo(b, root)
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
 /**
  * Holds if the `a` is an input symbol from a regexp that has root `root`.
  */
-private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
+private predicate belongsTo(InputSymbol a, RegExpRoot root) {
   exists(State s | getRoot(s.getRepr()) = root |
     delta(s, a, _)
     or
@@ -378,6 +378,13 @@ private module CharacterClasses {
     )
   }
 
+  /**
+   * Gets the form of `char` to use for the regexp containing `cc`:
+   * `char.toLowerCase()` when `RegExpFlags::isIgnoreCase` holds for the root term of `cc`,
+   * and `char` itself otherwise.
+   */
+  bindingset[char, cc]
+  private string caseNormalize(string char, RegExpTerm cc) {
+    RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+    result = char.toLowerCase()
+    or
+    not RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+    result = char
+  }
+
   /**
    * An implementation of `CharacterClass` for positive (non inverted) character classes.
    */
@@ -386,7 +393,7 @@ private module CharacterClasses {
 
     PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
 
-    override string getARelevantChar() { result = getAMentionedChar(cc) }
+    override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
 
     override predicate matches(string char) { hasChildThatMatches(cc, char) }
   }
@@ -400,8 +407,8 @@ private module CharacterClasses {
     InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
 
     override string getARelevantChar() {
-      result = nextChar(getAMentionedChar(cc)) or
-      nextChar(result) = getAMentionedChar(cc)
+      result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
+      nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
     }
 
     bindingset[char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
    */
   private class PositiveCharacterClassEscape extends CharacterClass {
     string charClass;
+    RegExpTerm cc;
 
     PositiveCharacterClassEscape() {
-      exists(RegExpTerm cc |
-        isEscapeClass(cc, charClass) and
-        this = getCanonicalCharClass(cc) and
-        charClass = ["d", "s", "w"]
-      )
+      isEscapeClass(cc, charClass) and
+      this = getCanonicalCharClass(cc) and
+      charClass = ["d", "s", "w"]
     }
 
     override string getARelevantChar() {
@@ -445,7 +451,9 @@ private module CharacterClasses {
       result = " "
       or
       charClass = "w" and
-      result = ["a", "Z", "_", "0", "9"]
+      if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      then result = ["a", "z", "_", "0", "9"]
+      else result = ["a", "Z", "_", "0", "9"]
     }
 
     override predicate matches(string char) { classEscapeMatches(charClass, char) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
       not classEscapeMatches(charClass.toLowerCase(), char)
     }
   }
+
+  /**
+   * Gets a representative for all char classes that match the same chars as `c`.
+   *
+   * Two classes are treated as equivalent when `getNormalizationString` yields the same
+   * string for both. The representative is the equivalent class `CharClass(raw)` with the
+   * smallest `raw` value.
+   */
+  CharacterClass normalize(CharacterClass c) {
+    exists(string normalization |
+      normalization = getNormalizationString(c) and
+      result =
+        min(CharacterClass cc, string raw |
+          getNormalizationString(cc) = normalization and cc = CharClass(raw)
+        |
+          cc order by raw
+        )
+    )
+  }
+
+  /**
+   * Gets a string representing all the chars matched by `c`.
+   *
+   * For positive classes this is the concatenation of the relevant chars that `c` matches.
+   * For inverted/negative classes it is `"nn:"` followed by the concatenation of the
+   * relevant chars that `c` does not match.
+   */
+  private string getNormalizationString(CharacterClass c) {
+    (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
+    result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
+    or
+    (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
+    // The string produced by the concat cannot contain repeated chars,
+    // so by starting the string below with "nn:" we can guarantee that
+    // it will not overlap with the positive case above.
+    // A negative char class can never match the same chars as a positive one,
+    // so keeping the two cases apart does not miss any results.
+    result =
+      "nn:" +
+        concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
+  }
 }
 
 private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
+    lbl =
+      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
+            getCanonicalizationFlags(cc.getRootTerm()))) and
     q2 = after(cc)
   )
   or
   exists(RegExpTerm cc | isEscapeClass(cc, _) |
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
+    lbl =
+      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
+            getCanonicalizationFlags(cc.getRootTerm()))) and
     q2 = after(cc)
   )
   or