Merge pull request github#11071 from erik-krogh/fixCanon

erik-krogh · web-flow · commit d67235b3c108 · 2022-11-07T14:10:50.000+01:00
ReDoS: fix canonicalization in NfaUtils
diff --git a/java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll b/java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll
@@ -129,19 +129,20 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
     min(RelevantRegExpTerm t, Location loc, File file |
       loc = t.getLocation() and
       file = t.getFile() and
-      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
+      str = getCanonicalizationString(t)
     |
       t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
     )
 }
 
 /**
- * Gets a string representation of the flags used with the regular expression.
- * Only the flags that are relevant for the canonicalization are included.
+ * Gets a string representation of `term` that is used for canonicalization.
  */
-string getCanonicalizationFlags(RegExpTerm root) {
-  root.isRootTerm() and
-  (if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
+private string getCanonicalizationString(RelevantRegExpTerm term) {
+  exists(string ignoreCase |
+    (if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and
+    result = term.getRawValue() + "|" + ignoreCase
+  )
 }
 
 /**
@@ -186,12 +187,19 @@ private newtype TInputSymbol =
   Epsilon()
 
 /**
- * Gets the canonical CharClass for `term`.
+ * Gets the CharClass corresponding to the canonical representative `term`.
  */
-CharClass getCanonicalCharClass(RegExpTerm term) {
+private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
   exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
 }
 
+/**
+ * Gets a char class that represents `term`, even when `term` is not the canonical representative.
+ */
+CharacterClass getCanonicalCharClass(RegExpTerm term) {
+  exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
+}
+
 /**
  * Holds if `a` and `b` are input symbols from the same regexp.
  */
@@ -284,7 +292,7 @@ private module CharacterClasses {
    */
   pragma[noinline]
   predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
-    exists(getCanonicalCharClass(cc)) and
+    exists(getCharClassForCanonicalTerm(cc)) and
     exists(RegExpTerm child | child = cc.getAChild() |
       char = child.(RegexpCharacterConstant).getValue()
       or
@@ -387,7 +395,7 @@ private module CharacterClasses {
   private class PositiveCharacterClass extends CharacterClass {
     RegExpCharacterClass cc;
 
-    PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
+    PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() }
 
     override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
 
@@ -400,7 +408,7 @@ private module CharacterClasses {
   private class InvertedCharacterClass extends CharacterClass {
     RegExpCharacterClass cc;
 
-    InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
+    InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() }
 
     override string getARelevantChar() {
       result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
@@ -435,7 +443,7 @@ private module CharacterClasses {
 
     PositiveCharacterClassEscape() {
       isEscapeClass(cc, charClass) and
-      this = getCanonicalCharClass(cc) and
+      this = getCharClassForCanonicalTerm(cc) and
       charClass = ["d", "s", "w"]
     }
 
@@ -475,7 +483,7 @@ private module CharacterClasses {
     NegativeCharacterClassEscape() {
       exists(RegExpTerm cc |
         isEscapeClass(cc, charClass) and
-        this = getCanonicalCharClass(cc) and
+        this = getCharClassForCanonicalTerm(cc) and
         charClass = ["D", "S", "W"]
       )
     }
@@ -652,17 +660,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl =
-      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
-            getCanonicalizationFlags(cc.getRootTerm()))) and
+    lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
     q2 = after(cc)
   )
   or
   exists(RegExpTerm cc | isEscapeClass(cc, _) |
     q1 = before(cc) and
-    lbl =
-      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
-            getCanonicalizationFlags(cc.getRootTerm()))) and
+    lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
     q2 = after(cc)
   )
   or
diff --git a/javascript/ql/lib/semmle/javascript/security/regexp/NfaUtils.qll b/javascript/ql/lib/semmle/javascript/security/regexp/NfaUtils.qll
@@ -129,19 +129,20 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
     min(RelevantRegExpTerm t, Location loc, File file |
       loc = t.getLocation() and
       file = t.getFile() and
-      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
+      str = getCanonicalizationString(t)
     |
       t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
     )
 }
 
 /**
- * Gets a string representation of the flags used with the regular expression.
- * Only the flags that are relevant for the canonicalization are included.
+ * Gets a string representation of `term` that is used for canonicalization.
  */
-string getCanonicalizationFlags(RegExpTerm root) {
-  root.isRootTerm() and
-  (if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
+private string getCanonicalizationString(RelevantRegExpTerm term) {
+  exists(string ignoreCase |
+    (if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and
+    result = term.getRawValue() + "|" + ignoreCase
+  )
 }
 
 /**
@@ -186,12 +187,19 @@ private newtype TInputSymbol =
   Epsilon()
 
 /**
- * Gets the canonical CharClass for `term`.
+ * Gets the CharClass corresponding to the canonical representative `term`.
  */
-CharClass getCanonicalCharClass(RegExpTerm term) {
+private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
   exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
 }
 
+/**
+ * Gets a char class that represents `term`, even when `term` is not the canonical representative.
+ */
+CharacterClass getCanonicalCharClass(RegExpTerm term) {
+  exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
+}
+
 /**
  * Holds if `a` and `b` are input symbols from the same regexp.
  */
@@ -284,7 +292,7 @@ private module CharacterClasses {
    */
   pragma[noinline]
   predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
-    exists(getCanonicalCharClass(cc)) and
+    exists(getCharClassForCanonicalTerm(cc)) and
     exists(RegExpTerm child | child = cc.getAChild() |
       char = child.(RegexpCharacterConstant).getValue()
       or
@@ -387,7 +395,7 @@ private module CharacterClasses {
   private class PositiveCharacterClass extends CharacterClass {
     RegExpCharacterClass cc;
 
-    PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
+    PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() }
 
     override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
 
@@ -400,7 +408,7 @@ private module CharacterClasses {
   private class InvertedCharacterClass extends CharacterClass {
     RegExpCharacterClass cc;
 
-    InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
+    InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() }
 
     override string getARelevantChar() {
       result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
@@ -435,7 +443,7 @@ private module CharacterClasses {
 
     PositiveCharacterClassEscape() {
       isEscapeClass(cc, charClass) and
-      this = getCanonicalCharClass(cc) and
+      this = getCharClassForCanonicalTerm(cc) and
       charClass = ["d", "s", "w"]
     }
 
@@ -475,7 +483,7 @@ private module CharacterClasses {
     NegativeCharacterClassEscape() {
       exists(RegExpTerm cc |
         isEscapeClass(cc, charClass) and
-        this = getCanonicalCharClass(cc) and
+        this = getCharClassForCanonicalTerm(cc) and
         charClass = ["D", "S", "W"]
       )
     }
@@ -652,17 +660,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl =
-      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
-            getCanonicalizationFlags(cc.getRootTerm()))) and
+    lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
     q2 = after(cc)
   )
   or
   exists(RegExpTerm cc | isEscapeClass(cc, _) |
     q1 = before(cc) and
-    lbl =
-      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
-            getCanonicalizationFlags(cc.getRootTerm()))) and
+    lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
     q2 = after(cc)
   )
   or
diff --git a/javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/IncompleteMultiCharacterSanitization.expected b/javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/IncompleteMultiCharacterSanitization.expected
@@ -37,3 +37,5 @@
 | tst-multi-character-sanitization.js:143:13:143:56 | content ... /g, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:143:30:143:30 | < | <script |
 | tst-multi-character-sanitization.js:144:13:144:91 | content ... /g, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:144:30:144:30 | < | <script |
 | tst-multi-character-sanitization.js:145:13:145:90 | content ... /g, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:145:30:145:30 | < | <script |
+| tst-multi-character-sanitization.js:148:3:148:99 | n.clone ... gi, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:148:41:148:41 | < | <script |
+| tst-multi-character-sanitization.js:152:3:152:99 | n.clone ... gi, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:152:41:152:41 | < | <script |
diff --git a/javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/tst-multi-character-sanitization.js b/javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/tst-multi-character-sanitization.js
@@ -144,4 +144,12 @@
   content = content.replace(/<(script|iframe|video)[\s\S]*?<\/(script|iframe|video)>/g, '') // NOT OK
   content = content.replace(/<(script|iframe|video)(.|\s)*?\/(script|iframe|video)>/g, '') // NOT OK
   content = content.replace(/<[^<]*>/g, ""); // OK
+
+  n.cloneNode(false).outerHTML.replace(/<\/?[\w:\-]+ ?|=[\"][^\"]+\"|=\'[^\']+\'|=[\w\-]+|>/gi, '').replace(/[\w:\-]+/gi, function(a) { // NOT OK
+    o.push({specified : 1, nodeName : a});
+  });
+
+  n.cloneNode(false).outerHTML.replace(/<\/?[\w:\-]+ ?|=[\"][^\"]+\"|=\'[^\']+\'|=[\w\-]+|>/gi, '').replace(/[\w:\-]+/gi, function(a) { // NOT OK
+    o.push({specified : 1, nodeName : a});
+  });  
 });
diff --git a/python/ql/lib/semmle/python/security/regexp/NfaUtils.qll b/python/ql/lib/semmle/python/security/regexp/NfaUtils.qll
@@ -129,19 +129,20 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
     min(RelevantRegExpTerm t, Location loc, File file |
       loc = t.getLocation() and
       file = t.getFile() and
-      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
+      str = getCanonicalizationString(t)
     |
       t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
     )
 }
 
 /**
- * Gets a string representation of the flags used with the regular expression.
- * Only the flags that are relevant for the canonicalization are included.
+ * Gets a string representation of `term` that is used for canonicalization.
  */
-string getCanonicalizationFlags(RegExpTerm root) {
-  root.isRootTerm() and
-  (if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
+private string getCanonicalizationString(RelevantRegExpTerm term) {
+  exists(string ignoreCase |
+    (if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and
+    result = term.getRawValue() + "|" + ignoreCase
+  )
 }
 
 /**
@@ -186,12 +187,19 @@ private newtype TInputSymbol =
   Epsilon()
 
 /**
- * Gets the canonical CharClass for `term`.
+ * Gets the CharClass corresponding to the canonical representative `term`.
  */
-CharClass getCanonicalCharClass(RegExpTerm term) {
+private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
   exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
 }
 
+/**
+ * Gets a char class that represents `term`, even when `term` is not the canonical representative.
+ */
+CharacterClass getCanonicalCharClass(RegExpTerm term) {
+  exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
+}
+
 /**
  * Holds if `a` and `b` are input symbols from the same regexp.
  */
@@ -284,7 +292,7 @@ private module CharacterClasses {
    */
   pragma[noinline]
   predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
-    exists(getCanonicalCharClass(cc)) and
+    exists(getCharClassForCanonicalTerm(cc)) and
     exists(RegExpTerm child | child = cc.getAChild() |
       char = child.(RegexpCharacterConstant).getValue()
       or
@@ -387,7 +395,7 @@ private module CharacterClasses {
   private class PositiveCharacterClass extends CharacterClass {
     RegExpCharacterClass cc;
 
-    PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
+    PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() }
 
     override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
 
@@ -400,7 +408,7 @@ private module CharacterClasses {
   private class InvertedCharacterClass extends CharacterClass {
     RegExpCharacterClass cc;
 
-    InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
+    InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() }
 
     override string getARelevantChar() {
       result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
@@ -435,7 +443,7 @@ private module CharacterClasses {
 
     PositiveCharacterClassEscape() {
       isEscapeClass(cc, charClass) and
-      this = getCanonicalCharClass(cc) and
+      this = getCharClassForCanonicalTerm(cc) and
       charClass = ["d", "s", "w"]
     }
 
@@ -475,7 +483,7 @@ private module CharacterClasses {
     NegativeCharacterClassEscape() {
       exists(RegExpTerm cc |
         isEscapeClass(cc, charClass) and
-        this = getCanonicalCharClass(cc) and
+        this = getCharClassForCanonicalTerm(cc) and
         charClass = ["D", "S", "W"]
       )
     }
@@ -652,17 +660,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl =
-      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
-            getCanonicalizationFlags(cc.getRootTerm()))) and
+    lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
     q2 = after(cc)
   )
   or
   exists(RegExpTerm cc | isEscapeClass(cc, _) |
     q1 = before(cc) and
-    lbl =
-      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
-            getCanonicalizationFlags(cc.getRootTerm()))) and
+    lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
     q2 = after(cc)
   )
   or
diff --git a/ruby/ql/lib/codeql/ruby/security/regexp/NfaUtils.qll b/ruby/ql/lib/codeql/ruby/security/regexp/NfaUtils.qll