Merge pull request #7120 from github/nickrolfe/regexp_g_anchor

nickrolfe · web-flow · commit 5a2ef8321ce7 · 2021-12-03T15:24:38.000Z
Ruby/Python: parse anchors in regexes as special characters
diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -539,8 +539,8 @@ private int toHex(string hex) {
 /**
  * A word boundary, that is, a regular expression term of the form `\b`.
  */
-class RegExpWordBoundary extends RegExpEscape {
-  RegExpWordBoundary() { this.getUnescaped() = "b" }
+class RegExpWordBoundary extends RegExpSpecialChar {
+  RegExpWordBoundary() { this.getChar() = "\\b" }
 }
 
 /**
@@ -809,7 +809,7 @@ class RegExpDot extends RegExpSpecialChar {
 }
 
 /**
- * A dollar assertion `$` matching the end of a line.
+ * A dollar assertion `$` or `\Z`, matching the end of a line or the end of the string.
  *
  * Example:
  *
@@ -818,13 +818,13 @@ class RegExpDot extends RegExpSpecialChar {
  * ```
  */
 class RegExpDollar extends RegExpSpecialChar {
-  RegExpDollar() { this.getChar() = "$" }
+  RegExpDollar() { this.getChar() = ["$", "\\Z"] }
 
   override string getPrimaryQLClass() { result = "RegExpDollar" }
 }
 
 /**
- * A caret assertion `^` matching the beginning of a line.
+ * A caret assertion `^` or `\A`, matching the beginning of a line or the beginning of the string.
  *
  * Example:
  *
@@ -833,7 +833,7 @@ class RegExpDollar extends RegExpSpecialChar {
  * ```
  */
 class RegExpCaret extends RegExpSpecialChar {
-  RegExpCaret() { this.getChar() = "^" }
+  RegExpCaret() { this.getChar() = ["^", "\\A"] }
 
   override string getPrimaryQLClass() { result = "RegExpCaret" }
 }
diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll
@@ -437,11 +437,18 @@ abstract class RegexString extends Expr {
   }
 
   predicate specialCharacter(int start, int end, string char) {
+    not this.inCharSet(start) and
     this.character(start, end) and
-    end = start + 1 and
-    char = this.getChar(start) and
-    (char = "$" or char = "^" or char = ".") and
-    not this.inCharSet(start)
+    (
+      end = start + 1 and
+      char = this.getChar(start) and
+      (char = "$" or char = "^" or char = ".")
+      or
+      end = start + 2 and
+      this.escapingChar(start) and
+      char = this.getText().substring(start, end) and
+      char = ["\\A", "\\Z", "\\b", "\\B"]
+    )
   }
 
   /** Whether the text in the range start,end is a group */
@@ -901,7 +908,8 @@ abstract class RegexString extends Expr {
     exists(int x | this.firstPart(x, end) |
       this.emptyMatchAtStartGroup(x, start) or
       this.qualifiedItem(x, start, true, _) or
-      this.specialCharacter(x, start, "^")
+      // ^ and \A match the start of the string.
+      this.specialCharacter(x, start, ["^", "\\A"])
     )
     or
     exists(int y | this.firstPart(start, y) |
@@ -926,9 +934,8 @@ abstract class RegexString extends Expr {
       or
       this.qualifiedItem(end, y, true, _)
       or
-      this.specialCharacter(end, y, "$")
-      or
-      y = end + 2 and this.escapingChar(end) and this.getChar(end + 1) = "Z"
+      // $ and \Z match the end of the string.
+      this.specialCharacter(end, y, ["$", "\\Z"])
     )
     or
     exists(int x |
diff --git a/python/ql/test/library-tests/regex/Characters.expected b/python/ql/test/library-tests/regex/Characters.expected
@@ -58,6 +58,11 @@
 | \\A[+-]?\\d+ | 3 | 4 |
 | \\A[+-]?\\d+ | 4 | 5 |
 | \\A[+-]?\\d+ | 7 | 9 |
+| \\Afoo\\Z | 0 | 2 |
+| \\Afoo\\Z | 2 | 3 |
+| \\Afoo\\Z | 3 | 4 |
+| \\Afoo\\Z | 4 | 5 |
+| \\Afoo\\Z | 5 | 7 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 0 | 2 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 12 | 13 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 16 | 18 |
@@ -71,6 +76,11 @@
 | \\\|\\[\\][123]\|\\{\\} | 9 | 10 |
 | \\\|\\[\\][123]\|\\{\\} | 12 | 14 |
 | \\\|\\[\\][123]\|\\{\\} | 14 | 16 |
+| \\bfoo\\B | 0 | 2 |
+| \\bfoo\\B | 2 | 3 |
+| \\bfoo\\B | 3 | 4 |
+| \\bfoo\\B | 4 | 5 |
+| \\bfoo\\B | 5 | 7 |
 | \|x | 1 | 2 |
 | ^(^y\|^z)(u$\|v$)$ | 0 | 1 |
 | ^(^y\|^z)(u$\|v$)$ | 2 | 3 |
diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected
@@ -45,15 +45,25 @@
 | \\+0 | first | 0 | 2 |
 | \\+0 | last | 2 | 3 |
 | \\A[+-]?\\d+ | first | 0 | 2 |
+| \\A[+-]?\\d+ | first | 2 | 6 |
+| \\A[+-]?\\d+ | first | 2 | 7 |
+| \\A[+-]?\\d+ | first | 7 | 9 |
+| \\A[+-]?\\d+ | first | 7 | 10 |
 | \\A[+-]?\\d+ | last | 7 | 9 |
 | \\A[+-]?\\d+ | last | 7 | 10 |
+| \\Afoo\\Z | first | 0 | 2 |
+| \\Afoo\\Z | first | 2 | 3 |
+| \\Afoo\\Z | last | 4 | 5 |
+| \\Afoo\\Z | last | 5 | 7 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 33 |
 | \\\|\\[\\][123]\|\\{\\} | first | 0 | 2 |
 | \\\|\\[\\][123]\|\\{\\} | first | 12 | 14 |
 | \\\|\\[\\][123]\|\\{\\} | last | 6 | 11 |
 | \\\|\\[\\][123]\|\\{\\} | last | 14 | 16 |
+| \\bfoo\\B | first | 0 | 2 |
+| \\bfoo\\B | last | 5 | 7 |
 | \|x | first | 1 | 2 |
 | \|x | last | 1 | 2 |
 | ^(^y\|^z)(u$\|v$)$ | first | 0 | 1 |
diff --git a/python/ql/test/library-tests/regex/Regex.expected b/python/ql/test/library-tests/regex/Regex.expected
@@ -116,14 +116,20 @@
 | \\+0 | char | 0 | 2 |
 | \\+0 | char | 2 | 3 |
 | \\+0 | sequence | 0 | 3 |
-| \\A[+-]?\\d+ | char | 0 | 2 |
+| \\A[+-]?\\d+ | \\A | 0 | 2 |
 | \\A[+-]?\\d+ | char | 3 | 4 |
 | \\A[+-]?\\d+ | char | 4 | 5 |
 | \\A[+-]?\\d+ | char | 7 | 9 |
 | \\A[+-]?\\d+ | char-set | 2 | 6 |
 | \\A[+-]?\\d+ | qualified | 2 | 7 |
 | \\A[+-]?\\d+ | qualified | 7 | 10 |
 | \\A[+-]?\\d+ | sequence | 0 | 10 |
+| \\Afoo\\Z | \\A | 0 | 2 |
+| \\Afoo\\Z | \\Z | 5 | 7 |
+| \\Afoo\\Z | char | 2 | 3 |
+| \\Afoo\\Z | char | 3 | 4 |
+| \\Afoo\\Z | char | 4 | 5 |
+| \\Afoo\\Z | sequence | 0 | 7 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | char | 0 | 2 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | char | 12 | 13 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | char | 16 | 18 |
@@ -148,6 +154,12 @@
 | \\\|\\[\\][123]\|\\{\\} | choice | 0 | 16 |
 | \\\|\\[\\][123]\|\\{\\} | sequence | 0 | 11 |
 | \\\|\\[\\][123]\|\\{\\} | sequence | 12 | 16 |
+| \\bfoo\\B | \\B | 5 | 7 |
+| \\bfoo\\B | \\b | 0 | 2 |
+| \\bfoo\\B | char | 2 | 3 |
+| \\bfoo\\B | char | 3 | 4 |
+| \\bfoo\\B | char | 4 | 5 |
+| \\bfoo\\B | sequence | 0 | 7 |
 | \|x | char | 1 | 2 |
 | \|x | choice | 0 | 2 |
 | \|x | sequence | 1 | 2 |
diff --git a/python/ql/test/library-tests/regex/test.py b/python/ql/test/library-tests/regex/test.py
@@ -73,3 +73,7 @@
 
 # Consistency check
 baz = re.compile(r'\+0')
+
+# Anchors
+re.compile(r'\Afoo\Z')
+re.compile(r'\bfoo\B')
diff --git a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@@ -100,5 +100,8 @@
 | redos.py:371:25:371:35 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
 | redos.py:380:35:380:41 | [^"\\s]+ | This part of the regular expression may cause exponential backtracking on strings starting with '/' and containing many repetitions of '!'. |
 | redos.py:381:35:381:41 | [^"\\s]+ | This part of the regular expression may cause exponential backtracking on strings starting with '/' and containing many repetitions of '!'. |
+| redos.py:384:26:384:32 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
+| redos.py:385:24:385:30 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
+| redos.py:386:26:386:32 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
 | unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
diff --git a/python/ql/test/query-tests/Security/CWE-730-ReDoS/redos.py b/python/ql/test/query-tests/Security/CWE-730-ReDoS/redos.py
@@ -378,4 +378,9 @@
 
 # BAD
 bad88 = re.compile(r'/("[^"]*?"|[^"\s]+)+(?=\s*|\s*$)X')
-bad89 = re.compile(r'/("[^"]*?"|[^"\s]+)+(?=X)')
+bad89 = re.compile(r'/("[^"]*?"|[^"\s]+)+(?=X)')
+
+# BAD
+bad90 = re.compile(r'\A(\d|0)*x')
+bad91 = re.compile(r'(\d|0)*\Z')
+bad92 = re.compile(r'\b(\d|0)*x')
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
@@ -397,7 +397,7 @@ class RegExp extends AST::RegExpLiteral {
       end = start + 2 and
       this.escapingChar(start) and
       char = this.getText().substring(start, end) and
-      char = ["\\A", "\\Z", "\\z"]
+      char = ["\\A", "\\Z", "\\z", "\\G", "\\b", "\\B"]
     )
   }
 
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
@@ -462,8 +462,8 @@ private int toHex(string hex) {
 /**
  * A word boundary, that is, a regular expression term of the form `\b`.
  */
-class RegExpWordBoundary extends RegExpEscape {
-  RegExpWordBoundary() { this.getUnescaped() = "b" }
+class RegExpWordBoundary extends RegExpSpecialChar {
+  RegExpWordBoundary() { this.getChar() = "\\b" }
 }
 
 /**
diff --git a/ruby/ql/test/library-tests/regexp/parse.expected b/ruby/ql/test/library-tests/regexp/parse.expected
diff --git a/ruby/ql/test/library-tests/regexp/regexp.rb b/ruby/ql/test/library-tests/regexp/regexp.rb
diff --git a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
diff --git a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/tst.rb b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/tst.rb

Original file line number	Diff line number	Diff line change
`@@ -539,8 +539,8 @@ private int toHex(string hex) {`
`539`	`539`	`/**`
`540`	`540`	* A word boundary, that is, a regular expression term of the form `\b`.
`541`	`541`	`*/`
`542`		`-class RegExpWordBoundary extends RegExpEscape {`
`543`		`- RegExpWordBoundary() { this.getUnescaped() = "b" }`
	`542`	`+class RegExpWordBoundary extends RegExpSpecialChar {`
	`543`	`+ RegExpWordBoundary() { this.getChar() = "\\b" }`
`544`	`544`	`}`
`545`	`545`
`546`	`546`	`/**`
`@@ -809,7 +809,7 @@ class RegExpDot extends RegExpSpecialChar {`
`809`	`809`	`}`
`810`	`810`
`811`	`811`	`/**`
`812`		- * A dollar assertion `$` matching the end of a line.
	`812`	+ * A dollar assertion `$` or `\Z`, matching the end of a line or the end of the string.
`813`	`813`	`*`
`814`	`814`	`* Example:`
`815`	`815`	`*`
`@@ -818,13 +818,13 @@ class RegExpDot extends RegExpSpecialChar {`
`818`	`818`	* ```
`819`	`819`	`*/`
`820`	`820`	`class RegExpDollar extends RegExpSpecialChar {`
`821`		`- RegExpDollar() { this.getChar() = "$" }`
	`821`	`+ RegExpDollar() { this.getChar() = ["$", "\\Z"] }`
`822`	`822`
`823`	`823`	`override string getPrimaryQLClass() { result = "RegExpDollar" }`
`824`	`824`	`}`
`825`	`825`
`826`	`826`	`/**`
`827`		- * A caret assertion `^` matching the beginning of a line.
	`827`	+ * A caret assertion `^` or `\A`, matching the beginning of a line or the beginning of the string.
`828`	`828`	`*`
`829`	`829`	`* Example:`
`830`	`830`	`*`
`@@ -833,7 +833,7 @@ class RegExpDollar extends RegExpSpecialChar {`
`833`	`833`	* ```
`834`	`834`	`*/`
`835`	`835`	`class RegExpCaret extends RegExpSpecialChar {`
`836`		`- RegExpCaret() { this.getChar() = "^" }`
	`836`	`+ RegExpCaret() { this.getChar() = ["^", "\\A"] }`
`837`	`837`
`838`	`838`	`override string getPrimaryQLClass() { result = "RegExpCaret" }`
`839`	`839`	`}`
Original file line number	Diff line number	Diff line change
`@@ -397,7 +397,7 @@ class RegExp extends AST::RegExpLiteral {`
`397`	`397`	`end = start + 2 and`
`398`	`398`	`this.escapingChar(start) and`
`399`	`399`	`char = this.getText().substring(start, end) and`
`400`		`- char = ["\\A", "\\Z", "\\z"]`
	`400`	`+ char = ["\\A", "\\Z", "\\z", "\\G", "\\b", "\\B"]`
`401`	`401`	`)`
`402`	`402`	`}`
`403`	`403`