Skip to content

Commit 468ecf7

Browse files
alexmojaki and basepi authored
Optimise handling PostgreSQL dollar quotes (#1905)
* Optimise handling PostgreSQL dollar quotes * CHANGELOG --------- Co-authored-by: Colton Myers <[email protected]>
1 parent f216566 commit 468ecf7

File tree

3 files changed

+56
-24
lines changed

3 files changed

+56
-24
lines changed

CHANGELOG.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ endif::[]
3737
===== Features
3838
3939
* Collect the `configured_hostname` and `detected_hostname` separately, and switch to FQDN for the `detected_hostname`. {pull}1891[#1891]
40+
* Improve postgres dollar-quote detection to be much faster {pull}1905[#1905]
4041
4142
[float]
4243
===== Bug fixes

elasticapm/instrumentation/packages/dbapi2.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -54,21 +54,6 @@ def __repr__(self):
5454
return "<Literal {}{}{}>".format(self.literal_type, self.content, self.literal_type)
5555

5656

57-
def skip_to(start, tokens, value_sequence):
58-
i = start
59-
while i < len(tokens):
60-
for idx, token in enumerate(value_sequence):
61-
if tokens[i + idx] != token:
62-
break
63-
else:
64-
# Match
65-
return tokens[start : i + len(value_sequence)]
66-
i += 1
67-
68-
# Not found
69-
return None
70-
71-
7257
def look_for_table(sql, keyword):
7358
tokens = tokenize(sql)
7459
table_name = _scan_for_table_with_tokens(tokens, keyword)
@@ -109,7 +94,6 @@ def scan(tokens):
10994
prev_was_escape = False
11095
lexeme.append(token)
11196
else:
112-
11397
if token == literal_started:
11498
if literal_started == "'" and len(tokens) > i + 1 and tokens[i + 1] == "'": # double quotes
11599
i += 1
@@ -133,14 +117,30 @@ def scan(tokens):
133117
# Postgres can use arbitrary characters between two $'s as a
134118
# literal separation token, e.g.: $fish$ literal $fish$
135119
# This part will detect that and skip over the literal.
136-
skipped_token = skip_to(i + 1, tokens, "$")
137-
if skipped_token is not None:
138-
dollar_token = ["$"] + skipped_token
139-
140-
skipped = skip_to(i + len(dollar_token), tokens, dollar_token)
141-
if skipped: # end wasn't found.
142-
yield i, Literal("".join(dollar_token), "".join(skipped[: -len(dollar_token)]))
143-
i = i + len(skipped) + len(dollar_token)
120+
try:
121+
# Closing dollar of the opening quote,
122+
# i.e. the second $ in the first $fish$
123+
closing_dollar_idx = tokens.index("$", i + 1)
124+
except ValueError:
125+
pass
126+
else:
127+
quote = tokens[i : closing_dollar_idx + 1]
128+
length = len(quote)
129+
# Opening dollar of the closing quote,
130+
# i.e. the first $ in the second $fish$
131+
closing_quote_idx = closing_dollar_idx + 1
132+
while True:
133+
try:
134+
closing_quote_idx = tokens.index("$", closing_quote_idx)
135+
except ValueError:
136+
break
137+
if tokens[closing_quote_idx : closing_quote_idx + length] == quote:
138+
yield i, Literal(
139+
"".join(quote), "".join(tokens[closing_dollar_idx + 1 : closing_quote_idx])
140+
)
141+
i = closing_quote_idx + length
142+
break
143+
closing_quote_idx += 1
144144
else:
145145
if token != " ":
146146
yield i, token

tests/instrumentation/dbapi2_tests.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
2828
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2929
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+
import pytest
3031

3132
from elasticapm.instrumentation.packages.dbapi2 import Literal, extract_signature, scan, tokenize
3233

@@ -71,6 +72,36 @@ def test_scan_double_quotes_at_end():
7172
assert actual == expected
7273

7374

75+
@pytest.mark.parametrize("quote", ["$$", "$q$"])
76+
@pytest.mark.parametrize(
77+
"content",
78+
[
79+
"",
80+
"q",
81+
"Peter q Pan",
82+
"Peter $ Pan",
83+
"Peter $q Pan",
84+
"Peter q$ Pan",
85+
"Peter $q q$ $q q$ Pan Peter $q q$ $q q$ Pan",
86+
"Peter $qq$ Pan",
87+
],
88+
)
89+
def test_scan_dollar_quote(quote, content):
90+
sql = f"Hello {quote}{content}{quote} at Disney World"
91+
tokens = tokenize(sql)
92+
actual = [t[1] for t in scan(tokens)]
93+
expected = ["Hello", Literal(quote, content), "at", "Disney", "World"]
94+
assert actual == expected
95+
96+
97+
def test_dollar_quote_containing_double_dollar():
98+
sql = "Hello $q$Peter $$ Pan$q$ at Disney World"
99+
tokens = tokenize(sql)
100+
actual = [t[1] for t in scan(tokens)]
101+
expected = ["Hello", Literal("$q$", "Peter $$ Pan"), "at", "Disney", "World"]
102+
assert actual == expected
103+
104+
74105
def test_extract_signature_string():
75106
sql = "Hello 'Peter Pan' at Disney World"
76107
actual = extract_signature(sql)

0 commit comments

Comments (0)