rptest: switch to ripgrep in log search

travisdowns · travisdowns · commit 537009d6e3c3 · 2025-11-25T09:59:04.000-03:00
Use ripgrep (rg) instead of grep for log searching in rptest. In my benchmark this gives about a 3x speedup when searching through 3 logs in parallel, where the logs were produced by creating 10 topics of 1000 partitions each. The benchmark is also included in this change, though it is marked @ignore so it does not run in CI (it takes a minute or so). The switch requires translating GNU BRE (used by grep by default) to ERE (used by grep -E, and ripgrep). It was probably a mistake to use BRE in the first place, but it is what it is.
diff --git a/tests/rptest/services/utils.py b/tests/rptest/services/utils.py
@@ -132,7 +132,7 @@ class LogSearch(ABC):
         "Exceptional future ignored",
         "UndefinedBehaviorSanitizer",
         "Aborting on shard",
-        "libc++abi: terminating due to uncaught exception",
+        "terminating due to uncaught exception",
         "oversized allocation",
     ]
 
@@ -157,7 +157,10 @@ def __init__(
 
     @abstractmethod
     def _capture_log(self, node: Any, expr: str) -> Generator[str, None, None]:
-        """Method to get log from host node. Overriden by each child."""
+        """Method to get log from host node. Overridden by each child.
+
+        expr is a GNU BRE regex (i.e., the default grep regex style), which means
+        you must escape characters like +() to give them their metacharacter meaning."""
         # Fake return type for type hint silence
         # And proper handling when called directly
         yield from []
@@ -248,6 +251,63 @@ def search_logs(self, versioned_nodes: VersionedNodes) -> None:
             raise BadLogLines(bad_loglines)
 
 
+def _gnu_bre_to_ere(bre_pattern: str) -> str:
+    r"""
+    Convert a GNU Basic Regular Expression (BRE) to a GNU Extended Regular
+    Expression (ERE).
+
+    This function handles two main differences between GNU BRE and ERE:
+    1.  In BRE, `(`, `)`, `{`, `}`, `+`, `?`, and `|` are literal characters,
+        whereas in ERE they are special metacharacters. To treat them as
+        literals in ERE, they must be escaped with a backslash.
+    2.  In BRE, the escaped versions `\(`, `\)`, `\{`, `\}`, `\+`, `\?`, and
+        `\|` have special meanings (grouping, intervals, etc.), while in ERE,
+        the unescaped versions have these special meanings.
+
+    The conversion is performed by iterating through the BRE pattern and
+    applying the following rules:
+    - Unescaped `(`, `)`, `{`, `}`, `+`, `?`, `|` are escaped.
+    - Escaped `\(`, `\)`, `\{`, `\}`, `\+`, `\?`, `\|` are unescaped.
+    - Other characters, including other escaped characters (e.g., `\.`, `\*`),
+      are kept as they are.
+    - The logic correctly handles double backslashes (`\\`), ensuring they
+      are preserved.
+    """
+
+    # these are metacharacters in both ERE and GNU BRE but in BRE
+    # they must be escaped to have their metacharacter meaning
+    BRE_ESCAPED_METACHARACTERS = set("(){}+?|")
+
+    ere_pattern = ""
+    i = 0
+    while i < len(bre_pattern):
+        char = bre_pattern[i]
+        if char == "\\":
+            if i + 1 < len(bre_pattern):
+                next_char = bre_pattern[i + 1]
+                if next_char in BRE_ESCAPED_METACHARACTERS:
+                    # Unescape BRE metacharacters to become ERE metacharacters
+                    ere_pattern += next_char
+                    i += 2
+                else:
+                    # Keep other escaped characters as they are (e.g., \\, \*, \.)
+                    ere_pattern += char + next_char
+                    i += 2
+            else:
+                # Trailing backslash
+                ere_pattern += char
+                i += 1
+        elif char in BRE_ESCAPED_METACHARACTERS:
+            # Escape ERE metacharacters that are literals in BRE
+            ere_pattern += "\\" + char
+            i += 1
+        else:
+            # Keep all other characters
+            ere_pattern += char
+            i += 1
+    return ere_pattern
+
+
 class LogSearchLocal(LogSearch):
     def __init__(
         self,
@@ -260,7 +320,10 @@ def __init__(
         self.targetpath = targetpath
 
     def _capture_log(self, node: ClusterNode, expr: str) -> Generator[str, None, None]:
-        cmd = f"grep {expr} {self.targetpath} || true"
+        if not expr.startswith("-P"):
+            # some naughty tests use this to force grep/rg to use PCRE
+            expr = _gnu_bre_to_ere(expr)
+        cmd = f"rg {expr} {self.targetpath} || true"
         for line in node.account.ssh_capture(cmd):
             yield line
 
diff --git a/tests/rptest/tests/services_self_test.py b/tests/rptest/tests/services_self_test.py
@@ -11,10 +11,11 @@
 import signal
 from subprocess import CalledProcessError
 from typing import Any, Callable, Iterator
+import time
 
 from ducktape.cluster.cluster import ClusterNode
 from ducktape.cluster.remoteaccount import RemoteCommandError
-from ducktape.mark import matrix
+from ducktape.mark import matrix, ignore
 from ducktape.mark.resource import cluster as dt_cluster
 from ducktape.tests.test import Test, TestContext
 
@@ -667,6 +668,43 @@ def validate_exception(e: BadLogLines) -> bool:
         with expect_exception(BadLogLines, validate_exception):
             self.redpanda.raise_on_bad_logs(allow_list=[])
 
+    @ignore
+    @cluster(num_nodes=3, check_allowed_error_logs=False)
+    def test_bll_bench(self):
+        """
+        Benchmark raise_on_bad_logs: create and delete ten 1000-partition
+        topics, then time a single log scan and log the elapsed seconds.
+
+        Ignored by default since we don't want to run benchmarks in CI.
+        """
+        # create and delete a 1000-partition topic 10 times
+        rpk = RpkTool(self.redpanda)
+
+        parts = 1000
+
+        for i in range(10):
+            topic_name = f"bll_bench_{i}"
+
+            def _all_partitions_present():
+                try:
+                    desc = list(rpk.describe_topic(topic_name))
+                    return len(desc) == parts
+                except Exception:
+                    return False
+
+            # 1000 partitions, replicated 3x (matches replicas=3 below; TODO confirm 3 rather than 1 is intended)
+            rpk.create_topic(topic_name, partitions=parts, replicas=3)
+            self.redpanda.wait_until(
+                _all_partitions_present, timeout_sec=30, backoff_sec=1
+            )
+            rpk.delete_topic(topic_name)
+            self.logger.warning(f"c d topic {i}")
+
+        start = time.time()
+        self.redpanda.raise_on_bad_logs(allow_list=[])
+        elapsed = time.time() - start
+        self.logger.warning(f"raise_on_bad_logs elapsed {elapsed:.3f}s")
+
 
 class RedpandaServiceSelfRawTest(Test):
     """This 'raw' test inherits only from Test, so that internally it