⚡️ Speed up function _strip_ansi by 116% in PR #217 (proper-cleanup)

codeflash-ai[bot] · web-flow · commit aa2e25463993 · 2025-05-19T04:39:50.000Z
Here is an optimized version of `_strip_ansi` focusing on runtime speed. The main bottleneck is the regular expression replacement in `re.sub`, specifically returning the (possibly empty) group 4 (link text in hyperlinks) or, if it doesn’t match, an empty string for ANSI codes.  
This can be significantly sped up by avoiding the costly `r"\4"` (which always triggers group resolution machinery in the regex engine), and instead using a faster replacer callback.  
Since every match will be either an ANSI escape code (where group 4 is `None`) or a hyperlink (where group 4 contains the visible link text), we can handle both cases in one simple function.

Optimized `code`.



**Performance notes:**  
- This avoids all regex group substitution machinery for the common ANSI case.
- No change to visible/functional behavior.
- No changes to external function names or signatures.
- String and bytes cases are handled separately, so no unnecessary type checks inside tight loops.

**Comment:**  
No original comments were changed or removed.  
No changes made to the public interface or expected output.  
All logic concerning group 4 and escape sequence removal is preserved.
diff --git a/codeflash/code_utils/tabulate.py b/codeflash/code_utils/tabulate.py
@@ -61,7 +61,8 @@ def _is_separating_line_value(value):
 def _is_separating_line(row):
     row_type = type(row)
     is_sl = (row_type == list or row_type == str) and (
-        (len(row) >= 1 and _is_separating_line_value(row[0])) or (len(row) >= 2 and _is_separating_line_value(row[1]))
+        (len(row) >= 1 and _is_separating_line_value(row[0]))
+        or (len(row) >= 2 and _is_separating_line_value(row[1]))
     )
 
     return is_sl
@@ -151,7 +152,9 @@ def _pipe_line_with_colons(colwidths, colaligns):
 _ansi_codes_bytes = re.compile(_ansi_escape_pat.encode("utf8"), re.VERBOSE)
 _ansi_color_reset_code = "\033[0m"
 
-_float_with_thousands_separators = re.compile(r"^(([+-]?[0-9]{1,3})(?:,([0-9]{3}))*)?(?(1)\.[0-9]*|\.[0-9]+)?$")
+_float_with_thousands_separators = re.compile(
+    r"^(([+-]?[0-9]{1,3})(?:,([0-9]{3}))*)?(?(1)\.[0-9]*|\.[0-9]+)?$"
+)
 
 
 def _isnumber_with_thousands_separator(string):
@@ -200,12 +203,16 @@ def _isint(string, inttype=int):
             (hasattr(string, "is_integer") or hasattr(string, "__array__"))
             and str(type(string)).startswith("<class 'numpy.int")
         )  # numpy.int64 and similar
-        or (isinstance(string, (bytes, str)) and _isconvertible(inttype, string))  # integer as string
+        or (
+            isinstance(string, (bytes, str)) and _isconvertible(inttype, string)
+        )  # integer as string
     )
 
 
 def _isbool(string):
-    return type(string) is bool or (isinstance(string, (bytes, str)) and string in {"True", "False"})
+    return type(string) is bool or (
+        isinstance(string, (bytes, str)) and string in {"True", "False"}
+    )
 
 
 def _type(string, has_invisible=True, numparse=True):
@@ -219,10 +226,18 @@ def _type(string, has_invisible=True, numparse=True):
     if _isbool(string):
         return bool
     if numparse and (
-        _isint(string) or (isinstance(string, str) and _isnumber_with_thousands_separator(string) and "." not in string)
+        _isint(string)
+        or (
+            isinstance(string, str)
+            and _isnumber_with_thousands_separator(string)
+            and "." not in string
+        )
     ):
         return int
-    if numparse and (_isnumber(string) or (isinstance(string, str) and _isnumber_with_thousands_separator(string))):
+    if numparse and (
+        _isnumber(string)
+        or (isinstance(string, str) and _isnumber_with_thousands_separator(string))
+    ):
         return float
     if isinstance(string, bytes):
         return bytes
@@ -261,10 +276,23 @@ def _padnone(ignore_width, s):
 
 
 def _strip_ansi(s):
+    # Use a custom replacer instead of r"\4" for much faster performance.
+    def _replacer(match):
+        return (
+            match.group(4) or ""
+        )  # Only the hyperlink's visible text, or remove the escape.
+
     if isinstance(s, str):
-        return _ansi_codes.sub(r"\4", s)
+        return _ansi_codes.sub(_replacer, s)
+
     # a bytestring
-    return _ansi_codes_bytes.sub(r"\4", s)
+    def _replacer_bytes(match):
+        g = match.group(4)
+        return (
+            g if g else b""
+        )  # Only the hyperlink's visible text, or remove the escape.
+
+    return _ansi_codes_bytes.sub(_replacer_bytes, s)
 
 
 def _visible_width(s):
@@ -365,19 +393,29 @@ def _align_column(
     is_multiline=False,
     preserve_whitespace=False,
 ):
-    strings, padfn = _align_column_choose_padfn(strings, alignment, has_invisible, preserve_whitespace)
-    width_fn = _align_column_choose_width_fn(has_invisible, enable_widechars, is_multiline)
+    strings, padfn = _align_column_choose_padfn(
+        strings, alignment, has_invisible, preserve_whitespace
+    )
+    width_fn = _align_column_choose_width_fn(
+        has_invisible, enable_widechars, is_multiline
+    )
 
     s_widths = list(map(width_fn, strings))
     maxwidth = max(max(_flat_list(s_widths)), minwidth)
     # TODO: refactor column alignment in single-line and multiline modes
     if is_multiline:
         if not enable_widechars and not has_invisible:
-            padded_strings = ["\n".join([padfn(maxwidth, s) for s in ms.splitlines()]) for ms in strings]
+            padded_strings = [
+                "\n".join([padfn(maxwidth, s) for s in ms.splitlines()])
+                for ms in strings
+            ]
         else:
             # enable wide-character width corrections
             s_lens = [[len(s) for s in re.split("[\r\n]", ms)] for ms in strings]
-            visible_widths = [[maxwidth - (w - l) for w, l in zip(mw, ml)] for mw, ml in zip(s_widths, s_lens)]
+            visible_widths = [
+                [maxwidth - (w - l) for w, l in zip(mw, ml)]
+                for mw, ml in zip(s_widths, s_lens)
+            ]
             # wcswidth and _visible_width don't count invisible characters;
             # padfn doesn't need to apply another correction
             padded_strings = [
@@ -419,13 +457,19 @@ def _format(val, valtype, floatfmt, intfmt, missingval="", has_invisible=True):
     if valtype is int:
         if isinstance(val, str):
             val_striped = val.encode("unicode_escape").decode("utf-8")
-            colored = re.search(r"(\\[xX]+[0-9a-fA-F]+\[\d+[mM]+)([0-9.]+)(\\.*)$", val_striped)
+            colored = re.search(
+                r"(\\[xX]+[0-9a-fA-F]+\[\d+[mM]+)([0-9.]+)(\\.*)$", val_striped
+            )
             if colored:
                 total_groups = len(colored.groups())
                 if total_groups == 3:
                     digits = colored.group(2)
                     if digits.isdigit():
-                        val_new = colored.group(1) + format(int(digits), intfmt) + colored.group(3)
+                        val_new = (
+                            colored.group(1)
+                            + format(int(digits), intfmt)
+                            + colored.group(3)
+                        )
                         val = val_new.encode("utf-8").decode("unicode_escape")
             intfmt = ""
         return format(val, intfmt)
@@ -447,11 +491,15 @@ def _format(val, valtype, floatfmt, intfmt, missingval="", has_invisible=True):
         return f"{val}"
 
 
-def _align_header(header, alignment, width, visible_width, is_multiline=False, width_fn=None):
+def _align_header(
+    header, alignment, width, visible_width, is_multiline=False, width_fn=None
+):
     """Pad string header to width chars given known visible_width of the header."""
     if is_multiline:
         header_lines = re.split(_multiline_codes, header)
-        padded_lines = [_align_header(h, alignment, width, width_fn(h)) for h in header_lines]
+        padded_lines = [
+            _align_header(h, alignment, width, width_fn(h)) for h in header_lines
+        ]
         return "\n".join(padded_lines)
     # else: not multiline
     ninvisible = len(header) - visible_width
@@ -504,14 +552,19 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
             # likely a conventional dict
             keys = tabular_data.keys()
             try:
-                rows = list(izip_longest(*tabular_data.values()))  # columns have to be transposed
+                rows = list(
+                    izip_longest(*tabular_data.values())
+                )  # columns have to be transposed
             except TypeError:  # not iterable
                 raise TypeError(err_msg)
 
         elif hasattr(tabular_data, "index"):
             # values is a property, has .index => it's likely a pandas.DataFrame (pandas 0.11.0)
             keys = list(tabular_data)
-            if showindex in {"default", "always", True} and tabular_data.index.name is not None:
+            if (
+                showindex in {"default", "always", True}
+                and tabular_data.index.name is not None
+            ):
                 if isinstance(tabular_data.index.name, list):
                     keys[:0] = tabular_data.index.name
                 else:
@@ -535,10 +588,19 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
         if headers == "keys" and not rows:
             # an empty table (issue #81)
             headers = []
-        elif headers == "keys" and hasattr(tabular_data, "dtype") and tabular_data.dtype.names:
+        elif (
+            headers == "keys"
+            and hasattr(tabular_data, "dtype")
+            and tabular_data.dtype.names
+        ):
             # numpy record array
             headers = tabular_data.dtype.names
-        elif headers == "keys" and len(rows) > 0 and isinstance(rows[0], tuple) and hasattr(rows[0], "_fields"):
+        elif (
+            headers == "keys"
+            and len(rows) > 0
+            and isinstance(rows[0], tuple)
+            and hasattr(rows[0], "_fields")
+        ):
             # namedtuple
             headers = list(map(str, rows[0]._fields))
         elif len(rows) > 0 and hasattr(rows[0], "keys") and hasattr(rows[0], "values"):
@@ -569,7 +631,9 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
                 else:
                     headers = []
             elif headers:
-                raise ValueError("headers for a list of dicts is not a dict or a keyword")
+                raise ValueError(
+                    "headers for a list of dicts is not a dict or a keyword"
+                )
             rows = [[row.get(k) for k in keys] for row in rows]
 
         elif (
@@ -582,7 +646,11 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
             # print tabulate(cursor, headers='keys')
             headers = [column[0] for column in tabular_data.description]
 
-        elif dataclasses is not None and len(rows) > 0 and dataclasses.is_dataclass(rows[0]):
+        elif (
+            dataclasses is not None
+            and len(rows) > 0
+            and dataclasses.is_dataclass(rows[0])
+        ):
             # Python's dataclass
             field_names = [field.name for field in dataclasses.fields(rows[0])]
             if headers == "keys":
@@ -652,7 +720,9 @@ def tabulate(
     if tabular_data is None:
         tabular_data = []
 
-    list_of_lists, headers, headers_pad = _normalize_tabular_data(tabular_data, headers, showindex=showindex)
+    list_of_lists, headers, headers_pad = _normalize_tabular_data(
+        tabular_data, headers, showindex=showindex
+    )
     list_of_lists, separating_lines = _remove_separating_lines(list_of_lists)
 
     # PrettyTable formatting does not use any extra padding.
@@ -694,7 +764,11 @@ def tabulate(
     has_invisible = _ansi_codes.search(plain_text) is not None
 
     enable_widechars = wcwidth is not None and WIDE_CHARS_MODE
-    if not isinstance(tablefmt, TableFormat) and tablefmt in multiline_formats and _is_multiline(plain_text):
+    if (
+        not isinstance(tablefmt, TableFormat)
+        and tablefmt in multiline_formats
+        and _is_multiline(plain_text)
+    ):
         tablefmt = multiline_formats.get(tablefmt, tablefmt)
         is_multiline = True
     else:
@@ -706,13 +780,17 @@ def tabulate(
     numparses = _expand_numparse(disable_numparse, len(cols))
     coltypes = [_column_type(col, numparse=np) for col, np in zip(cols, numparses)]
     if isinstance(floatfmt, str):  # old version
-        float_formats = len(cols) * [floatfmt]  # just duplicate the string to use in each column
+        float_formats = len(cols) * [
+            floatfmt
+        ]  # just duplicate the string to use in each column
     else:  # if floatfmt is list, tuple etc we have one per column
         float_formats = list(floatfmt)
         if len(float_formats) < len(cols):
             float_formats.extend((len(cols) - len(float_formats)) * [_DEFAULT_FLOATFMT])
     if isinstance(intfmt, str):  # old version
-        int_formats = len(cols) * [intfmt]  # just duplicate the string to use in each column
+        int_formats = len(cols) * [
+            intfmt
+        ]  # just duplicate the string to use in each column
     else:  # if intfmt is list, tuple etc we have one per column
         int_formats = list(intfmt)
         if len(int_formats) < len(cols):
@@ -725,7 +803,9 @@ def tabulate(
             missing_vals.extend((len(cols) - len(missing_vals)) * [_DEFAULT_MISSINGVAL])
     cols = [
         [_format(v, ct, fl_fmt, int_fmt, miss_v, has_invisible) for v in c]
-        for c, ct, fl_fmt, int_fmt, miss_v in zip(cols, coltypes, float_formats, int_formats, missing_vals)
+        for c, ct, fl_fmt, int_fmt, miss_v in zip(
+            cols, coltypes, float_formats, int_formats, missing_vals
+        )
     ]
 
     # align columns
@@ -748,14 +828,24 @@ def tabulate(
                 break
             if align != "global":
                 aligns[idx] = align
-    minwidths = [width_fn(h) + min_padding for h in headers] if headers else [0] * len(cols)
+    minwidths = (
+        [width_fn(h) + min_padding for h in headers] if headers else [0] * len(cols)
+    )
     aligns_copy = aligns.copy()
     # Reset alignments in copy of alignments list to "left" for 'colon_grid' format,
     # which enforces left alignment in the text output of the data.
     if tablefmt == "colon_grid":
         aligns_copy = ["left"] * len(cols)
     cols = [
-        _align_column(c, a, minw, has_invisible, enable_widechars, is_multiline, preserve_whitespace)
+        _align_column(
+            c,
+            a,
+            minw,
+            has_invisible,
+            enable_widechars,
+            is_multiline,
+            preserve_whitespace,
+        )
         for c, a, minw in zip(cols, aligns_copy, minwidths)
     ]
 
@@ -786,7 +876,10 @@ def tabulate(
                     aligns_headers[hidx] = aligns[hidx]
                 elif align != "global":
                     aligns_headers[hidx] = align
-        minwidths = [max(minw, max(width_fn(cl) for cl in c)) for minw, c in zip(minwidths, t_cols)]
+        minwidths = [
+            max(minw, max(width_fn(cl) for cl in c))
+            for minw, c in zip(minwidths, t_cols)
+        ]
         headers = [
             _align_header(h, a, minw, width_fn(h), is_multiline, width_fn)
             for h, a, minw in zip(headers, aligns_headers, minwidths)
@@ -801,7 +894,16 @@ def tabulate(
 
     ra_default = rowalign if isinstance(rowalign, str) else None
     rowaligns = _expand_iterable(rowalign, len(rows), ra_default)
-    return _format_table(tablefmt, headers, aligns_headers, rows, minwidths, aligns, is_multiline, rowaligns=rowaligns)
+    return _format_table(
+        tablefmt,
+        headers,
+        aligns_headers,
+        rows,
+        minwidths,
+        aligns,
+        is_multiline,
+        rowaligns=rowaligns,
+    )
 
 
 def _expand_numparse(disable_numparse, column_count):
@@ -864,7 +966,9 @@ def _append_line(lines, colwidths, colaligns, linefmt):
     return lines
 
 
-def _format_table(fmt, headers, headersaligns, rows, colwidths, colaligns, is_multiline, rowaligns):
+def _format_table(
+    fmt, headers, headersaligns, rows, colwidths, colaligns, is_multiline, rowaligns
+):
     lines = []
     hidden = fmt.with_header_hide if (headers and fmt.with_header_hide) else []
     pad = fmt.padding
@@ -888,21 +992,41 @@ def _format_table(fmt, headers, headersaligns, rows, colwidths, colaligns, is_mu
         # initial rows with a line below
         for row, ralign in zip(rows[:-1], rowaligns):
             if row != SEPARATING_LINE:
-                append_row(lines, pad_row(row, pad), padded_widths, colaligns, fmt.datarow, rowalign=ralign)
+                append_row(
+                    lines,
+                    pad_row(row, pad),
+                    padded_widths,
+                    colaligns,
+                    fmt.datarow,
+                    rowalign=ralign,
+                )
             _append_line(lines, padded_widths, colaligns, fmt.linebetweenrows)
         # the last row without a line below
-        append_row(lines, pad_row(rows[-1], pad), padded_widths, colaligns, fmt.datarow, rowalign=rowaligns[-1])
+        append_row(
+            lines,
+            pad_row(rows[-1], pad),
+            padded_widths,
+            colaligns,
+            fmt.datarow,
+            rowalign=rowaligns[-1],
+        )
     else:
         separating_line = (
-            fmt.linebetweenrows or fmt.linebelowheader or fmt.linebelow or fmt.lineabove or Line("", "", "", "")
+            fmt.linebetweenrows
+            or fmt.linebelowheader
+            or fmt.linebelow
+            or fmt.lineabove
+            or Line("", "", "", "")
         )
         for row in rows:
             # test to see if either the 1st column or the 2nd column (account for showindex) has
             # the SEPARATING_LINE flag
             if _is_separating_line(row):
                 _append_line(lines, padded_widths, colaligns, separating_line)
             else:
-                append_row(lines, pad_row(row, pad), padded_widths, colaligns, fmt.datarow)
+                append_row(
+                    lines, pad_row(row, pad), padded_widths, colaligns, fmt.datarow
+                )
 
     if fmt.linebelow and "linebelow" not in hidden:
         _append_line(lines, padded_widths, colaligns, fmt.linebelow)