Skip to content

Commit aa2e254

Browse files
⚡️ Speed up function _strip_ansi by 116% in PR #217 (proper-cleanup)
Here is an optimized version of `_strip_ansi` focusing on runtime speed. The main bottleneck is the regular expression replacement in `re.sub`: the `r"\4"` template returns group 4 (the visible link text of a hyperlink, possibly empty) for hyperlink matches, and an empty string for plain ANSI escape codes, where group 4 does not match. This can be significantly sped up by avoiding the costly `r"\4"` template (which always triggers group resolution machinery in the regex engine) and using a faster replacer callback instead. Since every match is either an ANSI escape code (where group 4 is `None`) or a hyperlink (where group 4 contains the visible link text), both cases can be handled in one simple function.

The optimized code is shown in the diff below.

**Performance notes:**
- This avoids all regex group substitution machinery for the common ANSI case.
- No change to visible/functional behavior.
- No changes to external function names or signatures.
- String and bytes cases are handled separately, so no unnecessary type checks inside tight loops.

**Comment:** No original comments were changed or removed. No changes were made to the public interface or expected output. All logic concerning group 4 and escape sequence removal is preserved.
1 parent 62e10b1 commit aa2e254

File tree

1 file changed

+160
-36
lines changed

1 file changed

+160
-36
lines changed

codeflash/code_utils/tabulate.py

Lines changed: 160 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ def _is_separating_line_value(value):
6161
def _is_separating_line(row):
6262
row_type = type(row)
6363
is_sl = (row_type == list or row_type == str) and (
64-
(len(row) >= 1 and _is_separating_line_value(row[0])) or (len(row) >= 2 and _is_separating_line_value(row[1]))
64+
(len(row) >= 1 and _is_separating_line_value(row[0]))
65+
or (len(row) >= 2 and _is_separating_line_value(row[1]))
6566
)
6667

6768
return is_sl
@@ -151,7 +152,9 @@ def _pipe_line_with_colons(colwidths, colaligns):
151152
_ansi_codes_bytes = re.compile(_ansi_escape_pat.encode("utf8"), re.VERBOSE)
152153
_ansi_color_reset_code = "\033[0m"
153154

154-
_float_with_thousands_separators = re.compile(r"^(([+-]?[0-9]{1,3})(?:,([0-9]{3}))*)?(?(1)\.[0-9]*|\.[0-9]+)?$")
155+
_float_with_thousands_separators = re.compile(
156+
r"^(([+-]?[0-9]{1,3})(?:,([0-9]{3}))*)?(?(1)\.[0-9]*|\.[0-9]+)?$"
157+
)
155158

156159

157160
def _isnumber_with_thousands_separator(string):
@@ -200,12 +203,16 @@ def _isint(string, inttype=int):
200203
(hasattr(string, "is_integer") or hasattr(string, "__array__"))
201204
and str(type(string)).startswith("<class 'numpy.int")
202205
) # numpy.int64 and similar
203-
or (isinstance(string, (bytes, str)) and _isconvertible(inttype, string)) # integer as string
206+
or (
207+
isinstance(string, (bytes, str)) and _isconvertible(inttype, string)
208+
) # integer as string
204209
)
205210

206211

207212
def _isbool(string):
208-
return type(string) is bool or (isinstance(string, (bytes, str)) and string in {"True", "False"})
213+
return type(string) is bool or (
214+
isinstance(string, (bytes, str)) and string in {"True", "False"}
215+
)
209216

210217

211218
def _type(string, has_invisible=True, numparse=True):
@@ -219,10 +226,18 @@ def _type(string, has_invisible=True, numparse=True):
219226
if _isbool(string):
220227
return bool
221228
if numparse and (
222-
_isint(string) or (isinstance(string, str) and _isnumber_with_thousands_separator(string) and "." not in string)
229+
_isint(string)
230+
or (
231+
isinstance(string, str)
232+
and _isnumber_with_thousands_separator(string)
233+
and "." not in string
234+
)
223235
):
224236
return int
225-
if numparse and (_isnumber(string) or (isinstance(string, str) and _isnumber_with_thousands_separator(string))):
237+
if numparse and (
238+
_isnumber(string)
239+
or (isinstance(string, str) and _isnumber_with_thousands_separator(string))
240+
):
226241
return float
227242
if isinstance(string, bytes):
228243
return bytes
@@ -261,10 +276,23 @@ def _padnone(ignore_width, s):
261276

262277

263278
def _strip_ansi(s):
279+
# Use a custom replacer instead of r"\4" for much faster performance.
280+
def _replacer(match):
281+
return (
282+
match.group(4) or ""
283+
) # Only the hyperlink's visible text, or remove the escape.
284+
264285
if isinstance(s, str):
265-
return _ansi_codes.sub(r"\4", s)
286+
return _ansi_codes.sub(_replacer, s)
287+
266288
# a bytestring
267-
return _ansi_codes_bytes.sub(r"\4", s)
289+
def _replacer_bytes(match):
290+
g = match.group(4)
291+
return (
292+
g if g else b""
293+
) # Only the hyperlink's visible text, or remove the escape.
294+
295+
return _ansi_codes_bytes.sub(_replacer_bytes, s)
268296

269297

270298
def _visible_width(s):
@@ -365,19 +393,29 @@ def _align_column(
365393
is_multiline=False,
366394
preserve_whitespace=False,
367395
):
368-
strings, padfn = _align_column_choose_padfn(strings, alignment, has_invisible, preserve_whitespace)
369-
width_fn = _align_column_choose_width_fn(has_invisible, enable_widechars, is_multiline)
396+
strings, padfn = _align_column_choose_padfn(
397+
strings, alignment, has_invisible, preserve_whitespace
398+
)
399+
width_fn = _align_column_choose_width_fn(
400+
has_invisible, enable_widechars, is_multiline
401+
)
370402

371403
s_widths = list(map(width_fn, strings))
372404
maxwidth = max(max(_flat_list(s_widths)), minwidth)
373405
# TODO: refactor column alignment in single-line and multiline modes
374406
if is_multiline:
375407
if not enable_widechars and not has_invisible:
376-
padded_strings = ["\n".join([padfn(maxwidth, s) for s in ms.splitlines()]) for ms in strings]
408+
padded_strings = [
409+
"\n".join([padfn(maxwidth, s) for s in ms.splitlines()])
410+
for ms in strings
411+
]
377412
else:
378413
# enable wide-character width corrections
379414
s_lens = [[len(s) for s in re.split("[\r\n]", ms)] for ms in strings]
380-
visible_widths = [[maxwidth - (w - l) for w, l in zip(mw, ml)] for mw, ml in zip(s_widths, s_lens)]
415+
visible_widths = [
416+
[maxwidth - (w - l) for w, l in zip(mw, ml)]
417+
for mw, ml in zip(s_widths, s_lens)
418+
]
381419
# wcswidth and _visible_width don't count invisible characters;
382420
# padfn doesn't need to apply another correction
383421
padded_strings = [
@@ -419,13 +457,19 @@ def _format(val, valtype, floatfmt, intfmt, missingval="", has_invisible=True):
419457
if valtype is int:
420458
if isinstance(val, str):
421459
val_striped = val.encode("unicode_escape").decode("utf-8")
422-
colored = re.search(r"(\\[xX]+[0-9a-fA-F]+\[\d+[mM]+)([0-9.]+)(\\.*)$", val_striped)
460+
colored = re.search(
461+
r"(\\[xX]+[0-9a-fA-F]+\[\d+[mM]+)([0-9.]+)(\\.*)$", val_striped
462+
)
423463
if colored:
424464
total_groups = len(colored.groups())
425465
if total_groups == 3:
426466
digits = colored.group(2)
427467
if digits.isdigit():
428-
val_new = colored.group(1) + format(int(digits), intfmt) + colored.group(3)
468+
val_new = (
469+
colored.group(1)
470+
+ format(int(digits), intfmt)
471+
+ colored.group(3)
472+
)
429473
val = val_new.encode("utf-8").decode("unicode_escape")
430474
intfmt = ""
431475
return format(val, intfmt)
@@ -447,11 +491,15 @@ def _format(val, valtype, floatfmt, intfmt, missingval="", has_invisible=True):
447491
return f"{val}"
448492

449493

450-
def _align_header(header, alignment, width, visible_width, is_multiline=False, width_fn=None):
494+
def _align_header(
495+
header, alignment, width, visible_width, is_multiline=False, width_fn=None
496+
):
451497
"""Pad string header to width chars given known visible_width of the header."""
452498
if is_multiline:
453499
header_lines = re.split(_multiline_codes, header)
454-
padded_lines = [_align_header(h, alignment, width, width_fn(h)) for h in header_lines]
500+
padded_lines = [
501+
_align_header(h, alignment, width, width_fn(h)) for h in header_lines
502+
]
455503
return "\n".join(padded_lines)
456504
# else: not multiline
457505
ninvisible = len(header) - visible_width
@@ -504,14 +552,19 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
504552
# likely a conventional dict
505553
keys = tabular_data.keys()
506554
try:
507-
rows = list(izip_longest(*tabular_data.values())) # columns have to be transposed
555+
rows = list(
556+
izip_longest(*tabular_data.values())
557+
) # columns have to be transposed
508558
except TypeError: # not iterable
509559
raise TypeError(err_msg)
510560

511561
elif hasattr(tabular_data, "index"):
512562
# values is a property, has .index => it's likely a pandas.DataFrame (pandas 0.11.0)
513563
keys = list(tabular_data)
514-
if showindex in {"default", "always", True} and tabular_data.index.name is not None:
564+
if (
565+
showindex in {"default", "always", True}
566+
and tabular_data.index.name is not None
567+
):
515568
if isinstance(tabular_data.index.name, list):
516569
keys[:0] = tabular_data.index.name
517570
else:
@@ -535,10 +588,19 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
535588
if headers == "keys" and not rows:
536589
# an empty table (issue #81)
537590
headers = []
538-
elif headers == "keys" and hasattr(tabular_data, "dtype") and tabular_data.dtype.names:
591+
elif (
592+
headers == "keys"
593+
and hasattr(tabular_data, "dtype")
594+
and tabular_data.dtype.names
595+
):
539596
# numpy record array
540597
headers = tabular_data.dtype.names
541-
elif headers == "keys" and len(rows) > 0 and isinstance(rows[0], tuple) and hasattr(rows[0], "_fields"):
598+
elif (
599+
headers == "keys"
600+
and len(rows) > 0
601+
and isinstance(rows[0], tuple)
602+
and hasattr(rows[0], "_fields")
603+
):
542604
# namedtuple
543605
headers = list(map(str, rows[0]._fields))
544606
elif len(rows) > 0 and hasattr(rows[0], "keys") and hasattr(rows[0], "values"):
@@ -569,7 +631,9 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
569631
else:
570632
headers = []
571633
elif headers:
572-
raise ValueError("headers for a list of dicts is not a dict or a keyword")
634+
raise ValueError(
635+
"headers for a list of dicts is not a dict or a keyword"
636+
)
573637
rows = [[row.get(k) for k in keys] for row in rows]
574638

575639
elif (
@@ -582,7 +646,11 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"):
582646
# print tabulate(cursor, headers='keys')
583647
headers = [column[0] for column in tabular_data.description]
584648

585-
elif dataclasses is not None and len(rows) > 0 and dataclasses.is_dataclass(rows[0]):
649+
elif (
650+
dataclasses is not None
651+
and len(rows) > 0
652+
and dataclasses.is_dataclass(rows[0])
653+
):
586654
# Python's dataclass
587655
field_names = [field.name for field in dataclasses.fields(rows[0])]
588656
if headers == "keys":
@@ -652,7 +720,9 @@ def tabulate(
652720
if tabular_data is None:
653721
tabular_data = []
654722

655-
list_of_lists, headers, headers_pad = _normalize_tabular_data(tabular_data, headers, showindex=showindex)
723+
list_of_lists, headers, headers_pad = _normalize_tabular_data(
724+
tabular_data, headers, showindex=showindex
725+
)
656726
list_of_lists, separating_lines = _remove_separating_lines(list_of_lists)
657727

658728
# PrettyTable formatting does not use any extra padding.
@@ -694,7 +764,11 @@ def tabulate(
694764
has_invisible = _ansi_codes.search(plain_text) is not None
695765

696766
enable_widechars = wcwidth is not None and WIDE_CHARS_MODE
697-
if not isinstance(tablefmt, TableFormat) and tablefmt in multiline_formats and _is_multiline(plain_text):
767+
if (
768+
not isinstance(tablefmt, TableFormat)
769+
and tablefmt in multiline_formats
770+
and _is_multiline(plain_text)
771+
):
698772
tablefmt = multiline_formats.get(tablefmt, tablefmt)
699773
is_multiline = True
700774
else:
@@ -706,13 +780,17 @@ def tabulate(
706780
numparses = _expand_numparse(disable_numparse, len(cols))
707781
coltypes = [_column_type(col, numparse=np) for col, np in zip(cols, numparses)]
708782
if isinstance(floatfmt, str): # old version
709-
float_formats = len(cols) * [floatfmt] # just duplicate the string to use in each column
783+
float_formats = len(cols) * [
784+
floatfmt
785+
] # just duplicate the string to use in each column
710786
else: # if floatfmt is list, tuple etc we have one per column
711787
float_formats = list(floatfmt)
712788
if len(float_formats) < len(cols):
713789
float_formats.extend((len(cols) - len(float_formats)) * [_DEFAULT_FLOATFMT])
714790
if isinstance(intfmt, str): # old version
715-
int_formats = len(cols) * [intfmt] # just duplicate the string to use in each column
791+
int_formats = len(cols) * [
792+
intfmt
793+
] # just duplicate the string to use in each column
716794
else: # if intfmt is list, tuple etc we have one per column
717795
int_formats = list(intfmt)
718796
if len(int_formats) < len(cols):
@@ -725,7 +803,9 @@ def tabulate(
725803
missing_vals.extend((len(cols) - len(missing_vals)) * [_DEFAULT_MISSINGVAL])
726804
cols = [
727805
[_format(v, ct, fl_fmt, int_fmt, miss_v, has_invisible) for v in c]
728-
for c, ct, fl_fmt, int_fmt, miss_v in zip(cols, coltypes, float_formats, int_formats, missing_vals)
806+
for c, ct, fl_fmt, int_fmt, miss_v in zip(
807+
cols, coltypes, float_formats, int_formats, missing_vals
808+
)
729809
]
730810

731811
# align columns
@@ -748,14 +828,24 @@ def tabulate(
748828
break
749829
if align != "global":
750830
aligns[idx] = align
751-
minwidths = [width_fn(h) + min_padding for h in headers] if headers else [0] * len(cols)
831+
minwidths = (
832+
[width_fn(h) + min_padding for h in headers] if headers else [0] * len(cols)
833+
)
752834
aligns_copy = aligns.copy()
753835
# Reset alignments in copy of alignments list to "left" for 'colon_grid' format,
754836
# which enforces left alignment in the text output of the data.
755837
if tablefmt == "colon_grid":
756838
aligns_copy = ["left"] * len(cols)
757839
cols = [
758-
_align_column(c, a, minw, has_invisible, enable_widechars, is_multiline, preserve_whitespace)
840+
_align_column(
841+
c,
842+
a,
843+
minw,
844+
has_invisible,
845+
enable_widechars,
846+
is_multiline,
847+
preserve_whitespace,
848+
)
759849
for c, a, minw in zip(cols, aligns_copy, minwidths)
760850
]
761851

@@ -786,7 +876,10 @@ def tabulate(
786876
aligns_headers[hidx] = aligns[hidx]
787877
elif align != "global":
788878
aligns_headers[hidx] = align
789-
minwidths = [max(minw, max(width_fn(cl) for cl in c)) for minw, c in zip(minwidths, t_cols)]
879+
minwidths = [
880+
max(minw, max(width_fn(cl) for cl in c))
881+
for minw, c in zip(minwidths, t_cols)
882+
]
790883
headers = [
791884
_align_header(h, a, minw, width_fn(h), is_multiline, width_fn)
792885
for h, a, minw in zip(headers, aligns_headers, minwidths)
@@ -801,7 +894,16 @@ def tabulate(
801894

802895
ra_default = rowalign if isinstance(rowalign, str) else None
803896
rowaligns = _expand_iterable(rowalign, len(rows), ra_default)
804-
return _format_table(tablefmt, headers, aligns_headers, rows, minwidths, aligns, is_multiline, rowaligns=rowaligns)
897+
return _format_table(
898+
tablefmt,
899+
headers,
900+
aligns_headers,
901+
rows,
902+
minwidths,
903+
aligns,
904+
is_multiline,
905+
rowaligns=rowaligns,
906+
)
805907

806908

807909
def _expand_numparse(disable_numparse, column_count):
@@ -864,7 +966,9 @@ def _append_line(lines, colwidths, colaligns, linefmt):
864966
return lines
865967

866968

867-
def _format_table(fmt, headers, headersaligns, rows, colwidths, colaligns, is_multiline, rowaligns):
969+
def _format_table(
970+
fmt, headers, headersaligns, rows, colwidths, colaligns, is_multiline, rowaligns
971+
):
868972
lines = []
869973
hidden = fmt.with_header_hide if (headers and fmt.with_header_hide) else []
870974
pad = fmt.padding
@@ -888,21 +992,41 @@ def _format_table(fmt, headers, headersaligns, rows, colwidths, colaligns, is_mu
888992
# initial rows with a line below
889993
for row, ralign in zip(rows[:-1], rowaligns):
890994
if row != SEPARATING_LINE:
891-
append_row(lines, pad_row(row, pad), padded_widths, colaligns, fmt.datarow, rowalign=ralign)
995+
append_row(
996+
lines,
997+
pad_row(row, pad),
998+
padded_widths,
999+
colaligns,
1000+
fmt.datarow,
1001+
rowalign=ralign,
1002+
)
8921003
_append_line(lines, padded_widths, colaligns, fmt.linebetweenrows)
8931004
# the last row without a line below
894-
append_row(lines, pad_row(rows[-1], pad), padded_widths, colaligns, fmt.datarow, rowalign=rowaligns[-1])
1005+
append_row(
1006+
lines,
1007+
pad_row(rows[-1], pad),
1008+
padded_widths,
1009+
colaligns,
1010+
fmt.datarow,
1011+
rowalign=rowaligns[-1],
1012+
)
8951013
else:
8961014
separating_line = (
897-
fmt.linebetweenrows or fmt.linebelowheader or fmt.linebelow or fmt.lineabove or Line("", "", "", "")
1015+
fmt.linebetweenrows
1016+
or fmt.linebelowheader
1017+
or fmt.linebelow
1018+
or fmt.lineabove
1019+
or Line("", "", "", "")
8981020
)
8991021
for row in rows:
9001022
# test to see if either the 1st column or the 2nd column (account for showindex) has
9011023
# the SEPARATING_LINE flag
9021024
if _is_separating_line(row):
9031025
_append_line(lines, padded_widths, colaligns, separating_line)
9041026
else:
905-
append_row(lines, pad_row(row, pad), padded_widths, colaligns, fmt.datarow)
1027+
append_row(
1028+
lines, pad_row(row, pad), padded_widths, colaligns, fmt.datarow
1029+
)
9061030

9071031
if fmt.linebelow and "linebelow" not in hidden:
9081032
_append_line(lines, padded_widths, colaligns, fmt.linebelow)

0 commit comments

Comments
 (0)