Skip to content

Commit 033a177

Browse files
Moosieus authored and josevalim committed
Fix String.replace_invalid/2 perf regressions (#13090)
1 parent b9fffa3 commit 033a177

File tree

1 file changed

+54
-58
lines changed

1 file changed

+54
-58
lines changed

lib/elixir/lib/string.ex

Lines changed: 54 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -1871,6 +1871,22 @@ defmodule String do
18711871
end
18721872
end
18731873

1874+
defguardp replace_invalid_ii_of_iii(i, ii)
1875+
when (896 <= Bitwise.bor(Bitwise.bsl(i, 6), ii) and
1876+
Bitwise.bor(Bitwise.bsl(i, 6), ii) <= 1023) or
1877+
(32 <= Bitwise.bor(Bitwise.bsl(i, 6), ii) and
1878+
Bitwise.bor(Bitwise.bsl(i, 6), ii) <= 863)
1879+
1880+
defguardp replace_invalid_ii_of_iv(i, ii)
1881+
when 16 <= Bitwise.bor(Bitwise.bsl(i, 6), ii) and
1882+
Bitwise.bor(Bitwise.bsl(i, 6), ii) <= 271
1883+
1884+
defguardp replace_invalid_iii_of_iv(i, ii, iii)
1885+
when 1024 <= Bitwise.bor(Bitwise.bor(Bitwise.bsl(i, 12), Bitwise.bsl(ii, 6)), iii) and
1886+
Bitwise.bor(Bitwise.bor(Bitwise.bsl(i, 12), Bitwise.bsl(ii, 6)), iii) <= 17407
1887+
1888+
defguardp replace_invalid_is_next(next) when Bitwise.bsr(next, 6) !== 0b10
1889+
18741890
@doc ~S"""
18751891
Returns a new string created by replacing all invalid bytes with `replacement` (`"�"` by default).
18761892
@@ -1889,94 +1905,74 @@ defmodule String do
18891905
"nem rán bERROR! bề"
18901906
"""
18911907
@doc since: "1.16.0"
1892-
def replace_invalid(string, replacement \\ "�")
1893-
when is_binary(string) and is_binary(replacement) do
1894-
do_replace_invalid(string, replacement, <<>>)
1908+
def replace_invalid(bytes, replacement \\ "�")
1909+
when is_binary(bytes) and is_binary(replacement) do
1910+
do_replace_invalid(bytes, replacement, <<>>)
18951911
end
18961912

18971913
# Valid ASCII (for better average speed)
1898-
defp do_replace_invalid(<<ascii::8, n_lead::2, rest::bits>>, rep, acc)
1899-
when ascii in 0..127 and n_lead != 0b10 do
1900-
do_replace_invalid(<<n_lead::2, rest::bits>>, rep, <<acc::bits, ascii::8>>)
1914+
defp do_replace_invalid(<<ascii::8, next::8, _::bytes>> = rest, rep, acc)
1915+
when ascii in 0..127 and replace_invalid_is_next(next) do
1916+
<<_::8, rest::bytes>> = rest
1917+
do_replace_invalid(rest, rep, acc <> <<ascii::8>>)
19011918
end
19021919

19031920
# Valid UTF-8
1904-
defp do_replace_invalid(<<codepoint::utf8, rest::bits>>, rep, acc) do
1905-
do_replace_invalid(rest, rep, <<acc::bits, codepoint::utf8>>)
1921+
defp do_replace_invalid(<<grapheme::utf8, rest::bytes>>, rep, acc) do
1922+
do_replace_invalid(rest, rep, acc <> <<grapheme::utf8>>)
19061923
end
19071924

19081925
# 2/3 truncated sequence
1909-
defp do_replace_invalid(<<0b1110::4, i::4, 0b10::2, ii::6>>, rep, acc) do
1910-
<<tcp::10>> = <<i::4, ii::6>>
1911-
<<acc::bits, replace_invalid_ii_of_iii(tcp, rep)::bits>>
1926+
defp do_replace_invalid(<<0b1110::4, i::4, 0b10::2, ii::6>>, rep, acc)
1927+
when replace_invalid_ii_of_iii(i, ii) do
1928+
acc <> rep
19121929
end
19131930

1914-
defp do_replace_invalid(<<0b1110::4, i::4, 0b10::2, ii::6, n_lead::2, rest::bits>>, rep, acc)
1915-
when n_lead != 0b10 do
1916-
<<tcp::10>> = <<i::4, ii::6>>
1917-
1918-
do_replace_invalid(
1919-
<<n_lead::2, rest::bits>>,
1920-
rep,
1921-
<<acc::bits, replace_invalid_ii_of_iii(tcp, rep)::bits>>
1922-
)
1931+
defp do_replace_invalid(<<0b1110::4, i::4, 0b10::2, ii::6, next::8, _::bytes>> = rest, rep, acc)
1932+
when replace_invalid_ii_of_iii(i, ii) and replace_invalid_is_next(next) do
1933+
<<_::16, rest::bytes>> = rest
1934+
do_replace_invalid(rest, rep, acc <> rep)
19231935
end
19241936

19251937
# 2/4
1926-
defp do_replace_invalid(<<0b11110::5, i::3, 0b10::2, ii::6>>, rep, acc) do
1927-
<<tcp::10>> = <<i::4, ii::6>>
1928-
<<acc::bits, replace_invalid_ii_of_iiii(tcp, rep)::bits>>
1938+
defp do_replace_invalid(<<0b11110::5, i::3, 0b10::2, ii::6>>, rep, acc)
1939+
when replace_invalid_ii_of_iv(i, ii) do
1940+
acc <> rep
19291941
end
19301942

1931-
defp do_replace_invalid(<<0b11110::5, i::3, 0b10::2, ii::6, n_lead::2, rest::bits>>, rep, acc)
1932-
when n_lead != 0b10 do
1933-
<<tcp::10>> = <<i::4, ii::6>>
1934-
1935-
do_replace_invalid(
1936-
<<n_lead::2, rest::bits>>,
1937-
rep,
1938-
<<acc::bits, replace_invalid_ii_of_iiii(tcp, rep)::bits>>
1939-
)
1943+
defp do_replace_invalid(
1944+
<<0b11110::5, i::3, 0b10::2, ii::6, next::8, _::bytes>> = rest,
1945+
rep,
1946+
acc
1947+
)
1948+
when replace_invalid_ii_of_iv(i, ii) and replace_invalid_is_next(next) do
1949+
<<_::16, rest::bytes>> = rest
1950+
do_replace_invalid(rest, rep, acc <> rep)
19401951
end
19411952

19421953
# 3/4
1943-
defp do_replace_invalid(<<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6>>, rep, acc) do
1944-
<<tcp::15>> = <<i::3, ii::6, iii::6>>
1945-
<<acc::bits, replace_invalid_iii_of_iiii(tcp, rep)::bits>>
1954+
defp do_replace_invalid(<<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6>>, rep, acc)
1955+
when replace_invalid_iii_of_iv(i, ii, iii) do
1956+
acc <> rep
19461957
end
19471958

19481959
defp do_replace_invalid(
1949-
<<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6, n_lead::2, rest::bits>>,
1960+
<<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6, next::8, _::bytes>> = rest,
19501961
rep,
19511962
acc
19521963
)
1953-
when n_lead != 0b10 do
1954-
<<tcp::15>> = <<i::3, ii::6, iii::6>>
1955-
1956-
do_replace_invalid(
1957-
<<n_lead::2, rest::bits>>,
1958-
rep,
1959-
<<acc::bits, replace_invalid_iii_of_iiii(tcp, rep)::bits>>
1960-
)
1964+
when replace_invalid_iii_of_iv(i, ii, iii) and replace_invalid_is_next(next) do
1965+
<<_::24, rest::bytes>> = rest
1966+
do_replace_invalid(rest, rep, acc <> rep)
19611967
end
19621968

1963-
# any other invalid bytes
1964-
defp do_replace_invalid(<<_, rest::bits>>, rep, acc),
1965-
do: do_replace_invalid(rest, rep, <<acc::bits, rep::bits>>)
1969+
# Everything else
1970+
defp do_replace_invalid(<<_, rest::bytes>>, rep, acc),
1971+
do: do_replace_invalid(rest, rep, acc <> rep)
19661972

1973+
# Final
19671974
defp do_replace_invalid(<<>>, _, acc), do: acc
19681975

1969-
# bounds-checking truncated code points for overlong encodings
1970-
defp replace_invalid_ii_of_iii(tcp, rep) when tcp >= 32 and tcp <= 863, do: rep
1971-
defp replace_invalid_ii_of_iii(tcp, rep) when tcp >= 896 and tcp <= 1023, do: rep
1972-
defp replace_invalid_ii_of_iii(_, rep), do: rep <> rep
1973-
1974-
defp replace_invalid_ii_of_iiii(tcp, rep) when tcp >= 16 and tcp <= 271, do: rep
1975-
defp replace_invalid_ii_of_iiii(_, rep), do: rep <> rep
1976-
1977-
defp replace_invalid_iii_of_iiii(tcp, rep) when tcp >= 1024 and tcp <= 17407, do: rep
1978-
defp replace_invalid_iii_of_iiii(_, rep), do: rep <> rep <> rep
1979-
19801976
@doc ~S"""
19811977
Splits the string into chunks of characters that share a common trait.
19821978

0 commit comments

Comments
 (0)