@@ -1871,6 +1871,22 @@ defmodule String do
1871
1871
end
1872
1872
end
1873
1873
1874
# Guard for the first two bytes of a three-byte sequence.
#
# `i` is the 4-bit payload of the lead byte and `ii` the 6-bit payload of the
# continuation byte. Joined into one 10-bit value, the pair is accepted only
# inside 32..863 or 896..1023; values below 32 (overlong forms) and the
# 864..895 gap (surrogate prefixes) are rejected.
defguardp replace_invalid_ii_of_iii(i, ii)
          when Bitwise.bor(Bitwise.bsl(i, 6), ii) in 32..863 or
                 Bitwise.bor(Bitwise.bsl(i, 6), ii) in 896..1023
1879
+
1880
# Guard for the first two bytes of a four-byte sequence.
#
# `i` is the 3-bit payload of the lead byte and `ii` the 6-bit payload of the
# continuation byte. The joined value must lie in 16..271; anything lower is
# an overlong form and anything higher is past the largest accepted prefix.
defguardp replace_invalid_ii_of_iv(i, ii)
          when Bitwise.bor(Bitwise.bsl(i, 6), ii) in 16..271
1883
+
1884
# Guard for the first three bytes of a four-byte sequence.
#
# `i` (3 bits), `ii` (6 bits) and `iii` (6 bits) are the payloads of the lead
# byte and the two continuation bytes. Joined into one 15-bit value, the
# prefix is accepted only inside 1024..17407.
defguardp replace_invalid_iii_of_iv(i, ii, iii)
          when Bitwise.bor(Bitwise.bor(Bitwise.bsl(i, 12), Bitwise.bsl(ii, 6)), iii) in 1024..17407
1887
+
1888
# True when `next` does not carry the `10` continuation-byte prefix in its
# top two bits, i.e. it is the first byte of whatever comes next.
defguardp replace_invalid_is_next(next) when not (Bitwise.bsr(next, 6) === 0b10)
1889
+
1874
1890
@ doc ~S"""
1875
1891
Returns a new string created by replacing all invalid bytes with `replacement` (`"�"` by default).
1876
1892
@@ -1889,94 +1905,74 @@ defmodule String do
1889
1905
"nem rán bERROR! bề"
1890
1906
"""
1891
1907
@ doc since: "1.16.0"
1892
def replace_invalid(bytes, replacement \\ "�")

# Both arguments must be binaries; the scan starts with an empty accumulator.
def replace_invalid(bytes, replacement) when is_binary(bytes) and is_binary(replacement) do
  do_replace_invalid(bytes, replacement, "")
end
1896
1912
1897
1913
# ASCII fast path: a byte below 128 whose successor is not a continuation
# byte is copied to the accumulator unchanged.
defp do_replace_invalid(<<byte, following, _::binary>> = input, rep, acc)
     when byte in 0..127 and replace_invalid_is_next(following) do
  # Split again from the whole input so `following` stays in the remainder.
  <<_, tail::binary>> = input
  do_replace_invalid(tail, rep, <<acc::binary, byte>>)
end
1902
1919
1903
1920
# A well-formed UTF-8 code point is copied through unchanged.
defp do_replace_invalid(<<codepoint::utf8, tail::binary>>, rep, acc),
  do: do_replace_invalid(tail, rep, <<acc::binary, codepoint::utf8>>)
1907
1924
1908
1925
# Input ends with the first two bytes of a three-byte sequence: when the
# prefix passes the bounds guard, both bytes become a single replacement.
defp do_replace_invalid(<<0b1110::4, i::4, 0b10::2, ii::6>>, rep, acc)
     when replace_invalid_ii_of_iii(i, ii),
     do: <<acc::binary, rep::binary>>
1913
1930
1914
# First two bytes of a three-byte sequence followed by a non-continuation
# byte: the two-byte prefix becomes one replacement and scanning resumes at
# the following byte.
defp do_replace_invalid(
       <<0b1110::4, i::4, 0b10::2, ii::6, following, _::binary>> = input,
       rep,
       acc
     )
     when replace_invalid_ii_of_iii(i, ii) and replace_invalid_is_next(following) do
  # Drop only the two prefix bytes; `following` stays in the remainder.
  <<_::16, tail::binary>> = input
  do_replace_invalid(tail, rep, <<acc::binary, rep::binary>>)
end
1924
1936
1925
1937
# Input ends with the first two bytes of a four-byte sequence: when the
# prefix passes the bounds guard, both bytes become a single replacement.
defp do_replace_invalid(<<0b11110::5, i::3, 0b10::2, ii::6>>, rep, acc)
     when replace_invalid_ii_of_iv(i, ii),
     do: <<acc::binary, rep::binary>>
1930
1942
1931
# First two bytes of a four-byte sequence followed by a non-continuation
# byte: the two-byte prefix becomes one replacement and scanning resumes at
# the following byte.
defp do_replace_invalid(
       <<0b11110::5, i::3, 0b10::2, ii::6, following, _::binary>> = input,
       rep,
       acc
     )
     when replace_invalid_ii_of_iv(i, ii) and replace_invalid_is_next(following) do
  # Drop only the two prefix bytes; `following` stays in the remainder.
  <<_::16, tail::binary>> = input
  do_replace_invalid(tail, rep, <<acc::binary, rep::binary>>)
end
1941
1952
1942
1953
# Input ends with the first three bytes of a four-byte sequence: when the
# prefix passes the bounds guard, all three bytes become a single replacement.
defp do_replace_invalid(
       <<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6>>,
       rep,
       acc
     )
     when replace_invalid_iii_of_iv(i, ii, iii),
     do: <<acc::binary, rep::binary>>
1947
1958
1948
1959
# First three bytes of a four-byte sequence followed by a non-continuation
# byte: the three-byte prefix becomes one replacement and scanning resumes at
# the following byte.
defp do_replace_invalid(
       <<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6, following, _::binary>> = input,
       rep,
       acc
     )
     when replace_invalid_iii_of_iv(i, ii, iii) and replace_invalid_is_next(following) do
  # Drop only the three prefix bytes; `following` stays in the remainder.
  <<_::24, tail::binary>> = input
  do_replace_invalid(tail, rep, <<acc::binary, rep::binary>>)
end
1962
1968
1963
# Fallback: a byte that no earlier clause accepted is dropped and one
# replacement is appended in its place.
defp do_replace_invalid(<<_invalid, tail::binary>>, rep, acc) do
  do_replace_invalid(tail, rep, <<acc::binary, rep::binary>>)
end
1966
1972
1973
# Input exhausted: the accumulator is the result.
defp do_replace_invalid("", _rep, acc) do
  acc
end
1968
1975
1969
- # bounds-checking truncated code points for overlong encodings
1970
- defp replace_invalid_ii_of_iii ( tcp , rep ) when tcp >= 32 and tcp <= 863 , do: rep
1971
- defp replace_invalid_ii_of_iii ( tcp , rep ) when tcp >= 896 and tcp <= 1023 , do: rep
1972
- defp replace_invalid_ii_of_iii ( _ , rep ) , do: rep <> rep
1973
-
1974
- defp replace_invalid_ii_of_iiii ( tcp , rep ) when tcp >= 16 and tcp <= 271 , do: rep
1975
- defp replace_invalid_ii_of_iiii ( _ , rep ) , do: rep <> rep
1976
-
1977
- defp replace_invalid_iii_of_iiii ( tcp , rep ) when tcp >= 1024 and tcp <= 17407 , do: rep
1978
- defp replace_invalid_iii_of_iiii ( _ , rep ) , do: rep <> rep <> rep
1979
-
1980
1976
@ doc ~S"""
1981
1977
Splits the string into chunks of characters that share a common trait.
1982
1978
0 commit comments