Skip to content

Commit 2e89d8f

Browse files
committed
Restore broken optimization of unicode conversion function
When used to convert from `latin1` to `utf8` or vice versa, the `unicode:characters_to_binary/3` function would return the original binary unchanged if it only contained 7-bit ASCII characters. For example: `unicode:characters_to_binary("abc", latin1, utf8)`. To determine whether the input was a binary with only 7-bit characters, the undocumented BIF `unicode:bin_is_7bit/1` is used. In Erlang/OTP 27, this optimization broke because `bin_is_7bit/1` accidentally started to always return `false`. Resolves #10072
1 parent c388a2d commit 2e89d8f

File tree

2 files changed

+33
-7
lines changed

2 files changed

+33
-7
lines changed

erts/emulator/beam/erl_unicode.c

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -885,15 +885,25 @@ static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_4)
885885

886886
BIF_RETTYPE unicode_bin_is_7bit_1(BIF_ALIST_1)
887887
{
888-
Sint need;
889-
if(!is_bitstring(BIF_ARG_1)) {
890-
BIF_RET(am_false);
888+
const byte *temp_alloc = NULL, *bytes;
889+
Uint size;
890+
Eterm ret;
891+
892+
bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &size, &temp_alloc);
893+
if (bytes == NULL) {
894+
BIF_RET(am_false);
891895
}
892-
need = latin1_binary_need(BIF_ARG_1);
893-
if(need >= 0 && aligned_binary_size(BIF_ARG_1) == need) {
894-
BIF_RET(am_true);
896+
897+
ret = am_true;
898+
for (Uint i = 0; i < size; i++) {
899+
if (bytes[i] & ((byte) 0x80)) {
900+
ret = am_false;
901+
break;
902+
}
895903
}
896-
BIF_RET(am_false);
904+
905+
erts_free_aligned_binary_bytes(temp_alloc);
906+
BIF_RET(ret);
897907
}
898908

899909
static int is_valid_utf8(Eterm orig_bin)

lib/stdlib/test/unicode_SUITE.erl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
ex_binaries_errors_utf32_big/1,
3737
normalize/1,
3838
huge_illegal_code_points/1,
39+
bin_is_7bit/1,
3940
error_info/1
4041
]).
4142

@@ -51,6 +52,7 @@ all() ->
5152
normalize,
5253
{group,binaries_errors},
5354
huge_illegal_code_points,
55+
bin_is_7bit,
5456
error_info].
5557

5658
groups() ->
@@ -1408,6 +1410,20 @@ make_unaligned(Bin0) when is_binary(Bin0) ->
14081410
<<0:3,Bin:Sz/binary,31:5>> = id(Bin1),
14091411
Bin.
14101412

1413+
bin_is_7bit(_Config) ->
1414+
%% This BIF is undocumented, but the unicode module uses it to
1415+
%% avoid unnecessary conversion work.
1416+
true = do_bin_is_7bit(~""),
1417+
true = do_bin_is_7bit(~"abc"),
1418+
false = do_bin_is_7bit(~"björn"),
1419+
false = unicode:bin_is_7bit(<<0:7>>),
1420+
ok.
1421+
1422+
do_bin_is_7bit(Bin) ->
1423+
Res = unicode:bin_is_7bit(Bin),
1424+
Res = unicode:bin_is_7bit(make_unaligned(Bin)),
1425+
Res.
1426+
14111427
error_info(_Config) ->
14121428
L = [{characters_to_binary, [abc]},
14131429
{characters_to_binary,[abc, utf8]},

0 commit comments

Comments
 (0)