Skip to content

Commit b33dd12

Browse files
lukaszsamson authored and José Valim committed
Fix error when :trim_bom is used with :encoding
Signed-off-by: José Valim <[email protected]>
1 parent c36b2c9 commit b33dd12

File tree

5 files changed

+64
-8
lines changed

5 files changed

+64
-8
lines changed

lib/elixir/lib/file.ex

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1512,7 +1512,8 @@ defmodule File do
15121512
in raw mode for performance reasons. Therefore, Elixir **will** open
15131513
streams in `:raw` mode with the `:read_ahead` option unless an encoding
15141514
is specified. This means any data streamed into the file must be
1515-
converted to `t:iodata/0` type. If you pass `[:utf8]` in the modes parameter,
1515+
converted to `t:iodata/0` type. If you pass e.g. `[encoding: :utf8]`
1516+
or `[encoding: {:utf16, :little}]` in the modes parameter,
15161517
the underlying stream will use `IO.write/2` and the `String.Chars` protocol
15171518
to convert the data. See `IO.binwrite/2` and `IO.write/2`.
15181519
@@ -1524,6 +1525,9 @@ defmodule File do
15241525
If you pass `:trim_bom` in the modes parameter, the stream will
15251526
trim UTF-8, UTF-16 and UTF-32 byte order marks when reading from file.
15261527
1528+
Note that this function does not try to discover the file encoding based
1529+
on the BOM.
1530+
15271531
## Examples
15281532
15291533
# Read in 2048 byte chunks rather than lines

lib/elixir/lib/file/stream.ex

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,24 @@ defmodule File.Stream do
140140
end
141141
end
142142

143-
defp trim_bom(device, raw) do
144-
header = if raw, do: IO.binread(device, 4), else: IO.read(device, 1)
145-
{:ok, new_pos} = :file.position(device, bom_length(header))
143+
defp trim_bom(device, true) do
144+
bom_length = device |> IO.binread(4) |> bom_length()
145+
{:ok, new_pos} = :file.position(device, bom_length)
146146
{device, new_pos}
147147
end
148148

149+
defp trim_bom(device, false) do
150+
# Either we read the BOM in the correct amount or it isn't there
151+
case bom_length(IO.read(device, 1)) do
152+
0 ->
153+
{:ok, _} = :file.position(device, 0)
154+
{device, 0}
155+
156+
_ ->
157+
{device, 1}
158+
end
159+
end
160+
149161
defp bom_length(<<239, 187, 191, _rest::binary>>), do: 3
150162
defp bom_length(<<254, 255, _rest::binary>>), do: 2
151163
defp bom_length(<<255, 254, _rest::binary>>), do: 2

lib/elixir/test/elixir/file_test.exs

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1569,23 +1569,63 @@ defmodule FileTest do
15691569
src = fixture_path("utf8_bom.txt")
15701570

15711571
assert src
1572-
|> File.stream!([:utf8])
1572+
|> File.stream!([{:encoding, :utf8}])
15731573
|> Enum.take(1) == [<<239, 187, 191>> <> "Русский\n"]
15741574

15751575
assert src
1576-
|> File.stream!([:utf8], 1)
1576+
|> File.stream!([{:encoding, :utf8}], 1)
15771577
|> Enum.take(9) == ["\uFEFF", "Р", "у", "с", "с", "к", "и", "й", "\n"]
15781578
end
15791579

15801580
test "trims BOM via option with utf8 encoding" do
15811581
src = fixture_path("utf8_bom.txt")
15821582

15831583
assert src
1584-
|> File.stream!([:utf8, :trim_bom])
1584+
|> File.stream!([{:encoding, :utf8}, :trim_bom])
15851585
|> Enum.take(1) == ["Русский\n"]
15861586

15871587
assert src
1588-
|> File.stream!([:utf8, :trim_bom], 1)
1588+
|> File.stream!([{:encoding, :utf8}, :trim_bom], 1)
1589+
|> Enum.take(8) == ["Р", "у", "с", "с", "к", "и", "й", "\n"]
1590+
end
1591+
1592+
test "keeps BOM with UTF16 BE" do
1593+
src = fixture_path("utf16_be_bom.txt")
1594+
1595+
assert src
1596+
|> File.stream!([{:encoding, {:utf16, :big}}])
1597+
|> Enum.take(1) == ["\uFEFFРусский\n"]
1598+
end
1599+
1600+
test "keeps BOM with UTF16 LE" do
1601+
src = fixture_path("utf16_le_bom.txt")
1602+
1603+
assert src
1604+
|> File.stream!([{:encoding, {:utf16, :little}}])
1605+
|> Enum.take(1) == ["\uFEFFРусский\n"]
1606+
end
1607+
1608+
test "trims BOM via option with utf16 BE encoding" do
1609+
src = fixture_path("utf16_be_bom.txt")
1610+
1611+
assert src
1612+
|> File.stream!([{:encoding, {:utf16, :big}}, :trim_bom])
1613+
|> Enum.take(1) == ["Русский\n"]
1614+
1615+
assert src
1616+
|> File.stream!([{:encoding, {:utf16, :big}}, :trim_bom], 1)
1617+
|> Enum.take(8) == ["Р", "у", "с", "с", "к", "и", "й", "\n"]
1618+
end
1619+
1620+
test "trims BOM via option with utf16 LE encoding" do
1621+
src = fixture_path("utf16_le_bom.txt")
1622+
1623+
assert src
1624+
|> File.stream!([{:encoding, {:utf16, :little}}, :trim_bom])
1625+
|> Enum.take(1) == ["Русский\n"]
1626+
1627+
assert src
1628+
|> File.stream!([{:encoding, {:utf16, :little}}, :trim_bom], 1)
15891629
|> Enum.take(8) == ["Р", "у", "с", "с", "к", "и", "й", "\n"]
15901630
end
15911631

22 Bytes
Binary file not shown.
22 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)