Skip to content

Commit 9717a63

Browse files
author
José Valim
committed
Do not consider subpatterns on Regex.split/3
1 parent fae2067 commit 9717a63

File tree

6 files changed

+78
-49
lines changed

6 files changed

+78
-49
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* [Kernel] Correctly parse unary/binary operators regardless of number of spaces
1616
* [Kernel] Ensure private functions are not exported
1717
* [Protocol] Do not expose protocol convention on `assert_impl!/2`
18+
* [Regex] Do not consider subpatterns on `Regex.split/3`
1819
* [Stream] Implement the Inspect protocol for Streams so we do not leak the Stream representation
1920

2021
* Soft deprecations (no warnings emitted)

lib/elixir/lib/regex.ex

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ defmodule Regex do
321321
## Options
322322
323323
* `:parts` - when specified, splits the string into the given number of
324-
parts. If not specified, `:parts` is defaulted to `:infinity`, which will
324+
parts. If not specified, `:parts` defaults to `:infinity`, which will
325325
split the string into the maximum number of parts possible based on the
326326
given pattern.
327327
@@ -348,21 +348,44 @@ defmodule Regex do
348348

349349
def split(regex, string, options \\ [])
350350

351+
def split(%Regex{}, "", _options), do: [""]
352+
351353
def split(%Regex{re_pattern: compiled}, string, options) when is_binary(string) do
352-
parts = Keyword.get(options, :parts, :infinity)
353-
opts = [return: :binary, parts: zero_to_infinity(parts)]
354-
splits = :re.split(string, compiled, opts)
354+
case :re.run(string, compiled, [:global, capture: :first]) do
355+
{:match, matches} ->
356+
do_split(matches, string, 0,
357+
parts_to_index(Keyword.get(options, :parts, :infinity)),
358+
Keyword.get(options, :trim, false))
359+
:nomatch ->
360+
[string]
361+
end
362+
end
363+
364+
defp parts_to_index(:infinity), do: 0
365+
defp parts_to_index(n) when is_integer(n) and n > 0, do: n
366+
367+
defp do_split(_, "", _index, _counter, true), do: []
368+
defp do_split(_, string, _index, 1, _trim), do: [string]
369+
defp do_split([], string, _index, _counter, _trim), do: [string]
355370

356-
if Keyword.get(options, :trim, false) do
357-
for split <- splits, split != "", do: split
371+
defp do_split([[{0, 0}]|t], string, index, counter, trim) do
372+
do_split(t, string, index, counter, trim)
373+
end
374+
375+
defp do_split([[{pos, length}]|t], string, index, counter, trim) do
376+
first = pos - index
377+
last = first + length
378+
379+
head = binary_part(string, 0, first)
380+
tail = binary_part(string, last, byte_size(string) - last)
381+
382+
if trim and head == "" do
383+
do_split(t, tail, pos + length, counter, trim)
358384
else
359-
splits
385+
[head|do_split(t, tail, pos + length, counter - 1, trim)]
360386
end
361387
end
362388

363-
defp zero_to_infinity(0), do: :infinity
364-
defp zero_to_infinity(n), do: n
365-
366389
@doc ~S"""
367390
Receives a regex, a binary and a replacement, returns a new
368391
binary where all matches are replaced by replacement.

lib/elixir/lib/string.ex

Lines changed: 30 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -227,55 +227,53 @@ defmodule String do
227227
"""
228228
@spec split(t, t | [t] | Regex.t) :: [t]
229229
@spec split(t, t | [t] | Regex.t, Keyword.t) :: [t]
230-
def split(binary, pattern, options \\ [])
230+
def split(string, pattern, options \\ [])
231231

232-
def split("", _pattern, _options), do: [""]
233-
234-
def split(binary, "", options) do
235-
index =
236-
case Keyword.get(options, :parts, :infinity) do
237-
num when is_number(num) and num > 0 -> num
238-
_ -> 0
239-
end
240-
split_codepoints(binary, index - 1, Keyword.get(options, :trim, false))
232+
def split(string, "", options) do
233+
parts = Keyword.get(options, :parts, :infinity)
234+
split_codepoints(string, parts_to_index(parts), Keyword.get(options, :trim, false))
241235
end
242236

243-
def split(binary, pattern, options) do
237+
def split(string, pattern, options) do
244238
if Regex.regex?(pattern) do
245-
Regex.split(pattern, binary, options)
239+
Regex.split(pattern, string, options)
246240
else
247-
splits =
248-
case Keyword.get(options, :parts, :infinity) do
249-
num when is_number(num) and num > 0 ->
250-
split_parts(binary, pattern, num - 1)
251-
_ ->
252-
:binary.split(binary, pattern, [:global])
253-
end
254-
255-
if Keyword.get(options, :trim, false) do
256-
for split <- splits, split != "", do: split
241+
parts = Keyword.get(options, :parts, :infinity)
242+
trim = Keyword.get(options, :trim, false)
243+
if parts == :infinity and trim == false do
244+
:binary.split(string, pattern, [:global])
257245
else
258-
splits
246+
split_parts(string, pattern, parts_to_index(parts), trim)
259247
end
260248
end
261249
end
262250

263-
defp split_codepoints(binary, 0, _trim), do: [binary]
251+
defp parts_to_index(:infinity), do: 0
252+
defp parts_to_index(n) when is_integer(n) and n > 0, do: n
253+
254+
defp split_codepoints(binary, 1, _trim), do: [binary]
264255
defp split_codepoints(<<h :: utf8, t :: binary>>, count, trim),
265256
do: [<<h :: utf8>>|split_codepoints(t, count - 1, trim)]
266257
defp split_codepoints(<<h, t :: binary>>, count, trim),
267258
do: [<<h>>|split_codepoints(t, count - 1, trim)]
268259
defp split_codepoints(<<>>, _, true), do: []
269260
defp split_codepoints(<<>>, _, false), do: [""]
270261

271-
defp split_parts("", _pattern, _num), do: [""]
272-
defp split_parts(binary, pattern, num), do: split_parts(binary, pattern, num, [])
273-
defp split_parts("", _pattern, _num, parts), do: Enum.reverse([""|parts])
274-
defp split_parts(binary, _pattern, 0, parts), do: Enum.reverse([binary|parts])
275-
defp split_parts(binary, pattern, num, parts) do
276-
case :binary.split(binary, pattern) do
277-
[head] -> Enum.reverse([head|parts])
278-
[head, rest] -> split_parts(rest, pattern, num - 1, [head|parts])
262+
defp split_parts("", _pattern, _num, true), do: []
263+
defp split_parts("", _pattern, _num, _trim), do: [""]
264+
defp split_parts(string, _pattern, 1, _trim), do: [string]
265+
defp split_parts(string, pattern, num, trim) do
266+
case :binary.split(string, pattern) do
267+
[""] when trim ->
268+
[]
269+
[head] ->
270+
[head]
271+
[head, tail] ->
272+
if trim and head == "" do
273+
split_parts(tail, pattern, num, trim)
274+
else
275+
[head|split_parts(tail, pattern, num-1, trim)]
276+
end
279277
end
280278
end
281279

lib/elixir/test/elixir/regex_test.exs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,14 +149,16 @@ defmodule RegexTest do
149149

150150
test :split do
151151
assert Regex.split(~r",", "") == [""]
152+
assert Regex.split(~r",", "", trim: true) == [""]
152153
assert Regex.split(~r" ", "foo bar baz") == ["foo", "bar", "baz"]
153-
assert Regex.split(~r" ", "foo bar baz", parts: 0) == ["foo", "bar", "baz"]
154154
assert Regex.split(~r" ", "foo bar baz", parts: :infinity) == ["foo", "bar", "baz"]
155155
assert Regex.split(~r" ", "foo bar baz", parts: 10) == ["foo", "bar", "baz"]
156156
assert Regex.split(~r" ", "foo bar baz", parts: 2) == ["foo", "bar baz"]
157157
assert Regex.split(~r"\s", "foobar") == ["foobar"]
158158
assert Regex.split(~r" ", " foo bar baz ") == ["", "foo", "bar", "baz", ""]
159159
assert Regex.split(~r" ", " foo bar baz ", trim: true) == ["foo", "bar", "baz"]
160+
assert Regex.split(~r" ", " foo bar baz ", parts: 2) == ["", "foo bar baz "]
161+
assert Regex.split(~r" ", " foo bar baz ", trim: true, parts: 2) == ["foo", "bar baz "]
160162
assert Regex.split(~r"=", "key=") == ["key", ""]
161163
assert Regex.split(~r"=", "=value") == ["", "value"]
162164
end

lib/elixir/test/elixir/string_test.exs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ defmodule StringTest do
2323
end
2424

2525
test :split do
26-
assert String.split("") == [""]
26+
assert String.split("") == []
2727
assert String.split("foo bar") == ["foo", "bar"]
2828
assert String.split(" foo bar") == ["foo", "bar"]
2929
assert String.split("foo bar ") == ["foo", "bar"]
@@ -32,24 +32,29 @@ defmodule StringTest do
3232
assert String.split("foo" <> <<31>> <> "bar") == ["foo", "bar"]
3333
assert String.split("foo" <> <<194, 133>> <> "bar") == ["foo", "bar"]
3434

35-
assert String.split("", ",") == [""]
3635
assert String.split("a,b,c", ",") == ["a", "b", "c"]
3736
assert String.split("a,b", ".") == ["a,b"]
3837
assert String.split("1,2 3,4", [" ", ","]) == ["1", "2", "3", "4"]
38+
39+
assert String.split("", ",") == [""]
3940
assert String.split(" a b c ", " ") == ["", "a", "b", "c", ""]
41+
assert String.split(" a b c ", " ", parts: :infinity) == ["", "a", "b", "c", ""]
42+
assert String.split(" a b c ", " ", parts: 1) == [" a b c "]
43+
assert String.split(" a b c ", " ", parts: 2) == ["", "a b c "]
4044

45+
assert String.split("", ",", trim: true) == []
4146
assert String.split(" a b c ", " ", trim: true) == ["a", "b", "c"]
42-
assert String.split(" a b c ", " ", trim: true, parts: 0) == ["a", "b", "c"]
4347
assert String.split(" a b c ", " ", trim: true, parts: :infinity) == ["a", "b", "c"]
4448
assert String.split(" a b c ", " ", trim: true, parts: 1) == [" a b c "]
49+
assert String.split(" a b c ", " ", trim: true, parts: 2) == ["a", "b c "]
4550

4651
assert String.split("abé", "") == ["a", "b", "é", ""]
47-
assert String.split("abé", "", parts: 0) == ["a", "b", "é", ""]
52+
assert String.split("abé", "", parts: :infinity) == ["a", "b", "é", ""]
4853
assert String.split("abé", "", parts: 1) == ["abé"]
4954
assert String.split("abé", "", parts: 2) == ["a", "bé"]
5055
assert String.split("abé", "", parts: 10) == ["a", "b", "é", ""]
5156
assert String.split("abé", "", trim: true) == ["a", "b", "é"]
52-
assert String.split("abé", "", trim: true, parts: 0) == ["a", "b", "é"]
57+
assert String.split("abé", "", trim: true, parts: :infinity) == ["a", "b", "é"]
5358
assert String.split("abé", "", trim: true, parts: 2) == ["a", "bé"]
5459
end
5560

lib/elixir/unicode/unicode.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ defmodule String.Unicode do
123123

124124
# Split
125125

126-
def split(""), do: [""]
126+
def split(""), do: []
127127

128128
def split(string) when is_binary(string) do
129129
:lists.reverse do_split(string, "", [])

0 commit comments

Comments
 (0)