Skip to content

Commit c0fbea8

Browse files
author
José Valim
committed
Merge pull request #909 from devinus/unicode-split
Unicode split
2 parents c3e5054 + 1963585 commit c0fbea8

File tree

3 files changed

+83
-47
lines changed

3 files changed

+83
-47
lines changed

lib/elixir/lib/string.ex

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -143,29 +143,35 @@ defmodule String do
143143
The string is split into as many parts as possible by
144144
default, unless the `global` option is set to false.
145145
If a pattern is not specified, the string is split on
146-
whitespace occurrences.
146+
Unicode whitespace occurrences with leading and trailing
147+
whitespace ignored.
147148
148149
It returns a list with the original string if the pattern
149150
can't be matched.
150151
151152
## Examples
152153
153-
String.split("a,b,c", ",") #=> ["a", "b", "c"]
154-
String.split("a,b,c", ",", global: false) #=> ["a", "b,c"]
154+
String.split("foo bar") #=> ["foo", "bar"]
155+
String.split("foo" <> <<194,133>> <> "bar") #=> ["foo", "bar"]
156+
String.split(" foo bar ") #=> ["foo", "bar"]
157+
158+
String.split("a,b,c", ",") #=> ["a", "b", "c"]
159+
String.split("a,b,c", ",", global: false) #=> ["a", "b,c"]
155160
156-
String.split("foo bar") #=> ["foo", "bar"]
157161
String.split("1,2 3,4", [" ", ","]) #=> ["1", "2", "3", "4"]
158162
159163
String.split("a,b,c", %r{,}) #=> ["a", "b", "c"]
160-
String.split("a,b,c", %r{,}, global: false) #=> ["a", "b,c"]
161-
String.split("a,b", %r{\.}) #=> ["a,b"]
164+
String.split("a,b,c", %r{,}, global: false) #=> ["a", "b,c"]
165+
String.split("a,b", %r{\.}) #=> ["a,b"]
162166
163167
"""
164168
@spec split(t) :: [t]
165169
@spec split(t, t | [t] | Regex.t) :: [t]
166170
@spec split(t, t | [t] | Regex.t, Keyword.t) :: [t]
167171

168-
def split(binary, pattern // " ", options // [])
172+
defdelegate split(binary), to: String.Unicode
173+
174+
def split(binary, pattern, options // [])
169175

170176
def split(binary, pattern, options) when is_regex(pattern) do
171177
Regex.split(pattern, binary, global: options[:global])
@@ -225,8 +231,8 @@ defmodule String do
225231
end
226232

227233
@doc """
228-
Returns a string where trailing whitespace characters
229-
and new line have been removed.
234+
Returns a string where trailing Unicode whitespace
235+
has been removed.
230236
231237
## Examples
232238
@@ -272,8 +278,8 @@ defmodule String do
272278
end
273279

274280
@doc """
275-
Returns a string where leading whitespace characters
276-
have been removed.
281+
Returns a string where leading Unicode whitespace
282+
has been removed.
277283
278284
## Examples
279285
@@ -302,8 +308,8 @@ defmodule String do
302308
end
303309

304310
@doc """
305-
Returns a string where leading/trailing whitespace
306-
and new line characters have been removed.
311+
Returns a string where leading/trailing Unicode whitespace
312+
has been removed.
307313
308314
## Examples
309315

lib/elixir/priv/unicode.ex

Lines changed: 57 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -57,60 +57,54 @@ defmodule String.Unicode do
5757

5858
# Downcase
5959

60+
def downcase(""), do: ""
61+
6062
lc { codepoint, _upper, lower, _title } inlist codes, lower && lower != codepoint do
61-
def downcase(unquote(codepoint) <> t) do
62-
unquote(lower) <> downcase(t)
63+
def downcase(unquote(codepoint) <> rest) do
64+
unquote(lower) <> downcase(rest)
6365
end
6466
end
6567

66-
def downcase(<< h, t :: binary >>) do
67-
<< h >> <> downcase(t)
68-
end
69-
70-
def downcase(<< >>) do
71-
<< >>
68+
def downcase(<< char, rest :: binary >>) do
69+
<< char >> <> downcase(rest)
7270
end
7371

7472
# Upcase
7573

74+
def upcase(""), do: ""
75+
7676
lc { codepoint, upper, _lower, _title } inlist codes, upper && upper != codepoint do
77-
def upcase(unquote(codepoint) <> t) do
78-
unquote(upper) <> upcase(t)
77+
def upcase(unquote(codepoint) <> rest) do
78+
unquote(upper) <> upcase(rest)
7979
end
8080
end
8181

82-
def upcase(<< h, t :: binary >>) do
83-
<< h >> <> upcase(t)
84-
end
85-
86-
def upcase(<< >>) do
87-
<< >>
82+
def upcase(<< char, rest :: binary >>) do
83+
<< char >> <> upcase(rest)
8884
end
8985

9086
# Titlecase once
9187

88+
def titlecase_once(""), do: { "", "" }
89+
9290
lc { codepoint, _upper, _lower, title } inlist codes, title && title != codepoint do
93-
def titlecase_once(unquote(codepoint) <> t) do
94-
{ unquote(title), t }
91+
def titlecase_once(unquote(codepoint) <> rest) do
92+
{ unquote(title), rest }
9593
end
9694
end
9795

98-
def titlecase_once(<< h, t :: binary >>) do
99-
{ <<h>>, t }
100-
end
101-
102-
def titlecase_once(<< >>) do
103-
{ <<>>, <<>> }
96+
def titlecase_once(<< char, rest :: binary >>) do
97+
{ << char >>, rest }
10498
end
10599

106100
# Strip
107101

108102
def lstrip(""), do: ""
109103

110-
lc char inlist whitespace do
111-
args = quote do: [unquote(char) <> rest]
112-
exprs = quote do: lstrip(rest)
113-
def :lstrip, args, [], do: exprs
104+
lc codepoint inlist whitespace do
105+
def lstrip(unquote(codepoint) <> rest) do
106+
lstrip(rest)
107+
end
114108
end
115109

116110
def lstrip(other) when is_binary(other), do: other
@@ -121,18 +115,48 @@ defmodule String.Unicode do
121115
do_rstrip(string, "")
122116
end
123117

124-
lc char inlist whitespace do
125-
defp do_rstrip(unquote(char) <> rest, buffer) do
126-
do_rstrip(rest, unquote(char) <> buffer)
118+
lc codepoint inlist whitespace do
119+
defp do_rstrip(unquote(codepoint) <> rest, buffer) do
120+
do_rstrip(rest, unquote(codepoint) <> buffer)
127121
end
128122
end
129123

130-
defp do_rstrip(<< char, string :: binary >>, buffer) do
131-
<< buffer :: binary, char, do_rstrip(string, "") :: binary >>
124+
defp do_rstrip(<< char, rest :: binary >>, buffer) do
125+
<< buffer :: binary, char, do_rstrip(rest, "") :: binary >>
132126
end
133127

134128
defp do_rstrip(<<>>, _), do: <<>>
135129

130+
# Split
131+
132+
# An empty string contains no fields: return an empty list so the result
# honors the `[t]` spec and matches what do_split/3 yields for
# whitespace-only input.
def split(""), do: []
133+
134+
def split(string) when is_binary(string) do
135+
:lists.reverse do_split(string, "", [])
136+
end
137+
138+
lc codepoint inlist whitespace do
139+
defp do_split(unquote(codepoint) <> rest, buffer, acc) do
140+
if buffer != "" do
141+
do_split(rest, "", [buffer | acc])
142+
else
143+
do_split(rest, buffer, acc)
144+
end
145+
end
146+
end
147+
148+
defp do_split(<< char, rest :: binary >>, buffer, acc) do
149+
do_split(rest, << buffer :: binary, char >>, acc)
150+
end
151+
152+
defp do_split(<<>>, buffer, acc) do
153+
if buffer != "" do
154+
[buffer | acc]
155+
else
156+
acc
157+
end
158+
end
159+
136160
# Graphemes
137161

138162
lc codepoints inlist seqs do

lib/elixir/test/elixir/string_test.exs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,14 @@ defmodule StringTest do
1717

1818
test :split do
1919
assert String.split("foo bar") == ["foo", "bar"]
20-
assert String.split("a,b,c", ",") == ["a", "b", "c"]
20+
assert String.split(" foo bar") == ["foo", "bar"]
21+
assert String.split("foo bar ") == ["foo", "bar"]
22+
assert String.split(" foo bar ") == ["foo", "bar"]
23+
assert String.split("foo\t\n\v\f\r\sbar\n") == ["foo", "bar"]
24+
assert String.split("foo" <> <<31>> <> "bar") == ["foo", "bar"]
25+
assert String.split("foo" <> <<194,133>> <> "bar") == ["foo", "bar"]
2126

27+
assert String.split("a,b,c", ",") == ["a", "b", "c"]
2228
assert String.split("a,b", ".") == ["a,b"]
2329
assert String.split("1,2 3,4", [" ", ","]) == ["1", "2", "3", "4"]
2430

0 commit comments

Comments
 (0)