Skip to content

Commit 2eb77e1

Browse files
amuino authored and José Valim committed
New definition of whitespace & breakable whitespace
All whitespace can be removed by `strip` but only breakable whitespace can be used as a delimiter by `split`, with updated docs for String.split/1 Signed-off-by: José Valim <[email protected]>
1 parent 173c57f commit 2eb77e1

File tree

4 files changed

+55
-10
lines changed

4 files changed

+55
-10
lines changed

lib/elixir/lib/string.ex

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,9 +239,11 @@ defmodule String do
239239
def printable?(<<>>), do: true
240240
def printable?(binary) when is_binary(binary), do: false
241241

242-
@doc """
242+
@doc ~S"""
243243
Divides a string into substrings at each Unicode whitespace
244-
occurrence with leading and trailing whitespace ignored.
244+
occurrence with leading and trailing whitespace ignored. Groups
245+
of whitespace are treated as a single occurrence. Divisions do
246+
not occur on non-breaking whitespace.
245247
246248
## Examples
247249
@@ -254,6 +256,9 @@ defmodule String do
254256
iex> String.split(" foo bar ")
255257
["foo", "bar"]
256258
259+
iex> String.split("no\u00a0break")
260+
["no\u00a0break"]
261+
257262
"""
258263
@spec split(t) :: [t]
259264
defdelegate split(binary), to: String.Unicode

lib/elixir/test/elixir/string_test.exs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@ defmodule StringTest do
3333
assert String.split("foo bar ") == ["foo", "bar"]
3434
assert String.split(" foo bar ") == ["foo", "bar"]
3535
assert String.split("foo\t\n\v\f\r\sbar\n") == ["foo", "bar"]
36-
assert String.split("foo" <> <<31>> <> "bar") == ["foo", "bar"]
3736
assert String.split("foo" <> <<194, 133>> <> "bar") == ["foo", "bar"]
37+
# information separators are not considered whitespace
38+
assert String.split("foo\u001Fbar") == ["foo\u001Fbar"]
39+
# no-break space is excluded
40+
# U+00A0 NO-BREAK SPACE is whitespace but not breakable, so split/1 must keep it intact
assert String.split("foo\u00A0bar") == ["foo\u00A0bar"]
41+
assert String.split("foo\u202Fbar") == ["foo\u202Fbar"]
3842

3943
assert String.split("a,b,c", ",") == ["a", "b", "c"]
4044
assert String.split("a,b", ".") == ["a,b"]
@@ -179,14 +183,17 @@ defmodule StringTest do
179183
assert String.rstrip(" abc a") == " abc a"
180184
assert String.rstrip("a abc a\n\n") == "a abc a"
181185
assert String.rstrip("a abc a\t\n\v\f\r\s") == "a abc a"
182-
assert String.rstrip("a abc a " <> <<31>>) == "a abc a"
183186
assert String.rstrip("a abc a" <> <<194, 133>>) == "a abc a"
184187
assert String.rstrip(" abc aa", ?a) == " abc "
185188
assert String.rstrip(" abc __", ?_) == " abc "
186189
assert String.rstrip(" aaaaaaaaa", ?a) == " "
187190
assert String.rstrip("aaaaaaaaaa", ?a) == ""
188191
assert String.rstrip("]]]]]]]]]]", ?]) == ""
189192
assert String.rstrip(" cat 猫猫", ?猫) == " cat "
193+
# information separators are not whitespace
194+
assert String.rstrip("a abc a \u001F") == "a abc a \u001F"
195+
# no-break space
196+
assert String.rstrip("a abc a \u00A0") == "a abc a"
190197
end
191198

192199
test "lstrip" do
@@ -195,10 +202,13 @@ defmodule StringTest do
195202
assert String.lstrip("a abc a") == "a abc a"
196203
assert String.lstrip("\n\na abc a") == "a abc a"
197204
assert String.lstrip("\t\n\v\f\r\sa abc a") == "a abc a"
198-
assert String.lstrip(<<31>> <> " a abc a") == "a abc a"
199205
assert String.lstrip(<<194, 133>> <> "a abc a") == "a abc a"
200206
assert String.lstrip("__ abc _", ?_) == " abc _"
201207
assert String.lstrip("猫猫 cat ", ?猫) == " cat "
208+
# information separators are not whitespace
209+
assert String.lstrip("\u001F a abc a") == <<31>> <> " a abc a"
210+
# no-break space
211+
assert String.lstrip("\u00A0 a abc a") == "a abc a"
202212
end
203213

204214
test "strip" do
@@ -208,6 +218,10 @@ defmodule StringTest do
208218
assert String.strip("a abc a\t\n\v\f\r\s") == "a abc a"
209219
assert String.strip("___ abc ___", ?_) == " abc "
210220
assert String.strip("猫猫猫 cat 猫猫猫", ?猫) == " cat "
221+
# no-break space
222+
assert String.strip("\u00A0a abc a\u00A0") == "a abc a"
223+
# whitespace defined as a range
224+
assert String.strip("\u2008a abc a\u2005") == "a abc a"
211225
end
212226

213227
test "rjust" do

lib/elixir/unicode/WhiteSpace.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
2+
0020 ; White_Space # Zs SPACE
3+
0085 ; White_Space # Cc <control-0085>
4+
00A0 ; White_Space # Zs NO-BREAK SPACE
5+
1680 ; White_Space # Zs OGHAM SPACE MARK
6+
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
7+
2028 ; White_Space # Zl LINE SEPARATOR
8+
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
9+
202F ; White_Space # Zs NARROW NO-BREAK SPACE
10+
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
11+
3000 ; White_Space # Zs IDEOGRAPHIC SPACE

lib/elixir/unicode/unicode.ex

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,26 @@ defmodule String.Unicode do
1212
@moduledoc false
1313
def version, do: {7, 0, 0}
1414

15+
# WhiteSpace.txt is extracted from Unicode's PropList.txt (just the White_Space property)
16+
prop_path = Path.join(__DIR__, "WhiteSpace.txt")
17+
prop_range_line_regex = ~r/\A([0-9A-F]{4})(?:\.\.([0-9A-F]{4}))?/
18+
whitespace = Enum.reduce File.stream!(prop_path), [], fn(line, acc) ->
19+
case Regex.run(prop_range_line_regex, line, capture: :all_but_first) do
20+
[single] ->
21+
[to_binary.(single) | acc]
22+
[first, last] ->
23+
range = String.to_integer(first, 16)..String.to_integer(last, 16)
24+
# Put the expanded range in front of acc so the accumulator is never copied;
# the visible consumers (List.delete/2 and the clause-generating `for`) do not depend on order.
Enum.map(range, &to_binary.(Integer.to_string(&1, 16))) ++ acc
25+
_ ->
26+
acc
27+
end
28+
end
29+
1530
data_path = Path.join(__DIR__, "UnicodeData.txt")
1631

17-
{codes, whitespace} = Enum.reduce File.stream!(data_path), {[], []}, fn(line, {cacc, wacc}) ->
32+
{codes, breakable_whitespace} = Enum.reduce File.stream!(data_path), {[], whitespace}, fn(line, {cacc, wacc}) ->
1833
[codepoint, _name, _category,
19-
_class, bidi, _decomposition,
34+
_class, _bidi, decomposition,
2035
_numeric_1, _numeric_2, _numeric_3,
2136
_bidi_mirror, _unicode_1, _iso,
2237
upper, lower, title] = :binary.split(line, ";", [:global])
@@ -30,8 +45,8 @@ defmodule String.Unicode do
3045
to_binary.(lower),
3146
to_binary.(title)} | cacc],
3247
wacc}
33-
bidi in ["B", "S", "WS"] ->
34-
{cacc, [to_binary.(codepoint) | wacc]}
48+
String.starts_with?(decomposition, "<noBreak>") ->
49+
{cacc, List.delete(wacc, to_binary.(codepoint))}
3550
true ->
3651
{cacc, wacc}
3752
end
@@ -156,7 +171,7 @@ defmodule String.Unicode do
156171
:lists.reverse do_split(string, "", [])
157172
end
158173

159-
for codepoint <- whitespace do
174+
for codepoint <- breakable_whitespace do
160175
defp do_split(unquote(codepoint) <> rest, buffer, acc) do
161176
do_split(rest, "", add_buffer_to_acc(buffer, acc))
162177
end

0 commit comments

Comments
 (0)