Merge pull request #909 from devinus/unicode-split

José Valim · José Valim · commit c0fbea8e9b7e · 2013-04-06T07:32:36.000-07:00
Unicode split
diff --git a/lib/elixir/lib/string.ex b/lib/elixir/lib/string.ex
@@ -143,29 +143,35 @@ defmodule String do
   The string is split into as many parts as possible by
   default, unless the `global` option is set to false.
   If a pattern is not specified, the string is split on
-  whitespace occurrences.
+  Unicode whitespace occurrences with leading and trailing
+  whitespace ignored.
 
   It returns a list with the original string if the pattern
   can't be matched.
 
   ## Examples
 
-      String.split("a,b,c", ",")  #=> ["a", "b", "c"]
-      String.split("a,b,c", ",", global: false)  #=> ["a", "b,c"]
+      String.split("foo bar") #=> ["foo", "bar"]
+      String.split("foo" <> <<194,133>> <> "bar") #=> ["foo", "bar"]
+      String.split(" foo bar ") #=> ["foo", "bar"]
+
+      String.split("a,b,c", ",") #=> ["a", "b", "c"]
+      String.split("a,b,c", ",", global: false) #=> ["a", "b,c"]
 
-      String.split("foo bar")     #=> ["foo", "bar"]
       String.split("1,2 3,4", [" ", ","]) #=> ["1", "2", "3", "4"]
 
       String.split("a,b,c", %r{,}) #=> ["a", "b", "c"]
-      String.split("a,b,c", %r{,}, global: false)  #=> ["a", "b,c"]
-      String.split("a,b", %r{\.})   #=> ["a,b"]
+      String.split("a,b,c", %r{,}, global: false) #=> ["a", "b,c"]
+      String.split("a,b", %r{\.}) #=> ["a,b"]
 
   """
   @spec split(t) :: [t]
   @spec split(t, t | [t] | Regex.t) :: [t]
   @spec split(t, t | [t] | Regex.t, Keyword.t) :: [t]
 
-  def split(binary, pattern // " ", options // [])
+  defdelegate split(binary), to: String.Unicode
+
+  def split(binary, pattern, options // [])
 
   def split(binary, pattern, options) when is_regex(pattern) do
     Regex.split(pattern, binary, global: options[:global])
@@ -225,8 +231,8 @@ defmodule String do
   end
 
   @doc """
-  Returns a string where trailing whitespace characters
-  and new line have been removed.
+  Returns a string where trailing Unicode whitespace
+  has been removed.
 
   ## Examples
 
@@ -272,8 +278,8 @@ defmodule String do
   end
 
   @doc """
-  Returns a string where leading whitespace characters
-  have been removed.
+  Returns a string where leading Unicode whitespace
+  has been removed.
 
   ## Examples
 
@@ -302,8 +308,8 @@ defmodule String do
   end
 
   @doc """
-  Returns a string where leading/trailing whitespace
-  and new line characters have been removed.
+  Returns a string where leading/trailing Unicode whitespace
+  has been removed.
 
   ## Examples
 
diff --git a/lib/elixir/priv/unicode.ex b/lib/elixir/priv/unicode.ex
@@ -57,60 +57,54 @@ defmodule String.Unicode do
 
   # Downcase
 
+  def downcase(""), do: ""
+
   lc { codepoint, _upper, lower, _title } inlist codes, lower && lower != codepoint do
-    def downcase(unquote(codepoint) <> t) do
-      unquote(lower) <> downcase(t)
+    def downcase(unquote(codepoint) <> rest) do
+      unquote(lower) <> downcase(rest)
     end
   end
 
-  def downcase(<< h, t :: binary >>) do
-    << h >> <> downcase(t)
-  end
-
-  def downcase(<< >>) do
-    << >>
+  def downcase(<< char, rest :: binary >>) do
+    << char >> <> downcase(rest)
   end
 
   # Upcase
 
+  def upcase(""), do: ""
+
   lc { codepoint, upper, _lower, _title } inlist codes, upper && upper != codepoint do
-    def upcase(unquote(codepoint) <> t) do
-      unquote(upper) <> upcase(t)
+    def upcase(unquote(codepoint) <> rest) do
+      unquote(upper) <> upcase(rest)
     end
   end
 
-  def upcase(<< h, t :: binary >>) do
-    << h >> <> upcase(t)
-  end
-
-  def upcase(<< >>) do
-    << >>
+  def upcase(<< char, rest :: binary >>) do
+    << char >> <> upcase(rest)
   end
 
   # Titlecase once
 
+  def titlecase_once(""), do: { "", "" }
+
   lc { codepoint, _upper, _lower, title } inlist codes, title && title != codepoint do
-    def titlecase_once(unquote(codepoint) <> t) do
-      { unquote(title), t }
+    def titlecase_once(unquote(codepoint) <> rest) do
+      { unquote(title), rest }
     end
   end
 
-  def titlecase_once(<< h, t :: binary >>) do
-    { <<h>>, t }
-  end
-
-  def titlecase_once(<< >>) do
-    { <<>>, <<>> }
+  def titlecase_once(<< char, rest :: binary >>) do
+    { << char >>, rest }
   end
 
   # Strip
 
   def lstrip(""), do: ""
 
-  lc char inlist whitespace do
-    args  = quote do: [unquote(char) <> rest]
-    exprs = quote do: lstrip(rest)
-    def :lstrip, args, [], do: exprs
+  lc codepoint inlist whitespace do
+    def lstrip(unquote(codepoint) <> rest) do
+      lstrip(rest)
+    end
   end
 
   def lstrip(other) when is_binary(other), do: other
@@ -121,18 +115,48 @@ defmodule String.Unicode do
     do_rstrip(string, "")
   end
 
-  lc char inlist whitespace do
-    defp do_rstrip(unquote(char) <> rest, buffer) do
-      do_rstrip(rest, unquote(char) <> buffer)
+  lc codepoint inlist whitespace do
+    defp do_rstrip(unquote(codepoint) <> rest, buffer) do
+      do_rstrip(rest, unquote(codepoint) <> buffer)
     end
   end
 
-  defp do_rstrip(<< char, string :: binary >>, buffer) do
-    << buffer :: binary, char, do_rstrip(string, "") :: binary >>
+  defp do_rstrip(<< char, rest :: binary >>, buffer) do
+    << buffer :: binary, char, do_rstrip(rest, "") :: binary >>
   end
 
   defp do_rstrip(<<>>, _), do: <<>>
 
+  # Split
+
+  def split(""), do: [] # no words in empty input; a list (never a binary) per @spec split(t) :: [t]
+
+  def split(string) when is_binary(string) do
+    :lists.reverse do_split(string, "", [])
+  end
+
+  lc codepoint inlist whitespace do
+    defp do_split(unquote(codepoint) <> rest, buffer, acc) do
+      if buffer != "" do
+        do_split(rest, "", [buffer | acc])
+      else
+        do_split(rest, buffer, acc)
+      end
+    end
+  end
+
+  defp do_split(<< char, rest :: binary >>, buffer, acc) do
+    do_split(rest, << buffer :: binary, char >>, acc)
+  end
+
+  defp do_split(<<>>, buffer, acc) do
+    if buffer != "" do
+      [buffer | acc]
+    else
+      acc
+    end
+  end
+
   # Graphemes
 
   lc codepoints inlist seqs do
diff --git a/lib/elixir/test/elixir/string_test.exs b/lib/elixir/test/elixir/string_test.exs
@@ -17,8 +17,14 @@ defmodule StringTest do
 
   test :split do
     assert String.split("foo bar") == ["foo", "bar"]
-    assert String.split("a,b,c", ",") == ["a", "b", "c"]
+    assert String.split(" foo bar") == ["foo", "bar"]
+    assert String.split("foo bar ") == ["foo", "bar"]
+    assert String.split(" foo bar ") == ["foo", "bar"]
+    assert String.split("foo\t\n\v\f\r\sbar\n") == ["foo", "bar"]
+    assert String.split("foo" <> <<31>> <> "bar") == ["foo", "bar"]
+    assert String.split("foo" <> <<194,133>> <> "bar") == ["foo", "bar"]
 
+    assert String.split("a,b,c", ",") == ["a", "b", "c"]
     assert String.split("a,b", ".") == ["a,b"]
     assert String.split("1,2 3,4", [" ", ","]) == ["1", "2", "3", "4"]