Skip to content

Commit 0802014

Browse files
author
José Valim
committed
No longer compile unicode in parallel
Compiling in parallel only bought us 2s (out of 24s) since compiling graphemes is what takes most of the time but it considerably increases memory usage, by about 30%, since both compilation targets are loaded into memory at once.
1 parent bfc3027 commit 0802014

File tree

3 files changed

+177
-176
lines changed

3 files changed

+177
-176
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ unicode: $(UNICODE)
7676
$(UNICODE): lib/elixir/unicode/*
7777
@ echo "==> unicode (compile)";
7878
@ echo "This step can take up to a minute to compile in order to embed the Unicode database"
79-
$(Q) cd lib/elixir && ../../$(ELIXIRC) unicode/graphemes.ex unicode/unicode.ex -o ebin;
79+
$(Q) cd lib/elixir && ../../$(ELIXIRC) unicode/unicode.ex -o ebin;
8080

8181
$(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
8282
$(eval $(call APP_TEMPLATE,eex,EEx))

lib/elixir/unicode/graphemes.ex

Lines changed: 0 additions & 175 deletions
This file was deleted.

lib/elixir/unicode/unicode.ex

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,179 @@ defmodule String.Unicode do
174174
[]
175175
end
176176
end
177+
178+
defmodule String.Graphemes do
179+
@moduledoc false
180+
181+
cluster_path = Path.join(__DIR__, "GraphemeBreakProperty.txt")
182+
regex = %r/(?:^([0-9A-F]+)(?:\.\.([0-9A-F]+))?)\s+;\s(\w+)/m
183+
184+
to_range = fn
185+
first, "" ->
186+
[<< binary_to_integer(first, 16) :: utf8 >>]
187+
first, last ->
188+
range = binary_to_integer(first, 16)..binary_to_integer(last, 16)
189+
Enum.map(range, fn(int) -> << int :: utf8 >> end)
190+
end
191+
192+
cluster = Enum.reduce File.stream!(cluster_path), HashDict.new, fn(line, dict) ->
193+
[ _full, first, last, class ] = Regex.run(regex, line)
194+
195+
# Skip surrogates
196+
if first == "D800" and last == "DFFF" do
197+
dict
198+
else
199+
list = to_range.(first, last)
200+
Dict.update(dict, class, list, &(&1 ++ list))
201+
end
202+
end
203+
204+
# There is no codepoint marked as Prepend by Unicode 6.3.0
205+
if cluster["Prepend"] do
206+
raise "It seems this new unicode version has added Prepend items. " <>
207+
"Please remove this error and uncomment the code below."
208+
end
209+
210+
# Don't break CRLF
211+
def next_grapheme(<< ?\n, ?\r, rest :: binary >>) do
212+
{ "\n\r", rest }
213+
end
214+
215+
# Break on control
216+
lc codepoint inlist cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
217+
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
218+
{ :binary.part(string, 0, unquote(size(codepoint))), rest }
219+
end
220+
end
221+
222+
# Break on Prepend*
223+
# lc codepoint inlist cluster["Prepend"] do
224+
# def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
225+
# next_prepend(rest, string, unquote(size(codepoint)))
226+
# end
227+
# end
228+
229+
# Handle Hangul L
230+
lc codepoint inlist cluster["L"] do
231+
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
232+
next_hangul_l(rest, string, unquote(size(codepoint)))
233+
end
234+
end
235+
236+
# Handle Hangul T
237+
lc codepoint inlist cluster["T"] do
238+
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
239+
next_hangul_t(rest, string, unquote(size(codepoint)))
240+
end
241+
end
242+
243+
# Handle Regional
244+
lc codepoint inlist cluster["Regional_Indicator"] do
245+
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
246+
next_regional(rest, string, unquote(size(codepoint)))
247+
end
248+
end
249+
250+
# Handle extended entries
251+
def next_grapheme(<< cp :: utf8, rest :: binary >> = string) do
252+
next_extend(rest, string, byte_size(<< cp :: utf8 >>))
253+
end
254+
255+
def next_grapheme(<< cp, rest :: binary >>) do
256+
{ <<cp>>, rest }
257+
end
258+
259+
def next_grapheme(<<>>) do
260+
nil
261+
end
262+
263+
# Handle Hangul L
264+
lc codepoint inlist cluster["L"] do
265+
defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
266+
next_hangul_l(rest, string, size + unquote(size(codepoint)))
267+
end
268+
end
269+
270+
lc codepoint inlist cluster["LV"] do
271+
defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
272+
next_hangul_v(rest, string, size + unquote(size(codepoint)))
273+
end
274+
end
275+
276+
lc codepoint inlist cluster["LVT"] do
277+
defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
278+
next_hangul_t(rest, string, size + unquote(size(codepoint)))
279+
end
280+
end
281+
282+
defp next_hangul_l(rest, string, size) do
283+
next_hangul_v(rest, string, size)
284+
end
285+
286+
# Handle Hangul V
287+
lc codepoint inlist cluster["V"] do
288+
defp next_hangul_v(<< unquote(codepoint), rest :: binary >>, string, size) do
289+
next_hangul_v(rest, string, size + unquote(size(codepoint)))
290+
end
291+
end
292+
293+
defp next_hangul_v(rest, string, size) do
294+
next_hangul_t(rest, string, size)
295+
end
296+
297+
# Handle Hangul T
298+
lc codepoint inlist cluster["T"] do
299+
defp next_hangul_t(<< unquote(codepoint), rest :: binary >>, string, size) do
300+
next_hangul_t(rest, string, size + unquote(size(codepoint)))
301+
end
302+
end
303+
304+
defp next_hangul_t(rest, string, size) do
305+
next_extend(rest, string, size)
306+
end
307+
308+
# Handle regional
309+
lc codepoint inlist cluster["Regional_Indicator"] do
310+
defp next_regional(<< unquote(codepoint), rest :: binary >>, string, size) do
311+
next_regional(rest, string, size + unquote(size(codepoint)))
312+
end
313+
end
314+
315+
defp next_regional(rest, string, size) do
316+
next_extend(rest, string, size)
317+
end
318+
319+
# Handle Extend+SpacingMark
320+
lc codepoint inlist cluster["Extend"] ++ cluster["SpacingMark"] do
321+
defp next_extend(<< unquote(codepoint), rest :: binary >>, string, size) do
322+
next_extend(rest, string, size + unquote(size(codepoint)))
323+
end
324+
end
325+
326+
defp next_extend(rest, string, size) do
327+
{ :binary.part(string, 0, size), rest }
328+
end
329+
330+
# Handle Prepend
331+
# lc codepoint inlist cluster["Prepend"] do
332+
# defp next_prepend(<< unquote(codepoint), rest :: binary >>, string, size) do
333+
# next_prepend(rest, string, size + unquote(size(codepoint)))
334+
# end
335+
# end
336+
#
337+
# defp next_prepend(rest, string, size) do
338+
# { :binary.part(string, 0, size), rest }
339+
# end
340+
341+
def graphemes(binary) when is_binary(binary) do
342+
do_graphemes(next_grapheme(binary))
343+
end
344+
345+
defp do_graphemes({ c, rest }) do
346+
[c|do_graphemes(next_grapheme(rest))]
347+
end
348+
349+
defp do_graphemes(nil) do
350+
[]
351+
end
352+
end

0 commit comments

Comments
 (0)