Skip to content

Commit 6cc6646

Browse files
committed
Improve performance of UnicodeNormalize.canonical_ordering_one
Use array_of_integer.sort! instead of bubble-sort-like algorithm
1 parent 37884c4 commit 6cc6646

File tree

2 files changed

+33
-8
lines changed

2 files changed

+33
-8
lines changed

lib/unicode_normalize/normalize.rb

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,16 +82,22 @@ def self.hangul_comp_one(string)
8282

8383
## Canonical Ordering
8484
def self.canonical_ordering_one(string)
85-
sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
86-
(sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
87-
(0..i).each do |j|
88-
later_class = sorting[j+1].last
89-
if 0<later_class and later_class<sorting[j].last
90-
sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
91-
end
85+
result = ''
86+
unordered = []
87+
chars = string.chars
88+
n = chars.size
89+
chars.each_with_index do |char, i|
90+
ccc = CLASS_TABLE[char]
91+
if ccc == 0
92+
unordered.sort!.each { result << chars[it % n] }
93+
unordered.clear
94+
result << char
95+
else
96+
unordered << ccc * n + i
9297
end
9398
end
94-
return sorting.collect(&:first).join('')
99+
unordered.sort!.each { result << chars[it % n] }
100+
result
95101
end
96102

97103
## Normalization Forms for Patterns (not whole Strings)

test/test_unicode_normalize.rb

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,4 +209,23 @@ def test_us_ascii
209209
assert_equal true, ascii_string.unicode_normalized?(:nfkc)
210210
assert_equal true, ascii_string.unicode_normalized?(:nfkd)
211211
end
212+
213+
def test_canonical_ordering
214+
a = "\u03B1\u0313\u0300\u0345"
215+
a_unordered1 = "\u03B1\u0345\u0313\u0300"
216+
a_unordered2 = "\u03B1\u0313\u0345\u0300"
217+
u1 = "U\u0308\u0304"
218+
u2 = "U\u0304\u0308"
219+
s = "s\u0323\u0307"
220+
s_unordered = "s\u0307\u0323"
221+
o = "\u{1611e}\u{1611e}\u{1611f}"
222+
# Actual cases called through String#unicode_normalize
223+
assert_equal(s + o, UnicodeNormalize.canonical_ordering_one(s_unordered + o))
224+
assert_equal(a[1..], UnicodeNormalize.canonical_ordering_one(a_unordered1[1..]))
225+
assert_equal(a[1..] + o, UnicodeNormalize.canonical_ordering_one(a_unordered2[1..] + o))
226+
# Artificial cases
227+
assert_equal(a + u1 + o + u2 + s, UnicodeNormalize.canonical_ordering_one(a + u1 + o + u2 + s))
228+
assert_equal(s[1..] + a + a, UnicodeNormalize.canonical_ordering_one(s_unordered[1..] + a_unordered1 + a_unordered2))
229+
assert_equal(o + s + u1 + a + o + a + u2 + o, UnicodeNormalize.canonical_ordering_one(o + s_unordered + u1 + a_unordered1 + o + a_unordered2 + u2 + o))
230+
end
212231
end

0 commit comments

Comments
 (0)