@@ -208,12 +208,19 @@ end
208208
209209using Base. Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
210210
# Decompose a single `codepoint` with utf8proc, writing the resulting codepoints into
# `dest` starting at index `offset + 1`.
#
# `options` is the utf8proc option bitmask forwarded unchanged to the C routine. Only
# the `length(dest) - offset` slots after `offset` are offered to utf8proc as output space.
#
# Returns the integer reported by `utf8proc_decompose_char`. A negative result is
# turned into an exception via `utf8proc_error`. The caller in this file treats a
# result larger than the remaining room as a request to grow `dest` and retry.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer)
    room = length(dest) - offset
    # keep `dest` rooted while C holds a raw pointer into its storage
    nwritten = GC.@preserve dest begin
        out = pointer(dest, 1 + offset)
        @ccall utf8proc_decompose_char(codepoint::UInt32, out::Ptr{UInt32}, room::Int, options::Cint, C_NULL::Ptr{Cint})::Int
    end
    if nwritten < 0
        utf8proc_error(nwritten)
    end
    return nwritten
end
216216
# It would be good to have higher-level accessor functions in utf8proc.  Alternatively,
# we could mirror the whole utf8proc_property_t struct in Julia, but that is annoying
# because of the bitfields.
#
# combining_class(uc) returns the canonical combining class of a codepoint as a UInt16,
# read as the second 16-bit field of the record returned by `utf8proc_get_property`.
# Codepoints outside 0x0300:0x10ffff short-circuit to 0x0000 without calling into
# utf8proc.  The lower bound is U+0300 (COMBINING GRAVE ACCENT, class 230), the first
# codepoint with a nonzero combining class; starting the range at U+0301 would
# misreport U+0300 as a non-combining character.
combining_class(uc::Integer) =
    0x000300 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
# Malformed characters have no codepoint, so they are reported as class zero.
combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
223+
217224"""
218225 isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
219226
@@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
225232function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
226233to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
227234
235+ !!! compat "Julia 1.8"
236+ The `isequal_normalized` function was added in Julia 1.8.
237+
228238# Examples
229239
230240For example, the string `"noël"` can be constructed in two canonically equivalent ways
@@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
251261true
252262```
253263"""
254- function isequal_normalized (s1:: AbstractString , s2:: AbstractString ; casefold:: Bool = false , stripmark:: Bool = false , chartransform= identity)
255- function decompose_next_char! (c, state, d, options, s)
256- n = _decompose_char! (c, d, options)
257- if n > length (d) # may be possible in future Unicode versions?
258- n = _decompose_char! (c, resize! (d, n), options)
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
    # One scratch codepoint buffer per string; both start with room for 4 codepoints.
    buf1 = Vector{UInt32}(undef, 4)
    buf2 = Vector{UInt32}(undef, 4)
    # The worker takes `chartransform` positionally and the flags as keywords.
    return _isequal_normalized!(s1, s2, buf1, buf2, chartransform; casefold=casefold, stripmark=stripmark)
end
266+
267+ # like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
268+ function _isequal_normalized! (s1:: AbstractString , s2:: AbstractString ,
269+ d1:: Vector{UInt32} , d2:: Vector{UInt32} , chartransform:: F = identity;
270+ casefold:: Bool = false , stripmark:: Bool = false ) where {F}
# Decompose the next run of characters of `s` into the codepoint buffer `d`.
#
# `state` is the current `(char, next_index)` result of `iterate(s, ...)` and must not
# be `nothing`; `options` is the utf8proc option bitmask passed to `_decompose_char!`.
# Characters are consumed until one whose last decomposed codepoint has combining class
# zero is reached, or `s` is exhausted.  The buffered codepoints are then reordered so
# that runs of combining characters are in ascending combining-class order.
#
# Returns `(1, n, newstate)`: the index of the first valid entry in `d`, the number of
# valid entries, and the iteration state for the next call (`nothing` at end of `s`).
# `n` can be 0 only when `s` ends with characters that all decompose to nothing.
function decompose_next_chars!(state, d, options, s)
    local n
    offset = 0 # number of codepoints already buffered in d during this call
    @inbounds while true
        # read a char and decompose it to d
        c = chartransform(UInt32(state[1]))
        state = iterate(s, state[2])
        if c < 0x80 # fast path for common ASCII case
            n = 1 + offset
            n > length(d) && resize!(d, 2n)
            d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c + 0x20 : c) : c
            break # ASCII characters are all zero combining class
        else
            while true
                n = _decompose_char!(c, d, offset, options) + offset
                if n > length(d)
                    # buffer too small: grow it and decompose the same char again
                    resize!(d, 2n)
                    continue
                end
                break
            end
        end

        # A char may decompose to nothing (utf8proc drops marks when
        # UTF8PROC_STRIPMARK is set).  If nothing is buffered yet, d[n] would be
        # d[0], which must not be read under @inbounds: skip to the next char,
        # or stop with an empty result if the string is exhausted.
        if n == 0
            isnothing(state) && break
            continue
        end

        # decomposed chars must be sorted in ascending order of combining class,
        # which means we need to keep fetching chars until we get to non-combining
        (iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining
        offset = n
    end

    # sort by combining class
    if n < 32 # almost always true
        for j1 in 2:n # insertion sort
            cc = combining_class(d[j1])
            iszero(cc) && continue # don't re-order non-combiners
            for j2 in j1:-1:2
                combining_class(d[j2-1]) ≤ cc && break
                d[j2-1], d[j2] = d[j2], d[j2-1]
            end
        end
    else # avoid n^2 complexity in crazy large-n case
        j = 1
        @views while j < n
            # j₀ is the index of the next zero-class codepoint after j (or n+1 if none)
            j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n + 1 - j)
            sort!(d[j:j₀-1], by=combining_class)
            j = j₀
        end
    end

    # split return statement to help type inference:
    return state === nothing ? (1, n, nothing) : (1, n, state)
end
262322 options = UTF8PROC_DECOMPOSE
263323 casefold && (options |= UTF8PROC_CASEFOLD)
264324 stripmark && (options |= UTF8PROC_STRIPMARK)
265325 i1,i2 = iterate (s1),iterate (s2)
266- d1,d2 = Vector {UInt32} (undef, 4 ), Vector {UInt32} (undef, 4 ) # codepoint buffers
267326 n1 = n2 = 0 # lengths of codepoint buffers
268327 j1 = j2 = 1 # indices in d1, d2
269328 while true
270329 if j1 > n1
271330 i1 === nothing && return i2 === nothing && j2 > n2
272- j1, n1, i1 = decompose_next_char! ( chartransform ( UInt32 (i1[ 1 ])), i1[ 2 ] , d1, options, s1)
331+ j1, n1, i1 = decompose_next_chars! (i1 , d1, options, s1)
273332 end
274333 if j2 > n2
275334 i2 === nothing && return false
276- j2, n2, i2 = decompose_next_char! ( chartransform ( UInt32 (i2[ 1 ])), i2[ 2 ] , d2, options, s2)
335+ j2, n2, i2 = decompose_next_chars! (i2 , d2, options, s2)
277336 end
278337 d1[j1] == d2[j2] || return false
279338 j1 += 1 ; j2 += 1
0 commit comments