Skip to content

Commit b18c5b5

Browse files
committed
Redesign combining table
1 parent d05ed9e commit b18c5b5

File tree

5 files changed

+14305
-13986
lines changed

5 files changed

+14305
-13986
lines changed

data/Manifest.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# This file is machine-generated - editing it directly is not advised
22

3-
julia_version = "1.10.5"
3+
julia_version = "1.11.2"
44
manifest_format = "2.0"
55
project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"
66

77
[[deps.OffsetArrays]]
8-
git-tree-sha1 = "1a27764e945a152f7ca7efa04de513d473e9542e"
8+
git-tree-sha1 = "5e1897147d1ff8d98883cda2be2187dcf57d8f0c"
99
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10-
version = "1.14.1"
10+
version = "1.15.0"
1111

1212
[deps.OffsetArrays.extensions]
1313
OffsetArraysAdaptExt = "Adapt"

data/data_generator.jl

Lines changed: 91 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -256,82 +256,99 @@ end
256256
# decompressed on the C side at runtime.
257257

258258
# Inverse decomposition mapping tables for combining two characters into a single one.
259-
comb1st_indices = Dict{UInt32,Int}()
260-
comb1st_indices_sorted_keys = Origin(0)(UInt32[])
261-
comb2nd_indices = Dict{UInt32,Int}()
262-
comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
263-
comb2nd_indices_nonbasic = Set{UInt32}()
264-
comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
259+
comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
260+
comb_issecond = Set{UInt32}()
265261
for char in char_props
266262
if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
267263
length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
268264
char_hash[char.decomp_mapping[1]].combining_class == 0 &&
269265
char.code ∉ exclusions
270266
dm0 = char.decomp_mapping[1]
271267
dm1 = char.decomp_mapping[2]
272-
if !haskey(comb1st_indices, dm0)
273-
comb1st_indices[dm0] = length(comb1st_indices)
274-
push!(comb1st_indices_sorted_keys, dm0)
275-
push!(comb_array, Dict{Int,UInt32}())
276-
@assert length(comb1st_indices) == length(comb_array)
277-
end
278-
if !haskey(comb2nd_indices, dm1)
279-
push!(comb2nd_indices_sorted_keys, dm1)
280-
comb2nd_indices[dm1] = length(comb2nd_indices)
281-
end
282-
@assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
283-
comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
284-
if char.code > 0xFFFF
285-
push!(comb2nd_indices_nonbasic, dm1)
268+
if !haskey(comb_mapping, dm0)
269+
comb_mapping[dm0] = Dict{UInt32, UInt32}()
286270
end
271+
comb_mapping[dm0][dm1] = char.code
272+
push!(comb_issecond, dm1)
287273
end
288274
end
289275

290-
comb_indices = Dict{UInt32,Int}()
291-
comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
292-
comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
276+
comb_index = Dict{UInt32, UInt32}()
277+
comb_length = Dict{UInt32, UInt32}()
293278
let
294-
cumoffset = 0
295-
for dm0 in comb1st_indices_sorted_keys
296-
index = comb1st_indices[dm0]
297-
first = nothing
298-
last = nothing
299-
offset = 0
300-
for b in eachindex(comb2nd_indices_sorted_keys)
301-
dm1 = comb2nd_indices_sorted_keys[b]
302-
if haskey(comb_array[index], b)
303-
if isnothing(first)
304-
first = offset
305-
end
306-
last = offset
307-
if dm1 in comb2nd_indices_nonbasic
308-
last += 1
309-
end
310-
end
311-
offset += 1
312-
if dm1 in comb2nd_indices_nonbasic
313-
offset += 1
314-
end
315-
end
316-
comb1st_indices_firstoffsets[index] = first
317-
comb1st_indices_lastoffsets[index] = last
318-
@assert !haskey(comb_indices, dm0)
319-
comb_indices[dm0] = cumoffset
320-
cumoffset += last - first + 1 + 2
321-
end
322-
323-
offset = 0
324-
for dm1 in comb2nd_indices_sorted_keys
325-
@assert !haskey(comb_indices, dm1)
326-
comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
327-
@assert comb2nd_indices[dm1] + offset <= 0x4000
328-
if dm1 in comb2nd_indices_nonbasic
329-
comb_indices[dm1] |= 0x4000
330-
offset += 1
331-
end
279+
ind = 0
280+
for dm0 in sort!(collect(keys(comb_mapping)))
281+
comb_index[dm0] = ind
282+
len = length(comb_mapping[dm0])
283+
comb_length[dm0] = len
284+
ind += len
332285
end
333286
end
334287

288+
# comb1st_indices = Dict{UInt32,Int}()
289+
# comb1st_indices_sorted_keys = Origin(0)(UInt32[])
290+
# comb2nd_indices = Dict{UInt32,Int}()
291+
# comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
292+
# comb2nd_indices_length(code::UInt32) = code < 0x8000 ? 1 : 2
293+
# comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
294+
# for (i,char) in enumerate(char_props)
295+
# if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
296+
# length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
297+
# char_hash[char.decomp_mapping[1]].combining_class == 0 &&
298+
# char.code ∉ exclusions
299+
# dm0 = char.decomp_mapping[1]
300+
# dm1 = char.decomp_mapping[2]
301+
# if !haskey(comb1st_indices, dm0)
302+
# comb1st_indices[dm0] = length(comb1st_indices)
303+
# push!(comb1st_indices_sorted_keys, dm0)
304+
# push!(comb_array, Dict{Int,UInt32}())
305+
# @assert length(comb1st_indices) == length(comb_array)
306+
# end
307+
# if !haskey(comb2nd_indices, dm1)
308+
# push!(comb2nd_indices_sorted_keys, dm1)
309+
# comb2nd_indices[dm1] = length(comb2nd_indices)
310+
# end
311+
# @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
312+
# comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
313+
# end
314+
# end
315+
#
316+
# comb_indices = Dict{UInt32,Int}()
317+
# comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
318+
# comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
319+
# let
320+
# cumoffset = 0
321+
# for dm0 in comb1st_indices_sorted_keys
322+
# index = comb1st_indices[dm0]
323+
# first = nothing
324+
# last = nothing
325+
# offset = 0
326+
# for b in eachindex(comb2nd_indices_sorted_keys)
327+
# dm1 = comb2nd_indices_sorted_keys[b]
328+
# if haskey(comb_array[index], b)
329+
# if isnothing(first)
330+
# first = offset
331+
# end
332+
# last = offset + comb2nd_indices_length(dm1) - 1
333+
# end
334+
# offset += comb2nd_indices_length(dm1)
335+
# end
336+
# comb1st_indices_firstoffsets[index] = first
337+
# comb1st_indices_lastoffsets[index] = last
338+
# @assert !haskey(comb_indices, dm0)
339+
# comb_indices[dm0] = 0x4000 | cumoffset
340+
# cumoffset += last - first + 1 + 2
341+
# end
342+
#
343+
# offset = 0
344+
# for dm1 in comb2nd_indices_sorted_keys
345+
# @assert !haskey(comb_indices, dm1)
346+
# comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
347+
# @assert comb2nd_indices[dm1] + offset < 0x4000
348+
# offset += comb2nd_indices_length(dm1) - 1
349+
# end
350+
# end
351+
335352
utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq))
336353

337354
# Utility for packing all UTF-16 encoded sequences into one big array
@@ -391,7 +408,9 @@ function char_table_properties!(sequences, char)
391408
uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
392409
lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
393410
titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
394-
comb_index = get(comb_indices, code, typemax(UInt16)),
411+
comb_index = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
412+
comb_length = get(comb_length, code, 0),
413+
comb_issecond = code in comb_issecond,
395414
bidi_mirrored = char.bidi_mirrored,
396415
comp_exclusion = code in exclusions || code in excl_version,
397416
ignorable = code in ignorable,
@@ -473,8 +492,7 @@ function c_uint16(seqindex)
473492
end
474493

475494
function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
476-
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
477-
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
495+
comb_index, comb_length, comb_issecond)
478496
print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
479497
write_c_index_array(io, sequences.storage, 8)
480498
print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
@@ -484,7 +502,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
484502

485503
print(io, """
486504
static const utf8proc_property_t utf8proc_properties[] = {
487-
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
505+
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
488506
""")
489507
for prop in deduplicated_props
490508
print(io, " {",
@@ -498,6 +516,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
498516
c_uint16(prop.lowercase_seqindex), ", ",
499517
c_uint16(prop.titlecase_seqindex), ", ",
500518
c_uint16(prop.comb_index), ", ",
519+
c_uint16(prop.comb_length), ", ",
520+
prop.comb_issecond, ", ",
501521
prop.bidi_mirrored, ", ",
502522
prop.comp_exclusion, ", ",
503523
prop.ignorable, ", ",
@@ -512,41 +532,18 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
512532
end
513533
print(io, "};\n\n")
514534

515-
print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ")
516-
i = 0
517-
for a in eachindex(comb1st_indices_firstoffsets)
518-
offset = 0
519-
print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
520-
for b in eachindex(comb2nd_indices_sorted_keys)
521-
dm1 = comb2nd_indices_sorted_keys[b]
522-
if offset > comb1st_indices_lastoffsets[a]
523-
break
524-
end
525-
if offset >= comb1st_indices_firstoffsets[a]
526-
i += 1
527-
if i == 8
528-
i = 0
529-
print(io, "\n ")
530-
end
531-
v = get(comb_array[a], b, 0)
532-
if dm1 in comb2nd_indices_nonbasic
533-
print(io, (v & 0xFFFF0000) >> 16, ", ")
534-
end
535-
print(io, v & 0xFFFF, ", ")
536-
end
537-
offset += 1
538-
if dm1 in comb2nd_indices_nonbasic
539-
offset += 1
540-
end
535+
print(io, "static const utf8proc_uint32_t utf8proc_combinations[][2] = {\n")
536+
for dm0 in sort!(collect(keys(comb_mapping)))
537+
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
538+
code = comb_mapping[dm0][dm1]
539+
print(io, " { ", dm1, ", ", code, " },\n")
541540
end
542-
print(io, "\n")
543541
end
544542
print(io, "};\n\n")
545543
end
546544

547545

548546
if !isinteractive()
549547
print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
550-
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
551-
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
548+
comb_index, comb_length, comb_issecond)
552549
end

utf8proc.c

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
651651
utf8proc_propval_t max_combining_class = -1;
652652
utf8proc_ssize_t rpos;
653653
utf8proc_ssize_t wpos = 0;
654-
utf8proc_int32_t composition;
655654
for (rpos = 0; rpos < length; rpos++) {
656655
current_char = buffer[rpos];
657656
current_property = unsafe_get_property(current_char);
@@ -685,22 +684,28 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
685684
if (!starter_property) {
686685
starter_property = unsafe_get_property(*starter);
687686
}
688-
if (starter_property->comb_index < 0x8000 &&
689-
current_property->comb_index != UINT16_MAX &&
690-
current_property->comb_index >= 0x8000) {
691-
int sidx = starter_property->comb_index;
692-
int idx = current_property->comb_index & 0x3FFF;
693-
if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) {
694-
idx += sidx + 2 - utf8proc_combinations[sidx];
695-
if (current_property->comb_index & 0x4000) {
696-
composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
697-
} else
698-
composition = utf8proc_combinations[idx];
699-
700-
if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
701-
!(unsafe_get_property(composition)->comp_exclusion))) {
702-
*starter = composition;
703-
starter_property = NULL;
687+
int idx = starter_property->comb_index;
688+
if (idx < 0x3FF && current_property->comb_issecond) {
689+
int len = starter_property->comb_length;
690+
utf8proc_int32_t max_second = utf8proc_combinations[idx + len - 1][0];
691+
if (current_char <= max_second) {
692+
// TODO: binary search? arithmetic search?
693+
for (int off = 0; off < len; ++off) {
694+
utf8proc_int32_t second = utf8proc_combinations[idx + off][0];
695+
if (current_char < second) {
696+
/* not found */
697+
break;
698+
}
699+
if (current_char == second) {
700+
/* found */
701+
utf8proc_int32_t composition = utf8proc_combinations[idx + off][1];
702+
*starter = composition;
703+
starter_property = NULL;
704+
break;
705+
}
706+
}
707+
if (starter_property == NULL) {
708+
/* found */
704709
continue;
705710
}
706711
}

utf8proc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,9 @@ typedef struct utf8proc_property_struct {
255255
utf8proc_uint16_t uppercase_seqindex;
256256
utf8proc_uint16_t lowercase_seqindex;
257257
utf8proc_uint16_t titlecase_seqindex;
258-
utf8proc_uint16_t comb_index;
258+
utf8proc_uint16_t comb_index:10;
259+
utf8proc_uint16_t comb_length:5;
260+
utf8proc_uint16_t comb_issecond:1;
259261
unsigned bidi_mirrored:1;
260262
unsigned comp_exclusion:1;
261263
/**

0 commit comments

Comments
 (0)