@@ -236,8 +236,8 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
236236 width = 1
237237 elseif code == 0x2028 || code == 0x2029
238238 # By definition, should have zero width (on the same line)
239- # 0x002028 '
' category: Zl name: LINE SEPARATOR/
240- # 0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
239+ # 0x002028 '\u2028 ' category: Zl name: LINE SEPARATOR/
240+ # 0x002029 '\u2029 ' category: Zp name: PARAGRAPH SEPARATOR/
241241 width = 0
242242 end
243243
@@ -256,79 +256,33 @@ end
256256# decompressed on the C side at runtime.
257257
258258# Inverse decomposition mapping tables for combining two characters into a single one.
259- comb1st_indices = Dict {UInt32,Int} ()
260- comb1st_indices_sorted_keys = Origin (0 )(UInt32[])
261- comb2nd_indices = Dict {UInt32,Int} ()
262- comb2nd_indices_sorted_keys = Origin (0 )(UInt32[])
263- comb2nd_indices_nonbasic = Set {UInt32} ()
264- comb_array = Origin (0 )(Vector {Dict{Int,UInt32}} ())
# Layout of the two containers filled by the loop below:
#   comb_mapping[first][second] = code point of the character whose two-element
#                                 decomposition mapping is (first, second)
#   comb_issecond               = every code point that occurs as the second
#                                 element of such a pair
259+ comb_mapping = Dict {UInt32, Dict{UInt32, UInt32}} ()
260+ comb_issecond = Set {UInt32} ()
265261for char in char_props
262+ # What happens with decompositions that are longer than 2?
# NOTE(review): if a missing decomp_type marks a canonical mapping, UAX #44 states
# canonical mappings are never longer than two code points, so longer mappings
# would only be compatibility ones, which this filter rejects anyway -- confirm.
#
# A character contributes a pair only when all of the following hold:
#   * it has no decomp_type but does have a decomp_mapping of exactly 2 elements,
#   * the first element is present in char_hash with combining_class == 0,
#   * its code is in neither `exclusions` nor `excl_version`.
266263 if isnothing (char. decomp_type) && ! isnothing (char. decomp_mapping) &&
267264 length (char. decomp_mapping) == 2 && ! isnothing (char_hash[char. decomp_mapping[1 ]]) &&
268265 char_hash[char. decomp_mapping[1 ]]. combining_class == 0 &&
269- char. code ∉ exclusions
266+ ( char. code ∉ exclusions && char . code ∉ excl_version)
270267 dm0 = char. decomp_mapping[1 ]
271268 dm1 = char. decomp_mapping[2 ]
272- if ! haskey (comb1st_indices, dm0)
273- comb1st_indices[dm0] = length (comb1st_indices)
274- push! (comb1st_indices_sorted_keys, dm0)
275- push! (comb_array, Dict {Int,UInt32} ())
276- @assert length (comb1st_indices) == length (comb_array)
277- end
278- if ! haskey (comb2nd_indices, dm1)
279- push! (comb2nd_indices_sorted_keys, dm1)
280- comb2nd_indices[dm1] = length (comb2nd_indices)
281- end
282- @assert ! haskey (comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
283- comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char. code
284- if char. code > 0xFFFF
285- push! (comb2nd_indices_nonbasic, dm1)
# Create the inner dictionary the first time dm0 is seen as a first element.
269+ if ! haskey (comb_mapping, dm0)
270+ comb_mapping[dm0] = Dict {UInt32, UInt32} ()
286271 end
# NOTE(review): this assignment silently overwrites an existing (dm0, dm1) entry;
# the removed implementation asserted that each pair was seen only once.
272+ comb_mapping[dm0][dm1] = char. code
273+ push! (comb_issecond, dm1)
287274 end
288275end
289276
290- comb_indices = Dict {UInt32,Int} ()
291- comb1st_indices_lastoffsets = Origin (0 )(zeros (Int, length (comb1st_indices)))
292- comb1st_indices_firstoffsets = Origin (0 )(zeros (Int, length (comb1st_indices)))
# For every first character dm0 (a key of comb_mapping):
#   comb_index[dm0]  = running total of the entry counts of all smaller first
#                      characters, i.e. the 0-based position of dm0's first entry
#                      when the entries are laid out in sorted-dm0 order
#   comb_length[dm0] = number of second characters recorded for dm0
277+ comb_index = Dict {UInt32, UInt32} ()
278+ comb_length = Dict {UInt32, UInt32} ()
293279let
294- cumoffset = 0
295- for dm0 in comb1st_indices_sorted_keys
296- index = comb1st_indices[dm0]
297- first = nothing
298- last = nothing
299- offset = 0
300- for b in eachindex (comb2nd_indices_sorted_keys)
301- dm1 = comb2nd_indices_sorted_keys[b]
302- if haskey (comb_array[index], b)
303- if isnothing (first)
304- first = offset
305- end
306- last = offset
307- if dm1 in comb2nd_indices_nonbasic
308- last += 1
309- end
310- end
311- offset += 1
312- if dm1 in comb2nd_indices_nonbasic
313- offset += 1
314- end
315- end
316- comb1st_indices_firstoffsets[index] = first
317- comb1st_indices_lastoffsets[index] = last
318- @assert ! haskey (comb_indices, dm0)
319- comb_indices[dm0] = cumoffset
320- cumoffset += last - first + 1 + 2
321- end
322-
323- offset = 0
324- for dm1 in comb2nd_indices_sorted_keys
325- @assert ! haskey (comb_indices, dm1)
326- comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
327- @assert comb2nd_indices[dm1] + offset <= 0x4000
328- if dm1 in comb2nd_indices_nonbasic
329- comb_indices[dm1] |= 0x4000
330- offset += 1
331- end
# Walk the first characters in ascending code point order so the offsets are
# deterministic; print_c_data_tables iterates comb_mapping in the same sorted order.
# NOTE(review): char_table_properties! uses 0x3FF as the "no entry" value for
# comb_index, and nothing here checks that `ind` stays below it -- confirm the
# limit against the C-side field referenced there.
280+ ind = 0
281+ for dm0 in sort! (collect (keys (comb_mapping)))
282+ comb_index[dm0] = ind
283+ len = length (comb_mapping[dm0])
284+ comb_length[dm0] = len
285+ ind += len
332286 end
333287end
334288
@@ -391,7 +345,9 @@ function char_table_properties!(sequences, char)
391345 uppercase_seqindex = encode_sequence! (sequences, char. uppercase_mapping),
392346 lowercase_seqindex = encode_sequence! (sequences, char. lowercase_mapping),
393347 titlecase_seqindex = encode_sequence! (sequences, char. titlecase_mapping),
394- comb_index = get (comb_indices, code, typemax (UInt16)),
348+ comb_index = get (comb_index, code, 0x3FF ), # see utf8proc_property_struct::comb_index
349+ comb_length = get (comb_length, code, 0 ),
350+ comb_issecond = code in comb_issecond,
395351 bidi_mirrored = char. bidi_mirrored,
396352 comp_exclusion = code in exclusions || code in excl_version,
397353 ignorable = code in ignorable,
@@ -473,8 +429,7 @@ function c_uint16(seqindex)
473429end
474430
475431function print_c_data_tables (io, sequences, prop_page_indices, prop_pages, deduplicated_props,
476- comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
477- comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
432+ comb_index, comb_length, comb_issecond)
478433 print (io, " static const utf8proc_uint16_t utf8proc_sequences[] = " )
479434 write_c_index_array (io, sequences. storage, 8 )
480435 print (io, " static const utf8proc_uint16_t utf8proc_stage1table[] = " )
@@ -484,7 +439,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
484439
485440 print (io, """
486441 static const utf8proc_property_t utf8proc_properties[] = {
487- {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX , false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
442+ {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false , false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
488443 """ )
489444 for prop in deduplicated_props
490445 print (io, " {" ,
@@ -498,6 +453,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
498453 c_uint16 (prop. lowercase_seqindex), " , " ,
499454 c_uint16 (prop. titlecase_seqindex), " , " ,
500455 c_uint16 (prop. comb_index), " , " ,
456+ c_uint16 (prop. comb_length), " , " ,
457+ prop. comb_issecond, " , " ,
501458 prop. bidi_mirrored, " , " ,
502459 prop. comp_exclusion, " , " ,
503460 prop. ignorable, " , " ,
@@ -512,42 +469,30 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
512469 end
513470 print (io, " };\n\n " )
514471
515- print (io, " static const utf8proc_uint16_t utf8proc_combinations[] = {\n " )
516- i = 0
517- for a in eachindex (comb1st_indices_firstoffsets)
518- offset = 0
519- print (io, comb1st_indices_firstoffsets[a], " , " , comb1st_indices_lastoffsets[a], " , " )
520- for b in eachindex (comb2nd_indices_sorted_keys)
521- dm1 = comb2nd_indices_sorted_keys[b]
522- if offset > comb1st_indices_lastoffsets[a]
523- break
524- end
525- if offset >= comb1st_indices_firstoffsets[a]
526- i += 1
527- if i == 8
528- i = 0
529- print (io, " \n " )
530- end
531- v = get (comb_array[a], b, 0 )
532- if dm1 in comb2nd_indices_nonbasic
533- print (io, (v & 0xFFFF0000 ) >> 16 , " , " )
534- end
535- print (io, v & 0xFFFF , " , " )
536- end
537- offset += 1
538- if dm1 in comb2nd_indices_nonbasic
539- offset += 1
540- end
# Emit two C arrays built from the same traversal of comb_mapping: first characters
# in ascending order (one output row each), and within a row the second characters
# in ascending order. Entry k of utf8proc_combinations_second is a second
# character and entry k of utf8proc_combinations_combined is the code point stored
# for that (first, second) pair.
# NOTE(review): comb_mapping is read from global scope here; it is not one of this
# function's parameters.
472+ print (io, " static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n " )
473+ for dm0 in sort! (collect (keys (comb_mapping)))
474+ print (io, " " );
475+ for dm1 in sort! (collect (keys (comb_mapping[dm0])))
476+ print (io, " " , dm1, " ," )
477+ end
478+ print (io, " \n " );
479+ end
480+ print (io, " };\n\n " )
481+
# Second pass over the identical ordering, printing the mapped value instead of the key.
482+ print (io, " static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n " )
483+ for dm0 in sort! (collect (keys (comb_mapping)))
484+ print (io, " " );
485+ for dm1 in sort! (collect (keys (comb_mapping[dm0])))
486+ code = comb_mapping[dm0][dm1]
487+ print (io, " " , code, " ," )
541488 end
542- print (io, " \n " )
489+ print (io, " \n " );
543490 end
544491 print (io, " };\n\n " )
545492end
546493
547494
# Script entry point: when not running in an interactive session, write the
# generated C data tables to standard output.
548495if ! isinteractive ()
549496 print_c_data_tables (stdout , sequences, prop_page_indices, prop_pages, deduplicated_props,
550- comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
551- comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
497+ comb_index, comb_length, comb_issecond)
552498end
553-
0 commit comments