@@ -255,6 +255,38 @@ typedef struct utf8proc_property_struct {
255255 utf8proc_uint16_t uppercase_seqindex ;
256256 utf8proc_uint16_t lowercase_seqindex ;
257257 utf8proc_uint16_t titlecase_seqindex ;
258+ /**
259+ * Character combining table.
260+ *
261+ * The character combining table is formally indexed by two
262+ * characters, the first and second character that might form a
263+ * combining pair. The table entry then contains the combined
264+ * character. Most character pairs cannot be combined. There are
265+ * about 1,000 characters that can be the first character in a
266+ * combining pair, and for most, there are only a handful of
267+ * possible second characters.
268+ *
269+ * The combining table is stored as `utf8proc_uint32_t
270+ * utf8proc_combinations[][2]`. That is, it contains a pair `(second
271+ * combining character, combined character)` for every character
272+ * that can be a first combining character.
273+ *
274+ * - `comb_index`: Index into the combining table if this character
275+ * is the first character in a combining pair, else 0x3ff
276+ *
277+ * - `comb_length`: Number of table entries for this first character
278+ *
279+ * - `comb_issecond`: As an optimization we also record whether this
280+ * character is the second combining character in any pair. If
281+ * not, we can skip the table lookup.
282+ *
283+ * A table lookup starts from a given character pair. It first
284+ * checks whether the first character is stored in the table
285+ * (i.e. its `comb_index` is not 0x3ff) and whether the second
286+ * character occurs in the table (looking at `comb_issecond`). If
287+ * so, the `comb_length` table entries starting at `comb_index` are
288+ * checked sequentially for a match.
289+ */
258290 utf8proc_uint16_t comb_index :10 ;
259291 utf8proc_uint16_t comb_length :5 ;
260292 utf8proc_uint16_t comb_issecond :1 ;
0 commit comments