File tree Expand file tree Collapse file tree 1 file changed +12
-1
lines changed
Expand file tree Collapse file tree 1 file changed +12
-1
lines changed Original file line number Diff line number Diff line change @@ -368,7 +368,8 @@ typedef enum {
368368} sz_status_t ;
369369
370370/* *
371- * @brief Describes the length of a UTF8 @b rune / character / codepoint in bytes.
371+ * @brief Describes the length of a UTF-8 @b rune / character / codepoint in bytes, which can be 1 to 4.
372+ * @see https://en.wikipedia.org/wiki/UTF-8
372373 */
373374typedef enum {
374375 sz_utf8_invalid_k = 0 , // !< Invalid UTF8 character.
@@ -378,6 +379,16 @@ typedef enum {
378379 sz_utf8_rune_4bytes_k = 4 , // !< 4-byte UTF8 character.
379380} sz_rune_length_t ;
380381
382+ /* *
383+ * @brief Stores a single UTF-8 @b rune / character / codepoint unpacked into @b UTF-32.
384+ * @see https://en.wikipedia.org/wiki/UTF-32
385+ *
386+ * The theoretical capacity of the underlying numeric type is 4 bytes, with over 4 billion possible states, but:
387+ * - UTF-8 in its largest 4-byte form has only 3+6+6+6 = 21 bits of usable space, for about 2 million states.
388+ * - Unicode, in turn, has only @b 1'114'112 possible code points from U+0000 to U+10FFFF.
389+ * - Of those, in Unicode 16, only @b 155'063 are assigned characters, i.e. a little over 17 bits of content.
390+ * That's @b 0.004% of the 32-bit space, so sparse data-structures are encouraged for UTF-8 oriented algorithms.
391+ */
381392typedef sz_u32_t sz_rune_t ;
382393
383394/* *
You can’t perform that action at this time.
0 commit comments