Skip to content

Commit b6e4406

Browse files
committed
Docs: Details on the Unicode range
1 parent 407dd2d commit b6e4406

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

include/stringzilla/types.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ typedef enum {
368368
} sz_status_t;
369369

370370
/**
371-
* @brief Describes the length of a UTF8 @b rune / character / codepoint in bytes.
371+
* @brief Describes the length of a UTF-8 @b rune / character / codepoint in bytes, which can be 1 to 4.
372+
* @see https://en.wikipedia.org/wiki/UTF-8
372373
*/
373374
typedef enum {
374375
sz_utf8_invalid_k = 0, //!< Invalid UTF8 character.
@@ -378,6 +379,16 @@ typedef enum {
378379
sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character.
379380
} sz_rune_length_t;
380381

382+
/**
383+
* @brief Stores a single UTF-8 @b rune / character / codepoint unpacked into @b UTF-32.
384+
* @see https://en.wikipedia.org/wiki/UTF-32
385+
*
386+
* The theoretical capacity of the underlying numeric type is 4 bytes, with over 4 billion possible states, but:
387+
* - UTF-8, in its largest 4-byte form, has only 3+6+6+6 = 21 bits of usable space, for about 2 million states.
388+
* - Unicode, in turn, has only @b 1'114'112 possible code points from U+0000 to U+10FFFF.
389+
* - Of those, in Unicode 16, only @b 155'063 are assigned characters ~ a little over 17 bits of content.
390+
* That's about @b 0.004% of the 32-bit space, so sparse data structures are encouraged for UTF-8-oriented algorithms.
391+
*/
381392
typedef sz_u32_t sz_rune_t;
382393

383394
/**

0 commit comments

Comments
 (0)