ashvardanian
diff --git a/include/stringzilla/utf8_unpack.h b/include/stringzilla/utf8_unpack.h
Lines changed: 5 additions & 8 deletions
@@ -98,7 +98,8 @@ SZ_DYNAMIC sz_size_t sz_utf8_case_fold(        //
  *
  *  This function applies full Unicode Case Folding as defined in the Unicode Standard (UAX #21 and
  *  CaseFolding.txt), covering all bicameral scripts, all offset-based one-to-one folds, all table-based
- *  one-to-one folds, and all normative one-to-many expansions.
+ *  one-to-one folds, and all normative one-to-many expansions. It doesn't, however, perform any normalization,
+ *  such as NFKC or NFC, so combining marks are treated as-is.
  *
  *  The following character mappings are supported:
  *
@@ -142,14 +143,10 @@ SZ_DYNAMIC sz_size_t sz_utf8_case_fold(        //
  *
  *  - ICU abandoned Boyer-Moore for Unicode, reverting to linear search for correctness
  *  - ClickHouse uses Volnitsky with fallback to naive search for problematic characters
- *  - ripgrep uses simple case folding only (no expansion handling)
+ *  - ripgrep uses simple case folding only (no expansion handling), leveraging the Rust regex engine
  *
- *  Potential algorithmic improvements for future versions:
- *
- *  - Streaming comparison with small expansion buffer instead of pre-materializing folded needle
- *  - Fingerprint-based filtering using rolling hash over folded codepoints
- *  - Conservative skip distances that account for maximum expansion ratio (3:1)
- *  - First-codepoint filtering to quickly reject non-matching positions
+ *  StringZilla implements several algorithms. Most importantly, it first finds the longest expansion-free
+ *  slice of the needle to search for.
  *
  *  @see https://unicode-org.github.io/icu/userguide/collation/string-search.html
  *       ICU String Search - discusses why Boyer-Moore was abandoned for Unicode
Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,8 @@ SZ_DYNAMIC sz_size_t sz_utf8_case_fold( //`
`98`	`98`	`*`
`99`	`99`	`* This function applies full Unicode Case Folding as defined in the Unicode Standard (UAX #21 and`
`100`	`100`	`* CaseFolding.txt), covering all bicameral scripts, all offset-based one-to-one folds, all table-based`
`101`		`- * one-to-one folds, and all normative one-to-many expansions.`
	`101`	`+ * one-to-one folds, and all normative one-to-many expansions. It doesn't, however, perform any normalization,`
	`102`	`+ * such as NFKC or NFC, so combining marks are treated as-is.`
`102`	`103`	`*`
`103`	`104`	`* The following character mappings are supported:`
`104`	`105`	`*`
`@@ -142,14 +143,10 @@ SZ_DYNAMIC sz_size_t sz_utf8_case_fold( //`
`142`	`143`	`*`
`143`	`144`	`* - ICU abandoned Boyer-Moore for Unicode, reverting to linear search for correctness`
`144`	`145`	`* - ClickHouse uses Volnitsky with fallback to naive search for problematic characters`
`145`		`- * - ripgrep uses simple case folding only (no expansion handling)`
	`146`	`+ * - ripgrep uses simple case folding only (no expansion handling), leveraging the Rust regex engine`
`146`	`147`	`*`
`147`		`- * Potential algorithmic improvements for future versions:`
`148`		`- *`
`149`		`- * - Streaming comparison with small expansion buffer instead of pre-materializing folded needle`
`150`		`- * - Fingerprint-based filtering using rolling hash over folded codepoints`
`151`		`- * - Conservative skip distances that account for maximum expansion ratio (3:1)`
`152`		`- * - First-codepoint filtering to quickly reject non-matching positions`
	`148`	`+ * StringZilla implements several algorithms. Most importantly, it first finds the longest expansion-free`
	`149`	`+ * slice of the needle to search for.`
`153`	`150`	`*`
`154`	`151`	`* @see https://unicode-org.github.io/icu/userguide/collation/string-search.html`
`155`	`152`	`* ICU String Search - discusses why Boyer-Moore was abandoned for Unicode`