Backport unicode-org#4530 to the 1.4 branch (unicode-org#4537)

hsivonen · web-flow · commit 6d8029e86731 · 2024-01-23T18:44:20.000+02:00
For releasing `icu_normalizer` 1.4.1.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,11 +4,15 @@
  - [Remove icu_datagen's dep on `fractional`](https://github.com/unicode-org/icu4x/pull/4472)
    - `icu_datagen@1.4.1`
 
+ - Fix normalization of characters whose decomposition contains more than one starter and ends with a non-starter, when such a character
+   is followed by a non-starter with a lower Canonical Combining Class than the last character of the decomposition. (https://github.com/unicode-org/icu4x/pull/4530)
+   - `icu_normalizer@1.4.1`
+
 ## icu4x 1.4 (Nov 16, 2023)
 
 - General
   - MSRV is now 1.67
- 
+
 - Components
     - Compiled data updated to CLDR 44 and ICU 74 (https://github.com/unicode-org/icu4x/pull/4245)
     - `icu_calendar`
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/components/normalizer/Cargo.toml b/components/normalizer/Cargo.toml
@@ -7,7 +7,7 @@ name = "icu_normalizer"
 description = "API for normalizing text into Unicode Normalization Forms"
 license-file = "LICENSE"
 
-version.workspace = true
+version = "1.4.1"
 rust-version.workspace = true
 authors.workspace = true
 edition.workspace = true
diff --git a/components/normalizer/src/lib.rs b/components/normalizer/src/lib.rs
@@ -637,7 +637,7 @@ where
                 i += 1;
                 // Half-width kana and iota subscript don't occur in the tails
                 // of these multicharacter decompositions.
-                if decomposition_starts_with_non_starter(trie_value) {
+                if !decomposition_starts_with_non_starter(trie_value) {
                     combining_start = i;
                 }
             }
@@ -676,7 +676,7 @@ where
                 i += 1;
                 // Half-width kana and iota subscript don't occur in the tails
                 // of these multicharacter decompositions.
-                if decomposition_starts_with_non_starter(trie_value) {
+                if !decomposition_starts_with_non_starter(trie_value) {
                     combining_start = i;
                 }
             }
diff --git a/components/normalizer/tests/tests.rs b/components/normalizer/tests/tests.rs
@@ -1308,6 +1308,28 @@ fn test_utf16_basic() {
     );
 }
 
+#[test]
+fn test_accented_digraph() { // NFKD must emit U+0323 before U+030C after the letters "DZ".
+    let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
+    assert_eq!(
+        normalizer.normalize("\u{01C4}\u{0323}"), // Input: the single character U+01C4 followed by U+0323.
+        "DZ\u{0323}\u{030C}"
+    );
+    assert_eq!(
+        normalizer.normalize("DZ\u{030C}\u{0323}"), // Input: already spelled out as "DZ" + U+030C + U+0323; same expected output.
+        "DZ\u{0323}\u{030C}"
+    );
+}
+
+#[test]
+fn test_ddd() { // NFD of U+0DDD followed by U+0334: U+0334 must be emitted before the trailing U+0DCA.
+    let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
+    assert_eq!(
+        normalizer.normalize("\u{0DDD}\u{0334}"),
+        "\u{0DD9}\u{0DCF}\u{0334}\u{0DCA}"
+    );
+}
+
 #[test]
 fn test_is_normalized() {
     let nfd: DecomposingNormalizer = DecomposingNormalizer::new_nfd();

Original file line number	Diff line number	Diff line change
`@@ -637,7 +637,7 @@ where`
`637`	`637`	`i += 1;`
`638`	`638`	`// Half-width kana and iota subscript don't occur in the tails`
`639`	`639`	`// of these multicharacter decompositions.`
`640`		`- if decomposition_starts_with_non_starter(trie_value) {`
	`640`	`+ if !decomposition_starts_with_non_starter(trie_value) {`
`641`	`641`	`combining_start = i;`
`642`	`642`	`}`
`643`	`643`	`}`
`@@ -676,7 +676,7 @@ where`
`676`	`676`	`i += 1;`
`677`	`677`	`// Half-width kana and iota subscript don't occur in the tails`
`678`	`678`	`// of these multicharacter decompositions.`
`679`		`- if decomposition_starts_with_non_starter(trie_value) {`
	`679`	`+ if !decomposition_starts_with_non_starter(trie_value) {`
`680`	`680`	`combining_start = i;`
`681`	`681`	`}`
`682`	`682`	`}`