uniq: fix multibyte input

sylvestre · sylvestre · commit 1e23a3fa8d1c · 2025-01-01T13:17:52.000+01:00
Should fix tests/uniq/uniq.pl
diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs
@@ -154,43 +154,44 @@ impl Uniq {
 
     fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
     where
-        F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
+        F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
     {
         let fields_to_check = self.skip_fields(line);
-        let len = fields_to_check.len();
-        let slice_start = self.slice_start.unwrap_or(0);
-        let slice_stop = self.slice_stop.unwrap_or(len);
-        if len > 0 {
-            // fast path: avoid doing any work if there is no need to skip or map to lower-case
-            if !self.ignore_case && slice_start == 0 && slice_stop == len {
-                return closure(&mut fields_to_check.iter().copied());
-            }
 
-            // fast path: avoid skipping
-            if self.ignore_case && slice_start == 0 && slice_stop == len {
-                return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
-            }
+        // Skip the first `self.slice_start` bytes of the key (only set if -s was used).
+        // Although `-s N` is nominally a count of characters, historically
+        // uniq’s `-s N` means “skip N *bytes*,” so do that literally:
+        let skip_bytes = self.slice_start.unwrap_or(0);
+        let fields_to_check = if skip_bytes < fields_to_check.len() {
+            &fields_to_check[skip_bytes..]
+        } else {
+            // Skipping up to or past the end of the line leaves nothing to compare (empty key)
+            &[]
+        };
 
-            // fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
-            if !self.ignore_case {
-                return closure(
-                    &mut fields_to_check
-                        .iter()
-                        .skip(slice_start)
-                        .take(slice_stop)
-                        .copied(),
-                );
+        // Interpret the leftover bytes as UTF-8 so that -w can count characters.
+        // If they are not valid UTF-8, fall back to comparing them as individual bytes.
+        let string_after_skip = match std::str::from_utf8(fields_to_check) {
+            Ok(s) => s,
+            Err(_) => {
+                // Fallback for invalid UTF-8: one “char” per byte; NOTE: -w/-i are not applied here
+                return closure(&mut fields_to_check.iter().map(|&b| b as char));
             }
+        };
 
-            closure(
-                &mut fields_to_check
-                    .iter()
-                    .skip(slice_start)
-                    .take(slice_stop)
-                    .map(|u| u.to_ascii_lowercase()),
-            )
+        let total_chars = string_after_skip.chars().count();
+
+        // `-w N` => Compare no more than N characters
+        let slice_stop = self.slice_stop.unwrap_or(total_chars);
+        let slice_start = slice_stop.min(total_chars);
+
+        let mut iter = string_after_skip.chars().take(slice_start);
+
+        if self.ignore_case {
+            // ASCII-only lowercasing; full Unicode lowercasing is skipped to keep this change minimal:
+            closure(&mut iter.map(|c| c.to_ascii_lowercase()))
         } else {
-            closure(&mut fields_to_check.iter().copied())
+            closure(&mut iter)
         }
     }
 
diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs
@@ -1172,3 +1172,13 @@ fn gnu_tests() {
         }
     }
 }
+
+#[test]
+fn test_stdin_w1_multibyte() {
+    let input = "à\ná\n";
+    new_ucmd!()
+        .args(&["-w1"])
+        .pipe_in(input)
+        .run()
+        .stdout_is("à\ná\n");
+}

Original file line number	Diff line number	Diff line change
`@@ -1172,3 +1172,13 @@ fn gnu_tests() {`
`1172`	`1172`	`}`
`1173`	`1173`	`}`
`1174`	`1174`	`}`
	`1175`	`+`
	`1176`	`+#[test]`
	`1177`	`+fn test_stdin_w1_multibyte() {`
	`1178`	`+ let input = "à\ná\n";`
	`1179`	`+ new_ucmd!()`
	`1180`	`+ .args(&["-w1"])`
	`1181`	`+ .pipe_in(input)`
	`1182`	`+ .run()`
	`1183`	`+ .stdout_is("à\ná\n");`
	`1184`	`+}`