Skip to content

Commit 1e23a3f

Browse files
committed
uniq: fix multibyte input
Should fix tests/uniq/uniq.pl
1 parent 805754b commit 1e23a3f

File tree

2 files changed

+41
-30
lines changed

2 files changed

+41
-30
lines changed

src/uu/uniq/src/uniq.rs

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -154,43 +154,44 @@ impl Uniq {
154154

155155
fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
156156
where
157-
F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
157+
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
158158
{
159159
let fields_to_check = self.skip_fields(line);
160-
let len = fields_to_check.len();
161-
let slice_start = self.slice_start.unwrap_or(0);
162-
let slice_stop = self.slice_stop.unwrap_or(len);
163-
if len > 0 {
164-
// fast path: avoid doing any work if there is no need to skip or map to lower-case
165-
if !self.ignore_case && slice_start == 0 && slice_stop == len {
166-
return closure(&mut fields_to_check.iter().copied());
167-
}
168160

169-
// fast path: avoid skipping
170-
if self.ignore_case && slice_start == 0 && slice_stop == len {
171-
return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
172-
}
161+
// Skip self.slice_start bytes (if -s was used).
162+
// self.slice_start is how many characters to skip, but historically
163+
// uniq’s `-s N` means “skip N *bytes*,” so do that literally:
164+
let skip_bytes = self.slice_start.unwrap_or(0);
165+
let fields_to_check = if skip_bytes < fields_to_check.len() {
166+
&fields_to_check[skip_bytes..]
167+
} else {
168+
// If skipping beyond end-of-line, leftover is empty => effectively ""
169+
&[]
170+
};
173171

174-
// fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
175-
if !self.ignore_case {
176-
return closure(
177-
&mut fields_to_check
178-
.iter()
179-
.skip(slice_start)
180-
.take(slice_stop)
181-
.copied(),
182-
);
172+
// Convert the leftover bytes to UTF-8 for character-based -w
173+
// If invalid UTF-8, just compare them as individual bytes (fallback).
174+
let string_after_skip = match std::str::from_utf8(fields_to_check) {
175+
Ok(s) => s,
176+
Err(_) => {
177+
// Fallback: if invalid UTF-8, treat them as single-byte “chars”
178+
return closure(&mut fields_to_check.iter().map(|&b| b as char));
183179
}
180+
};
184181

185-
closure(
186-
&mut fields_to_check
187-
.iter()
188-
.skip(slice_start)
189-
.take(slice_stop)
190-
.map(|u| u.to_ascii_lowercase()),
191-
)
182+
let total_chars = string_after_skip.chars().count();
183+
184+
// `-w N` => Compare no more than N characters
185+
let slice_stop = self.slice_stop.unwrap_or(total_chars);
186+
let slice_start = slice_stop.min(total_chars);
187+
188+
let mut iter = string_after_skip.chars().take(slice_start);
189+
190+
if self.ignore_case {
191+
// We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
192+
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
192193
} else {
193-
closure(&mut fields_to_check.iter().copied())
194+
closure(&mut iter)
194195
}
195196
}
196197

tests/by-util/test_uniq.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,3 +1172,13 @@ fn gnu_tests() {
11721172
}
11731173
}
11741174
}
1175+
1176+
#[test]
1177+
fn test_stdin_w1_multibyte() {
1178+
let input = "à\ná\n";
1179+
new_ucmd!()
1180+
.args(&["-w1"])
1181+
.pipe_in(input)
1182+
.run()
1183+
.stdout_is("à\ná\n");
1184+
}

0 commit comments

Comments
 (0)