@@ -154,43 +154,44 @@ impl Uniq {
154154
155155 fn cmp_key < F > ( & self , line : & [ u8 ] , mut closure : F ) -> bool
156156 where
157- F : FnMut ( & mut dyn Iterator < Item = u8 > ) -> bool ,
157+ F : FnMut ( & mut dyn Iterator < Item = char > ) -> bool ,
158158 {
159159 let fields_to_check = self . skip_fields ( line) ;
160- let len = fields_to_check. len ( ) ;
161- let slice_start = self . slice_start . unwrap_or ( 0 ) ;
162- let slice_stop = self . slice_stop . unwrap_or ( len) ;
163- if len > 0 {
164- // fast path: avoid doing any work if there is no need to skip or map to lower-case
165- if !self . ignore_case && slice_start == 0 && slice_stop == len {
166- return closure ( & mut fields_to_check. iter ( ) . copied ( ) ) ;
167- }
168160
169- // fast path: avoid skipping
170- if self . ignore_case && slice_start == 0 && slice_stop == len {
171- return closure ( & mut fields_to_check. iter ( ) . map ( |u| u. to_ascii_lowercase ( ) ) ) ;
172- }
161+ // Skip self.slice_start bytes (if -s was used).
162+ // self.slice_start is how many characters to skip, but historically
163+ // uniq’s `-s N` means “skip N *bytes*,” so do that literally:
164+ let skip_bytes = self . slice_start . unwrap_or ( 0 ) ;
165+ let fields_to_check = if skip_bytes < fields_to_check. len ( ) {
166+ & fields_to_check[ skip_bytes..]
167+ } else {
168+ // If skipping beyond end-of-line, leftover is empty => effectively ""
169+ & [ ]
170+ } ;
173171
174- // fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
175- if !self . ignore_case {
176- return closure (
177- & mut fields_to_check
178- . iter ( )
179- . skip ( slice_start)
180- . take ( slice_stop)
181- . copied ( ) ,
182- ) ;
172+ // Convert the leftover bytes to UTF-8 for character-based -w
173+ // If invalid UTF-8, just compare them as individual bytes (fallback).
174+ let string_after_skip = match std:: str:: from_utf8 ( fields_to_check) {
175+ Ok ( s) => s,
176+ Err ( _) => {
177+ // Fallback: if invalid UTF-8, treat them as single-byte “chars”
178+ return closure ( & mut fields_to_check. iter ( ) . map ( |& b| b as char ) ) ;
183179 }
180+ } ;
184181
185- closure (
186- & mut fields_to_check
187- . iter ( )
188- . skip ( slice_start)
189- . take ( slice_stop)
190- . map ( |u| u. to_ascii_lowercase ( ) ) ,
191- )
182+ let total_chars = string_after_skip. chars ( ) . count ( ) ;
183+
184+ // `-w N` => Compare no more than N characters
185+ let slice_stop = self . slice_stop . unwrap_or ( total_chars) ;
186+ let slice_start = slice_stop. min ( total_chars) ;
187+
188+ let mut iter = string_after_skip. chars ( ) . take ( slice_start) ;
189+
190+ if self . ignore_case {
191+ // We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
192+ closure ( & mut iter. map ( |c| c. to_ascii_lowercase ( ) ) )
192193 } else {
193- closure ( & mut fields_to_check . iter ( ) . copied ( ) )
194+ closure ( & mut iter)
194195 }
195196 }
196197
0 commit comments