@@ -42,6 +42,8 @@ enum Delimiters {
4242 None ,
4343}
4444
45+ const OUTPUT_BUFFER_CAPACITY : usize = 128 * 1024 ;
46+
4547struct Uniq {
4648 repeats_only : bool ,
4749 uniques_only : bool ,
@@ -55,6 +57,14 @@ struct Uniq {
5557 zero_terminated : bool ,
5658}
5759
60+ #[ derive( Default ) ]
61+ struct LineMeta {
62+ key_start : usize ,
63+ key_end : usize ,
64+ lowercase : Vec < u8 > ,
65+ use_lowercase : bool ,
66+ }
67+
5868macro_rules! write_line_terminator {
5969 ( $writer: expr, $line_terminator: expr) => {
6070 $writer
@@ -64,42 +74,53 @@ macro_rules! write_line_terminator {
6474}
6575
6676impl Uniq {
67- pub fn print_uniq ( & self , reader : impl BufRead , mut writer : impl Write ) -> UResult < ( ) > {
77+ pub fn print_uniq ( & self , mut reader : impl BufRead , mut writer : impl Write ) -> UResult < ( ) > {
6878 let mut first_line_printed = false ;
6979 let mut group_count = 1 ;
7080 let line_terminator = self . get_line_terminator ( ) ;
71- let mut lines = reader. split ( line_terminator) ;
72- let mut line = match lines. next ( ) {
73- Some ( l) => l?,
74- None => return Ok ( ( ) ) ,
75- } ;
76-
7781 let writer = & mut writer;
7882
79- // compare current `line` with consecutive lines (`next_line`) of the input
80- // and if needed, print `line` based on the command line options provided
81- for next_line in lines {
82- let next_line = next_line?;
83- if self . cmp_keys ( & line, & next_line) {
83+ let mut current_buf = Vec :: with_capacity ( 1024 ) ;
84+ if !Self :: read_line ( & mut reader, & mut current_buf, line_terminator) ? {
85+ return Ok ( ( ) ) ;
86+ }
87+ let mut current_meta = LineMeta :: default ( ) ;
88+ self . build_meta ( & current_buf, & mut current_meta) ;
89+
90+ let mut next_buf = Vec :: with_capacity ( 1024 ) ;
91+ let mut next_meta = LineMeta :: default ( ) ;
92+
93+ loop {
94+ if !Self :: read_line ( & mut reader, & mut next_buf, line_terminator) ? {
95+ break ;
96+ }
97+
98+ self . build_meta ( & next_buf, & mut next_meta) ;
99+
100+ if self . keys_differ ( & current_buf, & current_meta, & next_buf, & next_meta) {
84101 if ( group_count == 1 && !self . repeats_only )
85102 || ( group_count > 1 && !self . uniques_only )
86103 {
87- self . print_line ( writer, & line , group_count, first_line_printed) ?;
104+ self . print_line ( writer, & current_buf , group_count, first_line_printed) ?;
88105 first_line_printed = true ;
89106 }
90- line = next_line;
107+ std:: mem:: swap ( & mut current_buf, & mut next_buf) ;
108+ std:: mem:: swap ( & mut current_meta, & mut next_meta) ;
91109 group_count = 1 ;
92110 } else {
93111 if self . all_repeated {
94- self . print_line ( writer, & line , group_count, first_line_printed) ?;
112+ self . print_line ( writer, & current_buf , group_count, first_line_printed) ?;
95113 first_line_printed = true ;
96- line = next_line;
114+ std:: mem:: swap ( & mut current_buf, & mut next_buf) ;
115+ std:: mem:: swap ( & mut current_meta, & mut next_meta) ;
97116 }
98117 group_count += 1 ;
99118 }
119+ next_buf. clear ( ) ;
100120 }
121+
101122 if ( group_count == 1 && !self . repeats_only ) || ( group_count > 1 && !self . uniques_only ) {
102- self . print_line ( writer, & line , group_count, first_line_printed) ?;
123+ self . print_line ( writer, & current_buf , group_count, first_line_printed) ?;
103124 first_line_printed = true ;
104125 }
105126 if ( self . delimiters == Delimiters :: Append || self . delimiters == Delimiters :: Both )
@@ -113,79 +134,134 @@ impl Uniq {
113134 Ok ( ( ) )
114135 }
115136
116- fn skip_fields ( & self , line : & [ u8 ] ) -> Vec < u8 > {
137+ fn get_line_terminator ( & self ) -> u8 {
138+ if self . zero_terminated { 0 } else { b'\n' }
139+ }
140+
141+ fn keys_differ (
142+ & self ,
143+ first_line : & [ u8 ] ,
144+ first_meta : & LineMeta ,
145+ second_line : & [ u8 ] ,
146+ second_meta : & LineMeta ,
147+ ) -> bool {
148+ let first_slice = & first_line[ first_meta. key_start ..first_meta. key_end ] ;
149+ let second_slice = & second_line[ second_meta. key_start ..second_meta. key_end ] ;
150+
151+ if !self . ignore_case {
152+ return first_slice != second_slice;
153+ }
154+
155+ let first_cmp = if first_meta. use_lowercase {
156+ first_meta. lowercase . as_slice ( )
157+ } else {
158+ first_slice
159+ } ;
160+ let second_cmp = if second_meta. use_lowercase {
161+ second_meta. lowercase . as_slice ( )
162+ } else {
163+ second_slice
164+ } ;
165+
166+ first_cmp != second_cmp
167+ }
168+
169+ fn key_bounds ( & self , line : & [ u8 ] ) -> ( usize , usize ) {
170+ let mut start = self . skip_fields_offset ( line) ;
171+ if let Some ( skip_bytes) = self . slice_start {
172+ start = start. saturating_add ( skip_bytes) . min ( line. len ( ) ) ;
173+ }
174+
175+ let end = self . key_end_index ( line, start) ;
176+ ( start, end)
177+ }
178+
179+ fn skip_fields_offset ( & self , line : & [ u8 ] ) -> usize {
117180 if let Some ( skip_fields) = self . skip_fields {
118- let mut line = line. iter ( ) ;
119- let mut line_after_skipped_field: Vec < u8 > ;
181+ let mut idx = 0 ;
120182 for _ in 0 ..skip_fields {
121- if line. all ( |u| u . is_ascii_whitespace ( ) ) {
122- return Vec :: new ( ) ;
183+ while idx < line. len ( ) && line [ idx ] . is_ascii_whitespace ( ) {
184+ idx += 1 ;
123185 }
124- line_after_skipped_field = line
125- . by_ref ( )
126- . skip_while ( |u| !u . is_ascii_whitespace ( ) )
127- . copied ( )
128- . collect :: < Vec < u8 > > ( ) ;
129-
130- if line_after_skipped_field . is_empty ( ) {
131- return Vec :: new ( ) ;
186+ if idx > = line. len ( ) {
187+ return line . len ( ) ;
188+ }
189+ while idx < line . len ( ) && !line [ idx ] . is_ascii_whitespace ( ) {
190+ idx += 1 ;
191+ }
192+ if idx >= line . len ( ) {
193+ return line . len ( ) ;
132194 }
133- line = line_after_skipped_field. iter ( ) ;
134195 }
135- line . copied ( ) . collect :: < Vec < u8 > > ( )
196+ idx
136197 } else {
137- line . to_vec ( )
198+ 0
138199 }
139200 }
140201
141- fn get_line_terminator ( & self ) -> u8 {
142- if self . zero_terminated { 0 } else { b'\n' }
202+ fn key_end_index ( & self , line : & [ u8 ] , key_start : usize ) -> usize {
203+ let remainder = & line[ key_start..] ;
204+ match self . slice_stop {
205+ None => line. len ( ) ,
206+ Some ( limit) => {
207+ if remainder. is_empty ( ) {
208+ return key_start;
209+ }
210+ if let Ok ( valid) = std:: str:: from_utf8 ( remainder) {
211+ let prefix_len = Self :: char_prefix_len ( valid, limit) ;
212+ key_start + prefix_len
213+ } else {
214+ key_start + remainder. len ( ) . min ( limit)
215+ }
216+ }
217+ }
143218 }
144219
145- fn cmp_keys ( & self , first : & [ u8 ] , second : & [ u8 ] ) -> bool {
146- self . cmp_key ( first, |first_iter| {
147- self . cmp_key ( second, |second_iter| first_iter. ne ( second_iter) )
148- } )
220+ fn char_prefix_len ( text : & str , limit : usize ) -> usize {
221+ for ( count, ( idx, _) ) in text. char_indices ( ) . enumerate ( ) {
222+ if count == limit {
223+ return idx;
224+ }
225+ }
226+ text. len ( )
149227 }
150228
151- fn cmp_key < F > ( & self , line : & [ u8 ] , mut closure : F ) -> bool
152- where
153- F : FnMut ( & mut dyn Iterator < Item = char > ) -> bool ,
154- {
155- let fields_to_check = self . skip_fields ( line) ;
156-
157- // Skip self.slice_start bytes (if -s was used).
158- // self.slice_start is how many characters to skip, but historically
159- // uniq's `-s N` means "skip N *bytes*," so do that literally:
160- let skip_bytes = self . slice_start . unwrap_or ( 0 ) ;
161- let fields_to_check = if skip_bytes < fields_to_check. len ( ) {
162- & fields_to_check[ skip_bytes..]
163- } else {
164- // If skipping beyond end-of-line, leftover is empty => effectively ""
165- & [ ]
166- } ;
167-
168- // Convert the leftover bytes to UTF-8 for character-based -w
169- // If invalid UTF-8, just compare them as individual bytes (fallback).
170- let Ok ( string_after_skip) = std:: str:: from_utf8 ( fields_to_check) else {
171- // Fallback: if invalid UTF-8, treat them as single-byte "chars"
172- return closure ( & mut fields_to_check. iter ( ) . map ( |& b| b as char ) ) ;
173- } ;
174-
175- let total_chars = string_after_skip. chars ( ) . count ( ) ;
176-
177- // `-w N` => Compare no more than N characters
178- let slice_stop = self . slice_stop . unwrap_or ( total_chars) ;
179- let slice_start = slice_stop. min ( total_chars) ;
229+ fn build_meta ( & self , line : & [ u8 ] , meta : & mut LineMeta ) {
230+ let ( key_start, key_end) = self . key_bounds ( line) ;
231+ meta. key_start = key_start;
232+ meta. key_end = key_end;
233+
234+ if self . ignore_case && key_start < key_end {
235+ let slice = & line[ key_start..key_end] ;
236+ if slice. iter ( ) . any ( |b| b. is_ascii_uppercase ( ) ) {
237+ meta. lowercase . clear ( ) ;
238+ meta. lowercase . reserve ( slice. len ( ) ) ;
239+ meta. lowercase
240+ . extend ( slice. iter ( ) . map ( |b| b. to_ascii_lowercase ( ) ) ) ;
241+ meta. use_lowercase = true ;
242+ return ;
243+ }
244+ }
180245
181- let mut iter = string_after_skip. chars ( ) . take ( slice_start) ;
246+ meta. use_lowercase = false ;
247+ }
182248
183- if self . ignore_case {
184- // We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
185- closure ( & mut iter. map ( |c| c. to_ascii_lowercase ( ) ) )
186- } else {
187- closure ( & mut iter)
249+ fn read_line (
250+ reader : & mut impl BufRead ,
251+ buffer : & mut Vec < u8 > ,
252+ line_terminator : u8 ,
253+ ) -> UResult < bool > {
254+ buffer. clear ( ) ;
255+ let bytes_read = reader
256+ . read_until ( line_terminator, buffer)
257+ . map_err_context ( || translate ! ( "uniq-error-read-error" ) ) ?;
258+ if bytes_read == 0 {
259+ return Ok ( false ) ;
188260 }
261+ if buffer. last ( ) . is_some_and ( |last| * last == line_terminator) {
262+ buffer. pop ( ) ;
263+ }
264+ Ok ( true )
189265 }
190266
191267 fn should_print_delimiter ( & self , group_count : usize , first_line_printed : bool ) -> bool {
@@ -214,22 +290,59 @@ impl Uniq {
214290 write_line_terminator ! ( writer, line_terminator) ?;
215291 }
216292
293+ let mut count_buf = [ 0u8 ; Self :: COUNT_PREFIX_BUF_SIZE ] ;
294+
217295 if self . show_counts {
218- let prefix = format ! ( "{count:7} " ) ;
219- let out = prefix
220- . as_bytes ( )
221- . iter ( )
222- . chain ( line. iter ( ) )
223- . copied ( )
224- . collect :: < Vec < u8 > > ( ) ;
225- writer. write_all ( out. as_slice ( ) )
226- } else {
227- writer. write_all ( line)
296+ // Call the associated function (no &self) after the refactor above.
297+ let prefix = Self :: build_count_prefix ( count, & mut count_buf) ;
298+ writer
299+ . write_all ( prefix)
300+ . map_err_context ( || translate ! ( "uniq-error-write-error" ) ) ?;
228301 }
229- . map_err_context ( || translate ! ( "uniq-error-write-error" ) ) ?;
302+
303+ writer
304+ . write_all ( line)
305+ . map_err_context ( || translate ! ( "uniq-error-write-error" ) ) ?;
230306
231307 write_line_terminator ! ( writer, line_terminator)
232308 }
309+
310+ const COUNT_PREFIX_WIDTH : usize = 7 ;
311+ const COUNT_PREFIX_BUF_SIZE : usize = 32 ;
312+
313+ // This function does not use `self`, so make it an associated function.
314+ // Also remove needless explicit lifetimes to satisfy clippy::needless-lifetimes.
315+ fn build_count_prefix ( count : usize , buf : & mut [ u8 ; Self :: COUNT_PREFIX_BUF_SIZE ] ) -> & [ u8 ] {
316+ let mut digits_buf = [ 0u8 ; 20 ] ;
317+ let mut value = count;
318+ let mut idx = digits_buf. len ( ) ;
319+
320+ if value == 0 {
321+ idx -= 1 ;
322+ digits_buf[ idx] = b'0' ;
323+ } else {
324+ while value > 0 {
325+ idx -= 1 ;
326+ digits_buf[ idx] = b'0' + ( value % 10 ) as u8 ;
327+ value /= 10 ;
328+ }
329+ }
330+
331+ let digits = & digits_buf[ idx..] ;
332+ let width = Self :: COUNT_PREFIX_WIDTH ;
333+
334+ if digits. len ( ) <= width {
335+ let pad = width - digits. len ( ) ;
336+ buf[ ..pad] . fill ( b' ' ) ;
337+ buf[ pad..pad + digits. len ( ) ] . copy_from_slice ( digits) ;
338+ buf[ width] = b' ' ;
339+ & buf[ ..=width]
340+ } else {
341+ buf[ ..digits. len ( ) ] . copy_from_slice ( digits) ;
342+ buf[ digits. len ( ) ] = b' ' ;
343+ & buf[ ..=digits. len ( ) ]
344+ }
345+ }
233346}
234347
235348fn opt_parsed ( opt_name : & str , matches : & ArgMatches ) -> UResult < Option < usize > > {
@@ -741,8 +854,11 @@ fn open_output_file(out_file_name: Option<&OsStr>) -> UResult<Box<dyn Write>> {
741854 let out_file = File :: create ( path) . map_err_context (
742855 || translate ! ( "uniq-error-could-not-open" , "path" => path. maybe_quote( ) ) ,
743856 ) ?;
744- Box :: new ( BufWriter :: new ( out_file) )
857+ Box :: new ( BufWriter :: with_capacity ( OUTPUT_BUFFER_CAPACITY , out_file) )
745858 }
746- _ => Box :: new ( stdout ( ) . lock ( ) ) ,
859+ _ => Box :: new ( BufWriter :: with_capacity (
860+ OUTPUT_BUFFER_CAPACITY ,
861+ stdout ( ) . lock ( ) ,
862+ ) ) ,
747863 } )
748864}
0 commit comments