diff --git a/src/uu/uniq/locales/en-US.ftl b/src/uu/uniq/locales/en-US.ftl index 558fecb0547..3106270cba9 100644 --- a/src/uu/uniq/locales/en-US.ftl +++ b/src/uu/uniq/locales/en-US.ftl @@ -21,6 +21,7 @@ uniq-help-zero-terminated = end lines with 0 byte, not newline # Error messages uniq-error-write-line-terminator = Could not write line terminator uniq-error-write-error = write error +uniq-error-read-error = read error uniq-error-invalid-argument = Invalid argument for { $opt_name }: { $arg } uniq-error-try-help = Try 'uniq --help' for more information. uniq-error-group-mutually-exclusive = --group is mutually exclusive with -c/-d/-D/-u diff --git a/src/uu/uniq/locales/fr-FR.ftl b/src/uu/uniq/locales/fr-FR.ftl index f9f1cf13004..094af7001bb 100644 --- a/src/uu/uniq/locales/fr-FR.ftl +++ b/src/uu/uniq/locales/fr-FR.ftl @@ -20,6 +20,7 @@ uniq-help-zero-terminated = terminer les lignes avec un octet 0, pas une nouvell # Messages d'erreur uniq-error-write-line-terminator = Impossible d'écrire le terminateur de ligne uniq-error-write-error = erreur d'écriture +uniq-error-read-error = erreur de lecture uniq-error-invalid-argument = Argument invalide pour { $opt_name } : { $arg } uniq-error-try-help = Essayez 'uniq --help' pour plus d'informations. uniq-error-group-mutually-exclusive = --group est mutuellement exclusif avec -c/-d/-D/-u diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index 506f1956596..3845ba459ea 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -42,6 +42,8 @@ enum Delimiters { None, } +const OUTPUT_BUFFER_CAPACITY: usize = 128 * 1024; + struct Uniq { repeats_only: bool, uniques_only: bool, @@ -55,6 +57,14 @@ struct Uniq { zero_terminated: bool, } +#[derive(Default)] +struct LineMeta { + key_start: usize, + key_end: usize, + lowercase: Vec, + use_lowercase: bool, +} + macro_rules! write_line_terminator { ($writer:expr, $line_terminator:expr) => { $writer @@ -64,42 +74,53 @@ macro_rules! write_line_terminator { } impl Uniq { - pub fn print_uniq(&self, reader: impl BufRead, mut writer: impl Write) -> UResult<()> { + pub fn print_uniq(&self, mut reader: impl BufRead, mut writer: impl Write) -> UResult<()> { let mut first_line_printed = false; let mut group_count = 1; let line_terminator = self.get_line_terminator(); - let mut lines = reader.split(line_terminator); - let mut line = match lines.next() { - Some(l) => l?, - None => return Ok(()), - }; - let writer = &mut writer; - // compare current `line` with consecutive lines (`next_line`) of the input - // and if needed, print `line` based on the command line options provided - for next_line in lines { - let next_line = next_line?; - if self.cmp_keys(&line, &next_line) { + let mut current_buf = Vec::with_capacity(1024); + if !Self::read_line(&mut reader, &mut current_buf, line_terminator)? { + return Ok(()); + } + let mut current_meta = LineMeta::default(); + self.build_meta(¤t_buf, &mut current_meta); + + let mut next_buf = Vec::with_capacity(1024); + let mut next_meta = LineMeta::default(); + + loop { + if !Self::read_line(&mut reader, &mut next_buf, line_terminator)? { + break; + } + + self.build_meta(&next_buf, &mut next_meta); + + if self.keys_differ(¤t_buf, ¤t_meta, &next_buf, &next_meta) { if (group_count == 1 && !self.repeats_only) || (group_count > 1 && !self.uniques_only) { - self.print_line(writer, &line, group_count, first_line_printed)?; + self.print_line(writer, ¤t_buf, group_count, first_line_printed)?; first_line_printed = true; } - line = next_line; + std::mem::swap(&mut current_buf, &mut next_buf); + std::mem::swap(&mut current_meta, &mut next_meta); group_count = 1; } else { if self.all_repeated { - self.print_line(writer, &line, group_count, first_line_printed)?; + self.print_line(writer, ¤t_buf, group_count, first_line_printed)?; first_line_printed = true; - line = next_line; + std::mem::swap(&mut current_buf, &mut next_buf); + std::mem::swap(&mut current_meta, &mut next_meta); } group_count += 1; } + next_buf.clear(); } + if (group_count == 1 && !self.repeats_only) || (group_count > 1 && !self.uniques_only) { - self.print_line(writer, &line, group_count, first_line_printed)?; + self.print_line(writer, ¤t_buf, group_count, first_line_printed)?; first_line_printed = true; } if (self.delimiters == Delimiters::Append || self.delimiters == Delimiters::Both) @@ -113,79 +134,134 @@ impl Uniq { Ok(()) } - fn skip_fields(&self, line: &[u8]) -> Vec { + fn get_line_terminator(&self) -> u8 { + if self.zero_terminated { 0 } else { b'\n' } + } + + fn keys_differ( + &self, + first_line: &[u8], + first_meta: &LineMeta, + second_line: &[u8], + second_meta: &LineMeta, + ) -> bool { + let first_slice = &first_line[first_meta.key_start..first_meta.key_end]; + let second_slice = &second_line[second_meta.key_start..second_meta.key_end]; + + if !self.ignore_case { + return first_slice != second_slice; + } + + let first_cmp = if first_meta.use_lowercase { + first_meta.lowercase.as_slice() + } else { + first_slice + }; + let second_cmp = if second_meta.use_lowercase { + second_meta.lowercase.as_slice() + } else { + second_slice + }; + + first_cmp != second_cmp + } + + fn key_bounds(&self, line: &[u8]) -> (usize, usize) { + let mut start = self.skip_fields_offset(line); + if let Some(skip_bytes) = self.slice_start { + start = start.saturating_add(skip_bytes).min(line.len()); + } + + let end = self.key_end_index(line, start); + (start, end) + } + + fn skip_fields_offset(&self, line: &[u8]) -> usize { if let Some(skip_fields) = self.skip_fields { - let mut line = line.iter(); - let mut line_after_skipped_field: Vec; + let mut idx = 0; for _ in 0..skip_fields { - if line.all(|u| u.is_ascii_whitespace()) { - return Vec::new(); + while idx < line.len() && line[idx].is_ascii_whitespace() { + idx += 1; } - line_after_skipped_field = line - .by_ref() - .skip_while(|u| !u.is_ascii_whitespace()) - .copied() - .collect::>(); - - if line_after_skipped_field.is_empty() { - return Vec::new(); + if idx >= line.len() { + return line.len(); + } + while idx < line.len() && !line[idx].is_ascii_whitespace() { + idx += 1; + } + if idx >= line.len() { + return line.len(); } - line = line_after_skipped_field.iter(); } - line.copied().collect::>() + idx } else { - line.to_vec() + 0 } } - fn get_line_terminator(&self) -> u8 { - if self.zero_terminated { 0 } else { b'\n' } + fn key_end_index(&self, line: &[u8], key_start: usize) -> usize { + let remainder = &line[key_start..]; + match self.slice_stop { + None => line.len(), + Some(limit) => { + if remainder.is_empty() { + return key_start; + } + if let Ok(valid) = std::str::from_utf8(remainder) { + let prefix_len = Self::char_prefix_len(valid, limit); + key_start + prefix_len + } else { + key_start + remainder.len().min(limit) + } + } + } } - fn cmp_keys(&self, first: &[u8], second: &[u8]) -> bool { - self.cmp_key(first, |first_iter| { - self.cmp_key(second, |second_iter| first_iter.ne(second_iter)) - }) + fn char_prefix_len(text: &str, limit: usize) -> usize { + for (count, (idx, _)) in text.char_indices().enumerate() { + if count == limit { + return idx; + } + } + text.len() } - fn cmp_key(&self, line: &[u8], mut closure: F) -> bool - where - F: FnMut(&mut dyn Iterator) -> bool, - { - let fields_to_check = self.skip_fields(line); - - // Skip self.slice_start bytes (if -s was used). - // self.slice_start is how many characters to skip, but historically - // uniq's `-s N` means "skip N *bytes*," so do that literally: - let skip_bytes = self.slice_start.unwrap_or(0); - let fields_to_check = if skip_bytes < fields_to_check.len() { - &fields_to_check[skip_bytes..] - } else { - // If skipping beyond end-of-line, leftover is empty => effectively "" - &[] - }; - - // Convert the leftover bytes to UTF-8 for character-based -w - // If invalid UTF-8, just compare them as individual bytes (fallback). - let Ok(string_after_skip) = std::str::from_utf8(fields_to_check) else { - // Fallback: if invalid UTF-8, treat them as single-byte "chars" - return closure(&mut fields_to_check.iter().map(|&b| b as char)); - }; - - let total_chars = string_after_skip.chars().count(); - - // `-w N` => Compare no more than N characters - let slice_stop = self.slice_stop.unwrap_or(total_chars); - let slice_start = slice_stop.min(total_chars); + fn build_meta(&self, line: &[u8], meta: &mut LineMeta) { + let (key_start, key_end) = self.key_bounds(line); + meta.key_start = key_start; + meta.key_end = key_end; + + if self.ignore_case && key_start < key_end { + let slice = &line[key_start..key_end]; + if slice.iter().any(|b| b.is_ascii_uppercase()) { + meta.lowercase.clear(); + meta.lowercase.reserve(slice.len()); + meta.lowercase + .extend(slice.iter().map(|b| b.to_ascii_lowercase())); + meta.use_lowercase = true; + return; + } + } - let mut iter = string_after_skip.chars().take(slice_start); + meta.use_lowercase = false; + } - if self.ignore_case { - // We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII: - closure(&mut iter.map(|c| c.to_ascii_lowercase())) - } else { - closure(&mut iter) + fn read_line( + reader: &mut impl BufRead, + buffer: &mut Vec, + line_terminator: u8, + ) -> UResult { + buffer.clear(); + let bytes_read = reader + .read_until(line_terminator, buffer) + .map_err_context(|| translate!("uniq-error-read-error"))?; + if bytes_read == 0 { + return Ok(false); } + if buffer.last().is_some_and(|last| *last == line_terminator) { + buffer.pop(); + } + Ok(true) } fn should_print_delimiter(&self, group_count: usize, first_line_printed: bool) -> bool { @@ -214,22 +290,59 @@ impl Uniq { write_line_terminator!(writer, line_terminator)?; } + let mut count_buf = [0u8; Self::COUNT_PREFIX_BUF_SIZE]; + if self.show_counts { - let prefix = format!("{count:7} "); - let out = prefix - .as_bytes() - .iter() - .chain(line.iter()) - .copied() - .collect::>(); - writer.write_all(out.as_slice()) - } else { - writer.write_all(line) + // Call the associated function (no &self) after the refactor above. + let prefix = Self::build_count_prefix(count, &mut count_buf); + writer + .write_all(prefix) + .map_err_context(|| translate!("uniq-error-write-error"))?; } - .map_err_context(|| translate!("uniq-error-write-error"))?; + + writer + .write_all(line) + .map_err_context(|| translate!("uniq-error-write-error"))?; write_line_terminator!(writer, line_terminator) } + + const COUNT_PREFIX_WIDTH: usize = 7; + const COUNT_PREFIX_BUF_SIZE: usize = 32; + + // This function does not use `self`, so make it an associated function. + // Also remove needless explicit lifetimes to satisfy clippy::needless-lifetimes. + fn build_count_prefix(count: usize, buf: &mut [u8; Self::COUNT_PREFIX_BUF_SIZE]) -> &[u8] { + let mut digits_buf = [0u8; 20]; + let mut value = count; + let mut idx = digits_buf.len(); + + if value == 0 { + idx -= 1; + digits_buf[idx] = b'0'; + } else { + while value > 0 { + idx -= 1; + digits_buf[idx] = b'0' + (value % 10) as u8; + value /= 10; + } + } + + let digits = &digits_buf[idx..]; + let width = Self::COUNT_PREFIX_WIDTH; + + if digits.len() <= width { + let pad = width - digits.len(); + buf[..pad].fill(b' '); + buf[pad..pad + digits.len()].copy_from_slice(digits); + buf[width] = b' '; + &buf[..=width] + } else { + buf[..digits.len()].copy_from_slice(digits); + buf[digits.len()] = b' '; + &buf[..=digits.len()] + } + } } fn opt_parsed(opt_name: &str, matches: &ArgMatches) -> UResult> { @@ -741,8 +854,11 @@ fn open_output_file(out_file_name: Option<&OsStr>) -> UResult> { let out_file = File::create(path).map_err_context( || translate!("uniq-error-could-not-open", "path" => path.maybe_quote()), )?; - Box::new(BufWriter::new(out_file)) + Box::new(BufWriter::with_capacity(OUTPUT_BUFFER_CAPACITY, out_file)) } - _ => Box::new(stdout().lock()), + _ => Box::new(BufWriter::with_capacity( + OUTPUT_BUFFER_CAPACITY, + stdout().lock(), + )), }) }