Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/uu/uniq/locales/en-US.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ uniq-help-zero-terminated = end lines with 0 byte, not newline
# Error messages
uniq-error-write-line-terminator = Could not write line terminator
uniq-error-write-error = write error
uniq-error-read-error = read error
uniq-error-invalid-argument = Invalid argument for { $opt_name }: { $arg }
uniq-error-try-help = Try 'uniq --help' for more information.
uniq-error-group-mutually-exclusive = --group is mutually exclusive with -c/-d/-D/-u
Expand Down
1 change: 1 addition & 0 deletions src/uu/uniq/locales/fr-FR.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ uniq-help-zero-terminated = terminer les lignes avec un octet 0, pas une nouvell
# Messages d'erreur
uniq-error-write-line-terminator = Impossible d'écrire le terminateur de ligne
uniq-error-write-error = erreur d'écriture
uniq-error-read-error = erreur de lecture
uniq-error-invalid-argument = Argument invalide pour { $opt_name } : { $arg }
uniq-error-try-help = Essayez 'uniq --help' pour plus d'informations.
uniq-error-group-mutually-exclusive = --group est mutuellement exclusif avec -c/-d/-D/-u
Expand Down
290 changes: 203 additions & 87 deletions src/uu/uniq/src/uniq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ enum Delimiters {
None,
}

const OUTPUT_BUFFER_CAPACITY: usize = 128 * 1024;

struct Uniq {
repeats_only: bool,
uniques_only: bool,
Expand All @@ -55,6 +57,14 @@ struct Uniq {
zero_terminated: bool,
}

#[derive(Default)]
struct LineMeta {
key_start: usize,
key_end: usize,
lowercase: Vec<u8>,
use_lowercase: bool,
}

macro_rules! write_line_terminator {
($writer:expr, $line_terminator:expr) => {
$writer
Expand All @@ -64,42 +74,53 @@ macro_rules! write_line_terminator {
}

impl Uniq {
pub fn print_uniq(&self, reader: impl BufRead, mut writer: impl Write) -> UResult<()> {
pub fn print_uniq(&self, mut reader: impl BufRead, mut writer: impl Write) -> UResult<()> {
let mut first_line_printed = false;
let mut group_count = 1;
let line_terminator = self.get_line_terminator();
let mut lines = reader.split(line_terminator);
let mut line = match lines.next() {
Some(l) => l?,
None => return Ok(()),
};

let writer = &mut writer;

// compare current `line` with consecutive lines (`next_line`) of the input
// and if needed, print `line` based on the command line options provided
for next_line in lines {
let next_line = next_line?;
if self.cmp_keys(&line, &next_line) {
let mut current_buf = Vec::with_capacity(1024);
if !Self::read_line(&mut reader, &mut current_buf, line_terminator)? {
return Ok(());
}
let mut current_meta = LineMeta::default();
self.build_meta(&current_buf, &mut current_meta);

let mut next_buf = Vec::with_capacity(1024);
let mut next_meta = LineMeta::default();

loop {
if !Self::read_line(&mut reader, &mut next_buf, line_terminator)? {
break;
}

self.build_meta(&next_buf, &mut next_meta);

if self.keys_differ(&current_buf, &current_meta, &next_buf, &next_meta) {
if (group_count == 1 && !self.repeats_only)
|| (group_count > 1 && !self.uniques_only)
{
self.print_line(writer, &line, group_count, first_line_printed)?;
self.print_line(writer, &current_buf, group_count, first_line_printed)?;
first_line_printed = true;
}
line = next_line;
std::mem::swap(&mut current_buf, &mut next_buf);
std::mem::swap(&mut current_meta, &mut next_meta);
group_count = 1;
} else {
if self.all_repeated {
self.print_line(writer, &line, group_count, first_line_printed)?;
self.print_line(writer, &current_buf, group_count, first_line_printed)?;
first_line_printed = true;
line = next_line;
std::mem::swap(&mut current_buf, &mut next_buf);
std::mem::swap(&mut current_meta, &mut next_meta);
}
group_count += 1;
}
next_buf.clear();
}

if (group_count == 1 && !self.repeats_only) || (group_count > 1 && !self.uniques_only) {
self.print_line(writer, &line, group_count, first_line_printed)?;
self.print_line(writer, &current_buf, group_count, first_line_printed)?;
first_line_printed = true;
}
if (self.delimiters == Delimiters::Append || self.delimiters == Delimiters::Both)
Expand All @@ -113,79 +134,134 @@ impl Uniq {
Ok(())
}

fn skip_fields(&self, line: &[u8]) -> Vec<u8> {
fn get_line_terminator(&self) -> u8 {
if self.zero_terminated { 0 } else { b'\n' }
}

fn keys_differ(
&self,
first_line: &[u8],
first_meta: &LineMeta,
second_line: &[u8],
second_meta: &LineMeta,
) -> bool {
let first_slice = &first_line[first_meta.key_start..first_meta.key_end];
let second_slice = &second_line[second_meta.key_start..second_meta.key_end];

if !self.ignore_case {
return first_slice != second_slice;
}

let first_cmp = if first_meta.use_lowercase {
first_meta.lowercase.as_slice()
} else {
first_slice
};
let second_cmp = if second_meta.use_lowercase {
second_meta.lowercase.as_slice()
} else {
second_slice
};

first_cmp != second_cmp
}

fn key_bounds(&self, line: &[u8]) -> (usize, usize) {
let mut start = self.skip_fields_offset(line);
if let Some(skip_bytes) = self.slice_start {
start = start.saturating_add(skip_bytes).min(line.len());
}

let end = self.key_end_index(line, start);
(start, end)
}

fn skip_fields_offset(&self, line: &[u8]) -> usize {
if let Some(skip_fields) = self.skip_fields {
let mut line = line.iter();
let mut line_after_skipped_field: Vec<u8>;
let mut idx = 0;
for _ in 0..skip_fields {
if line.all(|u| u.is_ascii_whitespace()) {
return Vec::new();
while idx < line.len() && line[idx].is_ascii_whitespace() {
idx += 1;
}
line_after_skipped_field = line
.by_ref()
.skip_while(|u| !u.is_ascii_whitespace())
.copied()
.collect::<Vec<u8>>();

if line_after_skipped_field.is_empty() {
return Vec::new();
if idx >= line.len() {
return line.len();
}
while idx < line.len() && !line[idx].is_ascii_whitespace() {
idx += 1;
}
if idx >= line.len() {
return line.len();
}
line = line_after_skipped_field.iter();
}
line.copied().collect::<Vec<u8>>()
idx
} else {
line.to_vec()
0
}
}

fn get_line_terminator(&self) -> u8 {
if self.zero_terminated { 0 } else { b'\n' }
fn key_end_index(&self, line: &[u8], key_start: usize) -> usize {
let remainder = &line[key_start..];
match self.slice_stop {
None => line.len(),
Some(limit) => {
if remainder.is_empty() {
return key_start;
}
if let Ok(valid) = std::str::from_utf8(remainder) {
let prefix_len = Self::char_prefix_len(valid, limit);
key_start + prefix_len
} else {
key_start + remainder.len().min(limit)
}
}
}
}

fn cmp_keys(&self, first: &[u8], second: &[u8]) -> bool {
self.cmp_key(first, |first_iter| {
self.cmp_key(second, |second_iter| first_iter.ne(second_iter))
})
fn char_prefix_len(text: &str, limit: usize) -> usize {
for (count, (idx, _)) in text.char_indices().enumerate() {
if count == limit {
return idx;
}
}
text.len()
}

fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
where
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
{
let fields_to_check = self.skip_fields(line);

// Skip self.slice_start bytes (if -s was used).
// self.slice_start is how many characters to skip, but historically
// uniq's `-s N` means "skip N *bytes*," so do that literally:
let skip_bytes = self.slice_start.unwrap_or(0);
let fields_to_check = if skip_bytes < fields_to_check.len() {
&fields_to_check[skip_bytes..]
} else {
// If skipping beyond end-of-line, leftover is empty => effectively ""
&[]
};

// Convert the leftover bytes to UTF-8 for character-based -w
// If invalid UTF-8, just compare them as individual bytes (fallback).
let Ok(string_after_skip) = std::str::from_utf8(fields_to_check) else {
// Fallback: if invalid UTF-8, treat them as single-byte "chars"
return closure(&mut fields_to_check.iter().map(|&b| b as char));
};

let total_chars = string_after_skip.chars().count();

// `-w N` => Compare no more than N characters
let slice_stop = self.slice_stop.unwrap_or(total_chars);
let slice_start = slice_stop.min(total_chars);
fn build_meta(&self, line: &[u8], meta: &mut LineMeta) {
let (key_start, key_end) = self.key_bounds(line);
meta.key_start = key_start;
meta.key_end = key_end;

if self.ignore_case && key_start < key_end {
let slice = &line[key_start..key_end];
if slice.iter().any(|b| b.is_ascii_uppercase()) {
meta.lowercase.clear();
meta.lowercase.reserve(slice.len());
meta.lowercase
.extend(slice.iter().map(|b| b.to_ascii_lowercase()));
meta.use_lowercase = true;
return;
}
}

let mut iter = string_after_skip.chars().take(slice_start);
meta.use_lowercase = false;
}

if self.ignore_case {
// We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
} else {
closure(&mut iter)
fn read_line(
reader: &mut impl BufRead,
buffer: &mut Vec<u8>,
line_terminator: u8,
) -> UResult<bool> {
buffer.clear();
let bytes_read = reader
.read_until(line_terminator, buffer)
.map_err_context(|| translate!("uniq-error-read-error"))?;
if bytes_read == 0 {
return Ok(false);
}
if buffer.last().is_some_and(|last| *last == line_terminator) {
buffer.pop();
}
Ok(true)
}

fn should_print_delimiter(&self, group_count: usize, first_line_printed: bool) -> bool {
Expand Down Expand Up @@ -214,22 +290,59 @@ impl Uniq {
write_line_terminator!(writer, line_terminator)?;
}

let mut count_buf = [0u8; Self::COUNT_PREFIX_BUF_SIZE];

if self.show_counts {
let prefix = format!("{count:7} ");
let out = prefix
.as_bytes()
.iter()
.chain(line.iter())
.copied()
.collect::<Vec<u8>>();
writer.write_all(out.as_slice())
} else {
writer.write_all(line)
// Call the associated function (no &self) after the refactor above.
let prefix = Self::build_count_prefix(count, &mut count_buf);
writer
.write_all(prefix)
.map_err_context(|| translate!("uniq-error-write-error"))?;
}
.map_err_context(|| translate!("uniq-error-write-error"))?;

writer
.write_all(line)
.map_err_context(|| translate!("uniq-error-write-error"))?;

write_line_terminator!(writer, line_terminator)
}

const COUNT_PREFIX_WIDTH: usize = 7;
const COUNT_PREFIX_BUF_SIZE: usize = 32;

// This function does not use `self`, so make it an associated function.
// Also remove needless explicit lifetimes to satisfy clippy::needless-lifetimes.
fn build_count_prefix(count: usize, buf: &mut [u8; Self::COUNT_PREFIX_BUF_SIZE]) -> &[u8] {
let mut digits_buf = [0u8; 20];
let mut value = count;
let mut idx = digits_buf.len();

if value == 0 {
idx -= 1;
digits_buf[idx] = b'0';
} else {
while value > 0 {
idx -= 1;
digits_buf[idx] = b'0' + (value % 10) as u8;
value /= 10;
}
}

let digits = &digits_buf[idx..];
let width = Self::COUNT_PREFIX_WIDTH;

if digits.len() <= width {
let pad = width - digits.len();
buf[..pad].fill(b' ');
buf[pad..pad + digits.len()].copy_from_slice(digits);
buf[width] = b' ';
&buf[..=width]
} else {
buf[..digits.len()].copy_from_slice(digits);
buf[digits.len()] = b' ';
&buf[..=digits.len()]
}
}
}

fn opt_parsed(opt_name: &str, matches: &ArgMatches) -> UResult<Option<usize>> {
Expand Down Expand Up @@ -741,8 +854,11 @@ fn open_output_file(out_file_name: Option<&OsStr>) -> UResult<Box<dyn Write>> {
let out_file = File::create(path).map_err_context(
|| translate!("uniq-error-could-not-open", "path" => path.maybe_quote()),
)?;
Box::new(BufWriter::new(out_file))
Box::new(BufWriter::with_capacity(OUTPUT_BUFFER_CAPACITY, out_file))
}
_ => Box::new(stdout().lock()),
_ => Box::new(BufWriter::with_capacity(
OUTPUT_BUFFER_CAPACITY,
stdout().lock(),
)),
})
}
Loading