Skip to content

Commit a316262

Browse files
committed
cmp: print verbose diffs as we find them
Before this change, we would first find all changes so we could obtain the largest offset we will report and use that to set up the padding. Now we use the file sizes to estimate the largest possible offset. Not only does this allow us to print earlier, reduces memory usage, as we do not store diffs to report later, but it also fixes a case in which our output was different to GNU cmp's - because it also seems to estimate based on size. Memory usage drops by a factor of 1000(!), without losing performance while comparing 2 binaries of hundreds of MBs: Before: Maximum resident set size (kbytes): 2489260 Benchmark 1: ../target/release/diffutils \ cmp -l -b /usr/lib64/chromium-browser/chromium-browser /usr/lib64/firefox/libxul.so Time (mean ± σ): 14.466 s ± 0.166 s [User: 12.367 s, System: 2.012 s] Range (min … max): 14.350 s … 14.914 s 10 runs After: Maximum resident set size (kbytes): 2636 Benchmark 1: ../target/release/diffutils \ cmp -l -b /usr/lib64/chromium-browser/chromium-browser /usr/lib64/firefox/libxul.so Time (mean ± σ): 13.724 s ± 0.038 s [User: 12.263 s, System: 1.372 s] Range (min … max): 13.667 s … 13.793 s 10 runs
1 parent 0bf04b4 commit a316262

File tree

2 files changed

+93
-99
lines changed

2 files changed

+93
-99
lines changed

src/cmp.rs

Lines changed: 92 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use std::ffi::OsString;
99
use std::io::{BufRead, BufReader, BufWriter, Read, Write};
1010
use std::iter::Peekable;
1111
use std::process::ExitCode;
12-
use std::{fs, io};
12+
use std::{cmp, fs, io};
1313

1414
#[cfg(not(target_os = "windows"))]
1515
use std::os::fd::{AsRawFd, FromRawFd};
@@ -320,10 +320,35 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
320320
let mut from = prepare_reader(&params.from, &params.skip_a, params)?;
321321
let mut to = prepare_reader(&params.to, &params.skip_b, params)?;
322322

323+
let mut offset_width = params.max_bytes.unwrap_or(usize::MAX);
324+
325+
if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(&params.from), fs::metadata(&params.to)) {
326+
#[cfg(not(target_os = "windows"))]
327+
let (a_size, b_size) = (a_meta.size(), b_meta.size());
328+
329+
#[cfg(target_os = "windows")]
330+
let (a_size, b_size) = (a_meta.file_size(), b_meta.file_size());
331+
332+
// If the files have different sizes, we already know they are not identical. If we have not
333+
// been asked to show even the first difference, we can quit early.
334+
if params.quiet && a_size != b_size {
335+
return Ok(Cmp::Different);
336+
}
337+
338+
let smaller = cmp::min(a_size, b_size) as usize;
339+
offset_width = cmp::min(smaller, offset_width);
340+
}
341+
342+
let offset_width = 1 + offset_width.checked_ilog10().unwrap_or(1) as usize;
343+
344+
// Capacity calc: at_byte width + 2 x 3-byte octal numbers + 2 x 4-byte value + 4 spaces
345+
let mut output = Vec::<u8>::with_capacity(offset_width + 3 * 2 + 4 * 2 + 4);
346+
323347
let mut at_byte = 1;
324348
let mut at_line = 1;
325349
let mut start_of_line = true;
326-
let mut verbose_diffs = vec![];
350+
let mut stdout = BufWriter::new(io::stdout().lock());
351+
let mut compare = Cmp::Equal;
327352
loop {
328353
// Fill up our buffers.
329354
let from_buf = match from.fill_buf() {
@@ -360,10 +385,6 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
360385
&params.to.to_string_lossy()
361386
};
362387

363-
if params.verbose {
364-
report_verbose_diffs(verbose_diffs, params)?;
365-
}
366-
367388
report_eof(at_byte, at_line, start_of_line, eof_on, params);
368389
return Ok(Cmp::Different);
369390
}
@@ -395,8 +416,24 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
395416
// first one runs out.
396417
for (&from_byte, &to_byte) in from_buf.iter().zip(to_buf.iter()) {
397418
if from_byte != to_byte {
419+
compare = Cmp::Different;
420+
398421
if params.verbose {
399-
verbose_diffs.push((at_byte, from_byte, to_byte));
422+
format_verbose_difference(
423+
from_byte,
424+
to_byte,
425+
at_byte,
426+
offset_width,
427+
&mut output,
428+
params,
429+
)?;
430+
stdout.write_all(output.as_slice()).map_err(|e| {
431+
format!(
432+
"{}: error printing output: {e}",
433+
params.executable.to_string_lossy()
434+
)
435+
})?;
436+
output.clear();
400437
} else {
401438
report_difference(from_byte, to_byte, at_byte, at_line, params);
402439
return Ok(Cmp::Different);
@@ -422,12 +459,7 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
422459
to.consume(consumed);
423460
}
424461

425-
if params.verbose && !verbose_diffs.is_empty() {
426-
report_verbose_diffs(verbose_diffs, params)?;
427-
return Ok(Cmp::Different);
428-
}
429-
430-
Ok(Cmp::Equal)
462+
Ok(compare)
431463
}
432464

433465
// Exit codes are documented at
@@ -450,21 +482,6 @@ pub fn main(opts: Peekable<ArgsOs>) -> ExitCode {
450482
return ExitCode::SUCCESS;
451483
}
452484

453-
// If the files have different sizes, we already know they are not identical. If we have not
454-
// been asked to show even the first difference, we can quit early.
455-
if params.quiet {
456-
if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(&params.from), fs::metadata(&params.to)) {
457-
#[cfg(not(target_os = "windows"))]
458-
if a_meta.size() != b_meta.size() {
459-
return ExitCode::from(1);
460-
}
461-
#[cfg(target_os = "windows")]
462-
if a_meta.file_size() != b_meta.file_size() {
463-
return ExitCode::from(1);
464-
}
465-
}
466-
}
467-
468485
match cmp(&params) {
469486
Ok(Cmp::Equal) => ExitCode::SUCCESS,
470487
Ok(Cmp::Different) => ExitCode::from(1),
@@ -533,99 +550,76 @@ fn format_byte(byte: u8) -> String {
533550
// This function has been optimized to not use the Rust fmt system, which
534551
// leads to a massive speed up when processing large files: cuts the time
535552
// for comparing 2 ~36MB completely different files in half on an M1 Max.
536-
fn report_verbose_diffs(diffs: Vec<(usize, u8, u8)>, params: &Params) -> Result<(), String> {
553+
#[inline]
554+
fn format_verbose_difference(
555+
from_byte: u8,
556+
to_byte: u8,
557+
at_byte: usize,
558+
offset_width: usize,
559+
output: &mut Vec<u8>,
560+
params: &Params,
561+
) -> Result<(), String> {
537562
assert!(!params.quiet);
538563

539-
let mut stdout = BufWriter::new(io::stdout().lock());
540-
if let Some((offset, _, _)) = diffs.last() {
541-
// Obtain the width of the first column from the last byte offset.
542-
let width = format!("{}", offset).len();
543-
544-
let mut at_byte_buf = itoa::Buffer::new();
545-
let mut from_oct = [0u8; 3]; // for octal conversions
546-
let mut to_oct = [0u8; 3];
547-
548-
// Capacity calc: at_byte width + 2 x 3-byte octal numbers + 4-byte value + up to 2 byte value + 4 spaces
549-
let mut output = Vec::<u8>::with_capacity(width + 3 * 2 + 4 + 2 + 4);
550-
551-
if params.print_bytes {
552-
for (at_byte, from_byte, to_byte) in diffs {
553-
output.clear();
554-
555-
// "{:>width$} {:>3o} {:4} {:>3o} {}",
556-
let at_byte_str = at_byte_buf.format(at_byte);
557-
let at_byte_padding = width - at_byte_str.len();
564+
let mut at_byte_buf = itoa::Buffer::new();
565+
let mut from_oct = [0u8; 3]; // for octal conversions
566+
let mut to_oct = [0u8; 3];
558567

559-
for _ in 0..at_byte_padding {
560-
output.push(b' ')
561-
}
562-
563-
output.extend_from_slice(at_byte_str.as_bytes());
564-
565-
output.push(b' ');
568+
if params.print_bytes {
569+
// "{:>width$} {:>3o} {:4} {:>3o} {}",
570+
let at_byte_str = at_byte_buf.format(at_byte);
571+
let at_byte_padding = offset_width.saturating_sub(at_byte_str.len());
566572

567-
output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());
573+
for _ in 0..at_byte_padding {
574+
output.push(b' ')
575+
}
568576

569-
output.push(b' ');
577+
output.extend_from_slice(at_byte_str.as_bytes());
570578

571-
let from_byte_str = format_byte(from_byte);
572-
let from_byte_padding = 4 - from_byte_str.len();
579+
output.push(b' ');
573580

574-
output.extend_from_slice(from_byte_str.as_bytes());
581+
output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());
575582

576-
for _ in 0..from_byte_padding {
577-
output.push(b' ')
578-
}
583+
output.push(b' ');
579584

580-
output.push(b' ');
585+
let from_byte_str = format_byte(from_byte);
586+
let from_byte_padding = 4 - from_byte_str.len();
581587

582-
output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());
588+
output.extend_from_slice(from_byte_str.as_bytes());
583589

584-
output.push(b' ');
590+
for _ in 0..from_byte_padding {
591+
output.push(b' ')
592+
}
585593

586-
output.extend_from_slice(format_byte(to_byte).as_bytes());
594+
output.push(b' ');
587595

588-
output.push(b'\n');
596+
output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());
589597

590-
stdout.write_all(output.as_slice()).map_err(|e| {
591-
format!(
592-
"{}: error printing output: {e}",
593-
params.executable.to_string_lossy()
594-
)
595-
})?;
596-
}
597-
} else {
598-
for (at_byte, from_byte, to_byte) in diffs {
599-
output.clear();
598+
output.push(b' ');
600599

601-
// "{:>width$} {:>3o} {:>3o}"
602-
let at_byte_str = at_byte_buf.format(at_byte);
603-
let at_byte_padding = width - at_byte_str.len();
600+
output.extend_from_slice(format_byte(to_byte).as_bytes());
604601

605-
for _ in 0..at_byte_padding {
606-
output.push(b' ')
607-
}
602+
output.push(b'\n');
603+
} else {
604+
// "{:>width$} {:>3o} {:>3o}"
605+
let at_byte_str = at_byte_buf.format(at_byte);
606+
let at_byte_padding = offset_width - at_byte_str.len();
608607

609-
output.extend_from_slice(at_byte_str.as_bytes());
608+
for _ in 0..at_byte_padding {
609+
output.push(b' ')
610+
}
610611

611-
output.push(b' ');
612+
output.extend_from_slice(at_byte_str.as_bytes());
612613

613-
output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());
614+
output.push(b' ');
614615

615-
output.push(b' ');
616+
output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());
616617

617-
output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());
618+
output.push(b' ');
618619

619-
output.push(b'\n');
620+
output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());
620621

621-
stdout.write_all(output.as_slice()).map_err(|e| {
622-
format!(
623-
"{}: error printing output: {e}",
624-
params.executable.to_string_lossy()
625-
)
626-
})?;
627-
}
628-
}
622+
output.push(b'\n');
629623
}
630624

631625
Ok(())

tests/integration.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,7 @@ mod cmp {
616616
.code(predicate::eq(1))
617617
.failure()
618618
.stderr(predicate::str::is_empty())
619-
.stdout(predicate::eq("4 40 144 d\n8 40 150 h\n"));
619+
.stdout(predicate::eq(" 4 40 144 d\n 8 40 150 h\n"));
620620
Ok(())
621621
}
622622

0 commit comments

Comments
 (0)