From f7f291de0d5a5b0b2762eccb95b1b966b3991ecb Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Sun, 18 Jan 2026 09:48:42 -0500 Subject: [PATCH 1/2] pr: remove inaccurate unit test Remove unit test for `pr` that was enforcing incorrect behavior. These tests were ostensibly designed to match corresponding ones in the GNU test suite, but they don't match the current behavior of GNU `pr`, so it is not valuable to keep them. --- tests/by-util/test_pr.rs | 46 ---------------------------------------- 1 file changed, 46 deletions(-) diff --git a/tests/by-util/test_pr.rs b/tests/by-util/test_pr.rs index 19afb57bc45..2220e9c3969 100644 --- a/tests/by-util/test_pr.rs +++ b/tests/by-util/test_pr.rs @@ -478,52 +478,6 @@ fn test_with_date_format_env() { .stdout_matches(®ex); } -#[test] -fn test_with_pr_core_utils_tests() { - let test_cases = vec![ - ("", vec!["0Ft"], vec!["0F"], 0), - ("", vec!["0Fnt"], vec!["0Fnt-expected"], 0), - ("+3", vec!["0Ft"], vec!["3-0F"], 0), - ("+3 -f", vec!["0Ft"], vec!["3f-0F"], 0), - ("-a -3", vec!["0Ft"], vec!["a3-0F"], 0), - ("-a -3 -f", vec!["0Ft"], vec!["a3f-0F"], 0), - ("-a -3 -f", vec!["0Fnt"], vec!["a3f-0Fnt-expected"], 0), - ("+3 -a -3 -f", vec!["0Ft"], vec!["3a3f-0F"], 0), - ("-l 24", vec!["FnFn"], vec!["l24-FF"], 0), - ("-W 20 -l24 -f", vec!["tFFt-ll"], vec!["W20l24f-ll"], 0), - ]; - - for test_case in test_cases { - let (flags, input_file, expected_file, return_code) = test_case; - let mut scenario = new_ucmd!(); - let input_file_path = input_file.first().unwrap(); - let test_file_path = expected_file.first().unwrap(); - let value = file_last_modified_time(&scenario, input_file_path); - let mut arguments: Vec<&str> = flags - .split(' ') - .filter(|i| i.trim() != "") - .collect::>(); - - arguments.extend(input_file.clone()); - - let scenario_with_args = scenario.args(&arguments); - - let scenario_with_expected_status = if return_code == 0 { - scenario_with_args.succeeds() - } else { - scenario_with_args.fails() - }; - - scenario_with_expected_status.stdout_is_templated_fixture( - test_file_path, - &[ - ("{last_modified_time}", &value), - ("{file_name}", input_file_path), - ], - ); - } -} - #[test] fn test_with_join_lines_option() { let test_file_1 = "hosts.log"; From aedca01137089e35a7014d3375bb2b6d753d2b34 Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Sun, 18 Jan 2026 09:55:29 -0500 Subject: [PATCH 2/2] pr: uniformly scan for form feed and newline chars Fix the way form feed characters are interpreted by changing the way lines and pages are found. Before this commit, a file comprising two form feed characters (`/f/f`) would result in too few trailing newlines at the end of the second page. After this change, each page is produced with the correct number of lines. This commit changes the way files are read, replacing complex iterators with a loop-based approach, iteratively scanning for newline or form feed characters. The `memchr` library is used to efficiently scan for these two characters. One downside of this implementation is that it currently reads the entire input file into memory; this can be improved in subsequent merge requests. --- Cargo.lock | 1 + src/uu/pr/Cargo.toml | 1 + src/uu/pr/src/pr.rs | 417 +++++++++++++++++++-------------------- tests/by-util/test_pr.rs | 25 +++ 4 files changed, 229 insertions(+), 215 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3cac5c62fe8..e0da285d3f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3690,6 +3690,7 @@ dependencies = [ "clap", "fluent", "itertools 0.14.0", + "memchr", "regex", "thiserror 2.0.17", "uucore", diff --git a/src/uu/pr/Cargo.toml b/src/uu/pr/Cargo.toml index d8b5b279117..4eb7539ddf7 100644 --- a/src/uu/pr/Cargo.toml +++ b/src/uu/pr/Cargo.toml @@ -21,6 +21,7 @@ path = "src/pr.rs" clap = { workspace = true } uucore = { workspace = true, features = ["entries", "time"] } itertools = { workspace = true } +memchr = { workspace = true } regex = { workspace = true } thiserror = { workspace = true } fluent = { workspace = true } diff --git a/src/uu/pr/src/pr.rs b/src/uu/pr/src/pr.rs index a5a8b7b570d..05a0eea10e3 100644 --- a/src/uu/pr/src/pr.rs +++ b/src/uu/pr/src/pr.rs @@ -9,10 +9,9 @@ use clap::{Arg, ArgAction, ArgMatches, Command}; use itertools::Itertools; use regex::Regex; -use std::fs::{File, metadata}; -use std::io::{BufRead, BufReader, Lines, Read, Write, stdin, stdout}; -#[cfg(unix)] -use std::os::unix::fs::FileTypeExt; +use std::fs::metadata; +use std::io::{Read, Write, stdin, stdout}; +use std::string::FromUtf8Error; use std::time::SystemTime; use thiserror::Error; @@ -28,11 +27,11 @@ const LINES_PER_PAGE_FOR_FORM_FEED: usize = 63; const HEADER_LINES_PER_PAGE: usize = 5; const TRAILER_LINES_PER_PAGE: usize = 5; const FILE_STDIN: &str = "-"; -const READ_BUFFER_SIZE: usize = 1024 * 64; const DEFAULT_COLUMN_WIDTH: usize = 72; const DEFAULT_COLUMN_WIDTH_WITH_S_OPTION: usize = 512; const DEFAULT_COLUMN_SEPARATOR: &char = &TAB; const FF: u8 = 0x0C_u8; +const NL: u8 = b'\n'; mod options { pub const HEADER: &str = "header"; @@ -82,13 +81,32 @@ struct OutputOptions { line_width: Option, } +/// One line of an input file, annotated with file, page, and line number. +#[derive(Default, Clone)] struct FileLine { file_id: usize, - line_number: usize, page_number: usize, - group_key: usize, - line_content: Result, - form_feeds_after: usize, + line_number: usize, + line_content: String, +} + +impl FileLine { + fn from_buf( + file_id: usize, + page_number: usize, + line_number: usize, + buf: &[u8], + ) -> Result { + // TODO Don't read bytes to String just to directly write them + // out again anyway. + let line_content = String::from_utf8(buf.to_vec())?; + Ok(Self { + file_id, + page_number, + line_number, + line_content, + }) + } } struct ColumnModeOptions { @@ -115,21 +133,16 @@ impl Default for NumberingMode { } } -impl Default for FileLine { - fn default() -> Self { - Self { - file_id: 0, - line_number: 0, - page_number: 0, - group_key: 0, - line_content: Ok(String::new()), - form_feeds_after: 0, +impl From for PrError { + fn from(err: std::io::Error) -> Self { + Self::EncounteredErrors { + msg: err.to_string(), } } } -impl From for PrError { - fn from(err: std::io::Error) -> Self { +impl From for PrError { + fn from(err: FromUtf8Error) -> Self { Self::EncounteredErrors { msg: err.to_string(), } @@ -138,23 +151,8 @@ impl From for PrError { #[derive(Debug, Error)] enum PrError { - #[error("{}", translate!("pr-error-reading-input", "file" => file.clone()))] - Input { - #[source] - source: std::io::Error, - file: String, - }, - #[error("{}", translate!("pr-error-unknown-filetype", "file" => file.clone()))] - UnknownFiletype { file: String }, #[error("pr: {msg}")] EncounteredErrors { msg: String }, - #[error("{}", translate!("pr-error-is-directory", "file" => file.clone()))] - IsDirectory { file: String }, - #[cfg(not(windows))] - #[error("{}", translate!("pr-error-socket-not-supported", "file" => file.clone()))] - IsSocket { file: String }, - #[error("{}", translate!("pr-error-no-such-file", "file" => file.clone()))] - NotExists { file: String }, } pub fn uu_app() -> Command { @@ -764,95 +762,22 @@ fn build_options( }) } -fn open(path: &str) -> Result, PrError> { - if path == FILE_STDIN { - let stdin = stdin(); - return Ok(Box::new(stdin) as Box); - } - - metadata(path).map_or_else( - |_| { - Err(PrError::NotExists { - file: path.to_string(), - }) - }, - |i| { - let path_string = path.to_string(); - match i.file_type() { - #[cfg(unix)] - ft if ft.is_socket() => Err(PrError::IsSocket { file: path_string }), - ft if ft.is_dir() => Err(PrError::IsDirectory { file: path_string }), - - ft => { - #[allow(unused_mut)] - let mut is_valid = ft.is_file() || ft.is_symlink(); - - #[cfg(unix)] - { - is_valid = - is_valid || ft.is_char_device() || ft.is_block_device() || ft.is_fifo(); - } - - if is_valid { - Ok(Box::new(File::open(path).map_err(|e| PrError::Input { - source: e, - file: path.to_string(), - })?) as Box) - } else { - Err(PrError::UnknownFiletype { file: path_string }) - } - } - } - }, - ) -} - -fn split_lines_if_form_feed(file_content: Result) -> Vec { - file_content.map_or_else( - |e| { - vec![FileLine { - line_content: Err(e), - ..FileLine::default() - }] - }, - |content| { - let mut lines = Vec::new(); - let mut f_occurred = 0; - let mut chunk = Vec::new(); - for byte in content.as_bytes() { - if byte == &FF { - f_occurred += 1; - } else { - if f_occurred != 0 { - // First time byte occurred in the scan - lines.push(FileLine { - line_content: Ok(String::from_utf8(chunk.clone()).unwrap()), - form_feeds_after: f_occurred, - ..FileLine::default() - }); - chunk.clear(); - } - chunk.push(*byte); - f_occurred = 0; - } - } - - lines.push(FileLine { - line_content: Ok(String::from_utf8(chunk).unwrap()), - form_feeds_after: f_occurred, - ..FileLine::default() - }); - - lines - }, - ) -} - fn pr(path: &str, options: &OutputOptions) -> Result { - let lines = BufReader::with_capacity(READ_BUFFER_SIZE, open(path)?).lines(); + // Read the entire contents of the file into a buffer. + // + // TODO Read incrementally. + let buf = if path == "-" { + let mut f = stdin(); + let mut buf = vec![]; + f.read_to_end(&mut buf)?; + buf + } else { + std::fs::read(path)? + }; - let pages = read_stream_and_create_pages(options, lines, 0); + let pages = get_pages(options, 0, &buf)?; + // Split the text into pages, and then print each line in each page. for page_with_page_number in pages { let page_number = page_with_page_number.0 + 1; let page = page_with_page_number.1; @@ -862,115 +787,180 @@ fn pr(path: &str, options: &OutputOptions) -> Result { Ok(0) } -fn read_stream_and_create_pages( +/// Group lines of a file into pages. +/// +/// Returns a list of the form `(page_num, lines)`. +/// +/// # Errors +/// +/// Returns an error if the bytes are not a valid UTF-8 string. +fn get_pages( options: &OutputOptions, - lines: Lines>>, file_id: usize, -) -> Box)>> { + buf: &[u8], +) -> Result)>, FromUtf8Error> { let start_page = options.start_page; - let start_line_number = get_start_line_number(options); - let last_page = options.end_page; + let end_page = options.end_page; let lines_needed_per_page = lines_to_read_for_page(options); - Box::new( - lines - .flat_map(split_lines_if_form_feed) - .enumerate() - .map(move |(i, line)| FileLine { - line_number: i + start_line_number, - file_id, - ..line - }) // Add line number and file_id - .batching(move |it| { - let mut first_page = Vec::new(); - let mut page_with_lines = Vec::new(); - for line in it { - let form_feeds_after = line.form_feeds_after; - first_page.push(line); - - if form_feeds_after > 1 { - // insert empty pages - page_with_lines.push(first_page); - for _i in 1..form_feeds_after { - page_with_lines.push(vec![]); - } - return Some(page_with_lines); - } - - if first_page.len() == lines_needed_per_page || form_feeds_after == 1 { - break; - } + // Keep a running total of the number of lines read, starting with + // 0 or another specified number. + let mut line_num = get_start_line_number(options); + + // We will collect each page into a list of pages, along with + // its page number. + let mut pages: Vec<(usize, Vec)> = vec![]; + + // We will build each page iteratively, since one page may + // contain multiple lines and may be interrupted by either a + // form feed or by reaching a line limit. + let mut page = vec![]; + let mut page_num = 0; + + // Remember the index of the end of the last line to use as the + // beginning of the next line. + let mut prev = 0; + + // Search for either the form feed character `\f` or the newline + // character `\n`. The newline character marks the end of a line, + // and a page comprises several lines. A form feed character marks + // the end of a page regardless of how many lines have been read. + for i in memchr::memchr2_iter(FF, NL, buf) { + if buf[i] == FF { + // Treat everything up to (but not including) the form feed + // character as the last line of the page. + let file_line = FileLine::from_buf(file_id, page_num, line_num, &buf[prev..i])?; + page.push(file_line); + + // Remember where the last line ended. + prev = i + 1; + + // The page is finished, so we add it to the list of + // pages and clear the `page` buffer for the next + // iteration. + // + // TODO Optimization opportunity: don't bother pushing + // lines and pages if we aren't going to display it. + if start_page <= page_num + 1 && end_page.is_none_or(|e| page_num < e) { + pages.push((page_num, page.clone())); + } + page_num += 1; + page.clear(); + } else { + // Add everything up to (but not including) the newline + // character as one line of the page. + let file_line = FileLine::from_buf(file_id, page_num, line_num, &buf[prev..i])?; + page.push(file_line); + line_num += 1; + + // Remember where the last line ended. + prev = i + 1; + + // If the page is finished, add it to the list of pages + // and clear the `page` buffer for the next iteration. + if page.len() >= lines_needed_per_page { + if start_page <= page_num + 1 && end_page.is_none_or(|e| page_num < e) { + pages.push((page_num, page.clone())); } + page_num += 1; + page.clear(); + } + } + } - if first_page.is_empty() { - return None; - } - page_with_lines.push(first_page); - Some(page_with_lines) - }) // Create set of pages as form feeds could lead to empty pages - .flatten() // Flatten to pages from page sets - .enumerate() // Assign page number - .skip_while(move |(x, _)| { - // Skip the not needed pages - let current_page = x + 1; - current_page < start_page - }) - .take_while(move |(x, _)| { - // Take only the required pages - let current_page = x + 1; + // Consider all trailing bytes as the last line. + if prev < buf.len() { + let file_line = FileLine::from_buf(file_id, page_num, line_num, &buf[prev..])?; + page.push(file_line); + } - current_page >= start_page - && last_page.is_none_or(|last_page| current_page <= last_page) - }), - ) + // Consider all trailing lines as the last page. + if !page.is_empty() && start_page <= page_num + 1 && end_page.is_none_or(|e| page_num < e) { + pages.push((page_num, page.clone())); + } + + Ok(pages) } -fn mpr(paths: &[&str], options: &OutputOptions) -> Result { - let n_files = paths.len(); +/// Key used to group lines together according to their file and page number. +fn group_key(num_files: usize, line: &FileLine) -> usize { + (line.page_number + 1) * num_files + line.file_id +} + +/// Group each line by its file and page number. +/// +/// The input list of `lines` must be already sorted according to the +/// `group_key`. +fn group_lines(num_files: usize, lines: Vec) -> Vec<(usize, Vec)> { + let mut result: Vec<(usize, Vec)> = vec![]; + let mut current_key: Option = None; + let mut current_group: Vec = vec![]; + for file_line in lines { + match current_key { + None => { + current_key = Some(group_key(num_files, &file_line)); + current_group.push(file_line); + } + Some(key) if group_key(num_files, &file_line) == key => { + current_group.push(file_line); + } + Some(key) => { + result.push((key, current_group.clone())); + current_group.clear(); + current_key = Some(group_key(num_files, &file_line)); + current_group.push(file_line); + } + } + } + // TODO Handle empty file. + result.push((current_key.unwrap(), current_group)); + result +} + +/// Group each line by its file and page number. +/// +/// Each group can then be merged into columns of a single page. +fn get_file_line_groups( + options: &OutputOptions, + paths: &[&str], +) -> Result)>, PrError> { + let num_files = paths.len(); + let mut all_lines = vec![]; + for (file_id, path) in paths.iter().enumerate() { + // Read the entire contents of the file into a buffer. + // + // TODO Read incrementally. + let buf = if *path == "-" { + let mut f = stdin(); + let mut buf = vec![]; + f.read_to_end(&mut buf)?; + buf + } else { + std::fs::read(path)? + }; - // Check if files exists - for path in paths { - open(path)?; + // Split the text into pages and collect each line for + // subsequent grouping. + for (_, mut page) in get_pages(options, file_id, &buf)? { + all_lines.append(&mut page); + } } + // Sort each line by group number and then by line number. + all_lines.sort_by_key(|l| (group_key(num_files, l), l.line_number)); - let file_line_groups = paths - .iter() - .enumerate() - .map(|(i, path)| { - let lines = BufReader::with_capacity(READ_BUFFER_SIZE, open(path).unwrap()).lines(); - - read_stream_and_create_pages(options, lines, i).flat_map(move |(x, line)| { - let file_line = line; - let page_number = x + 1; - file_line - .into_iter() - .map(|fl| FileLine { - page_number, - group_key: page_number * n_files + fl.file_id, - ..fl - }) - .collect::>() - }) - }) - .kmerge_by(|a, b| { - if a.group_key == b.group_key { - a.line_number < b.line_number - } else { - a.group_key < b.group_key - } - }) - .chunk_by(|file_line| file_line.group_key); + Ok(group_lines(num_files, all_lines)) +} + +fn mpr(paths: &[&str], options: &OutputOptions) -> Result { + let file_line_groups = get_file_line_groups(options, paths)?; let start_page = options.start_page; let mut lines = Vec::new(); let mut page_counter = start_page; - for (_key, file_line_group) in &file_line_groups { + for (_key, file_line_group) in file_line_groups { for file_line in file_line_group { - if let Err(e) = file_line.line_content { - return Err(e.into()); - } - let new_page_number = file_line.page_number; + let new_page_number = file_line.page_number + 1; if page_counter != new_page_number { print_page(&lines, options, page_counter)?; lines = Vec::new(); @@ -1124,10 +1114,7 @@ fn get_line_for_printing( let blank_line = String::new(); let formatted_line_number = get_formatted_line_number(options, file_line.line_number, index); - let mut complete_line = format!( - "{formatted_line_number}{}", - file_line.line_content.as_ref().unwrap() - ); + let mut complete_line = format!("{formatted_line_number}{}", file_line.line_content); let offset_spaces = &options.offset_spaces; diff --git a/tests/by-util/test_pr.rs b/tests/by-util/test_pr.rs index 2220e9c3969..80478258cf7 100644 --- a/tests/by-util/test_pr.rs +++ b/tests/by-util/test_pr.rs @@ -606,3 +606,28 @@ fn test_omit_pagination_option() { .pipe_in("a\nb\n") .succeeds(); } + +#[test] +fn test_form_feed_newlines() { + // Here we define the expected output. + // + // Each page should have the same number of blank lines before the + // form-feed character. + let whitespace = " ".repeat(50); + let datetime_pattern = r"\d\d\d\d-\d\d-\d\d \d\d:\d\d"; + let page1 = format!("\n\n{datetime_pattern}{whitespace}Page 1\n\n\n\n\x0c"); + let page2 = format!("\n\n{datetime_pattern}{whitespace}Page 2\n\n\n\n\x0c"); + let pattern = format!("{page1}{page2}"); + let regex = Regex::new(&pattern).unwrap(); + + // Command line: `printf "\f\f" | pr -f`. + // + // Escape code `\x0c` in a Rust string literal is the ASCII escape + // code `\f` for the "form feed" character (which appears like + // `^L` in the terminal). + new_ucmd!() + .arg("-f") + .pipe_in("\x0c\x0c") + .succeeds() + .stdout_matches(®ex); +}