Skip to content

Commit 7866727

Browse files
authored
Merge pull request #8746 from jacob-greenfield/sort-fix-buffer-read
sort: fix newline handling across large and/or multiple files
2 parents cc3a6a4 + 25e269c commit 7866727

File tree

3 files changed

+39
-10
lines changed

3 files changed

+39
-10
lines changed

src/uu/sort/src/chunks.rs

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,9 @@ fn read_to_buffer<T: Read>(
259259
separator: u8,
260260
) -> UResult<(usize, bool)> {
261261
let mut read_target = &mut buffer[start_offset..];
262-
let mut last_file_target_size = read_target.len();
262+
let mut last_file_empty = true;
263+
let mut newline_search_offset = 0;
264+
let mut found_newline = false;
263265
loop {
264266
match file.read(read_target) {
265267
Ok(0) => {
@@ -278,13 +280,17 @@ fn read_to_buffer<T: Read>(
278280
continue;
279281
}
280282
}
281-
let mut sep_iter = memchr_iter(separator, buffer).rev();
282-
let last_line_end = sep_iter.next();
283-
if sep_iter.next().is_some() {
284-
// We read enough lines.
285-
let end = last_line_end.unwrap();
286-
// We want to include the separator here, because it shouldn't be carried over.
287-
return Ok((end + 1, true));
283+
284+
let mut sep_iter =
285+
memchr_iter(separator, &buffer[newline_search_offset..buffer.len()]).rev();
286+
newline_search_offset = buffer.len();
287+
if let Some(last_line_end) = sep_iter.next() {
288+
if found_newline || sep_iter.next().is_some() {
289+
// We read enough lines.
290+
// We want to include the separator here, because it shouldn't be carried over.
291+
return Ok((last_line_end + 1, true));
292+
}
293+
found_newline = true;
288294
}
289295

290296
// We need to read more lines
@@ -295,7 +301,7 @@ fn read_to_buffer<T: Read>(
295301
} else {
296302
// This file has been fully read.
297303
let mut leftover_len = read_target.len();
298-
if last_file_target_size != leftover_len {
304+
if !last_file_empty {
299305
// The file was not empty.
300306
let read_len = buffer.len() - leftover_len;
301307
if buffer[read_len - 1] != separator {
@@ -308,7 +314,7 @@ fn read_to_buffer<T: Read>(
308314
}
309315
if let Some(next_file) = next_files.next() {
310316
// There is another file.
311-
last_file_target_size = leftover_len;
317+
last_file_empty = true;
312318
*file = next_file?;
313319
} else {
314320
// This was the last file.
@@ -319,6 +325,7 @@ fn read_to_buffer<T: Read>(
319325
}
320326
Ok(n) => {
321327
read_target = &mut read_target[n..];
328+
last_file_empty = false;
322329
}
323330
Err(e) if e.kind() == ErrorKind::Interrupted => {
324331
// retry

src/uu/sort/src/ext_sort.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ use crate::{
3636
};
3737
use crate::{Line, print_sorted};
3838

39+
// Note: update `test_sort::test_start_buffer` if this size is changed
3940
const START_BUFFER_SIZE: usize = 8_000;
4041

4142
/// Sort files by using auxiliary files for storing intermediate chunks (if needed), and output the result.

tests/by-util/test_sort.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1906,4 +1906,25 @@ fn test_color_environment_variables() {
19061906
}
19071907
}
19081908

1909+
#[test]
1910+
fn test_start_buffer() {
1911+
// Test that a file with the exact same size as the start buffer is handled correctly
1912+
const FILE_B: &[u8] = &[b'b'; 8_000];
1913+
const FILE_A: &[u8] = b"aaa";
1914+
1915+
let mut expected = FILE_A.to_vec();
1916+
expected.push(b'\n');
1917+
expected.extend_from_slice(FILE_B);
1918+
expected.push(b'\n');
1919+
1920+
let (at, mut ucmd) = at_and_ucmd!();
1921+
1922+
at.write_bytes("b", FILE_B);
1923+
at.write_bytes("a", FILE_A);
1924+
1925+
ucmd.args(&["b", "a"])
1926+
.succeeds()
1927+
.stdout_only_bytes(&expected);
1928+
}
1929+
19091930
/* spell-checker: enable */

0 commit comments

Comments
 (0)