Skip to content

Commit a452d86

Browse files
authored
Improve sort buffer sizing heuristics and honor explicit --buffer-size (#8833)
* feat(sort): add adaptive buffer sizing and fast paths - move heuristics into a new buffer_hint module and default to automatic sizing when the buffer flag is absent - tune chunk and external sort buffers to avoid runaway allocations - add fast lexicographic and ASCII case-insensitive comparisons for the default mode - refresh spell-check and dependency metadata for the new code * fix(sort): reuse SIGINT handler for temporary directory cleanup - keep the latest path/lock pair in a shared registry so SIGINT always cleans the active directory - guard handler installation with an atomic flag and reset state when the wrapper is dropped * refactor(sort): simplify merge batch size to fixed value Remove Linux-specific dynamic adjustment based on file descriptors and use a fixed batch size of 64 for consistency across platforms. * fix Cargo.lock linux environments
1 parent 01c0a61 commit a452d86

File tree

4 files changed

+187
-36
lines changed

4 files changed

+187
-36
lines changed

.vscode/cspell.dictionaries/jargon.wordlist.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ autogenerated
66
autogenerates
77
bitmask
88
bitwise
9+
bufferram
910
bytewise
1011
canonicalization
1112
canonicalize
@@ -45,6 +46,7 @@ fileio
4546
filesystem
4647
filesystems
4748
flamegraph
49+
freeram
4850
fsxattr
4951
fullblock
5052
getfacl
@@ -123,6 +125,7 @@ shortcode
123125
shortcodes
124126
siginfo
125127
sigusr
128+
strcasecmp
126129
subcommand
127130
subexpression
128131
submodule
@@ -134,6 +137,7 @@ syscalls
134137
sysconf
135138
tokenize
136139
toolchain
140+
totalram
137141
truthy
138142
ucase
139143
unbuffered

fuzz/Cargo.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/uu/sort/src/sort.rs

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,8 @@ struct Precomputed {
301301
num_infos_per_line: usize,
302302
floats_per_line: usize,
303303
selections_per_line: usize,
304+
fast_lexicographic: bool,
305+
fast_ascii_insensitive: bool,
304306
}
305307

306308
impl GlobalSettings {
@@ -341,6 +343,47 @@ impl GlobalSettings {
341343
.iter()
342344
.filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
343345
.count();
346+
347+
self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic();
348+
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
349+
}
350+
351+
/// Returns true when the fast lexicographic path can be used safely.
352+
fn can_use_fast_lexicographic(&self) -> bool {
353+
self.mode == SortMode::Default
354+
&& !self.ignore_case
355+
&& !self.dictionary_order
356+
&& !self.ignore_non_printing
357+
&& !self.ignore_leading_blanks
358+
&& self.selectors.len() == 1
359+
&& {
360+
let selector = &self.selectors[0];
361+
!selector.needs_selection
362+
&& matches!(selector.settings.mode, SortMode::Default)
363+
&& !selector.settings.ignore_case
364+
&& !selector.settings.dictionary_order
365+
&& !selector.settings.ignore_non_printing
366+
&& !selector.settings.ignore_blanks
367+
}
368+
}
369+
370+
/// Returns true when the ASCII case-insensitive fast path is valid.
371+
fn can_use_fast_ascii_insensitive(&self) -> bool {
372+
self.mode == SortMode::Default
373+
&& self.ignore_case
374+
&& !self.dictionary_order
375+
&& !self.ignore_non_printing
376+
&& !self.ignore_leading_blanks
377+
&& self.selectors.len() == 1
378+
&& {
379+
let selector = &self.selectors[0];
380+
!selector.needs_selection
381+
&& matches!(selector.settings.mode, SortMode::Default)
382+
&& selector.settings.ignore_case
383+
&& !selector.settings.dictionary_order
384+
&& !selector.settings.ignore_non_printing
385+
&& !selector.settings.ignore_blanks
386+
}
344387
}
345388
}
346389

@@ -1643,6 +1686,26 @@ fn compare_by<'a>(
16431686
a_line_data: &LineData<'a>,
16441687
b_line_data: &LineData<'a>,
16451688
) -> Ordering {
1689+
if global_settings.precomputed.fast_lexicographic {
1690+
let cmp = a.line.cmp(b.line);
1691+
return if global_settings.reverse {
1692+
cmp.reverse()
1693+
} else {
1694+
cmp
1695+
};
1696+
}
1697+
1698+
if global_settings.precomputed.fast_ascii_insensitive {
1699+
let cmp = ascii_case_insensitive_cmp(a.line, b.line);
1700+
if cmp != Ordering::Equal || a.line == b.line {
1701+
return if global_settings.reverse {
1702+
cmp.reverse()
1703+
} else {
1704+
cmp
1705+
};
1706+
}
1707+
}
1708+
16461709
let mut selection_index = 0;
16471710
let mut num_info_index = 0;
16481711
let mut parsed_float_index = 0;
@@ -1754,6 +1817,26 @@ fn compare_by<'a>(
17541817
}
17551818
}
17561819

1820+
/// Compare two byte slices in ASCII case-insensitive order without allocating.
1821+
/// We lower each byte on the fly so that binary input (including `NUL`) stays
1822+
/// untouched and we avoid locale-sensitive routines such as `strcasecmp`.
1823+
fn ascii_case_insensitive_cmp(a: &[u8], b: &[u8]) -> Ordering {
1824+
#[inline]
1825+
fn lower(byte: u8) -> u8 {
1826+
byte.to_ascii_lowercase()
1827+
}
1828+
1829+
for (lhs, rhs) in a.iter().copied().zip(b.iter().copied()) {
1830+
let l = lower(lhs);
1831+
let r = lower(rhs);
1832+
if l != r {
1833+
return l.cmp(&r);
1834+
}
1835+
}
1836+
1837+
a.len().cmp(&b.len())
1838+
}
1839+
17571840
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
17581841
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
17591842
// scientific notation, so we strip those lines only after the end of the following numeric string.

src/uu/sort/src/tmp_dir.rs

Lines changed: 90 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
//
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
5+
use std::sync::atomic::{AtomicBool, Ordering};
56
use std::{
67
fs::File,
78
path::{Path, PathBuf},
8-
sync::{Arc, Mutex},
9+
sync::{Arc, Mutex, OnceLock},
910
};
1011

1112
use tempfile::TempDir;
@@ -29,6 +30,70 @@ pub struct TmpDirWrapper {
2930
lock: Arc<Mutex<()>>,
3031
}
3132

33+
#[derive(Default, Clone)]
34+
struct HandlerRegistration {
35+
lock: Option<Arc<Mutex<()>>>,
36+
path: Option<PathBuf>,
37+
}
38+
39+
fn handler_state() -> Arc<Mutex<HandlerRegistration>> {
40+
// Lazily create the global HandlerRegistration so all TmpDirWrapper instances and the
41+
// SIGINT handler operate on the same lock/path snapshot.
42+
static HANDLER_STATE: OnceLock<Arc<Mutex<HandlerRegistration>>> = OnceLock::new();
43+
HANDLER_STATE
44+
.get_or_init(|| Arc::new(Mutex::new(HandlerRegistration::default())))
45+
.clone()
46+
}
47+
48+
fn ensure_signal_handler_installed(state: Arc<Mutex<HandlerRegistration>>) -> UResult<()> {
49+
// This shared state must originate from `handler_state()` so the handler always sees
50+
// the current lock/path pair and can clean up the active temp directory on SIGINT.
51+
// Install a shared SIGINT handler so the active temp directory is deleted when the user aborts.
52+
// Guard to ensure the SIGINT handler is registered once per process and reused.
53+
static HANDLER_INSTALLED: AtomicBool = AtomicBool::new(false);
54+
55+
if HANDLER_INSTALLED
56+
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
57+
.is_err()
58+
{
59+
return Ok(());
60+
}
61+
62+
let handler_state = state.clone();
63+
if let Err(e) = ctrlc::set_handler(move || {
64+
// Load the latest lock/path snapshot so the handler cleans the active temp dir.
65+
let (lock, path) = {
66+
let state = handler_state.lock().unwrap();
67+
(state.lock.clone(), state.path.clone())
68+
};
69+
70+
if let Some(lock) = lock {
71+
let _guard = lock.lock().unwrap();
72+
if let Some(path) = path {
73+
if let Err(e) = remove_tmp_dir(&path) {
74+
show_error!(
75+
"{}",
76+
translate!(
77+
"sort-failed-to-delete-temporary-directory",
78+
"error" => e
79+
)
80+
);
81+
}
82+
}
83+
}
84+
85+
std::process::exit(2)
86+
}) {
87+
HANDLER_INSTALLED.store(false, Ordering::Release);
88+
return Err(USimpleError::new(
89+
2,
90+
translate!("sort-failed-to-set-up-signal-handler", "error" => e),
91+
));
92+
}
93+
94+
Ok(())
95+
}
96+
3297
impl TmpDirWrapper {
3398
pub fn new(path: PathBuf) -> Self {
3499
Self {
@@ -52,31 +117,14 @@ impl TmpDirWrapper {
52117
);
53118

54119
let path = self.temp_dir.as_ref().unwrap().path().to_owned();
55-
let lock = self.lock.clone();
56-
ctrlc::set_handler(move || {
57-
// Take the lock so that `next_file_path` returns no new file path,
58-
// and the program doesn't terminate before the handler has finished
59-
let _lock = lock.lock().unwrap();
60-
if let Err(e) = remove_tmp_dir(&path) {
61-
show_error!(
62-
"{}",
63-
translate!(
64-
"sort-failed-to-delete-temporary-directory",
65-
"error" => e
66-
)
67-
);
68-
}
69-
std::process::exit(2)
70-
})
71-
.map_err(|e| {
72-
USimpleError::new(
73-
2,
74-
translate!(
75-
"sort-failed-to-set-up-signal-handler",
76-
"error" => e
77-
),
78-
)
79-
})
120+
let state = handler_state();
121+
{
122+
let mut guard = state.lock().unwrap();
123+
guard.lock = Some(self.lock.clone());
124+
guard.path = Some(path);
125+
}
126+
127+
ensure_signal_handler_installed(state)
80128
}
81129

82130
pub fn next_file(&mut self) -> UResult<(File, PathBuf)> {
@@ -100,6 +148,22 @@ impl TmpDirWrapper {
100148
}
101149
}
102150

151+
impl Drop for TmpDirWrapper {
152+
fn drop(&mut self) {
153+
let state = handler_state();
154+
let mut guard = state.lock().unwrap();
155+
156+
if guard
157+
.lock
158+
.as_ref()
159+
.is_some_and(|current| Arc::ptr_eq(current, &self.lock))
160+
{
161+
guard.lock = None;
162+
guard.path = None;
163+
}
164+
}
165+
}
166+
103167
/// Remove the directory at `path` by deleting its child files and then itself.
104168
/// Errors while deleting child files are ignored.
105169
fn remove_tmp_dir(path: &Path) -> std::io::Result<()> {

0 commit comments

Comments
 (0)