Skip to content

Commit 8bbce0d

Browse files
authored
Merge pull request #8386 from RenjiSann/improve-sort
sort: Handle non-utf8 sorting content
2 parents 98374e8 + 27faee7 commit 8bbce0d

15 files changed

+369
-366
lines changed

src/uu/ls/src/ls.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2139,8 +2139,11 @@ fn sort_entries(entries: &mut [PathData], config: &Config, out: &mut BufWriter<S
21392139
// The default sort in GNU ls is case insensitive
21402140
Sort::Name => entries.sort_by(|a, b| a.display_name.cmp(&b.display_name)),
21412141
Sort::Version => entries.sort_by(|a, b| {
2142-
version_cmp(&a.p_buf.to_string_lossy(), &b.p_buf.to_string_lossy())
2143-
.then(a.p_buf.to_string_lossy().cmp(&b.p_buf.to_string_lossy()))
2142+
version_cmp(
2143+
os_str_as_bytes_lossy(a.p_buf.as_os_str()).as_ref(),
2144+
os_str_as_bytes_lossy(b.p_buf.as_os_str()).as_ref(),
2145+
)
2146+
.then(a.p_buf.to_string_lossy().cmp(&b.p_buf.to_string_lossy()))
21442147
}),
21452148
Sort::Extension => entries.sort_by(|a, b| {
21462149
a.p_buf

src/uu/sort/src/check.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ pub fn check(path: &OsStr, settings: &GlobalSettings) -> UResult<()> {
7272
return Err(SortError::Disorder {
7373
file: path.to_owned(),
7474
line_number: line_idx,
75-
line: new_first.line.to_owned(),
75+
line: String::from_utf8_lossy(new_first.line).into_owned(),
7676
silent: settings.check_silent,
7777
}
7878
.into());
@@ -86,7 +86,7 @@ pub fn check(path: &OsStr, settings: &GlobalSettings) -> UResult<()> {
8686
return Err(SortError::Disorder {
8787
file: path.to_owned(),
8888
line_number: line_idx,
89-
line: b.line.to_owned(),
89+
line: String::from_utf8_lossy(b.line).into_owned(),
9090
silent: settings.check_silent,
9191
}
9292
.into());

src/uu/sort/src/chunks.rs

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@ use memchr::memchr_iter;
1717
use self_cell::self_cell;
1818
use uucore::error::{UResult, USimpleError};
1919

20-
use crate::{
21-
GeneralBigDecimalParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo,
22-
};
20+
use crate::{GeneralBigDecimalParseResult, GlobalSettings, Line, numeric_str_cmp::NumInfo};
2321

2422
self_cell!(
2523
/// The chunk that is passed around between threads.
@@ -41,7 +39,7 @@ pub struct ChunkContents<'a> {
4139

4240
#[derive(Debug)]
4341
pub struct LineData<'a> {
44-
pub selections: Vec<&'a str>,
42+
pub selections: Vec<&'a [u8]>,
4543
pub num_infos: Vec<NumInfo>,
4644
pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
4745
pub line_num_floats: Vec<Option<f64>>,
@@ -68,7 +66,7 @@ impl Chunk {
6866
let selections = unsafe {
6967
// SAFETY: (same as above) It is safe to (temporarily) transmute to a vector of &[u8] with a longer lifetime,
7068
// because the vector is empty.
71-
std::mem::transmute::<Vec<&'_ str>, Vec<&'static str>>(std::mem::take(
69+
std::mem::transmute::<Vec<&'_ [u8]>, Vec<&'static [u8]>>(std::mem::take(
7270
&mut contents.line_data.selections,
7371
))
7472
};
@@ -100,7 +98,7 @@ impl Chunk {
10098

10199
pub struct RecycledChunk {
102100
lines: Vec<Line<'static>>,
103-
selections: Vec<&'static str>,
101+
selections: Vec<&'static [u8]>,
104102
num_infos: Vec<NumInfo>,
105103
parsed_floats: Vec<GeneralBigDecimalParseResult>,
106104
line_num_floats: Vec<Option<f64>>,
@@ -180,15 +178,14 @@ pub fn read<T: Read>(
180178
let selections = unsafe {
181179
// SAFETY: It is safe to transmute to an empty vector of selections with shorter lifetime.
182180
// It was only temporarily transmuted to a Vec<&'static [u8]> to make recycling possible.
183-
std::mem::transmute::<Vec<&'static str>, Vec<&'_ str>>(selections)
181+
std::mem::transmute::<Vec<&'static [u8]>, Vec<&'_ [u8]>>(selections)
184182
};
185183
let mut lines = unsafe {
186184
// SAFETY: (same as above) It is safe to transmute to a vector of lines with shorter lifetime,
187185
// because it was only temporarily transmuted to a Vec<Line<'static>> to make recycling possible.
188186
std::mem::transmute::<Vec<Line<'static>>, Vec<Line<'_>>>(lines)
189187
};
190-
let read = std::str::from_utf8(&buffer[..read])
191-
.map_err(|error| SortError::Uft8Error { error })?;
188+
let read = &buffer[..read];
192189
let mut line_data = LineData {
193190
selections,
194191
num_infos,
@@ -205,13 +202,13 @@ pub fn read<T: Read>(
205202

206203
/// Split `read` into `Line`s, and add them to `lines`.
207204
fn parse_lines<'a>(
208-
read: &'a str,
205+
read: &'a [u8],
209206
lines: &mut Vec<Line<'a>>,
210207
line_data: &mut LineData<'a>,
211208
separator: u8,
212209
settings: &GlobalSettings,
213210
) {
214-
let read = read.strip_suffix(separator as char).unwrap_or(read);
211+
let read = read.strip_suffix(&[separator]).unwrap_or(read);
215212

216213
assert!(lines.is_empty());
217214
assert!(line_data.selections.is_empty());
@@ -220,7 +217,7 @@ fn parse_lines<'a>(
220217
assert!(line_data.line_num_floats.is_empty());
221218
let mut token_buffer = vec![];
222219
lines.extend(
223-
read.split(separator as char)
220+
read.split(|&c| c == separator)
224221
.enumerate()
225222
.map(|(index, line)| Line::create(line, index, line_data, &mut token_buffer, settings)),
226223
);

src/uu/sort/src/custom_str_cmp.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
1010
use std::cmp::Ordering;
1111

12-
fn filter_char(c: char, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool {
12+
fn filter_char(c: u8, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool {
1313
if ignore_non_dictionary && !(c.is_ascii_alphanumeric() || c.is_ascii_whitespace()) {
1414
return false;
1515
}
@@ -19,7 +19,7 @@ fn filter_char(c: char, ignore_non_printing: bool, ignore_non_dictionary: bool)
1919
true
2020
}
2121

22-
fn cmp_chars(a: char, b: char, ignore_case: bool) -> Ordering {
22+
fn cmp_chars(a: u8, b: u8, ignore_case: bool) -> Ordering {
2323
if ignore_case {
2424
a.to_ascii_uppercase().cmp(&b.to_ascii_uppercase())
2525
} else {
@@ -28,8 +28,8 @@ fn cmp_chars(a: char, b: char, ignore_case: bool) -> Ordering {
2828
}
2929

3030
pub fn custom_str_cmp(
31-
a: &str,
32-
b: &str,
31+
a: &[u8],
32+
b: &[u8],
3333
ignore_non_printing: bool,
3434
ignore_non_dictionary: bool,
3535
ignore_case: bool,
@@ -39,11 +39,11 @@ pub fn custom_str_cmp(
3939
return a.cmp(b);
4040
}
4141
let mut a_chars = a
42-
.chars()
43-
.filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
42+
.iter()
43+
.filter(|&&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
4444
let mut b_chars = b
45-
.chars()
46-
.filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
45+
.iter()
46+
.filter(|&&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
4747
loop {
4848
let a_char = a_chars.next();
4949
let b_char = b_chars.next();
@@ -52,7 +52,7 @@ pub fn custom_str_cmp(
5252
(Some(_), None) => return Ordering::Greater,
5353
(None, Some(_)) => return Ordering::Less,
5454
(Some(a_char), Some(b_char)) => {
55-
let ordering = cmp_chars(a_char, b_char, ignore_case);
55+
let ordering = cmp_chars(*a_char, *b_char, ignore_case);
5656
if ordering != Ordering::Equal {
5757
return ordering;
5858
}

src/uu/sort/src/ext_sort.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ fn write<I: WriteableTmpFile>(
272272

273273
fn write_lines<T: Write>(lines: &[Line], writer: &mut T, separator: u8) {
274274
for s in lines {
275-
writer.write_all(s.line.as_bytes()).unwrap();
275+
writer.write_all(s.line).unwrap();
276276
writer.write_all(&[separator]).unwrap();
277277
}
278278
}

0 commit comments

Comments
 (0)