Skip to content

Commit afdf5f9

Browse files
dhr412 authored and mattsu2020 committed
wc: respect POSIXLY_CORRECT for word counting (uutils#10344)
* wc: respect POSIXLY_CORRECT for word counting
* wc: Add test for POSIXLY_CORRECT word counting
1 parent 3b00a2d commit afdf5f9

File tree

2 files changed

+31
-1
lines changed

2 files changed

+31
-1
lines changed

src/uu/wc/src/wc.rs

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ mod word_count;
1313
use std::{
1414
borrow::{Borrow, Cow},
1515
cmp::max,
16+
env,
1617
ffi::{OsStr, OsString},
1718
fs::{self, File},
1819
io::{self, Write},
@@ -578,10 +579,17 @@ fn process_chunk<
578579
text: &str,
579580
current_len: &mut usize,
580581
in_word: &mut bool,
582+
posixly_correct: bool,
581583
) {
582584
for ch in text.chars() {
583585
if SHOW_WORDS {
584-
if ch.is_whitespace() {
586+
let is_space = if posixly_correct {
587+
matches!(ch, '\t'..='\r' | ' ')
588+
} else {
589+
ch.is_whitespace()
590+
};
591+
592+
if is_space {
585593
*in_word = false;
586594
} else if !(*in_word) {
587595
// This also counts control characters! (As of GNU coreutils 9.5)
@@ -639,6 +647,7 @@ fn word_count_from_reader_specialized<
639647
let mut reader = BufReadDecoder::new(reader.buffered());
640648
let mut in_word = false;
641649
let mut current_len = 0;
650+
let posixly_correct = env::var_os("POSIXLY_CORRECT").is_some();
642651
while let Some(chunk) = reader.next_strict() {
643652
match chunk {
644653
Ok(text) => {
@@ -647,6 +656,7 @@ fn word_count_from_reader_specialized<
647656
text,
648657
&mut current_len,
649658
&mut in_word,
659+
posixly_correct,
650660
);
651661
}
652662
Err(e) => {

tests/by-util/test_wc.rs

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -891,3 +891,23 @@ fn test_simd_respects_glibc_tunables() {
891891
);
892892
}
893893
}
894+
895+
#[test]
896+
fn test_posixly_correct_whitespace() {
897+
let input = "word\u{00A0}word"; // Non-breaking space
898+
899+
// Default: Unicode whitespace is respected
900+
new_ucmd!()
901+
.arg("-w")
902+
.pipe_in(input)
903+
.succeeds()
904+
.stdout_is("2\n");
905+
906+
// POSIXLY_CORRECT: Only ASCII whitespace
907+
new_ucmd!()
908+
.arg("-w")
909+
.env("POSIXLY_CORRECT", "1")
910+
.pipe_in(input)
911+
.succeeds()
912+
.stdout_is("1\n");
913+
}

0 commit comments

Comments
 (0)