Skip to content

Commit 6ee9241

Browse files
committed
refactor(parasplit): extract scan_word_end method to eliminate code duplication
Extracted the logic for scanning the end of a word into a new `scan_word_end` method within the `WordSplit` implementation. This refactoring removes duplicated code from the `Iterator` implementation, improving maintainability and reducing redundancy. Additionally, added documentation comments to `utf8_char_width` and `decode_char` functions for better clarity.
1 parent 94efbb4 commit 6ee9241

File tree

1 file changed

+29
-22
lines changed

1 file changed

+29
-22
lines changed

src/uu/fmt/src/parasplit.rs

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ fn char_width(c: char) -> usize {
2626
}
2727
}
2828

29+
/// Return the UTF-8 sequence length implied by a leading byte, or `None` if invalid.
2930
fn utf8_char_width(byte: u8) -> Option<usize> {
3031
match byte {
3132
0x00..=0x7F => Some(1),
@@ -36,6 +37,7 @@ fn utf8_char_width(byte: u8) -> Option<usize> {
3637
}
3738
}
3839

40+
/// Decode a UTF-8 character starting at `start`, returning the char and bytes consumed.
3941
fn decode_char(bytes: &[u8], start: usize) -> (Option<char>, usize) {
4042
let first = bytes[start];
4143
if first < 0x80 {
@@ -610,6 +612,31 @@ impl WordSplit<'_> {
610612
fn is_punctuation_byte(b: u8) -> bool {
611613
matches!(b, b'!' | b'.' | b'?')
612614
}
615+
616+
fn scan_word_end(&self, word_start: usize) -> (usize, usize, Option<u8>) {
617+
let mut word_nchars = 0;
618+
let mut idx = word_start;
619+
let mut last_ascii = None;
620+
while idx < self.length {
621+
let (ch, consumed) = decode_char(self.bytes, idx);
622+
let is_whitespace = ch.filter(|c| c.is_ascii()).is_some_and(is_fmt_whitespace);
623+
if is_whitespace {
624+
break;
625+
}
626+
word_nchars += ch.map_or(1, char_width);
627+
if let Some(ch) = ch {
628+
if ch.is_ascii() {
629+
last_ascii = Some(ch as u8);
630+
} else {
631+
last_ascii = None;
632+
}
633+
} else {
634+
last_ascii = None;
635+
}
636+
idx += consumed;
637+
}
638+
(idx, word_nchars, last_ascii)
639+
}
613640
}
614641

615642
pub struct WordInfo<'a> {
@@ -647,28 +674,8 @@ impl<'a> Iterator for WordSplit<'a> {
647674
// find the beginning of the next whitespace
648675
// note that this preserves the invariant that self.position
649676
// points to whitespace character OR end of string
650-
let mut word_nchars = 0;
651-
let mut idx = word_start;
652-
let mut last_ascii = None;
653-
while idx < self.length {
654-
let (ch, consumed) = decode_char(self.bytes, idx);
655-
let is_whitespace = ch.filter(|c| c.is_ascii()).is_some_and(is_fmt_whitespace);
656-
if is_whitespace {
657-
break;
658-
}
659-
word_nchars += ch.map_or(1, char_width);
660-
if let Some(ch) = ch {
661-
if ch.is_ascii() {
662-
last_ascii = Some(ch as u8);
663-
} else {
664-
last_ascii = None;
665-
}
666-
} else {
667-
last_ascii = None;
668-
}
669-
idx += consumed;
670-
}
671-
self.position = idx;
677+
let (next_position, word_nchars, last_ascii) = self.scan_word_end(word_start);
678+
self.position = next_position;
672679

673680
let word_start_relative = word_start - old_position;
674681
// if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence.

0 commit comments

Comments
 (0)