Skip to content

Commit 5b70ed4

Browse files
authored
Merge pull request #9682 from CrazyRoka/ptx-implement-sentence-regexp
ptx: implement -S/--sentence-regexp
1 parent 1883daf commit 5b70ed4

File tree

3 files changed

+93
-13
lines changed

3 files changed

+93
-13
lines changed

src/uu/ptx/locales/en-US.ftl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,5 @@ ptx-error-dumb-format = There is no dumb format with GNU extensions disabled
2828
ptx-error-not-implemented = { $feature } not implemented yet
2929
ptx-error-write-failed = write failed
3030
ptx-error-extra-operand = extra operand { $operand }
31+
ptx-error-empty-regexp = A regular expression cannot match a length zero string
32+
ptx-error-invalid-regexp = Invalid regexp: { $error }

src/uu/ptx/src/ptx.rs

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use clap::{Arg, ArgAction, Command};
1919
use regex::Regex;
2020
use thiserror::Error;
2121
use uucore::display::Quotable;
22-
use uucore::error::{FromIo, UError, UResult, UUsageError};
22+
use uucore::error::{FromIo, UError, UResult, USimpleError, UUsageError};
2323
use uucore::format_usage;
2424
use uucore::translate;
2525

@@ -43,6 +43,7 @@ struct Config {
4343
context_regex: String,
4444
line_width: usize,
4545
gap_size: usize,
46+
sentence_regex: Option<String>,
4647
}
4748

4849
impl Default for Config {
@@ -59,6 +60,7 @@ impl Default for Config {
5960
context_regex: "\\w+".to_owned(),
6061
line_width: 72,
6162
gap_size: 3,
63+
sentence_regex: None,
6264
}
6365
}
6466
}
@@ -197,25 +199,33 @@ struct WordRef {
197199

198200
#[derive(Debug, Error)]
199201
enum PtxError {
200-
#[error("{}", translate!("ptx-error-not-implemented", "feature" => (*.0)))]
201-
NotImplemented(&'static str),
202-
203202
#[error("{0}")]
204203
ParseError(ParseIntError),
205204
}
206205

207206
impl UError for PtxError {}
208207

209-
fn get_config(matches: &clap::ArgMatches) -> UResult<Config> {
208+
fn get_config(matches: &mut clap::ArgMatches) -> UResult<Config> {
210209
let mut config = Config::default();
211210
let err_msg = "parsing options failed";
212211
if matches.get_flag(options::TRADITIONAL) {
213212
config.gnu_ext = false;
214213
config.format = OutFormat::Roff;
215214
"[^ \t\n]+".clone_into(&mut config.context_regex);
216215
}
217-
if matches.contains_id(options::SENTENCE_REGEXP) {
218-
return Err(PtxError::NotImplemented("-S").into());
216+
if let Some(regex) = matches.remove_one::<String>(options::SENTENCE_REGEXP) {
217+
// TODO: The regex crate used here is not fully compatible with GNU's regex implementation.
218+
// For example, it does not support backreferences.
219+
// In the future, we might want to switch to the onig crate (like expr does) for better compatibility.
220+
221+
// Reject a regex that matches the empty string. An invalid regex is not reported here; it surfaces when read_input compiles it.
222+
if let Ok(re) = Regex::new(&regex) {
223+
if re.is_match("") {
224+
return Err(USimpleError::new(1, translate!("ptx-error-empty-regexp")));
225+
}
226+
}
227+
228+
config.sentence_regex = Some(regex);
219229
}
220230
config.auto_ref = matches.get_flag(options::AUTO_REFERENCE);
221231
config.input_ref = matches.get_flag(options::REFERENCES);
@@ -271,17 +281,30 @@ struct FileContent {
271281

272282
type FileMap = HashMap<OsString, FileContent>;
273283

274-
fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> {
284+
fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result<FileMap> {
275285
let mut file_map: FileMap = HashMap::new();
276286
let mut offset: usize = 0;
287+
288+
let sentence_splitter = if let Some(re_str) = &config.sentence_regex {
289+
Some(Regex::new(re_str).map_err(|e| {
290+
std::io::Error::new(
291+
std::io::ErrorKind::InvalidInput,
292+
translate!("ptx-error-invalid-regexp", "error" => e),
293+
)
294+
})?)
295+
} else {
296+
None
297+
};
298+
277299
for filename in input_files {
278-
let reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" {
300+
let mut reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" {
279301
Box::new(stdin())
280302
} else {
281303
let file = File::open(Path::new(filename))?;
282304
Box::new(file)
283305
});
284-
let lines: Vec<String> = reader.lines().collect::<std::io::Result<Vec<String>>>()?;
306+
307+
let lines = read_lines(sentence_splitter.as_ref(), &mut reader)?;
285308

286309
// Indexing a UTF-8 string requires walking from the beginning, which can hurt performance badly when the line is long.
287310
// Since we will be jumping around the line a lot, we dump the content into a Vec<char>, which can be indexed in constant time.
@@ -300,6 +323,24 @@ fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> {
300323
Ok(file_map)
301324
}
302325

326+
fn read_lines(
327+
sentence_splitter: Option<&Regex>,
328+
reader: &mut dyn BufRead,
329+
) -> std::io::Result<Vec<String>> {
330+
if let Some(re) = sentence_splitter {
331+
let mut buffer = String::new();
332+
reader.read_to_string(&mut buffer)?;
333+
334+
Ok(re
335+
.split(&buffer)
336+
.map(|s| s.replace('\n', " ")) // ptx behavior: newlines become spaces inside sentences
337+
.filter(|s| !s.is_empty()) // remove empty sentences
338+
.collect())
339+
} else {
340+
reader.lines().collect()
341+
}
342+
}
343+
303344
/// Go through every line in the input files and record each match occurrence as a `WordRef`.
304345
fn create_word_set(config: &Config, filter: &WordFilter, file_map: &FileMap) -> BTreeSet<WordRef> {
305346
let reg = Regex::new(&filter.word_regex).unwrap();
@@ -850,8 +891,8 @@ mod options {
850891

851892
#[uucore::main]
852893
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
853-
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
854-
let mut config = get_config(&matches)?;
894+
let mut matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
895+
let mut config = get_config(&mut matches)?;
855896

856897
let input_files;
857898
let output_file: OsString;
@@ -883,7 +924,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
883924
}
884925

885926
let word_filter = WordFilter::new(&matches, &config)?;
886-
let file_map = read_input(&input_files).map_err_context(String::new)?;
927+
let file_map = read_input(&input_files, &config).map_err_context(String::new)?;
887928
let word_set = create_word_set(&config, &word_filter, &file_map);
888929
write_traditional_output(&mut config, &file_map, &word_set, &output_file)
889930
}

tests/by-util/test_ptx.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,43 @@ fn test_utf8() {
257257
.stdout_only("\\xx {}{it’s}{disabled}{}{}\n\\xx {}{}{it’s}{ disabled}{}\n");
258258
}
259259

260+
#[test]
261+
fn test_sentence_regexp_basic() {
262+
new_ucmd!()
263+
.args(&["-G", "-S", "\\."])
264+
.pipe_in("Hello. World.")
265+
.succeeds()
266+
.stdout_contains("Hello")
267+
.stdout_contains("World");
268+
}
269+
270+
#[test]
271+
fn test_sentence_regexp_split_behavior() {
272+
new_ucmd!()
273+
.args(&["-G", "-w", "50", "-S", "[.!]"])
274+
.pipe_in("One sentence. Two sentence!")
275+
.succeeds()
276+
.stdout_contains("One sentence")
277+
.stdout_contains("Two sentence");
278+
}
279+
280+
#[test]
281+
fn test_sentence_regexp_empty_match_failure() {
282+
new_ucmd!()
283+
.args(&["-G", "-S", "^"])
284+
.fails()
285+
.stderr_contains("A regular expression cannot match a length zero string");
286+
}
287+
288+
#[test]
289+
fn test_sentence_regexp_newlines_are_spaces() {
290+
new_ucmd!()
291+
.args(&["-G", "-S", "\\."])
292+
.pipe_in("Start of\nsentence.")
293+
.succeeds()
294+
.stdout_contains("Start of sentence");
295+
}
296+
260297
#[test]
261298
fn test_gnu_mode_dumb_format() {
262299
// Test GNU mode (dumb format) - the default mode without -G flag

0 commit comments

Comments
 (0)