@@ -19,7 +19,7 @@ use clap::{Arg, ArgAction, Command};
1919use regex:: Regex ;
2020use thiserror:: Error ;
2121use uucore:: display:: Quotable ;
22- use uucore:: error:: { FromIo , UError , UResult , UUsageError } ;
22+ use uucore:: error:: { FromIo , UError , UResult , USimpleError , UUsageError } ;
2323use uucore:: format_usage;
2424use uucore:: translate;
2525
@@ -43,6 +43,7 @@ struct Config {
4343 context_regex : String ,
4444 line_width : usize ,
4545 gap_size : usize ,
46+ sentence_regex : Option < String > ,
4647}
4748
4849impl Default for Config {
@@ -59,6 +60,7 @@ impl Default for Config {
5960 context_regex : "\\ w+" . to_owned ( ) ,
6061 line_width : 72 ,
6162 gap_size : 3 ,
63+ sentence_regex : None ,
6264 }
6365 }
6466}
@@ -197,25 +199,33 @@ struct WordRef {
197199
198200#[ derive( Debug , Error ) ]
199201enum PtxError {
200- #[ error( "{}" , translate!( "ptx-error-not-implemented" , "feature" => ( * . 0 ) ) ) ]
201- NotImplemented ( & ' static str ) ,
202-
203202 #[ error( "{0}" ) ]
204203 ParseError ( ParseIntError ) ,
205204}
206205
207206impl UError for PtxError { }
208207
209- fn get_config ( matches : & clap:: ArgMatches ) -> UResult < Config > {
208+ fn get_config ( matches : & mut clap:: ArgMatches ) -> UResult < Config > {
210209 let mut config = Config :: default ( ) ;
211210 let err_msg = "parsing options failed" ;
212211 if matches. get_flag ( options:: TRADITIONAL ) {
213212 config. gnu_ext = false ;
214213 config. format = OutFormat :: Roff ;
215214 "[^ \t \n ]+" . clone_into ( & mut config. context_regex ) ;
216215 }
217- if matches. contains_id ( options:: SENTENCE_REGEXP ) {
218- return Err ( PtxError :: NotImplemented ( "-S" ) . into ( ) ) ;
216+ if let Some ( regex) = matches. remove_one :: < String > ( options:: SENTENCE_REGEXP ) {
217+ // TODO: The regex crate used here is not fully compatible with GNU's regex implementation.
218+ // For example, it does not support backreferences.
219+ // In the future, we might want to switch to the onig crate (like expr does) for better compatibility.
220+
221+ // Verify regex is valid and doesn't match empty string
222+ if let Ok ( re) = Regex :: new ( & regex) {
223+ if re. is_match ( "" ) {
224+ return Err ( USimpleError :: new ( 1 , translate ! ( "ptx-error-empty-regexp" ) ) ) ;
225+ }
226+ }
227+
228+ config. sentence_regex = Some ( regex) ;
219229 }
220230 config. auto_ref = matches. get_flag ( options:: AUTO_REFERENCE ) ;
221231 config. input_ref = matches. get_flag ( options:: REFERENCES ) ;
@@ -271,17 +281,30 @@ struct FileContent {
271281
272282type FileMap = HashMap < OsString , FileContent > ;
273283
274- fn read_input ( input_files : & [ OsString ] ) -> std:: io:: Result < FileMap > {
284+ fn read_input ( input_files : & [ OsString ] , config : & Config ) -> std:: io:: Result < FileMap > {
275285 let mut file_map: FileMap = HashMap :: new ( ) ;
276286 let mut offset: usize = 0 ;
287+
288+ let sentence_splitter = if let Some ( re_str) = & config. sentence_regex {
289+ Some ( Regex :: new ( re_str) . map_err ( |e| {
290+ std:: io:: Error :: new (
291+ std:: io:: ErrorKind :: InvalidInput ,
292+ translate ! ( "ptx-error-invalid-regexp" , "error" => e) ,
293+ )
294+ } ) ?)
295+ } else {
296+ None
297+ } ;
298+
277299 for filename in input_files {
278- let reader: BufReader < Box < dyn Read > > = BufReader :: new ( if filename == "-" {
300+ let mut reader: BufReader < Box < dyn Read > > = BufReader :: new ( if filename == "-" {
279301 Box :: new ( stdin ( ) )
280302 } else {
281303 let file = File :: open ( Path :: new ( filename) ) ?;
282304 Box :: new ( file)
283305 } ) ;
284- let lines: Vec < String > = reader. lines ( ) . collect :: < std:: io:: Result < Vec < String > > > ( ) ?;
306+
307+ let lines = read_lines ( sentence_splitter. as_ref ( ) , & mut reader) ?;
285308
286309 // Indexing a UTF-8 string requires walking from the beginning, which can hurt performance badly when the line is long.
287310 // Since we will be jumping around the line a lot, we dump the content into a Vec<char>, which can be indexed in constant time.
@@ -300,6 +323,24 @@ fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> {
300323 Ok ( file_map)
301324}
302325
326+ fn read_lines (
327+ sentence_splitter : Option < & Regex > ,
328+ reader : & mut dyn BufRead ,
329+ ) -> std:: io:: Result < Vec < String > > {
330+ if let Some ( re) = sentence_splitter {
331+ let mut buffer = String :: new ( ) ;
332+ reader. read_to_string ( & mut buffer) ?;
333+
334+ Ok ( re
335+ . split ( & buffer)
336+ . map ( |s| s. replace ( '\n' , " " ) ) // ptx behavior: newlines become spaces inside sentences
337+ . filter ( |s| !s. is_empty ( ) ) // remove empty sentences
338+ . collect ( ) )
339+ } else {
340+ reader. lines ( ) . collect ( )
341+ }
342+ }
343+
303344/// Go through every line in the input files and record each match occurrence as a `WordRef`.
304345fn create_word_set ( config : & Config , filter : & WordFilter , file_map : & FileMap ) -> BTreeSet < WordRef > {
305346 let reg = Regex :: new ( & filter. word_regex ) . unwrap ( ) ;
@@ -850,8 +891,8 @@ mod options {
850891
851892#[ uucore:: main]
852893pub fn uumain ( args : impl uucore:: Args ) -> UResult < ( ) > {
853- let matches = uucore:: clap_localization:: handle_clap_result ( uu_app ( ) , args) ?;
854- let mut config = get_config ( & matches) ?;
894+ let mut matches = uucore:: clap_localization:: handle_clap_result ( uu_app ( ) , args) ?;
895+ let mut config = get_config ( & mut matches) ?;
855896
856897 let input_files;
857898 let output_file: OsString ;
@@ -883,7 +924,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
883924 }
884925
885926 let word_filter = WordFilter :: new ( & matches, & config) ?;
886- let file_map = read_input ( & input_files) . map_err_context ( String :: new) ?;
927+ let file_map = read_input ( & input_files, & config ) . map_err_context ( String :: new) ?;
887928 let word_set = create_word_set ( & config, & word_filter, & file_map) ;
888929 write_traditional_output ( & mut config, & file_map, & word_set, & output_file)
889930}
0 commit comments