@@ -919,204 +919,6 @@ impl Write for LineChunkWriter<'_> {
919919 }
920920}
921921
922- /// Write lines to each sequential output files, limited by bytes.
923- ///
924- /// This struct maintains an underlying writer representing the
925- /// current chunk of the output. On each call to [`write`], it writes
926- /// as many lines as possible to the current chunk without exceeding
927- /// the specified byte limit. If a single line has more bytes than the
928- /// limit, then fill an entire single chunk with those bytes and
929- /// handle the remainder of the line as if it were its own distinct
930- /// line. As many new underlying writers are created as needed to
931- /// write all the data in the input buffer.
932- struct LineBytesChunkWriter < ' a > {
933- /// Parameters for creating the underlying writer for each new chunk.
934- settings : & ' a Settings ,
935-
936- /// The maximum number of bytes allowed for a single chunk of output.
937- chunk_size : u64 ,
938-
939- /// Running total of number of chunks that have been completed.
940- num_chunks_written : usize ,
941-
942- /// Remaining capacity in number of bytes in the current chunk.
943- ///
944- /// This number starts at `chunk_size` and decreases as lines are
945- /// written. Once it reaches zero, a writer for a new chunk is
946- /// initialized and this number gets reset to `chunk_size`.
947- num_bytes_remaining_in_current_chunk : usize ,
948-
949- /// The underlying writer for the current chunk.
950- ///
951- /// Once the number of bytes written to this writer exceeds
952- /// `chunk_size`, a new writer is initialized and assigned to this
953- /// field.
954- inner : BufWriter < Box < dyn Write > > ,
955-
956- /// Iterator that yields filenames for each chunk.
957- filename_iterator : FilenameIterator < ' a > ,
958- }
959-
960- impl < ' a > LineBytesChunkWriter < ' a > {
961- fn new ( chunk_size : u64 , settings : & ' a Settings ) -> UResult < Self > {
962- let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix ) ?;
963- let filename = filename_iterator
964- . next ( )
965- . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
966- if settings. verbose {
967- println ! ( "creating file {}" , filename. quote( ) ) ;
968- }
969- let inner = settings. instantiate_current_writer ( & filename, true ) ?;
970- Ok ( LineBytesChunkWriter {
971- settings,
972- chunk_size,
973- num_bytes_remaining_in_current_chunk : usize:: try_from ( chunk_size) . unwrap ( ) ,
974- num_chunks_written : 0 ,
975- inner,
976- filename_iterator,
977- } )
978- }
979- }
980-
981- impl Write for LineBytesChunkWriter < ' _ > {
982- /// Write as many lines to a chunk as possible without
983- /// exceeding the byte limit. If a single line has more bytes
984- /// than the limit, then fill an entire single chunk with those
985- /// bytes and handle the remainder of the line as if it were
986- /// its own distinct line.
987- ///
988- /// For example: if the `chunk_size` is 8 and the input is:
989- ///
990- /// ```text
991- /// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
992- /// ```
993- ///
994- /// then the output gets broken into chunks like this:
995- ///
996- /// ```text
997- /// chunk 0 chunk 1 chunk 2 chunk 3
998- ///
999- /// 0 1 2
1000- /// 01234567 89 01234 56789 012 345 6
1001- /// |------| |-------| |--------| |---|
1002- /// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
1003- /// ```
1004- ///
1005- /// Implements `--line-bytes=SIZE`
1006- fn write ( & mut self , mut buf : & [ u8 ] ) -> std:: io:: Result < usize > {
1007- // The total number of bytes written during the loop below.
1008- //
1009- // It is necessary to keep this running total because we may
1010- // be making multiple calls to `write()` on multiple different
1011- // underlying writers and we want the final reported number of
1012- // bytes written to reflect the total number of bytes written
1013- // to all of the underlying writers.
1014- let mut total_bytes_written = 0 ;
1015-
1016- // Loop until we have written all bytes in the input buffer
1017- // (or an IO error occurs).
1018- loop {
1019- // If the buffer is empty, then we are done writing.
1020- if buf. is_empty ( ) {
1021- return Ok ( total_bytes_written) ;
1022- }
1023-
1024- // If we have filled the current chunk with bytes, then
1025- // start a new chunk and initialize its corresponding
1026- // writer.
1027- if self . num_bytes_remaining_in_current_chunk == 0 {
1028- self . num_chunks_written += 1 ;
1029- let filename = self . filename_iterator . next ( ) . ok_or_else ( || {
1030- std:: io:: Error :: new ( ErrorKind :: Other , "output file suffixes exhausted" )
1031- } ) ?;
1032- if self . settings . verbose {
1033- println ! ( "creating file {}" , filename. quote( ) ) ;
1034- }
1035- self . inner = self . settings . instantiate_current_writer ( & filename, true ) ?;
1036- self . num_bytes_remaining_in_current_chunk = self . chunk_size . try_into ( ) . unwrap ( ) ;
1037- }
1038-
1039- // Find the first separator (default - newline character) in the buffer.
1040- let sep = self . settings . separator ;
1041- match memchr:: memchr ( sep, buf) {
1042- // If there is no separator character and the buffer is
1043- // not empty, then write as many bytes as we can and
1044- // then move on to the next chunk if necessary.
1045- None => {
1046- let end = self . num_bytes_remaining_in_current_chunk ;
1047-
1048- // This is ugly but here to match GNU behavior. If the input
1049- // doesn't end with a separator, pretend that it does for handling
1050- // the second to last segment chunk. See `line-bytes.sh`.
1051- if end == buf. len ( )
1052- && self . num_bytes_remaining_in_current_chunk
1053- < self . chunk_size . try_into ( ) . unwrap ( )
1054- && buf[ buf. len ( ) - 1 ] != sep
1055- {
1056- self . num_bytes_remaining_in_current_chunk = 0 ;
1057- } else {
1058- let num_bytes_written = custom_write (
1059- & buf[ ..end. min ( buf. len ( ) ) ] ,
1060- & mut self . inner ,
1061- self . settings ,
1062- ) ?;
1063- self . num_bytes_remaining_in_current_chunk -= num_bytes_written;
1064- total_bytes_written += num_bytes_written;
1065- buf = & buf[ num_bytes_written..] ;
1066- }
1067- }
1068-
1069- // If there is a separator character and the line
1070- // (including the separator character) will fit in the
1071- // current chunk, then write the entire line and
1072- // continue to the next iteration. (See chunk 1 in the
1073- // example comment above.)
1074- Some ( i) if i < self . num_bytes_remaining_in_current_chunk => {
1075- let num_bytes_written =
1076- custom_write ( & buf[ ..=i] , & mut self . inner , self . settings ) ?;
1077- self . num_bytes_remaining_in_current_chunk -= num_bytes_written;
1078- total_bytes_written += num_bytes_written;
1079- buf = & buf[ num_bytes_written..] ;
1080- }
1081-
1082- // If there is a separator character, the line
1083- // (including the separator character) will not fit in
1084- // the current chunk, *and* no other lines have been
1085- // written to the current chunk, then write as many
1086- // bytes as we can and continue to the next
1087- // iteration. (See chunk 0 in the example comment
1088- // above.)
1089- Some ( _)
1090- if self . num_bytes_remaining_in_current_chunk
1091- == self . chunk_size . try_into ( ) . unwrap ( ) =>
1092- {
1093- let end = self . num_bytes_remaining_in_current_chunk ;
1094- let num_bytes_written =
1095- custom_write ( & buf[ ..end] , & mut self . inner , self . settings ) ?;
1096- self . num_bytes_remaining_in_current_chunk -= num_bytes_written;
1097- total_bytes_written += num_bytes_written;
1098- buf = & buf[ num_bytes_written..] ;
1099- }
1100-
1101- // If there is a separator character, the line
1102- // (including the separator character) will not fit in
1103- // the current chunk, and at least one other line has
1104- // been written to the current chunk, then signal to
1105- // the next iteration that a new chunk needs to be
1106- // created and continue to the next iteration of the
1107- // loop to try writing the line there.
1108- Some ( _) => {
1109- self . num_bytes_remaining_in_current_chunk = 0 ;
1110- }
1111- }
1112- }
1113- }
1114-
1115- fn flush ( & mut self ) -> std:: io:: Result < ( ) > {
1116- self . inner . flush ( )
1117- }
1118- }
1119-
1120922/// Output file parameters
1121923struct OutFile {
1122924 filename : String ,
@@ -1629,6 +1431,114 @@ where
16291431 Ok ( ( ) )
16301432}
16311433
1434+ /// Like `std::io::Lines`, but includes the line ending character.
1435+ ///
1436+ /// This struct is generally created by calling `lines_with_sep` on a
1437+ /// reader.
1438+ pub struct LinesWithSep < R > {
1439+ inner : R ,
1440+ separator : u8 ,
1441+ }
1442+
1443+ impl < R > Iterator for LinesWithSep < R >
1444+ where
1445+ R : BufRead ,
1446+ {
1447+ type Item = std:: io:: Result < Vec < u8 > > ;
1448+
1449+ /// Read bytes from a buffer up to the requested number of lines.
1450+ fn next ( & mut self ) -> Option < Self :: Item > {
1451+ let mut buf = vec ! [ ] ;
1452+ match self . inner . read_until ( self . separator , & mut buf) {
1453+ Ok ( 0 ) => None ,
1454+ Ok ( _) => Some ( Ok ( buf) ) ,
1455+ Err ( e) => Some ( Err ( e) ) ,
1456+ }
1457+ }
1458+ }
1459+
1460+ /// Like `std::str::lines` but includes the line ending character.
1461+ ///
1462+ /// The `separator` defines the character to interpret as the line
1463+ /// ending. For the usual notion of "line", set this to `b'\n'`.
1464+ pub fn lines_with_sep < R > ( reader : R , separator : u8 ) -> LinesWithSep < R >
1465+ where
1466+ R : BufRead ,
1467+ {
1468+ LinesWithSep {
1469+ inner : reader,
1470+ separator,
1471+ }
1472+ }
1473+
1474+ fn line_bytes < R > ( settings : & Settings , reader : & mut R , chunk_size : usize ) -> UResult < ( ) >
1475+ where
1476+ R : BufRead ,
1477+ {
1478+ let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix ) ?;
1479+
1480+ // Initialize the writer just to satisfy the compiler. It is going
1481+ // to be overwritten for sure at the beginning of the loop below
1482+ // because we start with `remaining == 0`, indicating that a new
1483+ // chunk should start.
1484+ let mut writer: BufWriter < Box < dyn Write > > =
1485+ BufWriter :: new ( Box :: new ( std:: io:: Cursor :: new ( vec ! [ ] ) ) ) ;
1486+
1487+ let mut remaining = 0 ;
1488+ for line in lines_with_sep ( reader, settings. separator ) {
1489+ let line = line?;
1490+ let mut line = & line[ ..] ;
1491+ loop {
1492+ if remaining == 0 {
1493+ let filename = filename_iterator
1494+ . next ( )
1495+ . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
1496+ if settings. verbose {
1497+ println ! ( "creating file {}" , filename. quote( ) ) ;
1498+ }
1499+ writer = settings. instantiate_current_writer ( & filename, true ) ?;
1500+ remaining = chunk_size;
1501+ }
1502+
1503+ // Special case: if this is the last line and it doesn't end
1504+ // with a newline character, then count its length as though
1505+ // it did end with a newline. If that puts it over the edge
1506+ // of this chunk, continue to the next chunk.
1507+ if line. len ( ) == remaining
1508+ && remaining < chunk_size
1509+ && line[ line. len ( ) - 1 ] != settings. separator
1510+ {
1511+ remaining = 0 ;
1512+ continue ;
1513+ }
1514+
1515+ // If the entire line fits in this chunk, write it and
1516+ // continue to the next line.
1517+ if line. len ( ) <= remaining {
1518+ custom_write_all ( line, & mut writer, settings) ?;
1519+ remaining -= line. len ( ) ;
1520+ break ;
1521+ }
1522+
1523+ // If the line is too large to fit in *any* chunk and we are
1524+ // at the start of a new chunk, write as much as we can of
1525+ // it and pass the remainder along to the next chunk.
1526+ if line. len ( ) > chunk_size && remaining == chunk_size {
1527+ custom_write_all ( & line[ ..chunk_size] , & mut writer, settings) ?;
1528+ line = & line[ chunk_size..] ;
1529+ remaining = 0 ;
1530+ continue ;
1531+ }
1532+
1533+ // If the line is too large to fit in *this* chunk, but
1534+ // might otherwise fit in the next chunk, then just continue
1535+ // to the next chunk and let it be handled there.
1536+ remaining = 0 ;
1537+ }
1538+ }
1539+ Ok ( ( ) )
1540+ }
1541+
16321542#[ allow( clippy:: cognitive_complexity) ]
16331543fn split ( settings : & Settings ) -> UResult < ( ) > {
16341544 let r_box = if settings. input == "-" {
@@ -1701,23 +1611,6 @@ fn split(settings: &Settings) -> UResult<()> {
17011611 } ,
17021612 }
17031613 }
1704- Strategy :: LineBytes ( chunk_size) => {
1705- let mut writer = LineBytesChunkWriter :: new ( chunk_size, settings) ?;
1706- match std:: io:: copy ( & mut reader, & mut writer) {
1707- Ok ( _) => Ok ( ( ) ) ,
1708- Err ( e) => match e. kind ( ) {
1709- // TODO Since the writer object controls the creation of
1710- // new files, we need to rely on the `std::io::Result`
1711- // returned by its `write()` method to communicate any
1712- // errors to this calling scope. If a new file cannot be
1713- // created because we have exceeded the number of
1714- // allowable filenames, we use `ErrorKind::Other` to
1715- // indicate that. A special error message needs to be
1716- // printed in that case.
1717- ErrorKind :: Other => Err ( USimpleError :: new ( 1 , format ! ( "{e}" ) ) ) ,
1718- _ => Err ( uio_error ! ( e, "input/output error" ) ) ,
1719- } ,
1720- }
1721- }
1614+ Strategy :: LineBytes ( chunk_size) => line_bytes ( settings, & mut reader, chunk_size as usize ) ,
17221615 }
17231616}
0 commit comments