Skip to content

Commit f1f4973

Browse files
committed
basenc: stream base32/base64 I/O to honor bounded-memory test
The GNU basenc bounded-memory test failed because the Rust implementation buffered the entire input and exceeded the virtual-memory limit. Stream base32/base64 through a BufReader with chunked encode/decode so the working set stays around 8 KiB. Keep base58 buffered to preserve its big-integer semantics. Flush already-decoded bytes before returning errors to match GNU output.
1 parent aaa0610 commit f1f4973

File tree

1 file changed

+262
-46
lines changed

1 file changed

+262
-46
lines changed

src/uu/base32/src/base_common.rs

Lines changed: 262 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
use clap::{Arg, ArgAction, Command};
99
use std::ffi::OsString;
1010
use std::fs::File;
11-
use std::io::{self, ErrorKind, Read, Seek, Write};
11+
use std::io::{self, BufReader, ErrorKind, Read, Write};
1212
use std::path::{Path, PathBuf};
1313
use uucore::display::Quotable;
1414
use uucore::encoding::{
@@ -28,6 +28,8 @@ pub const BASE_CMD_PARSE_ERROR: i32 = 1;
2828
///
2929
/// This default is only used if no "-w"/"--wrap" argument is passed
3030
pub const WRAP_DEFAULT: usize = 76;
31+
// Fixed to 8 KiB (equivalent to std::io::DEFAULT_BUF_SIZE on most targets)
32+
pub const DEFAULT_BUFFER_SIZE: usize = 8 * 1024;
3133

3234
pub struct Config {
3335
pub decode: bool,
@@ -149,64 +151,63 @@ pub fn base_app(about: &'static str, usage: &str) -> Command {
149151
)
150152
}
151153

152-
/// A trait alias for types that implement both `Read` and `Seek`.
153-
pub trait ReadSeek: Read + Seek {}
154-
155-
/// Automatically implement the `ReadSeek` trait for any type that implements both `Read` and `Seek`.
156-
impl<T: Read + Seek> ReadSeek for T {}
157-
158-
pub fn get_input(config: &Config) -> UResult<Box<dyn ReadSeek>> {
154+
pub fn get_input(config: &Config) -> UResult<Box<dyn Read>> {
159155
match &config.to_read {
160156
Some(path_buf) => {
161-
// Do not buffer input, because buffering is handled by `fast_decode` and `fast_encode`
162157
let file =
163158
File::open(path_buf).map_err_context(|| path_buf.maybe_quote().to_string())?;
164-
Ok(Box::new(file))
159+
Ok(Box::new(BufReader::new(file)))
165160
}
166161
None => {
167-
let mut buffer = Vec::new();
168-
io::stdin().read_to_end(&mut buffer)?;
169-
Ok(Box::new(io::Cursor::new(buffer)))
162+
// `io::stdin()` is already buffered inside the standard library (a shared, mutex-guarded buffer); the extra `BufReader` mainly reduces per-read locking of that shared handle.
163+
Ok(Box::new(BufReader::new(io::stdin())))
170164
}
171165
}
172166
}
173-
174-
/// Determines if the input buffer contains any padding ('=') ignoring trailing whitespace.
175-
fn read_and_has_padding<R: Read>(input: &mut R) -> UResult<(bool, Vec<u8>)> {
176-
let mut buf = Vec::new();
177-
input
178-
.read_to_end(&mut buf)
179-
.map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?;
180-
181-
// Treat the stream as padded if any '=' exists (GNU coreutils continues decoding
182-
// even when padding bytes are followed by more data).
183-
let has_padding = buf.contains(&b'=');
184-
185-
Ok((has_padding, buf))
186-
}
187-
188-
pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Config) -> UResult<()> {
189-
let (has_padding, read) = read_and_has_padding(input)?;
190-
167+
pub fn handle_input<R: Read>(input: &mut R, format: Format, config: Config) -> UResult<()> {
168+
// Always allow padding for Base64 to avoid a full pre-scan of the input.
191169
let supports_fast_decode_and_encode =
192-
get_supports_fast_decode_and_encode(format, config.decode, has_padding);
170+
get_supports_fast_decode_and_encode(format, config.decode, true);
193171

194172
let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref();
195173
let mut stdout_lock = io::stdout().lock();
196-
let result = if config.decode {
197-
fast_decode::fast_decode(
198-
read,
174+
let result = match (format, config.decode) {
175+
// Base58 must process the entire input as one big integer; keep the
176+
// historical behaviour of buffering everything for this format only.
177+
(Format::Base58, _) => {
178+
let mut buffered = Vec::new();
179+
input
180+
.read_to_end(&mut buffered)
181+
.map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?;
182+
if config.decode {
183+
fast_decode::fast_decode_buffer(
184+
buffered,
185+
&mut stdout_lock,
186+
supports_fast_decode_and_encode_ref,
187+
config.ignore_garbage,
188+
)
189+
} else {
190+
fast_encode::fast_encode_buffer(
191+
buffered,
192+
&mut stdout_lock,
193+
supports_fast_decode_and_encode_ref,
194+
config.wrap_cols,
195+
)
196+
}
197+
}
198+
// Streaming path for all other encodings keeps memory bounded.
199+
(_, true) => fast_decode::fast_decode_stream(
200+
input,
199201
&mut stdout_lock,
200202
supports_fast_decode_and_encode_ref,
201203
config.ignore_garbage,
202-
)
203-
} else {
204-
fast_encode::fast_encode(
205-
read,
204+
),
205+
(_, false) => fast_encode::fast_encode_stream(
206+
input,
206207
&mut stdout_lock,
207208
supports_fast_decode_and_encode_ref,
208209
config.wrap_cols,
209-
)
210+
),
210211
};
211212

212213
// Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc
@@ -296,14 +297,17 @@ pub fn get_supports_fast_decode_and_encode(
296297
}
297298

298299
pub mod fast_encode {
299-
use crate::base_common::WRAP_DEFAULT;
300+
use crate::base_common::{DEFAULT_BUFFER_SIZE, WRAP_DEFAULT};
300301
use std::{
301302
cmp::min,
302303
collections::VecDeque,
303-
io::{self, Write},
304+
io::{self, Read, Write},
304305
num::NonZeroUsize,
305306
};
306-
use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
307+
use uucore::{
308+
encoding::SupportsFastDecodeAndEncode,
309+
error::{UResult, USimpleError},
310+
};
307311

308312
struct LineWrapping {
309313
line_length: NonZeroUsize,
@@ -405,7 +409,7 @@ pub mod fast_encode {
405409
}
406410
// End of helper functions
407411

408-
pub fn fast_encode(
412+
pub fn fast_encode_buffer(
409413
input: Vec<u8>,
410414
output: &mut dyn Write,
411415
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
@@ -506,10 +510,90 @@ pub mod fast_encode {
506510
}
507511
Ok(())
508512
}
513+
514+
pub fn fast_encode_stream(
515+
input: &mut dyn Read,
516+
output: &mut dyn Write,
517+
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
518+
wrap: Option<usize>,
519+
) -> UResult<()> {
520+
const ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;
521+
522+
let encode_in_chunks_of_size =
523+
supports_fast_decode_and_encode.unpadded_multiple() * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
524+
525+
assert!(encode_in_chunks_of_size > 0);
526+
527+
let mut line_wrapping = match wrap {
528+
Some(0) => None,
529+
Some(an) => Some(LineWrapping {
530+
line_length: NonZeroUsize::new(an).unwrap(),
531+
print_buffer: Vec::<u8>::new(),
532+
}),
533+
None => Some(LineWrapping {
534+
line_length: NonZeroUsize::new(WRAP_DEFAULT).unwrap(),
535+
print_buffer: Vec::<u8>::new(),
536+
}),
537+
};
538+
539+
// Buffers
540+
let mut leftover_buffer = VecDeque::<u8>::new();
541+
let mut encoded_buffer = VecDeque::<u8>::new();
542+
543+
let mut read_buffer = vec![0u8; encode_in_chunks_of_size.max(DEFAULT_BUFFER_SIZE)];
544+
545+
loop {
546+
let read = input
547+
.read(&mut read_buffer)
548+
.map_err(|err| USimpleError::new(1, super::format_read_error(err.kind())))?;
549+
if read == 0 {
550+
break;
551+
}
552+
553+
leftover_buffer.extend(&read_buffer[..read]);
554+
555+
while leftover_buffer.len() >= encode_in_chunks_of_size {
556+
{
557+
let contiguous = leftover_buffer.make_contiguous();
558+
encode_in_chunks_to_buffer(
559+
supports_fast_decode_and_encode,
560+
&contiguous[..encode_in_chunks_of_size],
561+
&mut encoded_buffer,
562+
)?;
563+
}
564+
565+
// Drop the data we just encoded
566+
leftover_buffer.drain(..encode_in_chunks_of_size);
567+
568+
write_to_output(
569+
&mut line_wrapping,
570+
&mut encoded_buffer,
571+
output,
572+
false,
573+
wrap == Some(0),
574+
)?;
575+
}
576+
}
577+
578+
// Encode any remaining bytes and flush
579+
supports_fast_decode_and_encode
580+
.encode_to_vec_deque(leftover_buffer.make_contiguous(), &mut encoded_buffer)?;
581+
582+
write_to_output(
583+
&mut line_wrapping,
584+
&mut encoded_buffer,
585+
output,
586+
true,
587+
wrap == Some(0),
588+
)?;
589+
590+
Ok(())
591+
}
509592
}
510593

511594
pub mod fast_decode {
512-
use std::io::{self, Write};
595+
use crate::base_common::DEFAULT_BUFFER_SIZE;
596+
use std::io::{self, Read, Write};
513597
use uucore::{
514598
encoding::SupportsFastDecodeAndEncode,
515599
error::{UResult, USimpleError},
@@ -579,7 +663,7 @@ pub mod fast_decode {
579663
}
580664
// End of helper functions
581665

582-
pub fn fast_decode(
666+
pub fn fast_decode_buffer(
583667
input: Vec<u8>,
584668
output: &mut dyn Write,
585669
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
@@ -671,6 +755,123 @@ pub mod fast_decode {
671755

672756
Ok(())
673757
}
758+
759+
pub fn fast_decode_stream(
760+
input: &mut dyn Read,
761+
output: &mut dyn Write,
762+
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
763+
ignore_garbage: bool,
764+
) -> UResult<()> {
765+
const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;
766+
767+
let alphabet = supports_fast_decode_and_encode.alphabet();
768+
let alphabet_table = alphabet_lookup(alphabet);
769+
let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
770+
let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
771+
772+
assert!(decode_in_chunks_of_size > 0);
773+
assert!(valid_multiple > 0);
774+
775+
let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();
776+
777+
let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);
778+
let mut decoded_buffer = Vec::<u8>::new();
779+
let mut read_buffer = [0u8; DEFAULT_BUFFER_SIZE];
780+
781+
loop {
782+
let read = input
783+
.read(&mut read_buffer)
784+
.map_err(|err| USimpleError::new(1, super::format_read_error(err.kind())))?;
785+
if read == 0 {
786+
break;
787+
}
788+
789+
for &byte in &read_buffer[..read] {
790+
if byte == b'\n' || byte == b'\r' {
791+
continue;
792+
}
793+
794+
if alphabet_table[usize::from(byte)] {
795+
buffer.push(byte);
796+
} else if ignore_garbage {
797+
continue;
798+
} else {
799+
if supports_partial_decode {
800+
flush_ready_chunks(
801+
&mut buffer,
802+
decode_in_chunks_of_size,
803+
valid_multiple,
804+
supports_fast_decode_and_encode,
805+
&mut decoded_buffer,
806+
output,
807+
)?;
808+
} else {
809+
while buffer.len() >= decode_in_chunks_of_size {
810+
decode_in_chunks_to_buffer(
811+
supports_fast_decode_and_encode,
812+
&buffer[..decode_in_chunks_of_size],
813+
&mut decoded_buffer,
814+
)?;
815+
write_to_output(&mut decoded_buffer, output)?;
816+
buffer.drain(..decode_in_chunks_of_size);
817+
}
818+
}
819+
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
820+
}
821+
822+
if supports_partial_decode {
823+
flush_ready_chunks(
824+
&mut buffer,
825+
decode_in_chunks_of_size,
826+
valid_multiple,
827+
supports_fast_decode_and_encode,
828+
&mut decoded_buffer,
829+
output,
830+
)?;
831+
} else if buffer.len() == decode_in_chunks_of_size {
832+
decode_in_chunks_to_buffer(
833+
supports_fast_decode_and_encode,
834+
&buffer,
835+
&mut decoded_buffer,
836+
)?;
837+
write_to_output(&mut decoded_buffer, output)?;
838+
buffer.clear();
839+
}
840+
}
841+
}
842+
843+
if supports_partial_decode {
844+
flush_ready_chunks(
845+
&mut buffer,
846+
decode_in_chunks_of_size,
847+
valid_multiple,
848+
supports_fast_decode_and_encode,
849+
&mut decoded_buffer,
850+
output,
851+
)?;
852+
}
853+
854+
if !buffer.is_empty() {
855+
let mut owned_chunk: Option<Vec<u8>> = None;
856+
let mut had_invalid_tail = false;
857+
858+
if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) {
859+
had_invalid_tail = pad_result.had_invalid_tail;
860+
owned_chunk = Some(pad_result.chunk);
861+
}
862+
863+
let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer);
864+
865+
supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
866+
write_to_output(&mut decoded_buffer, output)?;
867+
868+
if had_invalid_tail {
869+
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
870+
}
871+
}
872+
873+
Ok(())
874+
}
674875
}
675876

676877
fn format_read_error(kind: ErrorKind) -> String {
@@ -692,6 +893,21 @@ fn format_read_error(kind: ErrorKind) -> String {
692893
translate!("base-common-read-error", "error" => kind_string_capitalized)
693894
}
694895

896+
/// Reads the entire input and reports whether it contains a padding byte ('=') anywhere, returning that flag together with the bytes read.
897+
#[cfg(test)]
898+
fn read_and_has_padding<R: Read>(input: &mut R) -> UResult<(bool, Vec<u8>)> {
899+
let mut buf = Vec::new();
900+
input
901+
.read_to_end(&mut buf)
902+
.map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?;
903+
904+
// Treat the stream as padded if any '=' exists (GNU coreutils continues decoding
905+
// even when padding bytes are followed by more data).
906+
let has_padding = buf.contains(&b'=');
907+
908+
Ok((has_padding, buf))
909+
}
910+
695911
#[cfg(test)]
696912
mod tests {
697913
use crate::base_common::read_and_has_padding;

0 commit comments

Comments
 (0)