Skip to content

Commit 96236e8

Browse files
committed
[pax] performance improvements
Changes Made 1. Pattern matching optimization (pax/pattern.rs): - Pre-collect chars once in matches(), pass slice to recursive functions - Eliminates O(n) allocation per recursive call during backtracking 2. Substitution string building (pax/subst.rs): - Use String::with_capacity() for exact allocation size - Use push_str() instead of format!() in loops - Pre-allocate build_replacement() buffer with reasonable estimate 3. Static padding buffers (pax/formats/ustar.rs, pax/formats/pax.rs): - Added static ZERO_BLOCK: [u8; 512] - Replaced vec![0u8; padding] with &ZERO_BLOCK[..padding] - Eliminates per-entry heap allocations 4. Pre-compiled delete patterns (pax/options.rs): - Added delete_patterns_compiled: Vec<Pattern> field - Patterns compiled once during parsing, not on every keyword check - should_delete_keyword() now uses pre-compiled patterns Performance Results (1000 files × 10KB = 10MB) | Operation | System tar | Our pax | Result | |-----------|------------|---------|------------------| | Create | 0.134s | 0.067s | pax 2x faster | | Extract | 0.328s | 0.349s | Same (I/O bound) | | List | 0.015s | 0.012s | pax 20% faster |
1 parent 41462d8 commit 96236e8

File tree

5 files changed

+58
-60
lines changed

5 files changed

+58
-60
lines changed

pax/formats/pax.rs

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ use std::io::{Read, Write};
3131
use std::path::PathBuf;
3232

3333
const BLOCK_SIZE: usize = 512;
34+
/// Static zero buffer for padding and end-of-archive markers
35+
static ZERO_BLOCK: [u8; BLOCK_SIZE] = [0u8; BLOCK_SIZE];
3436

3537
// Extended header typeflags
3638
const PAX_XHDR: u8 = b'x'; // Per-file extended header
@@ -504,11 +506,11 @@ impl<R: Read> PaxReader<R> {
504506
let mut data = vec![0u8; size as usize];
505507
self.reader.read_exact(&mut data)?;
506508

507-
// Skip padding to block boundary
509+
// Skip padding to block boundary using a stack buffer
508510
let padding = padding_needed(size);
509511
if padding > 0 {
510-
let mut pad = vec![0u8; padding];
511-
self.reader.read_exact(&mut pad)?;
512+
let mut pad = [0u8; BLOCK_SIZE];
513+
self.reader.read_exact(&mut pad[..padding])?;
512514
}
513515

514516
ExtendedHeader::parse(&data)
@@ -695,11 +697,10 @@ impl<W: Write> PaxWriter<W> {
695697
// Write global header data
696698
self.writer.write_all(&data)?;
697699

698-
// Pad to block boundary
700+
// Pad to block boundary using static buffer
699701
let padding = padding_needed(data.len() as u64);
700702
if padding > 0 {
701-
let zeros = vec![0u8; padding];
702-
self.writer.write_all(&zeros)?;
703+
self.writer.write_all(&ZERO_BLOCK[..padding])?;
703704
}
704705

705706
Ok(())
@@ -759,11 +760,10 @@ impl<W: Write> PaxWriter<W> {
759760
// Write extended header data
760761
self.writer.write_all(&data)?;
761762

762-
// Pad to block boundary
763+
// Pad to block boundary using static buffer
763764
let padding = padding_needed(data.len() as u64);
764765
if padding > 0 {
765-
let zeros = vec![0u8; padding];
766-
self.writer.write_all(&zeros)?;
766+
self.writer.write_all(&ZERO_BLOCK[..padding])?;
767767
}
768768

769769
Ok(())
@@ -797,19 +797,18 @@ impl<W: Write> ArchiveWriter for PaxWriter<W> {
797797
}
798798

799799
fn finish_entry(&mut self) -> PaxResult<()> {
800+
// Pad to block boundary using static buffer
800801
let padding = padding_needed(self.bytes_written);
801802
if padding > 0 {
802-
let zeros = vec![0u8; padding];
803-
self.writer.write_all(&zeros)?;
803+
self.writer.write_all(&ZERO_BLOCK[..padding])?;
804804
}
805805
Ok(())
806806
}
807807

808808
fn finish(&mut self) -> PaxResult<()> {
809-
// Write two zero blocks
810-
let zeros = [0u8; BLOCK_SIZE];
811-
self.writer.write_all(&zeros)?;
812-
self.writer.write_all(&zeros)?;
809+
// Write two zero blocks using static buffer
810+
self.writer.write_all(&ZERO_BLOCK)?;
811+
self.writer.write_all(&ZERO_BLOCK)?;
813812
self.writer.flush()?;
814813
Ok(())
815814
}

pax/formats/ustar.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ use std::io::{Read, Write};
3333
use std::path::PathBuf;
3434

3535
const BLOCK_SIZE: usize = 512;
36+
/// Static zero buffer for padding and end-of-archive markers
37+
static ZERO_BLOCK: [u8; BLOCK_SIZE] = [0u8; BLOCK_SIZE];
3638
const NAME_LEN: usize = 100;
3739
const PREFIX_LEN: usize = 155;
3840
const LINKNAME_LEN: usize = 100;
@@ -184,20 +186,18 @@ impl<W: Write> ArchiveWriter for UstarWriter<W> {
184186
}
185187

186188
fn finish_entry(&mut self) -> PaxResult<()> {
187-
// Pad to block boundary
189+
// Pad to block boundary using static zero buffer
188190
let padding = padding_needed(self.bytes_written);
189191
if padding > 0 {
190-
let zeros = vec![0u8; padding];
191-
self.writer.write_all(&zeros)?;
192+
self.writer.write_all(&ZERO_BLOCK[..padding])?;
192193
}
193194
Ok(())
194195
}
195196

196197
fn finish(&mut self) -> PaxResult<()> {
197-
// Write two zero blocks
198-
let zeros = [0u8; BLOCK_SIZE];
199-
self.writer.write_all(&zeros)?;
200-
self.writer.write_all(&zeros)?;
198+
// Write two zero blocks using static buffer
199+
self.writer.write_all(&ZERO_BLOCK)?;
200+
self.writer.write_all(&ZERO_BLOCK)?;
201201
self.writer.flush()?;
202202
Ok(())
203203
}

pax/options.rs

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ pub struct FormatOptions {
6262
pub list_format: Option<String>,
6363
/// Delete patterns (delete=pattern) - for future pax format support
6464
pub delete_patterns: Vec<String>,
65+
/// Pre-compiled delete patterns for efficient matching
66+
delete_patterns_compiled: Vec<Pattern>,
6567
/// Times option (include atime/mtime in extended headers)
6668
pub include_times: bool,
6769
/// Linkdata option (write contents for hard links)
@@ -159,6 +161,10 @@ impl FormatOptions {
159161
"delete" => {
160162
if let Some(pattern) = value {
161163
self.delete_patterns.push(pattern.to_string());
164+
// Pre-compile pattern for efficient matching
165+
if let Ok(compiled) = Pattern::new(pattern) {
166+
self.delete_patterns_compiled.push(compiled);
167+
}
162168
}
163169
}
164170
"times" => {
@@ -224,6 +230,8 @@ impl FormatOptions {
224230
}
225231
self.delete_patterns
226232
.extend(other.delete_patterns.iter().cloned());
233+
self.delete_patterns_compiled
234+
.extend(other.delete_patterns_compiled.iter().cloned());
227235
if other.include_times {
228236
self.include_times = true;
229237
}
@@ -234,17 +242,11 @@ impl FormatOptions {
234242

235243
/// Check if a keyword should be deleted from extended headers
236244
///
237-
/// Returns true if the keyword matches any of the delete patterns
245+
/// Returns true if the keyword matches any of the pre-compiled delete patterns
238246
pub fn should_delete_keyword(&self, keyword: &str) -> bool {
239-
for pattern_str in &self.delete_patterns {
240-
// Parse pattern and match against keyword
241-
if let Ok(pattern) = Pattern::new(pattern_str) {
242-
if pattern.matches(keyword) {
243-
return true;
244-
}
245-
}
246-
}
247-
false
247+
self.delete_patterns_compiled
248+
.iter()
249+
.any(|pattern| pattern.matches(keyword))
248250
}
249251

250252
/// Get the global options map for extended header generation

pax/pattern.rs

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ impl Pattern {
5656

5757
/// Check if a string matches this pattern
5858
pub fn matches(&self, text: &str) -> bool {
59-
match_tokens(&self.tokens, text)
59+
// Pre-collect chars once to avoid repeated allocations in recursive calls
60+
let text_chars: Vec<char> = text.chars().collect();
61+
match_tokens_with_chars(&self.tokens, &text_chars, 0)
6062
}
6163
}
6264

@@ -132,59 +134,54 @@ fn parse_char_class(chars: &mut std::iter::Peekable<std::str::Chars>) -> PaxResu
132134
Err(PaxError::PatternError("unclosed bracket".to_string()))
133135
}
134136

135-
/// Match tokens against text
136-
fn match_tokens(tokens: &[Token], text: &str) -> bool {
137-
match_tokens_at(tokens, text, 0)
138-
}
139-
140-
/// Recursive matching with position tracking
141-
fn match_tokens_at(tokens: &[Token], text: &str, pos: usize) -> bool {
137+
/// Recursive matching with position tracking using pre-collected chars
138+
fn match_tokens_with_chars(tokens: &[Token], text_chars: &[char], pos: usize) -> bool {
142139
if tokens.is_empty() {
143-
return pos == text.len();
140+
return pos == text_chars.len();
144141
}
145142

146-
let text_chars: Vec<char> = text.chars().collect();
147-
148143
match &tokens[0] {
149144
Token::Char(c) => {
150145
if pos < text_chars.len() && text_chars[pos] == *c {
151-
match_tokens_at(&tokens[1..], text, pos + 1)
146+
match_tokens_with_chars(&tokens[1..], text_chars, pos + 1)
152147
} else {
153148
false
154149
}
155150
}
156151
Token::Any => {
157152
// ? matches any single character except /
158153
if pos < text_chars.len() && text_chars[pos] != '/' {
159-
match_tokens_at(&tokens[1..], text, pos + 1)
154+
match_tokens_with_chars(&tokens[1..], text_chars, pos + 1)
160155
} else {
161156
false
162157
}
163158
}
164159
Token::Star => {
165160
// * matches any sequence except /
166-
match_star(&tokens[1..], text, pos)
161+
match_star_with_chars(&tokens[1..], text_chars, pos)
167162
}
168163
Token::Class(class) => {
169164
if pos < text_chars.len() && class_matches(class, text_chars[pos]) {
170-
match_tokens_at(&tokens[1..], text, pos + 1)
165+
match_tokens_with_chars(&tokens[1..], text_chars, pos + 1)
171166
} else {
172167
false
173168
}
174169
}
175170
}
176171
}
177172

178-
/// Handle star matching (greedy with backtracking)
173+
/// Handle star matching (greedy with backtracking) using pre-collected chars
179174
/// Star matches any sequence except /
180-
#[allow(clippy::needless_range_loop)]
181-
fn match_star(remaining_tokens: &[Token], text: &str, start_pos: usize) -> bool {
182-
let text_chars: Vec<char> = text.chars().collect();
175+
fn match_star_with_chars(
176+
remaining_tokens: &[Token],
177+
text_chars: &[char],
178+
start_pos: usize,
179+
) -> bool {
183180
let text_len = text_chars.len();
184181

185182
// Try matching zero or more characters (but not /)
186183
for pos in start_pos..=text_len {
187-
if match_tokens_at(remaining_tokens, text, pos) {
184+
if match_tokens_with_chars(remaining_tokens, text_chars, pos) {
188185
return true;
189186
}
190187

pax/subst.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,12 @@ impl Substitution {
134134
let match_start = matches[0].start;
135135
let match_end = matches[0].end;
136136

137-
// Replace the matched portion
138-
let new_result = format!(
139-
"{}{}{}",
140-
&result[..match_start],
141-
replacement,
142-
&result[match_end..]
143-
);
137+
// Replace the matched portion efficiently using with_capacity and push_str
138+
let new_len = result.len() - (match_end - match_start) + replacement.len();
139+
let mut new_result = String::with_capacity(new_len);
140+
new_result.push_str(&result[..match_start]);
141+
new_result.push_str(&replacement);
142+
new_result.push_str(&result[match_end..]);
144143

145144
// Update position for next iteration
146145
// Move past the replacement (or at least one char to avoid infinite loop)
@@ -182,7 +181,8 @@ impl Substitution {
182181

183182
/// Build the replacement string from template and match groups
184183
fn build_replacement(template: &str, input: &str, matches: &[Match]) -> String {
185-
let mut result = String::new();
184+
// Pre-allocate with a reasonable estimate (template length + some extra for expansions)
185+
let mut result = String::with_capacity(template.len() + 32);
186186
let mut chars = template.chars().peekable();
187187

188188
while let Some(c) = chars.next() {

0 commit comments

Comments
 (0)