Skip to content

Commit 53d25d2

Browse files
committed
Fix an issue that made the buffer contain 00 bytes for files smaller than
BUFFER_SIZE, even though the file itself didn't have any 00 bytes. Also includes some code refactoring.
1 parent 1e059ac commit 53d25d2

File tree

1 file changed

+94
-67
lines changed

1 file changed

+94
-67
lines changed

src/utils/search.rs

Lines changed: 94 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -2,40 +2,94 @@ use std::fs::File;
22
use std::io::{BufReader, Read};
33
use std::ops::Range;
44

5-
#[derive(Debug)]
6-
pub struct Match {
7-
index: Range<usize>,
5+
#[derive(Debug, Clone)]
6+
pub struct Matches {
7+
indexes: Vec<Range<usize>>,
88
data: Vec<u8>,
9-
offset: usize,
9+
offset: Vec<usize>,
10+
pattern_len: usize,
11+
context_bytes_size: usize,
12+
curr_context_bytes_indexes: Option<Range<usize>>,
1013
}
1114

12-
impl Match {
13-
pub fn new(offset: usize, index: Range<usize>, data: Vec<u8>) -> Self {
15+
impl Matches {
16+
pub fn new(pattern_len: usize, context_bytes_size: usize) -> Self {
1417
Self {
15-
index,
16-
data,
17-
offset,
18+
pattern_len,
19+
context_bytes_size,
20+
indexes: Vec::new(),
21+
data: Vec::new(),
22+
offset: Vec::new(),
23+
curr_context_bytes_indexes: None,
1824
}
1925
}
2026

21-
pub fn offset(&self) -> usize {
22-
self.offset
27+
pub fn offset(&self) -> &[usize] {
28+
&self.offset
2329
}
2430

2531
/// Get a reference to the match's index.
26-
pub fn index(&self) -> Range<usize> {
27-
self.index.clone()
32+
pub fn indexes(&self) -> &[Range<usize>] {
33+
&self.indexes
2834
}
2935

3036
/// Get a reference to the match's data.
3137
pub fn data(&self) -> &[u8] {
3238
&self.data
3339
}
40+
41+
fn populate_matches(&mut self, index: usize, pos_in_file: usize, buffer: &[u8]) {
42+
// index where we should start collecting bytes for context
43+
let offset = index - (index % self.context_bytes_size);
44+
45+
// search_slice only returns the index where the match starts, so we need to
46+
// create a range with all the indexes of the match
47+
let match_indexes = index..index + self.pattern_len;
48+
49+
// Creates the index range for the context bytes.
50+
// context_bytes_indexes may contain all of the match's indexes or only some of them, depending on
51+
// context_bytes_size and the pattern size
52+
let mut context_bytes_indexes = if offset + self.context_bytes_size <= buffer.len() {
53+
offset..offset + self.context_bytes_size
54+
} else {
55+
offset..buffer.len()
56+
};
57+
58+
// In case context_bytes_size doesn't contain all of the match indexes we
59+
// need to extend the end of the range
60+
if context_bytes_indexes.end < match_indexes.end {
61+
context_bytes_indexes.end += self.context_bytes_size;
62+
}
63+
64+
let context_bytes_indexes = Some(context_bytes_indexes);
65+
if context_bytes_indexes != self.curr_context_bytes_indexes { // Check if context_bytes_indexes was already added
66+
self.curr_context_bytes_indexes = context_bytes_indexes.clone();
67+
68+
// The actual bytes for context + the matching bytes
69+
// needed for printing the result
70+
self.data.extend_from_slice(&buffer[context_bytes_indexes.unwrap()]);
71+
72+
// The index is relative to the position in the current buffer we are
73+
// reading from the file, but we need to store the position relative to the
74+
// whole file
75+
self.offset.push(index + pos_in_file);
76+
}
77+
78+
// Now we need to know the indexes of the match inside of context_bytes
79+
let mut match_indexes = match_indexes.start % self.data.len()
80+
..match_indexes.end % self.data.len();
81+
82+
if match_indexes.end < match_indexes.start {
83+
match_indexes.end = match_indexes.start + self.pattern_len;
84+
}
85+
86+
self.indexes.push(match_indexes);
87+
}
3488
}
3589

3690
pub struct Searcher<'a> {
3791
pattern: &'a [u8],
38-
result: Vec<Vec<Match>>,
92+
matches: Matches,
3993
context_bytes_size: usize,
4094
}
4195

@@ -45,72 +99,45 @@ impl<'a> Searcher<'a> {
4599
pub fn new(pattern: &'a [u8], context_bytes_size: usize) -> Self {
46100
Self {
47101
pattern,
48-
result: Vec::new(),
102+
matches: Matches::new(pattern.len(), context_bytes_size),
49103
context_bytes_size,
50104
}
51105
}
52106

53107
pub fn search_in_file(&mut self, filepath: &str) -> std::io::Result<()> {
54108
let file = File::open(filepath)?;
109+
let file_size = file.metadata().unwrap().len() as usize;
55110

56111
let mut reader = BufReader::new(file);
57-
let mut buffer = [0; Self::BUFFER_SIZE];
58112
let mut pos_in_file = 0;
59113

60-
loop {
61-
let n = reader.read(&mut buffer).unwrap();
114+
if file_size < self.context_bytes_size {
115+
self.context_bytes_size = file_size;
116+
}
62117

63-
if n == 0 {
64-
break;
65-
}
118+
if file_size <= Self::BUFFER_SIZE {
119+
let mut buffer = Vec::new();
120+
reader.read_to_end(&mut buffer)?;
66121

67122
let result = Self::search_slice(&buffer, self.pattern);
123+
for index in result {
124+
self.matches.populate_matches(index, 0, &buffer);
125+
}
126+
} else {
127+
let mut buffer = [0; Self::BUFFER_SIZE];
128+
loop {
129+
let n = reader.read(&mut buffer).unwrap();
130+
131+
if n == 0 {
132+
break;
133+
}
68134

69-
if !result.is_empty() {
70-
// Convert the vector of indexes that match the pattern into Match objects
71-
let result = result
72-
.iter()
73-
.map(|&index| {
74-
// index where we should start collecting bytes for context
75-
let offset = index - (index % self.context_bytes_size);
76-
77-
// search_for_slice only return the index where the match start so we need to
78-
// create a range with all indexes from the match
79-
let match_indexes = index..index + self.pattern.len();
80-
81-
// Creates the index range for the context bytes.
82-
// this can contain all the indexes for the match or partially, depends on
83-
// context_bytes_size and the pattern size
84-
let mut context_bytes_indexes = offset..offset + self.context_bytes_size;
85-
86-
// In case context_bytes_size doesn't contain all of the match indexes we
87-
// need to extend the end of the range
88-
if context_bytes_indexes.end < match_indexes.end {
89-
context_bytes_indexes.end += self.context_bytes_size;
90-
}
91-
92-
// The actual bytes for context + the matching bytes
93-
// only for printing the result
94-
let context_bytes = buffer[context_bytes_indexes].to_vec();
95-
96-
// Now we need to know the indexes of the match inside of context_bytes
97-
let mut match_indexes = match_indexes.start % context_bytes.len()
98-
..match_indexes.end % context_bytes.len();
99-
100-
if match_indexes.end == 0 {
101-
match_indexes.end = self.context_bytes_size;
102-
}
103-
104-
// The index is relative to the position in the current buffer we are
105-
// reading from the file, but we need to store the position relative to the
106-
// whole file
107-
Match::new(index + pos_in_file, match_indexes, context_bytes)
108-
})
109-
.collect();
110-
111-
self.result.push(result);
135+
let result = Self::search_slice(&buffer, self.pattern);
136+
for index in result {
137+
self.matches.populate_matches(index, pos_in_file, &buffer);
138+
}
139+
pos_in_file += Self::BUFFER_SIZE;
112140
}
113-
pos_in_file += Self::BUFFER_SIZE;
114141
}
115142

116143
Ok(())
@@ -161,8 +188,8 @@ impl<'a> Searcher<'a> {
161188
}
162189

163190
/// Get a reference to the searcher's result.
164-
pub fn result(&self) -> &[Vec<Match>] {
165-
&self.result
191+
pub fn result(&self) -> &Matches {
192+
&self.matches
166193
}
167194

168195
/// Return the context bytes size.

0 commit comments

Comments
 (0)