Skip to content

Commit e6bccac

Browse files
committed
refactor(mft): replace parallel filename collection with optimized function
1 parent 1f59353 commit e6bccac

File tree

2 files changed

+136
-89
lines changed

2 files changed

+136
-89
lines changed

src/mft/fast_entry.rs

Lines changed: 135 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -4,67 +4,76 @@
44
//! to extract FILE_NAME (0x30) attributes with minimal overhead.
55
66
use crate::mft::fast_fixup::detect_entry_size;
7+
use crate::mft::mft_file::MftFile;
8+
use rayon::prelude::*;
79

810
pub const ATTR_TYPE_FILE_NAME: u32 = 0x30;
911
const ATTRIBUTE_TYPE_END: u32 = 0xFFFF_FFFF;
1012

1113
#[derive(Clone, Copy, Debug)]
1214
pub struct FileNameRef<'a> {
1315
pub entry_id: u32,
14-
pub parent_ref: u64, // raw 64-bit reference (contains sequence)
16+
pub parent_ref: u64, // raw 64-bit reference (contains sequence)
1517
pub namespace: u8,
1618
pub name_utf16: &'a [u16],
1719
}
1820

1921
/// Collection of FILE_NAME attributes extracted from MFT data.
20-
///
22+
///
2123
/// This structure provides organized access to all filename references found
2224
/// in an MFT, with efficient lookup by entry ID.
2325
#[derive(Clone, Debug)]
2426
pub struct FileNameCollection<'a> {
2527
/// All FILE_NAME references found across all entries
2628
pub all_filenames: Vec<FileNameRef<'a>>,
27-
/// Index mapping where `per_entry[entry_id]` contains indices
29+
/// Index mapping where `per_entry_indices[entry_id]` contains indices
2830
/// into `all_filenames` for all filenames belonging to that entry
2931
pub per_entry_indices: Vec<Vec<usize>>,
3032
}
3133

3234
impl<'a> FileNameCollection<'a> {
3335
/// Get all filename references for a specific entry ID.
34-
///
36+
///
3537
/// # Arguments
36-
///
38+
///
3739
/// * `entry_id` - The MFT entry ID to look up
38-
///
40+
///
3941
/// # Returns
40-
///
42+
///
4143
/// An iterator over all `FileNameRef` instances for the given entry,
4244
/// or an empty iterator if the entry ID is not found.
43-
///
45+
///
4446
/// # Example
45-
///
46-
/// ```rust
47-
/// # use mft::fast_entry::{par_collect_filenames, FileNameCollection};
48-
/// # let data = &[0u8; 2048]; // Mock MFT data
49-
/// let collection = par_collect_filenames(data, 1024);
50-
///
47+
///
48+
/// ```rust,no_run
49+
/// use teamy_mft::mft::{fast_entry, mft_file::MftFile};
50+
/// # fn demo() -> eyre::Result<()> {
51+
/// // Load an MFT file and collect all filename (x30) attributes
52+
/// let mft = MftFile::from_path(std::path::Path::new("C:\\path\\to\\cached.mft"))?;
53+
/// let collection = fast_entry::collect_filenames(&mft);
5154
/// for filename in collection.filenames_for_entry(5) {
5255
/// println!("Entry 5 filename: {:?}", filename);
5356
/// }
57+
/// # Ok(()) }
58+
/// # let _ = demo();
5459
/// ```
5560
pub fn filenames_for_entry(&self, entry_id: u32) -> impl Iterator<Item = &FileNameRef<'a>> {
5661
self.per_entry_indices
5762
.get(entry_id as usize)
58-
.map(|indices| indices.iter().filter_map(|&idx| self.all_filenames.get(idx)))
63+
.map(|indices| {
64+
indices
65+
.iter()
66+
.filter_map(|&idx| self.all_filenames.get(idx))
67+
})
5968
.into_iter()
6069
.flatten()
6170
}
62-
71+
6372
/// Get the total number of filename references collected.
6473
pub fn x30_count(&self) -> usize {
6574
self.all_filenames.len()
6675
}
67-
76+
6877
/// Get the number of entry slots in `per_entry_indices`; every slot is counted,
/// whether or not it holds any filename references.
6978
pub fn entry_count(&self) -> usize {
7079
self.per_entry_indices.len()
@@ -73,44 +82,83 @@ impl<'a> FileNameCollection<'a> {
7382

7483
#[inline]
7584
fn read_u16(bytes: &[u8], off: usize) -> Option<u16> {
76-
bytes.get(off..off+2).map(|b| u16::from_le_bytes([b[0], b[1]]))
85+
bytes
86+
.get(off..off + 2)
87+
.map(|b| u16::from_le_bytes([b[0], b[1]]))
7788
}
7889
#[inline]
7990
fn read_u32(bytes: &[u8], off: usize) -> Option<u32> {
80-
bytes.get(off..off+4).map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
91+
bytes
92+
.get(off..off + 4)
93+
.map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
8194
}
8295
#[inline]
8396
fn read_u64(bytes: &[u8], off: usize) -> Option<u64> {
84-
bytes.get(off..off+8).map(|b| u64::from_le_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]]))
97+
bytes
98+
.get(off..off + 8)
99+
.map(|b| u64::from_le_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]]))
85100
}
86101

87102
/// Parse the total entry size from the first entry slice (delegates to fast_fixup helper)
88103
#[inline]
89-
pub fn parse_first_entry_size(first_entry: &[u8]) -> Option<u32> { detect_entry_size(first_entry) }
104+
pub fn parse_first_entry_size(first_entry: &[u8]) -> Option<u32> {
105+
detect_entry_size(first_entry)
106+
}
90107

91108
/// Iterate all FILE_NAME attributes in an entry, invoking callback for each.
92109
/// Returns number of filename attributes found.
93-
pub fn for_each_filename<'a, F: FnMut(FileNameRef<'a>)>(entry_bytes: &'a [u8], entry_id: u32, mut f: F) -> usize {
110+
pub fn for_each_filename<'a, F: FnMut(FileNameRef<'a>)>(
111+
entry_bytes: &'a [u8],
112+
entry_id: u32,
113+
mut f: F,
114+
) -> usize {
94115
// Signature check
95-
if entry_bytes.len() < 0x18 || &entry_bytes[0..4] != b"FILE" { return 0; }
96-
let first_attr_off = match read_u16(entry_bytes, 0x14) { Some(v) => v as usize, None => return 0 };
97-
if first_attr_off == 0 || first_attr_off >= entry_bytes.len() { return 0; }
116+
if entry_bytes.len() < 0x18 || &entry_bytes[0..4] != b"FILE" {
117+
return 0;
118+
}
119+
let first_attr_off = match read_u16(entry_bytes, 0x14) {
120+
Some(v) => v as usize,
121+
None => return 0,
122+
};
123+
if first_attr_off == 0 || first_attr_off >= entry_bytes.len() {
124+
return 0;
125+
}
98126

99127
let mut offset = first_attr_off;
100128
let mut count = 0;
101-
while offset + 16 <= entry_bytes.len() { // minimal attribute header length guard
102-
let attr_type = match read_u32(entry_bytes, offset) { Some(v) => v, None => break };
103-
if attr_type == ATTRIBUTE_TYPE_END { break; }
104-
let attr_len = match read_u32(entry_bytes, offset + 4) { Some(v) => v as usize, None => break };
105-
if attr_len == 0 || offset + attr_len > entry_bytes.len() { break; }
129+
while offset + 16 <= entry_bytes.len() {
130+
// minimal attribute header length guard
131+
let attr_type = match read_u32(entry_bytes, offset) {
132+
Some(v) => v,
133+
None => break,
134+
};
135+
if attr_type == ATTRIBUTE_TYPE_END {
136+
break;
137+
}
138+
let attr_len = match read_u32(entry_bytes, offset + 4) {
139+
Some(v) => v as usize,
140+
None => break,
141+
};
142+
if attr_len == 0 || offset + attr_len > entry_bytes.len() {
143+
break;
144+
}
106145
let non_res_flag = entry_bytes.get(offset + 8).copied().unwrap_or(0);
107146
if attr_type == ATTR_TYPE_FILE_NAME && non_res_flag == 0 {
108147
// Resident attribute header layout (offsets relative to attribute start)
109-
if offset + 24 > entry_bytes.len() { break; }
110-
let value_len = match read_u32(entry_bytes, offset + 16) { Some(v) => v as usize, None => break };
111-
let value_off = match read_u16(entry_bytes, offset + 20) { Some(v) => v as usize, None => break };
148+
if offset + 24 > entry_bytes.len() {
149+
break;
150+
}
151+
let value_len = match read_u32(entry_bytes, offset + 16) {
152+
Some(v) => v as usize,
153+
None => break,
154+
};
155+
let value_off = match read_u16(entry_bytes, offset + 20) {
156+
Some(v) => v as usize,
157+
None => break,
158+
};
112159
let value_abs = offset + value_off;
113-
if value_abs + value_len > entry_bytes.len() || value_len < 0x42 { /* need base struct */ } else {
160+
if value_abs + value_len > entry_bytes.len() || value_len < 0x42 { /* need base struct */
161+
} else {
114162
// FILE_NAME structure
115163
if let Some(parent_ref) = read_u64(entry_bytes, value_abs) {
116164
let name_len = entry_bytes.get(value_abs + 0x40).copied().unwrap_or(0) as usize;
@@ -119,8 +167,18 @@ pub fn for_each_filename<'a, F: FnMut(FileNameRef<'a>)>(entry_bytes: &'a [u8], e
119167
let name_bytes_end = name_utf16_off + name_len * 2;
120168
if name_bytes_end <= entry_bytes.len() {
121169
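// SAFETY: NOTE(review) — not fully justified. `slice::from_raw_parts::<u16>` requires a
// pointer aligned for `u16`, but `entry_bytes` is a `&[u8]`, which only guarantees 1-byte
// alignment. The byte range `name_utf16_off..name_bytes_end` is bounds-checked just above
// and spans exactly `name_len * 2` bytes, so the length is valid; the alignment is NOT
// checked. Creating a misaligned `&[u16]` is undefined behavior on every target (x86
// included). TODO: verify alignment first (e.g. `align_to::<u16>()`) or copy the name out.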
122-
let raw: &[u16] = unsafe { std::slice::from_raw_parts(entry_bytes[name_utf16_off..name_bytes_end].as_ptr() as *const u16, name_len) };
123-
f(FileNameRef { entry_id, parent_ref, namespace, name_utf16: raw });
170+
let raw: &[u16] = unsafe {
171+
std::slice::from_raw_parts(
172+
entry_bytes[name_utf16_off..name_bytes_end].as_ptr() as *const u16,
173+
name_len,
174+
)
175+
};
176+
f(FileNameRef {
177+
entry_id,
178+
parent_ref,
179+
namespace,
180+
name_utf16: raw,
181+
});
124182
count += 1;
125183
}
126184
}
@@ -132,75 +190,65 @@ pub fn for_each_filename<'a, F: FnMut(FileNameRef<'a>)>(entry_bytes: &'a [u8], e
132190
}
133191

134192
/// Parallel collection of all FILE_NAME attributes from MFT data.
135-
///
136-
/// This function processes MFT entries in parallel using Rayon to extract all FILE_NAME
137-
/// attributes efficiently. It's particularly useful for large MFT files where sequential
138-
/// processing would be too slow.
139-
///
140-
/// # Arguments
141-
///
142-
/// * `mft_file` - Raw MFT data bytes (must be properly fixed up with `fast_fixup` first)
143-
/// * `entry_size` - Size of each MFT entry in bytes (typically 1024 bytes)
144-
///
145-
/// # Returns
146-
///
147-
/// A `FileNameCollection` containing all FILE_NAME references with efficient lookup by entry ID.
148-
///
149-
/// # Requirements
150-
///
151-
/// * Requires the `parallel` feature to be enabled
152-
/// * Input data must be evenly divisible by `entry_size`
153-
/// * MFT data must have fixups already applied (use `fast_fixup` module first)
154-
///
193+
///
194+
/// This function processes MFT entries in parallel to extract all FILE_NAME attributes efficiently.
195+
/// It's particularly useful for large MFT files where sequential processing would be too slow.
196+
///
155197
/// # Example
156-
///
157-
/// ```rust
158-
/// # use mft::fast_entry::par_collect_filenames;
159-
/// # let data = &[0u8; 2048]; // Mock MFT data
160-
/// let entry_size = 1024;
161-
/// let collection = par_collect_filenames(data, entry_size);
162-
///
198+
///
199+
/// ```rust,no_run
200+
/// use teamy_mft::mft::{fast_entry, mft_file::MftFile};
201+
/// # fn demo() -> eyre::Result<()> {
202+
/// let mft = MftFile::from_path(std::path::Path::new("C:\\path\\to\\cached.mft"))?;
203+
/// let collection = fast_entry::collect_filenames(&mft);
163204
/// // Access all filenames for entry ID 5
164205
/// for filename in collection.filenames_for_entry(5) {
165206
/// println!("Entry 5 has filename: {:?}", filename);
166207
/// }
167-
///
168-
/// println!("Total filenames found: {}", collection.total_count());
208+
/// println!("Total filenames found: {}", collection.x30_count());
209+
/// # Ok(()) }
210+
/// # let _ = demo();
169211
/// ```
170-
///
171-
/// # Performance
172-
///
173-
/// This function uses Rayon's parallel iterator to process entries concurrently,
174-
/// making it significantly faster than sequential processing for large MFT files.
175-
/// The trade-off is higher memory usage and complexity in the returned data structure.
176-
pub fn par_collect_filenames<'a>(mft_file: &'a [u8], entry_size: usize) -> FileNameCollection<'a> {
177-
use rayon::prelude::*;
178-
let entries = mft_file.par_chunks_exact(entry_size).enumerate();
179-
let per_thread: Vec<(Vec<FileNameRef<'a>>, Vec<(u32, usize)>)> = entries.map(|(idx, entry)| {
180-
let mut list = Vec::new();
181-
let mut pairs = Vec::new();
182-
for_each_filename(entry, idx as u32, |fref| {
183-
let global_index = list.len();
184-
list.push(fref);
185-
pairs.push((fref.entry_id, global_index));
186-
});
187-
(list, pairs)
188-
}).collect();
212+
pub fn collect_filenames<'a>(mft: &'a MftFile) -> FileNameCollection<'a> {
213+
let full: &'a [u8] = &*mft; // borrow the entire bytes buffer
214+
let entry_size = mft.entry_size().get::<uom::si::information::byte>();
215+
let entry_count = mft.entry_count();
216+
217+
let per_thread: Vec<(Vec<FileNameRef<'a>>, Vec<(u32, usize)>)> = (0..entry_count)
218+
.into_par_iter()
219+
.map(|idx| {
220+
let mut list = Vec::new();
221+
let mut pairs = Vec::new();
222+
let start = idx * entry_size;
223+
let end = start + entry_size;
224+
let record_bytes: &'a [u8] = &full[start..end];
225+
for_each_filename(record_bytes, idx as u32, |fref| {
226+
let global_index = list.len();
227+
list.push(fref);
228+
pairs.push((fref.entry_id, global_index));
229+
});
230+
(list, pairs)
231+
})
232+
.collect();
189233

190234
let total = per_thread.iter().map(|(v, _)| v.len()).sum();
191235
let mut file_names = Vec::with_capacity(total);
192-
for (v, _) in &per_thread { file_names.extend_from_slice(v); }
236+
for (v, _) in &per_thread {
237+
file_names.extend_from_slice(v);
238+
}
193239

194-
let entry_count = mft_file.len() / entry_size;
195240
let mut per_entry: Vec<Vec<usize>> = vec![Vec::new(); entry_count];
196241
let mut base = 0usize;
197242
for (v, pairs) in per_thread {
198243
for (entry_id, local_idx) in pairs {
199244
let global_idx = base + local_idx;
200-
if let Some(vec) = per_entry.get_mut(entry_id as usize) { vec.push(global_idx); }
245+
if let Some(vec) = per_entry.get_mut(entry_id as usize) {
246+
vec.push(global_idx);
247+
}
201248
}
202249
base += v.len();
203250
}
251+
204252
FileNameCollection {
205253
all_filenames: file_names,
206254
per_entry_indices: per_entry,

src/mft_process.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ pub fn process_mft_file(
3333

3434
// collect filename attributes (parallel)
3535
let scan_start = Instant::now();
36-
let file_names =
37-
fast_entry::par_collect_filenames(&mft_file, mft_file.entry_size().get::<byte>() as usize);
36+
let file_names = fast_entry::collect_filenames(&mft_file);
3837
let scan_elapsed = Time::new::<second>(scan_start.elapsed().as_secs_f64());
3938
let scan_rate = InformationRate::from(
4039
uom::si::f64::Information::new::<byte>(mft_file.size().get::<byte>() as f64) / scan_elapsed,

0 commit comments

Comments
 (0)