Skip to content

Commit b8b7076

Browse files
committed
fix: sanitize filenames for cross-platform compatibility
- Add sanitize_filename() function to utils.rs that replaces invalid filesystem characters (< > : " | ? * and control characters) with underscores
- Handle Windows reserved names (CON, PRN, AUX, NUL, COM1-9, LPT1-9) by inserting an underscore before the extension
- Preserve directory structure in filenames with path separators
- Trim leading/trailing spaces and trailing dots from path components
- Apply sanitization in main.rs before downloading files
- Display warnings with the before/after names when filenames are modified
- Show summary count of sanitized files
- Add comprehensive unit tests covering edge cases

Fixes #63
1 parent 137d220 commit b8b7076

File tree

2 files changed

+354
-2
lines changed

2 files changed

+354
-2
lines changed

src/main.rs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use colored::*;
1010
use ia_get::archive_metadata::{parse_xml_files, XmlFiles};
1111
use ia_get::constants::USER_AGENT;
1212
use ia_get::downloader;
13-
use ia_get::utils::{create_spinner, validate_archive_url};
13+
use ia_get::utils::{create_spinner, sanitize_filename, validate_archive_url};
1414
use ia_get::Result;
1515
use indicatif::ProgressStyle;
1616
use reqwest::Client; // Add this line
@@ -178,6 +178,7 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
178178
spinner.finish();
179179

180180
// Prepare download data for batch processing
181+
let mut sanitized_count = 0;
181182
let download_data = files
182183
.files
183184
.into_iter()
@@ -186,10 +187,37 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
186187
if let Ok(joined_url) = absolute_url.join(&file.name) {
187188
absolute_url = joined_url;
188189
}
189-
(absolute_url.to_string(), file.name, file.md5)
190+
191+
// Sanitize filename for filesystem compatibility
192+
let (sanitized_name, was_modified) = sanitize_filename(&file.name);
193+
194+
// Warn user if filename was modified
195+
if was_modified {
196+
println!(
197+
"{} {} {} → {}",
198+
"⚠".yellow().bold(),
199+
"Sanitized:".yellow(),
200+
file.name.dimmed(),
201+
sanitized_name.bold()
202+
);
203+
sanitized_count += 1;
204+
}
205+
206+
(absolute_url.to_string(), sanitized_name, file.md5)
190207
})
191208
.collect::<Vec<_>>();
192209

210+
// Show summary if any files were sanitized
211+
if sanitized_count > 0 {
212+
println!(
213+
"\n{} {} {} file{} for filesystem compatibility",
214+
"✓".green().bold(),
215+
"Sanitized".bold(),
216+
sanitized_count.to_string().bold(),
217+
if sanitized_count == 1 { "" } else { "s" }
218+
);
219+
}
220+
193221
// Download all files with integrated signal handling
194222
downloader::download_files(&client, download_data.clone(), download_data.len()).await?;
195223

src/utils.rs

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,327 @@ pub fn format_transfer_rate(bytes_per_sec: f64) -> (f64, &'static str) {
163163
(bytes_per_sec / GB, "GB")
164164
}
165165
}
166+
167+
/// Sanitizes a filename for cross-platform filesystem compatibility
168+
///
169+
/// Replaces characters that are invalid on Windows or Unix filesystems
170+
/// with underscores, while preserving path separators.
171+
///
172+
/// Invalid characters replaced with underscores:
173+
/// - Windows: `< > : " | ? *` and control characters (0-31)
174+
/// - Unix: null character (\0)
175+
/// - Both: leading/trailing spaces, trailing dots in path components
176+
///
177+
/// Also handles Windows reserved names (CON, PRN, AUX, NUL, COM1-9, LPT1-9)
178+
/// by appending an underscore.
179+
///
180+
/// # Arguments
181+
/// * `filename` - The original filename (may include path components separated by `/`)
182+
///
183+
/// # Returns
184+
/// * `(sanitized_filename, was_modified)` - Tuple of cleaned filename and whether it was changed
185+
///
186+
/// # Examples
187+
/// ```
188+
/// use ia_get::utils::sanitize_filename;
189+
///
190+
/// let (sanitized, modified) = sanitize_filename("normal_file.txt");
191+
/// assert_eq!(sanitized, "normal_file.txt");
192+
/// assert!(!modified);
193+
///
194+
/// let (sanitized, modified) = sanitize_filename("file?name.txt");
195+
/// assert_eq!(sanitized, "file_name.txt");
196+
/// assert!(modified);
197+
///
198+
/// let (sanitized, modified) = sanitize_filename("Season 1/Episode?.mp4");
199+
/// assert_eq!(sanitized, "Season 1/Episode_.mp4");
200+
/// assert!(modified);
201+
/// ```
202+
pub fn sanitize_filename(filename: &str) -> (String, bool) {
203+
// Windows reserved names (case-insensitive)
204+
const RESERVED_NAMES: &[&str] = &[
205+
"CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8",
206+
"COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
207+
];
208+
209+
let mut was_modified = false;
210+
let mut result = String::with_capacity(filename.len());
211+
212+
// Process each path component separately to preserve directory structure
213+
let components: Vec<&str> = filename.split('/').collect();
214+
let mut first_component = true;
215+
216+
for component in components.iter() {
217+
// Skip empty components (e.g., from leading/trailing slashes or "//" sequences)
218+
if component.is_empty() {
219+
if !filename.is_empty() {
220+
was_modified = true;
221+
}
222+
continue;
223+
}
224+
225+
// Add separator before non-first components
226+
if !first_component {
227+
result.push('/');
228+
}
229+
first_component = false;
230+
231+
let mut sanitized_component = String::with_capacity(component.len());
232+
233+
// Replace invalid characters
234+
for ch in component.chars() {
235+
match ch {
236+
// Windows invalid characters
237+
'<' | '>' | ':' | '"' | '|' | '?' | '*' => {
238+
sanitized_component.push('_');
239+
was_modified = true;
240+
}
241+
// Backslash (path separator on Windows, invalid in filenames on Unix)
242+
'\\' => {
243+
sanitized_component.push('_');
244+
was_modified = true;
245+
}
246+
// Control characters (0-31) and DEL (127)
247+
'\x00'..='\x1F' | '\x7F' => {
248+
sanitized_component.push('_');
249+
was_modified = true;
250+
}
251+
// Valid character
252+
_ => sanitized_component.push(ch),
253+
}
254+
}
255+
256+
// Trim leading/trailing spaces
257+
let trimmed = sanitized_component.trim();
258+
if trimmed.len() != sanitized_component.len() {
259+
was_modified = true;
260+
sanitized_component = trimmed.to_string();
261+
}
262+
263+
// Trim trailing dots (Windows doesn't allow filenames ending with dots)
264+
let trimmed_dots = sanitized_component.trim_end_matches('.');
265+
if trimmed_dots.len() != sanitized_component.len() {
266+
was_modified = true;
267+
sanitized_component = trimmed_dots.to_string();
268+
}
269+
270+
// Handle empty components after sanitization
271+
if sanitized_component.is_empty() {
272+
sanitized_component = "_".to_string();
273+
was_modified = true;
274+
}
275+
276+
// Check for Windows reserved names
277+
// Split by '.' to check the base name (before extension)
278+
let dot_pos = sanitized_component.find('.');
279+
let base_name = if let Some(pos) = dot_pos {
280+
&sanitized_component[..pos]
281+
} else {
282+
&sanitized_component
283+
};
284+
285+
if RESERVED_NAMES
286+
.iter()
287+
.any(|&reserved| base_name.eq_ignore_ascii_case(reserved))
288+
{
289+
// Insert underscore after base name, before extension
290+
if let Some(pos) = dot_pos {
291+
sanitized_component.insert(pos, '_');
292+
} else {
293+
sanitized_component.push('_');
294+
}
295+
was_modified = true;
296+
}
297+
298+
result.push_str(&sanitized_component);
299+
}
300+
301+
// Remove trailing slash if present (unless it's just "/")
302+
if result.len() > 1 && result.ends_with('/') {
303+
result.pop();
304+
was_modified = true;
305+
}
306+
307+
// Check if result differs from original
308+
if !was_modified {
309+
was_modified = result != filename;
310+
}
311+
312+
(result, was_modified)
313+
}
314+
315+
#[cfg(test)]
316+
mod tests {
317+
use super::*;
318+
319+
#[test]
320+
fn test_sanitize_valid_filename() {
321+
let (result, modified) = sanitize_filename("normal_file-name.txt");
322+
assert_eq!(result, "normal_file-name.txt");
323+
assert!(!modified);
324+
}
325+
326+
#[test]
327+
fn test_sanitize_valid_filename_with_path() {
328+
let (result, modified) = sanitize_filename("folder/subfolder/file.txt");
329+
assert_eq!(result, "folder/subfolder/file.txt");
330+
assert!(!modified);
331+
}
332+
333+
#[test]
334+
fn test_sanitize_invalid_characters() {
335+
let (result, modified) = sanitize_filename("file?name:test<>.txt");
336+
assert_eq!(result, "file_name_test__.txt");
337+
assert!(modified);
338+
}
339+
340+
#[test]
341+
fn test_sanitize_question_mark() {
342+
let (result, modified) = sanitize_filename("Episode?.mp4");
343+
assert_eq!(result, "Episode_.mp4");
344+
assert!(modified);
345+
}
346+
347+
#[test]
348+
fn test_sanitize_with_path() {
349+
let (result, modified) = sanitize_filename("Season 1/Episode?.mp4");
350+
assert_eq!(result, "Season 1/Episode_.mp4");
351+
assert!(modified);
352+
}
353+
354+
#[test]
355+
fn test_sanitize_multiple_invalid_in_path() {
356+
let (result, modified) = sanitize_filename("Folder:Name/File*Name?.txt");
357+
assert_eq!(result, "Folder_Name/File_Name_.txt");
358+
assert!(modified);
359+
}
360+
361+
#[test]
362+
fn test_sanitize_windows_reserved_names() {
363+
let (result, modified) = sanitize_filename("CON.txt");
364+
assert_eq!(result, "CON_.txt");
365+
assert!(modified);
366+
367+
let (result, modified) = sanitize_filename("con.txt");
368+
assert_eq!(result, "con_.txt");
369+
assert!(modified);
370+
371+
let (result, modified) = sanitize_filename("PRN");
372+
assert_eq!(result, "PRN_");
373+
assert!(modified);
374+
375+
let (result, modified) = sanitize_filename("aux.log");
376+
assert_eq!(result, "aux_.log");
377+
assert!(modified);
378+
379+
let (result, modified) = sanitize_filename("COM1.dat");
380+
assert_eq!(result, "COM1_.dat");
381+
assert!(modified);
382+
383+
let (result, modified) = sanitize_filename("LPT9.txt");
384+
assert_eq!(result, "LPT9_.txt");
385+
assert!(modified);
386+
}
387+
388+
#[test]
389+
fn test_sanitize_reserved_in_path() {
390+
let (result, modified) = sanitize_filename("folder/CON.txt");
391+
assert_eq!(result, "folder/CON_.txt");
392+
assert!(modified);
393+
}
394+
395+
#[test]
396+
fn test_sanitize_control_characters() {
397+
let (result, modified) = sanitize_filename("file\x00\x1fname.txt");
398+
assert_eq!(result, "file__name.txt");
399+
assert!(modified);
400+
401+
let (result, modified) = sanitize_filename("test\x7Ffile.txt");
402+
assert_eq!(result, "test_file.txt");
403+
assert!(modified);
404+
}
405+
406+
#[test]
407+
fn test_sanitize_backslash() {
408+
let (result, modified) = sanitize_filename("folder\\file.txt");
409+
assert_eq!(result, "folder_file.txt");
410+
assert!(modified);
411+
}
412+
413+
#[test]
414+
fn test_sanitize_whitespace_edge_cases() {
415+
let (result, modified) = sanitize_filename(" leading.txt ");
416+
assert_eq!(result, "leading.txt");
417+
assert!(modified);
418+
419+
let (result, modified) = sanitize_filename("folder/ spaces /file.txt");
420+
assert_eq!(result, "folder/spaces/file.txt");
421+
assert!(modified);
422+
}
423+
424+
#[test]
425+
fn test_sanitize_trailing_dots() {
426+
let (result, modified) = sanitize_filename("file...");
427+
assert_eq!(result, "file");
428+
assert!(modified);
429+
430+
let (result, modified) = sanitize_filename("folder./file.txt");
431+
assert_eq!(result, "folder/file.txt");
432+
assert!(modified);
433+
}
434+
435+
#[test]
436+
fn test_sanitize_empty_components() {
437+
let (result, modified) = sanitize_filename("folder//file.txt");
438+
assert_eq!(result, "folder/file.txt");
439+
assert!(modified);
440+
441+
let (result, modified) = sanitize_filename("/folder/file.txt");
442+
assert_eq!(result, "folder/file.txt");
443+
assert!(modified);
444+
445+
let (result, modified) = sanitize_filename("folder/file.txt/");
446+
assert_eq!(result, "folder/file.txt");
447+
assert!(modified);
448+
}
449+
450+
#[test]
451+
fn test_sanitize_all_invalid() {
452+
let (result, modified) = sanitize_filename("???");
453+
assert_eq!(result, "___");
454+
assert!(modified);
455+
}
456+
457+
#[test]
458+
fn test_sanitize_unicode() {
459+
let (result, modified) = sanitize_filename("файл.txt");
460+
assert_eq!(result, "файл.txt");
461+
assert!(!modified);
462+
463+
let (result, modified) = sanitize_filename("文件.txt");
464+
assert_eq!(result, "文件.txt");
465+
assert!(!modified);
466+
467+
let (result, modified) = sanitize_filename("emoji😀.txt");
468+
assert_eq!(result, "emoji😀.txt");
469+
assert!(!modified);
470+
}
471+
472+
#[test]
473+
fn test_sanitize_mixed_valid_invalid() {
474+
let (result, modified) =
475+
sanitize_filename("Red vs. Blue - Season 1/Episode 1: Why Are We Here?.mp4");
476+
assert_eq!(
477+
result,
478+
"Red vs. Blue - Season 1/Episode 1_ Why Are We Here_.mp4"
479+
);
480+
assert!(modified);
481+
}
482+
483+
#[test]
484+
fn test_sanitize_preserves_extension() {
485+
let (result, modified) = sanitize_filename("file:name.tar.gz");
486+
assert_eq!(result, "file_name.tar.gz");
487+
assert!(modified);
488+
}
489+
}

0 commit comments

Comments
 (0)