diff --git a/.gitignore b/.gitignore index b4698191c..5ba73d432 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,4 @@ collab-plugins/.env.test **/.DS_Store .claude - +.serena diff --git a/collab-importer/src/zip_tool/async_zip.rs b/collab-importer/src/zip_tool/async_zip.rs index 45ec46477..383f72549 100644 --- a/collab-importer/src/zip_tool/async_zip.rs +++ b/collab-importer/src/zip_tool/async_zip.rs @@ -5,7 +5,7 @@ use async_zip::{StringEncoding, ZipString}; use futures::AsyncReadExt as FuturesAsyncReadExt; use futures::io::AsyncBufRead; use std::ffi::OsString; -use std::{io, str}; +use std::str; use std::path::{Path, PathBuf}; use tokio::fs; @@ -19,8 +19,11 @@ use tokio_util::compat::TokioAsyncReadCompatExt; use tokio_util::compat::TokioAsyncWriteCompatExt; use crate::error::ImporterError; -use crate::zip_tool::util::{is_multi_part_zip_signature, remove_part_suffix, sanitize_file_path}; -use tracing::error; +use crate::zip_tool::util::{ + has_multi_part_extension, has_multi_part_suffix, is_multi_part_zip_signature, remove_part_suffix, + sanitize_file_path, +}; +use tracing::{error, warn}; pub struct UnzipFile { pub file_name: String, @@ -74,12 +77,20 @@ where if buffer.len() >= 4 { if let Ok(four_bytes) = buffer[..4].try_into() { if is_multi_part_zip_signature(four_bytes) { - if let Some(file_name) = - Path::new(&filename).file_stem().and_then(|s| s.to_str()) - { - root_dir = Some(remove_part_suffix(file_name)); + let is_multipart_candidate = filename.contains('/') + || has_multi_part_extension(&filename) + || has_multi_part_suffix(&filename); + + if root_dir.is_none() && is_multipart_candidate { + if let Some(file_name) = + Path::new(&filename).file_stem().and_then(|s| s.to_str()) + { + root_dir = Some(remove_part_suffix(file_name)); + } + } + if is_multipart_candidate { + parts.push(output_path.clone()); } - parts.push(output_path.clone()); } } } @@ -130,54 +141,39 @@ where None => match default_file_name { None => Err(ImporterError::FileNotFound), Some(default_file_name) => { - let new_out_dir = out_dir - .parent() - .ok_or_else(|| ImporterError::FileNotFound)? - .join(uuid::Uuid::new_v4().to_string()) - .join(&default_file_name); - move_all(&out_dir, &new_out_dir).await?; - let _ = fs::remove_dir_all(&out_dir).await; + if fs::metadata(&out_dir).await.is_err() { + fs::create_dir_all(&out_dir) + .await + .with_context(|| format!("Failed to create output directory: {}", out_dir.display()))?; + } Ok(UnzipFile { file_name: default_file_name.clone(), - unzip_dir_path: new_out_dir, + unzip_dir_path: out_dir, parts, }) }, }, - Some(file_name) => Ok(UnzipFile { - file_name: file_name.clone(), - unzip_dir_path: out_dir.join(file_name), - parts, - }), - } -} - -#[async_recursion] -async fn move_all(old_path: &Path, new_path: &Path) -> io::Result<()> { - if !new_path.exists() { - fs::create_dir_all(new_path).await?; - } - - let mut read_dir = fs::read_dir(old_path).await?; - while let Some(entry) = read_dir.next_entry().await? { - let path = entry.path(); - let file_name = match path.file_name() { - Some(name) => name, - None => continue, - }; - - let new_file_path = new_path.join(file_name); - if path.is_dir() { - if !new_file_path.exists() { - fs::create_dir_all(&new_file_path).await?; + Some(file_name) => { + let target_dir = out_dir.join(&file_name); + if fs::metadata(&target_dir).await.is_err() { + warn!( + "Root directory {:?} missing after unzip; falling back to {:?}", + target_dir, out_dir + ); + return Ok(UnzipFile { + file_name, + unzip_dir_path: out_dir, + parts, + }); } - move_all(&path, &new_file_path).await?; - fs::remove_dir_all(&path).await?; - } else if path.is_file() { - fs::rename(&path, &new_file_path).await?; - } + + Ok(UnzipFile { + file_name: file_name.clone(), + unzip_dir_path: target_dir, + parts, + }) + }, } - Ok(()) } pub fn get_filename_from_zip_string(zip_string: &ZipString) -> Result { diff --git a/collab-importer/src/zip_tool/sync_zip.rs b/collab-importer/src/zip_tool/sync_zip.rs index 8df2b2aea..79660ad10 100644 --- a/collab-importer/src/zip_tool/sync_zip.rs +++ b/collab-importer/src/zip_tool/sync_zip.rs @@ -1,5 +1,8 @@ use crate::error::ImporterError; -use crate::zip_tool::util::{is_multi_part_zip_signature, remove_part_suffix, sanitize_file_path}; +use crate::zip_tool::util::{ + has_multi_part_extension, has_multi_part_suffix, is_multi_part_zip_signature, remove_part_suffix, + sanitize_file_path, +}; use anyhow::{Result, anyhow}; use std::fs::{File, OpenOptions}; @@ -17,7 +20,7 @@ pub struct UnzipFile { pub fn sync_unzip( file_path: PathBuf, - mut out_dir: PathBuf, + out_dir: PathBuf, default_file_name: Option, ) -> Result { let file = File::open(file_path) @@ -37,14 +40,9 @@ pub fn sync_unzip( } } - if root_dir.is_none() { - if let Some(default_name) = &default_file_name { - out_dir = out_dir.join(default_name); - if !out_dir.exists() { - fs::create_dir_all(&out_dir) - .map_err(|e| ImporterError::Internal(anyhow!("Failed to create dir: {:?}", e)))?; - } - } + if !out_dir.exists() { + fs::create_dir_all(&out_dir) + .map_err(|e| ImporterError::Internal(anyhow!("Failed to create dir: {:?}", e)))?; } // Iterate through each file in the archive @@ -92,10 +90,18 @@ pub fn sync_unzip( if buffer.len() >= 4 { let four_bytes: [u8; 4] = buffer[..4].try_into().unwrap(); if is_multi_part_zip_signature(&four_bytes) { - if let Some(file_name) = Path::new(&filename).file_stem().and_then(|s| s.to_str()) { - root_dir = Some(remove_part_suffix(file_name)); + let is_multipart_candidate = filename.contains('/') + || has_multi_part_extension(&filename) + || has_multi_part_suffix(&filename); + + if root_dir.is_none() && is_multipart_candidate { + if let Some(file_name) = Path::new(&filename).file_stem().and_then(|s| s.to_str()) { + root_dir = Some(remove_part_suffix(file_name)); + } + } + if is_multipart_candidate { + parts.push(output_path.clone()); } - parts.push(output_path.clone()); } } @@ -130,11 +136,26 @@ pub fn sync_unzip( parts, }), }, - Some(root_dir) => Ok(UnzipFile { - dir_name: root_dir.clone(), - unzip_dir: out_dir.join(root_dir), - parts, - }), + Some(root_dir) => { + let target_dir = out_dir.join(&root_dir); + if !target_dir.exists() { + warn!( + "Root directory {:?} missing after unzip; falling back to {:?}", + target_dir, out_dir + ); + return Ok(UnzipFile { + dir_name: root_dir, + unzip_dir: out_dir, + parts, + }); + } + + Ok(UnzipFile { + dir_name: root_dir.clone(), + unzip_dir: target_dir, + parts, + }) + }, } } diff --git a/collab-importer/src/zip_tool/util.rs b/collab-importer/src/zip_tool/util.rs index 7175a14a2..ad26345a6 100644 --- a/collab-importer/src/zip_tool/util.rs +++ b/collab-importer/src/zip_tool/util.rs @@ -35,6 +35,32 @@ pub fn sanitize_file_path(path: &str) -> PathBuf { .collect() } +/// Determine whether the provided file path ends with a known multi-part archive extension. +pub fn has_multi_part_extension(file_name: &str) -> bool { + Path::new(file_name) + .extension() + .and_then(|ext| ext.to_str()) + .map(|ext| { + let ext = ext.to_ascii_lowercase(); + (ext.starts_with('z') && ext.chars().skip(1).all(|c| c.is_ascii_digit())) + || (ext.starts_with("part") && ext.chars().skip(4).all(|c| c.is_ascii_digit())) + }) + .unwrap_or(false) +} + +/// Identify multi-part style suffixes that appear before the file extension. +pub fn has_multi_part_suffix(file_name: &str) -> bool { + if let Some(file_name) = Path::new(file_name).file_name().and_then(|s| s.to_str()) { + let patterns = [r"(?i)-part-\d+", r"(?i)\.part\d+", r"(?i)\.z\d{2}"]; + return patterns.iter().any(|pattern| { + let re = Regex::new(pattern).unwrap(); + re.is_match(file_name).unwrap_or(false) + }); + } + + false +} + pub fn remove_part_suffix(file_name: &str) -> String { let path = Path::new(file_name); if let Some(stem) = path.file_stem() { @@ -97,4 +123,20 @@ mod tests { ); } } + + #[test] + fn test_has_multi_part_extension() { + assert!(has_multi_part_extension("Export-Part.z01")); + assert!(has_multi_part_extension("export.part1")); + assert!(!has_multi_part_extension("Attachment.zip")); + assert!(!has_multi_part_extension("regular.txt")); + } + + #[test] + fn test_has_multi_part_suffix() { + assert!(has_multi_part_suffix("Export-Part-1.zip")); + assert!(has_multi_part_suffix("nested/Export.Part2")); + assert!(!has_multi_part_suffix("Attachment.zip")); + assert!(!has_multi_part_suffix("duplicate(1).zip")); + } } diff --git a/collab-importer/tests/main.rs b/collab-importer/tests/main.rs index 825438b02..f0cab2d42 100644 --- a/collab-importer/tests/main.rs +++ b/collab-importer/tests/main.rs @@ -1,3 +1,4 @@ mod notion_test; mod util; mod workspace; +mod zip_tool; diff --git a/collab-importer/tests/zip_tool.rs b/collab-importer/tests/zip_tool.rs new file mode 100644 index 000000000..8cc441282 --- /dev/null +++ b/collab-importer/tests/zip_tool.rs @@ -0,0 +1,165 @@ +use anyhow::Result; +use std::io::{Cursor, Write}; +use std::path::Path; + +use collab_importer::zip_tool::async_zip::async_unzip; +use collab_importer::zip_tool::sync_zip::sync_unzip; +use tempfile::tempdir; +use tokio_util::compat::TokioAsyncReadCompatExt; +use zip::CompressionMethod; +use zip::ZipWriter; +use zip::write::FileOptions; + +const ROOT_DIR: &str = "ExportBlock-7365b89b"; +const FALLBACK_DIR: &str = "fallback"; + +#[test] +fn sync_unzip_preserves_root_directory_with_nested_zip() -> Result<()> { + let temp = tempdir()?; + let zip_path = temp.path().join("root_dir_with_nested.zip"); + create_zip_with_root_dir(&zip_path)?; + + let output_dir = temp.path().join("output"); + std::fs::create_dir_all(&output_dir)?; + + let unzip_file = sync_unzip( + zip_path.clone(), + output_dir.clone(), + Some(FALLBACK_DIR.to_string()), + )?; + + let expected_root = output_dir.join(ROOT_DIR); + assert_eq!(unzip_file.dir_name, ROOT_DIR); + assert_eq!(unzip_file.unzip_dir, expected_root); + assert!(unzip_file.unzip_dir.is_dir()); + + Ok(()) +} + +#[test] +fn sync_unzip_falls_back_when_root_directory_missing() -> Result<()> { + let temp = tempdir()?; + let zip_path = temp.path().join("missing_root.zip"); + create_zip_without_root_dir(&zip_path)?; + + let output_dir = temp.path().join("output"); + std::fs::create_dir_all(&output_dir)?; + + let unzip_file = sync_unzip( + zip_path.clone(), + output_dir.clone(), + Some(FALLBACK_DIR.to_string()), + )?; + + assert_eq!(unzip_file.dir_name, FALLBACK_DIR); + assert_eq!(unzip_file.unzip_dir, output_dir); + assert!(unzip_file.unzip_dir.is_dir()); + + Ok(()) +} + +#[tokio::test] +async fn async_unzip_preserves_root_directory_with_nested_zip() -> Result<()> { + let temp = tempdir()?; + let zip_path = temp.path().join("async_root_dir_with_nested.zip"); + create_zip_with_root_dir(&zip_path)?; + + let file = tokio::fs::File::open(&zip_path).await?; + let reader = tokio::io::BufReader::new(file).compat(); + let zip_reader = async_zip::base::read::stream::ZipFileReader::new(reader); + + let output_dir = temp.path().join("async_output"); + tokio::fs::create_dir_all(&output_dir).await?; + + let unzip_file = async_unzip( + zip_reader, + output_dir.clone(), + Some(FALLBACK_DIR.to_string()), + ) + .await?; + + let expected_root = output_dir.join(ROOT_DIR); + assert_eq!(unzip_file.file_name, ROOT_DIR); + assert_eq!(unzip_file.unzip_dir_path, expected_root); + assert!( + tokio::fs::metadata(unzip_file.unzip_dir_path.clone()) + .await? + .is_dir() + ); + + Ok(()) +} + +#[tokio::test] +async fn async_unzip_falls_back_when_root_directory_missing() -> Result<()> { + let temp = tempdir()?; + let zip_path = temp.path().join("async_missing_root.zip"); + create_zip_without_root_dir(&zip_path)?; + + let file = tokio::fs::File::open(&zip_path).await?; + let reader = tokio::io::BufReader::new(file).compat(); + let zip_reader = async_zip::base::read::stream::ZipFileReader::new(reader); + + let output_dir = temp.path().join("async_output"); + tokio::fs::create_dir_all(&output_dir).await?; + + let unzip_file = async_unzip( + zip_reader, + output_dir.clone(), + Some(FALLBACK_DIR.to_string()), + ) + .await?; + + assert_eq!(unzip_file.file_name, FALLBACK_DIR); + assert_eq!(unzip_file.unzip_dir_path, output_dir); + assert!( + tokio::fs::metadata(&unzip_file.unzip_dir_path) + .await? + .is_dir() + ); + + Ok(()) +} + +fn create_zip_with_root_dir(zip_path: &Path) -> Result<()> { + let file = std::fs::File::create(zip_path)?; + let mut writer = ZipWriter::new(file); + let options = FileOptions::default().compression_method(CompressionMethod::Stored); + + writer.add_directory(format!("{ROOT_DIR}/"), options)?; + writer.start_file(format!("{ROOT_DIR}/{ROOT_DIR}.md"), options)?; + writer.write_all(b"page content")?; + + writer.start_file(format!("{ROOT_DIR}/Attachment.zip"), options)?; + writer.write_all(&create_nested_zip_bytes("nested.txt"))?; + + writer.finish()?; + Ok(()) +} + +fn create_zip_without_root_dir(zip_path: &Path) -> Result<()> { + let file = std::fs::File::create(zip_path)?; + let mut writer = ZipWriter::new(file); + let options = FileOptions::default().compression_method(CompressionMethod::Stored); + + writer.start_file("TopLevel.md", options)?; + writer.write_all(b"page content")?; + + writer.start_file("Attachment.zip", options)?; + writer.write_all(&create_nested_zip_bytes("nested.txt"))?; + + writer.finish()?; + Ok(()) +} + +fn create_nested_zip_bytes(name: &str) -> Vec { + let cursor = Cursor::new(Vec::new()); + let mut nested_writer = ZipWriter::new(cursor); + let options = FileOptions::default().compression_method(CompressionMethod::Stored); + + nested_writer.start_file(name, options).unwrap(); + nested_writer.write_all(b"nested content").unwrap(); + + let cursor = nested_writer.finish().unwrap(); + cursor.into_inner() +}