diff --git a/Cargo.lock b/Cargo.lock
index 4531ba24a..23dd1d919 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -326,6 +326,16 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "bstr"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.19.0"
@@ -464,6 +474,8 @@ dependencies = [
 name = "cocoindex_ops_text"
 version = "999.0.0"
 dependencies = [
+ "anyhow",
+ "globset",
  "regex",
  "tree-sitter",
  "tree-sitter-c",
@@ -1162,6 +1174,19 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
 
+[[package]]
+name = "globset"
+version = "0.4.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3"
+dependencies = [
+ "aho-corasick",
+ "bstr",
+ "log",
+ "regex-automata",
+ "regex-syntax",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
diff --git a/Cargo.toml b/Cargo.toml
index e7c31ff08..d2de559b1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ derivative = "2.2.0"
 encoding_rs = "0.8.35"
 env_logger = "0.11.8"
 futures = "0.3.31"
+globset = "0.4.18"
 hashlink = "0.11"
 hex = "0.4.3"
 indexmap = { version = "2.12.1", features = ["serde"] }
@@ -39,11 +40,7 @@ itertools = "0.14.0"
 log = "0.4.28"
 numpy = "0.27.0"
 pgvector = { version = "0.4.1", features = ["sqlx", "halfvec"] }
-pyo3 = { version = "0.27.1", features = [
-    "auto-initialize",
-    "chrono",
-    "uuid",
-] }
+pyo3 = { version = "0.27.1", features = ["auto-initialize", "chrono", "uuid"] }
 pyo3-async-runtimes = { version = "0.27.0", features = ["tokio-runtime"] }
 pythonize = "0.27.0"
 rand = "0.9.2"
diff --git a/rust/ops_text/Cargo.toml b/rust/ops_text/Cargo.toml
index 597784a68..6949abbfb 100644
--- a/rust/ops_text/Cargo.toml
+++ b/rust/ops_text/Cargo.toml
@@ -6,6 +6,8 @@ rust-version = { workspace = true }
 license = { workspace = true }
 
 [dependencies]
+anyhow = { workspace = true }
+globset = { workspace = true }
 regex = { workspace = true }
 unicase = { workspace = true }
 
diff --git a/rust/ops_text/src/lib.rs b/rust/ops_text/src/lib.rs
index aed6d55e7..67b89497c 100644
--- a/rust/ops_text/src/lib.rs
+++ b/rust/ops_text/src/lib.rs
@@ -5,5 +5,6 @@
 //! - Text splitting by separators
 //! - Recursive text chunking with syntax awareness
 
+pub mod pattern_matcher;
 pub mod prog_langs;
 pub mod split;
diff --git a/rust/ops_text/src/pattern_matcher.rs b/rust/ops_text/src/pattern_matcher.rs
new file mode 100644
index 000000000..0f8fca443
--- /dev/null
+++ b/rust/ops_text/src/pattern_matcher.rs
@@ -0,0 +1,101 @@
+use anyhow::Result;
+use globset::{Glob, GlobSet, GlobSetBuilder};
+
+/// Builds a GlobSet from a vector of pattern strings
+fn build_glob_set(patterns: Vec<String>) -> Result<GlobSet> {
+    let mut builder = GlobSetBuilder::new();
+    for pattern in patterns {
+        builder.add(Glob::new(pattern.as_str())?);
+    }
+    Ok(builder.build()?)
+}
+
+/// Pattern matcher that handles include and exclude patterns for files
+#[derive(Debug)]
+pub struct PatternMatcher {
+    /// Patterns matching full path of files to be included.
+    included_glob_set: Option<GlobSet>,
+    /// Patterns matching full path of files and directories to be excluded.
+    /// If a directory is excluded, all files and subdirectories within it are also excluded.
+    excluded_glob_set: Option<GlobSet>,
+}
+
+impl PatternMatcher {
+    /// Create a new PatternMatcher from optional include and exclude pattern vectors
+    pub fn new(
+        included_patterns: Option<Vec<String>>,
+        excluded_patterns: Option<Vec<String>>,
+    ) -> Result<Self> {
+        let included_glob_set = included_patterns.map(build_glob_set).transpose()?;
+        let excluded_glob_set = excluded_patterns.map(build_glob_set).transpose()?;
+
+        Ok(Self {
+            included_glob_set,
+            excluded_glob_set,
+        })
+    }
+
+    /// Check if a file or directory is excluded by the exclude patterns
+    /// Can be called on directories to prune traversal on excluded directories.
+    pub fn is_excluded(&self, path: &str) -> bool {
+        self.excluded_glob_set
+            .as_ref()
+            .is_some_and(|glob_set| glob_set.is_match(path))
+    }
+
+    /// Check if a file should be included based on both include and exclude patterns
+    /// Should be called for each file.
+    pub fn is_file_included(&self, path: &str) -> bool {
+        self.included_glob_set
+            .as_ref()
+            .is_none_or(|glob_set| glob_set.is_match(path))
+            && !self.is_excluded(path)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pattern_matcher_no_patterns() {
+        let matcher = PatternMatcher::new(None, None).unwrap();
+        assert!(matcher.is_file_included("test.txt"));
+        assert!(matcher.is_file_included("path/to/file.rs"));
+        assert!(!matcher.is_excluded("anything"));
+    }
+
+    #[test]
+    fn test_pattern_matcher_include_only() {
+        let matcher =
+            PatternMatcher::new(Some(vec!["*.txt".to_string(), "*.rs".to_string()]), None).unwrap();
+
+        assert!(matcher.is_file_included("test.txt"));
+        assert!(matcher.is_file_included("main.rs"));
+        assert!(!matcher.is_file_included("image.png"));
+    }
+
+    #[test]
+    fn test_pattern_matcher_exclude_only() {
+        let matcher =
+            PatternMatcher::new(None, Some(vec!["*.tmp".to_string(), "*.log".to_string()]))
+                .unwrap();
+
+        assert!(matcher.is_file_included("test.txt"));
+        assert!(!matcher.is_file_included("temp.tmp"));
+        assert!(!matcher.is_file_included("debug.log"));
+    }
+
+    #[test]
+    fn test_pattern_matcher_both_patterns() {
+        let matcher = PatternMatcher::new(
+            Some(vec!["*.txt".to_string()]),
+            Some(vec!["*temp*".to_string()]),
+        )
+        .unwrap();
+
+        assert!(matcher.is_file_included("test.txt"));
+        assert!(!matcher.is_file_included("temp.txt")); // excluded despite matching include
+        assert!(!matcher.is_file_included("main.rs")); // doesn't match include
+    }
+}