Skip to content

Commit acae683

Browse files
committed
Merge remote-tracking branch 'origin/main' into g/v1-merge-main
2 parents ec1b26c + a381cb1 commit acae683

File tree

5 files changed

+131
-5
lines changed

5 files changed

+131
-5
lines changed

Cargo.lock

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ derivative = "2.2.0"
3030
encoding_rs = "0.8.35"
3131
env_logger = "0.11.8"
3232
futures = "0.3.31"
33+
globset = "0.4.18"
3334
hashlink = "0.11"
3435
hex = "0.4.3"
3536
indexmap = { version = "2.12.1", features = ["serde"] }
@@ -39,11 +40,7 @@ itertools = "0.14.0"
3940
log = "0.4.28"
4041
numpy = "0.27.0"
4142
pgvector = { version = "0.4.1", features = ["sqlx", "halfvec"] }
42-
pyo3 = { version = "0.27.1", features = [
43-
"auto-initialize",
44-
"chrono",
45-
"uuid",
46-
] }
43+
pyo3 = { version = "0.27.1", features = ["auto-initialize", "chrono", "uuid"] }
4744
pyo3-async-runtimes = { version = "0.27.0", features = ["tokio-runtime"] }
4845
pythonize = "0.27.0"
4946
rand = "0.9.2"

rust/ops_text/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ rust-version = { workspace = true }
66
license = { workspace = true }
77

88
[dependencies]
9+
anyhow = { workspace = true }
10+
globset = { workspace = true }
911
regex = { workspace = true }
1012
unicase = { workspace = true }
1113

rust/ops_text/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@
55
//! - Text splitting by separators
66
//! - Recursive text chunking with syntax awareness
77
8+
pub mod pattern_matcher;
89
pub mod prog_langs;
910
pub mod split;
rust/ops_text/src/pattern_matcher.rs

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
use anyhow::Result;
2+
use globset::{Glob, GlobSet, GlobSetBuilder};
3+
4+
/// Builds a GlobSet from a vector of pattern strings
5+
fn build_glob_set(patterns: Vec<String>) -> Result<GlobSet> {
6+
let mut builder = GlobSetBuilder::new();
7+
for pattern in patterns {
8+
builder.add(Glob::new(pattern.as_str())?);
9+
}
10+
Ok(builder.build()?)
11+
}
12+
13+
/// Pattern matcher that handles include and exclude patterns for files
14+
#[derive(Debug)]
15+
pub struct PatternMatcher {
16+
/// Patterns matching full path of files to be included.
17+
included_glob_set: Option<GlobSet>,
18+
/// Patterns matching full path of files and directories to be excluded.
19+
/// If a directory is excluded, all files and subdirectories within it are also excluded.
20+
excluded_glob_set: Option<GlobSet>,
21+
}
22+
23+
impl PatternMatcher {
24+
/// Create a new PatternMatcher from optional include and exclude pattern vectors
25+
pub fn new(
26+
included_patterns: Option<Vec<String>>,
27+
excluded_patterns: Option<Vec<String>>,
28+
) -> Result<Self> {
29+
let included_glob_set = included_patterns.map(build_glob_set).transpose()?;
30+
let excluded_glob_set = excluded_patterns.map(build_glob_set).transpose()?;
31+
32+
Ok(Self {
33+
included_glob_set,
34+
excluded_glob_set,
35+
})
36+
}
37+
38+
/// Check if a file or directory is excluded by the exclude patterns
39+
/// Can be called on directories to prune traversal on excluded directories.
40+
pub fn is_excluded(&self, path: &str) -> bool {
41+
self.excluded_glob_set
42+
.as_ref()
43+
.is_some_and(|glob_set| glob_set.is_match(path))
44+
}
45+
46+
/// Check if a file should be included based on both include and exclude patterns
47+
/// Should be called for each file.
48+
pub fn is_file_included(&self, path: &str) -> bool {
49+
self.included_glob_set
50+
.as_ref()
51+
.is_none_or(|glob_set| glob_set.is_match(path))
52+
&& !self.is_excluded(path)
53+
}
54+
}
55+
56+
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into the owned pattern list that `PatternMatcher::new` takes.
    fn globs(raw: &[&str]) -> Option<Vec<String>> {
        Some(raw.iter().map(|pattern| pattern.to_string()).collect())
    }

    /// With no patterns at all, every file is included and nothing is excluded.
    #[test]
    fn test_pattern_matcher_no_patterns() {
        let matcher = PatternMatcher::new(None, None).unwrap();

        for path in ["test.txt", "path/to/file.rs"] {
            assert!(matcher.is_file_included(path));
        }
        assert!(!matcher.is_excluded("anything"));
    }

    /// Include patterns alone admit only the files that match one of them.
    #[test]
    fn test_pattern_matcher_include_only() {
        let matcher = PatternMatcher::new(globs(&["*.txt", "*.rs"]), None).unwrap();

        assert!(matcher.is_file_included("test.txt"));
        assert!(matcher.is_file_included("main.rs"));
        assert!(!matcher.is_file_included("image.png"));
    }

    /// Exclude patterns alone reject only the files that match one of them.
    #[test]
    fn test_pattern_matcher_exclude_only() {
        let matcher = PatternMatcher::new(None, globs(&["*.tmp", "*.log"])).unwrap();

        assert!(matcher.is_file_included("test.txt"));
        assert!(!matcher.is_file_included("temp.tmp"));
        assert!(!matcher.is_file_included("debug.log"));
    }

    /// When both filters are set, an exclude match overrides an include match.
    #[test]
    fn test_pattern_matcher_both_patterns() {
        let matcher = PatternMatcher::new(globs(&["*.txt"]), globs(&["*temp*"])).unwrap();

        assert!(matcher.is_file_included("test.txt"));
        // Matches the include pattern but is rejected by the exclude pattern.
        assert!(!matcher.is_file_included("temp.txt"));
        // Never matches the include pattern in the first place.
        assert!(!matcher.is_file_included("main.rs"));
    }
}

0 commit comments

Comments
 (0)