From 4c8a78593c4157d882b72c32b4db6654ebfee9ef Mon Sep 17 00:00:00 2001 From: LJ Date: Sat, 15 Mar 2025 13:48:07 -0700 Subject: [PATCH] Support included/excluded patterns in `LocalFile` --- Cargo.toml | 1 + python/cocoindex/sources.py | 8 +++++++ src/ops/sources/local_file.rs | 43 +++++++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5faf94f25..938660657 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,3 +56,4 @@ tree-sitter-python = "0.23.6" tree-sitter-javascript = "0.23.1" tree-sitter-typescript = "0.23.2" tree-sitter-md = "0.3.2" +globset = "0.4.16" diff --git a/python/cocoindex/sources.py b/python/cocoindex/sources.py index 1312abb42..6023f53e5 100644 --- a/python/cocoindex/sources.py +++ b/python/cocoindex/sources.py @@ -8,3 +8,11 @@ class LocalFile(op.SourceSpec): path: str binary: bool = False + + # If provided, only files matching these patterns will be included. + # See https://docs.rs/globset/latest/globset/index.html for the syntax of the patterns. + included_patterns: list[str] | None = None + + # If provided, files matching these patterns will be excluded. + # See https://docs.rs/globset/latest/globset/index.html for the syntax of the patterns. + excluded_patterns: list[str] | None = None diff --git a/src/ops/sources/local_file.rs b/src/ops/sources/local_file.rs index 6918639b6..47e9fa6b9 100644 --- a/src/ops/sources/local_file.rs +++ b/src/ops/sources/local_file.rs @@ -1,3 +1,4 @@ +use globset::{Glob, GlobSet, GlobSetBuilder}; use log::warn; use std::{path::PathBuf, sync::Arc}; @@ -7,12 +8,16 @@ use crate::{fields_value, ops::sdk::*}; pub struct Spec { path: String, binary: bool, + included_patterns: Option>, + excluded_patterns: Option>, } struct Executor { root_path_str: String, root_path: PathBuf, binary: bool, + included_glob_set: Option, + excluded_glob_set: Option, } impl Executor { @@ -20,16 +25,26 @@ impl Executor { for entry in std::fs::read_dir(dir_path)? { let entry = entry?; let path = entry.path(); - if path.is_dir() { - Box::pin(self.traverse_dir(&path, result)).await?; - } else { - if let Some(file_name) = path.to_str() { - result.push(KeyValue::Str(Arc::from( - &file_name[self.root_path_str.len() + 1..], - ))); - } else { - warn!("Skipped ill-formed file path: {}", path.display()); + if let Some(file_name) = path.to_str() { + let relative_path = &file_name[self.root_path_str.len() + 1..]; + if self + .excluded_glob_set + .as_ref() + .map_or(false, |glob_set| glob_set.is_match(relative_path)) + { + continue; } + if path.is_dir() { + Box::pin(self.traverse_dir(&path, result)).await?; + } else if self + .included_glob_set + .as_ref() + .map_or(true, |glob_set| glob_set.is_match(relative_path)) + { + result.push(KeyValue::Str(Arc::from(relative_path))); + } + } else { + warn!("Skipped ill-formed file path: {}", path.display()); } } Ok(()) @@ -102,6 +117,16 @@ impl SourceFactoryBase for Factory { root_path_str: spec.path.clone(), root_path: PathBuf::from(spec.path), binary: spec.binary, + included_glob_set: spec.included_patterns.map(build_glob_set).transpose()?, + excluded_glob_set: spec.excluded_patterns.map(build_glob_set).transpose()?, })) } } + +fn build_glob_set(patterns: Vec) -> Result { + let mut builder = GlobSetBuilder::new(); + for pattern in patterns { + builder.add(Glob::new(pattern.as_str())?); + } + Ok(builder.build()?) +}