Skip to content

Commit 992513a

Browse files
prabhath004
authored and committed
feat: add max_file_size support to LocalFile source (#1260)
Add optional max_file_size parameter to filter files by size in both list() and get_value() APIs. Files exceeding the limit are treated as non-existent.

Closes #1249

Co-authored-by: prabhath004 <[email protected]>
1 parent 7996528 commit 992513a

File tree

3 files changed

+30
-0
lines changed

3 files changed

+30
-0
lines changed

docs/docs/sources/localfile.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ The spec takes the following fields:
2323

2424
:::
2525

26+
* `max_file_size` (`int`, optional): if provided, files exceeding this size in bytes will be treated as non-existent and skipped during processing.
27+
This is useful to avoid processing large files that are not relevant to your use case, such as videos or backups.
28+
If not specified, no size limit is applied.
29+
2630
### Schema
2731

2832
The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:

python/cocoindex/sources/_engine_builtin_specs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ class LocalFile(op.SourceSpec):
2323
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
2424
excluded_patterns: list[str] | None = None
2525

26+
# If provided, files exceeding this size in bytes will be treated as non-existent.
27+
max_file_size: int | None = None
28+
2629

2730
class GoogleDrive(op.SourceSpec):
2831
"""Import data from Google Drive."""

src/ops/sources/local_file.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ pub struct Spec {
1313
binary: bool,
1414
included_patterns: Option<Vec<String>>,
1515
excluded_patterns: Option<Vec<String>>,
16+
max_file_size: Option<i64>,
1617
}
1718

1819
struct Executor {
1920
root_path: PathBuf,
2021
binary: bool,
2122
pattern_matcher: PatternMatcher,
23+
max_file_size: Option<i64>,
2224
}
2325

2426
#[async_trait]
@@ -49,6 +51,14 @@ impl SourceExecutor for Executor {
4951
new_dirs.push(Cow::Owned(path));
5052
}
5153
} else if self.pattern_matcher.is_file_included(relative_path) {
54+
// Check file size limit
55+
if let Some(max_size) = self.max_file_size {
56+
if let Ok(metadata) = path.metadata() {
57+
if metadata.len() > max_size as u64 {
58+
continue;
59+
}
60+
}
61+
}
5262
let ordinal: Option<Ordinal> = if options.include_ordinal {
5363
Some(path.metadata()?.modified()?.try_into()?)
5464
} else {
@@ -86,6 +96,18 @@ impl SourceExecutor for Executor {
8696
});
8797
}
8898
let path = self.root_path.join(path);
99+
// Check file size limit
100+
if let Some(max_size) = self.max_file_size {
101+
if let Ok(metadata) = path.metadata() {
102+
if metadata.len() > max_size as u64 {
103+
return Ok(PartialSourceRowData {
104+
value: Some(SourceValue::NonExistence),
105+
ordinal: Some(Ordinal::unavailable()),
106+
content_version_fp: None,
107+
});
108+
}
109+
}
110+
}
89111
let ordinal = if options.include_ordinal {
90112
Some(path.metadata()?.modified()?.try_into()?)
91113
} else {
@@ -172,6 +194,7 @@ impl SourceFactoryBase for Factory {
172194
root_path: PathBuf::from(spec.path),
173195
binary: spec.binary,
174196
pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
197+
max_file_size: spec.max_file_size,
175198
}))
176199
}
177200
}

0 commit comments

Comments
 (0)