From 8d19be59a026af84591906a857b56ddfc00e9f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Hanuszczak?= Date: Wed, 1 Oct 2025 19:51:38 +0200 Subject: [PATCH 1/4] Add contents regex argument. --- crates/rrg/src/action/get_file_metadata.rs | 32 ++++++++++++++++++++++ proto/rrg/action/get_file_metadata.proto | 11 ++++++++ 2 files changed, 43 insertions(+) diff --git a/crates/rrg/src/action/get_file_metadata.rs b/crates/rrg/src/action/get_file_metadata.rs index 5ff6fb22..7218c6fa 100644 --- a/crates/rrg/src/action/get_file_metadata.rs +++ b/crates/rrg/src/action/get_file_metadata.rs @@ -22,6 +22,8 @@ pub struct Args { path_pruning_regex: Regex, /// Whether to collect canonical path to the file. path_canon: bool, + /// Regex to restrict the results only to those with matching contents. + contents_regex: Regex, } /// Result of the `get_file_metadata` action. @@ -324,6 +326,9 @@ impl crate::request::Args for Args { let path_pruning_regex = Regex::new(proto.path_pruning_regex()) .map_err(|error| ParseArgsError::invalid_field("path_pruning_regex", error))?; + let contents_regex = Regex::new(proto.contents_regex()) + .map_err(|error| ParseArgsError::invalid_field("contents_regex", error))?; + Ok(Args { paths, path_canon: proto.path_canonical(), @@ -332,6 +337,7 @@ impl crate::request::Args for Args { sha1: proto.sha1(), sha256: proto.sha256(), path_pruning_regex, + contents_regex, }) } } @@ -395,6 +401,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -413,6 +420,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -436,6 +444,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut 
session = crate::session::FakeSession::new(); @@ -468,6 +477,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -506,6 +516,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -544,6 +555,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -577,6 +589,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -618,6 +631,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -664,6 +678,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -722,6 +737,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -785,6 +801,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -827,6 +844,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -862,6 +880,7 @@ mod tests { sha256: false, 
path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -907,6 +926,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -942,6 +962,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -987,6 +1008,7 @@ mod tests { sha256: true, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1024,6 +1046,7 @@ mod tests { sha256: true, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1067,6 +1090,7 @@ mod tests { sha256: true, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1124,6 +1148,7 @@ mod tests { sep = regex::escape(std::path::MAIN_SEPARATOR_STR), }).unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1178,6 +1203,7 @@ mod tests { sep = regex::escape(std::path::MAIN_SEPARATOR_STR), }).unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1230,6 +1256,7 @@ mod tests { tempdir = regex::escape(tempdir.to_str().unwrap()), }).unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1272,6 +1299,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + 
contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1322,6 +1350,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1373,6 +1402,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: false, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1410,6 +1440,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: true, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); @@ -1443,6 +1474,7 @@ mod tests { sha256: false, path_pruning_regex: Regex::new("").unwrap(), path_canon: true, + contents_regex: Regex::new("").unwrap(), }; let mut session = crate::session::FakeSession::new(); diff --git a/proto/rrg/action/get_file_metadata.proto b/proto/rrg/action/get_file_metadata.proto index c53af549..1eabfc86 100644 --- a/proto/rrg/action/get_file_metadata.proto +++ b/proto/rrg/action/get_file_metadata.proto @@ -60,6 +60,17 @@ message Args { // resolve multiple symlinks along the way and thus should not be enabled for // cases where long filesystem traversals are expected. bool path_canonical = 7; + + // Regex to restrict the results only to those with matching contents. + // + // Note that evaluating this condition involves opening the file and reading + // its contents (entirely in the worst case of not matching the regex). Thus, + // this can be an expensive operation. + // + // File contents are split into overlapping chunks and matching is done per + // chunk. This means that the expected matching substring cannot exceed the + // size of the chunk. 
+ string contents_regex = 8; } message Result { From 39a50ff9e0e796733e3a222537e6114b5131f998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Hanuszczak?= Date: Wed, 1 Oct 2025 20:01:36 +0200 Subject: [PATCH 2/4] Implement the contents matching logic. --- crates/rrg/src/action/get_file_metadata.rs | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/crates/rrg/src/action/get_file_metadata.rs b/crates/rrg/src/action/get_file_metadata.rs index 7218c6fa..d16ead59 100644 --- a/crates/rrg/src/action/get_file_metadata.rs +++ b/crates/rrg/src/action/get_file_metadata.rs @@ -140,6 +140,74 @@ where } }; + if !args.contents_regex.as_str().is_empty() { + // Non-files obviously cannot match the contents conditions. We + // skip thme explicitly to avoid excesive errors when attempting + // to open them. + if !entry.metadata.is_file() { + continue + } + + let mut buf = Vec::<u8>::with_capacity(1 * 1024 * 1024); + + log::debug! { + "matching contents of '{}' to '{}' (using {}-bytes buffer)", + entry.path.display(), + args.contents_regex, + buf.capacity(), + }; + + let mut file = match std::fs::File::open(&entry.path) { + Ok(file) => file, + Err(error) => { + log::error! { + "failed to open '{}' for reading: {error}", + entry.path.display(), + }; + + continue + } + }; + + let is_match = loop { + use std::io::Read as _; + + // We always read as much as to fill the buffer. We drain + // the first half of it at the end of the loop, so there + // should always be some. + assert!(buf.capacity() - buf.len() > 0); + + let mut file_chunk = file + .take((buf.capacity() - buf.len()) as u64); + + match file_chunk.read_to_end(&mut buf) { + Ok(0) => break false, + Ok(_) => (), + Err(error) => { + log::error! 
{ + "failed to read contents of '{}': {error}", + entry.path.display(), + }; + + break false + } + } + file = file_chunk.into_inner(); + + if args.contents_regex.is_match(&buf) { + break true + } + + // We don't drain the entire buffer but just the first half + // not to omit matches at the chunk boundary. + buf.drain(0..(buf.len() / 2)); + }; + + if !is_match { + continue + } + } + #[cfg(target_family = "unix")] let ext_attrs = match ospect::fs::ext_attrs(&entry.path) { Ok(ext_attrs) => ext_attrs.filter_map(|ext_attr| match ext_attr { From 658e27114ba1a5a46dacaeade75b4e97dc76a6a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Hanuszczak?= Date: Wed, 1 Oct 2025 20:12:41 +0200 Subject: [PATCH 3/4] Add tests for contents matching logic. --- crates/rrg/src/action/get_file_metadata.rs | 228 +++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/crates/rrg/src/action/get_file_metadata.rs b/crates/rrg/src/action/get_file_metadata.rs index d16ead59..7fd972ba 100644 --- a/crates/rrg/src/action/get_file_metadata.rs +++ b/crates/rrg/src/action/get_file_metadata.rs @@ -1342,6 +1342,234 @@ mod tests { assert!(paths.contains(&&tempdir.join(OsStr::from_bytes(b"\xFF\xAA\xBB/B")))); } + #[test] + fn handle_contents_regex_match_at_start() { + use std::io::Write as _; + + let mut tempfile = tempfile::NamedTempFile::new() + .unwrap(); + + tempfile.write_all(b"foobar").unwrap(); + tempfile.flush().unwrap(); + + let args = Args { + paths: vec![tempfile.path().to_path_buf()], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("foo").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + assert_eq!(session.reply_count(), 1); + } + + #[test] + fn handle_contents_regex_match_at_end() { + use std::io::Write as _; + + let mut tempfile = tempfile::NamedTempFile::new() + .unwrap(); + + 
tempfile.write_all(b"foobar").unwrap(); + tempfile.flush().unwrap(); + + let args = Args { + paths: vec![tempfile.path().to_path_buf()], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("bar").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + assert_eq!(session.reply_count(), 1); + } + + #[test] + fn handle_contents_regex_no_match() { + use std::io::Write as _; + + let mut tempfile = tempfile::NamedTempFile::new() + .unwrap(); + + tempfile.write_all(b"foobar").unwrap(); + tempfile.flush().unwrap(); + + let args = Args { + paths: vec![tempfile.path().to_path_buf()], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("quux").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + assert_eq!(session.reply_count(), 0); + } + + #[test] + fn handle_contents_regex_multiple_chunks_match() { + use std::io::{Read as _, Write as _}; + + let mut tempfile = tempfile::NamedTempFile::new() + .unwrap(); + + std::io::copy(&mut std::io::repeat(0x0).take(13371337), &mut tempfile).unwrap(); + tempfile.write_all(b"foobar").unwrap(); + tempfile.flush().unwrap(); + + let args = Args { + paths: vec![tempfile.path().to_path_buf()], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("foo").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + assert_eq!(session.reply_count(), 1); + } + + #[test] + fn handle_contents_regex_multiple_chunks_no_match() { + use std::io::{Read as _, Write as _}; + + let mut tempfile = tempfile::NamedTempFile::new() 
+ .unwrap(); + + std::io::copy(&mut std::io::repeat(0x0).take(13371337), &mut tempfile).unwrap(); + tempfile.write_all(b"foobar").unwrap(); + tempfile.flush().unwrap(); + + let args = Args { + paths: vec![tempfile.path().to_path_buf()], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("quux").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + assert_eq!(session.reply_count(), 0); + } + + #[cfg(target_os = "linux")] + #[test] + fn handle_contents_regex_special() { + let args = Args { + paths: vec![PathBuf::from("/dev/zero")], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("\\x00").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + // Even though `/dev/zero` should match a pattern of null-byte, we do + // not want to process special files and thus it should not be reported. 
+ assert_eq!(session.reply_count(), 0); + } + + #[test] + fn handle_contents_regex_many() { + use std::io::Write as _; + + let mut tempfile_foo = tempfile::NamedTempFile::new() + .unwrap(); + let mut tempfile_bar = tempfile::NamedTempFile::new() + .unwrap(); + + tempfile_foo.write_all(b"foo").unwrap(); + tempfile_foo.flush().unwrap(); + + tempfile_bar.write_all(b"bar").unwrap(); + tempfile_bar.flush().unwrap(); + + let args = Args { + paths: vec![ + tempfile_foo.path().to_path_buf(), + tempfile_bar.path().to_path_buf(), + ], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("foo").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + assert_eq!(session.reply_count(), 1); + assert_eq!(session.reply::<Item>(0).path, tempfile_foo.path()); + } + + #[test] + fn handle_contents_regex_children() { + let tempdir = tempfile::tempdir() + .unwrap(); + let tempdir = tempdir.path(); + + std::fs::write(tempdir.join("foo"), b"foo").unwrap(); + std::fs::write(tempdir.join("bar"), b"bar").unwrap(); + std::fs::write(tempdir.join("baz"), b"baz").unwrap(); + + let args = Args { + paths: vec![tempdir.to_path_buf()], + max_depth: u32::MAX, + md5: false, + sha1: false, + sha256: false, + path_pruning_regex: Regex::new("").unwrap(), + path_canon: false, + contents_regex: Regex::new("ba[rz]").unwrap(), + }; + + let mut session = crate::session::FakeSession::new(); + assert!(handle(&mut session, args).is_ok()); + + let paths = session.replies::<Item>() + .map(|item| &item.path) + .collect::<Vec<_>>(); + + assert!(!paths.contains(&&tempdir.join("foo"))); + assert!(paths.contains(&&tempdir.join("bar"))); + assert!(paths.contains(&&tempdir.join("baz"))); + } + #[test] fn handle_many_regular_files() { let tempdir = tempfile::tempdir() From aa1899450577e0dbac1e044170a4400049e9141a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=C5=81ukasz=20Hanuszczak?= Date: Thu, 2 Oct 2025 14:56:49 +0200 Subject: [PATCH 4/4] Fix comment typos. --- crates/rrg/src/action/get_file_metadata.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/rrg/src/action/get_file_metadata.rs b/crates/rrg/src/action/get_file_metadata.rs index 7fd972ba..0a237567 100644 --- a/crates/rrg/src/action/get_file_metadata.rs +++ b/crates/rrg/src/action/get_file_metadata.rs @@ -142,8 +142,8 @@ where if !args.contents_regex.as_str().is_empty() { // Non-files obviously cannot match the contents conditions. We - // skip thme explicitly to avoid excesive errors when attempting - // to open them. + // skip them explicitly to avoid excessive errors when attempt- + // ing to open them. if !entry.metadata.is_file() { continue }