Skip to content

Commit 9abd7e6

Browse files
committed
Added list of ignore paths and extensions
1 parent 8628db3 commit 9abd7e6

File tree

5 files changed

+127
-19
lines changed

5 files changed

+127
-19
lines changed

stackmuncher/src/cmd_munch.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ pub(crate) async fn run(config: AppConfig) -> Result<(), ()> {
3131
let cached_project_report = Report::from_disk(&project_report_filename);
3232

3333
// get and retain a copy of the full git log to re-use in multiple places
34-
let git_log = git::get_log(&config.lib_config.project_dir, None).await?;
34+
let git_log = git::get_log(&config.lib_config.project_dir, None, &code_rules.ignore_paths).await?;
3535

3636
let project_report = match Report::process_project(
3737
&mut code_rules,

stackmuncher_lib/src/code_rules.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ pub struct CodeRules {
3030
/// Contains names of newly added munchers to assist merging multiple instances
3131
/// of CodeRules for parallel processing.
3232
pub new_munchers: Option<HashSet<String>>,
33+
34+
/// Compiled regex for file names and paths that should be ignored regardless of any other rules
35+
pub ignore_paths: Vec<Regex>,
3336
}
3437

3538
impl CodeRules {
@@ -58,6 +61,7 @@ impl CodeRules {
5861
// dir\foo -> foo
5962
file_ext_regex: Regex::new(r#"[\.\\/][a-zA-Z0-1_]+$|^[a-zA-Z0-1_]+$"#).unwrap(),
6063
new_munchers: None,
64+
ignore_paths: crate::ignore_paths::compile_ignore_paths(),
6165
};
6266

6367
// load the contents of file_type definitions one by one

stackmuncher_lib/src/git.rs

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use regex::Regex;
12
use std::collections::{HashMap, HashSet};
23
use std::path::Path;
34
use tokio::process::Command;
@@ -209,7 +210,11 @@ pub(crate) async fn populate_blob_sha1(
209210
/// 100644 blob f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 LICENSE
210211
/// 100644 blob 9da69050aa4d1f6488a258a221217a4dd9e73b71 assets/file-types/cs.json
211212
/// ```
212-
pub(crate) async fn get_all_tree_files(dir: &Path, commit_sha1: Option<String>) -> Result<HashSet<String>, ()> {
213+
pub(crate) async fn get_all_tree_files(
214+
dir: &Path,
215+
commit_sha1: Option<String>,
216+
ignore_paths: &Vec<Regex>,
217+
) -> Result<HashSet<String>, ()> {
213218
// use HEAD by default
214219
let commit_sha1 = commit_sha1.unwrap_or("HEAD".to_owned());
215220

@@ -228,21 +233,58 @@ pub(crate) async fn get_all_tree_files(dir: &Path, commit_sha1: Option<String>)
228233
}
229234
})
230235
.collect::<HashSet<String>>();
231-
info!("Objects in the GIT tree: {}", files.len());
236+
let tree_all = files.len();
237+
238+
// remove ignored files
239+
let files = files
240+
.into_iter()
241+
.filter_map(|file_path| {
242+
if is_in_ignore_list(ignore_paths, &file_path) {
243+
None
244+
} else {
245+
Some(file_path)
246+
}
247+
})
248+
.collect::<HashSet<String>>();
249+
250+
info!(
251+
"Objects in the GIT tree: {}, ignored: {}, processing: {}",
252+
tree_all,
253+
tree_all - files.len(),
254+
files.len(),
255+
);
232256

233257
Ok(files)
234258
}
235259

260+
/// Returns TRUE if the file matches any of the ignore regex rules from `ignore_paths` module.
261+
#[inline]
262+
fn is_in_ignore_list(ignore_paths: &Vec<Regex>, file_path: &str) -> bool {
263+
// check if the path is in the ignore list
264+
for ignore_regex in ignore_paths {
265+
if ignore_regex.is_match(file_path) {
266+
debug!("Path ignored: {}", file_path);
267+
return true;
268+
}
269+
}
270+
271+
false
272+
}
273+
236274
/// Get the contents of the Git blob as text.
237275
pub(crate) async fn get_blob_contents(dir: &Path, blob_sha1: &String) -> Result<Vec<u8>, ()> {
238276
let blob_contents = execute_git_command(vec!["cat-file".into(), "-p".into(), blob_sha1.into()], dir, false).await?;
239277

240278
Ok(blob_contents)
241279
}
242280

243-
/// Extracts and parses GIT log into who, what, when. No de-duping or optimisation is done. All log data is copied into the structs as-is.
281+
/// Extracts and parses GIT log into who, what, when. Removes ignored files. No de-duping or optimisation is done. All log data is copied into the structs as-is.
244282
/// Merge commits are excluded.
245-
pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>) -> Result<Vec<GitLogEntry>, ()> {
283+
pub async fn get_log(
284+
repo_dir: &Path,
285+
contributor_git_identity: Option<&String>,
286+
ignore_paths: &Vec<Regex>,
287+
) -> Result<Vec<GitLogEntry>, ()> {
246288
debug!("Extracting git log");
247289

248290
// prepare the command that may optionally include the author name to limit commits just to that contributor
@@ -285,7 +327,10 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
285327
// commit d5e742de653954bfae88f0e5f6c8f0a7a5f6c437
286328
// save the previous commit details and start a new one
287329
// the very first entry is always blank - it has no files, so the check below does not add it to the list
288-
log_entries.push(current_log_entry);
330+
if current_log_entry.files.len() > 0 {
331+
// do not add a commit if a commit consists entirely of ignored files or has no files for another reason
332+
log_entries.push(current_log_entry);
333+
}
289334
current_log_entry = GitLogEntry::new();
290335
if line.len() > 8 {
291336
current_log_entry.sha1 = line[7..].to_owned();
@@ -315,7 +360,7 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
315360
}
316361
// name/email split failed - add the entire line
317362
current_log_entry.author_name_email = (author.to_owned(), String::new());
318-
error!("Split failed on {}", line);
363+
warn!("Split failed on {}", line);
319364
} else if line.starts_with("Date: ") {
320365
// Date: Tue Dec 22 17:43:07 2020 +0000
321366
if line.len() < 9 {
@@ -326,7 +371,7 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
326371
trace!("Date: {}", date);
327372
// go to the next line if there is no date (impossible?)
328373
if date.is_empty() {
329-
error!("Encountered a commit with no date: {}", line);
374+
warn!("Encountered a commit with no date: {}", line);
330375
continue;
331376
}
332377

@@ -351,17 +396,19 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
351396
// the only remaining type of data should be the list of files
352397
// they are not tagged or indented - the entire line is the file name with the relative path
353398
// file names are displayed only with --name-only option
354-
trace!("Added as a file");
355-
current_log_entry.files.insert(line.into());
399+
if !is_in_ignore_list(ignore_paths, line) {
400+
trace!("Added as a file");
401+
current_log_entry.files.insert(line.into());
402+
} else {
403+
trace!("Ignored");
404+
}
356405
}
357406
}
358407

359408
// the very last commit has to be pushed outside the loop
360409
log_entries.push(current_log_entry);
361-
// the very first commit is always a blank record
362-
log_entries.remove(0);
363410

364-
debug!("Found {} commits", log_entries.len());
411+
debug!("Found {} commits of interest", log_entries.len());
365412
Ok(log_entries)
366413
}
367414

stackmuncher_lib/src/ignore_paths.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
use regex::Regex;
2+
3+
/// Returns a list of compiled regex with the list of paths that should be ignored.
4+
/// Panics if any of the regex statements is incorrect.
5+
pub(crate) fn compile_ignore_paths() -> Vec<Regex> {
6+
IGNORE_PATHS
7+
.iter()
8+
.map(|ignore_path| Regex::new(ignore_path).expect(&format!("Invalid IGNORE_TYPES regex: {}", ignore_path)))
9+
.collect::<Vec<Regex>>()
10+
}
11+
12+
#[test]
13+
fn test_compile_ignore_paths() {
14+
assert!(compile_ignore_paths().len() > 0);
15+
}
16+
17+
/// A list of path fragments, file names and file extensions expressed as regex patterns.
/// Files with the path matching any of the patterns from this list are ignored.
///
/// The patterns are case-sensitive. Patterns ending with `$` match the end of the path,
/// the others match anywhere in the path.
/// Declared as a slice so that adding or removing a pattern does not require updating an element count.
const IGNORE_PATHS: &[&str] = &[
    // known framework paths
    r#"node_modules[/\\]"#,
    r#"angular[/\\]README\.md"#,
    r#"package-lock\.json"#,
    // images
    r#"\.ico$"#,
    r#"\.png$"#,
    r#"\.jpg$"#,
    r#"\.jpeg$"#,
    r#"\.gif$"#,
    // documents
    r#"\.pdf$"#,
    r#"\.doc$"#,
    r#"\.docx$"#,
    r#"\.txt$"#,
    // git files
    r#"\.gitignore$"#,
    r#"\.gitattributes$"#,
    r#"\.gitkeep$"#,
    // binaries
    r#"\.exe$"#,
    r#"\.dll$"#,
    r#"\.so$"#,
    r#"\.jar$"#,
    // archives
    r#"\.zip$"#,
    r#"\.rar$"#,
    // data files
    r#"\.csv$"#,
    r#"\.tsv$"#,
    r#"\.xls$"#,
    r#"\.xlsx$"#,
];

stackmuncher_lib/src/lib.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ pub mod config;
1010
pub mod contributor;
1111
pub mod file_type;
1212
pub mod git;
13+
mod ignore_paths;
1314
pub mod kwc;
1415
pub mod muncher;
1516
pub mod processors;
@@ -36,10 +37,14 @@ impl Report {
3637
) -> Result<Option<report::Report>, ()> {
3738
let report = report::Report::new();
3839

39-
let git_log = git_log.unwrap_or(git::get_log(project_dir, None).await?);
40+
// get the full git log if none was supplied
41+
let git_log = match git_log {
42+
Some(v) => v,
43+
None => git::get_log(project_dir, None, &code_rules.ignore_paths).await?,
44+
};
4045

4146
// get the list of files in the tree at HEAD
42-
let all_head_files = git::get_all_tree_files(project_dir, None).await?;
47+
let all_head_files = git::get_all_tree_files(project_dir, None, &code_rules.ignore_paths).await?;
4348
if all_head_files.len() > Report::MAX_FILES_PER_REPO {
4449
warn!("Repo ignored. Too many files: {}", all_head_files.len());
4550
return Err(());
@@ -90,12 +95,12 @@ impl Report {
9095
.collect::<ListOfBlobs>();
9196
debug!("Blobs that could not be copied from cache: {}", blobs_to_munch.len());
9297

93-
// remove blobs that have no munchers - there is no point even retrieving the contents
98+
// remove blobs that have no munchers or should be ignored - there is no point even retrieving the contents
9499
let blobs_to_munch = blobs_to_munch
95100
.into_iter()
96-
.filter_map(|(file_name, blob)| {
97-
if code_rules.get_muncher(&file_name).is_some() {
98-
Some((file_name, blob))
101+
.filter_map(|(file_path, blob)| {
102+
if code_rules.get_muncher(&file_path).is_some() {
103+
Some((file_path, blob))
99104
} else {
100105
None
101106
}

0 commit comments

Comments
 (0)