+ use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use tokio::process::Command;
@@ -209,7 +210,11 @@ pub(crate) async fn populate_blob_sha1(
/// 100644 blob f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 LICENSE
/// 100644 blob 9da69050aa4d1f6488a258a221217a4dd9e73b71 assets/file-types/cs.json
/// ```
- pub(crate) async fn get_all_tree_files(dir: &Path, commit_sha1: Option<String>) -> Result<HashSet<String>, ()> {
+ pub(crate) async fn get_all_tree_files(
+     dir: &Path,
+     commit_sha1: Option<String>,
+     ignore_paths: &Vec<Regex>,
+ ) -> Result<HashSet<String>, ()> {
    // use HEAD by default
    let commit_sha1 = commit_sha1.unwrap_or("HEAD".to_owned());
@@ -228,21 +233,58 @@ pub(crate) async fn get_all_tree_files(dir: &Path, commit_sha1: Option<String>)
            }
        })
        .collect::<HashSet<String>>();
-     info!("Objects in the GIT tree: {}", files.len());
+     let tree_all = files.len();
+
+     // remove ignored files
+     let files = files
+         .into_iter()
+         .filter_map(|file_path| {
+             if is_in_ignore_list(ignore_paths, &file_path) {
+                 None
+             } else {
+                 Some(file_path)
+             }
+         })
+         .collect::<HashSet<String>>();
+
+     info!(
+         "Objects in the GIT tree: {}, ignored: {}, processing: {}",
+         tree_all,
+         tree_all - files.len(),
+         files.len(),
+     );

    Ok(files)
}
+ /// Returns TRUE if the file matches any of the ignore regex rules from the `ignore_paths` module.
+ #[inline]
+ fn is_in_ignore_list(ignore_paths: &Vec<Regex>, file_path: &str) -> bool {
+     // check if the path is in the ignore list
+     for ignore_regex in ignore_paths {
+         if ignore_regex.is_match(file_path) {
+             debug!("Path ignored: {}", file_path);
+             return true;
+         }
+     }
+
+     false
+ }
+
/// Get the contents of the Git blob as text.
pub(crate) async fn get_blob_contents(dir: &Path, blob_sha1: &String) -> Result<Vec<u8>, ()> {
    let blob_contents = execute_git_command(vec!["cat-file".into(), "-p".into(), blob_sha1.into()], dir, false).await?;

    Ok(blob_contents)
}

- /// Extracts and parses GIT log into who, what, when. No de-duping or optimisation is done. All log data is copied into the structs as-is.
+ /// Extracts and parses GIT log into who, what, when. Removes ignored files. No de-duping or optimisation is done. All log data is copied into the structs as-is.
/// Merge commits are excluded.
- pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>) -> Result<Vec<GitLogEntry>, ()> {
+ pub async fn get_log(
+     repo_dir: &Path,
+     contributor_git_identity: Option<&String>,
+     ignore_paths: &Vec<Regex>,
+ ) -> Result<Vec<GitLogEntry>, ()> {
    debug!("Extracting git log");

    // prepare the command that may optionally include the author name to limit commits just to that contributor
@@ -285,7 +327,10 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
            // commit d5e742de653954bfae88f0e5f6c8f0a7a5f6c437
            // save the previous commit details and start a new one
            // the very first entry will always be blank; it is removed outside the loop
-             log_entries.push(current_log_entry);
+             if current_log_entry.files.len() > 0 {
+                 // do not add a commit if it consists entirely of ignored files or has no files for another reason
+                 log_entries.push(current_log_entry);
+             }
            current_log_entry = GitLogEntry::new();
            if line.len() > 8 {
                current_log_entry.sha1 = line[7..].to_owned();
@@ -315,7 +360,7 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
            }
            // name/email split failed - add the entire line
            current_log_entry.author_name_email = (author.to_owned(), String::new());
-             error!("Split failed on {}", line);
+             warn!("Split failed on {}", line);
        } else if line.starts_with("Date: ") {
            // Date: Tue Dec 22 17:43:07 2020 +0000
            if line.len() < 9 {
@@ -326,7 +371,7 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
            trace!("Date: {}", date);
            // go to the next line if there is no date (impossible?)
            if date.is_empty() {
-                 error!("Encountered a commit with no date: {}", line);
+                 warn!("Encountered a commit with no date: {}", line);
                continue;
            }
@@ -351,17 +396,19 @@ pub async fn get_log(repo_dir: &Path, contributor_git_identity: Option<&String>)
            // the only remaining type of data should be the list of files
            // they are not tagged or indented - the entire line is the file name with the relative path
            // file names are displayed only with --name-only option
-             trace!("Added as a file");
-             current_log_entry.files.insert(line.into());
+             if !is_in_ignore_list(ignore_paths, line) {
+                 trace!("Added as a file");
+                 current_log_entry.files.insert(line.into());
+             } else {
+                 trace!("Ignored");
+             }
        }
    }

    // the very last commit has to be pushed outside the loop
    log_entries.push(current_log_entry);
-     // the very first commit is always a blank record
-     log_entries.remove(0);

-     debug!("Found {} commits", log_entries.len());
+     debug!("Found {} commits of interest", log_entries.len());
    Ok(log_entries)
}
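For context, here is a sketch of how a caller might wire up the new `ignore_paths` parameter. This is not part of the commit: the `is_ignored` helper, the regex patterns, the sample paths, and the `main` harness are illustrative assumptions standing in for the crate's own `ignore_paths` module and `is_in_ignore_list`, but the filtering follows the same shape as the new code in `get_all_tree_files`.

```rust
use regex::Regex;
use std::collections::HashSet;

/// Mirrors the `is_in_ignore_list` check from the commit: a path is ignored
/// if it matches any of the compiled regex rules.
fn is_ignored(ignore_paths: &[Regex], file_path: &str) -> bool {
    ignore_paths.iter().any(|re| re.is_match(file_path))
}

fn main() {
    // Hypothetical ignore rules - the real list comes from the crate's
    // `ignore_paths` module, which is not shown in this commit.
    let ignore_paths: Vec<Regex> = ["^target/", r"\.min\.js$", r"(^|/)package-lock\.json$"]
        .into_iter()
        .map(|p| Regex::new(p).expect("invalid ignore regex"))
        .collect();

    // Stand-in for the blob paths returned by `git ls-tree`.
    let tree_files: HashSet<String> = [
        "src/main.rs",
        "assets/file-types/cs.json",
        "target/debug/build.log",
        "web/app.min.js",
    ]
    .into_iter()
    .map(|s| s.to_string())
    .collect();

    // Same shape as the new filtering step in `get_all_tree_files`:
    // keep only the paths that do not match any ignore rule.
    let files: HashSet<String> = tree_files
        .into_iter()
        .filter(|f| !is_ignored(&ignore_paths, f))
        .collect();

    println!("processing {} of the tree files: {:?}", files.len(), files);

    // Inside the crate, the same list would then be passed to the updated
    // signatures, e.g. `get_all_tree_files(dir, None, &ignore_paths).await`
    // and `get_log(repo_dir, None, &ignore_paths).await`.
}
```

The same rules are applied in both places: `get_all_tree_files` drops ignored blobs before any further processing, and `get_log` skips ignored file names per commit, which is why a commit consisting only of ignored files is no longer pushed into the log entries at all.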