@@ -367,6 +367,19 @@ static int find_exact_renames(struct diff_options *options)
367
367
return renames ;
368
368
}
369
369
370
+ struct dir_rename_info { /* State consulted by idx_possible_rename() to guess where a file moved when its directory was renamed. */
371
+ struct strintmap idx_map ; /* keyed by candidate destination path; the stored int is what idx_possible_rename() returns */
372
+ struct strmap dir_rename_guess ; /* keyed by old directory name; value is the guessed new directory name */
373
+ struct strmap * dir_rename_count ; /* not read in the code visible here; NOTE(review): presumably filled in by the future initialize_dir_rename_info() -- confirm */
374
+ unsigned setup ; /* while zero, idx_possible_rename() returns -1 without touching the maps */
375
+ };
376
+
377
+ static char * get_dirname (const char * filename ) /* Return a copy of the part of filename before its last '/', or a copy of "" when filename contains no '/'. */
378
+ {
379
+ char * slash = strrchr (filename , '/' ); /* last '/' in filename, or NULL if there is none */
380
+ return slash ? xstrndup (filename , slash - filename ) : xstrdup ("" ); /* result is a fresh copy; the caller in idx_possible_rename() free()s it */
381
+ }
382
+
370
383
static const char * get_basename (const char * filename )
371
384
{
372
385
/*
@@ -379,14 +392,86 @@ static const char *get_basename(const char *filename)
379
392
return base ? base + 1 : filename ;
380
393
}
381
394
382
- static int idx_possible_rename (char * filename )
395
+ static int idx_possible_rename (char * filename , struct dir_rename_info * info )
383
396
{
384
- /* Unconditionally return -1, "not found", for now */
385
- return -1 ;
397
+ /*
398
+ * Our comparison of files with the same basename (see
399
+ * find_basename_matches() below), is only helpful when after exact
400
+ * rename detection we have exactly one file with a given basename
401
+ * among the rename sources and also only exactly one file with
402
+ * that basename among the rename destinations. When we have
403
+ * multiple files with the same basename in either set, we do not
404
+ * know which to compare against. However, there are some
405
+ * filenames that occur in large numbers (particularly
406
+ * build-related filenames such as 'Makefile', '.gitignore', or
407
+ * 'build.gradle' that potentially exist within every single
408
+ * subdirectory), and for performance we want to be able to quickly
409
+ * find renames for these files too.
410
+ *
411
+ * The reason basename comparisons are a useful heuristic is that it
412
+ * is common for people to move files across directories while keeping
413
+ * their filename the same. If we had a way of determining or even
414
+ * making a good educated guess about which directory these non-unique
415
+ * basename files had been moved to, we could check it.
416
+ * Luckily...
417
+ *
418
+ * When an entire directory is in fact renamed, we have two factors
419
+ * helping us out:
420
+ * (a) the original directory disappeared giving us a hint
421
+ * about when we can apply an extra heuristic.
422
+ * (b) we often have several files within that directory and
423
+ * subdirectories that are renamed without changes
424
+ * So, rules for a heuristic:
425
+ * (0) If the basename matches are non-unique (the condition under
426
+ * which this function is called) AND
427
+ * (1) the directory in which the file was found has disappeared
428
+ * (i.e. dirs_removed is non-NULL and has a relevant entry) THEN
429
+ * (2) use exact renames of files within the directory to determine
430
+ * where the directory is likely to have been renamed to. IF
431
+ * there is at least one exact rename from within that
432
+ * directory, we can proceed.
433
+ * (3) If there are multiple places the directory could have been
434
+ * renamed to based on exact renames, ignore all but one of them.
435
+ * Just use the destination with the most renames going to it.
436
+ * (4) Check if applying that directory rename to the original file
437
+ * would result in a destination filename that is in the
438
+ * potential rename set. If so, return the index of the
439
+ * destination file (the index within rename_dst).
440
+ * (5) Compare the original file and returned destination for
441
+ * similarity, and if they are sufficiently similar, record the
442
+ * rename.
443
+ *
444
+ * This function, idx_possible_rename(), is only responsible for (4).
445
+ * The conditions/steps in (1)-(3) will be handled via setting up
446
+ * dir_rename_count and dir_rename_guess in a future
447
+ * initialize_dir_rename_info() function. Steps (0) and (5) are
448
+ * handled by the caller of this function.
449
+ */
450
+ char * old_dir , * new_dir ;
451
+ struct strbuf new_path = STRBUF_INIT ;
452
+ int idx ;
453
+
454
+ if (!info -> setup ) /* rename-guess data has not been set up: report "not found" */
455
+ return -1 ;
456
+
457
+ old_dir = get_dirname (filename );
458
+ new_dir = strmap_get (& info -> dir_rename_guess , old_dir );
459
+ free (old_dir ); /* get_dirname() handed back a copy that is no longer needed after the lookup */
460
+ if (!new_dir ) /* no guessed destination recorded for this directory */
461
+ return -1 ;
462
+
463
+ strbuf_addstr (& new_path , new_dir ); /* candidate path = guessed new directory + '/' + original basename */
464
+ strbuf_addch (& new_path , '/' );
465
+ strbuf_addstr (& new_path , get_basename (filename ));
466
+
467
+ idx = strintmap_get (& info -> idx_map , new_path .buf ); /* step (4): look the candidate path up in idx_map */
468
+ strbuf_release (& new_path );
469
+ return idx ;
386
470
}
387
471
388
472
static int find_basename_matches (struct diff_options * options ,
389
- int minimum_score )
473
+ int minimum_score ,
474
+ struct dir_rename_info * info )
390
475
{
391
476
/*
392
477
* When I checked in early 2020, over 76% of file renames in linux
@@ -494,7 +579,7 @@ static int find_basename_matches(struct diff_options *options,
494
579
dst_index = strintmap_get (& dests , base );
495
580
if (src_index == -1 || dst_index == -1 ) {
496
581
src_index = i ;
497
- dst_index = idx_possible_rename (filename );
582
+ dst_index = idx_possible_rename (filename , info );
498
583
}
499
584
if (dst_index == -1 )
500
585
continue ;
@@ -677,8 +762,10 @@ void diffcore_rename(struct diff_options *options)
677
762
int num_destinations , dst_cnt ;
678
763
int num_sources , want_copies ;
679
764
struct progress * progress = NULL ;
765
+ struct dir_rename_info info ;
680
766
681
767
trace2_region_enter ("diff" , "setup" , options -> repo );
768
+ info .setup = 0 ;
682
769
want_copies = (detect_rename == DIFF_DETECT_COPY );
683
770
if (!minimum_score )
684
771
minimum_score = DEFAULT_RENAME_SCORE ;
@@ -774,7 +861,8 @@ void diffcore_rename(struct diff_options *options)
774
861
/* Utilize file basenames to quickly find renames. */
775
862
trace2_region_enter ("diff" , "basename matches" , options -> repo );
776
863
rename_count += find_basename_matches (options ,
777
- min_basename_score );
864
+ min_basename_score ,
865
+ & info );
778
866
trace2_region_leave ("diff" , "basename matches" , options -> repo );
779
867
780
868
/*
0 commit comments