Skip to content

Commit bde8b9f

Browse files
newren authored and gitster committed
diffcore-rename: provide basic implementation of idx_possible_rename()
Add a new struct dir_rename_info with various values we need inside our idx_possible_rename() function introduced in the previous commit. Add a basic implementation for this function showing how we plan to use the variables, but which will just return early with a value of -1 (not found) when those variables are not set up. Future commits will do the work necessary to set up those other variables so that idx_possible_rename() does not always return -1. Reviewed-by: Derrick Stolee <[email protected]> Signed-off-by: Elijah Newren <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 37a2514 commit bde8b9f

File tree

1 file changed

+94
-6
lines changed

1 file changed

+94
-6
lines changed

diffcore-rename.c

Lines changed: 94 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,19 @@ static int find_exact_renames(struct diff_options *options)
367367
return renames;
368368
}
369369

370+
struct dir_rename_info {
371+
struct strintmap idx_map;
372+
struct strmap dir_rename_guess;
373+
struct strmap *dir_rename_count;
374+
unsigned setup;
375+
};
376+
377+
/*
 * Return a copy of the directory portion of filename, i.e. everything
 * before the last '/'.  A filename without any '/' yields a copy of the
 * empty string.  The result comes from xstrdup()/xstrndup(); the caller
 * is expected to free() it.
 */
static char *get_dirname(const char *filename)
{
	const char *last_sep = strrchr(filename, '/');

	if (!last_sep)
		return xstrdup("");

	/* Copy only the bytes preceding the final separator. */
	return xstrndup(filename, last_sep - filename);
}
382+
370383
static const char *get_basename(const char *filename)
371384
{
372385
/*
@@ -379,14 +392,86 @@ static const char *get_basename(const char *filename)
379392
return base ? base + 1 : filename;
380393
}
381394

382-
static int idx_possible_rename(char *filename)
395+
static int idx_possible_rename(char *filename, struct dir_rename_info *info)
383396
{
384-
/* Unconditionally return -1, "not found", for now */
385-
return -1;
397+
/*
398+
* Our comparison of files with the same basename (see
399+
* find_basename_matches() below), is only helpful when after exact
400+
* rename detection we have exactly one file with a given basename
401+
* among the rename sources and also only exactly one file with
402+
* that basename among the rename destinations. When we have
403+
* multiple files with the same basename in either set, we do not
404+
* know which to compare against. However, there are some
405+
* filenames that occur in large numbers (particularly
406+
* build-related filenames such as 'Makefile', '.gitignore', or
407+
* 'build.gradle' that potentially exist within every single
408+
* subdirectory), and for performance we want to be able to quickly
409+
* find renames for these files too.
410+
*
411+
* The reason basename comparisons are a useful heuristic was that it
412+
* is common for people to move files across directories while keeping
413+
* their filename the same. If we had a way of determining or even
414+
* making a good educated guess about which directory these non-unique
415+
* basename files had moved the file to, we could check it.
416+
* Luckily...
417+
*
418+
* When an entire directory is in fact renamed, we have two factors
419+
* helping us out:
420+
* (a) the original directory disappeared giving us a hint
421+
* about when we can apply an extra heuristic.
422+
* (a) we often have several files within that directory and
423+
* subdirectories that are renamed without changes
424+
* So, rules for a heuristic:
425+
* (0) If there basename matches are non-unique (the condition under
426+
* which this function is called) AND
427+
* (1) the directory in which the file was found has disappeared
428+
* (i.e. dirs_removed is non-NULL and has a relevant entry) THEN
429+
* (2) use exact renames of files within the directory to determine
430+
* where the directory is likely to have been renamed to. IF
431+
* there is at least one exact rename from within that
432+
* directory, we can proceed.
433+
* (3) If there are multiple places the directory could have been
434+
* renamed to based on exact renames, ignore all but one of them.
435+
* Just use the destination with the most renames going to it.
436+
* (4) Check if applying that directory rename to the original file
437+
* would result in a destination filename that is in the
438+
* potential rename set. If so, return the index of the
439+
* destination file (the index within rename_dst).
440+
* (5) Compare the original file and returned destination for
441+
* similarity, and if they are sufficiently similar, record the
442+
* rename.
443+
*
444+
* This function, idx_possible_rename(), is only responsible for (4).
445+
* The conditions/steps in (1)-(3) will be handled via setting up
446+
* dir_rename_count and dir_rename_guess in a future
447+
* initialize_dir_rename_info() function. Steps (0) and (5) are
448+
* handled by the caller of this function.
449+
*/
450+
char *old_dir, *new_dir;
451+
struct strbuf new_path = STRBUF_INIT;
452+
int idx;
453+
454+
if (!info->setup)
455+
return -1;
456+
457+
old_dir = get_dirname(filename);
458+
new_dir = strmap_get(&info->dir_rename_guess, old_dir);
459+
free(old_dir);
460+
if (!new_dir)
461+
return -1;
462+
463+
strbuf_addstr(&new_path, new_dir);
464+
strbuf_addch(&new_path, '/');
465+
strbuf_addstr(&new_path, get_basename(filename));
466+
467+
idx = strintmap_get(&info->idx_map, new_path.buf);
468+
strbuf_release(&new_path);
469+
return idx;
386470
}
387471

388472
static int find_basename_matches(struct diff_options *options,
389-
int minimum_score)
473+
int minimum_score,
474+
struct dir_rename_info *info)
390475
{
391476
/*
392477
* When I checked in early 2020, over 76% of file renames in linux
@@ -494,7 +579,7 @@ static int find_basename_matches(struct diff_options *options,
494579
dst_index = strintmap_get(&dests, base);
495580
if (src_index == -1 || dst_index == -1) {
496581
src_index = i;
497-
dst_index = idx_possible_rename(filename);
582+
dst_index = idx_possible_rename(filename, info);
498583
}
499584
if (dst_index == -1)
500585
continue;
@@ -677,8 +762,10 @@ void diffcore_rename(struct diff_options *options)
677762
int num_destinations, dst_cnt;
678763
int num_sources, want_copies;
679764
struct progress *progress = NULL;
765+
struct dir_rename_info info;
680766

681767
trace2_region_enter("diff", "setup", options->repo);
768+
info.setup = 0;
682769
want_copies = (detect_rename == DIFF_DETECT_COPY);
683770
if (!minimum_score)
684771
minimum_score = DEFAULT_RENAME_SCORE;
@@ -774,7 +861,8 @@ void diffcore_rename(struct diff_options *options)
774861
/* Utilize file basenames to quickly find renames. */
775862
trace2_region_enter("diff", "basename matches", options->repo);
776863
rename_count += find_basename_matches(options,
777-
min_basename_score);
864+
min_basename_score,
865+
&info);
778866
trace2_region_leave("diff", "basename matches", options->repo);
779867

780868
/*

0 commit comments

Comments
 (0)