Skip to content

Commit 25e65b6

Browse files
newrengitster
authored andcommitted
merge-ort, diffcore-rename: employ cached renames when possible
When there are many renames between the old base of a series of commits and the new base, the way sequencer.c, merge-recursive.c, and diffcore-rename.c have traditionally split the work resulted in redetecting the same renames with each and every commit being transplanted. To address this, the last several commits have been creating a cache of rename detection results, determining when it was safe to use such a cache in subsequent merge operations, adding helper functions, and so on. See the previous half dozen commit messages for additional discussion of this optimization, particularly the message a few commits ago entitled "add code to check for whether cached renames can be reused". This commit finally ties all of that work together, modifying the merge algorithm to make use of these cached renames. For the testcases mentioned in commit 557ac03 ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 5.665 s ± 0.129 s 5.622 s ± 0.059 s mega-renames: 11.435 s ± 0.158 s 10.127 s ± 0.073 s just-one-mega: 494.2 ms ± 6.1 ms 500.3 ms ± 3.8 ms That's a fairly small improvement, but mostly because the previous optimizations were so effective for these particular testcases; this optimization only kicks in when the others don't. If we undid the basename-guided rename detection and skip-irrelevant-renames optimizations, then we'd see that this series by itself improved performance as follows: Before Basename Series After Just This Series no-renames: 13.815 s ± 0.062 s 5.697 s ± 0.080 s mega-renames: 1799.937 s ± 0.493 s 205.709 s ± 0.457 s Since this optimization kicks in to help accelerate cases where the previous optimizations do not apply, this last comparison shows that this cached-renames optimization has the potential to help signficantly in cases that don't meet the requirements for the other optimizations to be effective. The changes made in this optimization also lay some important groundwork for a future optimization around having collect_merge_info() avoid recursing into subtrees in more cases. However, for this optimization to be effective, merge_switch_to_result() should only be called when the rebase or cherry-pick operation has either completed or hit a case where the user needs to resolve a conflict or edit the result. If it is called after every commit, as sequencer.c does, then the working tree and index are needlessly updated with every commit and the cached metadata is tossed, defeating this optimization. Refactoring sequencer.c to only call merge_switch_to_result() at the end of the operation is a bigger undertaking, and the practical benefits of this optimization will not be realized until that work is performed. Since `test-tool fast-rebase` only updates at the end of the operation, it was used to obtain the timings above. Signed-off-by: Elijah Newren <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent cbdca28 commit 25e65b6

File tree

4 files changed

+90
-30
lines changed

4 files changed

+90
-30
lines changed

diffcore-rename.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
568568
static void initialize_dir_rename_info(struct dir_rename_info *info,
569569
struct strintmap *relevant_sources,
570570
struct strintmap *dirs_removed,
571-
struct strmap *dir_rename_count)
571+
struct strmap *dir_rename_count,
572+
struct strmap *cached_pairs)
572573
{
573574
struct hashmap_iter iter;
574575
struct strmap_entry *entry;
@@ -633,6 +634,17 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
633634
rename_dst[i].p->two->path);
634635
}
635636

637+
/* Add cached_pairs to counts */
638+
strmap_for_each_entry(cached_pairs, &iter, entry) {
639+
const char *old_name = entry->key;
640+
const char *new_name = entry->value;
641+
if (!new_name)
642+
/* known delete; ignore it */
643+
continue;
644+
645+
update_dir_rename_counts(info, dirs_removed, old_name, new_name);
646+
}
647+
636648
/*
637649
* Now we collapse
638650
* dir_rename_count: old_directory -> {new_directory -> count}
@@ -1247,7 +1259,8 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info,
12471259
void diffcore_rename_extended(struct diff_options *options,
12481260
struct strintmap *relevant_sources,
12491261
struct strintmap *dirs_removed,
1250-
struct strmap *dir_rename_count)
1262+
struct strmap *dir_rename_count,
1263+
struct strmap *cached_pairs)
12511264
{
12521265
int detect_rename = options->detect_rename;
12531266
int minimum_score = options->rename_score;
@@ -1363,7 +1376,8 @@ void diffcore_rename_extended(struct diff_options *options,
13631376
/* Preparation for basename-driven matching. */
13641377
trace2_region_enter("diff", "dir rename setup", options->repo);
13651378
initialize_dir_rename_info(&info, relevant_sources,
1366-
dirs_removed, dir_rename_count);
1379+
dirs_removed, dir_rename_count,
1380+
cached_pairs);
13671381
trace2_region_leave("diff", "dir rename setup", options->repo);
13681382

13691383
/* Utilize file basenames to quickly find renames. */
@@ -1561,5 +1575,5 @@ void diffcore_rename_extended(struct diff_options *options,
15611575

15621576
void diffcore_rename(struct diff_options *options)
15631577
{
1564-
diffcore_rename_extended(options, NULL, NULL, NULL);
1578+
diffcore_rename_extended(options, NULL, NULL, NULL, NULL);
15651579
}

diffcore.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ void diffcore_rename(struct diff_options *);
181181
void diffcore_rename_extended(struct diff_options *options,
182182
struct strintmap *relevant_sources,
183183
struct strintmap *dirs_removed,
184-
struct strmap *dir_rename_count);
184+
struct strmap *dir_rename_count,
185+
struct strmap *cached_pairs);
185186
void diffcore_merge_broken(void);
186187
void diffcore_pickaxe(struct diff_options *);
187188
void diffcore_order(const char *orderfile);

merge-ort.c

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -763,15 +763,48 @@ static void add_pair(struct merge_options *opt,
763763
struct rename_info *renames = &opt->priv->renames;
764764
int names_idx = is_add ? side : 0;
765765

766-
if (!is_add) {
766+
if (is_add) {
767+
if (strset_contains(&renames->cached_target_names[side],
768+
pathname))
769+
return;
770+
} else {
767771
unsigned content_relevant = (match_mask == 0);
768772
unsigned location_relevant = (dir_rename_mask == 0x07);
769773

774+
/*
775+
* If pathname is found in cached_irrelevant[side] due to
776+
* previous pick but for this commit content is relevant,
777+
* then we need to remove it from cached_irrelevant.
778+
*/
779+
if (content_relevant)
780+
/* strset_remove is no-op if strset doesn't have key */
781+
strset_remove(&renames->cached_irrelevant[side],
782+
pathname);
783+
784+
/*
785+
* We do not need to re-detect renames for paths that we already
786+
* know the pairing, i.e. for cached_pairs (or
787+
* cached_irrelevant). However, handle_deferred_entries() needs
788+
* to loop over the union of keys from relevant_sources[side] and
789+
* cached_pairs[side], so for simplicity we set relevant_sources
790+
* for all the cached_pairs too and then strip them back out in
791+
* prune_cached_from_relevant() at the beginning of
792+
* detect_regular_renames().
793+
*/
770794
if (content_relevant || location_relevant) {
771795
/* content_relevant trumps location_relevant */
772796
strintmap_set(&renames->relevant_sources[side], pathname,
773797
content_relevant ? RELEVANT_CONTENT : RELEVANT_LOCATION);
774798
}
799+
800+
/*
801+
* Avoid creating pair if we've already cached rename results.
802+
* Note that we do this after setting relevant_sources[side]
803+
* as noted in the comment above.
804+
*/
805+
if (strmap_contains(&renames->cached_pairs[side], pathname) ||
806+
strset_contains(&renames->cached_irrelevant[side], pathname))
807+
return;
775808
}
776809

777810
one = alloc_filespec(pathname);
@@ -2360,7 +2393,9 @@ static inline int possible_side_renames(struct rename_info *renames,
23602393
static inline int possible_renames(struct rename_info *renames)
23612394
{
23622395
return possible_side_renames(renames, 1) ||
2363-
possible_side_renames(renames, 2);
2396+
possible_side_renames(renames, 2) ||
2397+
!strmap_empty(&renames->cached_pairs[1]) ||
2398+
!strmap_empty(&renames->cached_pairs[2]);
23642399
}
23652400

23662401
static void resolve_diffpair_statuses(struct diff_queue_struct *q)
@@ -2384,7 +2419,6 @@ static void resolve_diffpair_statuses(struct diff_queue_struct *q)
23842419
}
23852420
}
23862421

2387-
MAYBE_UNUSED
23882422
static void prune_cached_from_relevant(struct rename_info *renames,
23892423
unsigned side)
23902424
{
@@ -2404,7 +2438,6 @@ static void prune_cached_from_relevant(struct rename_info *renames,
24042438
}
24052439
}
24062440

2407-
MAYBE_UNUSED
24082441
static void use_cached_pairs(struct merge_options *opt,
24092442
struct strmap *cached_pairs,
24102443
struct diff_queue_struct *pairs)
@@ -2507,6 +2540,7 @@ static void detect_regular_renames(struct merge_options *opt,
25072540
struct diff_options diff_opts;
25082541
struct rename_info *renames = &opt->priv->renames;
25092542

2543+
prune_cached_from_relevant(renames, side_index);
25102544
if (!possible_side_renames(renames, side_index)) {
25112545
/*
25122546
* No rename detection needed for this side, but we still need
@@ -2535,7 +2569,8 @@ static void detect_regular_renames(struct merge_options *opt,
25352569
diffcore_rename_extended(&diff_opts,
25362570
&renames->relevant_sources[side_index],
25372571
&renames->dirs_removed[side_index],
2538-
&renames->dir_rename_count[side_index]);
2572+
&renames->dir_rename_count[side_index],
2573+
&renames->cached_pairs[side_index]);
25392574
trace2_region_leave("diff", "diffcore_rename", opt->repo);
25402575
resolve_diffpair_statuses(&diff_queued_diff);
25412576

@@ -2643,6 +2678,8 @@ static int detect_and_process_renames(struct merge_options *opt,
26432678
trace2_region_enter("merge", "regular renames", opt->repo);
26442679
detect_regular_renames(opt, MERGE_SIDE1);
26452680
detect_regular_renames(opt, MERGE_SIDE2);
2681+
use_cached_pairs(opt, &renames->cached_pairs[1], &renames->pairs[1]);
2682+
use_cached_pairs(opt, &renames->cached_pairs[2], &renames->pairs[2]);
26462683
trace2_region_leave("merge", "regular renames", opt->repo);
26472684

26482685
trace2_region_enter("merge", "directory renames", opt->repo);

t/t6429-merge-sequence-rename-caching.sh

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,10 @@ test_expect_success 'caching renames does not preclude finding new ones' '
101101
# dramatic change in size of the file, but remembering the rename and
102102
# reusing it is reasonable too.
103103
#
104-
# Rename detection (diffcore_rename_extended()) will run twice here; it is
105-
# not needed on the topic side of history for either of the two commits
106-
# being merged, but it is needed on the upstream side of history for each
107-
# commit being picked.
104+
# We do test here that we expect rename detection to only be run once total
105+
# (the topic side of history doesn't need renames, and with caching we
106+
# should be able to only run rename detection on the upstream side one
107+
# time.)
108108
test_expect_success 'cherry-pick both a commit and its immediate revert' '
109109
test_create_repo pick-commit-and-its-immediate-revert &&
110110
(
@@ -140,11 +140,11 @@ test_expect_success 'cherry-pick both a commit and its immediate revert' '
140140
GIT_TRACE2_PERF="$(pwd)/trace.output" &&
141141
export GIT_TRACE2_PERF &&
142142
143-
test_might_fail test-tool fast-rebase --onto HEAD upstream~1 topic &&
143+
test-tool fast-rebase --onto HEAD upstream~1 topic &&
144144
#git cherry-pick upstream~1..topic &&
145145
146146
grep region_enter.*diffcore_rename trace.output >calls &&
147-
test_line_count = 2 calls
147+
test_line_count = 1 calls
148148
)
149149
'
150150

@@ -304,9 +304,11 @@ test_expect_success 'rename same file identically, then add file to old dir' '
304304
# Here we are just concerned that cached renames might prevent us from seeing
305305
# the rename conflict, and we want to ensure that we do get a conflict.
306306
#
307-
# While at it, also test that we do rename detection three times. We have to
308-
# detect renames on the upstream side of history once for each merge, plus
309-
# Topic_2 has renames.
307+
# While at it, though, we do test that we only try to detect renames 2
308+
# times and not three. (The first merge needs to detect renames on the
309+
# upstream side. Traditionally, the second merge would need to detect
310+
# renames on both sides of history, but our caching of upstream renames
311+
# should avoid the need to re-detect upstream renames.)
310312
#
311313
test_expect_success 'cached dir rename does not prevent noticing later conflict' '
312314
test_create_repo dir-rename-cache-not-occluding-later-conflict &&
@@ -357,7 +359,7 @@ test_expect_success 'cached dir rename does not prevent noticing later conflict'
357359
grep CONFLICT..rename/rename output &&
358360
359361
grep region_enter.*diffcore_rename trace.output >calls &&
360-
test_line_count = 3 calls
362+
test_line_count = 2 calls
361363
)
362364
'
363365

@@ -412,10 +414,17 @@ test_setup_upstream_rename () {
412414
# commit to mess up its location either. We want to make sure that
413415
# olddir/newfile doesn't exist in the result and that newdir/newfile does.
414416
#
415-
# We also expect rename detection to occur three times. Although it is
416-
# typically needed two times per commit, there are no deleted files on the
417-
# topic side of history, so we only need to detect renames on the upstream
418-
# side for each of the 3 commits we need to pick.
417+
# We also test that we only do rename detection twice. We never need
418+
# rename detection on the topic side of history, but we do need it twice on
419+
# the upstream side of history. For the first topic commit, we only need
420+
# the
421+
# relevant-rename -> renamed
422+
# rename, because olddir is unmodified by Topic_1. For Topic_2, however,
423+
# the new file being added to olddir means files that were previously
424+
# irrelevant for rename detection are now relevant, forcing us to repeat
425+
# rename detection for the paths we don't already have cached. Topic_3 also
426+
# tweaks olddir/newfile, but the renames in olddir/ will have been cached
427+
# from the second rename detection run.
419428
#
420429
test_expect_success 'dir rename unneeded, then add new file to old dir' '
421430
test_setup_upstream_rename dir-rename-unneeded-until-new-file &&
@@ -450,7 +459,7 @@ test_expect_success 'dir rename unneeded, then add new file to old dir' '
450459
#git cherry-pick upstream..topic &&
451460
452461
grep region_enter.*diffcore_rename trace.output >calls &&
453-
test_line_count = 3 calls &&
462+
test_line_count = 2 calls &&
454463
455464
git ls-files >tracked &&
456465
test_line_count = 5 tracked &&
@@ -516,7 +525,7 @@ test_expect_success 'dir rename unneeded, then rename existing file into old dir
516525
#git cherry-pick upstream..topic &&
517526
518527
grep region_enter.*diffcore_rename trace.output >calls &&
519-
test_line_count = 4 calls &&
528+
test_line_count = 3 calls &&
520529
521530
test_path_is_missing somefile &&
522531
test_path_is_missing olddir/newfile &&
@@ -648,9 +657,8 @@ test_expect_success 'caching renames only on upstream side, part 1' '
648657
# for the wrong side of history.
649658
#
650659
#
651-
# This testcase should only need three calls to diffcore_rename_extended(),
652-
# because there are no renames on the topic side of history for picking
653-
# Topic_2.
660+
# This testcase should only need two calls to diffcore_rename_extended(),
661+
# both for the first merge, one for each side of history.
654662
#
655663
test_expect_success 'caching renames only on upstream side, part 2' '
656664
test_setup_topic_rename cache-renames-only-upstream-rename-file &&
@@ -677,7 +685,7 @@ test_expect_success 'caching renames only on upstream side, part 2' '
677685
#git cherry-pick upstream..topic &&
678686
679687
grep region_enter.*diffcore_rename trace.output >calls &&
680-
test_line_count = 3 calls &&
688+
test_line_count = 2 calls &&
681689
682690
git ls-files >tracked &&
683691
test_line_count = 4 tracked &&

0 commit comments

Comments
 (0)