Skip to content

Commit e2e1a03

Browse files
committed
Merge branch 'en/ort-perf-batch-10'
Various rename detection optimization to help "ort" merge strategy backend. * en/ort-perf-batch-10: diffcore-rename: determine which relevant_sources are no longer relevant merge-ort: record the reason that we want a rename for a file diffcore-rename: add computation of number of unknown renames diffcore-rename: check if we have enough renames for directories early on diffcore-rename: only compute dir_rename_count for relevant directories merge-ort: record the reason that we want a rename for a directory merge-ort, diffcore-rename: tweak dirs_removed and relevant_source type diffcore-rename: take advantage of "majority rules" to skip more renames
2 parents d1b10fc + 9bd3421 commit e2e1a03

File tree

3 files changed

+281
-47
lines changed

3 files changed

+281
-47
lines changed

diffcore-rename.c

Lines changed: 204 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ struct dir_rename_info {
371371
struct strintmap idx_map;
372372
struct strmap dir_rename_guess;
373373
struct strmap *dir_rename_count;
374-
struct strset *relevant_source_dirs;
374+
struct strintmap *relevant_source_dirs;
375375
unsigned setup;
376376
};
377377

@@ -407,6 +407,28 @@ static const char *get_highest_rename_path(struct strintmap *counts)
407407
return highest_destination_dir;
408408
}
409409

410+
static char *UNKNOWN_DIR = "/"; /* placeholder -- short, illegal directory */
411+
412+
static int dir_rename_already_determinable(struct strintmap *counts)
413+
{
414+
struct hashmap_iter iter;
415+
struct strmap_entry *entry;
416+
int first = 0, second = 0, unknown = 0;
417+
strintmap_for_each_entry(counts, &iter, entry) {
418+
const char *destination_dir = entry->key;
419+
intptr_t count = (intptr_t)entry->value;
420+
if (!strcmp(destination_dir, UNKNOWN_DIR)) {
421+
unknown = count;
422+
} else if (count >= first) {
423+
second = first;
424+
first = count;
425+
} else if (count >= second) {
426+
second = count;
427+
}
428+
}
429+
return first > second + unknown;
430+
}
431+
410432
static void increment_count(struct dir_rename_info *info,
411433
char *old_dir,
412434
char *new_dir)
@@ -429,7 +451,7 @@ static void increment_count(struct dir_rename_info *info,
429451
}
430452

431453
static void update_dir_rename_counts(struct dir_rename_info *info,
432-
struct strset *dirs_removed,
454+
struct strintmap *dirs_removed,
433455
const char *oldname,
434456
const char *newname)
435457
{
@@ -461,10 +483,12 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
461483
return;
462484

463485
while (1) {
486+
int drd_flag = NOT_RELEVANT;
487+
464488
/* Get old_dir, skip if its directory isn't relevant. */
465489
dirname_munge(old_dir);
466490
if (info->relevant_source_dirs &&
467-
!strset_contains(info->relevant_source_dirs, old_dir))
491+
!strintmap_contains(info->relevant_source_dirs, old_dir))
468492
break;
469493

470494
/* Get new_dir */
@@ -509,16 +533,31 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
509533
}
510534
}
511535

512-
if (strset_contains(dirs_removed, old_dir))
536+
/*
537+
* Above we suggested that we'd keep recording renames for
538+
* all ancestor directories where the trailing directories
539+
* matched, i.e. for
540+
* "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
541+
* we'd increment rename counts for each of
542+
* a/b/c/d/e/ => a/b/some/thing/else/e/
543+
* a/b/c/d/ => a/b/some/thing/else/
544+
* However, we only need the rename counts for directories
545+
* in dirs_removed whose value is RELEVANT_FOR_SELF.
546+
* However, we add one special case of also recording it for
547+
* first_time_in_loop because find_basename_matches() can
548+
* use that as a hint to find a good pairing.
549+
*/
550+
if (dirs_removed)
551+
drd_flag = strintmap_get(dirs_removed, old_dir);
552+
if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop)
513553
increment_count(info, old_dir, new_dir);
514-
else
515-
break;
516554

555+
first_time_in_loop = 0;
556+
if (drd_flag == NOT_RELEVANT)
557+
break;
517558
/* If we hit toplevel directory ("") for old or new dir, quit */
518559
if (!*old_dir || !*new_dir)
519560
break;
520-
521-
first_time_in_loop = 0;
522561
}
523562

524563
/* Free resources we don't need anymore */
@@ -527,8 +566,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
527566
}
528567

529568
static void initialize_dir_rename_info(struct dir_rename_info *info,
530-
struct strset *relevant_sources,
531-
struct strset *dirs_removed,
569+
struct strintmap *relevant_sources,
570+
struct strintmap *dirs_removed,
532571
struct strmap *dir_rename_count)
533572
{
534573
struct hashmap_iter iter;
@@ -555,12 +594,13 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
555594
info->relevant_source_dirs = dirs_removed; /* might be NULL */
556595
} else {
557596
info->relevant_source_dirs = xmalloc(sizeof(struct strintmap));
558-
strset_init(info->relevant_source_dirs);
559-
strset_for_each_entry(relevant_sources, &iter, entry) {
597+
strintmap_init(info->relevant_source_dirs, 0 /* unused */);
598+
strintmap_for_each_entry(relevant_sources, &iter, entry) {
560599
char *dirname = get_dirname(entry->key);
561600
if (!dirs_removed ||
562-
strset_contains(dirs_removed, dirname))
563-
strset_add(info->relevant_source_dirs, dirname);
601+
strintmap_contains(dirs_removed, dirname))
602+
strintmap_set(info->relevant_source_dirs,
603+
dirname, 0 /* value irrelevant */);
564604
free(dirname);
565605
}
566606
}
@@ -624,7 +664,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
624664
}
625665

626666
static void cleanup_dir_rename_info(struct dir_rename_info *info,
627-
struct strset *dirs_removed,
667+
struct strintmap *dirs_removed,
628668
int keep_dir_rename_count)
629669
{
630670
struct hashmap_iter iter;
@@ -644,7 +684,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
644684
/* relevant_source_dirs */
645685
if (info->relevant_source_dirs &&
646686
info->relevant_source_dirs != dirs_removed) {
647-
strset_clear(info->relevant_source_dirs);
687+
strintmap_clear(info->relevant_source_dirs);
648688
FREE_AND_NULL(info->relevant_source_dirs);
649689
}
650690

@@ -659,18 +699,22 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
659699
/*
660700
* Although dir_rename_count was passed in
661701
* diffcore_rename_extended() and we want to keep it around and
662-
* return it to that caller, we first want to remove any data
702+
* return it to that caller, we first want to remove any counts in
703+
* the maps associated with UNKNOWN_DIR entries and any data
663704
* associated with directories that weren't renamed.
664705
*/
665706
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
666707
const char *source_dir = entry->key;
667708
struct strintmap *counts = entry->value;
668709

669-
if (!strset_contains(dirs_removed, source_dir)) {
710+
if (!strintmap_get(dirs_removed, source_dir)) {
670711
string_list_append(&to_remove, source_dir);
671712
strintmap_clear(counts);
672713
continue;
673714
}
715+
716+
if (strintmap_contains(counts, UNKNOWN_DIR))
717+
strintmap_remove(counts, UNKNOWN_DIR);
674718
}
675719
for (i = 0; i < to_remove.nr; ++i)
676720
strmap_remove(info->dir_rename_count,
@@ -770,8 +814,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info)
770814
static int find_basename_matches(struct diff_options *options,
771815
int minimum_score,
772816
struct dir_rename_info *info,
773-
struct strset *relevant_sources,
774-
struct strset *dirs_removed)
817+
struct strintmap *relevant_sources,
818+
struct strintmap *dirs_removed)
775819
{
776820
/*
777821
* When I checked in early 2020, over 76% of file renames in linux
@@ -863,7 +907,7 @@ static int find_basename_matches(struct diff_options *options,
863907

864908
/* Skip irrelevant sources */
865909
if (relevant_sources &&
866-
!strset_contains(relevant_sources, filename))
910+
!strintmap_contains(relevant_sources, filename))
867911
continue;
868912

869913
/*
@@ -994,7 +1038,7 @@ static int find_renames(struct diff_score *mx,
9941038
int minimum_score,
9951039
int copies,
9961040
struct dir_rename_info *info,
997-
struct strset *dirs_removed)
1041+
struct strintmap *dirs_removed)
9981042
{
9991043
int count = 0, i;
10001044

@@ -1019,7 +1063,7 @@ static int find_renames(struct diff_score *mx,
10191063
}
10201064

10211065
static void remove_unneeded_paths_from_src(int detecting_copies,
1022-
struct strset *interesting)
1066+
struct strintmap *interesting)
10231067
{
10241068
int i, new_num_src;
10251069

@@ -1061,7 +1105,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
10611105
continue;
10621106

10631107
/* If we don't care about the source path, skip it */
1064-
if (interesting && !strset_contains(interesting, one->path))
1108+
if (interesting && !strintmap_contains(interesting, one->path))
10651109
continue;
10661110

10671111
if (new_num_src < i)
@@ -1073,9 +1117,136 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
10731117
rename_src_nr = new_num_src;
10741118
}
10751119

1120+
static void handle_early_known_dir_renames(struct dir_rename_info *info,
1121+
struct strintmap *relevant_sources,
1122+
struct strintmap *dirs_removed)
1123+
{
1124+
/*
1125+
* Directory renames are determined via an aggregate of all renames
1126+
* under them and using a "majority wins" rule. The fact that
1127+
* "majority wins", though, means we don't need all the renames
1128+
* under the given directory, we only need enough to ensure we have
1129+
* a majority.
1130+
*/
1131+
1132+
int i, new_num_src;
1133+
struct hashmap_iter iter;
1134+
struct strmap_entry *entry;
1135+
1136+
if (!dirs_removed || !relevant_sources)
1137+
return; /* nothing to cull */
1138+
if (break_idx)
1139+
return; /* culling incompatbile with break detection */
1140+
1141+
/*
1142+
* Supplement dir_rename_count with number of potential renames,
1143+
* marking all potential rename sources as mapping to UNKNOWN_DIR.
1144+
*/
1145+
for (i = 0; i < rename_src_nr; i++) {
1146+
char *old_dir;
1147+
struct diff_filespec *one = rename_src[i].p->one;
1148+
1149+
/*
1150+
* sources that are part of a rename will have already been
1151+
* removed by a prior call to remove_unneeded_paths_from_src()
1152+
*/
1153+
assert(!one->rename_used);
1154+
1155+
old_dir = get_dirname(one->path);
1156+
while (*old_dir != '\0' &&
1157+
NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) {
1158+
char *freeme = old_dir;
1159+
1160+
increment_count(info, old_dir, UNKNOWN_DIR);
1161+
old_dir = get_dirname(old_dir);
1162+
1163+
/* Free resources we don't need anymore */
1164+
free(freeme);
1165+
}
1166+
/*
1167+
* old_dir and new_dir free'd in increment_count, but
1168+
* get_dirname() gives us a new pointer we need to free for
1169+
* old_dir. Also, if the loop runs 0 times we need old_dir
1170+
* to be freed.
1171+
*/
1172+
free(old_dir);
1173+
}
1174+
1175+
/*
1176+
* For any directory which we need a potential rename detected for
1177+
* (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check
1178+
* whether we have enough renames to satisfy the "majority rules"
1179+
* requirement such that detecting any more renames of files under
1180+
* it won't change the result. For any such directory, mark that
1181+
* we no longer need to detect a rename for it. However, since we
1182+
* might need to still detect renames for an ancestor of that
1183+
* directory, use RELEVANT_FOR_ANCESTOR.
1184+
*/
1185+
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
1186+
/* entry->key is source_dir */
1187+
struct strintmap *counts = entry->value;
1188+
1189+
if (strintmap_get(dirs_removed, entry->key) ==
1190+
RELEVANT_FOR_SELF &&
1191+
dir_rename_already_determinable(counts)) {
1192+
strintmap_set(dirs_removed, entry->key,
1193+
RELEVANT_FOR_ANCESTOR);
1194+
}
1195+
}
1196+
1197+
for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
1198+
struct diff_filespec *one = rename_src[i].p->one;
1199+
int val;
1200+
1201+
val = strintmap_get(relevant_sources, one->path);
1202+
1203+
/*
1204+
* sources that were not found in relevant_sources should
1205+
* have already been removed by a prior call to
1206+
* remove_unneeded_paths_from_src()
1207+
*/
1208+
assert(val != -1);
1209+
1210+
if (val == RELEVANT_LOCATION) {
1211+
int removable = 1;
1212+
char *dir = get_dirname(one->path);
1213+
while (1) {
1214+
char *freeme = dir;
1215+
int res = strintmap_get(dirs_removed, dir);
1216+
1217+
/* Quit if not found or irrelevant */
1218+
if (res == NOT_RELEVANT)
1219+
break;
1220+
/* If RELEVANT_FOR_SELF, can't remove */
1221+
if (res == RELEVANT_FOR_SELF) {
1222+
removable = 0;
1223+
break;
1224+
}
1225+
/* Else continue searching upwards */
1226+
assert(res == RELEVANT_FOR_ANCESTOR);
1227+
dir = get_dirname(dir);
1228+
free(freeme);
1229+
}
1230+
free(dir);
1231+
if (removable) {
1232+
strintmap_set(relevant_sources, one->path,
1233+
RELEVANT_NO_MORE);
1234+
continue;
1235+
}
1236+
}
1237+
1238+
if (new_num_src < i)
1239+
memcpy(&rename_src[new_num_src], &rename_src[i],
1240+
sizeof(struct diff_rename_src));
1241+
new_num_src++;
1242+
}
1243+
1244+
rename_src_nr = new_num_src;
1245+
}
1246+
10761247
void diffcore_rename_extended(struct diff_options *options,
1077-
struct strset *relevant_sources,
1078-
struct strset *dirs_removed,
1248+
struct strintmap *relevant_sources,
1249+
struct strintmap *dirs_removed,
10791250
struct strmap *dir_rename_count)
10801251
{
10811252
int detect_rename = options->detect_rename;
@@ -1208,9 +1379,16 @@ void diffcore_rename_extended(struct diff_options *options,
12081379
* Cull sources, again:
12091380
* - remove ones involved in renames (found via basenames)
12101381
* - remove ones not found in relevant_sources
1382+
* and
1383+
* - remove ones in relevant_sources which are needed only
1384+
* for directory renames IF no ancestory directory
1385+
* actually needs to know any more individual path
1386+
* renames under them
12111387
*/
12121388
trace2_region_enter("diff", "cull basename", options->repo);
12131389
remove_unneeded_paths_from_src(want_copies, relevant_sources);
1390+
handle_early_known_dir_renames(&info, relevant_sources,
1391+
dirs_removed);
12141392
trace2_region_leave("diff", "cull basename", options->repo);
12151393
}
12161394

0 commit comments

Comments
 (0)