@@ -371,7 +371,7 @@ struct dir_rename_info {
371
371
struct strintmap idx_map ;
372
372
struct strmap dir_rename_guess ;
373
373
struct strmap * dir_rename_count ;
374
- struct strset * relevant_source_dirs ;
374
+ struct strintmap * relevant_source_dirs ;
375
375
unsigned setup ;
376
376
};
377
377
@@ -407,6 +407,28 @@ static const char *get_highest_rename_path(struct strintmap *counts)
407
407
return highest_destination_dir ;
408
408
}
409
409
410
+ static char * UNKNOWN_DIR = "/" ; /* placeholder -- short, illegal directory */
411
+
412
+ static int dir_rename_already_determinable (struct strintmap * counts )
413
+ {
414
+ struct hashmap_iter iter ;
415
+ struct strmap_entry * entry ;
416
+ int first = 0 , second = 0 , unknown = 0 ;
417
+ strintmap_for_each_entry (counts , & iter , entry ) {
418
+ const char * destination_dir = entry -> key ;
419
+ intptr_t count = (intptr_t )entry -> value ;
420
+ if (!strcmp (destination_dir , UNKNOWN_DIR )) {
421
+ unknown = count ;
422
+ } else if (count >= first ) {
423
+ second = first ;
424
+ first = count ;
425
+ } else if (count >= second ) {
426
+ second = count ;
427
+ }
428
+ }
429
+ return first > second + unknown ;
430
+ }
431
+
410
432
static void increment_count (struct dir_rename_info * info ,
411
433
char * old_dir ,
412
434
char * new_dir )
@@ -429,7 +451,7 @@ static void increment_count(struct dir_rename_info *info,
429
451
}
430
452
431
453
static void update_dir_rename_counts (struct dir_rename_info * info ,
432
- struct strset * dirs_removed ,
454
+ struct strintmap * dirs_removed ,
433
455
const char * oldname ,
434
456
const char * newname )
435
457
{
@@ -461,10 +483,12 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
461
483
return ;
462
484
463
485
while (1 ) {
486
+ int drd_flag = NOT_RELEVANT ;
487
+
464
488
/* Get old_dir, skip if its directory isn't relevant. */
465
489
dirname_munge (old_dir );
466
490
if (info -> relevant_source_dirs &&
467
- !strset_contains (info -> relevant_source_dirs , old_dir ))
491
+ !strintmap_contains (info -> relevant_source_dirs , old_dir ))
468
492
break ;
469
493
470
494
/* Get new_dir */
@@ -509,16 +533,31 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
509
533
}
510
534
}
511
535
512
- if (strset_contains (dirs_removed , old_dir ))
536
+ /*
537
+ * Above we suggested that we'd keep recording renames for
538
+ * all ancestor directories where the trailing directories
539
+ * matched, i.e. for
540
+ * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
541
+ * we'd increment rename counts for each of
542
+ * a/b/c/d/e/ => a/b/some/thing/else/e/
543
+ * a/b/c/d/ => a/b/some/thing/else/
544
+ * However, we only need the rename counts for directories
545
+ * in dirs_removed whose value is RELEVANT_FOR_SELF.
546
+ * As a special case, we also record it for
547
+ * first_time_in_loop because find_basename_matches() can
548
+ * use that as a hint to find a good pairing.
549
+ */
550
+ if (dirs_removed )
551
+ drd_flag = strintmap_get (dirs_removed , old_dir );
552
+ if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop )
513
553
increment_count (info , old_dir , new_dir );
514
- else
515
- break ;
516
554
555
+ first_time_in_loop = 0 ;
556
+ if (drd_flag == NOT_RELEVANT )
557
+ break ;
517
558
/* If we hit toplevel directory ("") for old or new dir, quit */
518
559
if (!* old_dir || !* new_dir )
519
560
break ;
520
-
521
- first_time_in_loop = 0 ;
522
561
}
523
562
524
563
/* Free resources we don't need anymore */
@@ -527,8 +566,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
527
566
}
528
567
529
568
static void initialize_dir_rename_info (struct dir_rename_info * info ,
530
- struct strset * relevant_sources ,
531
- struct strset * dirs_removed ,
569
+ struct strintmap * relevant_sources ,
570
+ struct strintmap * dirs_removed ,
532
571
struct strmap * dir_rename_count )
533
572
{
534
573
struct hashmap_iter iter ;
@@ -555,12 +594,13 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
555
594
info -> relevant_source_dirs = dirs_removed ; /* might be NULL */
556
595
} else {
557
596
info -> relevant_source_dirs = xmalloc (sizeof (struct strintmap ));
558
- strset_init (info -> relevant_source_dirs );
559
- strset_for_each_entry (relevant_sources , & iter , entry ) {
597
+ strintmap_init (info -> relevant_source_dirs , 0 /* unused */ );
598
+ strintmap_for_each_entry (relevant_sources , & iter , entry ) {
560
599
char * dirname = get_dirname (entry -> key );
561
600
if (!dirs_removed ||
562
- strset_contains (dirs_removed , dirname ))
563
- strset_add (info -> relevant_source_dirs , dirname );
601
+ strintmap_contains (dirs_removed , dirname ))
602
+ strintmap_set (info -> relevant_source_dirs ,
603
+ dirname , 0 /* value irrelevant */ );
564
604
free (dirname );
565
605
}
566
606
}
@@ -624,7 +664,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
624
664
}
625
665
626
666
static void cleanup_dir_rename_info (struct dir_rename_info * info ,
627
- struct strset * dirs_removed ,
667
+ struct strintmap * dirs_removed ,
628
668
int keep_dir_rename_count )
629
669
{
630
670
struct hashmap_iter iter ;
@@ -644,7 +684,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
644
684
/* relevant_source_dirs */
645
685
if (info -> relevant_source_dirs &&
646
686
info -> relevant_source_dirs != dirs_removed ) {
647
- strset_clear (info -> relevant_source_dirs );
687
+ strintmap_clear (info -> relevant_source_dirs );
648
688
FREE_AND_NULL (info -> relevant_source_dirs );
649
689
}
650
690
@@ -659,18 +699,22 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
659
699
/*
660
700
* Although dir_rename_count was passed in
661
701
* diffcore_rename_extended() and we want to keep it around and
662
- * return it to that caller, we first want to remove any data
702
+ * return it to that caller, we first want to remove any counts in
703
+ * the maps associated with UNKNOWN_DIR entries and any data
663
704
* associated with directories that weren't renamed.
664
705
*/
665
706
strmap_for_each_entry (info -> dir_rename_count , & iter , entry ) {
666
707
const char * source_dir = entry -> key ;
667
708
struct strintmap * counts = entry -> value ;
668
709
669
- if (!strset_contains (dirs_removed , source_dir )) {
710
+ if (!strintmap_get (dirs_removed , source_dir )) {
670
711
string_list_append (& to_remove , source_dir );
671
712
strintmap_clear (counts );
672
713
continue ;
673
714
}
715
+
716
+ if (strintmap_contains (counts , UNKNOWN_DIR ))
717
+ strintmap_remove (counts , UNKNOWN_DIR );
674
718
}
675
719
for (i = 0 ; i < to_remove .nr ; ++ i )
676
720
strmap_remove (info -> dir_rename_count ,
@@ -770,8 +814,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info)
770
814
static int find_basename_matches (struct diff_options * options ,
771
815
int minimum_score ,
772
816
struct dir_rename_info * info ,
773
- struct strset * relevant_sources ,
774
- struct strset * dirs_removed )
817
+ struct strintmap * relevant_sources ,
818
+ struct strintmap * dirs_removed )
775
819
{
776
820
/*
777
821
* When I checked in early 2020, over 76% of file renames in linux
@@ -863,7 +907,7 @@ static int find_basename_matches(struct diff_options *options,
863
907
864
908
/* Skip irrelevant sources */
865
909
if (relevant_sources &&
866
- !strset_contains (relevant_sources , filename ))
910
+ !strintmap_contains (relevant_sources , filename ))
867
911
continue ;
868
912
869
913
/*
@@ -994,7 +1038,7 @@ static int find_renames(struct diff_score *mx,
994
1038
int minimum_score ,
995
1039
int copies ,
996
1040
struct dir_rename_info * info ,
997
- struct strset * dirs_removed )
1041
+ struct strintmap * dirs_removed )
998
1042
{
999
1043
int count = 0 , i ;
1000
1044
@@ -1019,7 +1063,7 @@ static int find_renames(struct diff_score *mx,
1019
1063
}
1020
1064
1021
1065
static void remove_unneeded_paths_from_src (int detecting_copies ,
1022
- struct strset * interesting )
1066
+ struct strintmap * interesting )
1023
1067
{
1024
1068
int i , new_num_src ;
1025
1069
@@ -1061,7 +1105,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
1061
1105
continue ;
1062
1106
1063
1107
/* If we don't care about the source path, skip it */
1064
- if (interesting && !strset_contains (interesting , one -> path ))
1108
+ if (interesting && !strintmap_contains (interesting , one -> path ))
1065
1109
continue ;
1066
1110
1067
1111
if (new_num_src < i )
@@ -1073,9 +1117,136 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
1073
1117
rename_src_nr = new_num_src ;
1074
1118
}
1075
1119
1120
+ static void handle_early_known_dir_renames (struct dir_rename_info * info ,
1121
+ struct strintmap * relevant_sources ,
1122
+ struct strintmap * dirs_removed )
1123
+ {
1124
+ /*
1125
+ * Directory renames are determined via an aggregate of all renames
1126
+ * under them and using a "majority wins" rule. The fact that
1127
+ * "majority wins", though, means we don't need all the renames
1128
+ * under the given directory, we only need enough to ensure we have
1129
+ * a majority.
1130
+ */
1131
+
1132
+ int i , new_num_src ;
1133
+ struct hashmap_iter iter ;
1134
+ struct strmap_entry * entry ;
1135
+
1136
+ if (!dirs_removed || !relevant_sources )
1137
+ return ; /* nothing to cull */
1138
+ if (break_idx )
1139
+ return ; /* culling incompatible with break detection */
1140
+
1141
+ /*
1142
+ * Supplement dir_rename_count with number of potential renames,
1143
+ * marking all potential rename sources as mapping to UNKNOWN_DIR.
1144
+ */
1145
+ for (i = 0 ; i < rename_src_nr ; i ++ ) {
1146
+ char * old_dir ;
1147
+ struct diff_filespec * one = rename_src [i ].p -> one ;
1148
+
1149
+ /*
1150
+ * sources that are part of a rename will have already been
1151
+ * removed by a prior call to remove_unneeded_paths_from_src()
1152
+ */
1153
+ assert (!one -> rename_used );
1154
+
1155
+ old_dir = get_dirname (one -> path );
1156
+ while (* old_dir != '\0' &&
1157
+ NOT_RELEVANT != strintmap_get (dirs_removed , old_dir )) {
1158
+ char * freeme = old_dir ;
1159
+
1160
+ increment_count (info , old_dir , UNKNOWN_DIR );
1161
+ old_dir = get_dirname (old_dir );
1162
+
1163
+ /* Free resources we don't need anymore */
1164
+ free (freeme );
1165
+ }
1166
+ /*
1167
+ * old_dir and new_dir free'd in increment_count, but
1168
+ * get_dirname() gives us a new pointer we need to free for
1169
+ * old_dir. Also, if the loop runs 0 times we need old_dir
1170
+ * to be freed.
1171
+ */
1172
+ free (old_dir );
1173
+ }
1174
+
1175
+ /*
1176
+ * For any directory which we need a potential rename detected for
1177
+ * (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check
1178
+ * whether we have enough renames to satisfy the "majority rules"
1179
+ * requirement such that detecting any more renames of files under
1180
+ * it won't change the result. For any such directory, mark that
1181
+ * we no longer need to detect a rename for it. However, since we
1182
+ * might need to still detect renames for an ancestor of that
1183
+ * directory, use RELEVANT_FOR_ANCESTOR.
1184
+ */
1185
+ strmap_for_each_entry (info -> dir_rename_count , & iter , entry ) {
1186
+ /* entry->key is source_dir */
1187
+ struct strintmap * counts = entry -> value ;
1188
+
1189
+ if (strintmap_get (dirs_removed , entry -> key ) ==
1190
+ RELEVANT_FOR_SELF &&
1191
+ dir_rename_already_determinable (counts )) {
1192
+ strintmap_set (dirs_removed , entry -> key ,
1193
+ RELEVANT_FOR_ANCESTOR );
1194
+ }
1195
+ }
1196
+
1197
+ for (i = 0 , new_num_src = 0 ; i < rename_src_nr ; i ++ ) {
1198
+ struct diff_filespec * one = rename_src [i ].p -> one ;
1199
+ int val ;
1200
+
1201
+ val = strintmap_get (relevant_sources , one -> path );
1202
+
1203
+ /*
1204
+ * sources that were not found in relevant_sources should
1205
+ * have already been removed by a prior call to
1206
+ * remove_unneeded_paths_from_src()
1207
+ */
1208
+ assert (val != -1 );
1209
+
1210
+ if (val == RELEVANT_LOCATION ) {
1211
+ int removable = 1 ;
1212
+ char * dir = get_dirname (one -> path );
1213
+ while (1 ) {
1214
+ char * freeme = dir ;
1215
+ int res = strintmap_get (dirs_removed , dir );
1216
+
1217
+ /* Quit if not found or irrelevant */
1218
+ if (res == NOT_RELEVANT )
1219
+ break ;
1220
+ /* If RELEVANT_FOR_SELF, can't remove */
1221
+ if (res == RELEVANT_FOR_SELF ) {
1222
+ removable = 0 ;
1223
+ break ;
1224
+ }
1225
+ /* Else continue searching upwards */
1226
+ assert (res == RELEVANT_FOR_ANCESTOR );
1227
+ dir = get_dirname (dir );
1228
+ free (freeme );
1229
+ }
1230
+ free (dir );
1231
+ if (removable ) {
1232
+ strintmap_set (relevant_sources , one -> path ,
1233
+ RELEVANT_NO_MORE );
1234
+ continue ;
1235
+ }
1236
+ }
1237
+
1238
+ if (new_num_src < i )
1239
+ memcpy (& rename_src [new_num_src ], & rename_src [i ],
1240
+ sizeof (struct diff_rename_src ));
1241
+ new_num_src ++ ;
1242
+ }
1243
+
1244
+ rename_src_nr = new_num_src ;
1245
+ }
1246
+
1076
1247
void diffcore_rename_extended (struct diff_options * options ,
1077
- struct strset * relevant_sources ,
1078
- struct strset * dirs_removed ,
1248
+ struct strintmap * relevant_sources ,
1249
+ struct strintmap * dirs_removed ,
1079
1250
struct strmap * dir_rename_count )
1080
1251
{
1081
1252
int detect_rename = options -> detect_rename ;
@@ -1208,9 +1379,16 @@ void diffcore_rename_extended(struct diff_options *options,
1208
1379
* Cull sources, again:
1209
1380
* - remove ones involved in renames (found via basenames)
1210
1381
* - remove ones not found in relevant_sources
1382
+ * and
1383
+ * - remove ones in relevant_sources which are needed only
1384
+ * for directory renames IF no ancestor directory
1385
+ * actually needs to know any more individual path
1386
+ * renames under them
1211
1387
*/
1212
1388
trace2_region_enter ("diff" , "cull basename" , options -> repo );
1213
1389
remove_unneeded_paths_from_src (want_copies , relevant_sources );
1390
+ handle_early_known_dir_renames (& info , relevant_sources ,
1391
+ dirs_removed );
1214
1392
trace2_region_leave ("diff" , "cull basename" , options -> repo );
1215
1393
}
1216
1394
0 commit comments