@@ -27,27 +27,51 @@ use mononoke_types::ChangesetId;
 use mononoke_types::ContentId;
 use mononoke_types::MPath;
 use mononoke_types::NonRootMPath;
+use mononoke_types::fsnode::FsnodeFile;
 use mononoke_types::inferred_copy_from::InferredCopyFrom;
 use mononoke_types::inferred_copy_from::InferredCopyFromEntry;
 use vec1::Vec1;
 
 const BASENAME_MATCH_MAX_CANDIDATES: usize = 10_000;
 
+struct CopyFromCandidate {
+    cs_id: ChangesetId,
+    path: MPath,
+    #[allow(unused)]
+    fsnode: FsnodeFile,
+}
+
 // It's possible to have multiple source files that match,
 // pick the one with the smallest path
-fn pick_source_from_candidates(
-    candidates: &[(ChangesetId, MPath)],
-) -> Option<&(ChangesetId, MPath)> {
-    candidates.iter().min_by_key(|(_, mpath)| mpath.clone())
+fn pick_source_from_candidates(candidates: &[CopyFromCandidate]) -> &CopyFromCandidate {
+    candidates
+        .iter()
+        .min_by_key(|c| c.path.clone())
+        .unwrap_or_else(|| panic!("There should be at least one candidate"))
+}
+
+fn flatten_candidates(
+    maps: Vec<HashMap<ContentId, Vec<CopyFromCandidate>>>,
+) -> HashMap<ContentId, Vec<CopyFromCandidate>> {
+    let mut merged = HashMap::new();
+    for map in maps {
+        for (content_id, candidates) in map {
+            merged
+                .entry(content_id)
+                .or_insert(vec![])
+                .extend(candidates)
+        }
+    }
+    merged
 }
 
-async fn get_content_to_paths_from_changeset(
+async fn get_candidates_from_changeset(
     ctx: &CoreContext,
     derivation_ctx: &DerivationContext,
     cs_id: ChangesetId,
     paths: Vec<NonRootMPath>,
-) -> Result<HashMap<ContentId, Vec<(ChangesetId, MPath)>>> {
-    let mut content_to_paths = HashMap::new();
+) -> Result<HashMap<ContentId, Vec<CopyFromCandidate>>> {
+    let mut content_to_candidates = HashMap::new();
 
     let entries = derivation_ctx
         .fetch_dependency::<RootFsnodeId>(ctx, cs_id)
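
To make the intent of the new helpers concrete, here is a minimal, self-contained sketch of the same logic using plain `u64`/`u32`/`String` stand-ins for Mononoke's `ContentId`, `ChangesetId`, and `MPath` (the names `Candidate`, `flatten`, and `pick` are illustrative, not the real API): merge the per-parent candidate maps into one, then break ties deterministically by taking the lexicographically smallest path.

```rust
use std::collections::HashMap;

// Simplified stand-in for CopyFromCandidate.
#[derive(Debug, Clone)]
struct Candidate {
    cs_id: u32,
    path: String,
}

// Merge several per-parent maps of content hash -> candidates into one map,
// keeping every candidate (mirrors `flatten_candidates`).
fn flatten(maps: Vec<HashMap<u64, Vec<Candidate>>>) -> HashMap<u64, Vec<Candidate>> {
    let mut merged: HashMap<u64, Vec<Candidate>> = HashMap::new();
    for map in maps {
        for (content_id, candidates) in map {
            merged.entry(content_id).or_default().extend(candidates);
        }
    }
    merged
}

// Deterministically pick one source among multiple matches: the smallest path
// (mirrors `pick_source_from_candidates`; panics on an empty slice).
fn pick(candidates: &[Candidate]) -> &Candidate {
    candidates
        .iter()
        .min_by_key(|c| c.path.clone())
        .expect("There should be at least one candidate")
}

fn main() {
    let parent_a = HashMap::from([(
        42u64,
        vec![Candidate { cs_id: 1, path: "b/file.txt".to_string() }],
    )]);
    let parent_b = HashMap::from([(
        42u64,
        vec![Candidate { cs_id: 2, path: "a/file.txt".to_string() }],
    )]);

    let merged = flatten(vec![parent_a, parent_b]);
    let from = pick(&merged[&42]);
    // Prints "a/file.txt" -- the lexicographically smallest candidate path.
    println!("{}", from.path);
}
```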
@@ -58,14 +82,18 @@ async fn get_content_to_paths_from_changeset(
         .await?;
 
     for (path, entry) in entries {
-        if let Some(content_id) = entry.into_leaf().map(|f| f.content_id().clone()) {
-            content_to_paths
-                .entry(content_id)
+        if let Some(fsnode) = entry.into_leaf() {
+            content_to_candidates
+                .entry(fsnode.content_id().clone())
                 .or_insert(vec![])
-                .push((cs_id, path));
+                .push(CopyFromCandidate {
+                    cs_id,
+                    path,
+                    fsnode,
+                });
         }
     }
-    Ok(content_to_paths)
+    Ok(content_to_candidates)
 }
 
 async fn get_matched_paths_by_basenames_from_changeset(
@@ -91,11 +119,18 @@ async fn get_matched_paths_by_basenames_from_changeset(
 // Find exact renames by comparing the content of deleted vs new/changed files
 // in the current changeset. If they have the same content, the path pair is
 // a rename.
+//
+// Return a list of inferred renames and the remaining candidates we gathered that
+// failed the exact match check. They will be reconsidered for partial content
+// matching later.
 async fn find_exact_renames(
     ctx: &CoreContext,
     derivation_ctx: &DerivationContext,
     bonsai: &BonsaiChangeset,
-) -> Result<Vec<(MPath, InferredCopyFromEntry)>> {
+) -> Result<(
+    Vec<(MPath, InferredCopyFromEntry)>,
+    HashMap<ContentId, Vec<CopyFromCandidate>>,
+)> {
     let mut content_to_paths = HashMap::new();
     for (path, file_change) in bonsai.simplified_file_changes() {
         if let Some(fc) = file_change {
@@ -116,10 +151,10 @@ async fn find_exact_renames(
             }
         })
         .collect::<Vec<_>>();
-    let content_to_deleted_paths = try_join_all(bonsai.parents().map(|parent_cs_id| {
+    let content_to_candidates_vec = try_join_all(bonsai.parents().map(|parent_cs_id| {
         cloned!(deleted_paths);
         async move {
-            get_content_to_paths_from_changeset(ctx, derivation_ctx, parent_cs_id, deleted_paths)
+            get_candidates_from_changeset(ctx, derivation_ctx, parent_cs_id, deleted_paths)
                 .await
                 .with_context(|| {
                     format!(
@@ -129,38 +164,49 @@ async fn find_exact_renames(
                 })
         }
     }))
-    .await?
-    .into_iter()
-    .flatten()
-    .collect::<HashMap<_, _>>();
+    .await?;
+    let mut content_to_candidates = flatten_candidates(content_to_candidates_vec);
 
     let mut renames = vec![];
-    for (content_id, paths) in content_to_paths {
-        if let Some(deleted_paths) = content_to_deleted_paths.get(&content_id) {
-            let from = pick_source_from_candidates(deleted_paths).unwrap();
+    for (content_id, paths) in &content_to_paths {
+        if let Some(candidates) = content_to_candidates.get(content_id) {
+            let from = pick_source_from_candidates(candidates);
             for path in paths {
                 renames.push((
-                    MPath::from(path),
+                    MPath::from(path.clone()),
                     InferredCopyFromEntry {
-                        from_csid: from.0,
-                        from_path: from.1.clone(),
+                        from_csid: from.cs_id,
+                        from_path: from.path.clone(),
                     },
                 ));
             }
         }
     }
-    Ok(renames)
+
+    // Remove any exact-matched content from the candidate list
+    // The remaining will be used for partial matching later
+    for content_id in content_to_paths.keys() {
+        content_to_candidates.remove(content_id);
+    }
+    Ok((renames, content_to_candidates))
 }
 
 // Infer copies by matching basenames between new/changed files in the
 // current changeset and other files in the same repo (with some constraints).
 // If the basenames match and the content are the same, the path pair is a copy.
+//
+// Return a list of inferred copies and the remaining candidates we gathered that
+// failed the exact match check. They will be reconsidered for partial content
+// matching later.
 async fn find_basename_matched_copies(
     ctx: &CoreContext,
     derivation_ctx: &DerivationContext,
     bonsai: &BonsaiChangeset,
     paths_to_ignore: &HashSet<MPath>,
-) -> Result<Vec<(MPath, InferredCopyFromEntry)>> {
+) -> Result<(
+    Vec<(MPath, InferredCopyFromEntry)>,
+    HashMap<ContentId, Vec<CopyFromCandidate>>,
+)> {
     let mut content_to_paths = HashMap::new();
     let mut basenames = HashSet::new();
     let mut path_prefixes = HashSet::new();
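
The exact-rename pass above boils down to a content-keyed join: new paths on one side, parent-side candidates on the other, with exact content matches turned into rename entries and every unmatched candidate handed back for a later partial-matching pass. Below is a simplified sketch of that shape, again with hypothetical stand-in types (`u64` content hashes, `String` paths, a local `Candidate` struct) rather than the real `ContentId`/`MPath`/`CopyFromCandidate`.

```rust
use std::collections::HashMap;

#[derive(Debug, Clone)]
struct Candidate {
    cs_id: u32,
    path: String,
}

// Given the content of new/changed paths in the current changeset and the
// candidates gathered from its parents (keyed by content hash), emit a
// (new path -> source candidate) pair for every exact content match, and
// return the unmatched candidates so a later pass can try partial matching.
fn match_exact(
    content_to_paths: &HashMap<u64, Vec<String>>,
    mut candidates: HashMap<u64, Vec<Candidate>>,
) -> (Vec<(String, Candidate)>, HashMap<u64, Vec<Candidate>>) {
    let mut renames = vec![];
    for (content_id, paths) in content_to_paths {
        if let Some(cands) = candidates.get(content_id) {
            // Same tie-break as the real code: smallest path wins.
            let from = cands
                .iter()
                .min_by_key(|c| c.path.clone())
                .expect("non-empty")
                .clone();
            for path in paths {
                renames.push((path.clone(), from.clone()));
            }
        }
    }
    // Drop every content hash that produced an exact match; what remains is
    // the "leftover" map returned alongside the renames.
    for content_id in content_to_paths.keys() {
        candidates.remove(content_id);
    }
    (renames, candidates)
}

fn main() {
    let new_files = HashMap::from([(7u64, vec!["new/name.rs".to_string()])]);
    let parent_files = HashMap::from([
        (7u64, vec![Candidate { cs_id: 1, path: "old/name.rs".to_string() }]),
        (9u64, vec![Candidate { cs_id: 1, path: "other.rs".to_string() }]),
    ]);

    let (renames, leftover) = match_exact(&new_files, parent_files);
    assert_eq!(renames[0].0, "new/name.rs");
    assert_eq!(renames[0].1.path, "old/name.rs");
    // Content 9 never matched exactly, so it stays available for partial matching.
    assert!(leftover.contains_key(&9));
}
```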
@@ -181,15 +227,15 @@ async fn find_basename_matched_copies(
         }
     }
     if basenames.is_empty() {
-        return Ok(vec![]);
+        return Ok((vec![], HashMap::new()));
     }
 
     let basenames_vec = basenames.into_iter().collect::<Vec<_>>();
     let path_prefixes_vec = path_prefixes.into_iter().collect::<Vec<_>>();
-    let mut content_to_matched_paths = HashMap::new();
+    let mut content_to_candidates_vec = vec![];
 
     for parent_cs_id in bonsai.parents() {
-        content_to_matched_paths.extend(
+        content_to_candidates_vec.push(
             get_matched_paths_by_basenames_from_changeset(
                 ctx,
                 derivation_ctx,
@@ -203,34 +249,41 @@ async fn find_basename_matched_copies(
             .try_chunks(100)
             .try_fold(HashMap::new(), |mut acc, paths| async move {
                 let hashmap =
-                    get_content_to_paths_from_changeset(ctx, derivation_ctx, parent_cs_id, paths)
-                        .await;
+                    get_candidates_from_changeset(ctx, derivation_ctx, parent_cs_id, paths).await;
                 if let Ok(hashmap) = hashmap {
-                    acc.extend(hashmap.into_iter());
+                    for (k, v) in hashmap {
+                        acc.entry(k).or_insert(vec![]).extend(v);
+                    }
                 }
                 Ok(acc)
             })
-            .await?
-            .into_iter(),
+            .await?,
         );
     }
+    let mut content_to_candidates = flatten_candidates(content_to_candidates_vec);
 
     let mut copies = vec![];
-    for (content_id, paths) in content_to_paths {
-        if let Some(matched_paths) = content_to_matched_paths.get(&content_id) {
-            let from = pick_source_from_candidates(matched_paths).unwrap();
+    for (content_id, paths) in &content_to_paths {
+        if let Some(candidates) = content_to_candidates.get(content_id) {
+            let from = pick_source_from_candidates(candidates);
             for path in paths {
                 copies.push((
-                    MPath::from(path),
+                    MPath::from(path.clone()),
                     InferredCopyFromEntry {
-                        from_csid: from.0,
-                        from_path: from.1.clone(),
+                        from_csid: from.cs_id,
+                        from_path: from.path.clone(),
                     },
                 ));
             }
         }
     }
-    Ok(copies)
+
+    // Remove any exact-matched content from the candidate list
+    // The remaining will be used for partial matching later
+    for content_id in content_to_paths.keys() {
+        content_to_candidates.remove(content_id);
+    }
+    Ok((copies, content_to_candidates))
 }
 
 // TODO: add more cases
@@ -242,10 +295,10 @@ pub(crate) async fn derive_impl(
 ) -> Result<Option<InferredCopyFrom>> {
     let mut resolved_paths = HashSet::new();
 
-    let exact_renames = find_exact_renames(ctx, derivation_ctx, bonsai).await?;
+    let (exact_renames, _leftover0) = find_exact_renames(ctx, derivation_ctx, bonsai).await?;
     resolved_paths.extend(exact_renames.iter().map(|(path, _)| path.clone()));
 
-    let basename_matched_copies =
+    let (basename_matched_copies, _leftover1) =
         find_basename_matched_copies(ctx, derivation_ctx, bonsai, &resolved_paths).await?;
     resolved_paths.extend(basename_matched_copies.iter().map(|(path, _)| path.clone()));
 
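
For orientation, the control flow in `derive_impl` after this change is: run the exact-rename pass, record which new paths it resolved, then run the basename-matching pass with those paths excluded; the leftover candidate maps (`_leftover0`, `_leftover1`) are not consumed yet and are reserved for the partial-content matching mentioned in the comments. A rough sketch of that ordering, with hypothetical stub functions standing in for the real ones (which take `ctx`, `derivation_ctx`, and `bonsai` and return Mononoke types):

```rust
use std::collections::{HashMap, HashSet};

// Hypothetical simplified stand-ins for the two passes in `derive_impl`.
// Each returns (inferred entries, leftover candidates) like the real functions.
fn find_exact_renames_stub() -> (Vec<(String, String)>, HashMap<u64, Vec<String>>) {
    (vec![("b.rs".into(), "a.rs".into())], HashMap::new())
}

fn find_basename_matched_copies_stub(
    paths_to_ignore: &HashSet<String>,
) -> (Vec<(String, String)>, HashMap<u64, Vec<String>>) {
    // Paths already attributed by the rename pass are skipped here.
    assert!(paths_to_ignore.contains("b.rs"));
    (vec![("dir/c.rs".into(), "c.rs".into())], HashMap::new())
}

fn main() {
    let mut resolved_paths: HashSet<String> = HashSet::new();

    // Pass 1: exact renames. The leftover candidate map is currently unused
    // (`_leftover0` in the diff) but is kept for a later partial-matching pass.
    let (exact_renames, _leftover0) = find_exact_renames_stub();
    resolved_paths.extend(exact_renames.iter().map(|(path, _)| path.clone()));

    // Pass 2: basename-matched copies, ignoring paths resolved by pass 1.
    let (basename_copies, _leftover1) = find_basename_matched_copies_stub(&resolved_paths);
    resolved_paths.extend(basename_copies.iter().map(|(path, _)| path.clone()));

    assert_eq!(resolved_paths.len(), 2);
}
```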