Skip to content

Commit b35d7f4

Browse files
Liu Yang authored and facebook-github-bot committed
InferredCopyFrom: small adjustment to prep for partial match
Summary:
* Changed the "candidate" tuple to a struct, since I'm stuffing more things into it.
* Changed how I merge the candidates hashmaps; it was wrong before (collecting/extending the per-parent maps let a later map overwrite an earlier entry with the same ContentId, instead of concatenating their candidate lists).

NOTE: There are some unused variables in this diff. I'll use them in the next diff, so ignore the linter for now.

Differential Revision: D75872059
fbshipit-source-id: 06ae4af37f45d317c62190a85989f1e36e3961f3
1 parent 32c40b0 commit b35d7f4

File tree

1 file changed

+97
-44
lines changed
  • eden/mononoke/derived_data/inferred_copy_from

1 file changed

+97
-44
lines changed

eden/mononoke/derived_data/inferred_copy_from/derive.rs

Lines changed: 97 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -27,27 +27,51 @@ use mononoke_types::ChangesetId;
2727
use mononoke_types::ContentId;
2828
use mononoke_types::MPath;
2929
use mononoke_types::NonRootMPath;
30+
use mononoke_types::fsnode::FsnodeFile;
3031
use mononoke_types::inferred_copy_from::InferredCopyFrom;
3132
use mononoke_types::inferred_copy_from::InferredCopyFromEntry;
3233
use vec1::Vec1;
3334

3435
const BASENAME_MATCH_MAX_CANDIDATES: usize = 10_000;
3536

37+
struct CopyFromCandidate {
38+
cs_id: ChangesetId,
39+
path: MPath,
40+
#[allow(unused)]
41+
fsnode: FsnodeFile,
42+
}
43+
3644
// It's possible to have multiple source files that match,
3745
// pick the one with the smallest path
38-
fn pick_source_from_candidates(
39-
candidates: &[(ChangesetId, MPath)],
40-
) -> Option<&(ChangesetId, MPath)> {
41-
candidates.iter().min_by_key(|(_, mpath)| mpath.clone())
46+
fn pick_source_from_candidates(candidates: &[CopyFromCandidate]) -> &CopyFromCandidate {
47+
candidates
48+
.iter()
49+
.min_by_key(|c| c.path.clone())
50+
.unwrap_or_else(|| panic!("There should be at least one candidate"))
51+
}
52+
53+
fn flatten_candidates(
54+
maps: Vec<HashMap<ContentId, Vec<CopyFromCandidate>>>,
55+
) -> HashMap<ContentId, Vec<CopyFromCandidate>> {
56+
let mut merged = HashMap::new();
57+
for map in maps {
58+
for (content_id, candidates) in map {
59+
merged
60+
.entry(content_id)
61+
.or_insert(vec![])
62+
.extend(candidates)
63+
}
64+
}
65+
merged
4266
}
4367

44-
async fn get_content_to_paths_from_changeset(
68+
async fn get_candidates_from_changeset(
4569
ctx: &CoreContext,
4670
derivation_ctx: &DerivationContext,
4771
cs_id: ChangesetId,
4872
paths: Vec<NonRootMPath>,
49-
) -> Result<HashMap<ContentId, Vec<(ChangesetId, MPath)>>> {
50-
let mut content_to_paths = HashMap::new();
73+
) -> Result<HashMap<ContentId, Vec<CopyFromCandidate>>> {
74+
let mut content_to_candidates = HashMap::new();
5175

5276
let entries = derivation_ctx
5377
.fetch_dependency::<RootFsnodeId>(ctx, cs_id)
@@ -58,14 +82,18 @@ async fn get_content_to_paths_from_changeset(
5882
.await?;
5983

6084
for (path, entry) in entries {
61-
if let Some(content_id) = entry.into_leaf().map(|f| f.content_id().clone()) {
62-
content_to_paths
63-
.entry(content_id)
85+
if let Some(fsnode) = entry.into_leaf() {
86+
content_to_candidates
87+
.entry(fsnode.content_id().clone())
6488
.or_insert(vec![])
65-
.push((cs_id, path));
89+
.push(CopyFromCandidate {
90+
cs_id,
91+
path,
92+
fsnode,
93+
});
6694
}
6795
}
68-
Ok(content_to_paths)
96+
Ok(content_to_candidates)
6997
}
7098

7199
async fn get_matched_paths_by_basenames_from_changeset(
@@ -91,11 +119,18 @@ async fn get_matched_paths_by_basenames_from_changeset(
91119
// Find exact renames by comparing the content of deleted vs new/changed files
92120
// in the current changeset. If they have the same content, the path pair is
93121
// a rename.
122+
//
123+
// Return a list of inferred renames and the remaining candidates we gathered that
124+
// failed the exact match check. They will be reconsidered for partial content
125+
// matching later.
94126
async fn find_exact_renames(
95127
ctx: &CoreContext,
96128
derivation_ctx: &DerivationContext,
97129
bonsai: &BonsaiChangeset,
98-
) -> Result<Vec<(MPath, InferredCopyFromEntry)>> {
130+
) -> Result<(
131+
Vec<(MPath, InferredCopyFromEntry)>,
132+
HashMap<ContentId, Vec<CopyFromCandidate>>,
133+
)> {
99134
let mut content_to_paths = HashMap::new();
100135
for (path, file_change) in bonsai.simplified_file_changes() {
101136
if let Some(fc) = file_change {
@@ -116,10 +151,10 @@ async fn find_exact_renames(
116151
}
117152
})
118153
.collect::<Vec<_>>();
119-
let content_to_deleted_paths = try_join_all(bonsai.parents().map(|parent_cs_id| {
154+
let content_to_candidates_vec = try_join_all(bonsai.parents().map(|parent_cs_id| {
120155
cloned!(deleted_paths);
121156
async move {
122-
get_content_to_paths_from_changeset(ctx, derivation_ctx, parent_cs_id, deleted_paths)
157+
get_candidates_from_changeset(ctx, derivation_ctx, parent_cs_id, deleted_paths)
123158
.await
124159
.with_context(|| {
125160
format!(
@@ -129,38 +164,49 @@ async fn find_exact_renames(
129164
})
130165
}
131166
}))
132-
.await?
133-
.into_iter()
134-
.flatten()
135-
.collect::<HashMap<_, _>>();
167+
.await?;
168+
let mut content_to_candidates = flatten_candidates(content_to_candidates_vec);
136169

137170
let mut renames = vec![];
138-
for (content_id, paths) in content_to_paths {
139-
if let Some(deleted_paths) = content_to_deleted_paths.get(&content_id) {
140-
let from = pick_source_from_candidates(deleted_paths).unwrap();
171+
for (content_id, paths) in &content_to_paths {
172+
if let Some(candidates) = content_to_candidates.get(content_id) {
173+
let from = pick_source_from_candidates(candidates);
141174
for path in paths {
142175
renames.push((
143-
MPath::from(path),
176+
MPath::from(path.clone()),
144177
InferredCopyFromEntry {
145-
from_csid: from.0,
146-
from_path: from.1.clone(),
178+
from_csid: from.cs_id,
179+
from_path: from.path.clone(),
147180
},
148181
));
149182
}
150183
}
151184
}
152-
Ok(renames)
185+
186+
// Remove any exact-matched content from the candidate list
187+
// The remaining will be used for partial matching later
188+
for content_id in content_to_paths.keys() {
189+
content_to_candidates.remove(content_id);
190+
}
191+
Ok((renames, content_to_candidates))
153192
}
154193

155194
// Infer copies by matching basenames between new/changed files in the
156195
// current changeset and other files in the same repo (with some constraints).
157196
// If the basenames match and the content are the same, the path pair is a copy.
197+
//
198+
// Return a list of inferred copies and the remaining candidates we gathered that
199+
// failed the exact match check. They will be reconsidered for partial content
200+
// matching later.
158201
async fn find_basename_matched_copies(
159202
ctx: &CoreContext,
160203
derivation_ctx: &DerivationContext,
161204
bonsai: &BonsaiChangeset,
162205
paths_to_ignore: &HashSet<MPath>,
163-
) -> Result<Vec<(MPath, InferredCopyFromEntry)>> {
206+
) -> Result<(
207+
Vec<(MPath, InferredCopyFromEntry)>,
208+
HashMap<ContentId, Vec<CopyFromCandidate>>,
209+
)> {
164210
let mut content_to_paths = HashMap::new();
165211
let mut basenames = HashSet::new();
166212
let mut path_prefixes = HashSet::new();
@@ -181,15 +227,15 @@ async fn find_basename_matched_copies(
181227
}
182228
}
183229
if basenames.is_empty() {
184-
return Ok(vec![]);
230+
return Ok((vec![], HashMap::new()));
185231
}
186232

187233
let basenames_vec = basenames.into_iter().collect::<Vec<_>>();
188234
let path_prefixes_vec = path_prefixes.into_iter().collect::<Vec<_>>();
189-
let mut content_to_matched_paths = HashMap::new();
235+
let mut content_to_candidates_vec = vec![];
190236

191237
for parent_cs_id in bonsai.parents() {
192-
content_to_matched_paths.extend(
238+
content_to_candidates_vec.push(
193239
get_matched_paths_by_basenames_from_changeset(
194240
ctx,
195241
derivation_ctx,
@@ -203,34 +249,41 @@ async fn find_basename_matched_copies(
203249
.try_chunks(100)
204250
.try_fold(HashMap::new(), |mut acc, paths| async move {
205251
let hashmap =
206-
get_content_to_paths_from_changeset(ctx, derivation_ctx, parent_cs_id, paths)
207-
.await;
252+
get_candidates_from_changeset(ctx, derivation_ctx, parent_cs_id, paths).await;
208253
if let Ok(hashmap) = hashmap {
209-
acc.extend(hashmap.into_iter());
254+
for (k, v) in hashmap {
255+
acc.entry(k).or_insert(vec![]).extend(v);
256+
}
210257
}
211258
Ok(acc)
212259
})
213-
.await?
214-
.into_iter(),
260+
.await?,
215261
);
216262
}
263+
let mut content_to_candidates = flatten_candidates(content_to_candidates_vec);
217264

218265
let mut copies = vec![];
219-
for (content_id, paths) in content_to_paths {
220-
if let Some(matched_paths) = content_to_matched_paths.get(&content_id) {
221-
let from = pick_source_from_candidates(matched_paths).unwrap();
266+
for (content_id, paths) in &content_to_paths {
267+
if let Some(candidates) = content_to_candidates.get(content_id) {
268+
let from = pick_source_from_candidates(candidates);
222269
for path in paths {
223270
copies.push((
224-
MPath::from(path),
271+
MPath::from(path.clone()),
225272
InferredCopyFromEntry {
226-
from_csid: from.0,
227-
from_path: from.1.clone(),
273+
from_csid: from.cs_id,
274+
from_path: from.path.clone(),
228275
},
229276
));
230277
}
231278
}
232279
}
233-
Ok(copies)
280+
281+
// Remove any exact-matched content from the candidate list
282+
// The remaining will be used for partial matching later
283+
for content_id in content_to_paths.keys() {
284+
content_to_candidates.remove(content_id);
285+
}
286+
Ok((copies, content_to_candidates))
234287
}
235288

236289
// TODO: add more cases
@@ -242,10 +295,10 @@ pub(crate) async fn derive_impl(
242295
) -> Result<Option<InferredCopyFrom>> {
243296
let mut resolved_paths = HashSet::new();
244297

245-
let exact_renames = find_exact_renames(ctx, derivation_ctx, bonsai).await?;
298+
let (exact_renames, _leftover0) = find_exact_renames(ctx, derivation_ctx, bonsai).await?;
246299
resolved_paths.extend(exact_renames.iter().map(|(path, _)| path.clone()));
247300

248-
let basename_matched_copies =
301+
let (basename_matched_copies, _leftover1) =
249302
find_basename_matched_copies(ctx, derivation_ctx, bonsai, &resolved_paths).await?;
250303
resolved_paths.extend(basename_matched_copies.iter().map(|(path, _)| path.clone()));
251304

0 commit comments

Comments
 (0)