Skip to content

Commit c9d45b5

Browse files
committed
Retrieve level-2 directories to avoid false-positives
1 parent 8acdaf7 commit c9d45b5

File tree

1 file changed

+52
-12
lines changed

1 file changed

+52
-12
lines changed

src/gha_logs.rs

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ use crate::handlers::Context;
33
use anyhow::Context as _;
44
use hyper::header::{CACHE_CONTROL, CONTENT_SECURITY_POLICY, CONTENT_TYPE};
55
use hyper::{Body, Response, StatusCode};
6-
use itertools::Itertools;
76
use std::collections::VecDeque;
87
use std::str::FromStr;
98
use std::sync::Arc;
@@ -136,22 +135,57 @@ async fn process_logs(
136135
.workflow_run_job(&repo, log_id)
137136
.await
138137
.context("unable to fetch job details")?;
139-
let trees = ctx
138+
139+
// To minimize false positives in paths linked to the GitHub repositories, we
140+
// restrict matching to only the second-level directories of the repository.
141+
//
142+
// We achieve this by retrieving the contents of the root repository and then
143+
// retrieving the content of each top-level directory, which we then serialize for
144+
// the JS so they can be escaped and concatenated into a regex OR pattern
145+
// (e.g., `compiler/rustc_ast|tests/ui|src/version`) which is used in the JS regex.
146+
let mut root_trees = ctx
140147
.github
141148
.repo_git_trees(&repo, &job.head_sha)
142149
.await
143150
.context("unable to fetch git tree for the repository")?;
144151

145-
// To minimize false positives in paths linked to the GitHub repositories,
146-
// we restrict matching to only the top-level directories of the repository.
147-
// We achieve this by retrieving all "tree" objects and concatenating them
148-
// into a regex OR pattern (e.g., `compiler|tests|src`) which is used in the
149-
// JS regex.
150-
let tree_roots = trees
152+
// Prune every entry that isn't a tree (aka directory)
153+
root_trees.tree.retain(|t| t.object_type == "tree");
154+
155+
// Retrieve all the sub-directory trees (for rust-lang/rust it's 6 API calls)
156+
let roots_trees: Vec<_> = root_trees
157+
.tree
158+
.iter()
159+
.map(|t| async { ctx.github.repo_git_trees(&repo, &t.sha).await })
160+
.collect();
161+
162+
// Join all futures and fail fast if one of them returns an error
163+
let roots_trees = futures::future::try_join_all(roots_trees)
164+
.await
165+
.context("unable to fetch content details")?;
166+
167+
// Collect and fix-up all the paths to directories and files (avoid submodules)
168+
let mut tree_roots: Vec<_> = root_trees
151169
.tree
152170
.iter()
153-
.filter_map(|t| (t.object_type == "tree").then_some(&t.path))
154-
.join("|");
171+
.zip(&roots_trees)
172+
.map(|(root, childs)| {
173+
childs
174+
.tree
175+
.iter()
176+
.filter(|t| t.object_type == "tree" || t.object_type == "blob")
177+
.map(|t| format!("{}/{}", root.path, t.path))
178+
})
179+
.flatten()
180+
.collect();
181+
182+
// We need to sort the tree roots by descending order, otherwise `library/std` will
183+
// be matched before `library/stdarch`
184+
tree_roots.sort_by(|a, b| b.cmp(a));
185+
186+
// Serialize to a JS(ON) array so we can escape them in the browser
187+
let tree_roots =
188+
serde_json::to_string(&tree_roots).context("unable to serialize the tree roots")?;
155189

156190
anyhow::Result::<_>::Ok((job, tree_roots))
157191
};
@@ -237,6 +271,7 @@ async fn process_logs(
237271
import {{ AnsiUp }} from '{ANSI_UP_URL}'
238272
239273
var logs = {logs};
274+
var tree_roots = {tree_roots};
240275
var ansi_up = new AnsiUp();
241276
242277
// 1. Tranform the ANSI escape codes to HTML
@@ -268,7 +303,7 @@ async fn process_logs(
268303
// Detailed examples of what the regex does is at https://regex101.com/r/vCnx9Y/2
269304
//
270305
// But simply speaking the regex tries to find absolute (with `/checkout` prefix) and
271-
// relative paths, the path must start with one of the repository top-level directory.
306+
// relative paths, the path must start with one of the repository level-2 directories.
272307
// We also try to retrieve the lines and cols if given (`<path>:line:col`).
273308
//
274309
// Some examples of paths we want to find:
@@ -277,7 +312,12 @@ async fn process_logs(
277312
// - /checkout/src/doc/rustdoc/src/advanced-features.md
278313
//
279314
// Any other paths, in particular if prefixed by `./` or `obj/` should not taken.
280-
const pathRegex = /(?<boundary>[^a-zA-Z0-9.\\/])(?<inner>(?:[\\\/]?(?:checkout[\\\/])?(?<path>(?:{tree_roots})[\\\/][a-zA-Z0-9_$\-.\\\/]+))(?::(?<line>[0-9]+):(?<col>[0-9]+))?)/g;
315+
const pathRegex = new RegExp(
316+
"(?<boundary>[^a-zA-Z0-9.\\/])(?<inner>(?:[\\\/]?(?:checkout[\\\/])?(?<path>(?:"
317+
+ tree_roots.map(p => RegExp.escape(p)).join("|") +
318+
")(?:[\\\/][a-zA-Z0-9_$\\\-.\\\/]+)?))(?::(?<line>[0-9]+):(?<col>[0-9]+))?)",
319+
"g"
320+
);
281321
html = html.replace(pathRegex, (match, boundary, inner, path, line, col) => {{
282322
const pos = (line !== undefined) ? `#L${{line}}` : "";
283323
return `${{boundary}}<a href="https://github.com/{owner}/{repo}/blob/{sha}/${{path}}${{pos}}" class="path-marker">${{inner}}</a>`;

0 commit comments

Comments
 (0)