-
-
Notifications
You must be signed in to change notification settings - Fork 105
Add utilities to detect and replace broken links. #1366
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 6 commits
9004e6b
7153833
d205074
c3a64e3
48c23ab
d556a33
66b8d6e
24998cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,14 +4,24 @@ | |
| import com.linkedin.urls.detection.UrlDetector; | ||
| import com.linkedin.urls.detection.UrlDetectorOptions; | ||
|
|
||
| import java.net.URI; | ||
| import java.net.http.HttpClient; | ||
| import java.net.http.HttpRequest; | ||
| import java.net.http.HttpResponse; | ||
| import java.util.List; | ||
| import java.util.Objects; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import java.util.concurrent.CompletableFuture; | ||
|
|
||
| /** | ||
| * Utility class to detect links. | ||
| */ | ||
| public class LinkDetection { | ||
| private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient(); | ||
|
|
||
| private static final Set<LinkFilter> DEFAULT_FILTERS = | ||
| Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME); | ||
|
|
||
| /** | ||
| * Possible ways to filter a link. | ||
|
|
@@ -58,6 +68,106 @@ public static boolean containsLink(String content) { | |
| return !(new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect().isEmpty()); | ||
| } | ||
|
|
||
| /** | ||
| * Checks whether the given URL is considered broken. | ||
| * | ||
| * <p> | ||
| * A link is considered broken if: | ||
| * <ul> | ||
| * <li>The URL is invalid or malformed</li> | ||
barsh404error marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| * <li>An HTTP request fails</li> | ||
| * <li>The HTTP response status code is outside the 200–399 range</li> | ||
| * </ul> | ||
| * | ||
| * <p> | ||
| * Notes: | ||
| * <ul> | ||
| * <li>Status code {@code 200} is considered valid, even if the response body is empty</li> | ||
| * <li>The response body content is not inspected</li> | ||
| * </ul> | ||
| * | ||
| * @param url the URL to check | ||
| * @return a future completing with {@code true} if the link is broken | ||
| */ | ||
|
|
||
| public static CompletableFuture<Boolean> isLinkBroken(String url) { | ||
| HttpRequest headRequest = HttpRequest.newBuilder(URI.create(url)) | ||
barsh404error marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Zabuzard marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| .method("HEAD", HttpRequest.BodyPublishers.noBody()) | ||
| .build(); | ||
|
|
||
| return HTTP_CLIENT.sendAsync(headRequest, HttpResponse.BodyHandlers.discarding()) | ||
| .thenApply(response -> { | ||
| int status = response.statusCode(); | ||
| return status < 200 || status >= 400; | ||
| }) | ||
| .exceptionally(ignored -> true) | ||
|
||
| .thenCompose(result -> { | ||
| if (!Boolean.TRUE.equals(result)) { | ||
| return CompletableFuture.completedFuture(false); | ||
| } | ||
| HttpRequest getRequest = HttpRequest.newBuilder(URI.create(url)).GET().build(); | ||
Zabuzard marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return HTTP_CLIENT.sendAsync(getRequest, HttpResponse.BodyHandlers.discarding()) | ||
| .thenApply(resp -> resp.statusCode() >= 400) | ||
| .exceptionally(ignored -> true); // still never null | ||
| }); | ||
barsh404error marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| /** | ||
| * Replaces all broken links in the given text with the provided replacement string. | ||
| * | ||
| * <p> | ||
| * Example: | ||
| * | ||
| * <pre>{@code | ||
| * replaceDeadLinks(""" | ||
| * Test | ||
| * http://deadlink/1 | ||
| * http://workinglink/1 | ||
| * """, "broken") | ||
| * }</pre> | ||
| * | ||
| * <p> | ||
| * Results in: | ||
| * | ||
| * <pre>{@code | ||
| * Test | ||
| * broken | ||
| * http://workinglink/1 | ||
| * }</pre> | ||
| * | ||
| * @param text the input text containing URLs | ||
| * @param replacement the string to replace broken links with | ||
| * @return a future containing the modified text | ||
| */ | ||
|
|
||
| public static CompletableFuture<String> replaceDeadLinks(String text, String replacement) { | ||
tj-wazei marked this conversation as resolved.
Show resolved
Hide resolved
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. improve naming. avoid multiple terms for the same think. you called it
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not done yet |
||
| List<String> links = extractLinks(text, DEFAULT_FILTERS); | ||
|
|
||
| if (links.isEmpty()) { | ||
| return CompletableFuture.completedFuture(text); | ||
| } | ||
|
|
||
| List<CompletableFuture<String>> deadLinkFutures = links.stream() | ||
| .distinct() | ||
| .map(link -> isLinkBroken(link) | ||
| .thenApply(isBroken -> Boolean.TRUE.equals(isBroken) ? link : null)) | ||
Zabuzard marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
barsh404error marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| .toList(); | ||
|
|
||
| return CompletableFuture.allOf(deadLinkFutures.toArray(new CompletableFuture[0])) | ||
barsh404error marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| .thenApply(ignored -> deadLinkFutures.stream() | ||
Zabuzard marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| .map(CompletableFuture::join) | ||
| .filter(Objects::nonNull) | ||
|
||
| .toList()) | ||
| .thenApply(deadLinks -> { | ||
| String result = text; | ||
| for (String deadLink : deadLinks) { | ||
| result = result.replace(deadLink, replacement); | ||
Zabuzard marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| return result; | ||
| }); | ||
| } | ||
|
|
||
| private static Optional<String> toLink(Url url, Set<LinkFilter> filter) { | ||
| String raw = url.getOriginalUrl(); | ||
| if (filter.contains(LinkFilter.SUPPRESSED) && raw.contains(">")) { | ||
|
|
@@ -76,7 +186,6 @@ private static Optional<String> toLink(Url url, Set<LinkFilter> filter) { | |
| // Remove trailing punctuation | ||
| link = link.substring(0, link.length() - 1); | ||
| } | ||
|
|
||
| return Optional.of(link); | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.