Skip to content

Commit ad3b99e

Browse files
committed
Browser: Handle navigation abort from downloads starting
1 parent 5b5b754 commit ad3b99e

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

commons/src/main/java/org/archive/net/webdriver/LocalWebDriverBiDi.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,12 @@ public <T extends BiDiModule> T module(Class<T> moduleClass) {
173173
parameterizedType.getActualTypeArguments()[0] instanceof Class<?> typeClass) {
174174
yield future.thenApply(result -> BiDiJson.fromJson(result, typeClass));
175175
} else {
176-
yield BiDiJson.fromJson(future.get(30, TimeUnit.SECONDS), method.getReturnType());
176+
try {
177+
yield BiDiJson.fromJson(future.get(30, TimeUnit.SECONDS), method.getReturnType());
178+
} catch (ExecutionException e) {
179+
if (e.getCause() instanceof RuntimeException re) throw re;
180+
throw e;
181+
}
177182
}
178183
}
179184
};

engine/src/main/java/org/archive/crawler/processor/Browser.java

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919

2020
package org.archive.crawler.processor;
2121

22+
import org.apache.commons.lang.StringUtils;
2223
import org.archive.crawler.event.CrawlURIDispositionEvent;
2324
import org.archive.crawler.framework.CrawlController;
2425
import org.archive.crawler.framework.Frontier;
@@ -115,6 +116,9 @@ protected boolean shouldProcess(CrawlURI curi) {
115116
if (!scheme.equals("https") && !scheme.equals("http")) return false;
116117
String mime = curi.getContentType().toLowerCase();
117118
if (!mime.startsWith("text/html")) return false;
119+
if (StringUtils.startsWithIgnoreCase(curi.getHttpResponseHeader("Content-Disposition"), "attachment")) {
120+
return false;
121+
}
118122
return true;
119123
}
120124

@@ -143,7 +147,16 @@ private void visit(CrawlURI curi) {
143147
webdriver.network().addIntercept(List.of(Network.InterceptPhase.beforeRequestSent), List.of(tab),
144148
List.of(new Network.UrlPatternPattern("pattern", "http"),
145149
new Network.UrlPatternPattern("pattern", "https")));
146-
var navigation = webdriver.browsingContext().navigate(tab, curi.getURI(), BrowsingContext.ReadinessState.complete);
150+
BrowsingContext.NavigateResult navigation;
151+
152+
try {
153+
navigation = webdriver.browsingContext().navigate(tab, curi.getURI(), BrowsingContext.ReadinessState.complete);
154+
} catch (WebDriverException e) {
155+
if (e.getMessage().equals("net::ERR_ABORTED")) return; // Chrome: probably download started
156+
throw e;
157+
}
158+
if (navigation.url().equals("about:blank")) return; // Firefox: probably download started
159+
147160
logger.log(System.Logger.Level.DEBUG, "Navigated to {0}", navigation);
148161

149162
// Wait for network activity to stop

engine/src/test/java/org/archive/crawler/processor/BrowserTest.java

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,11 @@ public void testDownload() throws IOException, InterruptedException {
5858
CrawlURI crawlURI = newCrawlURI(baseUrl + "download.bin");
5959
fetcher.process(crawlURI);
6060
assertEquals(200, crawlURI.getFetchStatus());
61+
assertFalse(browser.shouldProcess(crawlURI), "content-disposition header should skip processing");
62+
63+
// force processing anyway to test the behavior for other download reasons (e.g. non-HTML)
6164
browser.innerProcess(crawlURI);
65+
assertFalse(crawlURI.getAnnotations().contains("browser"), "navigation should have aborted");
6266
}
6367

6468
private CrawlURI newCrawlURI(String uri) throws URIException {
@@ -106,7 +110,7 @@ static void startHttpServer() throws IOException {
106110
}
107111
case "/download.bin" -> {
108112
body = "sample-download-file";
109-
contentType = "application/octet-stream";
113+
exchange.getResponseHeaders().add("Content-Disposition", "attachment; filename=heritrix-test.bin");
110114
}
111115
default -> status = 404;
112116
}

0 commit comments

Comments
 (0)