Skip to content

Commit 5b5b754

Browse files
committed
Browser: Disable downloads in Firefox and Chrome
Hopefully this will stop us from filling up ~/Downloads with random junk.
1 parent 396f7a0 commit 5b5b754

File tree

3 files changed

+121
-61
lines changed

3 files changed

+121
-61
lines changed

commons/src/main/java/org/archive/net/webdriver/LocalWebDriverBiDi.java

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,15 @@ public class LocalWebDriverBiDi implements WebDriverBiDi, Closeable {
5858
private static final String[] BROWSERS = new String[]{
5959
"firefox", "/Applications/Firefox.app/Contents/MacOS/firefox", "chromedriver"};
6060

61-
public LocalWebDriverBiDi(String executable, List<String> options, int proxyPort, Path profileDir)
61+
public LocalWebDriverBiDi(String executable, List<String> options, Session.CapabilitiesRequest capabilities, Path profileDir)
6262
throws ExecutionException, InterruptedException, IOException {
6363
this.process = executable == null ? launchAnyBrowser(options, profileDir) : launchBrowser(executable, options, profileDir);
6464
Runtime.getRuntime().addShutdownHook(new Thread(this.process::destroyForcibly));
6565
new Thread(this::handleStderr).start();
6666
webSocket = HttpClient.newHttpClient().newWebSocketBuilder()
6767
.buildAsync(URI.create(webSocketUrl.get() + "/session"), new Listener())
6868
.get();
69-
var alwaysMatch = Map.of("acceptInsecureCerts", true,
70-
"proxy", new Session.ProxyConfiguration("manual",
71-
"127.0.0.1:" + proxyPort, "127.0.0.1:" + proxyPort));
72-
sessionId = session().new_(new Session.CapabilitiesRequest(alwaysMatch, null)).sessionId();
69+
sessionId = session().new_(capabilities).sessionId();
7370
}
7471

7572
private static Process launchAnyBrowser(List<String> options, Path profileDir) throws IOException {

engine/src/main/java/org/archive/crawler/processor/Browser.java

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,10 +187,34 @@ public void start() {
187187
Path profileDir = crawlController.getScratchDir().getFile().toPath().resolve("profile");
188188
Files.createDirectories(profileDir);
189189

190-
// Firefox: send localhost requests via the proxy too (for tests and local crawling)
191-
Files.writeString(profileDir.resolve("user.js"), "user_pref('network.proxy.allow_hijacking_localhost', true);");
192-
193-
this.webdriver = new LocalWebDriverBiDi(executable, options, proxy.getPort(), profileDir);
190+
// Firefox doesn't seem to allow setting prefs via capabilities with bidi
191+
// so drop them in user.js instead
192+
Files.writeString(profileDir.resolve("user.js"), """
193+
// send localhost requests via the proxy too (for tests and local crawling)
194+
user_pref('network.proxy.allow_hijacking_localhost', true);
195+
196+
// disable downloads by setting to something that can't be created as a directory
197+
user_pref('browser.download.dir', '/dev/null');
198+
user_pref('browser.download.folderList', 2);
199+
""");
200+
201+
int proxyPort = proxy.getPort();
202+
var alwaysMatch = Map.of("acceptInsecureCerts", true,
203+
"proxy", new Session.ProxyConfiguration("manual",
204+
"127.0.0.1:" + proxyPort, "127.0.0.1:" + proxyPort));
205+
206+
List<Map<String,Object>> firstMatch = List.of(
207+
Map.of("browserName", "chrome",
208+
"goog:chromeOptions", Map.of(
209+
"args", List.of("headless=new", "user-data-dir=" + profileDir),
210+
"prefs", Map.of("download_restrictions", 3))),
211+
// Fallback for other browsers
212+
Map.of()
213+
);
214+
var capabilities = new Session.CapabilitiesRequest(alwaysMatch, firstMatch);
215+
216+
217+
this.webdriver = new LocalWebDriverBiDi(executable, options, capabilities, profileDir);
194218
} catch (Exception e) {
195219
logger.log(ERROR, "Error starting browser", e);
196220
throw new RuntimeException(e);

engine/src/test/java/org/archive/crawler/processor/BrowserTest.java

Lines changed: 91 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
import org.archive.modules.fetcher.DefaultServerCache;
77
import org.archive.modules.fetcher.FetchHTTP2;
88
import org.archive.net.UURIFactory;
9+
import org.archive.url.URIException;
910
import org.archive.util.Recorder;
10-
import org.junit.jupiter.api.AfterAll;
11-
import org.junit.jupiter.api.BeforeAll;
12-
import org.junit.jupiter.api.Test;
11+
import org.junit.jupiter.api.*;
1312
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
1413
import org.junit.jupiter.api.io.TempDir;
1514

@@ -18,73 +17,77 @@
1817
import java.net.InetSocketAddress;
1918
import java.nio.file.Path;
2019
import java.util.ArrayList;
20+
import java.util.HashSet;
2121
import java.util.List;
22+
import java.util.Set;
2223

2324
import static java.lang.System.Logger.Level.DEBUG;
2425
import static org.junit.jupiter.api.Assertions.*;
2526

27+
@EnabledIfSystemProperty(named = "runBrowserTests", matches = "true")
2628
class BrowserTest {
2729
private static final System.Logger logger = System.getLogger(BrowserTest.class.getName());
2830

2931
private static HttpServer httpServer;
30-
32+
private static FetchHTTP2 fetcher;
33+
private static Browser browser;
34+
private static String baseUrl;
35+
private static ArrayList<CrawlURI> subrequests;
36+
private static CrawlController crawlController;
37+
private Set<Recorder> recorders = new HashSet<>();
3138
@TempDir
3239
Path tempDir;
3340

3441
@Test
35-
@EnabledIfSystemProperty(named = "runBrowserTests", matches = "true")
42+
@Disabled
3643
public void test() throws IOException, InterruptedException {
37-
String url = "http://" + httpServer.getAddress().getAddress().getHostAddress() + ":" +
38-
httpServer.getAddress().getPort() + "/";
39-
var fetcher = new FetchHTTP2(new DefaultServerCache(), null);
40-
fetcher.setUserAgentProvider(new CrawlMetadata());
41-
fetcher.start();
42-
try {
43-
var crawlController = new CrawlController();
44-
FetchChain fetchChain = new FetchChain();
45-
fetchChain.setProcessors(List.of());
46-
crawlController.setFetchChain(fetchChain);
47-
48-
var subrequests = new ArrayList<CrawlURI>();
49-
50-
DispositionChain dispositionChain = new DispositionChain();
51-
dispositionChain.setProcessors(List.of(new Processor() {
52-
@Override
53-
protected boolean shouldProcess(CrawlURI uri) {
54-
return true;
55-
}
44+
CrawlURI crawlURI = newCrawlURI(baseUrl);
45+
fetcher.process(crawlURI);
46+
assertEquals(200, crawlURI.getFetchStatus());
47+
browser.innerProcess(crawlURI);
5648

57-
@Override
58-
protected void innerProcess(CrawlURI uri) throws InterruptedException {
59-
subrequests.add(uri);
60-
}
61-
}));
62-
crawlController.setDispositionChain(dispositionChain);
63-
crawlController.getScratchDir().setPath(tempDir.toString());
64-
var browserProcessor = new Browser(fetcher, crawlController, event -> {}, null);
65-
try {
66-
browserProcessor.start();
67-
68-
CrawlURI crawlURI = new CrawlURI(UURIFactory.getInstance(url));
69-
crawlURI.setRecorder(new Recorder(tempDir.toFile(), "fetcher"));
70-
fetcher.process(crawlURI);
71-
assertEquals(200, crawlURI.getFetchStatus());
72-
browserProcessor.innerProcess(crawlURI);
73-
74-
var outLinks = new ArrayList<>(crawlURI.getOutLinks());
75-
assertEquals("/link", outLinks.get(0).getUURI().getPath());
76-
assertTrue(crawlURI.getAnnotations().contains("browser"));
77-
78-
logger.log(DEBUG, "Subrequests: {0}", subrequests);
79-
} finally {
80-
browserProcessor.stop();
81-
}
82-
} finally {
83-
fetcher.stop();
49+
var outLinks = new ArrayList<>(crawlURI.getOutLinks());
50+
assertEquals("/link", outLinks.get(0).getUURI().getPath());
51+
assertTrue(crawlURI.getAnnotations().contains("browser"));
52+
53+
logger.log(DEBUG, "Subrequests: {0}", subrequests);
54+
}
55+
56+
@Test
57+
public void testDownload() throws IOException, InterruptedException {
58+
CrawlURI crawlURI = newCrawlURI(baseUrl + "download.bin");
59+
fetcher.process(crawlURI);
60+
assertEquals(200, crawlURI.getFetchStatus());
61+
browser.innerProcess(crawlURI);
62+
}
63+
64+
private CrawlURI newCrawlURI(String uri) throws URIException {
65+
CrawlURI curi = new CrawlURI(UURIFactory.getInstance(uri));
66+
Recorder recorder = new Recorder(tempDir.toFile(), "fetcher");
67+
recorders.add(recorder);
68+
curi.setRecorder(recorder);
69+
return curi;
70+
}
71+
72+
@BeforeEach
73+
void setUp() {
74+
crawlController.getScratchDir().setPath(tempDir.toString());
75+
}
76+
77+
@AfterEach
78+
void tearDown() {
79+
subrequests.clear();
80+
for (Recorder recorder : recorders) {
81+
recorder.cleanup();
8482
}
8583
}
8684

8785
@BeforeAll
86+
static void setUpAll() throws Exception {
87+
startHttpServer();
88+
startProcessors();
89+
}
90+
8891
static void startHttpServer() throws IOException {
8992
httpServer = HttpServer.create(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0), -1);
9093
httpServer.createContext("/", exchange -> {
@@ -101,6 +104,10 @@ static void startHttpServer() throws IOException {
101104
body = "body { color: red; background: url(bg.jpg); }";
102105
contentType = "text/css";
103106
}
107+
case "/download.bin" -> {
108+
body = "sample-download-file";
109+
contentType = "application/octet-stream";
110+
}
104111
default -> status = 404;
105112
}
106113
exchange.getResponseHeaders().add("Content-Type", contentType);
@@ -109,11 +116,43 @@ static void startHttpServer() throws IOException {
109116
exchange.close();
110117
});
111118
httpServer.start();
119+
baseUrl = "http://" + httpServer.getAddress().getAddress().getHostAddress() + ":" +
120+
httpServer.getAddress().getPort() + "/";
121+
}
122+
123+
static void startProcessors() {
124+
fetcher = new FetchHTTP2(new DefaultServerCache(), null);
125+
fetcher.setUserAgentProvider(new CrawlMetadata());
126+
fetcher.start();
127+
crawlController = new CrawlController();
128+
FetchChain fetchChain = new FetchChain();
129+
fetchChain.setProcessors(List.of());
130+
crawlController.setFetchChain(fetchChain);
131+
132+
subrequests = new ArrayList<CrawlURI>();
133+
134+
DispositionChain dispositionChain = new DispositionChain();
135+
dispositionChain.setProcessors(List.of(new Processor() {
136+
@Override
137+
protected boolean shouldProcess(CrawlURI uri) {
138+
return true;
139+
}
140+
141+
@Override
142+
protected void innerProcess(CrawlURI uri) throws InterruptedException {
143+
subrequests.add(uri);
144+
}
145+
}));
146+
crawlController.setDispositionChain(dispositionChain);
147+
browser = new Browser(fetcher, crawlController, event -> {}, null);
148+
browser.start();
112149
}
113150

114151
@AfterAll
115-
static void stopHttpServer() {
152+
static void tearDownAll() {
116153
if (httpServer != null) httpServer.stop(0);
154+
if (browser != null) browser.stop();
155+
if (fetcher != null) fetcher.stop();
117156
}
118157

119158
}

0 commit comments

Comments (0)