Skip to content

Commit 54853d5

Browse files
committed
FetchHTTP2: Add HTTP proxy support
1 parent 497d795 commit 54853d5

File tree

3 files changed

+134
-36
lines changed

3 files changed

+134
-36
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
with the `--web-auth basic` command-line option. This is useful when running Heritrix behind a reverse proxy that
1111
adds external authentication.
1212

13+
- **FetchHTTP2:** Added HTTP proxy support.
14+
1315
#### Fixes
1416

1517
- **Code editor:** The configuration editor and script console were upgraded to CodeMirror 6. This resolves some browser

modules/src/main/java/org/archive/modules/fetcher/FetchHTTP2.java

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import java.nio.charset.UnsupportedCharsetException;
6363
import java.time.Instant;
6464
import java.util.*;
65+
import java.util.concurrent.ConcurrentHashMap;
6566
import java.util.concurrent.ExecutionException;
6667
import java.util.concurrent.TimeUnit;
6768
import java.util.concurrent.TimeoutException;
@@ -86,6 +87,7 @@ public class FetchHTTP2 extends Processor implements Lifecycle, InitializingBean
8687
protected String digestAlgorithm = "sha1";
8788
protected boolean useHTTP2 = true;
8889
protected boolean useHTTP3 = false;
90+
private final Map<HttpProxySettings, HttpProxy> httpProxies = new ConcurrentHashMap<>();
8991

9092
public FetchHTTP2(ServerCache serverCache, @Autowired(required = false) AbstractCookieStore cookieStore) {
9193
this.serverCache = serverCache;
@@ -201,7 +203,8 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
201203
Request request = httpClient.newRequest(curi.getURI())
202204
.timeout(getTimeoutSeconds(), TimeUnit.SECONDS)
203205
.method(curi.getFetchType() == CrawlURI.FetchType.HTTP_POST ? HttpMethod.POST : HttpMethod.GET)
204-
.agent(getUserAgentProvider().getUserAgent());
206+
.agent(getUserAgentProvider().getUserAgent())
207+
.tag(getHttpProxy());
205208
if (!curi.getUURI().getScheme().equals("https")) {
206209
request.version(HttpVersion.HTTP_1_1);
207210
} else if (useHTTP3 && curi.getFetchAttempts() == 0) {
@@ -245,6 +248,47 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
245248
}
246249
}
247250

251+
public String getHttpProxyHost() {
252+
return (String) kp.get("httpProxyHost");
253+
}
254+
/**
255+
* Proxy host IP (set only if needed).
256+
*/
257+
public void setHttpProxyHost(String host) {
258+
kp.put("httpProxyHost",host);
259+
}
260+
261+
public Integer getHttpProxyPort() {
262+
return (Integer) kp.get("httpProxyPort");
263+
}
264+
/**
265+
* Proxy port (set only if needed).
266+
*/
267+
public void setHttpProxyPort(Integer port) {
268+
kp.put("httpProxyPort", port);
269+
}
270+
271+
private HttpProxy getHttpProxy() {
272+
String host = getHttpProxyHost();
273+
Integer port = getHttpProxyPort();
274+
if (host == null || port == null) return null;
275+
return httpProxies.computeIfAbsent(new HttpProxySettings(host, port), this::createHttpProxy);
276+
}
277+
278+
private HttpProxy createHttpProxy(HttpProxySettings settings) {
279+
HttpProxy proxy = new HttpProxy(settings.host(), settings.port()) {
280+
@Override
281+
public boolean matches(Origin origin) {
282+
return origin.getTag() == this;
283+
}
284+
};
285+
httpClient.getProxyConfiguration().addProxy(proxy);
286+
return proxy;
287+
}
288+
289+
private record HttpProxySettings(String host, int port) {
290+
}
291+
248292
/**
249293
* Handles the Alt-Svc HTTP header to enable HTTP/3 alternative service.
250294
* Does nothing if useHTTP3 is disabled.
@@ -444,6 +488,8 @@ public void stop() {
444488
} catch (Exception e) {
445489
throw new RuntimeException(e);
446490
}
491+
httpClient = null;
492+
httpProxies.clear();
447493
}
448494

449495
public UserAgentProvider getUserAgentProvider() {

modules/src/test/java/org/archive/modules/fetcher/FetchHTTP2Test.java

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -25,27 +25,40 @@
2525
import org.archive.net.UURIFactory;
2626
import org.archive.spring.ConfigPath;
2727
import org.archive.util.Recorder;
28-
import org.junit.jupiter.api.Test;
28+
import org.eclipse.jetty.proxy.ProxyHandler;
29+
import org.eclipse.jetty.server.Server;
30+
import org.eclipse.jetty.server.ServerConnector;
31+
import org.junit.jupiter.api.*;
2932
import org.junit.jupiter.api.io.TempDir;
3033

34+
import java.io.IOException;
3135
import java.net.Inet4Address;
3236
import java.net.InetAddress;
3337
import java.net.InetSocketAddress;
3438
import java.nio.file.Files;
3539
import java.nio.file.Path;
3640

37-
import static org.junit.jupiter.api.Assertions.assertEquals;
38-
import static org.junit.jupiter.api.Assertions.assertTrue;
41+
import static org.junit.jupiter.api.Assertions.*;
3942

4043
public class FetchHTTP2Test {
44+
private static HttpServer server;
45+
4146
@TempDir
4247
Path tempDir;
48+
private BdbModule bdb;
49+
private BdbCookieStore cookieStore;
50+
private FetchHTTP2 fetcher;
51+
private static String baseUrl;
52+
private Recorder recorder;
4353

44-
@Test
45-
public void test() throws Exception {
54+
@BeforeAll
55+
public static void beforeAll() throws IOException {
4656
InetAddress loopbackAddress = Inet4Address.getLoopbackAddress();
47-
var server = HttpServer.create(new InetSocketAddress(loopbackAddress, 0), -1);
57+
server = HttpServer.create(new InetSocketAddress(loopbackAddress, 0), -1);
4858
server.createContext("/", exchange -> {
59+
if (exchange.getRequestHeaders().containsKey("Via")) {
60+
exchange.getResponseHeaders().add("Used-Proxy", "true");
61+
}
4962
exchange.getResponseHeaders().add("Content-Type", "text/html; charset=UTF-8");
5063
exchange.getResponseHeaders().add("Set-Cookie", "foo=bar; Path=/");
5164
byte[] body = "Hello World!".getBytes();
@@ -54,40 +67,77 @@ public void test() throws Exception {
5467
exchange.close();
5568
});
5669
server.start();
57-
BdbCookieStore cookieStore = new BdbCookieStore();
58-
BdbModule bdb = new BdbModule();
70+
baseUrl = "http://" + server.getAddress().getHostString() + ":" + server.getAddress().getPort() + "/";
71+
}
72+
73+
@AfterAll
74+
public static void afterAll() {
75+
if (server != null) server.stop(0);
76+
}
77+
78+
@BeforeEach
79+
public void beforeEach() throws IOException {
80+
cookieStore = new BdbCookieStore();
81+
bdb = new BdbModule();
5982
Path cookies = tempDir.resolve("cookies");
6083
Files.createDirectories(cookies);
6184
bdb.setDir(new ConfigPath("cookies", cookies.toString()));
6285
cookieStore.setBdbModule(bdb);
63-
try (var serverCache = new DefaultServerCache()) {
64-
bdb.start();
65-
cookieStore.start();
66-
var fetcher = new FetchHTTP2(serverCache, cookieStore);
67-
fetcher.setUserAgentProvider(new CrawlMetadata());
68-
fetcher.start();
69-
try {
70-
String url = "http://" + server.getAddress().getHostString() + ":" + server.getAddress().getPort() + "/";
71-
var curi = new CrawlURI(UURIFactory.getInstance(url));
72-
curi.setRecorder(new Recorder(tempDir.toFile(), "temp"));
73-
fetcher.innerProcess(curi);
86+
var serverCache = new DefaultServerCache();
87+
bdb.start();
88+
cookieStore.start();
89+
fetcher = new FetchHTTP2(serverCache, cookieStore);
90+
fetcher.setUserAgentProvider(new CrawlMetadata());
91+
recorder = new Recorder(tempDir.toFile(), "temp");
92+
}
7493

75-
assertEquals(200, curi.getFetchStatus());
76-
assertEquals(CrawlURI.FetchType.HTTP_GET, curi.getFetchType());
77-
assertEquals(12, curi.getContentLength());
78-
assertEquals("text/html; charset=UTF-8", curi.getContentType());
79-
assertEquals("UTF-8", curi.getRecorder().getCharset().name());
80-
assertEquals(loopbackAddress.getHostAddress(), curi.getServerIP());
81-
assertEquals("Hello World!", curi.getRecorder().getContentReplayPrefixString(100));
82-
assertEquals("foo=bar; Path=/", curi.getHttpResponseHeader("Set-Cookie"));
83-
assertTrue(curi.getFetchBeginTime() > 1);
84-
assertTrue(curi.getFetchCompletedTime() >= curi.getFetchBeginTime());
85-
} finally {
86-
fetcher.stop();
87-
server.stop(0);
88-
cookieStore.stop();
89-
bdb.stop();
90-
}
94+
@AfterEach
95+
public void afterEach() {
96+
fetcher.stop();
97+
cookieStore.stop();
98+
bdb.stop();
99+
recorder.cleanup();
100+
}
101+
102+
@Test
103+
public void test() throws Exception {
104+
fetcher.start();
105+
var curi = new CrawlURI(UURIFactory.getInstance(baseUrl));
106+
curi.setRecorder(recorder);
107+
fetcher.innerProcess(curi);
108+
109+
assertEquals(200, curi.getFetchStatus());
110+
assertEquals(CrawlURI.FetchType.HTTP_GET, curi.getFetchType());
111+
assertEquals(12, curi.getContentLength());
112+
assertEquals("text/html; charset=UTF-8", curi.getContentType());
113+
assertEquals("UTF-8", curi.getRecorder().getCharset().name());
114+
assertEquals(Inet4Address.getLoopbackAddress().getHostAddress(), curi.getServerIP());
115+
assertEquals("Hello World!", curi.getRecorder().getContentReplayPrefixString(100));
116+
assertEquals("foo=bar; Path=/", curi.getHttpResponseHeader("Set-Cookie"));
117+
assertTrue(curi.getFetchBeginTime() > 1);
118+
assertTrue(curi.getFetchCompletedTime() >= curi.getFetchBeginTime());
119+
assertNull(curi.getHttpResponseHeader("Used-Proxy"));
120+
curi.getRecorder().cleanup();
121+
}
122+
123+
@Test
124+
public void testHttpProxy() throws Exception {
125+
Server proxyServer = new Server(new InetSocketAddress(Inet4Address.getLoopbackAddress(), 0));
126+
proxyServer.setHandler(new ProxyHandler.Forward());
127+
proxyServer.start();
128+
try {
129+
var proxyPort = ((ServerConnector) proxyServer.getConnectors()[0]).getLocalPort();
130+
fetcher.setHttpProxyHost(Inet4Address.getLoopbackAddress().getHostAddress());
131+
fetcher.setHttpProxyPort(proxyPort);
132+
fetcher.start();
133+
var curi = new CrawlURI(UURIFactory.getInstance(baseUrl));
134+
curi.setRecorder(recorder);
135+
fetcher.innerProcess(curi);
136+
assertEquals("true", curi.getHttpResponseHeader("Used-Proxy"));
137+
assertEquals(200, curi.getFetchStatus());
138+
assertEquals("Hello World!", curi.getRecorder().getContentReplayPrefixString(100));
139+
} finally {
140+
proxyServer.stop();
91141
}
92142
}
93143
}

0 commit comments

Comments
 (0)