Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
- **Robots.txt wildcards:** The `*` and `$` wildcard rules from RFC 9309 are now supported.
[#656](https://github.com/internetarchive/heritrix3/pull/656)

- **FetchHTTP2:** Added HTTP proxy support.

#### Fixes

- **Code editor:** The configuration editor and script console were upgraded to CodeMirror 6. This resolves some browser
Expand Down
23 changes: 22 additions & 1 deletion commons/src/main/java/org/archive/net/MitmProxy.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
package org.archive.net;

import org.eclipse.jetty.client.HttpClient;
import org.eclipse.jetty.client.HttpProxy;
import org.eclipse.jetty.client.ProxyConfiguration;
import org.eclipse.jetty.http.HttpHeader;
import org.eclipse.jetty.http.HttpURI;
import org.eclipse.jetty.io.Content;
Expand All @@ -42,6 +44,7 @@
* An HTTP proxy server which intercepts TLS and records or replays responses.
*/
public class MitmProxy {
private static final String UPSTREAM_PROXY = MitmProxy.class.getName() + ".upstreamProxy";
private final SslConnectionFactory sslConnectionFactory = new SslConnectionFactory();
private final Server server = new Server(0);
private final RequestHandler requestHandler;
Expand All @@ -63,7 +66,6 @@ public int getPort() {

public void start() throws Exception {
sslConnectionFactory.start();

server.setHandler(new Handler.Sequence(
new SslConnectHandler(),
new MitmProxyHandler()));
Expand All @@ -90,6 +92,10 @@ public void sendResponse(int status, Map<String,String> headers, InputStream bod
body.transferTo(Content.Sink.asOutputStream(response));
callback().succeeded();
}

/**
* Stores the given upstream proxy on the wrapped request under the {@code UPSTREAM_PROXY} attribute.
* {@code addProxyHeaders} later reads that attribute and, when it is non-null, registers the proxy
* with the forwarding client and tags the outgoing request with it.
*
* NOTE(review): the reader casts the attribute to {@code HttpProxy}, while this setter accepts any
* {@code ProxyConfiguration.Proxy}; passing another implementation would presumably fail with a
* ClassCastException there -- confirm.
*
* @param proxy the upstream proxy to route this exchange through
*/
public void setUpstreamProxy(ProxyConfiguration.Proxy proxy) {
request.setAttribute(UPSTREAM_PROXY, proxy);
}
}

public interface RequestHandler {
Expand Down Expand Up @@ -156,15 +162,30 @@ protected void addProxyHeaders(org.eclipse.jetty.server.Request clientToProxyReq
// Host header is not allowed in HTTP/2
headers.remove(HttpHeader.HOST);
});

ProxyConfiguration.Proxy upstreamProxy = (HttpProxy)clientToProxyRequest.getAttribute(UPSTREAM_PROXY);
if (upstreamProxy != null) {
addUpstreamProxyIfAbsent(upstreamProxy);
proxyToServerRequest.tag(upstreamProxy);
}

var listener = (ExchangeListener)clientToProxyRequest.getAttribute(ExchangeListener.class.getName());
if (listener != null) {
proxyToServerRequest.onRequestBegin(listener);
proxyToServerRequest.onRequestHeaders(listener);
proxyToServerRequest.onRequestContent(listener);
proxyToServerRequest.onResponseHeaders(listener);
proxyToServerRequest.onResponseContent(listener);
proxyToServerRequest.onComplete(listener);
}
}

/**
 * Registers {@code proxy} with this handler's HTTP client unless that exact instance
 * (compared by identity) is already registered.
 *
 * @param proxy the upstream proxy that must be present in the client's proxy configuration
 */
private void addUpstreamProxyIfAbsent(ProxyConfiguration.Proxy proxy) {
    ProxyConfiguration proxyConfiguration = getHttpClient().getProxyConfiguration();
    // The scan and the add must happen atomically: this is called per proxied request, and
    // requests presumably arrive concurrently, so two callers could otherwise both miss the
    // proxy and register it twice.
    synchronized (proxyConfiguration) {
        for (var existingProxy : proxyConfiguration.getProxies()) {
            if (existingProxy == proxy) {
                return;
            }
        }
        proxyConfiguration.addProxy(proxy);
    }
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.archive.spring.KeyedProperties;
import org.archive.util.IdleBarrier;
import org.archive.util.Recorder;
import org.eclipse.jetty.client.ProxyConfiguration;
import org.eclipse.jetty.client.Result;
import org.json.JSONException;
import org.json.JSONObject;
Expand Down Expand Up @@ -168,7 +169,7 @@ private void visit(CrawlURI curi) {
String pageId = UUID.randomUUID().toString();
var tab = webdriver.browsingContext().create(BrowsingContext.CreateType.tab).context();
try {
BrowserPage page = new BrowserPage(curi, new IdleBarrier(), webdriver, tab);
BrowserPage page = new BrowserPage(curi, new IdleBarrier(), webdriver, tab, fetcher.getProxy());
pages.put(pageId, page);
pageIdsByContext.put(tab, pageId);
webdriver.network().addIntercept(List.of(Network.InterceptPhase.beforeRequestSent), List.of(tab),
Expand Down Expand Up @@ -293,6 +294,7 @@ private void handleProxyRequest(MitmProxy.Request proxyRequest) throws IOExcepti
} else {
// Record exchange as a subresource
proxyRequest.setListener(new SubresourceRecorder(page, proxyRequest.url()));
proxyRequest.setUpstreamProxy(page.proxy);
}
}

Expand Down Expand Up @@ -477,7 +479,8 @@ public void onComplete(Result result) {
record BrowserPage(CrawlURI curi,
IdleBarrier networkActivity,
WebDriverBiDi webdriver,
BrowsingContext.Context context) implements Page {
BrowsingContext.Context context,
ProxyConfiguration.Proxy proxy) implements Page {

/**
* Evaluates JavaScript and returns the result as simple Java objects (numbers, strings, maps, lists).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
import org.archive.net.UURIFactory;
import org.archive.url.URIException;
import org.archive.util.Recorder;
import org.eclipse.jetty.proxy.ProxyHandler;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.ServerConnector;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
import org.junit.jupiter.api.io.TempDir;

import java.io.IOException;
import java.net.Inet4Address;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.nio.file.Path;
Expand Down Expand Up @@ -65,6 +69,33 @@ public void testDownload() throws IOException, InterruptedException {
assertFalse(crawlURI.getAnnotations().contains("browser"), "navigation should have aborted");
}

/**
 * Starts a local forward proxy, points the fetcher at it, and checks that both the page fetch
 * and the first recorded subrequest carry the {@code Used-Proxy: true} response header, which
 * the test HTTP server adds whenever the incoming request has a {@code Via} header.
 */
@Test
public void testHttpProxy() throws Exception {
    InetAddress loopback = Inet4Address.getLoopbackAddress();
    // Port 0 lets the OS choose a free port; the real port is read back from the connector below.
    Server forwardProxy = new Server(new InetSocketAddress(loopback, 0));
    forwardProxy.setHandler(new ProxyHandler.Forward());
    forwardProxy.start();
    try {
        fetcher.setHttpProxyHost(loopback.getHostAddress());
        ServerConnector proxyConnector = (ServerConnector) forwardProxy.getConnectors()[0];
        fetcher.setHttpProxyPort(proxyConnector.getLocalPort());

        CrawlURI page = newCrawlURI(baseUrl);
        fetcher.process(page);
        assertEquals(200, page.getFetchStatus());
        browserProcessor.innerProcess(page);

        var extractedLinks = new ArrayList<>(page.getOutLinks());
        assertEquals("/link", extractedLinks.get(0).getUURI().getPath());
        assertTrue(page.getAnnotations().contains("browser"));
        // Both the top-level fetch and the browser's subresource fetch must have gone via the proxy.
        assertEquals("true", page.getHttpResponseHeader("Used-Proxy"));
        assertEquals("true", subrequests.get(0).getHttpResponseHeader("Used-Proxy"));

        logger.log(DEBUG, "Subrequests: {0}", subrequests);
    } finally {
        forwardProxy.stop();
    }
}

private CrawlURI newCrawlURI(String uri) throws URIException {
CrawlURI curi = new CrawlURI(UURIFactory.getInstance(uri));
Recorder recorder = new Recorder(tempDir.toFile(), "fetcher");
Expand Down Expand Up @@ -114,6 +145,9 @@ static void startHttpServer() throws IOException {
}
default -> status = 404;
}
if (exchange.getRequestHeaders().containsKey("Via")) {
exchange.getResponseHeaders().add("Used-Proxy", "true");
}
exchange.getResponseHeaders().add("Content-Type", contentType);
exchange.sendResponseHeaders(status, 0);
exchange.getResponseBody().write(body.getBytes());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import java.nio.charset.UnsupportedCharsetException;
import java.time.Instant;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
Expand All @@ -86,6 +87,7 @@ public class FetchHTTP2 extends Processor implements Lifecycle, InitializingBean
protected String digestAlgorithm = "sha1";
protected boolean useHTTP2 = true;
protected boolean useHTTP3 = false;
private final Map<HttpProxySettings, HttpProxy> httpProxies = new ConcurrentHashMap<>();

public FetchHTTP2(@Autowired ServerCache serverCache, @Autowired(required = false) AbstractCookieStore cookieStore) {
this.serverCache = serverCache;
Expand Down Expand Up @@ -202,7 +204,8 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
Request request = httpClient.newRequest(curi.getURI())
.timeout(getTimeoutSeconds(), TimeUnit.SECONDS)
.method(curi.getFetchType() == CrawlURI.FetchType.HTTP_POST ? HttpMethod.POST : HttpMethod.GET)
.agent(getUserAgentProvider().getUserAgent());
.agent(getUserAgentProvider().getUserAgent())
.tag(getProxy());
if (!curi.getUURI().getScheme().equals("https")) {
request.version(HttpVersion.HTTP_1_1);
} else if (useHTTP3 && curi.getFetchAttempts() == 0) {
Expand Down Expand Up @@ -246,6 +249,47 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
}
}

/**
 * Reads the HTTP proxy host from the {@code httpProxyHost} keyed property.
 *
 * @return the stored host; {@link #getProxy()} treats a {@code null} result as "no proxy"
 */
public String getHttpProxyHost() {
    Object configuredHost = kp.get("httpProxyHost");
    return (String) configuredHost;
}
/**
* Sets the host (name or IP address) of the HTTP proxy to fetch through, stored under the
* {@code httpProxyHost} keyed property. Set only if a proxy is needed: {@code getProxy()}
* returns no proxy unless both the host and the port have been set.
*
* @param host the proxy host
*/
public void setHttpProxyHost(String host) {
kp.put("httpProxyHost",host);
}

/**
 * Reads the HTTP proxy port from the {@code httpProxyPort} keyed property.
 *
 * @return the stored port; {@link #getProxy()} treats a {@code null} result as "no proxy"
 */
public Integer getHttpProxyPort() {
    Object configuredPort = kp.get("httpProxyPort");
    return (Integer) configuredPort;
}
/**
* Sets the TCP port of the HTTP proxy to fetch through, stored under the {@code httpProxyPort}
* keyed property. Set only if a proxy is needed: {@code getProxy()} returns no proxy unless
* both the host and the port have been set.
*
* @param port the proxy port
*/
public void setHttpProxyPort(Integer port) {
kp.put("httpProxyPort", port);
}

/**
 * Returns the proxy matching the currently configured proxy host and port, creating and
 * caching it on first use so the same instance is returned for the same host/port pair.
 *
 * @return the proxy to tag requests with, or {@code null} when no usable proxy is configured
 *         (host unset or blank, port unset or not positive)
 */
public ProxyConfiguration.Proxy getProxy() {
    String host = getHttpProxyHost();
    Integer port = getHttpProxyPort();
    // A blank host or a non-positive port cannot identify a proxy. Treat such placeholder
    // values (e.g. "" / 0 from configuration) as "no proxy" instead of registering a proxy
    // that could never be connected to.
    if (host == null || host.isBlank() || port == null || port <= 0) {
        return null;
    }
    return httpProxies.computeIfAbsent(new HttpProxySettings(host, port), this::createHttpProxy);
}

/**
 * Builds the proxy for the given host/port settings and registers it with {@code httpClient}.
 * The proxy's {@code matches} is overridden to accept only origins whose tag is this very
 * proxy instance, so it is not applied to origins carrying any other tag.
 *
 * @param settings host and port of the proxy to create
 * @return the newly created and registered proxy
 */
private HttpProxy createHttpProxy(HttpProxySettings settings) {
    String proxyHost = settings.host();
    int proxyPort = settings.port();
    HttpProxy taggedProxy = new HttpProxy(proxyHost, proxyPort) {
        @Override
        public boolean matches(Origin origin) {
            // Identity comparison: only origins tagged with this exact proxy object qualify.
            Object originTag = origin.getTag();
            return originTag == this;
        }
    };
    ProxyConfiguration clientProxies = httpClient.getProxyConfiguration();
    clientProxies.addProxy(taggedProxy);
    return taggedProxy;
}

/**
* Host and port of one HTTP proxy; used as the key of the {@code httpProxies} cache.
*/
private record HttpProxySettings(String host, int port) {
}

/**
* Handles the Alt-Svc HTTP header to enable HTTP/3 alternative service.
* Does nothing if useHTTP3 is disabled.
Expand Down Expand Up @@ -450,6 +494,8 @@ public void stop() {
} catch (Exception e) {
throw new RuntimeException(e);
}
httpClient = null;
httpProxies.clear();
}

public UserAgentProvider getUserAgentProvider() {
Expand Down
Loading
Loading