Skip to content

Commit 52bbd80

Browse files
authored
Merge pull request #653 from internetarchive/bidi
Add Browser processor using WebDriver BiDi
2 parents 7e15d2f + 1161d87 commit 52bbd80

File tree

25 files changed

+2132
-21
lines changed

25 files changed

+2132
-21
lines changed

commons/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,11 @@
158158
<artifactId>groovy</artifactId>
159159
<version>${groovy.version}</version>
160160
</dependency>
161+
<dependency>
162+
<groupId>org.eclipse.jetty</groupId>
163+
<artifactId>jetty-proxy</artifactId>
164+
<version>${jetty.version}</version>
165+
</dependency>
161166
</dependencies>
162167
<build>
163168
<resources>
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/*
2+
* This file is part of the Heritrix web crawler (crawler.archive.org).
3+
*
4+
* Licensed to the Internet Archive (IA) by one or more individual
5+
* contributors.
6+
*
7+
* The IA licenses this file to You under the Apache License, Version 2.0
8+
* (the "License"); you may not use this file except in compliance with
9+
* the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
package org.archive.net;
21+
22+
import org.eclipse.jetty.client.HttpClient;
23+
import org.eclipse.jetty.http.HttpHeader;
24+
import org.eclipse.jetty.http.HttpURI;
25+
import org.eclipse.jetty.io.Content;
26+
import org.eclipse.jetty.io.EndPoint;
27+
import org.eclipse.jetty.proxy.ProxyHandler;
28+
import org.eclipse.jetty.server.*;
29+
import org.eclipse.jetty.server.handler.ConnectHandler;
30+
import org.eclipse.jetty.util.Callback;
31+
32+
import java.io.IOException;
33+
import java.io.InputStream;
34+
import java.io.UncheckedIOException;
35+
import java.net.InetSocketAddress;
36+
import java.nio.channels.ServerSocketChannel;
37+
import java.util.Map;
38+
39+
import static org.eclipse.jetty.http.HttpHeader.ACCEPT_ENCODING;
40+
41+
/**
42+
* An HTTP proxy server which intercepts TLS and records or replays responses.
43+
*/
44+
public class MitmProxy {
45+
private final SslConnectionFactory sslConnectionFactory = new SslConnectionFactory();
46+
private final Server server = new Server(0);
47+
private final RequestHandler requestHandler;
48+
49+
public MitmProxy(RequestHandler requestHandler) {
50+
this.requestHandler = requestHandler;
51+
sslConnectionFactory.getSslContextFactory().setKeyStorePath("adhoc.keystore");
52+
sslConnectionFactory.getSslContextFactory().setKeyStorePassword("password");
53+
}
54+
55+
public int getPort() {
56+
try {
57+
return ((InetSocketAddress) ((ServerSocketChannel) server.getConnectors()[0].getTransport())
58+
.getLocalAddress()).getPort();
59+
} catch (IOException e) {
60+
throw new UncheckedIOException(e);
61+
}
62+
}
63+
64+
public void start() throws Exception {
65+
sslConnectionFactory.start();
66+
67+
server.setHandler(new Handler.Sequence(
68+
new SslConnectHandler(),
69+
new MitmProxyHandler()));
70+
server.start();
71+
}
72+
73+
public void stop() throws Exception {
74+
server.stop();
75+
sslConnectionFactory.stop();
76+
}
77+
78+
public record Request(org.eclipse.jetty.server.Request request, Response response, Callback callback) {
79+
public String url() {
80+
return request().getHttpURI().asString();
81+
}
82+
83+
public void setListener(ExchangeListener listener) {
84+
request.setAttribute(ExchangeListener.class.getName(), listener);
85+
}
86+
87+
public void sendResponse(int status, Map<String,String> headers, InputStream body) throws IOException {
88+
response.setStatus(status);
89+
headers.forEach((k,v) -> response.getHeaders().put(k, v));
90+
body.transferTo(Content.Sink.asOutputStream(response));
91+
callback().succeeded();
92+
}
93+
}
94+
95+
public interface RequestHandler {
96+
/**
97+
* Handles a request to the proxy server. Must either:
98+
* <ul>
99+
* <li>write an immediate response, in which case the proxy won't make an upstream request
100+
* <li>return an {@link ExchangeListener} to record the exchange with the upstream server
101+
* <li>return null to let the proxy make an upstream request without recording it
102+
* </ul>
103+
*/
104+
void handle(Request request) throws IOException;
105+
}
106+
107+
public interface ExchangeListener extends
108+
org.eclipse.jetty.client.Request.BeginListener,
109+
org.eclipse.jetty.client.Request.HeadersListener,
110+
org.eclipse.jetty.client.Request.ContentListener,
111+
org.eclipse.jetty.client.Response.ContentListener,
112+
org.eclipse.jetty.client.Response.HeadersListener,
113+
org.eclipse.jetty.client.Response.CompleteListener {
114+
}
115+
116+
private class MitmProxyHandler extends ProxyHandler.Forward {
117+
@Override
118+
protected HttpClient newHttpClient() {
119+
HttpClient httpClient = super.newHttpClient();
120+
httpClient.setMaxConnectionsPerDestination(6);
121+
httpClient.setFollowRedirects(false);
122+
httpClient.setUserAgentField(null);
123+
return httpClient;
124+
}
125+
126+
@Override
127+
public boolean handle(org.eclipse.jetty.server.Request request, Response response, Callback callback) {
128+
try {
129+
requestHandler.handle(new Request(request, response, callback));
130+
if (response.isCommitted()) return true;
131+
return super.handle(request, response, callback);
132+
} catch (Throwable t) {
133+
callback.failed(t);
134+
return true;
135+
}
136+
}
137+
138+
@Override
139+
protected HttpURI rewriteHttpURI(org.eclipse.jetty.server.Request clientToProxyRequest) {
140+
HttpURI uri = super.rewriteHttpURI(clientToProxyRequest);
141+
String string = uri.asString();
142+
// HttpClient uses Java URI which unlike WHATWG URL doesn't allow "|" so percent encode it
143+
if (string.contains("|")) {
144+
return HttpURI.from(string.replace("|", "%7C"));
145+
} else {
146+
return uri;
147+
}
148+
}
149+
150+
@Override
151+
protected void addProxyHeaders(org.eclipse.jetty.server.Request clientToProxyRequest, org.eclipse.jetty.client.Request proxyToServerRequest) {
152+
proxyToServerRequest.headers(headers -> {
153+
// Ensure we only get the encodings we support
154+
headers.put(ACCEPT_ENCODING, "gzip");
155+
156+
// Host header is not allowed in HTTP/2
157+
headers.remove(HttpHeader.HOST);
158+
});
159+
var listener = (ExchangeListener)clientToProxyRequest.getAttribute(ExchangeListener.class.getName());
160+
if (listener != null) {
161+
proxyToServerRequest.onRequestHeaders(listener);
162+
proxyToServerRequest.onRequestContent(listener);
163+
proxyToServerRequest.onResponseHeaders(listener);
164+
proxyToServerRequest.onResponseContent(listener);
165+
proxyToServerRequest.onComplete(listener);
166+
}
167+
}
168+
}
169+
170+
/**
171+
* Handles the CONNECT method by upgrading the connection to SSL.
172+
*/
173+
private class SslConnectHandler extends ConnectHandler {
174+
@Override
175+
protected void handleConnect(org.eclipse.jetty.server.Request request, Response response, Callback callback, String serverAddress) {
176+
EndPoint clientEP = request.getTunnelSupport().getEndPoint();
177+
var sslConnection = sslConnectionFactory.newConnection(server.getConnectors()[0], clientEP);
178+
request.setAttribute(HttpStream.UPGRADE_CONNECTION_ATTRIBUTE, sslConnection);
179+
response.setStatus(200);
180+
callback.succeeded();
181+
}
182+
}
183+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* This file is part of the Heritrix web crawler (crawler.archive.org).
3+
*
4+
* Licensed to the Internet Archive (IA) by one or more individual
5+
* contributors.
6+
*
7+
* The IA licenses this file to You under the Apache License, Version 2.0
8+
* (the "License"); you may not use this file except in compliance with
9+
* the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
package org.archive.net.webdriver;
21+
22+
/**
 * Marker interface declaring no members of its own.
 * NOTE(review): presumably the common supertype of the WebDriver BiDi event types in this
 * package - confirm against the implementing types.
 */
interface BiDiEvent {}

0 commit comments

Comments
 (0)