Skip to content

Commit 5b31176

Browse files
committed
FetchHTTP2: Fix IllegalArgumentException when host contains underscores
Fixes #718
1 parent 9af99ad commit 5b31176

File tree

2 files changed

+26
-5
lines changed

2 files changed

+26
-5
lines changed

modules/src/main/java/org/archive/modules/fetcher/FetchHTTP2.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.archive.modules.fetcher;
2020

21+
import org.archive.net.UURI;
2122
import org.archive.url.URIException;
2223
import org.apache.commons.io.IOUtils;
2324
import org.apache.commons.io.output.NullOutputStream;
@@ -212,22 +213,29 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
212213
curi.setFetchBeginTime(System.currentTimeMillis());
213214

214215
try {
215-
Request request = httpClient.newRequest(curi.getURI())
216+
UURI uuri = curi.getUURI();
217+
// newRequest() calls Java's URI.create() which rejects some UURIs, so instead pass a dummy value
218+
// and then override all the components
219+
Request request = httpClient.newRequest("http://dummy")
220+
.scheme(uuri.getScheme())
221+
.host(uuri.getHost())
222+
.port(HttpClient.normalizePort(uuri.getScheme(), uuri.getPort()))
223+
.path(new String(uuri.getRawPathQuery()))
216224
.timeout(getTimeoutSeconds(), TimeUnit.SECONDS)
217225
.method(curi.getFetchType() == CrawlURI.FetchType.HTTP_POST ? HttpMethod.POST : HttpMethod.GET)
218226
.agent(getUserAgentProvider().getUserAgent())
219227
.tag(getProxy());
220-
if (!curi.getUURI().getScheme().equals("https")) {
228+
if (!uuri.getScheme().equals("https")) {
221229
request.version(HttpVersion.HTTP_1_1);
222230
} else if (useHTTP3 && curi.getFetchAttempts() == 0) {
223231
// use HTTP/3 if we've seen an Alt-Svc header
224-
CrawlServer crawlServer = serverCache.getServerFor(curi.getUURI());
232+
CrawlServer crawlServer = serverCache.getServerFor(uuri);
225233
int http3Port = crawlServer.getHttp3AltSvcPort();
226234
if (http3Port > 0) {
227235
// TODO: Support alternate Alt-Svc ports for HTTP/3.
228236
// Tricky to do because we need to preserve the original request URI.
229237
// Maybe changing the port in resolveSocketAddress() would work?
230-
if (http3Port == curi.getUURI().getPort() || (curi.getUURI().getPort() == -1 && http3Port == 443)) {
238+
if (http3Port == uuri.getPort() || (uuri.getPort() == -1 && http3Port == 443)) {
231239
request.version(HttpVersion.HTTP_3);
232240
}
233241
}
@@ -767,7 +775,9 @@ public List<HttpCookie> all() {
767775

768776
@Override
769777
public List<HttpCookie> match(URI uri) {
770-
CookieStore hostCookieStore = cookieStore.cookieStoreFor(uri.getHost());
778+
String host = uri.getHost();
779+
if (host == null) host = uri.getAuthority(); // workaround for hosts containing underscores
780+
CookieStore hostCookieStore = cookieStore.cookieStoreFor(host);
771781
if (hostCookieStore == null) return Collections.emptyList();
772782
return hostCookieStore.getCookies().stream()
773783
.map(c -> (HttpCookie) new CookieAdaptor(c)).toList();

modules/src/test/java/org/archive/modules/fetcher/FetchHTTP2Test.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,4 +294,15 @@ public void testAuthLargeBody() throws Exception {
294294
assertEquals(401, curi.getFetchStatus());
295295
curi.getRecorder().cleanup();
296296
}
297+
298+
// Regression test for #718: fetching a host containing underscores used to fail with java.lang.IllegalArgumentException: Invalid URI host: null (authority: doesnotexist_a_b.example.com)
299+
@Test
300+
public void testUnderscoreHost() throws Exception {
301+
fetcher.start();
302+
var curi = new CrawlURI(UURIFactory.getInstance("http://doesnotexist_a_b.example.com/")); // host deliberately contains underscores
303+
curi.setRecorder(recorder);
304+
fetcher.innerProcess(curi); // must complete without throwing for the underscore host
305+
assertEquals(-2, curi.getFetchStatus()); // NOTE(review): -2 is presumably the connect-failed status for this nonexistent host — confirm against the fetch status constants
306+
curi.getRecorder().cleanup(); // clean up the recorder attached to the CrawlURI above
307+
}
297308
}

0 commit comments

Comments
 (0)