|
18 | 18 | */ |
19 | 19 | package org.archive.modules.fetcher; |
20 | 20 |
|
| 21 | +import org.archive.net.UURI; |
21 | 22 | import org.archive.url.URIException; |
22 | 23 | import org.apache.commons.io.IOUtils; |
23 | 24 | import org.apache.commons.io.output.NullOutputStream; |
@@ -212,22 +213,29 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException { |
212 | 213 | curi.setFetchBeginTime(System.currentTimeMillis()); |
213 | 214 |
|
214 | 215 | try { |
215 | | - Request request = httpClient.newRequest(curi.getURI()) |
| 216 | + UURI uuri = curi.getUURI(); |
| 217 | + // newRequest() calls Java's URI.create() which rejects some UURIs, so instead pass a dummy value |
| 218 | + // and then override all the components |
| 219 | + Request request = httpClient.newRequest("http://dummy") |
| 220 | + .scheme(uuri.getScheme()) |
| 221 | + .host(uuri.getHost()) |
| 222 | + .port(HttpClient.normalizePort(uuri.getScheme(), uuri.getPort())) |
| 223 | + .path(new String(uuri.getRawPathQuery())) |
216 | 224 | .timeout(getTimeoutSeconds(), TimeUnit.SECONDS) |
217 | 225 | .method(curi.getFetchType() == CrawlURI.FetchType.HTTP_POST ? HttpMethod.POST : HttpMethod.GET) |
218 | 226 | .agent(getUserAgentProvider().getUserAgent()) |
219 | 227 | .tag(getProxy()); |
220 | | - if (!curi.getUURI().getScheme().equals("https")) { |
| 228 | + if (!uuri.getScheme().equals("https")) { |
221 | 229 | request.version(HttpVersion.HTTP_1_1); |
222 | 230 | } else if (useHTTP3 && curi.getFetchAttempts() == 0) { |
223 | 231 | // use HTTP/3 if we've seen an Alt-Svc header |
224 | | - CrawlServer crawlServer = serverCache.getServerFor(curi.getUURI()); |
| 232 | + CrawlServer crawlServer = serverCache.getServerFor(uuri); |
225 | 233 | int http3Port = crawlServer.getHttp3AltSvcPort(); |
226 | 234 | if (http3Port > 0) { |
227 | 235 | // TODO: Support alternate Alt-Svc ports for HTTP/3. |
228 | 236 | // Tricky to do because we need to preserve the original request URI. |
229 | 237 | // Maybe changing the port in resolveSocketAddress() would work? |
230 | | - if (http3Port == curi.getUURI().getPort() || (curi.getUURI().getPort() == -1 && http3Port == 443)) { |
| 238 | + if (http3Port == uuri.getPort() || (uuri.getPort() == -1 && http3Port == 443)) { |
231 | 239 | request.version(HttpVersion.HTTP_3); |
232 | 240 | } |
233 | 241 | } |
@@ -767,7 +775,9 @@ public List<HttpCookie> all() { |
767 | 775 |
|
768 | 776 | @Override |
769 | 777 | public List<HttpCookie> match(URI uri) { |
770 | | - CookieStore hostCookieStore = cookieStore.cookieStoreFor(uri.getHost()); |
| 778 | + String host = uri.getHost(); |
| 779 | + if (host == null) host = uri.getAuthority(); // workaround for hosts containing underscores |
| 780 | + CookieStore hostCookieStore = cookieStore.cookieStoreFor(host); |
771 | 781 | if (hostCookieStore == null) return Collections.emptyList(); |
772 | 782 | return hostCookieStore.getCookies().stream() |
773 | 783 | .map(c -> (HttpCookie) new CookieAdaptor(c)).toList(); |
|
0 commit comments