Skip to content

Commit 6dd2889

Browse files
committed
Opens gzip compressed content (#511)
- follows redirects - fixes misconception of "Content-Encoding"
1 parent 52e4141 commit 6dd2889

File tree

2 files changed

+146
-62
lines changed

2 files changed

+146
-62
lines changed

metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java

Lines changed: 114 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2013, 2022 Deutsche Nationalbibliothek et al
2+
* Copyright 2013, 2023 Deutsche Nationalbibliothek et al
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -32,19 +32,22 @@
3232
import java.io.SequenceInputStream;
3333
import java.net.HttpURLConnection;
3434
import java.net.URL;
35+
import java.net.URLDecoder;
3536
import java.util.Arrays;
3637
import java.util.HashMap;
3738
import java.util.Map;
3839
import java.util.regex.Pattern;
40+
import java.util.zip.GZIPInputStream;
3941

4042
/**
4143
* Opens an {@link HttpURLConnection} and passes a reader to the receiver.
4244
*
4345
* @author Christoph Böhme
4446
* @author Jan Schnasse
4547
* @author Jens Wille
48+
* @author Pascal Christoph (dr0i)
4649
*/
47-
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
50+
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
4851
@In(String.class)
4952
@Out(Reader.class)
5053
@FluxCommand("open-http")
@@ -53,22 +56,21 @@ public final class HttpOpener extends DefaultObjectPipe<String, ObjectReceiver<R
5356
public static final String ACCEPT_DEFAULT = "*/*";
5457
public static final String ACCEPT_HEADER = "accept";
5558
public static final String CONTENT_TYPE_HEADER = "content-type";
59+
public static final String ACCEPT_ENCODING_HEADER = "accept-encoding";
60+
public static final String ENCODING_HEADER = "content-encoding";
5661
public static final String DEFAULT_PREFIX = "ERROR: ";
57-
public static final String ENCODING_DEFAULT = "UTF-8";
58-
public static final String ENCODING_HEADER = "accept-charset";
62+
public static final String CHARSET_DEFAULT = "UTF-8";
63+
public static final String ACCEPT_CHARSET_HEADER = "accept-charset";
5964
public static final String INPUT_DESIGNATOR = "@-";
60-
6165
public static final String DEFAULT_METHOD_NAME = "GET";
6266
public static final Method DEFAULT_METHOD = Method.valueOf(DEFAULT_METHOD_NAME);
63-
6467
public static final String HEADER_FIELD_SEPARATOR = "\n";
6568
public static final String HEADER_VALUE_SEPARATOR = ":";
66-
6769
private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern.compile(HEADER_FIELD_SEPARATOR);
6870
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern.compile(HEADER_VALUE_SEPARATOR);
69-
71+
private static final int ALLOWED_REDIRECTIONS = 3;
72+
private static final int CONNECTION_TIMEOUT = 11000;
7073
private final Map<String, String> headers = new HashMap<>();
71-
7274
private Method method;
7375
private String body;
7476
private String errorPrefix;
@@ -118,7 +120,7 @@ public boolean getResponseHasBody() {
118120
*/
119121
public HttpOpener() {
120122
setAccept(ACCEPT_DEFAULT);
121-
setEncoding(ENCODING_DEFAULT);
123+
setAcceptCharset(CHARSET_DEFAULT);
122124
setErrorPrefix(DEFAULT_PREFIX);
123125
setMethod(DEFAULT_METHOD);
124126
setUrl(INPUT_DESIGNATOR);
@@ -163,17 +165,50 @@ public void setContentType(final String contentType) {
163165
setHeader(CONTENT_TYPE_HEADER, contentType);
164166
}
165167

168+
/**
169+
* Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the
170+
* preferred charset for the HTTP response.
171+
* The default charset is {@value CHARSET_DEFAULT}.
172+
*
173+
* @param charset name of the charset used for the accept-charset HTTP header
174+
*/
175+
public void setAcceptCharset(final String charset) {
176+
setHeader(ACCEPT_CHARSET_HEADER, charset);
177+
}
178+
179+
/**
180+
* @deprecated Use {@link #setAcceptCharset} instead.
181+
* @param charset name of the charset used for the accept-charset HTTP header
182+
*/
183+
@Deprecated
184+
public void setEncoding(final String charset) {
185+
setAcceptCharset(charset);
186+
}
187+
188+
/**
189+
* Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the
190+
* preferred content encoding for the HTTP response. It accepts HTTP compression.
191+
* Allowed values include "gzip" and "br" (Brotli).
192+
* The default for the content encoding is null, which means "no compression".
193+
*
194+
* @param contentEncoding name of content encoding used for the accept-encoding HTTP
195+
* header
196+
*/
197+
public void setAcceptContentEncoding(final String contentEncoding) {
198+
setHeader(ACCEPT_ENCODING_HEADER, contentEncoding);
199+
}
200+
166201
/**
167202
* Sets the HTTP {@value ENCODING_HEADER} header value. This is the
168-
* preferred encoding for the HTTP response. Additionally, the encoding
169-
* is used for reading the HTTP response if it does not specify a content
170-
* encoding. The default for the encoding is {@value ENCODING_DEFAULT}.
203+
* content encoding for the HTTP GET. It enables HTTP compression.
204+
* The only allowed value is "gzip".
205+
* The default for the content encoding is null, which means "no compression".
171206
*
172-
* @param encoding name of the encoding used for the accept-charset HTTP
207+
* @param contentEncoding name of content encoding used for the content-encoding HTTP
173208
* header
174209
*/
175-
public void setEncoding(final String encoding) {
176-
setHeader(ENCODING_HEADER, encoding);
210+
public void setContentEncoding(final String contentEncoding) {
211+
setHeader(ENCODING_HEADER, contentEncoding);
177212
}
178213

179214
/**
@@ -244,23 +279,15 @@ public void process(final String input) {
244279
try {
245280
final String requestUrl = getInput(input, url);
246281
final String requestBody = getInput(input,
247-
body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);
248-
249-
final HttpURLConnection connection =
250-
(HttpURLConnection) new URL(requestUrl).openConnection();
251-
252-
connection.setRequestMethod(method.name());
253-
headers.forEach(connection::addRequestProperty);
254-
282+
body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);
283+
Reader reader = null;
255284
if (requestBody != null) {
256-
connection.setDoOutput(true);
257-
connection.getOutputStream().write(requestBody.getBytes());
285+
reader = doPostOrPut(requestBody, new URL(requestUrl));
258286
}
259-
260-
final InputStream inputStream = getInputStream(connection);
261-
final String contentEncoding = getEncoding(connection.getContentEncoding());
262-
263-
getReceiver().process(new InputStreamReader(inputStream, contentEncoding));
287+
else {
288+
reader = doGet(requestUrl);
289+
}
290+
getReceiver().process(reader);
264291
}
265292
catch (final IOException e) {
266293
throw new MetafactureException(e);
@@ -270,6 +297,32 @@ public void process(final String input) {
270297
}
271298
}
272299

300+
private Reader doPostOrPut(final String requestBody, final URL urlToOpen) throws IOException {
301+
final HttpURLConnection connection = (HttpURLConnection) urlToOpen.openConnection();
302+
connection.setDoOutput(true);
303+
connection.setRequestMethod(method.name());
304+
headers.forEach(connection::setRequestProperty);
305+
connection.getOutputStream().write(requestBody.getBytes());
306+
final InputStream inputStream = getInputStream(connection);
307+
return new InputStreamReader(inputStream, headers.get(ACCEPT_CHARSET_HEADER));
308+
}
309+
310+
private Reader doGet(final String requestUrl) throws IOException {
311+
final Reader reader;
312+
final HttpURLConnection connection;
313+
connection = followRedirects(new URL(requestUrl));
314+
final InputStream inputStream = getInputStream(connection);
315+
316+
if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
317+
final GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
318+
reader = new InputStreamReader(gzipInputStream);
319+
}
320+
else {
321+
reader = new InputStreamReader(inputStream, headers.get(ACCEPT_CHARSET_HEADER));
322+
}
323+
return reader;
324+
}
325+
273326
private String getInput(final String input, final String value) {
274327
final String result;
275328

@@ -312,8 +365,36 @@ private InputStream getErrorStream(final InputStream errorStream) {
312365
}
313366
}
314367

315-
private String getEncoding(final String contentEncoding) {
316-
return contentEncoding != null ? contentEncoding : headers.get(ENCODING_HEADER);
368+
private HttpURLConnection followRedirects(final URL startingUrl) throws IOException {
369+
int times = 0;
370+
HttpURLConnection conn;
371+
URL urlToFollow = startingUrl;
372+
while (true) {
373+
times = times + 1;
374+
375+
if (times > ALLOWED_REDIRECTIONS) {
376+
throw new IOException("Stuck in redirect loop");
377+
}
378+
379+
conn = (HttpURLConnection) urlToFollow.openConnection();
380+
headers.forEach(conn::setRequestProperty);
381+
conn.setRequestMethod(method.name());
382+
conn.setConnectTimeout(CONNECTION_TIMEOUT);
383+
conn.setInstanceFollowRedirects(false); // Make the logic below easier to detect redirections
384+
385+
switch (conn.getResponseCode()) {
386+
case HttpURLConnection.HTTP_MOVED_PERM:
387+
case HttpURLConnection.HTTP_MOVED_TEMP:
388+
String location = conn.getHeaderField("Location");
389+
location = URLDecoder.decode(location, "UTF-8");
390+
urlToFollow = new URL(urlToFollow, location); // Deal with relative URLs
391+
continue;
392+
default:
393+
break;
394+
}
395+
break;
396+
}
397+
return conn;
317398
}
318399

319400
}

metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@
1616

1717
package org.metafacture.io;
1818

19-
import org.metafacture.commons.ResourceUtil;
20-
import org.metafacture.framework.ObjectReceiver;
21-
2219
import com.github.tomakehurst.wiremock.client.MappingBuilder;
2320
import com.github.tomakehurst.wiremock.client.ResponseDefinitionBuilder;
2421
import com.github.tomakehurst.wiremock.client.WireMock;
2522
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
23+
import com.github.tomakehurst.wiremock.http.HttpHeader;
24+
import com.github.tomakehurst.wiremock.http.HttpHeaders;
2625
import com.github.tomakehurst.wiremock.http.RequestMethod;
2726
import com.github.tomakehurst.wiremock.junit.WireMockRule;
2827
import com.github.tomakehurst.wiremock.matching.RequestPatternBuilder;
@@ -32,20 +31,22 @@
3231
import org.junit.ComparisonFailure;
3332
import org.junit.Rule;
3433
import org.junit.Test;
34+
import org.metafacture.commons.ResourceUtil;
35+
import org.metafacture.framework.ObjectReceiver;
3536
import org.mockito.ArgumentCaptor;
3637
import org.mockito.Captor;
3738
import org.mockito.Mock;
3839
import org.mockito.Mockito;
3940
import org.mockito.junit.MockitoJUnit;
4041
import org.mockito.junit.MockitoRule;
4142

42-
import static org.mockito.Mockito.times;
43-
44-
import java.io.IOException;
45-
import java.io.Reader;
43+
import java.io.*;
4644
import java.util.Arrays;
4745
import java.util.function.BiConsumer;
4846
import java.util.function.Consumer;
47+
import java.util.zip.GZIPOutputStream;
48+
49+
import static org.mockito.Mockito.times;
4950

5051
/**
5152
* Tests for class {@link HttpOpener}.
@@ -62,6 +63,18 @@ public final class HttpOpenerTest {
6263

6364
private static final String REQUEST_BODY = "request body";
6465
private static final String RESPONSE_BODY = "response bödy"; // UTF-8
66+
private static byte[] GZIPPED_RESPONSE_BODY;
67+
static {
68+
try {
69+
ByteArrayOutputStream out = new ByteArrayOutputStream();
70+
GZIPOutputStream gzip = new GZIPOutputStream(out);
71+
gzip.write(RESPONSE_BODY.getBytes("UTF-8"));
72+
gzip.close();
73+
GZIPPED_RESPONSE_BODY = out.toByteArray();
74+
}catch (Exception e){
75+
e.printStackTrace();
76+
}
77+
}
6578

6679
@Rule
6780
public MockitoRule mockitoRule = MockitoJUnit.rule();
@@ -226,40 +239,23 @@ public void shouldPerformPostRequestWithContentTypeParameter() throws IOExceptio
226239
}
227240

228241
@Test
229-
public void shouldPerformPostRequestWithEncodingParameter() throws IOException {
230-
final String encoding = "ISO-8859-1";
242+
public void shouldPerformPostRequestWithCharsetParameter() throws IOException {
243+
final String charset = "ISO-8859-1";
231244
final String header = "Accept-Charset";
232-
final StringValuePattern value = WireMock.equalTo(encoding);
245+
final StringValuePattern value = WireMock.equalTo(charset);
233246

234247
try {
235248
shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
236249
o.setMethod(HttpOpener.Method.POST);
237250
o.setUrl(u);
238-
o.setEncoding(encoding);
251+
o.setAcceptCharset(charset);
239252
}, s -> s.withHeader(header, value), q -> q.withHeader(header, value), null);
240253
}
241254
catch (final ComparisonFailure e) {
242255
Assert.assertEquals("expected:<response b[ö]dy> but was:<response b[ö]dy>", e.getMessage());
243256
}
244257
}
245258

246-
@Test
247-
public void shouldPerformPostRequestWithEncodingParameterAndContentEncodingResponseHeader() throws IOException {
248-
final String encoding = "ISO-8859-1";
249-
final String header = "Accept-Charset";
250-
final StringValuePattern value = WireMock.equalTo(encoding);
251-
252-
shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
253-
o.setMethod(HttpOpener.Method.POST);
254-
o.setUrl(u);
255-
o.setEncoding(encoding);
256-
},
257-
s -> s.withHeader(header, value),
258-
q -> q.withHeader(header, value),
259-
r -> r.withHeader("Content-Encoding", "UTF-8")
260-
);
261-
}
262-
263259
@Test
264260
public void shouldPerformGetRequestWithErrorResponse() throws IOException {
265261
shouldPerformRequest(TEST_URL, HttpOpener.Method.GET, (o, u) -> {},
@@ -278,6 +274,14 @@ public void shouldPerformGetRequestWithErrorResponseAndWithoutErrorPrefixParamet
278274
null, null, WireMock.badRequest().withBody(RESPONSE_BODY), RESPONSE_BODY);
279275
}
280276

277+
@Test
278+
public void shouldPerformGetRequestWithGzipedContentEncoding() throws IOException {
279+
shouldPerformRequest(TEST_URL, HttpOpener.Method.GET, (o, u) -> o.setAcceptContentEncoding("gzip"),
280+
null, null,
281+
WireMock.ok().withBody(GZIPPED_RESPONSE_BODY).withHeaders(new HttpHeaders(new HttpHeader(HttpOpener.ENCODING_HEADER,"gzip"))),
282+
RESPONSE_BODY);
283+
}
284+
281285
private void shouldPerformRequest(final String input, final HttpOpener.Method method, final BiConsumer<HttpOpener, String> consumer, final String... headers) throws IOException {
282286
shouldPerformRequest(input, method, consumer,
283287
s -> Arrays.stream(headers).forEach(h -> s.withHeader(h, TEST_VALUE)),
@@ -289,7 +293,6 @@ private void shouldPerformRequest(final String input, final HttpOpener.Method me
289293
if (responseConsumer != null) {
290294
responseConsumer.accept(response);
291295
}
292-
293296
shouldPerformRequest(input, method,
294297
consumer, stubConsumer, requestConsumer,
295298
response, method.getResponseHasBody() ? RESPONSE_BODY : "");

0 commit comments

Comments
 (0)