1
1
/*
2
- * Copyright 2013, 2022 Deutsche Nationalbibliothek et al
2
+ * Copyright 2013, 2023 Deutsche Nationalbibliothek et al
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 the "License";
5
5
* you may not use this file except in compliance with the License.
32
32
import java .io .SequenceInputStream ;
33
33
import java .net .HttpURLConnection ;
34
34
import java .net .URL ;
35
+ import java .net .URLDecoder ;
35
36
import java .util .Arrays ;
36
37
import java .util .HashMap ;
37
38
import java .util .Map ;
38
39
import java .util .regex .Pattern ;
40
+ import java .util .zip .GZIPInputStream ;
39
41
40
42
/**
41
43
* Opens an {@link HttpURLConnection} and passes a reader to the receiver.
42
44
*
43
45
* @author Christoph Böhme
44
46
* @author Jan Schnasse
45
47
* @author Jens Wille
48
+ * @author Pascal Christoph (dr0i)
46
49
*/
47
- @ Description ("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\ n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `." )
50
+ @ Description ("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding` and `Content-Type`, as well as generic headers (separated by `\\ n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `." )
48
51
@ In (String .class )
49
52
@ Out (Reader .class )
50
53
@ FluxCommand ("open-http" )
@@ -53,22 +56,21 @@ public final class HttpOpener extends DefaultObjectPipe<String, ObjectReceiver<R
53
56
public static final String ACCEPT_DEFAULT = "*/*" ;
54
57
public static final String ACCEPT_HEADER = "accept" ;
55
58
public static final String CONTENT_TYPE_HEADER = "content-type" ;
59
+ public static final String ACCEPT_ENCODING_HEADER = "accept-encoding" ;
60
+ public static final String ENCODING_HEADER = "content-encoding" ;
56
61
public static final String DEFAULT_PREFIX = "ERROR: " ;
57
- public static final String ENCODING_DEFAULT = "UTF-8" ;
58
- public static final String ENCODING_HEADER = "accept-charset" ;
62
+ public static final String CHARSET_DEFAULT = "UTF-8" ;
63
+ public static final String ACCEPT_CHARSET_HEADER = "accept-charset" ;
59
64
public static final String INPUT_DESIGNATOR = "@-" ;
60
-
61
65
public static final String DEFAULT_METHOD_NAME = "GET" ;
62
66
public static final Method DEFAULT_METHOD = Method .valueOf (DEFAULT_METHOD_NAME );
63
-
64
67
public static final String HEADER_FIELD_SEPARATOR = "\n " ;
65
68
public static final String HEADER_VALUE_SEPARATOR = ":" ;
66
-
67
69
private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern .compile (HEADER_FIELD_SEPARATOR );
68
70
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern .compile (HEADER_VALUE_SEPARATOR );
69
-
71
+ private static final int ALLOWED_REDIRECTIONS = 3 ;
72
+ private static final int CONNECTION_TIMEOUT = 11000 ;
70
73
private final Map <String , String > headers = new HashMap <>();
71
-
72
74
private Method method ;
73
75
private String body ;
74
76
private String errorPrefix ;
@@ -118,7 +120,7 @@ public boolean getResponseHasBody() {
118
120
*/
119
121
public HttpOpener () {
120
122
setAccept (ACCEPT_DEFAULT );
121
- setEncoding ( ENCODING_DEFAULT );
123
+ setAcceptCharset ( CHARSET_DEFAULT );
122
124
setErrorPrefix (DEFAULT_PREFIX );
123
125
setMethod (DEFAULT_METHOD );
124
126
setUrl (INPUT_DESIGNATOR );
@@ -163,17 +165,50 @@ public void setContentType(final String contentType) {
163
165
setHeader (CONTENT_TYPE_HEADER , contentType );
164
166
}
165
167
168
+ /**
169
+ * Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the
170
+ * preferred charset for the HTTP response.
171
+ * The default charset is {@value CHARSET_DEFAULT}.
172
+ *
173
+ * @param charset name of the charset used for the accept-charset HTTP header
174
+ */
175
+ public void setAcceptCharset (final String charset ) {
176
+ setHeader (ACCEPT_CHARSET_HEADER , charset );
177
+ }
178
+
179
+ /**
180
+ * @deprecated Use {@link #setAcceptCharset} instead.
181
+ * @param charset name of the charset used for the accept-charset HTTP header
182
+ */
183
+ @ Deprecated
184
+ public void setEncoding (final String charset ) {
185
+ setAcceptCharset (charset );
186
+ }
187
+
188
+ /**
189
+ * Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the
190
+ * preferred content encoding for the HTTP response. It accepts HTTP compression.
191
+ * Allowed values are i.a. "gzip" and "Brotli".
192
+ * The default for the content encoding is null, which means "no compression".
193
+ *
194
+ * @param contentEncoding name of content encoding used for the accept-encoding HTTP
195
+ * header
196
+ */
197
+ public void setAcceptContentEncoding (final String contentEncoding ) {
198
+ setHeader (ACCEPT_ENCODING_HEADER , contentEncoding );
199
+ }
200
+
166
201
/**
167
202
* Sets the HTTP {@value ENCODING_HEADER} header value. This is the
168
- * preferred encoding for the HTTP response. Additionally, the encoding
169
- * is used for reading the HTTP response if it does not specify a content
170
- * encoding. The default for the encoding is {@value ENCODING_DEFAULT} .
203
+ * content encoding for the HTTP GET. It enables HTTP compression.
204
+ * Allowed values are "gzip".
205
+ * The default for the content encoding is null, which means "no compression" .
171
206
*
172
- * @param encoding name of the encoding used for the accept-charset HTTP
207
+ * @param contentEncoding name of content encoding used for the content-encoding HTTP
173
208
* header
174
209
*/
175
- public void setEncoding (final String encoding ) {
176
- setHeader (ENCODING_HEADER , encoding );
210
+ public void setContentEncoding (final String contentEncoding ) {
211
+ setHeader (ENCODING_HEADER , contentEncoding );
177
212
}
178
213
179
214
/**
@@ -244,23 +279,15 @@ public void process(final String input) {
244
279
try {
245
280
final String requestUrl = getInput (input , url );
246
281
final String requestBody = getInput (input ,
247
- body == null && method .getRequestHasBody () ? INPUT_DESIGNATOR : body );
248
-
249
- final HttpURLConnection connection =
250
- (HttpURLConnection ) new URL (requestUrl ).openConnection ();
251
-
252
- connection .setRequestMethod (method .name ());
253
- headers .forEach (connection ::addRequestProperty );
254
-
282
+ body == null && method .getRequestHasBody () ? INPUT_DESIGNATOR : body );
283
+ Reader reader = null ;
255
284
if (requestBody != null ) {
256
- connection .setDoOutput (true );
257
- connection .getOutputStream ().write (requestBody .getBytes ());
285
+ reader = doPostOrPut (requestBody , new URL (requestUrl ));
258
286
}
259
-
260
- final InputStream inputStream = getInputStream (connection );
261
- final String contentEncoding = getEncoding (connection .getContentEncoding ());
262
-
263
- getReceiver ().process (new InputStreamReader (inputStream , contentEncoding ));
287
+ else {
288
+ reader = doGet (requestUrl );
289
+ }
290
+ getReceiver ().process (reader );
264
291
}
265
292
catch (final IOException e ) {
266
293
throw new MetafactureException (e );
@@ -270,6 +297,32 @@ public void process(final String input) {
270
297
}
271
298
}
272
299
300
+ private Reader doPostOrPut (final String requestBody , final URL urlToOpen ) throws IOException {
301
+ final HttpURLConnection connection = (HttpURLConnection ) urlToOpen .openConnection ();
302
+ connection .setDoOutput (true );
303
+ connection .setRequestMethod (method .name ());
304
+ headers .forEach (connection ::setRequestProperty );
305
+ connection .getOutputStream ().write (requestBody .getBytes ());
306
+ final InputStream inputStream = getInputStream (connection );
307
+ return new InputStreamReader (inputStream , headers .get (ACCEPT_CHARSET_HEADER ));
308
+ }
309
+
310
+ private Reader doGet (final String requestUrl ) throws IOException {
311
+ final Reader reader ;
312
+ final HttpURLConnection connection ;
313
+ connection = followRedirects (new URL (requestUrl ));
314
+ final InputStream inputStream = getInputStream (connection );
315
+
316
+ if ("gzip" .equalsIgnoreCase (connection .getContentEncoding ())) {
317
+ final GZIPInputStream gzipInputStream = new GZIPInputStream (inputStream );
318
+ reader = new InputStreamReader (gzipInputStream );
319
+ }
320
+ else {
321
+ reader = new InputStreamReader (inputStream , headers .get (ACCEPT_CHARSET_HEADER ));
322
+ }
323
+ return reader ;
324
+ }
325
+
273
326
private String getInput (final String input , final String value ) {
274
327
final String result ;
275
328
@@ -312,8 +365,36 @@ private InputStream getErrorStream(final InputStream errorStream) {
312
365
}
313
366
}
314
367
315
- private String getEncoding (final String contentEncoding ) {
316
- return contentEncoding != null ? contentEncoding : headers .get (ENCODING_HEADER );
368
+ private HttpURLConnection followRedirects (final URL startingUrl ) throws IOException {
369
+ int times = 0 ;
370
+ HttpURLConnection conn ;
371
+ URL urlToFollow = startingUrl ;
372
+ while (true ) {
373
+ times = times + 1 ;
374
+
375
+ if (times > ALLOWED_REDIRECTIONS ) {
376
+ throw new IOException ("Stuck in redirect loop" );
377
+ }
378
+
379
+ conn = (HttpURLConnection ) urlToFollow .openConnection ();
380
+ headers .forEach (conn ::setRequestProperty );
381
+ conn .setRequestMethod (method .name ());
382
+ conn .setConnectTimeout (CONNECTION_TIMEOUT );
383
+ conn .setInstanceFollowRedirects (false ); // Make the logic below easier to detect redirections
384
+
385
+ switch (conn .getResponseCode ()) {
386
+ case HttpURLConnection .HTTP_MOVED_PERM :
387
+ case HttpURLConnection .HTTP_MOVED_TEMP :
388
+ String location = conn .getHeaderField ("Location" );
389
+ location = URLDecoder .decode (location , "UTF-8" );
390
+ urlToFollow = new URL (urlToFollow , location ); // Deal with relative URLs
391
+ continue ;
392
+ default :
393
+ break ;
394
+ }
395
+ break ;
396
+ }
397
+ return conn ;
317
398
}
318
399
319
400
}
0 commit comments