Skip to content

Commit 2ef2d3a

Browse files
committed
Extract HttpOpener response charset from Content-Type header. (#513)
The `Accept-Charset` request header value is generally not suitable as a charset name. The response charset should be applied to compressed content as well.
1 parent 401babb commit 2ef2d3a

File tree

2 files changed

+43
-5
lines changed

2 files changed

+43
-5
lines changed

metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,16 @@ public final class HttpOpener extends DefaultObjectPipe<String, ObjectReceiver<R
6565
public static final String HEADER_FIELD_SEPARATOR = "\n";
6666
public static final String HEADER_VALUE_SEPARATOR = ":";
6767
public static final String INPUT_DESIGNATOR = "@-";
68+
public static final String MIME_PARAMETER_CHARSET = "charset";
69+
public static final String MIME_PARAMETER_SEPARATOR = ";";
70+
public static final String MIME_PARAMETER_VALUE_SEPARATOR = "=";
6871

6972
public static final String DEFAULT_METHOD_NAME = "GET";
7073
public static final Method DEFAULT_METHOD = Method.valueOf(DEFAULT_METHOD_NAME);
7174

7275
private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern.compile(HEADER_FIELD_SEPARATOR);
7376
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern.compile(HEADER_VALUE_SEPARATOR);
77+
private static final Pattern MIME_PARAMETER_SEPARATOR_PATTERN = Pattern.compile(MIME_PARAMETER_SEPARATOR);
7478

7579
private static final int ALLOWED_REDIRECTIONS = 3;
7680
private static final int CONNECTION_TIMEOUT = 11000;
@@ -310,7 +314,7 @@ private Reader doPostOrPut(final String requestBody, final URL urlToOpen) throws
310314
headers.forEach(connection::setRequestProperty);
311315
connection.getOutputStream().write(requestBody.getBytes());
312316
final InputStream inputStream = getInputStream(connection);
313-
return new InputStreamReader(inputStream, headers.get(ACCEPT_CHARSET_HEADER));
317+
return new InputStreamReader(inputStream, getContentCharset(connection));
314318
}
315319

316320
private Reader doGet(final String requestUrl) throws IOException {
@@ -321,10 +325,10 @@ private Reader doGet(final String requestUrl) throws IOException {
321325

322326
if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
323327
final GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
324-
reader = new InputStreamReader(gzipInputStream);
328+
reader = new InputStreamReader(gzipInputStream, getContentCharset(connection));
325329
}
326330
else {
327-
reader = new InputStreamReader(inputStream, headers.get(ACCEPT_CHARSET_HEADER));
331+
reader = new InputStreamReader(inputStream, getContentCharset(connection));
328332
}
329333
return reader;
330334
}
@@ -371,6 +375,25 @@ private InputStream getErrorStream(final InputStream errorStream) {
371375
}
372376
}
373377

378+
private String getContentCharset(final HttpURLConnection connection) {
379+
final String contentType = connection.getContentType();
380+
381+
if (contentType != null) {
382+
final String[] parts = MIME_PARAMETER_SEPARATOR_PATTERN.split(contentType);
383+
384+
for (int i = 1; i < parts.length; ++i) {
385+
final String parameter = parts[i].trim();
386+
final int index = parameter.indexOf(MIME_PARAMETER_VALUE_SEPARATOR);
387+
388+
if (index != -1 && MIME_PARAMETER_CHARSET.equalsIgnoreCase(parameter.substring(0, index))) {
389+
return parameter.substring(index + 1);
390+
}
391+
}
392+
}
393+
394+
return CHARSET_DEFAULT;
395+
}
396+
374397
private HttpURLConnection followRedirects(final URL startingUrl) throws IOException {
375398
int times = 0;
376399
HttpURLConnection conn;

metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,20 +240,35 @@ public void shouldPerformPostRequestWithContentTypeParameter() throws IOExceptio
240240

241241
@Test
242242
public void shouldPerformPostRequestWithCharsetParameter() throws IOException {
243+
shouldPerformPostRequestWithCharsetParameter(null);
244+
}
245+
246+
@Test
247+
public void shouldPerformPostRequestWithCharsetParameterAndContentTypeResponseHeader() throws IOException {
248+
shouldPerformPostRequestWithCharsetParameter("expected:<response b[ö]dy> but was:<response b[ö]dy>");
249+
}
250+
251+
private void shouldPerformPostRequestWithCharsetParameter(final String expectedMessage) throws IOException {
243252
final String charset = "ISO-8859-1";
244253
final String header = "Accept-Charset";
245254
final StringValuePattern value = WireMock.equalTo(charset);
246255

256+
String actualMessage;
247257
try {
248258
shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
249259
o.setMethod(HttpOpener.Method.POST);
250260
o.setUrl(u);
251261
o.setAcceptCharset(charset);
252-
}, s -> s.withHeader(header, value), q -> q.withHeader(header, value), null);
262+
}, s -> s.withHeader(header, value), q -> q.withHeader(header, value), expectedMessage != null ?
263+
r -> r.withHeader(HttpOpener.CONTENT_TYPE_HEADER, "text/plain; charset=" + charset) : null);
264+
265+
actualMessage = null;
253266
}
254267
catch (final ComparisonFailure e) {
255-
Assert.assertEquals("expected:<response b[ö]dy> but was:<response b[ö]dy>", e.getMessage());
268+
actualMessage = e.getMessage();
256269
}
270+
271+
Assert.assertEquals(expectedMessage, actualMessage);
257272
}
258273

259274
@Test

0 commit comments

Comments
 (0)