From 85a7c44ffe23e0049dd88c440b79b26cacdfcd44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Axel=20D=C3=B6rfler?= Date: Thu, 16 May 2024 12:20:56 +0200 Subject: [PATCH] text-parser uses Metadata.CONTENT_ENCODING If CSVParams.getCharset() is null, the encoding passed in via Metadata.CONTENT_ENCODING is used before falling back to auto-detection. --- .../java/org/apache/tika/parser/csv/TextAndCSVParser.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java index 462167d149..b5981369f0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java @@ -294,6 +294,13 @@ private Reader detect(CSVParams params, InputStream stream, Metadata metadata, getEncodingDetector(context)); } } + String charsetString = metadata.get(Metadata.CONTENT_ENCODING); + if (charsetString != null && params.getCharset() == null) { + try { + params.setCharset(Charset.forName(charsetString)); + } catch (Exception ignored) { + } + } Reader reader = null; if (params.getCharset() == null) { reader = new AutoDetectReader(CloseShieldInputStream.wrap(stream), metadata,