From 7cc0f7dde0fa28ca6378071be8ca40087a61e991 Mon Sep 17 00:00:00 2001 From: Yakov Shafranovich Date: Tue, 29 Aug 2023 14:09:46 -0400 Subject: [PATCH] Improved error handling for TIKA-2328 --- .../org/apache/tika/parser/html/HtmlParser.java | 6 +++++- .../apache/tika/parser/html/HtmlParserTest.java | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java index ccf2711c1d..e4b2fac667 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -146,7 +146,11 @@ metadata, getEncodingDetector(context))) { parser.setContentHandler(new XHTMLDowngradeHandler( new HtmlHandler(mapper, handler, metadata, context, extractScripts))); - parser.parse(reader.asInputSource()); + try { + parser.parse(reader.asInputSource()); + } catch (StringIndexOutOfBoundsException e) { + throw new TikaException(e.getMessage(), e); + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 408c850921..192a1f9171 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -22,6 +22,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; @@ -1269,6 +1270,19 @@ public void testPreferenceForTitleElement() throws Exception { assertEquals("OldMetaTitle", m.get("title")); } + @Test + public void testUnbalancedQuotes() throws Exception { + //unbalanced quotes must be reported as a TikaException (see TIKA-2328) + String testData = ""; + byte[] testBytes = testData.getBytes(java.nio.charset.StandardCharsets.UTF_8); + assertThrows(TikaException.class, () -> { + new HtmlParser().parse(new ByteArrayInputStream(testBytes), + new BodyContentHandler(), + new Metadata(), + new ParseContext()); + }); + } + private class EncodingDetectorRunner implements Callable { final static String DONE = "done";