diff --git a/docs/changelog/133410.yaml b/docs/changelog/133410.yaml
new file mode 100644
index 0000000000000..3012c1223bb2d
--- /dev/null
+++ b/docs/changelog/133410.yaml
@@ -0,0 +1,5 @@
+pr: 133410
+summary: Upgrade to Tika 3.2.2
+area: Ingest Node
+type: upgrade
+issues: []
diff --git a/gradle/verification-metadata.xml b/gradle/verification-metadata.xml
index b08535464c7df..d161da5d92236 100644
--- a/gradle/verification-metadata.xml
+++ b/gradle/verification-metadata.xml
@@ -1389,9 +1389,9 @@
-
-
-
+
+
+
@@ -1439,11 +1439,6 @@
-
-
-
-
-
@@ -2122,9 +2117,9 @@
-
-
-
+
+
+
@@ -2147,9 +2142,9 @@
-
-
-
+
+
+
@@ -2182,9 +2177,9 @@
-
-
-
+
+
+
@@ -2805,14 +2800,14 @@
-
-
-
+
+
+
-
-
-
+
+
+
@@ -3305,9 +3300,9 @@
-
-
-
+
+
+
@@ -3315,29 +3310,34 @@
-
-
-
+
+
+
+
+
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
@@ -3365,59 +3365,59 @@
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
-
-
-
+
+
+
@@ -3543,11 +3543,6 @@
-
-
-
-
-
@@ -4228,6 +4223,11 @@
+
+
+
+
+
@@ -4773,9 +4773,9 @@
-
-
-
+
+
+
@@ -4808,9 +4808,9 @@
-
-
-
+
+
+
@@ -4828,9 +4828,9 @@
-
-
-
+
+
+
diff --git a/modules/ingest-attachment/build.gradle b/modules/ingest-attachment/build.gradle
index 993c02993c571..7a8198cda930f 100644
--- a/modules/ingest-attachment/build.gradle
+++ b/modules/ingest-attachment/build.gradle
@@ -19,24 +19,24 @@ esplugin {
// when updating tika, please review its parent pom : https://repo1.maven.org/maven2/org/apache/tika/tika-parent
// and manually update the transitive dependencies here
def versions = [
- 'tika' : '2.9.3',
- 'pdfbox': '2.0.33',
- 'poi' : '5.4.0',
+ 'tika' : '3.2.2',
+ 'pdfbox': '3.0.5',
+ 'poi' : '5.4.1',
'sparsebitset' : '1.3', //poi dependency: https://repo1.maven.org/maven2/org/apache/poi/poi/
- 'mime4j': '0.8.12',
- 'commonsCodec': '1.18.0',
- 'slf4' : '2.0.16',
+ 'mime4j': '0.8.13',
+ 'commonsCodec': '1.19.0',
+ 'slf4' : '2.0.17',
'xz' : '1.10',
- 'commonsIo' : '2.18.0',
+ 'commonsIo' : '2.20.0',
//intentionally using the older "juniversalchardet:juniversalchardet" rather than the newer "com.github.albfernandez:juniversalchardet"
//since the "com.github.albfernandez" fork has some problems with Chinese.
'juniversalchardet' : '1.0.3',
- 'tagsoup' : '1.2.1',
+ 'jsoup' : '1.21.1',
'jempbox' : '1.8.17',
'xmlbeans' : '5.3.0', //poi-ooxml dependency: https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/
- 'commonsCollections4' : '4.4',
- 'commonsCompress' : '1.27.1',
- 'commonsLang3' :'3.17.0',
+ 'commonsCollections4' : '4.5.0',
+ 'commonsCompress' : '1.28.0',
+ 'commonsLang3' :'3.18.0',
'commonsMath3' : '3.6.1'
]
@@ -86,9 +86,10 @@ dependencies {
// external parser libraries
// HTML
- api "org.ccil.cowan.tagsoup:tagsoup:${versions.tagsoup}"
+ api "org.jsoup:jsoup:${versions.jsoup}"
// Adobe PDF
api "org.apache.pdfbox:pdfbox:${versions.pdfbox}"
+ api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}"
api "org.apache.pdfbox:fontbox:${versions.pdfbox}"
api "org.apache.pdfbox:jempbox:${versions.jempbox}"
// OpenOffice
diff --git a/modules/ingest-attachment/licenses/jsoup-LICENSE.txt b/modules/ingest-attachment/licenses/jsoup-LICENSE.txt
new file mode 100644
index 0000000000000..e4bf2be9fb7f2
--- /dev/null
+++ b/modules/ingest-attachment/licenses/jsoup-LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2009-2025 Jonathan Hedley
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/modules/ingest-attachment/licenses/tagsoup-NOTICE.txt b/modules/ingest-attachment/licenses/jsoup-NOTICE.txt
similarity index 100%
rename from modules/ingest-attachment/licenses/tagsoup-NOTICE.txt
rename to modules/ingest-attachment/licenses/jsoup-NOTICE.txt
diff --git a/modules/ingest-attachment/licenses/tagsoup-LICENSE.txt b/modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt
similarity index 60%
rename from modules/ingest-attachment/licenses/tagsoup-LICENSE.txt
rename to modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt
index 261eeb9e9f8b2..97553f24a432a 100644
--- a/modules/ingest-attachment/licenses/tagsoup-LICENSE.txt
+++ b/modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt
@@ -1,3 +1,4 @@
+
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@@ -199,3 +200,145 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+EXTERNAL COMPONENTS
+
+Apache PDFBox includes a number of components with separate copyright notices
+and license terms. Your use of these components is subject to the terms and
+conditions of the following licenses.
+
+Contributions made to the original PDFBox and FontBox projects:
+
+ Copyright (c) 2002-2007, www.pdfbox.org
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. Neither the name of pdfbox; nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+Adobe Font Metrics (AFM) for PDF Core 14 Fonts
+
+ This file and the 14 PostScript(R) AFM files it accompanies may be used,
+ copied, and distributed for any purpose and without charge, with or without
+ modification, provided that all copyright notices are retained; that the
+ AFM files are not distributed without this file; that all modifications
+ to this file or any of the AFM files are prominently noted in the modified
+ file(s); and that this paragraph is not modified. Adobe Systems has no
+ responsibility or obligation to support the use of the AFM files.
+
+CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads)
+
+ Copyright 1990-2009 Adobe Systems Incorporated.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ Neither the name of Adobe Systems Incorporated nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ THE POSSIBILITY OF SUCH DAMAGE.
+
+PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf)
+
+ Copyright 2010 Atos Worldline SAS
+
+ Licensed by Atos Worldline SAS under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+OSXAdapter
+
+ Version: 2.0
+
+ Disclaimer: IMPORTANT: This Apple software is supplied to you by
+ Apple Inc. ("Apple") in consideration of your agreement to the
+ following terms, and your use, installation, modification or
+ redistribution of this Apple software constitutes acceptance of these
+ terms. If you do not agree with these terms, please do not use,
+ install, modify or redistribute this Apple software.
+
+ In consideration of your agreement to abide by the following terms, and
+ subject to these terms, Apple grants you a personal, non-exclusive
+ license, under Apple's copyrights in this original Apple software (the
+ "Apple Software"), to use, reproduce, modify and redistribute the Apple
+ Software, with or without modifications, in source and/or binary forms;
+ provided that if you redistribute the Apple Software in its entirety and
+ without modifications, you must retain this notice and the following
+ text and disclaimers in all such redistributions of the Apple Software.
+ Neither the name, trademarks, service marks or logos of Apple Inc.
+ may be used to endorse or promote products derived from the Apple
+ Software without specific prior written permission from Apple. Except
+ as expressly stated in this notice, no other rights or licenses, express
+ or implied, are granted by Apple herein, including but not limited to
+ any patent rights that may be infringed by your derivative works or by
+ other works in which the Apple Software may be incorporated.
+
+ The Apple Software is provided by Apple on an "AS IS" basis. APPLE
+ MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+ THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+ OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+
+ IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+ MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+ AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+ STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved
diff --git a/modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt b/modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt
new file mode 100644
index 0000000000000..4da75301eaf79
--- /dev/null
+++ b/modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt
@@ -0,0 +1,22 @@
+Apache PDFBox
+Copyright 2014 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Based on source code originally developed in the PDFBox and
+FontBox projects.
+
+Copyright (c) 2002-2007, www.pdfbox.org
+
+Based on source code originally developed in the PaDaF project.
+Copyright (c) 2010 Atos Worldline SAS
+
+Includes the Adobe Glyph List
+Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated.
+
+Includes the Zapf Dingbats Glyph List
+Copyright 2002, 2010 Adobe Systems Incorporated.
+
+Includes OSXAdapter
+Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved
diff --git a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
index c057d17576c0a..f83768784d3cd 100644
--- a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
+++ b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
@@ -16,6 +16,7 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.html.JSoupParser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@@ -46,7 +47,7 @@ final class TikaImpl {
/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
- new org.apache.tika.parser.html.HtmlParser(),
+ new JSoupParser(),
new org.apache.tika.parser.microsoft.rtf.RTFParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),