diff --git a/docs/changelog/133410.yaml b/docs/changelog/133410.yaml new file mode 100644 index 0000000000000..3012c1223bb2d --- /dev/null +++ b/docs/changelog/133410.yaml @@ -0,0 +1,5 @@ +pr: 133410 +summary: Upgrading to tika 3.2.2 +area: Ingest Node +type: upgrade +issues: [] diff --git a/gradle/verification-metadata.xml b/gradle/verification-metadata.xml index f2f51dcb818c8..e4adec09f17fe 100644 --- a/gradle/verification-metadata.xml +++ b/gradle/verification-metadata.xml @@ -1384,9 +1384,9 @@ - - - + + + @@ -1434,11 +1434,6 @@ - - - - - @@ -2117,9 +2112,9 @@ - - - + + + @@ -2142,9 +2137,9 @@ - - - + + + @@ -2177,9 +2172,9 @@ - - - + + + @@ -2800,14 +2795,14 @@ - - - + + + - - - + + + @@ -3300,9 +3295,9 @@ - - - + + + @@ -3310,29 +3305,34 @@ - - - + + + + + + + + - - - + + + - - - + + + - - - + + + - - - + + + @@ -3360,59 +3360,59 @@ - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + @@ -3523,11 +3523,6 @@ - - - - - @@ -4208,6 +4203,11 @@ + + + + + @@ -4753,9 +4753,9 @@ - - - + + + @@ -4788,9 +4788,9 @@ - - - + + + @@ -4808,9 +4808,9 @@ - - - + + + diff --git a/modules/ingest-attachment/build.gradle b/modules/ingest-attachment/build.gradle index 993c02993c571..7a8198cda930f 100644 --- a/modules/ingest-attachment/build.gradle +++ b/modules/ingest-attachment/build.gradle @@ -19,24 +19,24 @@ esplugin { // when updating tika, please review it's parent pom : https://repo1.maven.org/maven2/org/apache/tika/tika-parent // and manually update the transitive dependencies here def versions = [ - 'tika' : '2.9.3', - 'pdfbox': '2.0.33', - 'poi' : '5.4.0', + 'tika' : '3.2.2', + 'pdfbox': '3.0.5', + 'poi' : '5.4.1', 'sparsebitset' : '1.3', //poi dependency: https://repo1.maven.org/maven2/org/apache/poi/poi/ - 'mime4j': '0.8.12', - 'commonsCodec': '1.18.0', - 'slf4' : '2.0.16', + 'mime4j': '0.8.13', + 'commonsCodec': '1.19.0', + 'slf4' : '2.0.17', 'xz' : '1.10', - 'commonsIo' : '2.18.0', + 'commonsIo' : '2.20.0', //intentionally using the elder "juniversalchardet:juniversalchardet" rather than the newer "com.github.albfernandez:juniversalchardet" //since the "com.github.albfernandez" fork has some problems with Chinese. 'juniversalchardet' : '1.0.3', - 'tagsoup' : '1.2.1', + 'jsoup' : '1.21.1', 'jempbox' : '1.8.17', 'xmlbeans' : '5.3.0', //poi-ooxml dependency: https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/ - 'commonsCollections4' : '4.4', - 'commonsCompress' : '1.27.1', - 'commonsLang3' :'3.17.0', + 'commonsCollections4' : '4.5.0', + 'commonsCompress' : '1.28.0', + 'commonsLang3' :'3.18.0', 'commonsMath3' : '3.6.1' ] @@ -86,9 +86,10 @@ dependencies { // external parser libraries // HTML - api "org.ccil.cowan.tagsoup:tagsoup:${versions.tagsoup}" + api "org.jsoup:jsoup:${versions.jsoup}" // Adobe PDF api "org.apache.pdfbox:pdfbox:${versions.pdfbox}" + api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}" api "org.apache.pdfbox:fontbox:${versions.pdfbox}" api "org.apache.pdfbox:jempbox:${versions.jempbox}" // OpenOffice diff --git a/modules/ingest-attachment/licenses/jsoup-LICENSE.txt b/modules/ingest-attachment/licenses/jsoup-LICENSE.txt new file mode 100644 index 0000000000000..e4bf2be9fb7f2 --- /dev/null +++ b/modules/ingest-attachment/licenses/jsoup-LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2009-2025 Jonathan Hedley + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/modules/ingest-attachment/licenses/tagsoup-NOTICE.txt b/modules/ingest-attachment/licenses/jsoup-NOTICE.txt similarity index 100% rename from modules/ingest-attachment/licenses/tagsoup-NOTICE.txt rename to modules/ingest-attachment/licenses/jsoup-NOTICE.txt diff --git a/modules/ingest-attachment/licenses/tagsoup-LICENSE.txt b/modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt similarity index 60% rename from modules/ingest-attachment/licenses/tagsoup-LICENSE.txt rename to modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt index 261eeb9e9f8b2..97553f24a432a 100644 --- a/modules/ingest-attachment/licenses/tagsoup-LICENSE.txt +++ b/modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -199,3 +200,145 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +EXTERNAL COMPONENTS + +Apache PDFBox includes a number of components with separate copyright notices +and license terms. Your use of these components is subject to the terms and +conditions of the following licenses. + +Contributions made to the original PDFBox and FontBox projects: + + Copyright (c) 2002-2007, www.pdfbox.org + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of pdfbox; nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + +Adobe Font Metrics (AFM) for PDF Core 14 Fonts + + This file and the 14 PostScript(R) AFM files it accompanies may be used, + copied, and distributed for any purpose and without charge, with or without + modification, provided that all copyright notices are retained; that the + AFM files are not distributed without this file; that all modifications + to this file or any of the AFM files are prominently noted in the modified + file(s); and that this paragraph is not modified. Adobe Systems has no + responsibility or obligation to support the use of the AFM files. + +CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads) + + Copyright 1990-2009 Adobe Systems Incorporated. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + Neither the name of Adobe Systems Incorporated nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. + +PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf) + + Copyright 2010 Atos Worldline SAS + + Licensed by Atos Worldline SAS under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +OSXAdapter + + Version: 2.0 + + Disclaimer: IMPORTANT: This Apple software is supplied to you by + Apple Inc. ("Apple") in consideration of your agreement to the + following terms, and your use, installation, modification or + redistribution of this Apple software constitutes acceptance of these + terms. If you do not agree with these terms, please do not use, + install, modify or redistribute this Apple software. + + In consideration of your agreement to abide by the following terms, and + subject to these terms, Apple grants you a personal, non-exclusive + license, under Apple's copyrights in this original Apple software (the + "Apple Software"), to use, reproduce, modify and redistribute the Apple + Software, with or without modifications, in source and/or binary forms; + provided that if you redistribute the Apple Software in its entirety and + without modifications, you must retain this notice and the following + text and disclaimers in all such redistributions of the Apple Software. + Neither the name, trademarks, service marks or logos of Apple Inc. + may be used to endorse or promote products derived from the Apple + Software without specific prior written permission from Apple. Except + as expressly stated in this notice, no other rights or licenses, express + or implied, are granted by Apple herein, including but not limited to + any patent rights that may be infringed by your derivative works or by + other works in which the Apple Software may be incorporated. + + The Apple Software is provided by Apple on an "AS IS" basis. APPLE + MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION + THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND + OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. + + IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL + OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, + MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED + AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), + STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved diff --git a/modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt b/modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt new file mode 100644 index 0000000000000..4da75301eaf79 --- /dev/null +++ b/modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt @@ -0,0 +1,22 @@ +Apache PDFBox +Copyright 2014 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Based on source code originally developed in the PDFBox and +FontBox projects. + +Copyright (c) 2002-2007, www.pdfbox.org + +Based on source code originally developed in the PaDaF project. +Copyright (c) 2010 Atos Worldline SAS + +Includes the Adobe Glyph List +Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated. + +Includes the Zapf Dingbats Glyph List +Copyright 2002, 2010 Adobe Systems Incorporated. + +Includes OSXAdapter +Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved diff --git a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java index c057d17576c0a..f83768784d3cd 100644 --- a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java +++ b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java @@ -16,6 +16,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; +import org.apache.tika.parser.html.JSoupParser; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -46,7 +47,7 @@ final class TikaImpl { /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents - new org.apache.tika.parser.html.HtmlParser(), + new JSoupParser(), new org.apache.tika.parser.microsoft.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(),