Skip to content

Commit c000e55

Browse files
authored
Upgrading to tika 3.2.2 (#133410) (#133414)
1 parent 65a4808 commit c000e55

File tree

8 files changed

+294
-101
lines changed

8 files changed

+294
-101
lines changed

docs/changelog/133410.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 133410
2+
summary: Upgrading to tika 3.2.2
3+
area: Ingest Node
4+
type: upgrade
5+
issues: []

gradle/verification-metadata.xml

Lines changed: 88 additions & 88 deletions
Large diffs are not rendered by default.

modules/ingest-attachment/build.gradle

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,24 @@ esplugin {
1919
// when updating tika, please review its parent pom : https://repo1.maven.org/maven2/org/apache/tika/tika-parent
2020
// and manually update the transitive dependencies here
2121
def versions = [
22-
'tika' : '2.9.3',
23-
'pdfbox': '2.0.33',
24-
'poi' : '5.4.0',
22+
'tika' : '3.2.2',
23+
'pdfbox': '3.0.5',
24+
'poi' : '5.4.1',
2525
'sparsebitset' : '1.3', //poi dependency: https://repo1.maven.org/maven2/org/apache/poi/poi/
26-
'mime4j': '0.8.12',
27-
'commonsCodec': '1.18.0',
28-
'slf4' : '2.0.16',
26+
'mime4j': '0.8.13',
27+
'commonsCodec': '1.19.0',
28+
'slf4' : '2.0.17',
2929
'xz' : '1.10',
30-
'commonsIo' : '2.18.0',
30+
'commonsIo' : '2.20.0',
3131
//intentionally using the older "juniversalchardet:juniversalchardet" rather than the newer "com.github.albfernandez:juniversalchardet"
3232
//since the "com.github.albfernandez" fork has some problems with Chinese.
3333
'juniversalchardet' : '1.0.3',
34-
'tagsoup' : '1.2.1',
34+
'jsoup' : '1.21.1',
3535
'jempbox' : '1.8.17',
3636
'xmlbeans' : '5.3.0', //poi-ooxml dependency: https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/
37-
'commonsCollections4' : '4.4',
38-
'commonsCompress' : '1.27.1',
39-
'commonsLang3' :'3.17.0',
37+
'commonsCollections4' : '4.5.0',
38+
'commonsCompress' : '1.28.0',
39+
'commonsLang3' :'3.18.0',
4040
'commonsMath3' : '3.6.1'
4141
]
4242

@@ -86,9 +86,10 @@ dependencies {
8686

8787
// external parser libraries
8888
// HTML
89-
api "org.ccil.cowan.tagsoup:tagsoup:${versions.tagsoup}"
89+
api "org.jsoup:jsoup:${versions.jsoup}"
9090
// Adobe PDF
9191
api "org.apache.pdfbox:pdfbox:${versions.pdfbox}"
92+
api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}"
9293
api "org.apache.pdfbox:fontbox:${versions.pdfbox}"
9394
api "org.apache.pdfbox:jempbox:${versions.jempbox}"
9495
// OpenOffice
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License
2+
3+
Copyright (c) 2009-2025 Jonathan Hedley <https://jsoup.org/>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
File renamed without changes.

modules/ingest-attachment/licenses/tagsoup-LICENSE.txt renamed to modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
Apache License
23
Version 2.0, January 2004
34
http://www.apache.org/licenses/
@@ -199,3 +200,145 @@
199200
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200201
See the License for the specific language governing permissions and
201202
limitations under the License.
203+
204+
EXTERNAL COMPONENTS
205+
206+
Apache PDFBox includes a number of components with separate copyright notices
207+
and license terms. Your use of these components is subject to the terms and
208+
conditions of the following licenses.
209+
210+
Contributions made to the original PDFBox and FontBox projects:
211+
212+
Copyright (c) 2002-2007, www.pdfbox.org
213+
All rights reserved.
214+
215+
Redistribution and use in source and binary forms, with or without
216+
modification, are permitted provided that the following conditions are met:
217+
218+
1. Redistributions of source code must retain the above copyright notice,
219+
this list of conditions and the following disclaimer.
220+
221+
2. Redistributions in binary form must reproduce the above copyright
222+
notice, this list of conditions and the following disclaimer in the
223+
documentation and/or other materials provided with the distribution.
224+
225+
3. Neither the name of pdfbox; nor the names of its contributors may be
226+
used to endorse or promote products derived from this software without
227+
specific prior written permission.
228+
229+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
230+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
232+
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
233+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
234+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
235+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
236+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
237+
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
238+
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
239+
SUCH DAMAGE.
240+
241+
Adobe Font Metrics (AFM) for PDF Core 14 Fonts
242+
243+
This file and the 14 PostScript(R) AFM files it accompanies may be used,
244+
copied, and distributed for any purpose and without charge, with or without
245+
modification, provided that all copyright notices are retained; that the
246+
AFM files are not distributed without this file; that all modifications
247+
to this file or any of the AFM files are prominently noted in the modified
248+
file(s); and that this paragraph is not modified. Adobe Systems has no
249+
responsibility or obligation to support the use of the AFM files.
250+
251+
CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads)
252+
253+
Copyright 1990-2009 Adobe Systems Incorporated.
254+
All rights reserved.
255+
256+
Redistribution and use in source and binary forms, with or without
257+
modification, are permitted provided that the following conditions
258+
are met:
259+
260+
Redistributions of source code must retain the above copyright notice,
261+
this list of conditions and the following disclaimer.
262+
263+
Redistributions in binary form must reproduce the above copyright notice,
264+
this list of conditions and the following disclaimer in the documentation
265+
and/or other materials provided with the distribution.
266+
267+
Neither the name of Adobe Systems Incorporated nor the names of its
268+
contributors may be used to endorse or promote products derived from this
269+
software without specific prior written permission.
270+
271+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
272+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
273+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
274+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
275+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
276+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
277+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
278+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
279+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
280+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
281+
THE POSSIBILITY OF SUCH DAMAGE.
282+
283+
PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf)
284+
285+
Copyright 2010 Atos Worldline SAS
286+
287+
Licensed by Atos Worldline SAS under one or more
288+
contributor license agreements. See the NOTICE file distributed with
289+
this work for additional information regarding copyright ownership.
290+
Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0
291+
(the "License"); you may not use this file except in compliance with
292+
the License. You may obtain a copy of the License at
293+
294+
http://www.apache.org/licenses/LICENSE-2.0
295+
296+
Unless required by applicable law or agreed to in writing, software
297+
distributed under the License is distributed on an "AS IS" BASIS,
298+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
299+
See the License for the specific language governing permissions and
300+
limitations under the License.
301+
302+
OSXAdapter
303+
304+
Version: 2.0
305+
306+
Disclaimer: IMPORTANT: This Apple software is supplied to you by
307+
Apple Inc. ("Apple") in consideration of your agreement to the
308+
following terms, and your use, installation, modification or
309+
redistribution of this Apple software constitutes acceptance of these
310+
terms. If you do not agree with these terms, please do not use,
311+
install, modify or redistribute this Apple software.
312+
313+
In consideration of your agreement to abide by the following terms, and
314+
subject to these terms, Apple grants you a personal, non-exclusive
315+
license, under Apple's copyrights in this original Apple software (the
316+
"Apple Software"), to use, reproduce, modify and redistribute the Apple
317+
Software, with or without modifications, in source and/or binary forms;
318+
provided that if you redistribute the Apple Software in its entirety and
319+
without modifications, you must retain this notice and the following
320+
text and disclaimers in all such redistributions of the Apple Software.
321+
Neither the name, trademarks, service marks or logos of Apple Inc.
322+
may be used to endorse or promote products derived from the Apple
323+
Software without specific prior written permission from Apple. Except
324+
as expressly stated in this notice, no other rights or licenses, express
325+
or implied, are granted by Apple herein, including but not limited to
326+
any patent rights that may be infringed by your derivative works or by
327+
other works in which the Apple Software may be incorporated.
328+
329+
The Apple Software is provided by Apple on an "AS IS" basis. APPLE
330+
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
331+
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
332+
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
333+
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
334+
335+
IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
336+
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
337+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
338+
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
339+
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
340+
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
341+
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
342+
POSSIBILITY OF SUCH DAMAGE.
343+
344+
Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Apache PDFBox
2+
Copyright 2014 The Apache Software Foundation
3+
4+
This product includes software developed at
5+
The Apache Software Foundation (http://www.apache.org/).
6+
7+
Based on source code originally developed in the PDFBox and
8+
FontBox projects.
9+
10+
Copyright (c) 2002-2007, www.pdfbox.org
11+
12+
Based on source code originally developed in the PaDaF project.
13+
Copyright (c) 2010 Atos Worldline SAS
14+
15+
Includes the Adobe Glyph List
16+
Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated.
17+
18+
Includes the Zapf Dingbats Glyph List
19+
Copyright 2002, 2010 Adobe Systems Incorporated.
20+
21+
Includes OSXAdapter
22+
Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved

modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.apache.tika.parser.AutoDetectParser;
1717
import org.apache.tika.parser.Parser;
1818
import org.apache.tika.parser.ParserDecorator;
19+
import org.apache.tika.parser.html.JSoupParser;
1920

2021
import java.io.ByteArrayInputStream;
2122
import java.io.IOException;
@@ -46,7 +47,7 @@ final class TikaImpl {
4647
/** subset of parsers for types we support */
4748
private static final Parser PARSERS[] = new Parser[] {
4849
// documents
49-
new org.apache.tika.parser.html.HtmlParser(),
50+
new JSoupParser(),
5051
new org.apache.tika.parser.microsoft.rtf.RTFParser(),
5152
new org.apache.tika.parser.pdf.PDFParser(),
5253
new org.apache.tika.parser.txt.TXTParser(),

0 commit comments

Comments
 (0)