diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java index 06783d67c1..132e657a74 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java @@ -46,6 +46,12 @@ public static PipesConfig load(Path tikaConfig) throws IOException, TikaConfigEx return pipesConfig; } + public static PipesConfig load(InputStream tikaConfigInputStream) throws IOException, TikaConfigException { + PipesConfig pipesConfig = new PipesConfig(); + pipesConfig.configure("pipes", tikaConfigInputStream); + return pipesConfig; + } + private PipesConfig() { } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java index bf6a6bb696..27e6a49fbc 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java @@ -54,7 +54,10 @@ public class PipesConfigBase extends ConfigBase { private int numClients = DEFAULT_NUM_CLIENTS; private int maxFilesProcessedPerProcess = DEFAULT_MAX_FILES_PROCESSED_PER_PROCESS; - + public static final int DEFAULT_STALE_FETCHER_TIMEOUT_SECONDS = 600; + private int staleFetcherTimeoutSeconds = DEFAULT_STALE_FETCHER_TIMEOUT_SECONDS; + public static final int DEFAULT_STALE_FETCHER_DELAY_SECONDS = 60; + private int staleFetcherDelaySeconds = DEFAULT_STALE_FETCHER_DELAY_SECONDS; private List forkedJvmArgs = new ArrayList<>(); private Path tikaConfig; private String javaPath = "java"; @@ -171,4 +174,20 @@ public long getSleepOnStartupTimeoutMillis() { public void setSleepOnStartupTimeoutMillis(long sleepOnStartupTimeoutMillis) { this.sleepOnStartupTimeoutMillis = sleepOnStartupTimeoutMillis; } + + public int getStaleFetcherTimeoutSeconds() { + return staleFetcherTimeoutSeconds; + } + + public void setStaleFetcherTimeoutSeconds(int 
staleFetcherTimeoutSeconds) { + this.staleFetcherTimeoutSeconds = staleFetcherTimeoutSeconds; + } + + public int getStaleFetcherDelaySeconds() { + return staleFetcherDelaySeconds; + } + + public void setStaleFetcherDelaySeconds(int staleFetcherDelaySeconds) { + this.staleFetcherDelaySeconds = staleFetcherDelaySeconds; + } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index 95bb0e9edc..59def749cf 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -69,10 +69,11 @@ import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.emitter.StreamEmitter; import org.apache.tika.pipes.emitter.TikaEmitterException; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.extractor.EmittingEmbeddedDocumentBytesHandler; +import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.fetcher.RangeFetcher; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; @@ -280,7 +281,7 @@ private String getContainerStacktrace(FetchEmitTuple t, List metadataL private void emit(String taskId, EmitKey emitKey, boolean isExtractEmbeddedBytes, MetadataListAndEmbeddedBytes parseData, - String parseExceptionStack, ParseContext parseContext) { + String parseExceptionStack) { Emitter emitter = null; try { @@ -296,7 +297,7 @@ private void emit(String taskId, EmitKey emitKey, parseData.toBePackagedForStreamEmitter()) { emitContentsAndBytes(emitter, emitKey, parseData); } else { - emitter.emit(emitKey.getEmitKey(), parseData.getMetadataList(), parseContext); + emitter.emit(emitKey.getEmitKey(), parseData.getMetadataList()); } } catch 
(IOException | TikaEmitterException e) { LOG.warn("emit exception", e); @@ -377,7 +378,7 @@ private void actuallyParse(FetchEmitTuple t) { LOG.trace("timer -- to parse: {} ms", System.currentTimeMillis() - start); } - if (metadataIsEmpty(parseData.getMetadataList())) { + if (parseData == null || metadataIsEmpty(parseData.getMetadataList())) { write(STATUS.EMPTY_OUTPUT); return; } @@ -400,11 +401,8 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD String stack = getContainerStacktrace(t, parseData.getMetadataList()); //we need to apply this after we pull out the stacktrace filterMetadata(parseData.getMetadataList()); - ParseContext parseContext = t.getParseContext(); - FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException(); - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); if (StringUtils.isBlank(stack) || - onParseException == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) { + t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) { injectUserMetadata(t.getMetadata(), parseData.getMetadataList()); EmitKey emitKey = t.getEmitKey(); if (StringUtils.isBlank(emitKey.getEmitKey())) { @@ -412,14 +410,14 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD t.setEmitKey(emitKey); } EmitData emitData = new EmitData(t.getEmitKey(), parseData.getMetadataList(), stack); - if (embeddedDocumentBytesConfig.isExtractEmbeddedDocumentBytes() && + if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes() && parseData.toBePackagedForStreamEmitter()) { - emit(t.getId(), emitKey, embeddedDocumentBytesConfig.isExtractEmbeddedDocumentBytes(), - parseData, stack, parseContext); + emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(), + parseData, stack); } else if (maxForEmitBatchBytes >= 0 && emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) { - emit(t.getId(), emitKey, 
embeddedDocumentBytesConfig.isExtractEmbeddedDocumentBytes(), - parseData, stack, parseContext); + emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(), + parseData, stack); } else { //send back to the client write(emitData); @@ -458,18 +456,35 @@ private Fetcher getFetcher(FetchEmitTuple t) { } protected MetadataListAndEmbeddedBytes parseFromTuple(FetchEmitTuple t, Fetcher fetcher) { - - Metadata metadata = new Metadata(); - try (InputStream stream = fetcher.fetch(t.getFetchKey().getFetchKey(), metadata, t.getParseContext())) { - return parseWithStream(t, stream, metadata); - } catch (SecurityException e) { - LOG.error("security exception " + t.getId(), e); - throw e; - } catch (TikaException | IOException e) { - LOG.warn("fetch exception " + t.getId(), e); - write(STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e)); + FetchKey fetchKey = t.getFetchKey(); + if (fetchKey.hasRange()) { + if (!(fetcher instanceof RangeFetcher)) { + throw new IllegalArgumentException( + "fetch key has a range, but the fetcher is not a range fetcher"); + } + Metadata metadata = t.getMetadata() == null ? new Metadata() : t.getMetadata(); + try (InputStream stream = ((RangeFetcher) fetcher).fetch(fetchKey.getFetchKey(), + fetchKey.getRangeStart(), fetchKey.getRangeEnd(), metadata)) { + return parseWithStream(t, stream, metadata); + } catch (SecurityException e) { + LOG.error("security exception " + t.getId(), e); + throw e; + } catch (TikaException | IOException e) { + LOG.warn("fetch exception " + t.getId(), e); + write(STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e)); + } + } else { + Metadata metadata = t.getMetadata() == null ? 
new Metadata() : t.getMetadata(); + try (InputStream stream = fetcher.fetch(t.getFetchKey().getFetchKey(), metadata)) { + return parseWithStream(t, stream, metadata); + } catch (SecurityException e) { + LOG.error("security exception " + t.getId(), e); + throw e; + } catch (TikaException | IOException e) { + LOG.warn("fetch exception " + t.getId(), e); + write(STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e)); + } } - return null; } @@ -513,11 +528,10 @@ private void handleOOM(String taskId, OutOfMemoryError oom) { private MetadataListAndEmbeddedBytes parseWithStream(FetchEmitTuple fetchEmitTuple, InputStream stream, Metadata metadata) throws TikaConfigException { - + HandlerConfig handlerConfig = fetchEmitTuple.getHandlerConfig(); List metadataList; //this adds the EmbeddedDocumentByteStore to the parsecontext - ParseContext parseContext = setupParseContext(fetchEmitTuple); - HandlerConfig handlerConfig = parseContext.get(HandlerConfig.class); + ParseContext parseContext = createParseContext(fetchEmitTuple); if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) { metadataList = parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext); @@ -530,16 +544,10 @@ private MetadataListAndEmbeddedBytes parseWithStream(FetchEmitTuple fetchEmitTup parseContext.get(EmbeddedDocumentBytesHandler.class)); } - private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) + private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple) throws TikaConfigException { - ParseContext parseContext = fetchEmitTuple.getParseContext(); - if (parseContext.get(HandlerConfig.class) == null) { - parseContext.set(HandlerConfig.class, HandlerConfig.DEFAULT_HANDLER_CONFIG); - } - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); - if (embeddedDocumentBytesConfig == null) { - //make sure there's one here -- or do we make this default in fetchemit tuple? 
- parseContext.set(EmbeddedDocumentBytesConfig.class, EmbeddedDocumentBytesConfig.SKIP); + ParseContext parseContext = new ParseContext(); + if (! fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) { return parseContext; } EmbeddedDocumentExtractorFactory factory = ((AutoDetectParser)autoDetectParser) @@ -553,17 +561,18 @@ private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) "instance of EmbeddedDocumentByteStoreExtractorFactory if you want" + "to extract embedded bytes! I see this embedded doc factory: " + factory.getClass() + "and a request: " + - embeddedDocumentBytesConfig); + fetchEmitTuple.getEmbeddedDocumentBytesConfig()); } } //TODO: especially clean this up. - if (!StringUtils.isBlank(embeddedDocumentBytesConfig.getEmitter())) { + if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) { parseContext.set(EmbeddedDocumentBytesHandler.class, - new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple, emitterManager)); + new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple.getEmitKey(), + fetchEmitTuple.getEmbeddedDocumentBytesConfig(), emitterManager)); } else { parseContext.set(EmbeddedDocumentBytesHandler.class, new BasicEmbeddedDocumentBytesHandler( - embeddedDocumentBytesConfig)); + fetchEmitTuple.getEmbeddedDocumentBytesConfig())); } return parseContext; } @@ -684,10 +693,11 @@ private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, } catch (IOException e) { LOG.warn("problem detecting: " + t.getId(), e); } - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); - if (embeddedDocumentBytesConfig != null && - embeddedDocumentBytesConfig.isIncludeOriginal()) { - EmbeddedDocumentBytesHandler embeddedDocumentByteStore = parseContext.get(EmbeddedDocumentBytesHandler.class); + + if (t.getEmbeddedDocumentBytesConfig() != null && + t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) { + 
EmbeddedDocumentBytesHandler embeddedDocumentByteStore = + parseContext.get(EmbeddedDocumentBytesHandler.class); try (InputStream is = Files.newInputStream(tis.getPath())) { embeddedDocumentByteStore.add(0, metadata, is); } catch (IOException e) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java index 40121f9a7e..6a4c921a0a 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java @@ -25,6 +25,9 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -35,7 +38,7 @@ * This forbids multiple fetchers supporting the same name. */ public class FetcherManager extends ConfigBase { - + private static final Logger LOG = LoggerFactory.getLogger(FetcherManager.class); public static FetcherManager load(Path p) throws IOException, TikaConfigException { try (InputStream is = Files.newInputStream(p)) { @@ -48,12 +51,12 @@ public static FetcherManager load(Path p) throws IOException, TikaConfigExceptio public FetcherManager(List fetchers) throws TikaConfigException { for (Fetcher fetcher : fetchers) { String name = fetcher.getName(); - if (name == null || name.trim().length() == 0) { - throw new TikaConfigException("fetcher name must not be blank"); + if (name == null || name.trim().isEmpty()) { + throw new TikaConfigException("Fetcher name must not be blank"); } if (fetcherMap.containsKey(fetcher.getName())) { - throw new TikaConfigException( - "Multiple fetchers cannot support the same prefix: " + fetcher.getName()); + LOG.warn("Duplicate fetcher saved in the tika-config xml: {}. 
Ignoring.", fetcher.getName()); + continue; } fetcherMap.put(fetcher.getName(), fetcher); } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java new file mode 100644 index 0000000000..1d0fa8e48e --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.config; + +public abstract class AbstractConfig { + // Nothing to do here yet. 
+} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java index 7188999767..bc3c4cddd3 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -43,8 +43,16 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.fs.config.FileSystemFetcherConfig; public class FileSystemFetcher extends AbstractFetcher implements Initializable { + public FileSystemFetcher() { + } + + public FileSystemFetcher(FileSystemFetcherConfig fileSystemFetcherConfig) { + setBasePath(fileSystemFetcherConfig.getBasePath()); + setExtractFileSystemMetadata(fileSystemFetcherConfig.isExtractFileSystemMetadata()); + } private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcher.class); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java new file mode 100644 index 0000000000..b9f155fbd7 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.fs.config; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class FileSystemFetcherConfig extends AbstractConfig { + private String basePath; + private boolean extractFileSystemMetadata; + + public String getBasePath() { + return basePath; + } + + public FileSystemFetcherConfig setBasePath(String basePath) { + this.basePath = basePath; + return this; + } + + public boolean isExtractFileSystemMetadata() { + return extractFileSystemMetadata; + } + + public FileSystemFetcherConfig setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { + this.extractFileSystemMetadata = extractFileSystemMetadata; + return this; + } +} diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java new file mode 100644 index 0000000000..01cc86c0a2 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import javax.xml.parsers.ParserConfigurationException; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.emitter.EmitKey; +import org.apache.tika.pipes.fetcher.FetchKey; + +class PipesClientTest { + String fetcherName = "fs"; + String testPdfFile = "testOverlappingText.pdf"; + + private PipesClient pipesClient; + + @BeforeEach + public void init() + throws TikaConfigException, IOException, ParserConfigurationException, SAXException { + Path tikaConfigPath = + Paths.get("src", "test", "resources", "org", "apache", "tika", "pipes", + "tika-sample-config.xml"); + PipesConfig pipesConfig = PipesConfig.load(tikaConfigPath); + pipesClient = new PipesClient(pipesConfig); + } + + @Test + void process() throws IOException, InterruptedException { + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple(testPdfFile, new FetchKey(fetcherName, testPdfFile), + new EmitKey(), new Metadata(), new ParseContext(), FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + Assertions.assertNotNull(pipesResult.getEmitData().getMetadataList()); + Assertions.assertEquals(1, pipesResult.getEmitData().getMetadataList().size()); + Metadata metadata = 
pipesResult.getEmitData().getMetadataList().get(0); + Assertions.assertEquals("testOverlappingText.pdf", metadata.get("resourceName")); + } +} diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml b/tika-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml new file mode 100644 index 0000000000..c936852d95 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml @@ -0,0 +1,41 @@ + + + + + + 2 + + -Xmx1g + -XX:ParallelGCThreads=2 + + 60000 + -1 + + + + + false + + + + + fs + src/test/resources/test-documents + + + \ No newline at end of file diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index f01a3c11c3..5e0427b84e 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -328,7 +328,7 @@ 0.10.1 1.8.0 1.17.0 - 4.5.0-M1 + 4.5.0-M2 1.26.2 1.11.0 1.4.0 diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index 4ef27a191b..d1de647b39 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -36,6 +36,7 @@ tika-pipes-iterators tika-pipes-reporters tika-async-cli + tika-grpc @@ -69,11 +70,13 @@ checkstyle.xml UTF-8 - false + true + true true ${project.basedir}/src/test/java error true + true check diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 7830a74d67..2507de6e0d 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -36,6 +36,8 @@ tika-fetcher-s3 tika-fetcher-gcs tika-fetcher-az-blob + tika-fetcher-microsoft-graph + tika-fetcher-google @@ -44,4 +46,4 @@ 3.0.0-BETA-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java index d1f9e80d64..e38b71d9d9 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java 
+++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java @@ -45,6 +45,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.azblob.config.AZBlobFetcherConfig; import org.apache.tika.utils.StringUtils; /** @@ -58,6 +59,16 @@ * your requests, your fetchKey will be the complete SAS url pointing to the blob. */ public class AZBlobFetcher extends AbstractFetcher implements Initializable { + public AZBlobFetcher() { + + } + public AZBlobFetcher(AZBlobFetcherConfig azBlobFetcherConfig) { + setContainer(azBlobFetcherConfig.getContainer()); + setEndpoint(azBlobFetcherConfig.getEndpoint()); + setSasToken(azBlobFetcherConfig.getSasToken()); + setSpoolToTemp(azBlobFetcherConfig.isSpoolToTemp()); + setExtractUserMetadata(azBlobFetcherConfig.isExtractUserMetadata()); + } private static final Logger LOGGER = LoggerFactory.getLogger(AZBlobFetcher.class); private static String PREFIX = "az-blob"; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java new file mode 100644 index 0000000000..2bfe61fa79 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.azblob.config; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class AZBlobFetcherConfig extends AbstractConfig { + private boolean spoolToTemp; + private String sasToken; + private String endpoint; + private String container; + private boolean extractUserMetadata; + + public boolean isSpoolToTemp() { + return spoolToTemp; + } + + public AZBlobFetcherConfig setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + return this; + } + + public String getSasToken() { + return sasToken; + } + + public AZBlobFetcherConfig setSasToken(String sasToken) { + this.sasToken = sasToken; + return this; + } + + public String getEndpoint() { + return endpoint; + } + + public AZBlobFetcherConfig setEndpoint(String endpoint) { + this.endpoint = endpoint; + return this; + } + + public String getContainer() { + return container; + } + + public AZBlobFetcherConfig setContainer(String container) { + this.container = container; + return this; + } + + public boolean isExtractUserMetadata() { + return extractUserMetadata; + } + + public AZBlobFetcherConfig setExtractUserMetadata(boolean extractUserMetadata) { + this.extractUserMetadata = extractUserMetadata; + return this; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java index 661d5f30db..75f89527e8 100644 --- 
a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java @@ -41,12 +41,21 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.gcs.config.GCSFetcherConfig; /** * Fetches files from google cloud storage. Must set projectId and bucket via the config. */ public class GCSFetcher extends AbstractFetcher implements Initializable { + public GCSFetcher() { + } + public GCSFetcher(GCSFetcherConfig gcsFetcherConfig) { + setBucket(gcsFetcherConfig.getBucket()); + setProjectId(gcsFetcherConfig.getProjectId()); + setSpoolToTemp(gcsFetcherConfig.isSpoolToTemp()); + setExtractUserMetadata(gcsFetcherConfig.isExtractUserMetadata()); + } private static String PREFIX = "gcs"; private static final Logger LOGGER = LoggerFactory.getLogger(GCSFetcher.class); private String projectId; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java new file mode 100644 index 0000000000..a8dad6417d --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.gcs.config; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class GCSFetcherConfig extends AbstractConfig { + private boolean spoolToTemp; + private String projectId; + private String bucket; + private boolean extractUserMetadata; + + public boolean isSpoolToTemp() { + return spoolToTemp; + } + + public GCSFetcherConfig setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + return this; + } + + public String getProjectId() { + return projectId; + } + + public GCSFetcherConfig setProjectId(String projectId) { + this.projectId = projectId; + return this; + } + + public String getBucket() { + return bucket; + } + + public GCSFetcherConfig setBucket(String bucket) { + this.bucket = bucket; + return this; + } + + public boolean isExtractUserMetadata() { + return extractUserMetadata; + } + + public GCSFetcherConfig setExtractUserMetadata(boolean extractUserMetadata) { + this.extractUserMetadata = extractUserMetadata; + return this; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml new file mode 100644 index 0000000000..a7098309da --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml @@ -0,0 +1,115 @@ + + + + 4.0.0 + + + tika-fetchers + org.apache.tika + 4.0.0-SNAPSHOT + + + tika-fetcher-google + Google Tika Pipes Fetcher + + + 2.2.0 + 11 + 11 + UTF-8 + 1.11.0 + 6.4.0 + 1.1.1 + 5.11.0-M2 + 3.3.1 + 5.3.1 + 9.37.3 + + + + + + ${project.groupId} + tika-core + ${project.version} + 
+ + + + com.google.api-client + google-api-client + ${google.api.client.version} + + + + com.google.auth + google-auth-library-oauth2-http + 1.19.0 + + + + + com.google.apis + google-api-services-drive + v3-rev20241027-2.0.0 + + + + + org.slf4j + slf4j-api + + + + + commons-io + commons-io + + + + + org.junit.jupiter + junit-jupiter + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.apache.tika.pipes.fetcher.s3 + + + + + + + + + 3.0.0-BETA-rc1 + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java new file mode 100644 index 0000000000..94a21740ee --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetchers.google; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Base64; +import java.util.List; +import java.util.Map; + +import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; +import com.google.api.client.http.HttpRequestInitializer; +import com.google.api.client.json.JsonFactory; +import com.google.api.client.json.gson.GsonFactory; +import com.google.api.services.drive.Drive; +import com.google.api.services.drive.DriveScopes; +import com.google.auth.http.HttpCredentialsAdapter; +import com.google.auth.oauth2.GoogleCredentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetchers.google.config.GoogleDriveFetcherConfig; + + +/** + * GoogleDrive Fetcher allows the fetching of files from a Google Drive, using a + * service account key. + * + * Fetch Keys are ${fileId},${subjectUser}, where the subject user is the + * organizer of the file. This user is necessary as part of the key as the + * service account must act on behalf of the user when querying for the file. 
+ */ +public class GoogleDriveFetcher extends AbstractFetcher implements Initializable { + private static final Logger LOGGER = LoggerFactory.getLogger(GoogleDriveFetcher.class); + private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance(); + + private GoogleCredentials baseCredentials; + + private Drive driveService; + private boolean spoolToTemp; + private List scopes; + + private GoogleDriveFetcherConfig config = new GoogleDriveFetcherConfig(); + + public GoogleDriveFetcher() { + scopes = new ArrayList<>(); + scopes.add(DriveScopes.DRIVE_READONLY); + } + + public GoogleDriveFetcher(GoogleDriveFetcherConfig config) { + this.config = config; + } + + @Field + public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException { + String[] longStrings = (commaDelimitedLongs == null ? "" : commaDelimitedLongs).split(","); + long[] seconds = new long[longStrings.length]; + for (int i = 0; i < longStrings.length; i++) { + try { + seconds[i] = Long.parseLong(longStrings[i]); + } catch (NumberFormatException e) { + throw new TikaConfigException(e.getMessage()); + } + } + setThrottleSeconds(seconds); + } + + public void setThrottleSeconds(long[] throttleSeconds) { + config.setThrottleSeconds(throttleSeconds); + } + + @Field + public void setSpoolToTemp(boolean spoolToTemp) { + config.setSpoolToTemp(spoolToTemp); + } + + @Field + public void setServiceAccountKeyBase64(String serviceAccountKeyBase64) { + config.setServiceAccountKeyBase64(serviceAccountKeyBase64); + } + + @Field + public void setSubjectUser(String subjectUser) { + config.setSubjectUser(subjectUser); + } + + @Field + public void setScopes(List scopes) { + config.setScopes(new ArrayList<>(scopes)); + if (config.getScopes().isEmpty()) { + config.getScopes().add(DriveScopes.DRIVE_READONLY); + } + } + + @Override + public void initialize(Map map) throws TikaConfigException { + try { + baseCredentials = GoogleCredentials + .fromStream(new 
ByteArrayInputStream(Base64.getDecoder().decode(config.getServiceAccountKeyBase64()))) + .createScoped(scopes); + } catch (IOException e) { + throw new TikaConfigException("Failed to initialize Google Drive service", e); + } + } + + @Override + public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException { + } + + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException { + int tries = 0; + Exception ex = null; + TemporaryResources tmp = null; + + do { + long start = System.currentTimeMillis(); + try { + String[] fetchKeySplit = fetchKey.split(","); + if (fetchKeySplit.length != 2) { + throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey); + } + + String fileId = fetchKeySplit[0]; + String subjectUser = fetchKeySplit[1]; + + GoogleCredentials delegatedCredentials = baseCredentials.createDelegated(subjectUser); + final HttpRequestInitializer requestInitializer = new HttpCredentialsAdapter(delegatedCredentials); + + driveService = new Drive.Builder( + GoogleNetHttpTransport.newTrustedTransport(), + JSON_FACTORY, + requestInitializer).setApplicationName("tika-fetcher-google").build(); + + InputStream is = driveService.files() + .get(fileId) + .executeMediaAsInputStream(); + + if (is == null) { + throw new IOException("Empty input stream when we tried to parse " + fetchKey); + } + + if (spoolToTemp) { + tmp = new TemporaryResources(); + Path tmpPath = tmp.createTempFile(fileId + ".dat"); + Files.copy(is, tmpPath); + return TikaInputStream.get(tmpPath); + } + return TikaInputStream.get(is); + + } catch (Exception e) { + LOGGER.warn("Exception fetching on retry=" + tries, e); + ex = e; + } finally { + long elapsed = System.currentTimeMillis() - start; + LOGGER.debug("Total to fetch {}", elapsed); + } + + long[] throttleSeconds = config.getThrottleSeconds(); + + LOGGER.warn("Sleeping for {} 
seconds before retry", throttleSeconds[tries]); + try { + Thread.sleep(throttleSeconds[tries] * 1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } while (++tries < config.getThrottleSeconds().length); + + throw new TikaException("Could not fetch " + fetchKey, ex); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java new file mode 100644 index 0000000000..f03db46955 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetchers.google.config; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class GoogleDriveFetcherConfig extends AbstractConfig { + private long[] throttleSeconds; + private boolean spoolToTemp; + protected String serviceAccountKeyBase64; + protected String subjectUser; + private List scopes = new ArrayList<>(); + + public boolean isSpoolToTemp() { + return spoolToTemp; + } + + public GoogleDriveFetcherConfig setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + return this; + } + + public long[] getThrottleSeconds() { + if (throttleSeconds == null) { + return new long[]{5, 10, 15}; // Default retry intervals + } + return throttleSeconds; + } + + public GoogleDriveFetcherConfig setThrottleSeconds(long[] throttleSeconds) { + this.throttleSeconds = throttleSeconds; + return this; + } + + public String getServiceAccountKeyBase64() { + return serviceAccountKeyBase64; + } + + public GoogleDriveFetcherConfig setServiceAccountKeyBase64(String serviceAccountKeyBase64) { + this.serviceAccountKeyBase64 = serviceAccountKeyBase64; + return this; + } + + public String getSubjectUser() { + return subjectUser; + } + + public GoogleDriveFetcherConfig setSubjectUser(String subjectUser) { + this.subjectUser = subjectUser; + return this; + } + + public List getScopes() { + return scopes; + } + + public GoogleDriveFetcherConfig setScopes(List scopes) { + this.scopes = scopes; + return this; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java index f9a4ebe0df..b64d6bc8a7 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java +++ 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java @@ -28,14 +28,15 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; -import java.util.HashSet; +import java.security.PrivateKey; +import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.atomic.AtomicBoolean; +import com.nimbusds.jose.JOSEException; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.http.ConnectionClosedException; @@ -60,6 +61,7 @@ import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaTimeoutException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -69,13 +71,26 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.AbstractFetcher; import org.apache.tika.pipes.fetcher.RangeFetcher; -import org.apache.tika.pipes.fetcher.http.config.AdditionalHttpHeaders; +import org.apache.tika.pipes.fetcher.http.config.HttpFetcherConfig; +import org.apache.tika.pipes.fetcher.http.jwt.JwtGenerator; +import org.apache.tika.pipes.fetcher.http.jwt.JwtPrivateKeyCreds; +import org.apache.tika.pipes.fetcher.http.jwt.JwtSecretCreds; import org.apache.tika.utils.StringUtils; /** * Based on Apache httpclient */ public class HttpFetcher extends AbstractFetcher implements Initializable, RangeFetcher { + public HttpFetcher() { + + } + + private HttpFetcherConfig httpFetcherConfig = new HttpFetcherConfig(); + private HttpClientFactory httpClientFactory = new HttpClientFactory(); + + public HttpFetcher(HttpFetcherConfig httpFetcherConfig) { + this.httpFetcherConfig = 
httpFetcherConfig; + } public static String HTTP_HEADER_PREFIX = "http-header:"; @@ -84,98 +99,109 @@ public class HttpFetcher extends AbstractFetcher implements Initializable, Range /** * http status code */ - public static Property HTTP_STATUS_CODE = - Property.externalInteger(HTTP_HEADER_PREFIX + "status-code"); + public static Property HTTP_STATUS_CODE = Property.externalInteger(HTTP_HEADER_PREFIX + "status-code"); /** * Number of redirects */ - public static Property HTTP_NUM_REDIRECTS = - Property.externalInteger(HTTP_FETCH_PREFIX + "num-redirects"); + public static Property HTTP_NUM_REDIRECTS = Property.externalInteger(HTTP_FETCH_PREFIX + "num-redirects"); /** * If there were redirects, this captures the final URL visited */ - public static Property HTTP_TARGET_URL = - Property.externalText(HTTP_FETCH_PREFIX + "target-url"); + public static Property HTTP_TARGET_URL = Property.externalText(HTTP_FETCH_PREFIX + "target-url"); - public static Property HTTP_TARGET_IP_ADDRESS = - Property.externalText(HTTP_FETCH_PREFIX + "target-ip-address"); + public static Property HTTP_TARGET_IP_ADDRESS = Property.externalText(HTTP_FETCH_PREFIX + "target-ip-address"); - public static Property HTTP_FETCH_TRUNCATED = - Property.externalBoolean(HTTP_FETCH_PREFIX + "fetch-truncated"); + public static Property HTTP_FETCH_TRUNCATED = Property.externalBoolean(HTTP_FETCH_PREFIX + "fetch-truncated"); - public static Property HTTP_CONTENT_ENCODING = - Property.externalText(HTTP_HEADER_PREFIX + "content-encoding"); + public static Property HTTP_CONTENT_ENCODING = Property.externalText(HTTP_HEADER_PREFIX + "content-encoding"); - public static Property HTTP_CONTENT_TYPE = - Property.externalText(HTTP_HEADER_PREFIX + "content-type"); + public static Property HTTP_CONTENT_TYPE = Property.externalText(HTTP_HEADER_PREFIX + "content-type"); private static String USER_AGENT = "User-Agent"; Logger LOG = LoggerFactory.getLogger(HttpFetcher.class); - private HttpClientFactory httpClientFactory = new 
HttpClientFactory(); private HttpClient httpClient; //back-off client that disables compression private HttpClient noCompressHttpClient; - private int maxRedirects = 10; - //overall timeout in milliseconds - private long overallTimeout = -1; - - private long maxSpoolSize = -1; - - //max string length to read from a result if the - //status code was not in the 200 range - private int maxErrMsgSize = 10000; - - //httpHeaders to capture in the metadata - private Set httpHeaders = new HashSet<>(); - - //When making the request, what User-Agent is sent. - //By default httpclient adds e.g. "Apache-HttpClient/4.5.13 (Java/x.y.z)" - private String userAgent = null; + JwtGenerator jwtGenerator; @Override - public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException { + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException, TikaException { HttpGet get = new HttpGet(fetchKey); - RequestConfig requestConfig = - RequestConfig.custom() - .setMaxRedirects(maxRedirects) - .setRedirectsEnabled(true).build(); + RequestConfig requestConfig = RequestConfig + .custom() + .setMaxRedirects(httpFetcherConfig.getMaxRedirects()) + .setRedirectsEnabled(httpFetcherConfig.getMaxRedirects() > 0) + .build(); get.setConfig(requestConfig); - putAdditionalHeadersOnRequest(parseContext, get); + putAdditionalHeadersOnRequest(get, metadata); return execute(get, metadata, httpClient, true); } @Override - public InputStream fetch(String fetchKey, long startRange, long endRange, Metadata metadata, ParseContext parseContext) - throws IOException { + public InputStream fetch(String fetchKey, long startRange, long endRange, Metadata metadata, + ParseContext parseContext) throws IOException, TikaException { HttpGet get = new HttpGet(fetchKey); - putAdditionalHeadersOnRequest(parseContext, get); + putAdditionalHeadersOnRequest(get, metadata); get.setHeader("Range", "bytes=" + startRange + "-" + endRange); 
return execute(get, metadata, httpClient, true); } - private void putAdditionalHeadersOnRequest(ParseContext parseContext, HttpGet get) { - if (!StringUtils.isBlank(userAgent)) { - get.setHeader(USER_AGENT, userAgent); + private void putAdditionalHeadersOnRequest(HttpGet httpGet, Metadata requestMetadata) throws TikaException { + if (!StringUtils.isBlank(httpFetcherConfig.getUserAgent())) { + httpGet.setHeader(USER_AGENT, httpFetcherConfig.getUserAgent()); + } + if (requestMetadata != null) { + String [] httpRequestHeaders = requestMetadata.getValues("httpRequestHeaders"); + if (httpRequestHeaders != null) { + for (String httpRequestHeader : httpRequestHeaders) { + placeHeaderOnGetRequest(httpGet, httpRequestHeader); + } + } } - AdditionalHttpHeaders additionalHttpHeaders = parseContext.get(AdditionalHttpHeaders.class); - if (additionalHttpHeaders != null) { - additionalHttpHeaders - .getHeaders() - .forEach(get::setHeader); + if (jwtGenerator != null) { + try { + httpGet.setHeader("Authorization", "Bearer " + jwtGenerator.jwt()); + } catch (JOSEException e) { + throw new TikaException("Could not generate JWT", e); + } } + placeHeadersOnGetRequest(httpGet); } - private InputStream execute(HttpGet get, Metadata metadata, HttpClient client, - boolean retryOnBadLength) throws IOException { + private void placeHeadersOnGetRequest(HttpGet httpGet) { + if (httpFetcherConfig.getHttpRequestHeaders() != null) { + for (String httpRequestHeader : httpFetcherConfig.getHttpRequestHeaders()) { + placeHeaderOnGetRequest(httpGet, httpRequestHeader); + } + } + } + + private void placeHeaderOnGetRequest(HttpGet httpGet, String httpRequestHeader) { + int idxOfEquals = httpRequestHeader.indexOf(':'); + if (idxOfEquals == -1) { + return; + } + String headerKey = httpRequestHeader + .substring(0, idxOfEquals) + .trim(); + String headerValue = httpRequestHeader + .substring(idxOfEquals + 1) + .trim(); + httpGet.setHeader(headerKey, headerValue); + } + + + private InputStream 
execute(HttpGet get, Metadata metadata, HttpClient client, boolean retryOnBadLength) throws IOException { HttpClientContext context = HttpClientContext.create(); HttpResponse response = null; final AtomicBoolean timeout = new AtomicBoolean(false); Timer timer = null; + long overallTimeout = httpFetcherConfig.getOverallTimeout() == null ? -1 : httpFetcherConfig.getOverallTimeout(); try { if (overallTimeout > -1) { TimerTask task = new TimerTask() { @@ -193,29 +219,35 @@ public void run() { } response = client.execute(get, context); - updateMetadata(get.getURI().toString(), response, context, metadata); + updateMetadata(get + .getURI() + .toString(), response, context, metadata); - int code = response.getStatusLine().getStatusCode(); + int code = response + .getStatusLine() + .getStatusCode(); + LOG.info("Fetch id {} status code {}", get.getURI(), code); if (code < 200 || code > 299) { - throw new IOException("bad status code: " + code + " :: " + - responseToString(response)); + throw new IOException("bad status code: " + code + " :: " + responseToString(response)); } - try (InputStream is = response.getEntity().getContent()) { + try (InputStream is = response + .getEntity() + .getContent()) { return spool(is, metadata); } } catch (ConnectionClosedException e) { - if (retryOnBadLength && e.getMessage() != null && e.getMessage().contains("Premature " + - "end of " + - "Content-Length delimited message")) { + if (retryOnBadLength && e.getMessage() != null && e + .getMessage() + .contains("Premature " + "end of " + "Content-Length delimited message")) { //one trigger for this is if the server sends the uncompressed length //and then compresses the stream. 
See HTTPCLIENT-2176 - LOG.warn("premature end of content-length delimited message; retrying with " + - "content compression disabled for {}", get.getURI()); + LOG.warn("premature end of content-length delimited message; retrying with " + "content compression" + + " disabled for {}", get.getURI()); return execute(get, metadata, noCompressHttpClient, false); } throw e; - } catch (IOException e) { + } catch (IOException e) { if (timeout.get()) { throw new TikaTimeoutException("Overall timeout after " + overallTimeout + "ms"); } else { @@ -240,12 +272,12 @@ private InputStream spool(InputStream content, Metadata metadata) throws IOExcep long start = System.currentTimeMillis(); TemporaryResources tmp = new TemporaryResources(); Path tmpFile = tmp.createTempFile(metadata); - if (maxSpoolSize < 0) { + if (httpFetcherConfig.getMaxSpoolSize() < 0) { Files.copy(content, tmpFile, StandardCopyOption.REPLACE_EXISTING); } else { try (OutputStream os = Files.newOutputStream(tmpFile)) { - long totalRead = IOUtils.copyLarge(content, os, 0, maxSpoolSize); - if (totalRead == maxSpoolSize && content.read() != -1) { + long totalRead = IOUtils.copyLarge(content, os, 0, httpFetcherConfig.getMaxSpoolSize()); + if (totalRead == httpFetcherConfig.getMaxSpoolSize() && content.read() != -1) { metadata.set(HTTP_FETCH_TRUNCATED, "true"); } } @@ -255,31 +287,38 @@ private InputStream spool(InputStream content, Metadata metadata) throws IOExcep return TikaInputStream.get(tmpFile, metadata, tmp); } - private void updateMetadata(String url, HttpResponse response, HttpClientContext context, - Metadata metadata) { + private void updateMetadata(String url, HttpResponse response, HttpClientContext context, Metadata metadata) { if (response == null) { return; } if (response.getStatusLine() != null) { - metadata.set(HTTP_STATUS_CODE, response.getStatusLine().getStatusCode()); + metadata.set(HTTP_STATUS_CODE, response + .getStatusLine() + .getStatusCode()); } HttpEntity entity = response.getEntity(); if 
(entity != null && entity.getContentEncoding() != null) { - metadata.set(HTTP_CONTENT_ENCODING, entity.getContentEncoding().getValue()); + metadata.set(HTTP_CONTENT_ENCODING, entity + .getContentEncoding() + .getValue()); } if (entity != null && entity.getContentType() != null) { - metadata.set(HTTP_CONTENT_TYPE, entity.getContentType().getValue()); + metadata.set(HTTP_CONTENT_TYPE, entity + .getContentType() + .getValue()); } - //load headers - for (String h : httpHeaders) { - Header[] headers = response.getHeaders(h); - if (headers != null && headers.length > 0) { - String name = HTTP_HEADER_PREFIX + h; - for (Header header : headers) { - metadata.add(name, header.getValue()); + //load response headers + if (httpFetcherConfig.getHttpHeaders() != null) { + for (String h : httpFetcherConfig.getHttpHeaders()) { + Header[] headers = response.getHeaders(h); + if (headers != null && headers.length > 0) { + String name = HTTP_HEADER_PREFIX + h; + for (Header header : headers) { + metadata.add(name, header.getValue()); + } } } } @@ -305,13 +344,12 @@ private void updateMetadata(String url, HttpResponse response, HttpClientContext HttpConnection connection = context.getConnection(); if (connection instanceof HttpInetConnection) { try { - InetAddress inetAddress = ((HttpInetConnection)connection).getRemoteAddress(); + InetAddress inetAddress = ((HttpInetConnection) connection).getRemoteAddress(); if (inetAddress != null) { metadata.set(HTTP_TARGET_IP_ADDRESS, inetAddress.getHostAddress()); } } catch (ConnectionShutdownException e) { - LOG.warn("connection shutdown while trying to get target URL: " + - url); + LOG.warn("connection shutdown while trying to get target URL: " + url); } } } @@ -320,14 +358,18 @@ private String responseToString(HttpResponse response) { if (response.getEntity() == null) { return ""; } - try (InputStream is = response.getEntity().getContent()) { - UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); - 
IOUtils.copyLarge(is, bos, 0, maxErrMsgSize); + try (InputStream is = response + .getEntity() + .getContent()) { + UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream + .builder() + .get(); + IOUtils.copyLarge(is, bos, 0, httpFetcherConfig.getMaxErrMsgSize()); return bos.toString(StandardCharsets.UTF_8); } catch (IOException e) { LOG.warn("IOException trying to read error message", e); return ""; - } catch (NullPointerException e ) { + } catch (NullPointerException e) { return ""; } finally { EntityUtils.consumeQuietly(response.getEntity()); @@ -337,75 +379,90 @@ private String responseToString(HttpResponse response) { @Field public void setUserName(String userName) { - httpClientFactory.setUserName(userName); + httpFetcherConfig.setUserName(userName); } @Field public void setPassword(String password) { - httpClientFactory.setPassword(password); + httpFetcherConfig.setPassword(password); } @Field public void setNtDomain(String domain) { - httpClientFactory.setNtDomain(domain); + httpFetcherConfig.setNtDomain(domain); } @Field public void setAuthScheme(String authScheme) { - httpClientFactory.setAuthScheme(authScheme); + httpFetcherConfig.setAuthScheme(authScheme); } @Field public void setProxyHost(String proxyHost) { - httpClientFactory.setProxyHost(proxyHost); + httpFetcherConfig.setProxyHost(proxyHost); } @Field public void setProxyPort(int proxyPort) { - httpClientFactory.setProxyPort(proxyPort); + httpFetcherConfig.setProxyPort(proxyPort); } @Field public void setConnectTimeout(int connectTimeout) { - httpClientFactory.setConnectTimeout(connectTimeout); + httpFetcherConfig.setConnectTimeout(connectTimeout); } @Field public void setRequestTimeout(int requestTimeout) { - httpClientFactory.setRequestTimeout(requestTimeout); + httpFetcherConfig.setRequestTimeout(requestTimeout); } @Field public void setSocketTimeout(int socketTimeout) { - httpClientFactory.setSocketTimeout(socketTimeout); + 
httpFetcherConfig.setSocketTimeout(socketTimeout); } @Field public void setMaxConnections(int maxConnections) { - httpClientFactory.setMaxConnections(maxConnections); + httpFetcherConfig.setMaxConnections(maxConnections); } @Field public void setMaxConnectionsPerRoute(int maxConnectionsPerRoute) { - httpClientFactory.setMaxConnectionsPerRoute(maxConnectionsPerRoute); + httpFetcherConfig.setMaxConnectionsPerRoute(maxConnectionsPerRoute); } /** * Set the maximum number of bytes to spool to a temp file. * If this value is -1, the full stream will be spooled to a temp file - * + *

* Default size is -1. * * @param maxSpoolSize */ @Field public void setMaxSpoolSize(long maxSpoolSize) { - this.maxSpoolSize = maxSpoolSize; + httpFetcherConfig.setMaxSpoolSize(maxSpoolSize); } @Field public void setMaxRedirects(int maxRedirects) { - this.maxRedirects = maxRedirects; + httpFetcherConfig.setMaxRedirects(maxRedirects); + } + + /** + * Which http request headers should we send in the http fetch requests. + * + * @param headers The headers to add to the HTTP GET requests. + */ + @Field + public void setHttpRequestHeaders(List headers) { + httpFetcherConfig.setHttpRequestHeaders(new ArrayList<>()); + if (headers != null) { + httpFetcherConfig + .getHttpRequestHeaders() + .addAll(headers); + } } /** @@ -416,8 +473,12 @@ public void setMaxRedirects(int maxRedirects) { */ @Field public void setHttpHeaders(List headers) { - this.httpHeaders.clear(); - this.httpHeaders.addAll(headers); + httpFetcherConfig.setHttpHeaders(new ArrayList<>()); + if (headers != null) { + httpFetcherConfig + .getHttpHeaders() + .addAll(headers); + } } /** @@ -428,12 +489,12 @@ public void setHttpHeaders(List headers) { */ @Field public void setOverallTimeout(long overallTimeout) { - this.overallTimeout = overallTimeout; + httpFetcherConfig.setOverallTimeout(overallTimeout); } @Field public void setMaxErrMsgSize(int maxErrMsgSize) { - this.maxErrMsgSize = maxErrMsgSize; + httpFetcherConfig.setMaxErrMsgSize(maxErrMsgSize); } /** @@ -444,24 +505,87 @@ public void setMaxErrMsgSize(int maxErrMsgSize) { */ @Field public void setUserAgent(String userAgent) { - this.userAgent = userAgent; + httpFetcherConfig.setUserAgent(userAgent); + } + + @Field + public void setJwtIssuer(String jwtIssuer) { + httpFetcherConfig.setJwtIssuer(jwtIssuer); + } + + @Field + public void setJwtSubject(String jwtSubject) { + httpFetcherConfig.setJwtSubject(jwtSubject); + } + + @Field + public void setJwtExpiresInSeconds(int jwtExpiresInSeconds) { + httpFetcherConfig.setJwtExpiresInSeconds(jwtExpiresInSeconds); 
+ } + + @Field + public void setJwtSecret(String jwtSecret) { + httpFetcherConfig.setJwtSecret(jwtSecret); + } + + @Field + public void setJwtPrivateKeyBase64(String jwtPrivateKeyBase64) { + httpFetcherConfig.setJwtPrivateKeyBase64(jwtPrivateKeyBase64); } @Override public void initialize(Map params) throws TikaConfigException { + if (httpFetcherConfig.getSocketTimeout() != null) { + httpClientFactory.setSocketTimeout(httpFetcherConfig.getSocketTimeout()); + } + if (httpFetcherConfig.getRequestTimeout() != null) { + httpClientFactory.setRequestTimeout(httpFetcherConfig.getRequestTimeout()); + } + if (httpFetcherConfig.getConnectTimeout() != null) { + httpClientFactory.setSocketTimeout(httpFetcherConfig.getConnectTimeout()); + } + if (httpFetcherConfig.getMaxConnections() != null) { + httpClientFactory.setMaxConnections(httpFetcherConfig.getMaxConnections()); + } + if (httpFetcherConfig.getMaxConnectionsPerRoute() != null) { + httpClientFactory.setMaxConnectionsPerRoute(httpFetcherConfig.getMaxConnectionsPerRoute()); + } + if (!StringUtils.isBlank(httpFetcherConfig.getAuthScheme())) { + httpClientFactory.setUserName(httpFetcherConfig.getUserName()); + httpClientFactory.setPassword(httpFetcherConfig.getPassword()); + httpClientFactory.setAuthScheme(httpFetcherConfig.getAuthScheme()); + if (httpFetcherConfig.getNtDomain() != null) { + httpClientFactory.setNtDomain(httpFetcherConfig.getNtDomain()); + } + } + if (!StringUtils.isBlank(httpFetcherConfig.getProxyHost())) { + httpClientFactory.setProxyHost(httpFetcherConfig.getProxyHost()); + httpClientFactory.setProxyPort(httpFetcherConfig.getProxyPort()); + } httpClient = httpClientFactory.build(); HttpClientFactory cp = httpClientFactory.copy(); cp.setDisableContentCompression(true); noCompressHttpClient = cp.build(); + + if (!StringUtils.isBlank(httpFetcherConfig.getJwtPrivateKeyBase64())) { + PrivateKey key = JwtPrivateKeyCreds.convertBase64ToPrivateKey(httpFetcherConfig.getJwtPrivateKeyBase64()); + jwtGenerator = new 
JwtGenerator(new JwtPrivateKeyCreds(key, httpFetcherConfig.getJwtIssuer(), + httpFetcherConfig.getJwtSubject(), httpFetcherConfig.getJwtExpiresInSeconds())); + } else if (!StringUtils.isBlank(httpFetcherConfig.getJwtSecret())) { + jwtGenerator = new JwtGenerator(new JwtSecretCreds(httpFetcherConfig + .getJwtSecret() + .getBytes(StandardCharsets.UTF_8), httpFetcherConfig.getJwtIssuer(), httpFetcherConfig.getJwtSubject(), httpFetcherConfig.getJwtExpiresInSeconds())); + } } @Override - public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { + public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { + if (!StringUtils.isBlank(httpFetcherConfig.getJwtSecret()) && !StringUtils.isBlank(httpFetcherConfig.getJwtPrivateKeyBase64())) { + throw new TikaConfigException("Both JWT secret and JWT private key base 64 were " + "specified. Only one or the other is supported"); + } } - // For test purposes - void setHttpClientFactory(HttpClientFactory httpClientFactory) { + public void setHttpClientFactory(HttpClientFactory httpClientFactory) { this.httpClientFactory = httpClientFactory; } @@ -472,4 +596,12 @@ public void setHttpClient(HttpClient httpClient) { public HttpClient getHttpClient() { return httpClient; } + + public HttpFetcherConfig getHttpFetcherConfig() { + return httpFetcherConfig; + } + + public void setHttpFetcherConfig(HttpFetcherConfig httpFetcherConfig) { + this.httpFetcherConfig = httpFetcherConfig; + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java new file mode 100644 index 0000000000..1988529f62 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java @@ -0,0 +1,255 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.http.config; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class HttpFetcherConfig extends AbstractConfig { + private String userName; + private String password; + private String ntDomain; + private String authScheme; + private String proxyHost; + private Integer proxyPort; + private Integer maxConnectionsPerRoute = 1000; + private Integer maxConnections = 2000; + private Integer requestTimeout = 120000; + private Integer connectTimeout = 120000; + private Integer socketTimeout = 120000; + private Long maxSpoolSize = -1L; + private Integer maxRedirects = 0; + private List httpHeaders = new ArrayList<>(); + private List httpRequestHeaders = new ArrayList<>(); + private Long overallTimeout = 120000L; + private Integer maxErrMsgSize = 10000000; + private String userAgent; + private String jwtIssuer; + private String jwtSubject; + private int jwtExpiresInSeconds; + private String jwtSecret; + private String jwtPrivateKeyBase64; + + + public String getUserName() { + return userName; + } + + public HttpFetcherConfig setUserName(String userName) { + 
this.userName = userName; + return this; + } + + public String getPassword() { + return password; + } + + public HttpFetcherConfig setPassword(String password) { + this.password = password; + return this; + } + + public String getNtDomain() { + return ntDomain; + } + + public HttpFetcherConfig setNtDomain(String ntDomain) { + this.ntDomain = ntDomain; + return this; + } + + public String getAuthScheme() { + return authScheme; + } + + public HttpFetcherConfig setAuthScheme(String authScheme) { + this.authScheme = authScheme; + return this; + } + + public String getProxyHost() { + return proxyHost; + } + + public HttpFetcherConfig setProxyHost(String proxyHost) { + this.proxyHost = proxyHost; + return this; + } + + public Integer getProxyPort() { + return proxyPort; + } + + public HttpFetcherConfig setProxyPort(Integer proxyPort) { + this.proxyPort = proxyPort; + return this; + } + + public Integer getConnectTimeout() { + return connectTimeout; + } + + public HttpFetcherConfig setConnectTimeout(Integer connectTimeout) { + this.connectTimeout = connectTimeout; + return this; + } + + public Integer getRequestTimeout() { + return requestTimeout; + } + + public HttpFetcherConfig setRequestTimeout(Integer requestTimeout) { + this.requestTimeout = requestTimeout; + return this; + } + + public Integer getSocketTimeout() { + return socketTimeout; + } + + public HttpFetcherConfig setSocketTimeout(Integer socketTimeout) { + this.socketTimeout = socketTimeout; + return this; + } + + public Integer getMaxConnections() { + return maxConnections; + } + + public HttpFetcherConfig setMaxConnections(Integer maxConnections) { + this.maxConnections = maxConnections; + return this; + } + + public Integer getMaxConnectionsPerRoute() { + return maxConnectionsPerRoute; + } + + public HttpFetcherConfig setMaxConnectionsPerRoute(Integer maxConnectionsPerRoute) { + this.maxConnectionsPerRoute = maxConnectionsPerRoute; + return this; + } + + public Long getMaxSpoolSize() { + return 
maxSpoolSize; + } + + public HttpFetcherConfig setMaxSpoolSize(Long maxSpoolSize) { + this.maxSpoolSize = maxSpoolSize; + return this; + } + + public Integer getMaxRedirects() { + return maxRedirects; + } + + public HttpFetcherConfig setMaxRedirects(Integer maxRedirects) { + this.maxRedirects = maxRedirects; + return this; + } + + public List getHttpHeaders() { + return httpHeaders; + } + + public HttpFetcherConfig setHttpHeaders(List httpHeaders) { + this.httpHeaders = httpHeaders; + return this; + } + + public List getHttpRequestHeaders() { + return httpRequestHeaders; + } + + public void setHttpRequestHeaders(List httpRequestHeaders) { + this.httpRequestHeaders = httpRequestHeaders; + } + + public Long getOverallTimeout() { + return overallTimeout; + } + + public HttpFetcherConfig setOverallTimeout(Long overallTimeout) { + this.overallTimeout = overallTimeout; + return this; + } + + public Integer getMaxErrMsgSize() { + return maxErrMsgSize; + } + + public HttpFetcherConfig setMaxErrMsgSize(Integer maxErrMsgSize) { + this.maxErrMsgSize = maxErrMsgSize; + return this; + } + + public String getUserAgent() { + return userAgent; + } + + public HttpFetcherConfig setUserAgent(String userAgent) { + this.userAgent = userAgent; + return this; + } + + public String getJwtIssuer() { + return jwtIssuer; + } + + public HttpFetcherConfig setJwtIssuer(String jwtIssuer) { + this.jwtIssuer = jwtIssuer; + return this; + } + + public String getJwtSubject() { + return jwtSubject; + } + + public HttpFetcherConfig setJwtSubject(String jwtSubject) { + this.jwtSubject = jwtSubject; + return this; + } + + public int getJwtExpiresInSeconds() { + return jwtExpiresInSeconds; + } + + public HttpFetcherConfig setJwtExpiresInSeconds(int jwtExpiresInSeconds) { + this.jwtExpiresInSeconds = jwtExpiresInSeconds; + return this; + } + + public String getJwtSecret() { + return jwtSecret; + } + + public HttpFetcherConfig setJwtSecret(String jwtSecret) { + this.jwtSecret = jwtSecret; + return this; + 
} + + public String getJwtPrivateKeyBase64() { + return jwtPrivateKeyBase64; + } + + public HttpFetcherConfig setJwtPrivateKeyBase64(String jwtPrivateKeyBase64) { + this.jwtPrivateKeyBase64 = jwtPrivateKeyBase64; + return this; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtCreds.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtCreds.java new file mode 100644 index 0000000000..ed783e7338 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtCreds.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetcher.http.jwt; + +public abstract class JwtCreds { + private final String issuer; + private final String subject; + private final int expiresInSeconds; + + public JwtCreds(String issuer, String subject, int expiresInSeconds) { + this.issuer = issuer; + this.subject = subject; + this.expiresInSeconds = expiresInSeconds; + } + + public String getIssuer() { + return issuer; + } + + public String getSubject() { + return subject; + } + + public int getExpiresInSeconds() { + return expiresInSeconds; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtGenerator.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtGenerator.java new file mode 100644 index 0000000000..c8e7bdf2b7 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtGenerator.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetcher.http.jwt; + +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.Date; + +import com.nimbusds.jose.JOSEException; +import com.nimbusds.jose.JWSAlgorithm; +import com.nimbusds.jose.JWSHeader; +import com.nimbusds.jose.JWSSigner; +import com.nimbusds.jose.crypto.MACSigner; +import com.nimbusds.jose.crypto.RSASSASigner; +import com.nimbusds.jwt.JWTClaimsSet; +import com.nimbusds.jwt.SignedJWT; + +public class JwtGenerator { + JwtCreds jwtCreds; + public JwtGenerator(JwtCreds jwtCreds) { + this.jwtCreds = jwtCreds; + } + + public String jwt() throws JOSEException { + if (jwtCreds instanceof JwtSecretCreds) { + return jwtHS256((JwtSecretCreds) jwtCreds); + } else { + return jwtRS256((JwtPrivateKeyCreds) jwtCreds); + } + } + + String jwtHS256(JwtSecretCreds jwtSecretCreds) + throws JOSEException { + JWSSigner signer = new MACSigner(jwtSecretCreds.getSecret()); + + JWTClaimsSet claimsSet = getJwtClaimsSet(jwtSecretCreds.getIssuer(), + jwtSecretCreds.getSubject(), jwtSecretCreds.getExpiresInSeconds()); + + SignedJWT signedJWT = new SignedJWT(new JWSHeader(JWSAlgorithm.HS256), claimsSet); + signedJWT.sign(signer); + + return signedJWT.serialize(); + } + + String jwtRS256(JwtPrivateKeyCreds jwtPrivateKeyCreds) + throws JOSEException { + JWSSigner signer = new RSASSASigner(jwtPrivateKeyCreds.getPrivateKey()); + + JWTClaimsSet claimsSet = getJwtClaimsSet(jwtPrivateKeyCreds.getIssuer(), + jwtPrivateKeyCreds.getSubject(), jwtPrivateKeyCreds.getExpiresInSeconds()); + + SignedJWT signedJWT = new SignedJWT(new JWSHeader(JWSAlgorithm.RS256), claimsSet); + + signedJWT.sign(signer); + + return signedJWT.serialize(); + } + + private JWTClaimsSet getJwtClaimsSet(String issuer, String subject, int expiresInSeconds) { + return new JWTClaimsSet.Builder() + .subject(subject) + .issuer(issuer) + .expirationTime(Date.from(Instant.now().plus(expiresInSeconds, ChronoUnit.SECONDS))) + .build(); + } +} diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtPrivateKeyCreds.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtPrivateKeyCreds.java new file mode 100644 index 0000000000..149e74f5a5 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtPrivateKeyCreds.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetcher.http.jwt; + +import java.security.KeyFactory; +import java.security.NoSuchAlgorithmException; +import java.security.PrivateKey; +import java.security.spec.InvalidKeySpecException; +import java.security.spec.PKCS8EncodedKeySpec; +import java.util.Base64; + +import org.apache.tika.exception.TikaConfigException; + +public class JwtPrivateKeyCreds extends JwtCreds { + private final PrivateKey privateKey; + public JwtPrivateKeyCreds(PrivateKey privateKey, String issuer, String subject, + int expiresInSeconds) { + super(issuer, subject, expiresInSeconds); + this.privateKey = privateKey; + } + + public PrivateKey getPrivateKey() { + return privateKey; + } + + public static String convertPrivateKeyToBase64(PrivateKey privateKey) { + // Get the encoded form of the private key + byte[] privateKeyEncoded = privateKey.getEncoded(); + // Encode the byte array using Base64 + return Base64.getEncoder().encodeToString(privateKeyEncoded); + } + + public static PrivateKey convertBase64ToPrivateKey(String privateKeyBase64) + throws TikaConfigException { + try { + byte[] privateKeyEncoded = Base64.getDecoder().decode(privateKeyBase64); + + KeyFactory keyFactory = KeyFactory.getInstance("RSA"); + PKCS8EncodedKeySpec keySpec = new PKCS8EncodedKeySpec(privateKeyEncoded); + return keyFactory.generatePrivate(keySpec); + } catch (NoSuchAlgorithmException | InvalidKeySpecException e) { + throw new TikaConfigException("Could not convert private key base64 to PrivateKey", e); + } + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtSecretCreds.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtSecretCreds.java new file mode 100644 index 0000000000..f159cce3a7 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/jwt/JwtSecretCreds.java @@ -0,0 +1,30 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.http.jwt; + +public class JwtSecretCreds extends JwtCreds { + private final byte[] secret; + public JwtSecretCreds(byte[] secret, String issuer, String subject, int expiresInSeconds) { + super(issuer, subject, expiresInSeconds); + this.secret = secret; + } + + public byte[] getSecret() { + return secret; + } + +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java index 1fca7811eb..2e9f58e091 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java @@ -30,6 +30,8 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.security.SecureRandom; +import java.util.ArrayList; import java.util.Collections; import java.util.zip.GZIPInputStream; @@ -57,22 +59,37 @@ import org.apache.tika.exception.TikaException; import 
org.apache.tika.io.TemporaryResources; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.FetcherManager; -import org.apache.tika.pipes.fetcher.http.config.AdditionalHttpHeaders; - -public class HttpFetcherTest extends TikaTest { +import org.apache.tika.pipes.fetcher.http.config.HttpFetcherConfig; +import org.apache.tika.pipes.fetcher.http.jwt.JwtGenerator; +class HttpFetcherTest extends TikaTest { private static final String TEST_URL = "wontbecalled"; private static final String CONTENT = "request content"; private HttpFetcher httpFetcher; + private HttpFetcherConfig httpFetcherConfig; + @BeforeEach public void before() throws Exception { - final HttpResponse mockResponse = buildMockResponse(HttpStatus.SC_OK, - IOUtils.toInputStream(CONTENT, Charset.defaultCharset())); + httpFetcherConfig = new HttpFetcherConfig(); + httpFetcherConfig.setHttpHeaders(new ArrayList<>()); + httpFetcherConfig.setUserAgent("Test app"); + httpFetcherConfig.setConnectTimeout(240_000); + httpFetcherConfig.setRequestTimeout(240_000); + httpFetcherConfig.setSocketTimeout(240_000); + httpFetcherConfig.setMaxConnections(500); + httpFetcherConfig.setMaxConnectionsPerRoute(20); + httpFetcherConfig.setMaxRedirects(-1); + httpFetcherConfig.setMaxErrMsgSize(500_000_000); + httpFetcherConfig.setOverallTimeout(400_000L); + httpFetcherConfig.setMaxSpoolSize(-1L); + + final HttpResponse mockResponse = buildMockResponse(HttpStatus.SC_OK, IOUtils.toInputStream(CONTENT, Charset.defaultCharset())); mockClientResponse(mockResponse); } @@ -108,36 +125,29 @@ public void test4xxResponse() throws Exception { } @Test - @Disabled("requires network connectivity") - public void testRedirect() throws Exception { - String url = "https://t.co/cvfkWAEIxw?amp=1"; - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - Metadata metadata = new 
Metadata(); - HttpFetcher httpFetcher = - (HttpFetcher) getFetcherManager("tika-config-http.xml").getFetcher("http"); - try (InputStream is = httpFetcher.fetch(url, metadata, new ParseContext())) { - IOUtils.copy(is, bos); - } - //debug(metadata); - } + public void testJwt() throws Exception { + byte[] randomBytes = new byte[32]; + new SecureRandom().nextBytes(randomBytes); - @Test - @Disabled("requires network connectivity") - public void testRange() throws Exception { - String url = - "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/segments/1603107869785.9/warc/CC-MAIN-20201020021700-20201020051700-00529.warc.gz"; - long start = 969596307; - long end = start + 1408 - 1; - Metadata metadata = new Metadata(); - HttpFetcher httpFetcher = - (HttpFetcher) getFetcherManager("tika-config-http.xml").getFetcher("http"); - try (TemporaryResources tmp = new TemporaryResources()) { - Path tmpPath = tmp.createTempFile(metadata); - try (InputStream is = httpFetcher.fetch(url, start, end, metadata)) { - Files.copy(new GZIPInputStream(is), tmpPath, StandardCopyOption.REPLACE_EXISTING); - } - assertEquals(2461, Files.size(tmpPath)); + httpFetcher.jwtGenerator = Mockito.mock(JwtGenerator.class); + + final Metadata meta = new Metadata(); + meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, "fileName"); + + try (final InputStream ignored = httpFetcher.fetch(TEST_URL, meta, new ParseContext())) { + // HTTP headers added into meta + assertEquals("200", meta.get("http-header:status-code")); + assertEquals(TEST_URL, meta.get("http-connection:target-url")); + // Content size included in meta + assertEquals("15", meta.get("Content-Length")); + + // Filename passed in should be preserved + assertEquals("fileName", meta.get(TikaCoreProperties.RESOURCE_NAME_KEY)); } + + Mockito + .verify(httpFetcher.jwtGenerator) + .jwt(); } @Test @@ -147,8 +157,7 @@ public void testHttpRequestHeaders() throws Exception { CloseableHttpResponse response = mock(CloseableHttpResponse.class); 
ArgumentCaptor httpGetArgumentCaptor = ArgumentCaptor.forClass(HttpGet.class); - when(httpClient.execute(httpGetArgumentCaptor.capture(), any(HttpContext.class))) - .thenReturn(response); + when(httpClient.execute(httpGetArgumentCaptor.capture(), any(HttpContext.class))).thenReturn(response); when(response.getStatusLine()).thenReturn(new StatusLine() { @Override public ProtocolVersion getProtocolVersion() { @@ -169,20 +178,49 @@ public String getReasonPhrase() { when(response.getEntity()).thenReturn(new StringEntity("Hi")); Metadata metadata = new Metadata(); - ParseContext parseContext = new ParseContext(); - AdditionalHttpHeaders additionalHttpHeaders = new AdditionalHttpHeaders(); - additionalHttpHeaders.getHeaders().put("nick1", "val1"); - additionalHttpHeaders.getHeaders().put("nick2", "val2"); - parseContext.set(AdditionalHttpHeaders.class, additionalHttpHeaders); - httpFetcher.fetch("http://localhost", metadata, parseContext); + metadata.set(Property.externalText("httpRequestHeaders"), new String[]{"nick1: val1", "nick2: val2"}); + httpFetcher.fetch("http://localhost", metadata, new ParseContext()); HttpGet httpGet = httpGetArgumentCaptor.getValue(); Assertions.assertEquals("val1", httpGet.getHeaders("nick1")[0].getValue()); Assertions.assertEquals("val2", httpGet.getHeaders("nick2")[0].getValue()); + // also make sure the headers from the fetcher config level are specified - see src/test/resources/tika-config-http.xml + Assertions.assertEquals("headerValueFromFetcherConfig", httpGet.getHeaders("headerNameFromFetcherConfig")[0].getValue()); + } + + @Test + @Disabled("requires network connectivity") + public void testRedirect() throws Exception { + String url = "https://t.co/cvfkWAEIxw?amp=1"; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + Metadata metadata = new Metadata(); + HttpFetcher httpFetcher = (HttpFetcher) getFetcherManager("tika-config-http.xml").getFetcher("http"); + try (InputStream is = httpFetcher.fetch(url, metadata, new 
ParseContext())) { + IOUtils.copy(is, bos); + } + //debug(metadata); + } + + @Test + @Disabled("requires network connectivity") + public void testRange() throws Exception { + String url = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/segments/1603107869785.9/warc/CC-MAIN-20201020021700-20201020051700-00529.warc.gz"; + long start = 969596307; + long end = start + 1408 - 1; + Metadata metadata = new Metadata(); + HttpFetcher httpFetcher = (HttpFetcher) getFetcherManager("tika-config-http.xml").getFetcher("http"); + try (TemporaryResources tmp = new TemporaryResources()) { + Path tmpPath = tmp.createTempFile(metadata); + try (InputStream is = httpFetcher.fetch(url, start, end, metadata)) { + Files.copy(new GZIPInputStream(is), tmpPath, StandardCopyOption.REPLACE_EXISTING); + } + assertEquals(2461, Files.size(tmpPath)); + } } FetcherManager getFetcherManager(String path) throws Exception { - return FetcherManager.load( - Paths.get(HttpFetcherTest.class.getResource("/" + path).toURI())); + return FetcherManager.load(Paths.get(HttpFetcherTest.class + .getResource("/" + path) + .toURI())); } private void mockClientResponse(final HttpResponse response) throws Exception { @@ -191,17 +229,16 @@ private void mockClientResponse(final HttpResponse response) throws Exception { final HttpClient httpClient = mock(HttpClient.class); final HttpClientFactory clientFactory = mock(HttpClientFactory.class); - when(httpClient.execute( - any(HttpUriRequest.class), any(HttpContext.class))).thenReturn(response); + when(httpClient.execute(any(HttpUriRequest.class), any(HttpContext.class))).thenReturn(response); when(clientFactory.build()).thenReturn(httpClient); when(clientFactory.copy()).thenReturn(clientFactory); httpFetcher.setHttpClientFactory(clientFactory); + httpFetcher.setHttpFetcherConfig(httpFetcherConfig); httpFetcher.initialize(Collections.emptyMap()); } - private static HttpResponse buildMockResponse(final int statusCode, final InputStream is) - throws 
IOException { + private static HttpResponse buildMockResponse(final int statusCode, final InputStream is) throws IOException { final HttpResponse response = mock(HttpResponse.class); final StatusLine status = mock(StatusLine.class); final HttpEntity entity = mock(HttpEntity.class); diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/jwt/JwtGeneratorTest.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/jwt/JwtGeneratorTest.java new file mode 100644 index 0000000000..62aa082adb --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/jwt/JwtGeneratorTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetcher.http.jwt; + +import java.security.KeyPair; +import java.security.KeyPairGenerator; +import java.security.SecureRandom; +import java.security.interfaces.RSAPublicKey; + +import com.nimbusds.jose.JWSVerifier; +import com.nimbusds.jose.crypto.MACVerifier; +import com.nimbusds.jose.crypto.RSASSAVerifier; +import com.nimbusds.jwt.SignedJWT; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class JwtGeneratorTest { + @Test + void jwtSecret() throws Exception { + byte[] randomBytes = new byte[32]; + new SecureRandom().nextBytes(randomBytes); + String jwt = new JwtGenerator(new JwtSecretCreds(randomBytes, "nick", "subject", + 120)).jwt(); + SignedJWT signedJWT = SignedJWT.parse(jwt); + JWSVerifier verifier = new MACVerifier(randomBytes); + Assertions.assertTrue(signedJWT.verify(verifier)); + } + + @Test + void jwtPrivateKey() throws Exception { + KeyPairGenerator keyPairGenerator = KeyPairGenerator.getInstance("RSA"); + keyPairGenerator.initialize(2048); + byte[] randomBytes = new byte[32]; + new SecureRandom().nextBytes(randomBytes); + KeyPair keyPair = keyPairGenerator.generateKeyPair(); + String jwt = new JwtGenerator(new JwtPrivateKeyCreds(keyPair.getPrivate(), "nick", + "subject", 120)).jwt(); + JWSVerifier verifier = new RSASSAVerifier((RSAPublicKey) keyPair.getPublic()); + SignedJWT signedJWT = SignedJWT.parse(jwt); + Assertions.assertTrue(signedJWT.verify(verifier)); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml index bd77de4bac..5def8f5dc4 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml @@ -24,6 +24,9 @@

Expires
Content-Length
+ +
headerNameFromFetcherConfig: headerValueFromFetcherConfig
+
- \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml new file mode 100644 index 0000000000..99d8e8f575 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml @@ -0,0 +1,157 @@ + + + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + 4.0.0 + + tika-fetcher-microsoft-graph + Microsoft Graph Tika Pipes Fetcher + + + 11 + 11 + UTF-8 + 1.11.0 + 6.4.0 + 1.1.1 + 5.11.0-M2 + 3.3.1 + 5.3.1 + 9.37.3 + + + + + com.azure + azure-identity + ${azure-identity.version} + + + ${project.groupId} + tika-core + ${project.version} + + + com.microsoft.graph + microsoft-graph + ${microsoft-graph.version} + + + org.junit.jupiter + junit-jupiter-engine + ${junit-jupiter-engine.version} + test + + + org.mockito + mockito-core + test + + + org.mockito + mockito-junit-jupiter + ${mockito-junit-jupiter.version} + test + + + com.nimbusds + nimbus-jose-jwt + ${nimbus-jose-jwt.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + org.apache.tika.pipes.fetchers.microsoftgraph + + + + + + + test-jar + + + + + + maven-shade-plugin + ${maven.shade.version} + + + package + + shade + + + + false + + + + + *:* + + META-INF/* + LICENSE.txt + NOTICE.txt + + + + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + META-INF/NOTICE + target/classes/META-INF/NOTICE + + + META-INF/DEPENDENCIES + target/classes/META-INF/DEPENDENCIES + + + + + + + + + + + + 3.0.0-BETA-rc1 + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java new file mode 100644 index 0000000000..b9f6ceafa8 --- /dev/null +++ 
b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetchers.microsoftgraph; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Base64; +import java.util.List; +import java.util.Map; + +import com.azure.identity.ClientCertificateCredentialBuilder; +import com.azure.identity.ClientSecretCredentialBuilder; +import com.microsoft.graph.serviceclient.GraphServiceClient; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import 
org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetchers.microsoftgraph.config.MicrosoftGraphFetcherConfig; + +/** + * Fetches files from Microsoft Graph API. + * Fetch keys are ${siteDriveId},${driveItemId} + */ +public class MicrosoftGraphFetcher extends AbstractFetcher implements Initializable { + private static final Logger LOGGER = LoggerFactory.getLogger(MicrosoftGraphFetcher.class); + private GraphServiceClient graphClient; + private MicrosoftGraphFetcherConfig config = new MicrosoftGraphFetcherConfig(); + private long[] throttleSeconds; + private boolean spoolToTemp; + + + public MicrosoftGraphFetcher() { + + } + + public MicrosoftGraphFetcher(MicrosoftGraphFetcherConfig config) { + this.config = config; + } + + /** + * Set seconds to throttle retries as a comma-delimited list, e.g.: 30,60,120,600 + * + * @param commaDelimitedLongs + * @throws TikaConfigException + */ + @Field + public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException { + String[] longStrings = (commaDelimitedLongs == null ? 
"" : commaDelimitedLongs).split(","); + long[] seconds = new long[longStrings.length]; + for (int i = 0; i < longStrings.length; i++) { + try { + seconds[i] = Long.parseLong(longStrings[i]); + } catch (NumberFormatException e) { + throw new TikaConfigException(e.getMessage()); + } + } + setThrottleSeconds(seconds); + } + + public void setThrottleSeconds(long[] throttleSeconds) { + this.throttleSeconds = throttleSeconds; + } + + @Field + public void setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + } + + @Field + public void setTenantId(String tenantId) { + config.setTenantId(tenantId); + } + + @Field + public void setClientId(String clientId) { + config.setClientId(clientId); + } + + @Field + public void setClientSecret(String clientSecret) { + config.setClientSecret(clientSecret); + } + + @Field + public void setCertificateBytesBase64(String certificateBytesBase64) { + config.setCertificateBytesBase64(certificateBytesBase64); + } + + @Field + public void setCertificatePassword(String certificatePassword) { + config.setCertificatePassword(certificatePassword); + } + + @Field + public void setScopes(List scopes) { + config.setScopes(new ArrayList<>(scopes)); + if (config.getScopes().isEmpty()) { + config.getScopes().add("https://graph.microsoft.com/.default"); + } + } + + @Override + public void initialize(Map map) { + String[] scopes = config + .getScopes() + .toArray(new String[0]); + if (config.getCertificateBytesBase64() != null) { + graphClient = new GraphServiceClient(new ClientCertificateCredentialBuilder() + .clientId(config.getClientId()) + .tenantId(config.getTenantId()) + .pfxCertificate(new ByteArrayInputStream(Base64.getDecoder().decode(config.getCertificateBytesBase64()))) + .clientCertificatePassword(config.getCertificatePassword()) + .build(), scopes); + } else if (config.getClientSecret() != null) { + graphClient = new GraphServiceClient(new ClientSecretCredentialBuilder() + .tenantId(config.getTenantId()) + 
.clientId(config.getClientId()) + .clientSecret(config.getClientSecret()) + .build(), scopes); + } + } + + @Override + public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException { + } + + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException { + int tries = 0; + Exception ex; + do { + long start = System.currentTimeMillis(); + try { + String[] fetchKeySplit = fetchKey.split(","); + String siteDriveId = fetchKeySplit[0]; + String driveItemId = fetchKeySplit[1]; + InputStream is = graphClient + .drives() + .byDriveId(siteDriveId) + .items() + .byDriveItemId(driveItemId) + .content() + .get(); + + if (is == null) { + throw new IOException("Empty input stream when we tried to parse " + fetchKey); + } + if (spoolToTemp) { + File tempFile = Files + .createTempFile("spooled-temp", ".dat") + .toFile(); + FileUtils.copyInputStreamToFile(is, tempFile); + LOGGER.info("Spooled to temp file {}", tempFile); + return TikaInputStream.get(tempFile.toPath()); + } + return TikaInputStream.get(is); + } catch (Exception e) { + LOGGER.warn("Exception fetching on retry=" + tries, e); + ex = e; + } finally { + long elapsed = System.currentTimeMillis() - start; + LOGGER.debug("Total to fetch {}", elapsed); + } + LOGGER.warn("Sleeping for {} seconds before retry", throttleSeconds[tries]); + try { + Thread.sleep(throttleSeconds[tries]); + } catch (InterruptedException e) { + Thread + .currentThread() + .interrupt(); + } + } while (++tries < throttleSeconds.length); + throw new TikaException("Could not parse " + fetchKey, ex); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java new 
file mode 100644 index 0000000000..f9bc5b1b84 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetchers.microsoftgraph.config; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class MicrosoftGraphFetcherConfig extends AbstractConfig { + private long[] throttleSeconds; + private boolean spoolToTemp; + protected String tenantId; + protected String clientId; + private String clientSecret; + private String certificateBytesBase64; + private String certificatePassword; + private List scopes = new ArrayList<>(); + + public boolean isSpoolToTemp() { + return spoolToTemp; + } + + public MicrosoftGraphFetcherConfig setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + return this; + } + + public long[] getThrottleSeconds() { + return throttleSeconds; + } + + public MicrosoftGraphFetcherConfig setThrottleSeconds(long[] throttleSeconds) { + this.throttleSeconds = throttleSeconds; + return this; + } + + public String getTenantId() { + return tenantId; + } + + public MicrosoftGraphFetcherConfig setTenantId(String tenantId) { + this.tenantId = tenantId; + return this; + } + + public String getClientId() { + return clientId; + } + + public MicrosoftGraphFetcherConfig setClientId(String clientId) { + this.clientId = clientId; + return this; + } + + public String getClientSecret() { + return clientSecret; + } + + public MicrosoftGraphFetcherConfig setClientSecret(String clientSecret) { + this.clientSecret = clientSecret; + return this; + } + + public String getCertificateBytesBase64() { + return certificateBytesBase64; + } + + public void setCertificateBytesBase64(String certificateBytesBase64) { + this.certificateBytesBase64 = certificateBytesBase64; + } + + public String getCertificatePassword() { + return certificatePassword; + } + + public MicrosoftGraphFetcherConfig setCertificatePassword(String certificatePassword) { + this.certificatePassword = certificatePassword; + return this; + } + + public List getScopes() { + return scopes; + } + 
+ public MicrosoftGraphFetcherConfig setScopes(List scopes) { + this.scopes = scopes; + return this; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/resources/log4j2.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..c88e66e99e --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/resources/log4j2.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java index 7283c97273..144f15c016 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java @@ -58,6 +58,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.AbstractFetcher; import org.apache.tika.pipes.fetcher.RangeFetcher; +import org.apache.tika.pipes.fetcher.s3.config.S3FetcherConfig; import org.apache.tika.utils.StringUtils; /** @@ -66,6 +67,29 @@ * initialization, and the fetch key is "path/to/my_file.pdf". 
    /**
     * Zero-arg constructor used when the fetcher is configured via
     * {@code @Field} setters from tika-config XML.
     */
    public S3Fetcher() {

    }

    /**
     * Builds an S3Fetcher from a pre-populated configuration object by applying
     * every setting through the corresponding {@code @Field} setter.
     * <p>
     * NOTE(review): the setter bodies are not visible here — assumes the calls
     * are order-independent; confirm before reordering.
     *
     * @param s3FetcherConfig the configuration to copy settings from
     */
    public S3Fetcher(S3FetcherConfig s3FetcherConfig) {
        // object location and credential inputs
        setBucket(s3FetcherConfig.getBucket());
        setRegion(s3FetcherConfig.getRegion());
        setProfile(s3FetcherConfig.getProfile());
        setAccessKey(s3FetcherConfig.getAccessKey());
        setSecretKey(s3FetcherConfig.getSecretKey());
        setPrefix(s3FetcherConfig.getPrefix());

        // credential-provider strategy and endpoint override
        setCredentialsProvider(s3FetcherConfig.getCredentialsProvider());
        setEndpointConfigurationService(s3FetcherConfig.getEndpointConfigurationService());

        // connection pool, spooling, retry throttling, and size limit
        setMaxConnections(s3FetcherConfig.getMaxConnections());
        setSpoolToTemp(s3FetcherConfig.isSpoolToTemp());
        setThrottleSeconds(s3FetcherConfig.getThrottleSeconds());
        setMaxLength(s3FetcherConfig.getMaxLength());

        // metadata extraction and addressing style
        setExtractUserMetadata(s3FetcherConfig.isExtractUserMetadata());
        setPathStyleAccessEnabled(s3FetcherConfig.isPathStyleAccessEnabled());
    }
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.s3.config; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class S3FetcherConfig extends AbstractConfig { + private boolean spoolToTemp; + private String region; + private String profile; + private String bucket; + private String commaDelimitedLongs; + private String prefix; + private boolean extractUserMetadata; + private int maxConnections; + private String credentialsProvider; + private long maxLength; + private String accessKey; + private String secretKey; + private String endpointConfigurationService; + private boolean pathStyleAccessEnabled; + private long[] throttleSeconds; + + public boolean isSpoolToTemp() { + return spoolToTemp; + } + + public S3FetcherConfig setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + return this; + } + + public String getRegion() { + return region; + } + + public S3FetcherConfig setRegion(String region) { + this.region = region; + return this; + } + + public String getProfile() { + return profile; + } + + public S3FetcherConfig setProfile(String profile) { + this.profile = profile; + return this; + } + + public String getBucket() { + return bucket; + } + + public S3FetcherConfig setBucket(String bucket) { + this.bucket = bucket; + return this; + } + + public String getCommaDelimitedLongs() { + return commaDelimitedLongs; + } + + public S3FetcherConfig setCommaDelimitedLongs(String commaDelimitedLongs) { + this.commaDelimitedLongs = commaDelimitedLongs; + return this; + } + + public String getPrefix() { + return prefix; + } + + 
public S3FetcherConfig setPrefix(String prefix) { + this.prefix = prefix; + return this; + } + + public boolean isExtractUserMetadata() { + return extractUserMetadata; + } + + public S3FetcherConfig setExtractUserMetadata(boolean extractUserMetadata) { + this.extractUserMetadata = extractUserMetadata; + return this; + } + + public int getMaxConnections() { + return maxConnections; + } + + public S3FetcherConfig setMaxConnections(int maxConnections) { + this.maxConnections = maxConnections; + return this; + } + + public String getCredentialsProvider() { + return credentialsProvider; + } + + public S3FetcherConfig setCredentialsProvider(String credentialsProvider) { + this.credentialsProvider = credentialsProvider; + return this; + } + + public long getMaxLength() { + return maxLength; + } + + public S3FetcherConfig setMaxLength(long maxLength) { + this.maxLength = maxLength; + return this; + } + + public String getAccessKey() { + return accessKey; + } + + public S3FetcherConfig setAccessKey(String accessKey) { + this.accessKey = accessKey; + return this; + } + + public String getSecretKey() { + return secretKey; + } + + public S3FetcherConfig setSecretKey(String secretKey) { + this.secretKey = secretKey; + return this; + } + + public String getEndpointConfigurationService() { + return endpointConfigurationService; + } + + public S3FetcherConfig setEndpointConfigurationService(String endpointConfigurationService) { + this.endpointConfigurationService = endpointConfigurationService; + return this; + } + + public boolean isPathStyleAccessEnabled() { + return pathStyleAccessEnabled; + } + + public S3FetcherConfig setPathStyleAccessEnabled(boolean pathStyleAccessEnabled) { + this.pathStyleAccessEnabled = pathStyleAccessEnabled; + return this; + } + + public long[] getThrottleSeconds() { + return throttleSeconds; + } + + public S3FetcherConfig setThrottleSeconds(long[] throttleSeconds) { + this.throttleSeconds = throttleSeconds; + return this; + } +} diff --git 
a/tika-pipes/tika-grpc/README.md b/tika-pipes/tika-grpc/README.md new file mode 100644 index 0000000000..f2269b22bf --- /dev/null +++ b/tika-pipes/tika-grpc/README.md @@ -0,0 +1,13 @@ +# Tika Pipes GRPC Server + +The following is the Tika Pipes GRPC Server. + +This server will manage a pool of Tika Pipes clients. + +* Tika Pipes Fetcher CRUD operations + * Create + * Read + * Update + * Delete +* Fetch + Parse a given Fetch Item + diff --git a/tika-pipes/tika-grpc/example-dockerfile/Dockerfile b/tika-pipes/tika-grpc/example-dockerfile/Dockerfile new file mode 100644 index 0000000000..dca5866a3d --- /dev/null +++ b/tika-pipes/tika-grpc/example-dockerfile/Dockerfile @@ -0,0 +1,29 @@ +FROM ubuntu:latest +COPY libs/ /tika/libs/ +COPY tika-config.xml /tika/config/tika-config.xml +COPY log4j2.xml /tika/config/log4j2.xml +ARG JRE='openjdk-17-jre-headless' +RUN set -eux \ + && apt-get update \ + && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \ + gdal-bin \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-ita \ + tesseract-ocr-fra \ + tesseract-ocr-spa \ + tesseract-ocr-deu \ + && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + xfonts-utils \ + fonts-freefont-ttf \ + fonts-liberation \ + ttf-mscorefonts-installer \ + wget \ + cabextract \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +EXPOSE 50051 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -Dlog4j.configurationFile=/tika/config/log4j2.xml -cp \"/tika/libs/*\" org.apache.tika.pipes.grpc.TikaGrpcServer --port 50051 --tika-config /tika/config/tika-config.xml"] diff --git a/tika-pipes/tika-grpc/example-dockerfile/docker-build.sh b/tika-pipes/tika-grpc/example-dockerfile/docker-build.sh new file mode 100644 index 
#!/bin/bash
# Builds and pushes a multi-arch Docker image for the Tika Pipes gRPC server.
# Usage: docker-build.sh <tag-name>

TAG_NAME=$1

if [ -z "${TAG_NAME}" ]; then
  echo "Single command line argument is required which will be used as the -t parameter of the docker build command"
  exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
TIKA_SRC_PATH=${SCRIPT_DIR}/../../..
OUT_DIR=${TIKA_SRC_PATH}/tika-pipes/tika-grpc/target/tika-docker

# Build the project and gather runtime dependencies.
# Fix: -Dossindex.skip was passed twice; the duplicate is removed.
mvn clean install -Dossindex.skip -DskipTests=true -Denforcer.skip=true -f "${TIKA_SRC_PATH}" || exit
mvn dependency:copy-dependencies -f "${TIKA_SRC_PATH}/tika-pipes/tika-grpc" || exit
rm -rf "${OUT_DIR}"
mkdir -p "${OUT_DIR}"

project_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout -f "${TIKA_SRC_PATH}")

# Stage the grpc server, its dependency jars, and all fetcher jars.
cp -r "${TIKA_SRC_PATH}/tika-pipes/tika-grpc/target/dependency" "${OUT_DIR}/libs"
cp -r "${TIKA_SRC_PATH}/tika-pipes/tika-fetchers/tika-fetcher-gcs/target/tika-fetcher-gcs-${project_version}.jar" "${OUT_DIR}/libs"
cp -r "${TIKA_SRC_PATH}/tika-pipes/tika-fetchers/tika-fetcher-az-blob/target/tika-fetcher-az-blob-${project_version}.jar" "${OUT_DIR}/libs"
cp -r "${TIKA_SRC_PATH}/tika-pipes/tika-fetchers/tika-fetcher-http/target/tika-fetcher-http-${project_version}.jar" "${OUT_DIR}/libs"
cp -r "${TIKA_SRC_PATH}/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/target/tika-fetcher-microsoft-graph-${project_version}.jar" "${OUT_DIR}/libs"
cp -r "${TIKA_SRC_PATH}/tika-pipes/tika-fetchers/tika-fetcher-s3/target/tika-fetcher-s3-${project_version}.jar" "${OUT_DIR}/libs"

cp "${TIKA_SRC_PATH}/tika-pipes/tika-grpc/target/tika-grpc-${project_version}.jar" "${OUT_DIR}/libs"
cp "${TIKA_SRC_PATH}/tika-pipes/tika-grpc/src/test/resources/log4j2.xml" "${OUT_DIR}"
cp "${TIKA_SRC_PATH}/tika-pipes/tika-grpc/src/test/resources/tika-pipes-test-config.xml" "${OUT_DIR}/tika-config.xml"
cp "${TIKA_SRC_PATH}/tika-pipes/tika-grpc/example-dockerfile/Dockerfile" "${OUT_DIR}/Dockerfile"

cd "${OUT_DIR}" || exit

# build single arch
#docker build "${OUT_DIR}" -t "${TAG_NAME}"

# Or we can build multi-arch - https://www.docker.com/blog/multi-arch-images/
docker buildx create --name tikabuilder
# see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
docker run --rm --privileged tonistiigi/binfmt --install amd64
docker run --rm --privileged tonistiigi/binfmt --install arm64
docker buildx build --builder=tikabuilder "${OUT_DIR}" -t "${TAG_NAME}" --platform linux/amd64,linux/arm64 --push
docker buildx stop tikabuilder
com.google.code.gson + gson + ${gson.version} + + + com.google.j2objc + j2objc-annotations + ${j2objc-annotations.version} + + + com.google.errorprone + error_prone_annotations + ${error_prone_annotations.version} + + + com.google.guava + guava + + + + + org.apache.tika + tika-async-cli + ${project.version} + + + org.apache.tika + tika-parsers-standard-package + ${project.version} + + + org.apache.tika + tika-core + ${project.version} + + + com.google.guava + guava + ${guava.version} + + + com.google.errorprone + error_prone_annotations + + + com.google.j2objc + j2objc-annotations + + + + + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + + + org.slf4j + jcl-over-slf4j + + + org.apache.tomcat + annotations-api + 6.0.53 + provided + + + com.beust + jcommander + + + org.mockito + mockito-core + 3.4.0 + test + + + io.grpc + grpc-testing + test + + + com.google.guava + guava + + + + + org.apache.tika + tika-fetcher-http + ${project.version} + + + org.apache.tika + tika-fetcher-microsoft-graph + ${project.version} + + + org.apache.tika + tika-fetcher-google + ${project.version} + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson-module-jsonSchema.version} + + + com.asarkar.grpc + grpc-test + ${asarkar-grpc-test.version} + test + + + org.eclipse.jetty + jetty-server + test + + + org.awaitility + awaitility + ${awaitility.version} + test + + + + + + kr.motd.maven + os-maven-plugin + 1.7.1 + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + 0.6.1 + + com.google.protobuf:protoc:${protoc.version}:exe:${os.detected.classifier} + grpc-java + io.grpc:protoc-gen-grpc-java:${grpc.version}:exe:${os.detected.classifier} + + + + + compile + compile-custom + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + 1.4.1 + + + enforce + + enforce + + + + + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.5.0 + + + test + generate-sources + + add-source + + + + 
${basedir}/target/generated-sources/protobuf/grpc-java + ${basedir}/target/generated-sources/protobuf/java + + + + + + + org.codehaus.mojo + exec-maven-plugin + 3.2.0 + + + + java + + + + + org.apache.tika.pipes.grpc.TikaGrpcServer + + + + + diff --git a/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java new file mode 100644 index 0000000000..d21f11b08f --- /dev/null +++ b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.grpc; + +import java.time.Instant; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class ExpiringFetcherStore implements AutoCloseable { + private static final Logger LOG = LoggerFactory.getLogger(ExpiringFetcherStore.class); + public static final long EXPIRE_JOB_INITIAL_DELAY = 1L; + private final Map fetchers = Collections.synchronizedMap(new HashMap<>()); + private final Map fetcherConfigs = Collections.synchronizedMap(new HashMap<>()); + private final Map fetcherLastAccessed = Collections.synchronizedMap(new HashMap<>()); + + private final ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor(); + + public ExpiringFetcherStore(int expireAfterSeconds, int checkForExpiredFetchersDelaySeconds) { + executorService.scheduleAtFixedRate(() -> { + Set expired = new HashSet<>(); + for (String fetcherName : fetchers.keySet()) { + Instant lastAccessed = fetcherLastAccessed.get(fetcherName); + if (lastAccessed == null) { + LOG.error("Detected a fetcher with no last access time. FetcherName={}", fetcherName); + expired.add(fetcherName); + } else if (Instant + .now() + .isAfter(lastAccessed.plusSeconds(expireAfterSeconds))) { + LOG.info("Detected stale fetcher {} hasn't been accessed in {} seconds. 
" + "Deleting.", fetcherName, Instant + .now() + .getEpochSecond() - lastAccessed.getEpochSecond()); + expired.add(fetcherName); + } + } + for (String expiredFetcherId : expired) { + deleteFetcher(expiredFetcherId); + } + }, EXPIRE_JOB_INITIAL_DELAY, checkForExpiredFetchersDelaySeconds, TimeUnit.SECONDS); + } + + public boolean deleteFetcher(String fetcherName) { + boolean success = fetchers.remove(fetcherName) != null; + fetcherConfigs.remove(fetcherName); + fetcherLastAccessed.remove(fetcherName); + return success; + } + + public Map getFetchers() { + return fetchers; + } + + public Map getFetcherConfigs() { + return fetcherConfigs; + } + + /** + * This method will get the fetcher, but will also log the access the fetcher as having + * been accessed. This prevents the scheduled job from removing the stale fetcher. + */ + public T getFetcherAndLogAccess(String fetcherName) { + fetcherLastAccessed.put(fetcherName, Instant.now()); + return (T) fetchers.get(fetcherName); + } + + public void createFetcher(T fetcher, C config) { + fetchers.put(fetcher.getName(), fetcher); + fetcherConfigs.put(fetcher.getName(), config); + getFetcherAndLogAccess(fetcher.getName()); + } + + @Override + public void close() { + executorService.shutdownNow(); + } +} diff --git a/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java new file mode 100644 index 0000000000..2cd4da941c --- /dev/null +++ b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.grpc; + +import static io.grpc.health.v1.HealthCheckResponse.ServingStatus; + +import java.io.File; +import java.util.concurrent.TimeUnit; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import io.grpc.Grpc; +import io.grpc.InsecureServerCredentials; +import io.grpc.Server; +import io.grpc.ServerCredentials; +import io.grpc.TlsServerCredentials; +import io.grpc.protobuf.services.HealthStatusManager; +import io.grpc.protobuf.services.ProtoReflectionService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Server that manages startup/shutdown of the GRPC Tika server. + */ +public class TikaGrpcServer { + private static final Logger LOGGER = LoggerFactory.getLogger(TikaGrpcServer.class); + private Server server; + @Parameter(names = {"-p", "--port"}, description = "The grpc server port", help = true, required = true) + private Integer port; + + @Parameter(names = {"-t", "--tika-config"}, description = "The grpc server port", help = true, required = true) + private File tikaConfigXml; + + @Parameter(names = {"-s", "--secure"}, description = "Enable credentials required to access this grpc server") + private boolean secure; + + @Parameter(names = {"--cert-chain"}, description = "Certificate chain file. Example: server1.pem See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls") + private File certChain; + + @Parameter(names = {"--private-key"}, description = "Private key store. 
Example: server1.key See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls") + private File privateKey; + + @Parameter(names = {"--private-key-password"}, description = "Private key password, if needed") + private String privateKeyPassword; + + @Parameter(names = {"--trust-cert-collection"}, description = "The trust certificate collection (root certs). Example: ca.pem See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls") + private File trustCertCollection; + + @Parameter(names = {"--client-auth-required"}, description = "Is Mutual TLS required?") + private boolean clientAuthRequired; + + @Parameter(names = {"-h", "-H", "--help"}, description = "Display help menu") + private boolean help; + + public void start() throws Exception { + HealthStatusManager healthStatusManager = new HealthStatusManager(); + ServerCredentials creds; + if (secure) { + TlsServerCredentials.Builder channelCredBuilder = TlsServerCredentials.newBuilder(); + channelCredBuilder.keyManager(certChain, privateKey, privateKeyPassword); + if (trustCertCollection != null && trustCertCollection.exists()) { + channelCredBuilder.trustManager(trustCertCollection); + if (clientAuthRequired) { + channelCredBuilder.clientAuth(TlsServerCredentials.ClientAuth.REQUIRE); + } + } + creds = channelCredBuilder.build(); + } else { + creds = InsecureServerCredentials.create(); + } + File tikaConfigFile = new File(tikaConfigXml.getAbsolutePath()); + healthStatusManager.setStatus(TikaGrpcServer.class.getSimpleName(), ServingStatus.SERVING); + server = Grpc + .newServerBuilderForPort(port, creds) + .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath())) + .addService(healthStatusManager.getHealthService()) + .addService(ProtoReflectionService.newInstance()) // Enable reflection + .build() + .start(); + LOGGER.info("Server started, listening on " + port); + Runtime + .getRuntime() + .addShutdownHook(new 
Thread(() -> { + // Use stderr here since the logger may have been reset by its JVM shutdown hook. + System.err.println("*** shutting down gRPC server since JVM is shutting down"); + healthStatusManager.clearStatus(TikaGrpcServer.class.getSimpleName()); + try { + TikaGrpcServer.this.stop(); + } catch (InterruptedException e) { + e.printStackTrace(System.err); + } + System.err.println("*** server shut down"); + })); + } + + public void stop() throws InterruptedException { + if (server != null) { + server + .shutdown() + .awaitTermination(30, TimeUnit.SECONDS); + } + } + + /** + * Await termination on the main thread since the grpc library uses daemon threads. + */ + public void blockUntilShutdown() throws InterruptedException { + if (server != null) { + server.awaitTermination(); + } + } + + /** + * Main launches the server from the command line. + */ + public static void main(String[] args) throws Exception { + TikaGrpcServer server = new TikaGrpcServer(); + JCommander commander = JCommander + .newBuilder() + .addObject(server) + .build(); + + commander.parse(args); + + if (server.help) { + commander.usage(); + return; + } + + server.start(); + server.blockUntilShutdown(); + } + + public TikaGrpcServer setTikaConfigXml(File tikaConfigXml) { + this.tikaConfigXml = tikaConfigXml; + return this; + } + + public TikaGrpcServer setServer(Server server) { + this.server = server; + return this; + } + + public TikaGrpcServer setPort(Integer port) { + this.port = port; + return this; + } + + public TikaGrpcServer setSecure(boolean secure) { + this.secure = secure; + return this; + } + + public TikaGrpcServer setCertChain(File certChain) { + this.certChain = certChain; + return this; + } + + public TikaGrpcServer setPrivateKey(File privateKey) { + this.privateKey = privateKey; + return this; + } + + public TikaGrpcServer setPrivateKeyPassword(String privateKeyPassword) { + this.privateKeyPassword = privateKeyPassword; + return this; + } + + public TikaGrpcServer 
setTrustCertCollection(File trustCertCollection) { + this.trustCertCollection = trustCertCollection; + return this; + } + + public TikaGrpcServer setClientAuthRequired(boolean clientAuthRequired) { + this.clientAuthRequired = clientAuthRequired; + return this; + } +} diff --git a/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java new file mode 100644 index 0000000000..42872e0875 --- /dev/null +++ b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -0,0 +1,434 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.grpc; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.module.jsonSchema.JsonSchema; +import com.fasterxml.jackson.module.jsonSchema.JsonSchemaGenerator; +import com.google.rpc.Status; +import io.grpc.protobuf.StatusProto; +import io.grpc.stub.StreamObserver; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.xml.sax.SAXException; + +import org.apache.tika.DeleteFetcherReply; +import org.apache.tika.DeleteFetcherRequest; +import org.apache.tika.FetchAndParseReply; +import org.apache.tika.FetchAndParseRequest; +import org.apache.tika.GetFetcherConfigJsonSchemaReply; +import org.apache.tika.GetFetcherConfigJsonSchemaRequest; +import org.apache.tika.GetFetcherReply; +import org.apache.tika.GetFetcherRequest; +import org.apache.tika.ListFetchersReply; +import org.apache.tika.ListFetchersRequest; +import org.apache.tika.SaveFetcherReply; +import org.apache.tika.SaveFetcherRequest; +import 
org.apache.tika.TikaGrpc; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.Param; +import org.apache.tika.config.TikaConfigSerializer; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.FetchEmitTuple; +import org.apache.tika.pipes.PipesClient; +import org.apache.tika.pipes.PipesConfig; +import org.apache.tika.pipes.PipesResult; +import org.apache.tika.pipes.emitter.EmitKey; +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.FetchKey; +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { + private static final Logger LOG = LoggerFactory.getLogger(TikaConfigSerializer.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + static { + OBJECT_MAPPER.setSerializationInclusion(JsonInclude.Include.NON_NULL); + } + public static final JsonSchemaGenerator JSON_SCHEMA_GENERATOR = new JsonSchemaGenerator(OBJECT_MAPPER); + + /** + * FetcherID is key, The pair is the Fetcher object and the Metadata + */ + PipesConfig pipesConfig; + PipesClient pipesClient; + ExpiringFetcherStore expiringFetcherStore; + + String tikaConfigPath; + + TikaGrpcServerImpl(String tikaConfigPath) + throws TikaConfigException, IOException, ParserConfigurationException, + TransformerException, SAXException { + File tikaConfigFile = new File(tikaConfigPath); + if (!tikaConfigFile.canWrite()) { + File tmpTikaConfigFile = File.createTempFile("configCopy", tikaConfigFile.getName()); + tmpTikaConfigFile.deleteOnExit(); + LOG.info("Tika config file {} is read-only. 
Making a temporary copy to {}", tikaConfigFile, tmpTikaConfigFile); + String tikaConfigFileContents = FileUtils.readFileToString(tikaConfigFile, StandardCharsets.UTF_8); + FileUtils.writeStringToFile(tmpTikaConfigFile, tikaConfigFileContents, StandardCharsets.UTF_8); + tikaConfigFile = tmpTikaConfigFile; + tikaConfigPath = tikaConfigFile.getAbsolutePath(); + } + pipesConfig = PipesConfig.load(tikaConfigFile.toPath()); + pipesClient = new PipesClient(pipesConfig); + + expiringFetcherStore = new ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(), + pipesConfig.getStaleFetcherDelaySeconds()); + this.tikaConfigPath = tikaConfigPath; + updateTikaConfig(); + } + + private void updateTikaConfig() + throws ParserConfigurationException, IOException, SAXException, TransformerException { + Document tikaConfigDoc = + DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath); + + Element fetchersElement = (Element) tikaConfigDoc.getElementsByTagName("fetchers").item(0); + for (int i = 0; i < fetchersElement.getChildNodes().getLength(); ++i) { + fetchersElement.removeChild(fetchersElement.getChildNodes().item(i)); + } + for (var fetcherEntry : expiringFetcherStore.getFetchers().entrySet()) { + AbstractFetcher fetcherObject = fetcherEntry.getValue(); + Map fetcherConfigParams = OBJECT_MAPPER.convertValue( + expiringFetcherStore.getFetcherConfigs().get(fetcherEntry.getKey()), + new TypeReference<>() { + }); + Element fetcher = tikaConfigDoc.createElement("fetcher"); + fetcher.setAttribute("class", fetcherEntry.getValue().getClass().getName()); + Element fetcherName = tikaConfigDoc.createElement("name"); + fetcherName.setTextContent(fetcherObject.getName()); + fetcher.appendChild(fetcherName); + populateFetcherConfigs(fetcherConfigParams, tikaConfigDoc, fetcher); + fetchersElement.appendChild(fetcher); + } + DOMSource source = new DOMSource(tikaConfigDoc); + FileWriter writer = new FileWriter(tikaConfigPath, StandardCharsets.UTF_8); + 
StreamResult result = new StreamResult(writer); + + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + Transformer transformer = transformerFactory.newTransformer(); + transformer.transform(source, result); + } + + private void populateFetcherConfigs(Map fetcherConfigParams, + Document tikaConfigDoc, Element fetcher) { + for (var configParam : fetcherConfigParams.entrySet()) { + Element configElm = tikaConfigDoc.createElement(configParam.getKey()); + fetcher.appendChild(configElm); + if (configParam.getValue() instanceof List) { + List configParamVal = (List) configParam.getValue(); + String singularName = configParam.getKey().substring(0, configParam.getKey().length() - 1); + for (Object configParamObj : configParamVal) { + Element childElement = tikaConfigDoc.createElement(singularName); + childElement.setTextContent(Objects.toString(configParamObj)); + configElm.appendChild(childElement); + } + } else { + configElm.setTextContent(Objects.toString(configParam.getValue())); + } + } + } + + @Override + public void fetchAndParseServerSideStreaming(FetchAndParseRequest request, + StreamObserver responseObserver) { + fetchAndParseImpl(request, responseObserver); + } + + @Override + public StreamObserver fetchAndParseBiDirectionalStreaming( + StreamObserver responseObserver) { + return new StreamObserver<>() { + @Override + public void onNext(FetchAndParseRequest fetchAndParseRequest) { + fetchAndParseImpl(fetchAndParseRequest, responseObserver); + } + + @Override + public void onError(Throwable throwable) { + LOG.error("Parse error occurred", throwable); + } + + @Override + public void onCompleted() { + responseObserver.onCompleted(); + } + }; + } + + @Override + public void fetchAndParse(FetchAndParseRequest request, + StreamObserver responseObserver) { + fetchAndParseImpl(request, responseObserver); + responseObserver.onCompleted(); + } + + + private void fetchAndParseImpl(FetchAndParseRequest request, + StreamObserver responseObserver) { + 
AbstractFetcher fetcher = + expiringFetcherStore.getFetcherAndLogAccess(request.getFetcherId()); + if (fetcher == null) { + throw new RuntimeException( + "Could not find fetcher with name " + request.getFetcherId()); + } + Metadata tikaMetadata = new Metadata(); + try { + String metadataJson = request.getMetadataJson(); + loadMetadata(metadataJson, tikaMetadata); + ParseContext parseContext = new ParseContext(); + PipesResult pipesResult = pipesClient.process(new FetchEmitTuple(request.getFetchKey(), + new FetchKey(fetcher.getName(), request.getFetchKey()), new EmitKey(), tikaMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + FetchAndParseReply.Builder fetchReplyBuilder = + FetchAndParseReply.newBuilder() + .setFetchKey(request.getFetchKey()) + .setStatus(pipesResult.getStatus().name()); + if (pipesResult.getStatus().equals(PipesResult.STATUS.FETCH_EXCEPTION)) { + fetchReplyBuilder.setErrorMessage(pipesResult.getMessage()); + } + if (pipesResult.getEmitData() != null && pipesResult.getEmitData().getMetadataList() != null) { + for (Metadata metadata : pipesResult.getEmitData().getMetadataList()) { + for (String name : metadata.names()) { + String value = metadata.get(name); + if (value != null) { + fetchReplyBuilder.putFields(name, value); + } + } + } + } + responseObserver.onNext(fetchReplyBuilder.build()); + } catch (IOException e) { + throw new RuntimeException(e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private static void loadMetadata(String metadataJson, Metadata tikaMetadata) throws JsonProcessingException { + Map metadataJsonObject = new HashMap<>(); + if (!StringUtils.isBlank(metadataJson)) { + try { + metadataJsonObject = OBJECT_MAPPER.readValue(metadataJson, new TypeReference<>() {}); + } catch (JsonProcessingException e) { + metadataJsonObject = new HashMap<>(); + } + } + for (Map.Entry entry : metadataJsonObject.entrySet()) { + if (entry.getValue() instanceof List) { + List list = (List) 
entry.getValue(); + tikaMetadata.set(Property.externalText(entry.getKey()), list.stream() + .map(String::valueOf) + .collect(Collectors.toList()) + .toArray(new String[] {})); + } else if (entry.getValue() instanceof String) { + tikaMetadata.set(Property.externalText(entry.getKey()), (String) entry.getValue()); + } else if (entry.getValue() instanceof Integer) { + tikaMetadata.set(Property.externalText(entry.getKey()), (Integer) entry.getValue()); + } else if (entry.getValue() instanceof Double) { + tikaMetadata.set(Property.externalText(entry.getKey()), (Double) entry.getValue()); + } else if (entry.getValue() instanceof Float) { + tikaMetadata.set(Property.externalText(entry.getKey()), (Float) entry.getValue()); + } else if (entry.getValue() instanceof Boolean) { + tikaMetadata.set(Property.externalText(entry.getKey()), (Boolean) entry.getValue()); + } + } + } + + @SuppressWarnings("raw") + @Override + public void saveFetcher(SaveFetcherRequest request, + StreamObserver responseObserver) { + SaveFetcherReply reply = + SaveFetcherReply.newBuilder().setFetcherId(request.getFetcherId()).build(); + try { + Map fetcherConfigMap = OBJECT_MAPPER.readValue(request.getFetcherConfigJson(), new TypeReference<>() {}); + Map tikaParamsMap = createTikaParamMap(fetcherConfigMap); + saveFetcher(request.getFetcherId(), request.getFetcherClass(), fetcherConfigMap, tikaParamsMap); + updateTikaConfig(); + } catch (Exception e) { + throw new RuntimeException(e); + } + responseObserver.onNext(reply); + responseObserver.onCompleted(); + } + + private void saveFetcher(String name, String fetcherClassName, Map paramsMap, Map tikaParamsMap) { + try { + if (paramsMap == null) { + paramsMap = new LinkedHashMap<>(); + } + Class fetcherClass = + (Class) Class.forName(fetcherClassName); + String configClassName = + fetcherClass.getPackageName() + ".config." 
+ fetcherClass.getSimpleName() + + "Config"; + Class configClass = + (Class) Class.forName(configClassName); + AbstractConfig configObject = OBJECT_MAPPER.convertValue(paramsMap, configClass); + AbstractFetcher abstractFetcher = + fetcherClass.getDeclaredConstructor(configClass).newInstance(configObject); + abstractFetcher.setName(name); + if (Initializable.class.isAssignableFrom(fetcherClass)) { + Initializable initializable = (Initializable) abstractFetcher; + initializable.initialize(tikaParamsMap); + } + if (expiringFetcherStore.deleteFetcher(name)) { + LOG.info("Updating fetcher {}", name); + } else { + LOG.info("Creating new fetcher {}", name); + } + expiringFetcherStore.createFetcher(abstractFetcher, configObject); + } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | + InvocationTargetException | NoSuchMethodException | TikaConfigException e) { + throw new RuntimeException(e); + } + } + + private static Map createTikaParamMap(Map fetcherConfigMap) { + Map tikaParamsMap = new HashMap<>(); + for (Map.Entry entry : fetcherConfigMap.entrySet()) { + if (entry.getValue() != null) { + tikaParamsMap.put(entry.getKey(), new Param<>(entry.getKey(), entry.getValue())); + } + } + return tikaParamsMap; + } + + static Status notFoundStatus(String fetcherId) { + return Status.newBuilder() + .setCode(io.grpc.Status.Code.NOT_FOUND.value()) + .setMessage("Could not find fetcher with id:" + fetcherId) + .build(); + } + + @Override + public void getFetcher(GetFetcherRequest request, + StreamObserver responseObserver) { + GetFetcherReply.Builder getFetcherReply = GetFetcherReply.newBuilder(); + AbstractConfig abstractConfig = + expiringFetcherStore.getFetcherConfigs().get(request.getFetcherId()); + AbstractFetcher abstractFetcher = expiringFetcherStore.getFetchers().get(request.getFetcherId()); + if (abstractFetcher == null || abstractConfig == null) { + 
responseObserver.onError(StatusProto.toStatusException(notFoundStatus(request.getFetcherId()))); + return; + } + getFetcherReply.setFetcherId(request.getFetcherId()); + getFetcherReply.setFetcherClass(abstractFetcher.getClass().getName()); + Map paramMap = OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() {}); + paramMap.forEach( + (k, v) -> getFetcherReply.putParams(Objects.toString(k), Objects.toString(v))); + responseObserver.onNext(getFetcherReply.build()); + responseObserver.onCompleted(); + } + + @Override + public void listFetchers(ListFetchersRequest request, + StreamObserver responseObserver) { + ListFetchersReply.Builder listFetchersReplyBuilder = ListFetchersReply.newBuilder(); + for (Map.Entry fetcherConfig : expiringFetcherStore.getFetcherConfigs() + .entrySet()) { + GetFetcherReply.Builder replyBuilder = saveFetcherReply(fetcherConfig); + listFetchersReplyBuilder.addGetFetcherReplies(replyBuilder.build()); + } + responseObserver.onNext(listFetchersReplyBuilder.build()); + responseObserver.onCompleted(); + } + + private GetFetcherReply.Builder saveFetcherReply( + Map.Entry fetcherConfig) { + AbstractFetcher abstractFetcher = + expiringFetcherStore.getFetchers().get(fetcherConfig.getKey()); + AbstractConfig abstractConfig = + expiringFetcherStore.getFetcherConfigs().get(fetcherConfig.getKey()); + GetFetcherReply.Builder replyBuilder = + GetFetcherReply.newBuilder().setFetcherClass(abstractFetcher.getClass().getName()) + .setFetcherId(abstractFetcher.getName()); + loadParamsIntoReply(abstractConfig, replyBuilder); + return replyBuilder; + } + + private static void loadParamsIntoReply(AbstractConfig abstractConfig, + GetFetcherReply.Builder replyBuilder) { + Map paramMap = + OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() { + }); + if (paramMap != null) { + paramMap.forEach( + (k, v) -> replyBuilder.putParams(Objects.toString(k), Objects.toString(v))); + } + } + + @Override + public void deleteFetcher(DeleteFetcherRequest 
request, + StreamObserver responseObserver) { + boolean successfulDelete = deleteFetcher(request.getFetcherId()); + if (successfulDelete) { + try { + updateTikaConfig(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + responseObserver.onNext(DeleteFetcherReply.newBuilder().setSuccess(successfulDelete).build()); + responseObserver.onCompleted(); + } + + @Override + public void getFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest request, StreamObserver responseObserver) { + GetFetcherConfigJsonSchemaReply.Builder builder = GetFetcherConfigJsonSchemaReply.newBuilder(); + try { + JsonSchema jsonSchema = JSON_SCHEMA_GENERATOR.generateSchema(Class.forName(request.getFetcherClass())); + builder.setFetcherConfigJsonSchema(OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema)); + } catch (ClassNotFoundException | JsonProcessingException e) { + throw new RuntimeException("Could not create json schema for " + request.getFetcherClass(), e); + } + responseObserver.onNext(builder.build()); + responseObserver.onCompleted(); + } + + private boolean deleteFetcher(String fetcherName) { + return expiringFetcherStore.deleteFetcher(fetcherName); + } +} diff --git a/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/exception/FetcherNotFoundException.java b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/exception/FetcherNotFoundException.java new file mode 100644 index 0000000000..72116d6ded --- /dev/null +++ b/tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/exception/FetcherNotFoundException.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.grpc.exception; + +public class FetcherNotFoundException extends Exception { + public FetcherNotFoundException(String message) { + super(message); + } +} diff --git a/tika-pipes/tika-grpc/src/main/proto/tika.proto b/tika-pipes/tika-grpc/src/main/proto/tika.proto new file mode 100644 index 0000000000..18761aac03 --- /dev/null +++ b/tika-pipes/tika-grpc/src/main/proto/tika.proto @@ -0,0 +1,146 @@ +// Copyright 2015 The gRPC Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +syntax = "proto3"; +package tika; + +option go_package = "apache.org/tika"; + +option java_multiple_files = true; +option java_package = "org.apache.tika"; +option java_outer_classname = "TikaProto"; +option objc_class_prefix = "HLW"; + +// The Tika Grpc Service definition +service Tika { + /* + Save a fetcher to the fetcher store. + */ + rpc SaveFetcher(SaveFetcherRequest) returns (SaveFetcherReply) {} + /* + Get a fetcher's data from the fetcher store. 
+ */ + rpc GetFetcher(GetFetcherRequest) returns (GetFetcherReply) {} + /* + List fetchers that are currently in the fetcher store. + */ + rpc ListFetchers(ListFetchersRequest) returns (ListFetchersReply) {} + /* + Delete a fetcher from the fetcher store. + */ + rpc DeleteFetcher(DeleteFetcherRequest) returns (DeleteFetcherReply) {} + /* + Using a Fetcher in the fetcher store, send a FetchAndParse request. This will fetch, parse, and return + the FetchParseTuple data output from Tika Pipes. This is a synchronous call that immediately returns 1 result. + */ + rpc FetchAndParse(FetchAndParseRequest) returns (FetchAndParseReply) {} + /* + Using a Fetcher in the fetcher store, send a FetchAndParse request. This will fetch, parse, and return + the FetchParseTuple data output from Tika Pipes. This will stream the data from the server in response. + */ + rpc FetchAndParseServerSideStreaming(FetchAndParseRequest) + returns (stream FetchAndParseReply) {} + /* + Using a Fetcher in the fetcher store, send a FetchAndParse request. This will fetch, parse, and return + the FetchParseTuple data output from Tika Pipes. This serves a bi-directional stream of fetch inputs and + parsed outputs. + */ + rpc FetchAndParseBiDirectionalStreaming(stream FetchAndParseRequest) + returns (stream FetchAndParseReply) {} + /* + Get the Fetcher Config schema for a given fetcher class. + */ + rpc GetFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest) returns (GetFetcherConfigJsonSchemaReply) {} +} + +message SaveFetcherRequest { + // A unique identifier for each fetcher. If this already exists, operation will overwrite existing. + string fetcher_id = 1; + // The full java class name of the fetcher class. List of + // fetcher classes is found here: https://cwiki.apache.org/confluence/display/TIKA/tika-pipes + string fetcher_class = 2; + // JSON string of the fetcher config object. To see the json schema from which to build this json, + // use the GetFetcherConfigJsonSchema rpc method. 
+ string fetcher_config_json = 3; +} + +message SaveFetcherReply { + // The fetcher_id that was saved. + string fetcher_id = 1; +} + +message FetchAndParseRequest { + // The ID of the fetcher in the fetcher store (previously saved by SaveFetcher) to use for the fetch. + string fetcher_id = 1; + // The "Fetch Key" of the item that will be fetched. + string fetch_key = 2; + // Additional metadata describing how to fetch and parse the item. + string metadata_json = 3; +} + +message FetchAndParseReply { + // Echoes the fetch_key that was sent in the request. + string fetch_key = 1; + // Metadata fields from the parse output. + map fields = 2; + // The status from the message. See javadoc for org.apache.tika.pipes.PipesResult.STATUS for the list of status. + string status = 3; + // If there was an error, this will contain the error message. + string error_message = 4; +} + +message DeleteFetcherRequest { + // ID of the fetcher to delete. + string fetcher_id = 1; +} + +message DeleteFetcherReply { + // Success if the fetcher was successfully removed from the fetch store. + bool success = 1; +} + +message GetFetcherRequest { + // ID of the fetcher for which to return config. + string fetcher_id = 1; +} + +message GetFetcherReply { + // Echoes the ID of the fetcher being returned. + string fetcher_id = 1; + // The full Java class name of the Fetcher. + string fetcher_class = 2; + // The configuration parameters. + map params = 3; +} + +message ListFetchersRequest { + // List the fetchers starting at this page number + int32 page_number = 1; + // List this many fetchers per page. + int32 num_fetchers_per_page = 2; +} + +message ListFetchersReply { + // List of fetcher configs returned by the Lists Fetchers service. + repeated GetFetcherReply get_fetcher_replies = 1; +} + +message GetFetcherConfigJsonSchemaRequest { + // The full java class name of the fetcher config for which to fetch json schema. 
+ string fetcher_class = 1; +} + +message GetFetcherConfigJsonSchemaReply { + // The json schema that describes the fetcher config in string format. + string fetcher_config_json_schema = 1; +} diff --git a/tika-pipes/tika-grpc/src/main/resources/log4j2.xml b/tika-pipes/tika-grpc/src/main/resources/log4j2.xml new file mode 100644 index 0000000000..c88e66e99e --- /dev/null +++ b/tika-pipes/tika-grpc/src/main/resources/log4j2.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java b/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java new file mode 100644 index 0000000000..264c366f38 --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.grpc; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import java.io.InputStream; +import java.time.Duration; + +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +class ExpiringFetcherStoreTest { + + @Test + void createFetcher() { + try (ExpiringFetcherStore expiringFetcherStore = new ExpiringFetcherStore(1, 5)) { + AbstractFetcher fetcher = new AbstractFetcher() { + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) { + return null; + } + }; + fetcher.setName("nick"); + AbstractConfig config = new AbstractConfig() { + }; + expiringFetcherStore.createFetcher(fetcher, config); + + Assertions.assertNotNull(expiringFetcherStore + .getFetchers() + .get(fetcher.getName())); + + Awaitility + .await() + .atMost(Duration.ofSeconds(60)) + .until(() -> expiringFetcherStore + .getFetchers() + .get(fetcher.getName()) == null); + + assertNull(expiringFetcherStore + .getFetcherConfigs() + .get(fetcher.getName())); + } + } +} diff --git a/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java b/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java new file mode 100644 index 0000000000..e78110abb1 --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.grpc; + +import java.io.File; +import java.io.IOException; +import java.net.InetAddress; +import java.net.ServerSocket; +import java.nio.file.Paths; +import java.time.Duration; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableMap; +import io.grpc.Grpc; +import io.grpc.ManagedChannel; +import io.grpc.TlsChannelCredentials; +import io.grpc.netty.shaded.io.netty.handler.ssl.util.InsecureTrustManagerFactory; +import io.grpc.stub.StreamObserver; +import org.apache.commons.io.FileUtils; +import org.awaitility.Awaitility; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.handler.ResourceHandler; +import org.eclipse.jetty.util.resource.PathResource; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.FetchAndParseReply; +import org.apache.tika.FetchAndParseRequest; +import org.apache.tika.SaveFetcherReply; +import org.apache.tika.SaveFetcherRequest; +import 
org.apache.tika.TikaGrpc; +import org.apache.tika.pipes.fetcher.http.HttpFetcher; + +/** + * This test will start an HTTP server using jetty. + * Then it will start Tika Pipes Grpc service. + * Then it will, using a bidirectional stream of data, send urls to the + * HTTP fetcher whilst simultaneously receiving parsed output as they parse. + */ +class PipesBiDirectionalStreamingIntegrationTest { + static final Logger LOGGER = LoggerFactory.getLogger(PipesBiDirectionalStreamingIntegrationTest.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + static File tikaConfigXmlTemplate = Paths + .get("src", "test", "resources", "tika-pipes-test-config.xml") + .toFile(); + static File tikaConfigXml = new File("target", "tika-config-" + UUID.randomUUID() + ".xml"); + static TikaGrpcServer grpcServer; + static int grpcPort; + static String httpServerUrl; + static TikaGrpc.TikaBlockingStub tikaBlockingStub; + static TikaGrpc.TikaStub tikaStub; + static Server httpServer; + static int httpServerPort; + String httpFetcherId = "httpFetcherIdHere"; + List files = Arrays.asList("014760.docx", "017091.docx", "017097.docx", "018367.docx"); + + static int findAvailablePort() throws IOException { + try (ServerSocket serverSocket = new ServerSocket(0)) { + return serverSocket.getLocalPort(); + } + } + + @BeforeAll + static void setUpHttpServer() throws Exception { + // Specify the folder from which files will be served + httpServerPort = findAvailablePort(); + httpServer = new Server(httpServerPort); + + ResourceHandler resourceHandler = new ResourceHandler(); + resourceHandler.setDirAllowed(true); + resourceHandler.setBaseResource(new PathResource(Paths.get("src", "test", "resources", "test-files"))); + httpServer.setHandler(resourceHandler); + httpServer.start(); + + httpServerUrl = "http://" + InetAddress + .getByName("localhost") + .getHostAddress() + ":" + httpServerPort; + } + + @BeforeAll + static void setUpGrpcServer() throws Exception { + grpcPort = 
findAvailablePort(); + FileUtils.copyFile(tikaConfigXmlTemplate, tikaConfigXml); + + grpcServer = new TikaGrpcServer(); + grpcServer.setTikaConfigXml(tikaConfigXml); + grpcServer.setPort(grpcPort); + grpcServer.setSecure(true); + grpcServer.setCertChain(Paths.get("src", "test", "resources", "certs", "server1.pem").toFile()); + grpcServer.setPrivateKey(Paths.get("src", "test", "resources", "certs", "server1.key").toFile()); + grpcServer.setTrustCertCollection(Paths.get("src", "test", "resources", "certs", "ca.pem").toFile()); + grpcServer.setClientAuthRequired(true); + grpcServer.start(); + + String target = InetAddress + .getByName("localhost") + .getHostAddress() + ":" + grpcPort; + + TlsChannelCredentials.Builder channelCredBuilder = TlsChannelCredentials.newBuilder(); + File clientCertChain = Paths.get("src", "test", "resources", "certs", "client.pem").toFile(); + File clientPrivateKey = Paths.get("src", "test", "resources", "certs", "client.key").toFile(); + channelCredBuilder.keyManager(clientCertChain, clientPrivateKey); + channelCredBuilder.trustManager(InsecureTrustManagerFactory.INSTANCE.getTrustManagers()); + + ManagedChannel channel = Grpc + .newChannelBuilder(target, channelCredBuilder.build()) + .build(); + + tikaBlockingStub = TikaGrpc.newBlockingStub(channel); + tikaStub = TikaGrpc.newStub(channel); + } + + @AfterAll + static void stopHttpServer() throws Exception { + if (httpServer != null) { + httpServer.stop(); + } + } + + @AfterAll + static void stopGrpcServer() throws Exception { + if (grpcServer != null) { + grpcServer.stop(); + } + } + + @BeforeEach + void createHttpFetcher() throws Exception { + SaveFetcherRequest saveFetcherRequest = SaveFetcherRequest + .newBuilder() + .setFetcherId(httpFetcherId) + .setFetcherClass(HttpFetcher.class.getName()) + .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap + .builder() + .put("requestTimeout", 30_000) + .put("socketTimeout", 30_000) + .put("connectTimeout", 20_000) + 
.put("maxConnectionsPerRoute", 200) + .put("maxRedirects", 0) + .put("maxSpoolSize", -1) + .put("overallTimeout", 50_000) + .build())) + .build(); + SaveFetcherReply saveFetcherReply = tikaBlockingStub.saveFetcher(saveFetcherRequest); + Assertions.assertEquals(saveFetcherReply.getFetcherId(), httpFetcherId); + } + + @Test + void testHttpFetchScenario() throws Exception { + AtomicInteger numParsed = new AtomicInteger(); + Map> result = Collections.synchronizedMap(new HashMap<>()); + StreamObserver responseObserver = new StreamObserver<>() { + @Override + public void onNext(FetchAndParseReply fetchAndParseReply) { + LOGGER.info("Parsed: {}", fetchAndParseReply.getFetchKey()); + numParsed.incrementAndGet(); + result.put(fetchAndParseReply.getFetchKey(), fetchAndParseReply.getFieldsMap()); + } + + @Override + public void onError(Throwable throwable) { + LOGGER.error("Error occurred", throwable); + } + + @Override + public void onCompleted() { + LOGGER.info("Completed fetching."); + } + }; + StreamObserver request = tikaStub.fetchAndParseBiDirectionalStreaming(responseObserver); + for (String file : files) { + request.onNext(FetchAndParseRequest + .newBuilder() + .setFetcherId(httpFetcherId) + .setFetchKey(httpServerUrl + "/" + file) + .build()); + } + request.onCompleted(); + + Awaitility.await().atMost(Duration.ofSeconds(600)).until(() -> result.size() == files.size()); + + Assertions.assertEquals(files.size(), numParsed.get()); + } +} diff --git a/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java b/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java new file mode 100644 index 0000000000..80f391e33b --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.grpc; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; + +import com.asarkar.grpc.test.GrpcCleanupExtension; +import com.asarkar.grpc.test.Resources; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableMap; +import io.grpc.ManagedChannel; +import io.grpc.Server; +import io.grpc.Status; +import io.grpc.StatusRuntimeException; +import io.grpc.inprocess.InProcessChannelBuilder; +import io.grpc.inprocess.InProcessServerBuilder; +import io.grpc.stub.StreamObserver; +import org.apache.commons.io.FileUtils; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.Assertions; +import 
org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.DeleteFetcherReply; +import org.apache.tika.DeleteFetcherRequest; +import org.apache.tika.FetchAndParseReply; +import org.apache.tika.FetchAndParseRequest; +import org.apache.tika.GetFetcherReply; +import org.apache.tika.GetFetcherRequest; +import org.apache.tika.SaveFetcherReply; +import org.apache.tika.SaveFetcherRequest; +import org.apache.tika.TikaGrpc; +import org.apache.tika.pipes.PipesResult; +import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher; + +@ExtendWith(GrpcCleanupExtension.class) +public class TikaGrpcServerTest { + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerTest.class); + public static final int NUM_TEST_DOCS = 2; + static File tikaConfigXmlTemplate = Paths + .get("src", "test", "resources", "tika-pipes-test-config.xml") + .toFile(); + static File tikaConfigXml = new File("target", "tika-config-" + UUID.randomUUID() + ".xml"); + + + @BeforeAll + static void init() throws Exception { + FileUtils.copyFile(tikaConfigXmlTemplate, tikaConfigXml); + } + + static final int NUM_FETCHERS_TO_CREATE = 10; + + @Test + public void testFetcherCrud(Resources resources) throws Exception { + Assertions.assertTrue(tikaConfigXml.setWritable(false)); + String serverName = InProcessServerBuilder.generateName(); + + Server server = InProcessServerBuilder + .forName(serverName) + .directExecutor() + .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())) + .build() + .start(); + resources.register(server, Duration.ofSeconds(10)); + + ManagedChannel channel = InProcessChannelBuilder + .forName(serverName) + .directExecutor() + .build(); + resources.register(channel, Duration.ofSeconds(10)); + TikaGrpc.TikaBlockingStub blockingStub = 
TikaGrpc.newBlockingStub(channel); + + String targetFolder = new File("target").getAbsolutePath(); + // create fetchers + for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) { + String fetcherId = createFetcherId(i); + SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest + .newBuilder() + .setFetcherId(fetcherId) + .setFetcherClass(FileSystemFetcher.class.getName()) + .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap + .builder() + .put("basePath", targetFolder) + .put("extractFileSystemMetadata", true) + .build())) + .build()); + assertEquals(fetcherId, reply.getFetcherId()); + } + // update fetchers + for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) { + String fetcherId = createFetcherId(i); + SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest + .newBuilder() + .setFetcherId(fetcherId) + .setFetcherClass(FileSystemFetcher.class.getName()) + .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap + .builder() + .put("basePath", targetFolder) + .put("extractFileSystemMetadata", false) + .build())) + .build()); + assertEquals(fetcherId, reply.getFetcherId()); + GetFetcherReply getFetcherReply = blockingStub.getFetcher(GetFetcherRequest + .newBuilder() + .setFetcherId(fetcherId) + .build()); + assertEquals("false", getFetcherReply + .getParamsMap() + .get("extractFileSystemMetadata")); + } + + // get fetchers + for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) { + String fetcherId = createFetcherId(i); + GetFetcherReply getFetcherReply = blockingStub.getFetcher(GetFetcherRequest + .newBuilder() + .setFetcherId(fetcherId) + .build()); + assertEquals(fetcherId, getFetcherReply.getFetcherId()); + assertEquals(FileSystemFetcher.class.getName(), getFetcherReply.getFetcherClass()); + } + + // delete fetchers + for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) { + String fetcherId = createFetcherId(i); + DeleteFetcherReply deleteFetcherReply = blockingStub.deleteFetcher(DeleteFetcherRequest + .newBuilder() + 
.setFetcherId(fetcherId) + .build()); + Assertions.assertTrue(deleteFetcherReply.getSuccess()); + StatusRuntimeException statusRuntimeException = Assertions.assertThrows(StatusRuntimeException.class, () -> blockingStub.getFetcher(GetFetcherRequest + .newBuilder() + .setFetcherId(fetcherId) + .build())); + Assertions.assertEquals(Status.NOT_FOUND + .getCode() + .value(), statusRuntimeException + .getStatus() + .getCode() + .value()); + } + } + + @NotNull + private static String createFetcherId(int i) { + return "nick" + i + ":is:cool:super/" + FileSystemFetcher.class; + } + + @Test + public void testBiStream(Resources resources) throws Exception { + String serverName = InProcessServerBuilder.generateName(); + + Server server = InProcessServerBuilder + .forName(serverName) + .directExecutor() + .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())) + .build() + .start(); + resources.register(server, Duration.ofSeconds(10)); + + ManagedChannel channel = InProcessChannelBuilder + .forName(serverName) + .directExecutor() + .build(); + resources.register(channel, Duration.ofSeconds(10)); + TikaGrpc.TikaBlockingStub blockingStub = TikaGrpc.newBlockingStub(channel); + TikaGrpc.TikaStub tikaStub = TikaGrpc.newStub(channel); + + String fetcherId = createFetcherId(1); + String targetFolder = new File("target").getAbsolutePath(); + SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest + .newBuilder() + .setFetcherId(fetcherId) + .setFetcherClass(FileSystemFetcher.class.getName()) + .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap + .builder() + .put("basePath", targetFolder) + .put("extractFileSystemMetadata", true) + .build())) + .build()); + + assertEquals(fetcherId, reply.getFetcherId()); + + List successes = Collections.synchronizedList(new ArrayList<>()); + List errors = Collections.synchronizedList(new ArrayList<>()); + AtomicBoolean finished = new AtomicBoolean(false); + + StreamObserver replyStreamObserver = new 
StreamObserver<>() { + @Override + public void onNext(FetchAndParseReply fetchAndParseReply) { + LOG.debug("Fetched {} with metadata {}", fetchAndParseReply.getFetchKey(), fetchAndParseReply.getFieldsMap()); + if (PipesResult.STATUS.FETCH_EXCEPTION.name().equals(fetchAndParseReply.getStatus())) { + errors.add(fetchAndParseReply); + } else { + successes.add(fetchAndParseReply); + } + } + + @Override + public void onError(Throwable throwable) { + fail(throwable); + } + + @Override + public void onCompleted() { + LOG.info("Stream completed"); + finished.set(true); + } + }; + + StreamObserver requestStreamObserver = tikaStub.fetchAndParseBiDirectionalStreaming(replyStreamObserver); + + File testDocumentFolder = new File("target/" + DateTimeFormatter + .ofPattern("yyyy_MM_dd_HH_mm_ssSSS", Locale.getDefault()) + .format(LocalDateTime.now(ZoneId.systemDefault())) + "-" + UUID.randomUUID()); + assertTrue(testDocumentFolder.mkdir()); + try { + for (int i = 0; i < NUM_TEST_DOCS; ++i) { + File testFile = new File(testDocumentFolder, "test-" + i + ".html"); + FileUtils.writeStringToFile(testFile, "test " + i + "", StandardCharsets.UTF_8); + } + File[] testDocuments = testDocumentFolder.listFiles(); + assertNotNull(testDocuments); + for (File testDocument : testDocuments) { + requestStreamObserver.onNext(FetchAndParseRequest + .newBuilder() + .setFetcherId(fetcherId) + .setFetchKey(testDocument.getAbsolutePath()) + .build()); + } + // Now test error condition + requestStreamObserver.onNext(FetchAndParseRequest + .newBuilder() + .setFetcherId(fetcherId) + .setFetchKey("does not exist") + .build()); + requestStreamObserver.onCompleted(); + assertEquals(NUM_TEST_DOCS, successes.size()); + assertEquals(1, errors.size()); + assertTrue(finished.get()); + } finally { + FileUtils.deleteDirectory(testDocumentFolder); + } + } +} diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/README.md b/tika-pipes/tika-grpc/src/test/resources/certs/README.md new file mode 100644 index 
0000000000..7373d56354 --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/README.md @@ -0,0 +1,5 @@ +# Test certs for Tika Grpc mTLS + +Generate these using script as documented here: + +https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/testing/src/main/resources/certs diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/ca.key b/tika-pipes/tika-grpc/src/test/resources/certs/ca.key new file mode 100644 index 0000000000..03be0bfa6e --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/ca.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCwYvShd+UXQvOg +z4GH6pRT3KGrPDbDw45fma7+I0LJQ4GupoeLuYYfHvcYPTV2I3MLO+VxCp00gfo1 +BIvsNOkGNxrrqNhP27ve9l7YwOuvWdVu4u9+73znRx3GJQ4ie/nF/z6xMcbQL5r5 +UC8yGwuJGOyr6VcpEnKTnORtuwRPJuqnGgn4rsKhLLfJz+RAhjdOKnAS3CQo/iHP +KjoqIZ38M97GJ7icFQic3dtLUFR41nnN5ogLZ6DduR55btypPnlv5h6foLFjRMST +MEroAq39ZSJqUoyBPTBtPFFk7uRQIfdKrp7/Bd4V0n4e91Us+UCDlOcxo2lF1CKH +/ydEWmx3AgMBAAECggEAKrDosKQKKKUlvkg6+6CFIf8GiiFax+ru7KiPuCbkpT3X +h2P67pCKq8Gc4Jr/84YE9DUdBU0iW3ESE/7ztsnflIeF1n/ZSwrN39sVfbTD1n8R +r3LxsHFac8e8pxaU4zfKbmemztBTZFQBWFJV+fSdyCLmNX2WgPRcEuooR366PkWv +xZLAxeDGqpnsa62o1GdGmalxx8aljLN/QcbQi73mR9Osim1OtSd1cyDlZ/8x6OoV +Ae5GDN3Bj0hO9ZKzNWTbQpRw9SHKU6sWXtHlcDx4xi5kN/n9aptn7kixbY9y8uOM +5zjErVGWvxdP94IvlSkrkenwnIjlHBtdlAjVuCFioQKBgQDoJLyfHNWPBnjPGVnK +xcbIIwmf4C9UnZBbHRD3YxU/GBpsPgPh9EwhQTAXlGQGHeuslxCIh4cEfbOIrJ9b +/s3OqeL9CSUaz/N+1av1ZuwOI9CEvNPi51IK+rXNRmVJG8pG6RaKNx57pXaFtmqq +FUtC7twbPECvjspapn61nZYSiQKBgQDCg1tpGwZJJOCIkhYH4wFc4j4p0LxIcBJ2 +E3L9VnQ+APT/x8uitkZsuRY9tmWcHK8/zWTc1GpFdwGUJ9+Yzvprtej+P/buxM9J +Y6ZJZdCIHWDuh3eq+sXS4lwr5fi7ir5m97npG1bXPlOoYIJ7p172EyoNmurRIgiP +LWnzK0jG/wKBgQCRQtOouNFFcyZLaTCPutxdRddy7ESRrRq0eOax9pVH6tw12URy +snyk3naqepdwYG6li82zsSKig8nA/0uktDeyVwoLjhpiwbc7KZc1sxaI7o4/US1B +McBb0G/MqH0elz4myxnomP8BHhOhLflmvnZexrqCbFyJvk8PFFn7aUWMCQKBgDvX +9BCzOszYJqh94X9NrQapqJxu1u6mZFelhjRBHARTgQ0MqC8IS0R58UjNTBeqj5Re 
+mdCDHar/gSHW3qkBzPPEhMlsXol5TZjzqp5cT7sA5uicDwowmxpVgCwVVeBFQG0n +fDAmtCIGz/A2uQ5YIRQuMzr6VZJAGUgLndQtlfd7AoGBAMq1imggFKd1rt49XCnO +t97lpWOT+TlWYblHr01tOw+esawG5MFucqVI6tGpBSccTRQw6orWf4GK3KmkgQ6J +UgHKjwYsA0sf4U5vppkdkbAbM/WwUPOTQpGFRERyJqMqFGIc4wMtZOJBxXwf+9iD +l8tvan8w/6HugqnI7qqkTgLq +-----END PRIVATE KEY----- diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/ca.pem b/tika-pipes/tika-grpc/src/test/resources/certs/ca.pem new file mode 100644 index 0000000000..49d39cd8ed --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/ca.pem @@ -0,0 +1,20 @@ +-----BEGIN CERTIFICATE----- +MIIDWjCCAkKgAwIBAgIUWrP0VvHcy+LP6UuYNtiL9gBhD5owDQYJKoZIhvcNAQEL +BQAwVjELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM +GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEPMA0GA1UEAwwGdGVzdGNhMB4XDTIw +MDMxNzE4NTk1MVoXDTMwMDMxNTE4NTk1MVowVjELMAkGA1UEBhMCQVUxEzARBgNV +BAgMClNvbWUtU3RhdGUxITAfBgNVBAoMGEludGVybmV0IFdpZGdpdHMgUHR5IEx0 +ZDEPMA0GA1UEAwwGdGVzdGNhMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC +AQEAsGL0oXflF0LzoM+Bh+qUU9yhqzw2w8OOX5mu/iNCyUOBrqaHi7mGHx73GD01 +diNzCzvlcQqdNIH6NQSL7DTpBjca66jYT9u73vZe2MDrr1nVbuLvfu9850cdxiUO +Inv5xf8+sTHG0C+a+VAvMhsLiRjsq+lXKRJyk5zkbbsETybqpxoJ+K7CoSy3yc/k +QIY3TipwEtwkKP4hzyo6KiGd/DPexie4nBUInN3bS1BUeNZ5zeaIC2eg3bkeeW7c +qT55b+Yen6CxY0TEkzBK6AKt/WUialKMgT0wbTxRZO7kUCH3Sq6e/wXeFdJ+HvdV +LPlAg5TnMaNpRdQih/8nRFpsdwIDAQABoyAwHjAMBgNVHRMEBTADAQH/MA4GA1Ud +DwEB/wQEAwICBDANBgkqhkiG9w0BAQsFAAOCAQEAkTrKZjBrJXHps/HrjNCFPb5a +THuGPCSsepe1wkKdSp1h4HGRpLoCgcLysCJ5hZhRpHkRihhef+rFHEe60UePQO3S +CVTtdJB4CYWpcNyXOdqefrbJW5QNljxgi6Fhvs7JJkBqdXIkWXtFk2eRgOIP2Eo9 +/OHQHlYnwZFrk6sp4wPyR+A95S0toZBcyDVz7u+hOW0pGK3wviOe9lvRgj/H3Pwt +bewb0l+MhRig0/DVHamyVxrDRbqInU1/GTNCwcZkXKYFWSf92U+kIcTth24Q1gcw +eZiLl5FfrWokUNytFElXob0V0a5/kbhiLc3yWmvWqHTpqCALbVyF+rKJo2f5Kw== +-----END CERTIFICATE----- diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/client.key b/tika-pipes/tika-grpc/src/test/resources/certs/client.key new file mode 100644 index 
0000000000..349b40033d --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/client.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCyqYRp+DXVp72N +FbQH8hdhTZLycZXOlJhmMsrJmrjn2p7pI/8mTZ/0FC+SGWBGZV+ELiHrmCX5zfaI +Lr9Iuw7Ghr3Vzoefi8r62rLupVPNi/qdqyjWk2dECHC9Z3+Ag3KzKTyerXWjKcvy +KVmM0ZxE0RXhDW/RoQbqZsU2GKg1B2rhUU8KN0gVmKn0rJHOxzRVSYeYLYp5Yn7K +rtPJcKyo9aVuEr7dGANzpyF6lg/nYBWc+9SGwkoLdFvKvABYJMyrbNhHUQfv0fza +Z0P86dfTENrDxzALrzGnqcx3KTrwJjkZ/aSr1tyD0/tXvukRFiPxWBJhjHQ70GqT +FQY19RbhAgMBAAECggEAIL8JUhL4awyvpWhQ8xPgTSlWwbEn8BE0TacJnCILuhNM +BRdf8LlRk/8PKQwVpVF3TFbYSMI+U6b4hMVssfv3HVQc/083dHq+3XOwUCVlUstR +SAzTE2E5EDMr1stdh0SQhV4Nilfos9s5Uk1Z6IGSztoz1GgOErIc/mGPy/aA/hbr +fRWHvTp35+MbCJSvZuOeevX2iLs0dNzqdk6DiOWIH/BVGirVPtO6ykrkuTj1FWiN +hyZ3MBChShlNH2poNX46ntOc7nEus0qteOgxBK8lummFEtlehCA7hd/8xuvYlP0k +7aN684LCRDajmAGpoZO57NSDYQhAFGZeUZ93SMFucQKBgQDe7GGkzZFEiv91u1q9 +lgMy1h5dZjIZKgQaOarPC6wCQMUdqCf6cSLsAPr4T8EDoWsnY7dSnrTZ6YCIFL1T +idg8M3BQXipICCJkFORS76pKKZ0wMn3/NgkSepsmNct91WHr6okvx4tOaoRCtdzU +g7jt4Mr3sfLCiZtqTQyySdMUEwKBgQDNK+ZFKL0XhkWZP+PGKjWG8LWpPiK3d78/ +wYBFXzSTGlkr6FvRmYtZeNwXWRYLB4UxZ9At4hbJVEdi/2dITOz/sehVDyCAjjs3 +gycsc3UJqiZbcw5XKhI5TWBuWxkKENdbMSayogVbp2aSYoRblH764//t0ACmbfTW +KUQRQPB/uwKBgQC5QjjjfPL8w4cJkGoYpFKELO2PMR7xSrmeEc6hwlFwjeNCgjy3 +JM6g0y++rIj7O2qRkY0IXFxvvF3UuWedxTCu1xC/uYHp2ti506LsScB7YZoAM/YB +4iYn9Tx6xLoYGP0H0iGwU2SyBlNkHT8oXU+SYP5MWtYkVbeS3/VtNWz1gQKBgQCA +6Nk4kN0mH7YxEKRzSOfyzeDF4oV7kuB2FYUbkTL+TirC3K58JiYY5Egc31trOKFm +Jlz1xz0b6DkmKWTiV3r9OPHKJ8P7IeJxAZWmZzCdDuwkv0i+WW+z0zsIe3JjEavN +3zb6O7R0HtziksWoqMeTqZeO+wa9iw6vVKQw1wWEqwKBgFHfahFs0DZ5cUTpGpBt +F/AQG7ukgipB6N6AkB9kDbgCs1FLgd199MQrEncug5hfpq8QerbyMatmA+GXoGMb +7vztKEH85yzp4n02FNL6H7xL4VVILvyZHdolmiORJ4qT2hZnl8pEQ2TYuF4RlHUd +nSwXX+2o0J/nF85fm4AwWKAc +-----END PRIVATE KEY----- diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/client.pem b/tika-pipes/tika-grpc/src/test/resources/certs/client.pem new file mode 100644 
index 0000000000..8815875f32 --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/client.pem @@ -0,0 +1,20 @@ +-----BEGIN CERTIFICATE----- +MIIDNzCCAh8CFGyX00RCepOv/qCJ1oVdTtY92U83MA0GCSqGSIb3DQEBCwUAMFYx +CzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRl +cm5ldCBXaWRnaXRzIFB0eSBMdGQxDzANBgNVBAMMBnRlc3RjYTAeFw0yMDAzMTgw +MTA2MTBaFw0zMDAzMTYwMTA2MTBaMFoxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApT +b21lLVN0YXRlMSEwHwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQxEzAR +BgNVBAMMCnRlc3RjbGllbnQwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIB +AQCyqYRp+DXVp72NFbQH8hdhTZLycZXOlJhmMsrJmrjn2p7pI/8mTZ/0FC+SGWBG +ZV+ELiHrmCX5zfaILr9Iuw7Ghr3Vzoefi8r62rLupVPNi/qdqyjWk2dECHC9Z3+A +g3KzKTyerXWjKcvyKVmM0ZxE0RXhDW/RoQbqZsU2GKg1B2rhUU8KN0gVmKn0rJHO +xzRVSYeYLYp5Yn7KrtPJcKyo9aVuEr7dGANzpyF6lg/nYBWc+9SGwkoLdFvKvABY +JMyrbNhHUQfv0fzaZ0P86dfTENrDxzALrzGnqcx3KTrwJjkZ/aSr1tyD0/tXvukR +FiPxWBJhjHQ70GqTFQY19RbhAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAFXCewK8 +cWT+zWxXyGFnouFSBzTi0BMBJRrhsiNoiQxkqityJHWFExiQZie+7CA+EabXCQUB ++JwMSWM29j3mSw10DTfmC3rhheQqGxy304BZyUpdpvI2dt3p/mcsE7O+p4sQrSep +gijiDssKAfxTAmUM93N6+Q8yJK5immxlbeYfijoBvmkzyB/B+qNRPsx0n7aFGnfv +oWfkW296iPhWLiwknpC3xB6oK3vRbK4Zj1OaGb0grK7VN8EyhBix2xVF61i4dzCK +kMIpl7CUpw1Mb2z8q3F2bHBS7iF7g1Ccn5VGcO+aJ+6PWydaeqJ6VEBF0Nwv9woe +mL5AluNRLaqjZvE= +-----END CERTIFICATE----- diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/server1.key b/tika-pipes/tika-grpc/src/test/resources/certs/server1.key new file mode 100644 index 0000000000..086462992c --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/server1.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDnE443EknxvxBq +6+hvn/t09hl8hx366EBYvZmVM/NC+7igXRAjiJiA/mIaCvL3MS0Iz5hBLxSGICU+ +WproA3GCIFITIwcf/ETyWj/5xpgZ4AKrLrjQmmX8mhwUajfF3UvwMJrCOVqPp67t +PtP+2kBXaqrXdvnvXR41FsIB8V7zIAuIZB6bHQhiGVlc1sgZYsE2EGG9WMmHtS86 +qkAOTjG2XyjmPTGAwhGDpYkYrpzp99IiDh4/Veai81hn0ssQkbry0XRD/Ig3jcHh 
+23WiriPNJ0JsbgXUSLKRPZObA9VgOLy2aXoN84IMaeK3yy+cwSYG/99w93fUZJte +MXwz4oYZAgMBAAECggEBAIVn2Ncai+4xbH0OLWckabwgyJ4IM9rDc0LIU368O1kU +koais8qP9dujAWgfoh3sGh/YGgKn96VnsZjKHlyMgF+r4TaDJn3k2rlAOWcurGlj +1qaVlsV4HiEzp7pxiDmHhWvp4672Bb6iBG+bsjCUOEk/n9o9KhZzIBluRhtxCmw5 +nw4Do7z00PTvN81260uPWSc04IrytvZUiAIx/5qxD72bij2xJ8t/I9GI8g4FtoVB +8pB6S/hJX1PZhh9VlU6Yk+TOfOVnbebG4W5138LkB835eqk3Zz0qsbc2euoi8Hxi +y1VGwQEmMQ63jXz4c6g+X55ifvUK9Jpn5E8pq+pMd7ECgYEA93lYq+Cr54K4ey5t +sWMa+ye5RqxjzgXj2Kqr55jb54VWG7wp2iGbg8FMlkQwzTJwebzDyCSatguEZLuB +gRGroRnsUOy9vBvhKPOch9bfKIl6qOgzMJB267fBVWx5ybnRbWN/I7RvMQf3k+9y +biCIVnxDLEEYyx7z85/5qxsXg/MCgYEA7wmWKtCTn032Hy9P8OL49T0X6Z8FlkDC +Rk42ygrc/MUbugq9RGUxcCxoImOG9JXUpEtUe31YDm2j+/nbvrjl6/bP2qWs0V7l +dTJl6dABP51pCw8+l4cWgBBX08Lkeen812AAFNrjmDCjX6rHjWHLJcpS18fnRRkP +V1d/AHWX7MMCgYEA6Gsw2guhp0Zf2GCcaNK5DlQab8OL4Hwrpttzo4kuTlwtqNKp +Q9H4al9qfF4Cr1TFya98+EVYf8yFRM3NLNjZpe3gwYf2EerlJj7VLcahw0KKzoN1 +QBENfwgPLRk5sDkx9VhSmcfl/diLroZdpAwtv3vo4nEoxeuGFbKTGx3Qkf0CgYEA +xyR+dcb05Ygm3w4klHQTowQ10s1H80iaUcZBgQuR1ghEtDbUPZHsoR5t1xCB02ys +DgAwLv1bChIvxvH/L6KM8ovZ2LekBX4AviWxoBxJnfz/EVau98B0b1auRN6eSC83 +FRuGldlSOW1z/nSh8ViizSYE5H5HX1qkXEippvFRE88CgYB3Bfu3YQY60ITWIShv +nNkdcbTT9eoP9suaRJjw92Ln+7ZpALYlQMKUZmJ/5uBmLs4RFwUTQruLOPL4yLTH +awADWUzs3IRr1fwn9E+zM8JVyKCnUEM3w4N5UZskGO2klashAd30hWO+knRv/y0r +uGIYs9Ek7YXlXIRVrzMwcsrt1w== +-----END PRIVATE KEY----- diff --git a/tika-pipes/tika-grpc/src/test/resources/certs/server1.pem b/tika-pipes/tika-grpc/src/test/resources/certs/server1.pem new file mode 100644 index 0000000000..88244f856c --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/certs/server1.pem @@ -0,0 +1,22 @@ +-----BEGIN CERTIFICATE----- +MIIDtDCCApygAwIBAgIUbJfTREJ6k6/+oInWhV1O1j3ZT0IwDQYJKoZIhvcNAQEL +BQAwVjELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM +GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEPMA0GA1UEAwwGdGVzdGNhMB4XDTIw +MDMxODAzMTA0MloXDTMwMDMxNjAzMTA0MlowZTELMAkGA1UEBhMCVVMxETAPBgNV 
+BAgMCElsbGlub2lzMRAwDgYDVQQHDAdDaGljYWdvMRUwEwYDVQQKDAxFeGFtcGxl +LCBDby4xGjAYBgNVBAMMESoudGVzdC5nb29nbGUuY29tMIIBIjANBgkqhkiG9w0B +AQEFAAOCAQ8AMIIBCgKCAQEA5xOONxJJ8b8Qauvob5/7dPYZfIcd+uhAWL2ZlTPz +Qvu4oF0QI4iYgP5iGgry9zEtCM+YQS8UhiAlPlqa6ANxgiBSEyMHH/xE8lo/+caY +GeACqy640Jpl/JocFGo3xd1L8DCawjlaj6eu7T7T/tpAV2qq13b5710eNRbCAfFe +8yALiGQemx0IYhlZXNbIGWLBNhBhvVjJh7UvOqpADk4xtl8o5j0xgMIRg6WJGK6c +6ffSIg4eP1XmovNYZ9LLEJG68tF0Q/yIN43B4dt1oq4jzSdCbG4F1EiykT2TmwPV +YDi8tml6DfOCDGnit8svnMEmBv/fcPd31GSbXjF8M+KGGQIDAQABo2swaTAJBgNV +HRMEAjAAMAsGA1UdDwQEAwIF4DBPBgNVHREESDBGghAqLnRlc3QuZ29vZ2xlLmZy +ghh3YXRlcnpvb2kudGVzdC5nb29nbGUuYmWCEioudGVzdC55b3V0dWJlLmNvbYcE +wKgBAzANBgkqhkiG9w0BAQsFAAOCAQEAS8hDQA8PSgipgAml7Q3/djwQ644ghWQv +C2Kb+r30RCY1EyKNhnQnIIh/OUbBZvh0M0iYsy6xqXgfDhCB93AA6j0i5cS8fkhH +Jl4RK0tSkGQ3YNY4NzXwQP/vmUgfkw8VBAZ4Y4GKxppdATjffIW+srbAmdDruIRM +wPeikgOoRrXf0LA1fi4TqxARzeRwenQpayNfGHTvVF9aJkl8HoaMunTAdG5pIVcr +9GKi/gEMpXUJbbVv3U5frX1Wo4CFo+rZWJ/LyCMeb0jciNLxSdMwj/E/ZuExlyeZ +gc9ctPjSMvgSyXEKv6Vwobleeg88V2ZgzenziORoWj4KszG/lbQZvg== +-----END CERTIFICATE----- diff --git a/tika-pipes/tika-grpc/src/test/resources/log4j2.xml b/tika-pipes/tika-grpc/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..c88e66e99e --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/log4j2.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tika-pipes/tika-grpc/src/test/resources/test-files/014760.docx b/tika-pipes/tika-grpc/src/test/resources/test-files/014760.docx new file mode 100644 index 0000000000..07bb0ea4ef Binary files /dev/null and b/tika-pipes/tika-grpc/src/test/resources/test-files/014760.docx differ diff --git a/tika-pipes/tika-grpc/src/test/resources/test-files/017091.docx b/tika-pipes/tika-grpc/src/test/resources/test-files/017091.docx new file mode 100644 index 0000000000..03919a30e1 Binary files /dev/null and b/tika-pipes/tika-grpc/src/test/resources/test-files/017091.docx differ diff --git 
a/tika-pipes/tika-grpc/src/test/resources/test-files/017097.docx b/tika-pipes/tika-grpc/src/test/resources/test-files/017097.docx new file mode 100644 index 0000000000..c3b9c4f454 Binary files /dev/null and b/tika-pipes/tika-grpc/src/test/resources/test-files/017097.docx differ diff --git a/tika-pipes/tika-grpc/src/test/resources/test-files/018367.docx b/tika-pipes/tika-grpc/src/test/resources/test-files/018367.docx new file mode 100644 index 0000000000..7d14aee263 Binary files /dev/null and b/tika-pipes/tika-grpc/src/test/resources/test-files/018367.docx differ diff --git a/tika-pipes/tika-grpc/src/test/resources/tika-pipes-test-config.xml b/tika-pipes/tika-grpc/src/test/resources/tika-pipes-test-config.xml new file mode 100644 index 0000000000..e4006edb35 --- /dev/null +++ b/tika-pipes/tika-grpc/src/test/resources/tika-pipes-test-config.xml @@ -0,0 +1,35 @@ + + + + 600 + 60 + + + + 2 + + -Xmx1g + -XX:ParallelGCThreads=2 + + 60000 + -1 + + + + + diff --git a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java index e29c03b2cc..4919c17aee 100644 --- a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java +++ b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java @@ -337,7 +337,9 @@ private void addCredentialsProvider(HttpClientBuilder builder) throws TikaConfig authSchemeRegistry = RegistryBuilder.create() .register("ntlm", new NTLMSchemeFactory()).build(); } - provider.setCredentials(AuthScope.ANY, credentials); + if (credentials != null) { + provider.setCredentials(AuthScope.ANY, credentials); + } builder.setDefaultCredentialsProvider(provider); builder.setDefaultAuthSchemeRegistry(authSchemeRegistry);