Skip to content

Commit c28c558

Browse files
committed
Add GoogleFetcher
This allows the fetching of items using files.get from Google Drive
1 parent 6feee8f commit c28c558

File tree

6 files changed

+383
-2
lines changed

6 files changed

+383
-2
lines changed

tika-pipes/tika-fetchers/pom.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
<module>tika-fetcher-gcs</module>
3838
<module>tika-fetcher-az-blob</module>
3939
<module>tika-fetcher-microsoft-graph</module>
40+
<module>tika-fetcher-google</module>
4041
</modules>
4142

4243
<dependencies>
@@ -45,4 +46,4 @@
4546
<scm>
4647
<tag>3.0.0-BETA-rc1</tag>
4748
</scm>
48-
</project>
49+
</project>
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0"
2+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4+
<modelVersion>4.0.0</modelVersion>
5+
6+
<parent>
7+
<artifactId>tika-fetchers</artifactId>
8+
<groupId>org.apache.tika</groupId>
9+
<version>3.0.0-SNAPSHOT</version>
10+
</parent>
11+
12+
<artifactId>tika-fetcher-google</artifactId>
13+
<name>Google Tika Pipes Fetcher</name>
14+
15+
<properties>
16+
<google.api.client.version>2.2.0</google.api.client.version>
17+
<maven.compiler.source>11</maven.compiler.source>
18+
<maven.compiler.target>11</maven.compiler.target>
19+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
20+
<azure-identity.version>1.11.0</azure-identity.version>
21+
<microsoft-graph.version>6.4.0</microsoft-graph.version>
22+
<microsoft-kiota-serialization-json.version>1.1.1</microsoft-kiota-serialization-json.version>
23+
<junit-jupiter-engine.version>5.11.0-M2</junit-jupiter-engine.version>
24+
<wiremock.version>3.3.1</wiremock.version>
25+
<mockito-junit-jupiter.version>5.3.1</mockito-junit-jupiter.version>
26+
<nimbus-jose-jwt.version>9.37.3</nimbus-jose-jwt.version>
27+
</properties>
28+
29+
<dependencies>
30+
<!-- Apache Tika Core -->
31+
<dependency>
32+
<groupId>${project.groupId}</groupId>
33+
<artifactId>tika-core</artifactId>
34+
<version>${project.version}</version>
35+
</dependency>
36+
37+
<!-- Google Drive API Client -->
38+
<dependency>
39+
<groupId>com.google.api-client</groupId>
40+
<artifactId>google-api-client</artifactId>
41+
<version>${google.api.client.version}</version>
42+
</dependency>
43+
44+
<dependency>
45+
<groupId>com.google.auth</groupId>
46+
<artifactId>google-auth-library-oauth2-http</artifactId>
47+
<version>1.19.0</version>
48+
</dependency>
49+
50+
<!-- Google Drive API -->
51+
<dependency>
52+
<groupId>com.google.apis</groupId>
53+
<artifactId>google-api-services-drive</artifactId>
54+
<version>v3-rev20241027-2.0.0</version>
55+
</dependency>
56+
57+
<!-- Logging -->
58+
<dependency>
59+
<groupId>org.slf4j</groupId>
60+
<artifactId>slf4j-api</artifactId>
61+
</dependency>
62+
63+
<!-- Apache Commons IO -->
64+
<dependency>
65+
<groupId>commons-io</groupId>
66+
<artifactId>commons-io</artifactId>
67+
</dependency>
68+
69+
<!-- Test Dependencies -->
70+
<dependency>
71+
<groupId>org.junit.jupiter</groupId>
72+
<artifactId>junit-jupiter</artifactId>
73+
<scope>test</scope>
74+
</dependency>
75+
</dependencies>
76+
77+
<build>
78+
<plugins>
79+
<plugin>
80+
<groupId>org.apache.maven.plugins</groupId>
81+
<artifactId>maven-compiler-plugin</artifactId>
82+
<configuration>
83+
<archive>
84+
<manifestEntries>
85+
<Automatic-Module-Name>org.apache.tika.pipes.fetcher.s3</Automatic-Module-Name>
86+
</manifestEntries>
87+
</archive>
88+
</configuration>
89+
</plugin>
90+
</plugins>
91+
</build>
92+
93+
<scm>
94+
<tag>3.0.0-BETA-rc1</tag>
95+
</scm>
96+
</project>
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.pipes.fetchers.google;
18+
19+
import java.io.ByteArrayInputStream;
20+
import java.io.File;
21+
import java.io.IOException;
22+
import java.io.InputStream;
23+
import java.nio.file.Files;
24+
import java.util.ArrayList;
25+
import java.util.Base64;
26+
import java.util.List;
27+
import java.util.Map;
28+
29+
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
30+
import com.google.api.client.http.HttpRequestInitializer;
31+
import com.google.api.client.json.JsonFactory;
32+
import com.google.api.client.json.gson.GsonFactory;
33+
import com.google.api.services.drive.Drive;
34+
import com.google.api.services.drive.DriveScopes;
35+
import com.google.auth.http.HttpCredentialsAdapter;
36+
import com.google.auth.oauth2.GoogleCredentials;
37+
import org.apache.commons.io.FileUtils;
38+
import org.slf4j.Logger;
39+
import org.slf4j.LoggerFactory;
40+
41+
import org.apache.tika.config.Field;
42+
import org.apache.tika.config.Initializable;
43+
import org.apache.tika.config.InitializableProblemHandler;
44+
import org.apache.tika.config.Param;
45+
import org.apache.tika.exception.TikaConfigException;
46+
import org.apache.tika.exception.TikaException;
47+
import org.apache.tika.io.TikaInputStream;
48+
import org.apache.tika.metadata.Metadata;
49+
import org.apache.tika.parser.ParseContext;
50+
import org.apache.tika.pipes.fetcher.AbstractFetcher;
51+
import org.apache.tika.pipes.fetchers.google.config.GoogleFetcherConfig;
52+
53+
54+
/**
55+
* Google Fetcher allows the fetching of files from a Google Drive, using a
56+
* service account key.
57+
*
58+
* Fetch Keys are ${fileId},${subjectUser}, where the subject user is the
59+
* organizer of the file. This user is necessary as part of the key as the
60+
* service account must act on behalf of the user when querying for the file.
61+
*/
62+
public class GoogleFetcher extends AbstractFetcher implements Initializable {
63+
private static final Logger LOGGER = LoggerFactory.getLogger(GoogleFetcher.class);
64+
private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance();
65+
66+
private GoogleCredentials baseCredentials;
67+
68+
private Drive driveService;
69+
private boolean spoolToTemp;
70+
private List<String> scopes;
71+
72+
private GoogleFetcherConfig config = new GoogleFetcherConfig();
73+
74+
public GoogleFetcher() {
75+
scopes = new ArrayList<>();
76+
scopes.add(DriveScopes.DRIVE_READONLY);
77+
}
78+
79+
public GoogleFetcher(GoogleFetcherConfig config) {
80+
this.config = config;
81+
}
82+
83+
@Field
84+
public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException {
85+
String[] longStrings = (commaDelimitedLongs == null ? "" : commaDelimitedLongs).split(",");
86+
long[] seconds = new long[longStrings.length];
87+
for (int i = 0; i < longStrings.length; i++) {
88+
try {
89+
seconds[i] = Long.parseLong(longStrings[i]);
90+
} catch (NumberFormatException e) {
91+
throw new TikaConfigException(e.getMessage());
92+
}
93+
}
94+
setThrottleSeconds(seconds);
95+
}
96+
97+
public void setThrottleSeconds(long[] throttleSeconds) {
98+
config.setThrottleSeconds(throttleSeconds);
99+
}
100+
101+
@Field
102+
public void setSpoolToTemp(boolean spoolToTemp) {
103+
config.setSpoolToTemp(spoolToTemp);
104+
}
105+
106+
@Field
107+
public void setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
108+
config.setServiceAccountKeyBase64(serviceAccountKeyBase64);
109+
}
110+
111+
@Field
112+
public void setSubjectUser(String subjectUser) {
113+
config.setSubjectUser(subjectUser);
114+
}
115+
116+
@Field
117+
public void setScopes(List<String> scopes) {
118+
config.setScopes(new ArrayList<>(scopes));
119+
if (config.getScopes().isEmpty()) {
120+
config.getScopes().add(DriveScopes.DRIVE_READONLY);
121+
}
122+
}
123+
124+
@Override
125+
public void initialize(Map<String, Param> map) throws TikaConfigException {
126+
try {
127+
baseCredentials = GoogleCredentials
128+
.fromStream(new ByteArrayInputStream(Base64.getDecoder().decode(config.getServiceAccountKeyBase64())))
129+
.createScoped(scopes);
130+
} catch (IOException e) {
131+
throw new TikaConfigException("Failed to initialize Google Drive service", e);
132+
}
133+
}
134+
135+
@Override
136+
public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException {
137+
}
138+
139+
@Override
140+
public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException {
141+
int tries = 0;
142+
Exception ex = null;
143+
144+
do {
145+
long start = System.currentTimeMillis();
146+
try {
147+
String[] fetchKeySplit = fetchKey.split(",");
148+
if (fetchKeySplit.length != 2) {
149+
throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey);
150+
}
151+
152+
String fileId = fetchKeySplit[0];
153+
String subjectUser = fetchKeySplit[1];
154+
155+
GoogleCredentials delegatedCredentials = baseCredentials.createDelegated(subjectUser);
156+
final HttpRequestInitializer requestInitializer = new HttpCredentialsAdapter(delegatedCredentials);
157+
158+
driveService = new Drive.Builder(
159+
GoogleNetHttpTransport.newTrustedTransport(),
160+
JSON_FACTORY,
161+
requestInitializer).setApplicationName("tika-fetcher-google").build();
162+
163+
InputStream is = driveService.files()
164+
.get(fileId)
165+
.executeMediaAsInputStream();
166+
167+
if (is == null) {
168+
throw new IOException("Empty input stream when we tried to parse " + fetchKey);
169+
}
170+
171+
if (spoolToTemp) {
172+
File tempFile = Files.createTempFile("spooled-temp", ".dat").toFile();
173+
FileUtils.copyInputStreamToFile(is, tempFile);
174+
LOGGER.info("Spooled to temp file {}", tempFile);
175+
return TikaInputStream.get(tempFile.toPath());
176+
}
177+
return TikaInputStream.get(is);
178+
179+
} catch (Exception e) {
180+
LOGGER.warn("Exception fetching on retry=" + tries, e);
181+
ex = e;
182+
} finally {
183+
long elapsed = System.currentTimeMillis() - start;
184+
LOGGER.debug("Total to fetch {}", elapsed);
185+
}
186+
187+
long[] throttleSeconds = config.getThrottleSeconds();
188+
189+
LOGGER.warn("Sleeping for {} seconds before retry", throttleSeconds[tries]);
190+
try {
191+
Thread.sleep(throttleSeconds[tries] * 1000);
192+
} catch (InterruptedException e) {
193+
Thread.currentThread().interrupt();
194+
}
195+
} while (++tries < config.getThrottleSeconds().length);
196+
197+
throw new TikaException("Could not fetch " + fetchKey, ex);
198+
}
199+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.pipes.fetchers.google.config;
18+
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
22+
import org.apache.tika.pipes.fetcher.config.AbstractConfig;
23+
24+
public class GoogleFetcherConfig extends AbstractConfig {
25+
private long[] throttleSeconds;
26+
private boolean spoolToTemp;
27+
protected String serviceAccountKeyBase64;
28+
protected String subjectUser;
29+
private List<String> scopes = new ArrayList<>();
30+
31+
public boolean isSpoolToTemp() {
32+
return spoolToTemp;
33+
}
34+
35+
public GoogleFetcherConfig setSpoolToTemp(boolean spoolToTemp) {
36+
this.spoolToTemp = spoolToTemp;
37+
return this;
38+
}
39+
40+
public long[] getThrottleSeconds() {
41+
return throttleSeconds;
42+
}
43+
44+
public GoogleFetcherConfig setThrottleSeconds(long[] throttleSeconds) {
45+
this.throttleSeconds = throttleSeconds;
46+
return this;
47+
}
48+
49+
public String getServiceAccountKeyBase64() {
50+
return serviceAccountKeyBase64;
51+
}
52+
53+
public GoogleFetcherConfig setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
54+
this.serviceAccountKeyBase64 = serviceAccountKeyBase64;
55+
return this;
56+
}
57+
58+
public String getSubjectUser() {
59+
return subjectUser;
60+
}
61+
62+
public GoogleFetcherConfig setSubjectUser(String subjectUser) {
63+
this.subjectUser = subjectUser;
64+
return this;
65+
}
66+
67+
public List<String> getScopes() {
68+
return scopes;
69+
}
70+
71+
public GoogleFetcherConfig setScopes(List<String> scopes) {
72+
this.scopes = scopes;
73+
return this;
74+
}
75+
}

tika-pipes/tika-grpc/example-dockerfile/docker-build.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,10 @@ docker buildx create --name tikabuilder
3838
# see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
3939
docker run --rm --privileged tonistiigi/binfmt --install amd64
4040
docker run --rm --privileged tonistiigi/binfmt --install arm64
41-
docker buildx build --builder=tikabuilder "${OUT_DIR}" -t "${TAG_NAME}" --platform linux/amd64,linux/arm64 --push
41+
REPO_URL=$(echo ${TAG_NAME} | cut -d'/' -f 1)
42+
aws ecr get-login-password | docker login --username AWS --password-stdin ${REPO_URL}
43+
docker buildx build --builder=tikabuilder "${OUT_DIR}" \
44+
-t ${TAG_NAME} \
45+
--platform linux/amd64,linux/arm64 \
46+
--push
4247
docker buildx stop tikabuilder

0 commit comments

Comments
 (0)