Skip to content

Commit 15abcaf

Browse files
HADOOP-19343: Add native support for GCS connector
Closes #7869 Co-authored-by: Chris Nauroth <[email protected]> Signed-off-by: Shilun Fan <[email protected]> Signed-off-by: Steve Loughran <[email protected]> Signed-off-by: Chris Nauroth <[email protected]> Signed-off-by: Mukund Thakur <[email protected]> Signed-off-by: Ayush Saxena <[email protected]> Reviewed-by: Arunkumar Chacko <[email protected]> Reviewed-by: Cheng Pan <[email protected]>
1 parent 183b576 commit 15abcaf

File tree

73 files changed

+10179
-7
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+10179
-7
lines changed

LICENSE-binary

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,25 @@ com.fasterxml.woodstox:woodstox-core:5.4.0
229229
com.github.ben-manes.caffeine:caffeine:2.9.3
230230
com.github.davidmoten:rxjava-extras:0.8.0.17
231231
com.github.stephenc.jcip:jcip-annotations:1.0-1
232+
com.google.api:api-common:2.47.2
233+
com.google.api-client:google-api-client:2.7.2
234+
com.google.api:gax:2.64.2
235+
com.google.api:gax-grpc:2.64.2
236+
com.google.api:gax-httpjson:2.64.2
237+
com.google.api.grpc:gapic-google-cloud-storage-v2:2.52.0
238+
com.google.api.grpc:grpc-google-cloud-storage-v2:2.52.0
239+
com.google.api.grpc:proto-google-cloud-monitoring-v3:3.52.0
240+
com.google.api.grpc:proto-google-cloud-storage-v2:2.52.0
241+
com.google.api.grpc:proto-google-common-protos:2.55.2
242+
com.google.api.grpc:proto-google-iam-v1:1.50.2
243+
com.google.apis:google-api-services-storage:v1-rev20250420-2.0.0
244+
com.google.auto.value:auto-value-annotations:1.11.0
245+
com.google.cloud:google-cloud-core-grpc:2.54.2
246+
com.google.cloud:google-cloud-core-http:2.54.2
247+
com.google.cloud:google-cloud-core:2.54.2
248+
com.google.cloud:google-cloud-monitoring:3.52.0
249+
com.google.cloud.opentelemetry:exporter-metrics:0.33.0
250+
com.google.errorprone:error_prone_annotations:2.36.0
232251
com.google:guice:5.1.0
233252
com.google:guice-servlet:5.1.0
234253
com.google.api.grpc:proto-google-common-protos:1.0.0
@@ -237,9 +256,18 @@ com.google.errorprone:error_prone_annotations:2.5.1
237256
com.google.j2objc:j2objc-annotations:1.3
238257
com.google.json-simple:json-simple:1.1.1
239258
com.google.guava:failureaccess:1.0
259+
com.google.guava:failureaccess:1.0.2
240260
com.google.guava:guava:20.0
241261
com.google.guava:guava:32.0.1-jre
262+
com.google.guava:guava:33.1.0-jre
242263
com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava
264+
com.google.http-client:google-http-client-apache-v2:1.46.3
265+
com.google.http-client:google-http-client-appengine:1.46.3
266+
com.google.http-client:google-http-client-gson:1.46.3
267+
com.google.http-client:google-http-client-jackson2:1.46.3
268+
com.google.http-client:google-http-client:1.46.3
269+
com.google.j2objc:j2objc-annotations:3.0.0
270+
com.google.oauth-client:google-oauth-client:1.37.0
243271
com.microsoft.azure:azure-storage:7.0.0
244272
com.nimbusds:nimbus-jose-jwt:9.37.2
245273
com.zaxxer:HikariCP:4.0.3
@@ -251,13 +279,29 @@ commons-daemon:commons-daemon:1.0.13
251279
commons-io:commons-io:2.16.1
252280
commons-net:commons-net:3.9.0
253281
de.ruedigermoeller:fst:2.50
282+
io.grpc:grpc-alts:1.70.0
254283
io.grpc:grpc-api:1.69.0
284+
io.grpc:grpc-api:1.70.0
285+
io.grpc:grpc-auth:1.70.0
255286
io.grpc:grpc-context:1.69.0
287+
io.grpc:grpc-context:1.70.0
256288
io.grpc:grpc-core:1.69.0
289+
io.grpc:grpc-core:1.70.0
290+
io.grpc:grpc-googleapis:1.70.0
291+
io.grpc:grpc-grpclb:1.70.0
292+
io.grpc:grpc-inprocess:1.70.0
257293
io.grpc:grpc-netty:1.69.0
294+
io.grpc:grpc-netty-shaded:1.70.0
295+
io.grpc:grpc-opentelemetry:1.70.0
258296
io.grpc:grpc-protobuf:1.69.0
297+
io.grpc:grpc-protobuf:1.70.0
259298
io.grpc:grpc-protobuf-lite:1.69.0
299+
io.grpc:grpc-rls:1.70.0
300+
io.grpc:grpc-services:1.70.0
260301
io.grpc:grpc-stub:1.69.0
302+
io.grpc:grpc-stub:1.70.0
303+
io.grpc:grpc-util:1.70.0
304+
io.grpc:grpc-xds:1.70.0
261305
io.netty:netty-all:4.1.118.Final
262306
io.netty:netty-buffer:4.1.118.Final
263307
io.netty:netty-codec:4.1.118.Final
@@ -289,7 +333,19 @@ io.netty:netty-transport-native-epoll:4.1.118.Final
289333
io.netty:netty-transport-native-kqueue:4.1.118.Final
290334
io.netty:netty-resolver-dns-native-macos:4.1.118.Final
291335
io.opencensus:opencensus-api:0.12.3
336+
io.opencensus:opencensus-api:0.31.1
292337
io.opencensus:opencensus-contrib-grpc-metrics:0.12.3
338+
io.opencensus:opencensus-contrib-http-util:0.31.1
339+
io.opentelemetry.contrib:opentelemetry-gcp-resources:1.37.0-alpha
340+
io.opentelemetry:opentelemetry-api:1.47.0
341+
io.opentelemetry:opentelemetry-context:1.47.0
342+
io.opentelemetry:opentelemetry-sdk-common:1.47.0
343+
io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:1.47.0
344+
io.opentelemetry:opentelemetry-sdk:1.47.0
345+
io.opentelemetry:opentelemetry-sdk-logs:1.47.0
346+
io.opentelemetry:opentelemetry-sdk-metrics:1.47.0
347+
io.opentelemetry:opentelemetry-sdk-trace:1.47.0
348+
io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha
293349
io.reactivex:rxjava:1.3.8
294350
io.reactivex:rxjava-string:1.1.1
295351
io.reactivex:rxnetty:0.4.20
@@ -340,6 +396,7 @@ org.apache.solr:solr-solrj:8.11.2
340396
org.apache.yetus:audience-annotations:0.5.0
341397
org.apache.zookeeper:zookeeper:3.8.4
342398
org.codehaus.jettison:jettison:1.5.4
399+
org.conscrypt:conscrypt-openjdk-uber:2.5.2
343400
org.eclipse.jetty:jetty-annotations:9.4.57.v20241219
344401
org.eclipse.jetty:jetty-http:9.4.57.v20241219
345402
org.eclipse.jetty:jetty-io:9.4.57.v20241219
@@ -394,18 +451,22 @@ hadoop-tools/hadoop-sls/src/main/html/js/thirdparty/d3.v3.js
394451
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/static/d3-3.5.17.min.js
395452
leveldb v1.13
396453

454+
com.google.auth:google-auth-library-credentials:1.33.1
455+
com.google.auth:google-auth-library-oauth2-http:1.33.1
397456
com.google.protobuf:protobuf-java:2.5.0
398457
com.google.protobuf:protobuf-java:3.25.5
399458
com.google.re2j:re2j:1.1
400459
com.jcraft:jsch:0.1.55
401460
com.thoughtworks.paranamer:paranamer:2.3
402461
jakarta.activation:jakarta.activation-api:1.2.1
462+
org.checkerframework:checker-qual:3.49.0
403463
org.fusesource.leveldbjni:leveldbjni-all:1.8
404464
org.jline:jline:3.9.0
405465
org.hamcrest:hamcrest-core:1.3
406466
org.ow2.asm:asm:5.0.4
407467
org.ow2.asm:asm-commons:6.0
408468
org.ow2.asm:asm-tree:6.0
469+
org.threeten:threetenbp:1.7.0
409470

410471

411472
MIT License

hadoop-cloud-storage-project/hadoop-cloud-storage/pom.xml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,5 +130,20 @@
130130
<artifactId>hadoop-tos</artifactId>
131131
<scope>compile</scope>
132132
</dependency>
133+
<dependency>
134+
<groupId>org.apache.hadoop</groupId>
135+
<artifactId>hadoop-gcp</artifactId>
136+
<scope>compile</scope>
137+
<!--
138+
Exclude transitive dependencies to prevent dependency convergence
139+
problems. hadoop-gcp is a self-contained shaded jar.
140+
-->
141+
<exclusions>
142+
<exclusion>
143+
<groupId>*</groupId>
144+
<artifactId>*</artifactId>
145+
</exclusion>
146+
</exclusions>
147+
</dependency>
133148
</dependencies>
134149
</project>

hadoop-common-project/hadoop-common/src/main/resources/core-default.xml

Lines changed: 193 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,7 +1285,7 @@
12851285

12861286
<property>
12871287
<name>fs.viewfs.overload.scheme.target.gs.impl</name>
1288-
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
1288+
<value>org.apache.hadoop.fs.gs.GoogleHadoopFileSystem</value>
12891289
<description>The GoogleHadoopFileSystem/Google Cloud Storage file system for view
12901290
file system overload scheme when child file system and ViewFSOverloadScheme's
12911291
schemes are gs.
@@ -2373,12 +2373,6 @@ The switch to turn S3A auditing on or off.
23732373
otherwise fall back to hadoop.tmp.dir </description>
23742374
</property>
23752375

2376-
<property>
2377-
<name>fs.AbstractFileSystem.gs.impl</name>
2378-
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
2379-
<description>The AbstractFileSystem for gs: uris.</description>
2380-
</property>
2381-
23822376
<property>
23832377
<name>fs.azure.enable.readahead</name>
23842378
<value>true</value>
@@ -4509,4 +4503,196 @@ The switch to turn S3A auditing on or off.
45094503
If the value is less than or equal to 0, the cache is disabled entirely.
45104504
</description>
45114505
</property>
4506+
4507+
<property>
4508+
<name>fs.gs.impl</name>
4509+
<value>org.apache.hadoop.fs.gs.GoogleHadoopFileSystem</value>
4510+
<description>The FileSystem for gs: uris.</description>
4511+
</property>
4512+
4513+
<property>
4514+
<name>fs.AbstractFileSystem.gs.impl</name>
4515+
<value>org.apache.hadoop.fs.gs.Gs</value>
4516+
<description>The AbstractFileSystem for gs: uris.</description>
4517+
</property>
4518+
4519+
<property>
4520+
<name>fs.gs.project.id</name>
4521+
<description>
4522+
Google Cloud Project ID with access to Google Cloud Storage buckets.
4523+
Required only for list buckets and create bucket operations.
4524+
</description>
4525+
</property>
4526+
4527+
<property>
4528+
<name>fs.gs.working.dir</name>
4529+
<value>/</value>
4530+
<description>
4531+
The directory inside the default bucket against which relative gs: uris are resolved.
4532+
</description>
4533+
</property>
4534+
4535+
<property>
4536+
<name>fs.gs.rewrite.max.chunk.size</name>
4537+
<value>512m</value>
4538+
<description>
4539+
Maximum size of object chunk that will be rewritten in a single rewrite
4540+
request when fs.gs.copy.with.rewrite.enable is set to true.
4541+
</description>
4542+
</property>
4543+
4544+
<property>
4545+
<name>fs.gs.bucket.delete.enable</name>
4546+
<value>false</value>
4547+
<description>
4548+
If true, recursive delete on a path that refers to a Cloud Storage bucket
4549+
itself or delete on that path when it is empty will result in deletion of
4550+
the bucket itself. If false, any operation that normally would have
4551+
deleted the bucket will be ignored. Setting to false preserves the typical
4552+
behavior of rm -rf / which translates to deleting everything inside of
4553+
root, but without clobbering the filesystem authority corresponding to that
4554+
root path in the process.
4555+
</description>
4556+
</property>
4557+
4558+
<property>
4559+
<name>fs.gs.block.size</name>
4560+
<value>64m</value>
4561+
<description>
4562+
The reported block size of the file system. This does not change any
4563+
behavior of the connector or the underlying Google Cloud Storage objects.
4564+
However, it will affect the number of splits Hadoop MapReduce uses for a
4565+
given input.
4566+
</description>
4567+
</property>
4568+
4569+
<property>
4570+
<name>fs.gs.create.items.conflict.check.enable</name>
4571+
<value>true</value>
4572+
<description>
4573+
Enables a check that ensures that conflicting directories do not exist when
4574+
creating files and conflicting files do not exist when creating directories.
4575+
</description>
4576+
</property>
4577+
4578+
<property>
4579+
<name>fs.gs.marker.file.pattern</name>
4580+
<description>
4581+
If set, files that match specified pattern are copied last during folder
4582+
rename operation.
4583+
</description>
4584+
</property>
4585+
4586+
<property>
4587+
<name>fs.gs.auth.type</name>
4588+
<value>COMPUTE_ENGINE</value>
4589+
<description>
4590+
What type of authentication mechanism to use for Google Cloud Storage
4591+
access. Valid values: APPLICATION_DEFAULT, COMPUTE_ENGINE,
4592+
SERVICE_ACCOUNT_JSON_KEYFILE, UNAUTHENTICATED, USER_CREDENTIALS.
4593+
</description>
4594+
</property>
4595+
4596+
<property>
4597+
<name>fs.gs.auth.service.account.json.keyfile</name>
4598+
<description>
4599+
The path to the JSON keyfile for the service account when fs.gs.auth.type
4600+
property is set to SERVICE_ACCOUNT_JSON_KEYFILE. The file must exist at
4601+
the same path on all nodes.
4602+
</description>
4603+
</property>
4604+
4605+
<property>
4606+
<name>fs.gs.auth.client.id</name>
4607+
<description>
4608+
The OAuth2 client ID.
4609+
</description>
4610+
</property>
4611+
4612+
<property>
4613+
<name>fs.gs.auth.client.secret</name>
4614+
<description>
4615+
The OAuth2 client secret.
4616+
</description>
4617+
</property>
4618+
4619+
<property>
4620+
<name>fs.gs.auth.refresh.token</name>
4621+
<description>
4622+
The refresh token.
4623+
</description>
4624+
</property>
4625+
4626+
<property>
4627+
<name>fs.gs.inputstream.support.gzip.encoding.enable</name>
4628+
<value>false</value>
4629+
<description>
4630+
If set to false then reading files with GZIP content encoding (HTTP header
4631+
Content-Encoding: gzip) will result in failure (IOException is thrown).
4632+
4633+
This feature is disabled by default because processing of
4634+
GZIP encoded files is inefficient and error-prone in Hadoop and Spark.
4635+
</description>
4636+
</property>
4637+
4638+
<property>
4639+
<name>fs.gs.outputstream.buffer.size</name>
4640+
<value>8m</value>
4641+
<description>
4642+
Write buffer size used by the file system API to send the data to be
4643+
uploaded to the Cloud Storage upload thread via pipes. The various pipe types
4644+
are documented below.
4645+
</description>
4646+
</property>
4647+
4648+
<property>
4649+
<name>fs.gs.outputstream.sync.min.interval</name>
4650+
<value>0</value>
4651+
<description>
4652+
Output stream configuration that controls the minimum interval between
4653+
consecutive syncs. This helps avoid getting rate-limited by Google Cloud
4654+
Storage. Default is 0 - no wait between syncs. Note that hflush() will
4655+
be a no-op if called more frequently than the minimum sync interval and hsync()
4656+
will block until the end of the minimum sync interval.
4657+
</description>
4658+
</property>
4659+
4660+
<property>
4661+
<name>fs.gs.inputstream.fadvise</name>
4662+
<value>AUTO</value>
4663+
<description>
4664+
Tunes object read behavior to optimize HTTP GET requests for various use
4665+
cases. Valid values: SEQUENTIAL, RANDOM, AUTO, AUTO_RANDOM.
4666+
</description>
4667+
</property>
4668+
4669+
<property>
4670+
<name>fs.gs.fadvise.request.track.count</name>
4671+
<value>3</value>
4672+
<description>
4673+
Self-adaptive fadvise mode uses the distance between served requests to
4674+
decide the access pattern. This property controls how many such requests
4675+
need to be tracked. It is used when AUTO_RANDOM is selected.
4676+
</description>
4677+
</property>
4678+
4679+
<property>
4680+
<name>fs.gs.inputstream.inplace.seek.limit</name>
4681+
<value>8m</value>
4682+
<description>
4683+
If forward seeks are within this many bytes of the current position, seeks
4684+
are performed by reading and discarding bytes in-place rather than opening a
4685+
new underlying stream.
4686+
</description>
4687+
</property>
4688+
4689+
<property>
4690+
<name>fs.gs.inputstream.min.range.request.size</name>
4691+
<value>2m</value>
4692+
<description>
4693+
Minimum size in bytes of the read range for a Cloud Storage request when
4694+
opening a new stream to read an object.
4695+
</description>
4696+
</property>
4697+
45124698
</configuration>

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestCommonConfigurationFields.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,12 @@ public void initializeMemberVariables() {
149149
xmlPropsToSkipCompare.add("fs.azure.saskey.usecontainersaskeyforallaccess");
150150
xmlPropsToSkipCompare.add("fs.azure.user.agent.prefix");
151151

152+
// GS properties are in a different class
153+
// - org.apache.hadoop.fs.gs.GoogleHadoopFileSystemConfiguration
154+
xmlPrefixToSkipCompare.add("gs.");
155+
xmlPrefixToSkipCompare.add("fs.gs.");
156+
xmlPropsToSkipCompare.add("fs.AbstractFileSystem.gs.impl");
157+
152158
// Properties in enable callqueue overflow trigger failover for stateless servers.
153159
xmlPropsToSkipCompare.add("ipc.[port_number].callqueue.overflow.trigger.failover");
154160
xmlPropsToSkipCompare.add("ipc.callqueue.overflow.trigger.failover");

0 commit comments

Comments
 (0)