Skip to content

Commit 19dcf09

Browse files
docs(samples): Add Dataflow snippet for reading from Cloud Storage (#9568)
1 parent c6ae893 commit 19dcf09

File tree

6 files changed

+166
-16
lines changed

6 files changed

+166
-16
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.example.dataflow;
18+
19+
// [START dataflow_read_from_cloud_storage]
20+
import org.apache.beam.sdk.Pipeline;
21+
import org.apache.beam.sdk.PipelineResult;
22+
import org.apache.beam.sdk.io.TextIO;
23+
import org.apache.beam.sdk.options.Description;
24+
import org.apache.beam.sdk.options.PipelineOptions;
25+
import org.apache.beam.sdk.options.PipelineOptionsFactory;
26+
import org.apache.beam.sdk.transforms.MapElements;
27+
import org.apache.beam.sdk.values.TypeDescriptors;
28+
29+
public class ReadFromStorage {
30+
// [END dataflow_read_from_cloud_storage]
31+
public interface Options extends PipelineOptions {
32+
@Description("The Cloud Storage bucket to read from")
33+
String getBucket();
34+
35+
void setBucket(String value);
36+
}
37+
38+
public static PipelineResult.State main(String[] args) {
39+
var options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
40+
Pipeline pipeline = createPipeline(options);
41+
return pipeline.run().waitUntilFinish();
42+
}
43+
44+
// [START dataflow_read_from_cloud_storage]
45+
public static Pipeline createPipeline(Options options) {
46+
var pipeline = Pipeline.create(options);
47+
pipeline
48+
// Read from a text file.
49+
.apply(TextIO.read().from(
50+
"gs://" + options.getBucket() + "/*.txt"))
51+
.apply(
52+
MapElements.into(TypeDescriptors.strings())
53+
.via(
54+
(x -> {
55+
System.out.println(x);
56+
return x;
57+
})));
58+
return pipeline;
59+
}
60+
}
61+
// [END dataflow_read_from_cloud_storage]

dataflow/snippets/src/test/java/com/example/dataflow/ApacheIcebergIT.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353

5454
public class ApacheIcebergIT {
5555
private ByteArrayOutputStream bout;
56-
private PrintStream out;
56+
private final PrintStream originalOut = System.out;
5757

5858
private static final String CATALOG_NAME = "local";
5959
private static final String TABLE_NAME = "table1";
@@ -112,8 +112,7 @@ private void writeTableRecord()
112112
@Before
113113
public void setUp() throws IOException {
114114
bout = new ByteArrayOutputStream();
115-
out = new PrintStream(bout);
116-
System.setOut(out);
115+
System.setOut(new PrintStream(bout));
117116

118117
// Create an Apache Iceberg catalog with a table.
119118
warehouseDirectory = Files.createTempDirectory("test-warehouse");
@@ -131,7 +130,7 @@ public void setUp() throws IOException {
131130
@After
132131
public void tearDown() throws IOException {
133132
Files.deleteIfExists(Paths.get(OUTPUT_FILE_NAME));
134-
System.setOut(null);
133+
System.setOut(originalOut);
135134
}
136135

137136
@Test

dataflow/snippets/src/test/java/com/example/dataflow/BigQueryWriteIT.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ public class BigQueryWriteIT {
4747
private static final String projectId = System.getenv("GOOGLE_CLOUD_PROJECT");
4848

4949
private ByteArrayOutputStream bout;
50-
private PrintStream out;
50+
private final PrintStream originalOut = System.out;
5151
private BigQuery bigquery;
5252
private String datasetName;
5353
private String tableName;
@@ -65,8 +65,7 @@ private void createTable() {
6565
@Before
6666
public void setUp() throws InterruptedException {
6767
bout = new ByteArrayOutputStream();
68-
out = new PrintStream(bout);
69-
System.setOut(out);
68+
System.setOut(new PrintStream(bout));
7069

7170
bigquery = BigQueryOptions.getDefaultInstance().getService();
7271

@@ -79,7 +78,7 @@ public void setUp() throws InterruptedException {
7978
public void tearDown() {
8079
bigquery.delete(
8180
DatasetId.of(projectId, datasetName), DatasetDeleteOption.deleteContents());
82-
System.setOut(null);
81+
System.setOut(originalOut);
8382
}
8483

8584
@Test

dataflow/snippets/src/test/java/com/example/dataflow/BiqQueryReadIT.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,15 @@ public class BiqQueryReadIT {
4545
private static final String projectId = System.getenv("GOOGLE_CLOUD_PROJECT");
4646

4747
private ByteArrayOutputStream bout;
48-
private PrintStream out;
48+
private final PrintStream originalOut = System.out;
4949
private BigQuery bigquery;
5050
private String datasetName;
5151
private String tableName;
5252

5353
@Before
5454
public void setUp() throws InterruptedException {
5555
bout = new ByteArrayOutputStream();
56-
out = new PrintStream(bout);
57-
System.setOut(out);
56+
System.setOut(new PrintStream(bout));
5857

5958
bigquery = BigQueryOptions.getDefaultInstance().getService();
6059

@@ -81,7 +80,7 @@ public void setUp() throws InterruptedException {
8180
public void tearDown() {
8281
bigquery.delete(
8382
DatasetId.of(projectId, datasetName), DatasetDeleteOption.deleteContents());
84-
System.setOut(null);
83+
System.setOut(originalOut);
8584
}
8685

8786
@Test

dataflow/snippets/src/test/java/com/example/dataflow/PubSubWriteIT.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ public class PubSubWriteIT {
4747
private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
4848

4949
private ByteArrayOutputStream bout;
50-
private PrintStream out;
50+
private final PrintStream originalOut = System.out;
5151
private String topicId;
5252
private String subscriptionId;
5353
TopicAdminClient topicAdminClient;
@@ -64,8 +64,7 @@ public void setUp() throws Exception {
6464
requireEnvVar("GOOGLE_CLOUD_PROJECT");
6565

6666
bout = new ByteArrayOutputStream();
67-
out = new PrintStream(bout);
68-
System.setOut(out);
67+
System.setOut(new PrintStream(bout));
6968

7069
topicId = "test_topic_" + UUID.randomUUID().toString().substring(0, 8);
7170
subscriptionId = topicId + "-sub";
@@ -84,7 +83,7 @@ public void setUp() throws Exception {
8483
public void tearDown() {
8584
subscriptionAdminClient.deleteSubscription(SubscriptionName.of(PROJECT_ID, subscriptionId));
8685
topicAdminClient.deleteTopic(TopicName.of(PROJECT_ID, topicId));
87-
System.setOut(null);
86+
System.setOut(originalOut);
8887
}
8988

9089
@Test
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.example.dataflow;
18+
19+
import static org.junit.Assert.assertEquals;
20+
import static org.junit.Assert.assertTrue;
21+
22+
import com.google.cloud.storage.BlobId;
23+
import com.google.cloud.storage.BlobInfo;
24+
import com.google.cloud.storage.BucketInfo;
25+
import com.google.cloud.storage.Storage;
26+
import com.google.cloud.storage.testing.RemoteStorageHelper;
27+
import java.io.ByteArrayOutputStream;
28+
import java.io.PrintStream;
29+
import java.nio.charset.StandardCharsets;
30+
import java.util.concurrent.ExecutionException;
31+
import java.util.concurrent.TimeUnit;
32+
import org.apache.beam.sdk.PipelineResult;
33+
import org.junit.After;
34+
import org.junit.Before;
35+
import org.junit.Test;
36+
import org.junit.runner.RunWith;
37+
import org.junit.runners.JUnit4;
38+
39+
@RunWith(JUnit4.class)
40+
public class ReadFromStorageIT {
41+
42+
private static final String projectId = System.getenv("GOOGLE_CLOUD_PROJECT");
43+
44+
private ByteArrayOutputStream bout;
45+
private final PrintStream originalout = System.out;
46+
47+
String bucketName;
48+
Storage storage;
49+
50+
private static final String[] lines = {"line 1", "line 2"};
51+
52+
@Before
53+
public void setUp() {
54+
// Redirect System.err to capture logs.
55+
bout = new ByteArrayOutputStream();
56+
System.setOut(new PrintStream(bout));
57+
58+
// Create a Cloud Storage bucket with a text file.
59+
RemoteStorageHelper helper = RemoteStorageHelper.create();
60+
storage = helper.getOptions().getService();
61+
bucketName = RemoteStorageHelper.generateBucketName();
62+
storage.create(BucketInfo.of(bucketName));
63+
64+
String objectName = "file1.txt";
65+
String contents = String.format("%s\n%s\n", lines[0], lines[1]);
66+
67+
BlobId blobId = BlobId.of(bucketName, objectName);
68+
BlobInfo blobInfo = BlobInfo.newBuilder(blobId).build();
69+
byte[] content = contents.getBytes(StandardCharsets.UTF_8);
70+
71+
storage.create(blobInfo, content);
72+
}
73+
74+
@After
75+
public void tearDown() throws ExecutionException, InterruptedException {
76+
RemoteStorageHelper.forceDelete(storage, bucketName, 5, TimeUnit.SECONDS);
77+
78+
System.setOut(originalout);
79+
bout.reset();
80+
}
81+
82+
@Test
83+
public void readFromStorage_shouldReadFile() throws Exception {
84+
85+
PipelineResult.State state = ReadFromStorage.main(
86+
new String[] {"--runner=DirectRunner", "--bucket=" + bucketName});
87+
assertEquals(PipelineResult.State.DONE, state);
88+
89+
String got = bout.toString();
90+
assertTrue(got.contains(lines[0]));
91+
assertTrue(got.contains(lines[1]));
92+
}
93+
}

0 commit comments

Comments
 (0)