
Commit 90351b0

impl(bigquery/read): add avro example (#339)
* impl(bigquery/read): add avro example
* checkers
* Apply suggestions from code review
* ci: add bigquery/read
* style
* add flex dep
* fix dockerfile
* add bison
1 parent 364c5c5 commit 90351b0

File tree: 9 files changed (+267, -12 lines)


CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ set(samples
     batch/cpp_application
     batch/parallel/application
     batch/simple
+    bigquery/read/arrow
+    bigquery/read/avro
     bigquery/write
     cloud-run-hello-world
     gcs-fast-transfers

bigquery/read/README.md

Lines changed: 44 additions & 6 deletions
@@ -9,7 +9,8 @@ session for the table via the BigQuery Storage library and read the rows from
 the table.
 
 This example shows how to create a query job using the BigQuery v2 Python API,
-and then read the data from the table using the BigQuery Storage C++ API.
+and then read the data from the table using the BigQuery Storage C++ API. There
+are two examples for reading the data: one using Avro and one using Arrow.
 
 If you are not familiar with the BigQuery v2 API or the BigQuery Storage Read
 API, we recommend you first read the [API overview] before starting this guide.
@@ -57,8 +58,19 @@ apt update && apt install -y build-essential cmake git ninja-build pkg-config g+
 In this directory compile the dependencies and the code, this can take as long
 as an hour, depending on the performance of your workstation:
 
+### Arrow read
+
 ```shell
-cd cpp-samples/bigquery/read
+cd cpp-samples/bigquery/read/arrow
+cmake -S . -B .build -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_TOOLCHAIN_FILE=$HOME/vcpkg/scripts/buildsystems/vcpkg.cmake
+cmake --build .build
+```
+
+### Avro read
+
+```shell
+cd cpp-samples/bigquery/read/avro
 cmake -S . -B .build -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_TOOLCHAIN_FILE=$HOME/vcpkg/scripts/buildsystems/vcpkg.cmake
 cmake --build .build
@@ -69,18 +81,17 @@ cmake --build .build
 Run the example, replace the `[PROJECT ID]` placeholder with the id of your
 project:
 
+### Arrow read
+
 ```shell
+cd cpp-samples/bigquery/read/arrow
 .build/arrow_read [PROJECT ID] [DATASET_NAME] [TABLE_NAME]
 ```
 
 ```shell
 .build/arrow_read [PROJECT ID] usa_names top10_names
 ```
 
-## Output
-
-Your output should look like the following:
-
 ```
 Schema is:
 name: string
@@ -99,6 +110,33 @@ Row 9: Charles 2244693
 Read 1 record batch(es) and 10 total row(s) from table: projects/[PROJECT-ID]/datasets/usa_names/tables/top10_names
 ```
 
+### Avro read
+
+```shell
+cd cpp-samples/bigquery/read/avro
+.build/avro_read [PROJECT ID] [DATASET_NAME] [TABLE_NAME]
+```
+
+```shell
+.build/avro_read [PROJECT ID] usa_names top10_names
+```
+
+The output should look like:
+
+```
+Row 0 (2): James 4942431
+Row 1 (2): John 4834422
+Row 2 (2): Robert 4718787
+Row 3 (2): Michael 4297230
+Row 4 (2): William 3822209
+Row 5 (2): Mary 3737679
+Row 6 (2): David 3549801
+Row 7 (2): Richard 2531924
+Row 8 (2): Joseph 2472917
+Row 9 (2): Charles 2244693
+Read 1 response(s) and 10 total row(s) from table: projects/[PROJECT-ID]/datasets/usa_names/tables/top10_names
+```
+
 ## Cleanup
 
 Remove the table and dataset:
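
The README's opening paragraph describes the flow that both examples implement: create a BigQuery Storage read session for the table, then stream rows back in Avro or Arrow format. As a rough orientation, here is a minimal C++ sketch of the session-creation step, condensed from the examples in this commit; the helper name `CreateSingleStreamSession` is illustrative, not part of the samples.

```cpp
// Condensed sketch: create a single-stream read session for a table,
// requesting rows in Avro format (the Arrow example uses DataFormat::ARROW).
// CreateSingleStreamSession is an illustrative helper, not from the samples.
#include "google/cloud/bigquery/storage/v1/bigquery_read_client.h"
#include "google/cloud/project.h"
#include <string>

namespace bq_storage = ::google::cloud::bigquery_storage_v1;
namespace bq_proto = ::google::cloud::bigquery::storage::v1;

std::string CreateSingleStreamSession(bq_storage::BigQueryReadClient& client,
                                      std::string const& project_id,
                                      std::string const& table_id) {
  bq_proto::ReadSession read_session;
  read_session.set_data_format(bq_proto::DataFormat::AVRO);
  read_session.set_table(table_id);  // projects/.../datasets/.../tables/...
  auto session = client.CreateReadSession(
      google::cloud::Project(project_id).FullName(), read_session,
      /*max_stream_count=*/1);
  if (!session) throw std::move(session).status();
  // Rows are then streamed from this stream name, as the examples below show.
  return session->streams(0).name();
}
```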
Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ set(CMAKE_CXX_STANDARD 20)
 # Define the project name and where to report bugs.
 set(PACKAGE_BUGREPORT
     "https://github.com/GoogleCloudPlatform/cpp-samples/issues")
-project(cpp-samples-bigquery-read CXX)
+project(cpp-samples-bigquery-read-arrow CXX)
 
 find_package(google_cloud_cpp_bigquery REQUIRED)
 find_package(Arrow REQUIRED)
Lines changed: 1 addition & 2 deletions
@@ -107,7 +107,6 @@ void ProcessRecordBatch(std::shared_ptr<arrow::Schema> schema,
         // need to be handled.
         default:
           std::cout << std::left << std::setw(15) << "UNDEFINED ";
-          << " ";
       }
     }
     std::cout << "\n";
@@ -165,7 +164,7 @@ int main(int argc, char* argv[]) try {
       }
 
       ProcessRecordBatch(schema, record_batch, num_rows);
-      num_rows += row->row_count();
+      num_rows += read_rows_response->row_count();
       ++record_batch_count;
     }
   }
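
The second hunk above corrects the row accounting in the Arrow example: the running total now comes from `row_count()` on the `ReadRows` response rather than from a stale `row` variable. A minimal sketch of that pattern, with the Arrow decoding replaced by a placeholder (`HandleBatch` and `DrainStream` are hypothetical, not functions from the sample):

```cpp
// Sketch of the corrected accounting: each ReadRowsResponse reports the number
// of rows it carries via row_count(), and that is what gets accumulated.
// HandleBatch stands in for the Arrow decoding done by ProcessRecordBatch.
#include "google/cloud/bigquery/storage/v1/bigquery_read_client.h"
#include <cstdint>
#include <string>

namespace bq_storage = ::google::cloud::bigquery_storage_v1;

void HandleBatch(std::int64_t /*first_row_offset*/) { /* decode one batch */ }

std::int64_t DrainStream(bq_storage::BigQueryReadClient& client,
                         std::string const& stream_name) {
  std::int64_t num_rows = 0;
  for (auto const& response : client.ReadRows(stream_name, /*offset=*/0)) {
    if (!response) continue;            // simplified error handling
    HandleBatch(num_rows);              // rows decoded so far = row offset
    num_rows += response->row_count();  // count taken from the response
  }
  return num_rows;
}
```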
Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 {
-  "name": "gcp-cpp-samples-bigquery-read",
+  "name": "gcp-cpp-samples-bigquery-read-arrow",
   "version-string": "unversioned",
   "homepage": "https://github.com/GoogleCloudPlatform/cpp-samples/",
-  "description": "An example using the BigQuery Storage Read API",
+  "description": "An example using the BigQuery Storage Read API and Arrow",
   "dependencies": [
     {
       "name": "google-cloud-cpp",

bigquery/read/avro/CMakeLists.txt

Lines changed: 30 additions & 0 deletions
# ~~~
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ~~~

cmake_minimum_required(VERSION 3.20)
set(CMAKE_CXX_STANDARD 14)

# Define the project name and where to report bugs.
set(PACKAGE_BUGREPORT
    "https://github.com/GoogleCloudPlatform/cpp-samples/issues")
project(cpp-samples-bigquery-read-avro CXX)

find_package(google_cloud_cpp_bigquery REQUIRED)
find_package(unofficial-avro-cpp CONFIG REQUIRED)

add_executable(avro_read avro_read.cc)
target_link_libraries(avro_read PRIVATE google-cloud-cpp::bigquery
                                        unofficial::avro-cpp::avrocpp)

bigquery/read/avro/avro_read.cc

Lines changed: 154 additions & 0 deletions
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "google/cloud/bigquery/storage/v1/bigquery_read_client.h"
#include "google/cloud/project.h"
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <avro/Compiler.hh>
#include <avro/DataFile.hh>
#include <avro/Decoder.hh>
#include <avro/Generic.hh>
#include <avro/GenericDatum.hh>
#include <avro/Stream.hh>
#include <avro/ValidSchema.hh>

namespace {

avro::ValidSchema GetAvroSchema(
    ::google::cloud::bigquery::storage::v1::AvroSchema const& schema) {
  // Create a valid reader schema.
  std::istringstream schema_bytes(schema.schema(), std::ios::binary);
  avro::ValidSchema valid_schema;
  avro::compileJsonSchema(schema_bytes, valid_schema);

  // [optional] Write the schema to a file. This could be useful if you want to
  // re-use the schema elsewhere.
  std::ofstream output("schema.avsc");
  if (output.is_open()) {
    valid_schema.toJson(output);
    output.close();
  } else {
    std::cerr << "Error opening the file!" << std::endl;
  }
  return valid_schema;
}

void ProcessRowsInAvroFormat(
    avro::ValidSchema const& valid_schema,
    ::google::cloud::bigquery::storage::v1::AvroRows const& rows,
    std::int64_t row_count) {
  // Get an avro decoder.
  std::stringstream row_bytes(rows.serialized_binary_rows(), std::ios::binary);
  std::unique_ptr<avro::InputStream> in = avro::istreamInputStream(row_bytes);
  avro::DecoderPtr decoder =
      avro::validatingDecoder(valid_schema, avro::binaryDecoder());
  decoder->init(*in);

  for (auto i = 0; i < row_count; ++i) {
    std::cout << "Row " << i << " ";
    avro::GenericDatum datum(valid_schema);
    avro::decode(*decoder, datum);
    if (datum.type() == avro::AVRO_RECORD) {
      const avro::GenericRecord& record = datum.value<avro::GenericRecord>();
      std::cout << "(" << record.fieldCount() << "): ";
      for (auto i = 0; i < record.fieldCount(); i++) {
        const avro::GenericDatum& datum = record.fieldAt(i);

        switch (datum.type()) {
          case avro::AVRO_STRING:
            std::cout << std::left << std::setw(15)
                      << datum.value<std::string>();
            break;
          case avro::AVRO_INT:
            std::cout << std::left << std::setw(15) << datum.value<int>();
            break;
          case avro::AVRO_LONG:
            std::cout << std::left << std::setw(15) << datum.value<long>();
            break;
          // Depending on the table you are reading, you might need to add
          // cases for other datatypes here. The schema will tell you what
          // datatypes need to be handled.
          default:
            std::cout << std::left << std::setw(15) << "UNDEFINED";
        }
        std::cout << "\t";
      }
    }
    std::cout << "\n";
  }
}

}  // namespace

int main(int argc, char* argv[]) try {
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0]
              << " <project-id> <dataset-name> <table-name>\n";
    return 1;
  }

  std::string const project_id = argv[1];
  std::string const dataset_name = argv[2];
  std::string const table_name = argv[3];

  std::string const table_id = "projects/" + project_id + "/datasets/" +
                               dataset_name + "/tables/" + table_name;

  // Create a namespace alias to make the code easier to read.
  namespace bigquery_storage = ::google::cloud::bigquery_storage_v1;
  constexpr int kMaxReadStreams = 1;
  // Create the ReadSession.
  auto client = bigquery_storage::BigQueryReadClient(
      bigquery_storage::MakeBigQueryReadConnection());
  ::google::cloud::bigquery::storage::v1::ReadSession read_session;
  read_session.set_data_format(
      google::cloud::bigquery::storage::v1::DataFormat::AVRO);
  read_session.set_table(table_id);
  auto session =
      client.CreateReadSession(google::cloud::Project(project_id).FullName(),
                               read_session, kMaxReadStreams);
  if (!session) throw std::move(session).status();

  // Get Avro schema.
  avro::ValidSchema valid_schema = GetAvroSchema(session->avro_schema());

  // Read rows from the ReadSession.
  constexpr int kRowOffset = 0;
  auto read_rows = client.ReadRows(session->streams(0).name(), kRowOffset);

  std::int64_t num_rows = 0;
  std::int64_t num_responses = 0;
  for (auto const& read_rows_response : read_rows) {
    if (read_rows_response.ok()) {
      num_rows += read_rows_response->row_count();
      ProcessRowsInAvroFormat(valid_schema, read_rows_response->avro_rows(),
                              read_rows_response->row_count());
      ++num_responses;
    }
  }

  std::cout << "Read " << num_responses << " responses(s) and " << num_rows
            << " total row(s) from table: " << table_id << "\n";

  return 0;
} catch (google::cloud::Status const& status) {
  std::cerr << "google::cloud::Status thrown: " << status << "\n";
  return 1;
} catch (avro::Exception const& e) {
  std::cerr << "avro::Exception thrown: " << e.what() << "\n";
  return 1;
}
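
The switch in `ProcessRowsInAvroFormat` covers only the types present in the sample table (STRING, INT, LONG); the in-code comment notes that other tables may need more cases. A hedged sketch of what additional cases could look like, using the same avro-cpp `GenericDatum::value<T>()` accessors; the extra types are assumptions about a hypothetical table, not part of the committed example:

```cpp
// Hypothetical extension of the field-printing switch for tables that also
// contain FLOAT64 (Avro double) or BOOL columns.
#include <avro/GenericDatum.hh>
#include <iomanip>
#include <iostream>

void PrintField(avro::GenericDatum const& datum) {
  switch (datum.type()) {
    case avro::AVRO_DOUBLE:
      std::cout << std::left << std::setw(15) << datum.value<double>();
      break;
    case avro::AVRO_BOOL:
      std::cout << std::left << std::setw(15) << std::boolalpha
                << datum.value<bool>();
      break;
    default:
      std::cout << std::left << std::setw(15) << "UNDEFINED";
  }
}
```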

bigquery/read/avro/vcpkg.json

Lines changed: 14 additions & 0 deletions
{
  "name": "gcp-cpp-samples-bigquery-read-avro",
  "version-string": "unversioned",
  "homepage": "https://github.com/GoogleCloudPlatform/cpp-samples/",
  "description": "An example using the BigQuery Storage Read API and Avro",
  "dependencies": [
    {
      "name": "google-cloud-cpp",
      "default-features": false,
      "features": ["bigquery"]
    },
    "avro-cpp"
  ]
}

ci/devtools.Dockerfile

Lines changed: 19 additions & 1 deletion
@@ -15,8 +15,26 @@
 FROM ubuntu:24.04
 
 ENV DEBIAN_FRONTEND=noninteractive
+# bigquery/read/arrow: bison is for thrift, which is a dependency for arrow
+# bigquery/read/arrow: flex is for thrift, which is a dependency for arrow
 RUN apt update \
-    && apt install -y build-essential git gcc g++ clang llvm cmake ninja-build pkg-config python3 tar zip unzip curl
+    && apt install -y \
+       bison \
+       build-essential \
+       git \
+       gcc \
+       g++ \
+       clang \
+       cmake \
+       curl \
+       flex \
+       llvm \
+       ninja-build \
+       pkg-config \
+       python3 \
+       tar \
+       zip \
+       unzip
 
 RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \
     tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
