// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "google/cloud/bigquery/storage/v1/bigquery_read_client.h"
#include "google/cloud/project.h"
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <avro/Compiler.hh>
#include <avro/DataFile.hh>
#include <avro/Decoder.hh>
#include <avro/Exception.hh>
#include <avro/Generic.hh>
#include <avro/GenericDatum.hh>
#include <avro/Stream.hh>
#include <avro/ValidSchema.hh>

namespace {

avro::ValidSchema GetAvroSchema(
    ::google::cloud::bigquery::storage::v1::AvroSchema const& schema) {
  // Create a valid reader schema.
  std::istringstream schema_bytes(schema.schema(), std::ios::binary);
  avro::ValidSchema valid_schema;
  avro::compileJsonSchema(schema_bytes, valid_schema);

  // [optional] Write the schema to a file. This could be useful if you want
  // to re-use the schema elsewhere.
  std::ofstream output("schema.avsc");
  if (output.is_open()) {
    valid_schema.toJson(output);
    output.close();
  } else {
    std::cerr << "Could not open schema.avsc for writing!" << std::endl;
  }
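  // The saved schema could later be reloaded with, for example,
  // avro::compileJsonSchemaFromFile("schema.avsc").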
  return valid_schema;
}

void ProcessRowsInAvroFormat(
    avro::ValidSchema const& valid_schema,
    ::google::cloud::bigquery::storage::v1::AvroRows const& rows,
    std::int64_t row_count) {
  // Wrap the serialized rows in an input stream and create a validating
  // Avro decoder for the reader schema.
  std::istringstream row_bytes(rows.serialized_binary_rows(),
                               std::ios::binary);
  std::unique_ptr<avro::InputStream> in = avro::istreamInputStream(row_bytes);
  avro::DecoderPtr decoder =
      avro::validatingDecoder(valid_schema, avro::binaryDecoder());
  decoder->init(*in);

  for (std::int64_t i = 0; i < row_count; ++i) {
    std::cout << "Row " << i << " ";
    avro::GenericDatum datum(valid_schema);
    avro::decode(*decoder, datum);
    if (datum.type() == avro::AVRO_RECORD) {
      avro::GenericRecord const& record = datum.value<avro::GenericRecord>();
      std::cout << "(" << record.fieldCount() << "): ";
      for (std::size_t f = 0; f < record.fieldCount(); ++f) {
        avro::GenericDatum const& field = record.fieldAt(f);

        switch (field.type()) {
          case avro::AVRO_STRING:
            std::cout << std::left << std::setw(15)
                      << field.value<std::string>();
            break;
          case avro::AVRO_INT:
            std::cout << std::left << std::setw(15) << field.value<int>();
            break;
          case avro::AVRO_LONG:
            std::cout << std::left << std::setw(15)
                      << field.value<std::int64_t>();
            break;
          // Depending on the table you are reading, you might need to add
          // cases for other datatypes here. The schema will tell you which
          // datatypes need to be handled.
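          // For example, if the table also had FLOAT64 or BOOL columns
          // (purely illustrative here), the corresponding cases might look
          // like this:
          case avro::AVRO_DOUBLE:
            std::cout << std::left << std::setw(15) << field.value<double>();
            break;
          case avro::AVRO_BOOL:
            std::cout << std::left << std::setw(15)
                      << (field.value<bool>() ? "true" : "false");
            break;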
          default:
            std::cout << std::left << std::setw(15) << "UNDEFINED";
        }
        std::cout << "\t";
      }
    }
    std::cout << "\n";
  }
}

}  // namespace

int main(int argc, char* argv[]) try {
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0]
              << " <project-id> <dataset-name> <table-name>\n";
    return 1;
  }

  std::string const project_id = argv[1];
  std::string const dataset_name = argv[2];
  std::string const table_name = argv[3];

  std::string const table_id = "projects/" + project_id + "/datasets/" +
                               dataset_name + "/tables/" + table_name;

  // Create a namespace alias to make the code easier to read.
  namespace bigquery_storage = ::google::cloud::bigquery_storage_v1;
  constexpr int kMaxReadStreams = 1;
  // Create the ReadSession.
  auto client = bigquery_storage::BigQueryReadClient(
      bigquery_storage::MakeBigQueryReadConnection());
  ::google::cloud::bigquery::storage::v1::ReadSession read_session;
  read_session.set_data_format(
      google::cloud::bigquery::storage::v1::DataFormat::AVRO);
  read_session.set_table(table_id);
  auto session =
      client.CreateReadSession(google::cloud::Project(project_id).FullName(),
                               read_session, kMaxReadStreams);
  if (!session) throw std::move(session).status();

  // Get Avro schema.
  avro::ValidSchema valid_schema = GetAvroSchema(session->avro_schema());

  // Read rows from the ReadSession.
  constexpr int kRowOffset = 0;
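  // Note: the service may return fewer streams than kMaxReadStreams, and an
  // empty table can yield a session with no streams at all, in which case
  // streams(0) below would not exist.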
  auto read_rows = client.ReadRows(session->streams(0).name(), kRowOffset);

  std::int64_t num_rows = 0;
  std::int64_t num_responses = 0;
  for (auto const& read_rows_response : read_rows) {
    if (read_rows_response.ok()) {
      num_rows += read_rows_response->row_count();
      ProcessRowsInAvroFormat(valid_schema, read_rows_response->avro_rows(),
                              read_rows_response->row_count());
      ++num_responses;
    }
  }

  std::cout << "Read " << num_responses << " response(s) and " << num_rows
            << " total row(s) from table: " << table_id << "\n";

  return 0;
} catch (google::cloud::Status const& status) {
  std::cerr << "google::cloud::Status thrown: " << status << "\n";
  return 1;
} catch (avro::Exception const& e) {
  std::cerr << "avro::Exception thrown: " << e.what() << "\n";
  return 1;
}