|
| 1 | +--- |
| 2 | +title: "Cpp API" |
| 3 | +weight: 6 |
| 4 | +type: docs |
| 5 | +aliases: |
| 6 | + - /api/cpp-api.html |
| 7 | +--- |
| 8 | + |
| 9 | +<!-- |
| 10 | +Licensed to the Apache Software Foundation (ASF) under one |
| 11 | +or more contributor license agreements. See the NOTICE file |
| 12 | +distributed with this work for additional information |
| 13 | +regarding copyright ownership. The ASF licenses this file |
| 14 | +to you under the Apache License, Version 2.0 (the |
| 15 | +"License"); you may not use this file except in compliance |
| 16 | +with the License. You may obtain a copy of the License at |
| 17 | +
|
| 18 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 19 | +
|
| 20 | +Unless required by applicable law or agreed to in writing, |
| 21 | +software distributed under the License is distributed on an |
| 22 | +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 23 | +KIND, either express or implied. See the License for the |
| 24 | +specific language governing permissions and limitations |
| 25 | +under the License. |
| 26 | +--> |
| 27 | + |
| 28 | +# Cpp API |
| 29 | + |
| 30 | +Paimon C++ is a high-performance C++ implementation of Apache Paimon. Paimon C++ aims to provide a native, |
| 31 | +high-performance and extensible implementation that allows native engines to access the Paimon datalake |
| 32 | +format with maximum efficiency. |
| 33 | + |
| 34 | +## Environment Settings |
| 35 | + |
| 36 | +[Paimon C++](https://github.com/alibaba/paimon-cpp.git) is currently governed under Alibaba open source |
| 37 | +community. You can check out the [document](https://alibaba.github.io/paimon-cpp/getting_started.html) |
| 38 | +for more details about environment settings. |
| 39 | + |
| 40 | +```sh |
| 41 | +git clone https://github.com/alibaba/paimon-cpp.git |
| 42 | +cd paimon-cpp |
| 43 | +mkdir build-release |
| 44 | +cd build-release |
| 45 | +cmake .. |
| 46 | +make -j8 # if you have 8 CPU cores, otherwise adjust |
| 47 | +make install |
| 48 | +``` |
| 49 | + |
| 50 | +## Create Catalog |
| 51 | + |
| 52 | +Before coming into contact with the Table, you need to create a Catalog. |
| 53 | + |
| 54 | +```c++ |
| 55 | +#include "paimon/catalog/catalog.h" |
| 56 | + |
| 57 | +// Note that keys and values are all string |
| 58 | +std::map<std::string, std::string> options; |
| 59 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::Catalog> catalog, |
| 60 | + paimon::Catalog::Create(root_path, options)); |
| 61 | +``` |
| 62 | +
|
| 63 | +Currently, C++ Paimon only supports the filesystem catalog. Support for the REST catalog is planned. |
| 64 | +See [Catalog]({{< ref "concepts/catalog" >}}). |
| 65 | +
|
| 66 | +You can use the catalog to create tables for writing data. |
| 67 | +
|
| 68 | +## Create Database |
| 69 | +
|
| 70 | +A table is located in a database. If you want to create a table in a new database, you should create the database first. |
| 71 | +
|
| 72 | +```c++ |
| 73 | +PAIMON_RETURN_NOT_OK(catalog->CreateDatabase("database_name", options, /*ignore_if_exists=*/false)); |
| 74 | +``` |
| 75 | + |
| 76 | +## Create Table |
| 77 | + |
| 78 | +Table schema contains fields definition, partition keys, primary keys, table options. |
| 79 | +The field definition is described by `arrow::Schema`. All arguments except the field definition are optional. |
| 80 | + |
| 81 | +For example: |
| 82 | + |
| 83 | +```c++ |
| 84 | +arrow::FieldVector fields = { |
| 85 | + arrow::field("f0", arrow::utf8()), |
| 86 | + arrow::field("f1", arrow::int32()), |
| 87 | + arrow::field("f2", arrow::int32()), |
| 88 | + arrow::field("f3", arrow::float64()), |
| 89 | +}; |
| 90 | +std::shared_ptr<arrow::Schema> schema = arrow::schema(fields); |
| 91 | +::ArrowSchema arrow_schema; |
| 92 | +arrow::Status arrow_status = arrow::ExportSchema(*schema, &arrow_schema); |
| 93 | +if (!arrow_status.ok()) { |
| 94 | + return paimon::Status::Invalid(arrow_status.message()); |
| 95 | +} |
| 96 | +PAIMON_RETURN_NOT_OK(catalog->CreateTable(paimon::Identifier(db_name, table_name), |
| 97 | + &arrow_schema, |
| 98 | + /*partition_keys=*/{}, |
| 99 | + /*primary_keys=*/{}, options, |
| 100 | + /*ignore_if_exists=*/false)); |
| 101 | +``` |
| 102 | +
|
| 103 | +See [Data Types](https://alibaba.github.io/paimon-cpp/user_guide/data_types.html) for all supported |
| 104 | +`arrow-to-paimon` data types mapping. |
| 105 | +
|
| 106 | +## Batch Write |
| 107 | +
|
| 108 | +Paimon table write uses a two-phase commit: you can write many times, but once committed, no more data can be written. |
| 109 | +C++ Paimon uses Apache Arrow as its in-memory format; check out the [document](https://alibaba.github.io/paimon-cpp/user_guide/arrow.html) |
| 110 | +for more details. |
| 111 | +
|
| 112 | +For example: |
| 113 | +```c++ |
| 114 | +arrow::Result<std::shared_ptr<arrow::StructArray>> PrepareData(const arrow::FieldVector& fields) { |
| 115 | + arrow::StringBuilder f0_builder; |
| 116 | + arrow::Int32Builder f1_builder; |
| 117 | + arrow::Int32Builder f2_builder; |
| 118 | + arrow::DoubleBuilder f3_builder; |
| 119 | +
|
| 120 | + std::vector<std::tuple<std::string, int, int, double>> data = { |
| 121 | + {"Alice", 1, 0, 11.0}, {"Bob", 1, 1, 12.1}, {"Cathy", 1, 2, 13.2}}; |
| 122 | +
|
| 123 | + for (const auto& row : data) { |
| 124 | + ARROW_RETURN_NOT_OK(f0_builder.Append(std::get<0>(row))); |
| 125 | + ARROW_RETURN_NOT_OK(f1_builder.Append(std::get<1>(row))); |
| 126 | + ARROW_RETURN_NOT_OK(f2_builder.Append(std::get<2>(row))); |
| 127 | + ARROW_RETURN_NOT_OK(f3_builder.Append(std::get<3>(row))); |
| 128 | + } |
| 129 | +
|
| 130 | + std::shared_ptr<arrow::Array> f0_array, f1_array, f2_array, f3_array; |
| 131 | + ARROW_RETURN_NOT_OK(f0_builder.Finish(&f0_array)); |
| 132 | + ARROW_RETURN_NOT_OK(f1_builder.Finish(&f1_array)); |
| 133 | + ARROW_RETURN_NOT_OK(f2_builder.Finish(&f2_array)); |
| 134 | + ARROW_RETURN_NOT_OK(f3_builder.Finish(&f3_array)); |
| 135 | +
|
| 136 | + std::vector<std::shared_ptr<arrow::Array>> children = {f0_array, f1_array, f2_array, f3_array}; |
| 137 | + auto struct_type = arrow::struct_(fields); |
| 138 | + return std::make_shared<arrow::StructArray>(struct_type, f0_array->length(), children); |
| 139 | +} |
| 140 | +``` |
| 141 | + |
| 142 | +```c++ |
| 143 | +std::string table_path = root_path + "/" + db_name + ".db/" + table_name; |
| 144 | +std::string commit_user = "some_commit_user"; |
| 145 | +// write |
| 146 | +paimon::WriteContextBuilder context_builder(table_path, commit_user); |
| 147 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::WriteContext> write_context, |
| 148 | + context_builder.SetOptions(options).Finish()); |
| 149 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::FileStoreWrite> writer, |
| 150 | + paimon::FileStoreWrite::Create(std::move(write_context))); |
| 151 | +// prepare data |
| 152 | +auto struct_array = PrepareData(fields); |
| 153 | +if (!struct_array.ok()) { |
| 154 | + return paimon::Status::Invalid(struct_array.status().ToString()); |
| 155 | +} |
| 156 | +::ArrowArray arrow_array; |
| 157 | +arrow_status = arrow::ExportArray(*struct_array.ValueUnsafe(), &arrow_array); |
| 158 | +if (!arrow_status.ok()) { |
| 159 | + return paimon::Status::Invalid(arrow_status.message()); |
| 160 | +} |
| 161 | +paimon::RecordBatchBuilder batch_builder(&arrow_array); |
| 162 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::RecordBatch> record_batch, |
| 163 | + batch_builder.Finish()); |
| 164 | +PAIMON_RETURN_NOT_OK(writer->Write(std::move(record_batch))); |
| 165 | +PAIMON_ASSIGN_OR_RAISE(std::vector<std::shared_ptr<paimon::CommitMessage>> commit_message, |
| 166 | + writer->PrepareCommit()); |
| 167 | + |
| 168 | +// commit |
| 169 | +paimon::CommitContextBuilder commit_context_builder(table_path, commit_user); |
| 170 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::CommitContext> commit_context, |
| 171 | + commit_context_builder.SetOptions(options).Finish()); |
| 172 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::FileStoreCommit> committer, |
| 173 | + paimon::FileStoreCommit::Create(std::move(commit_context))); |
| 174 | +PAIMON_RETURN_NOT_OK(committer->Commit(commit_message)); |
| 175 | +``` |
| 176 | +
|
| 177 | +## Batch Read |
| 178 | +
|
| 179 | +### Predicate pushdown |
| 180 | +
|
| 181 | +A `ReadContextBuilder` is used to pass context to the reader; predicate pushdown and filtering are done by the reader. |
| 182 | +
|
| 183 | +```c++ |
| 184 | +ReadContextBuilder read_context_builder(table_path); |
| 185 | +``` |
| 186 | + |
| 187 | +You can use `PredicateBuilder` to build filters and push them down via `ReadContextBuilder`: |
| 188 | + |
| 189 | +```c++ |
| 190 | +// Example filter: 'f3' > 12.0 OR 'f1' == 1 |
| 191 | +PAIMON_ASSIGN_OR_RAISE( |
| 192 | + auto predicate, |
| 193 | + PredicateBuilder::Or( |
| 194 | + {PredicateBuilder::GreaterThan(/*field_index=*/3, /*field_name=*/"f3", |
| 195 | + FieldType::DOUBLE, Literal(static_cast<double>(12.0))), |
| 196 | + PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, |
| 197 | + Literal(1))})); |
| 198 | +ReadContextBuilder read_context_builder(table_path); |
| 199 | +read_context_builder.SetPredicate(predicate).EnablePredicateFilter(true); |
| 200 | +``` |
| 201 | +
|
| 202 | +You can also push down projection via `ReadContextBuilder`: |
| 203 | +
|
| 204 | +```c++ |
| 205 | +// select f3, f1 and f2 columns |
| 206 | +read_context_builder.SetReadSchema({"f3", "f1", "f2"}); |
| 207 | +``` |
| 208 | + |
| 209 | +### Generate Splits |
| 210 | + |
| 211 | +Then you can step into the scan plan stage to get `splits`: |
| 212 | + |
| 213 | +```c++ |
| 214 | +// scan |
| 215 | +paimon::ScanContextBuilder scan_context_builder(table_path); |
| 216 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::ScanContext> scan_context, |
| 217 | + scan_context_builder.SetOptions(options).Finish()); |
| 218 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::TableScan> scanner, |
| 219 | + paimon::TableScan::Create(std::move(scan_context))); |
| 220 | +PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<paimon::Plan> plan, scanner->CreatePlan()); |
| 221 | +auto splits = plan->Splits(); |
| 222 | +``` |
| 223 | +
|
| 224 | +Finally, you can read data from the `splits` to arrow format. |
| 225 | +
|
| 226 | +### Read Apache Arrow |
| 227 | +
|
| 228 | +This requires `C++ Arrow` to be installed. |
| 229 | +
|
| 230 | +```c++ |
| 231 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::ReadContext> read_context, |
| 232 | + read_context_builder.SetOptions(options).Finish()); |
| 233 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::TableRead> table_read, |
| 234 | + paimon::TableRead::Create(std::move(read_context))); |
| 235 | +PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::BatchReader> batch_reader, |
| 236 | + table_read->CreateReader(splits)); |
| 237 | +arrow::ArrayVector result_array_vector; |
| 238 | +while (true) { |
| 239 | + PAIMON_ASSIGN_OR_RAISE(paimon::BatchReader::ReadBatch batch, batch_reader->NextBatch()); |
| 240 | + if (paimon::BatchReader::IsEofBatch(batch)) { |
| 241 | + break; |
| 242 | + } |
| 243 | + auto& [c_array, c_schema] = batch; |
| 244 | + auto arrow_result = arrow::ImportArray(c_array.get(), c_schema.get()); |
| 245 | + if (!arrow_result.ok()) { |
| 246 | + return paimon::Status::Invalid(arrow_result.status().ToString()); |
| 247 | + } |
| 248 | + auto result_array = arrow_result.ValueUnsafe(); |
| 249 | + result_array_vector.push_back(result_array); |
| 250 | +} |
| 251 | +auto chunk_result = arrow::ChunkedArray::Make(result_array_vector); |
| 252 | +if (!chunk_result.ok()) { |
| 253 | + return paimon::Status::Invalid(chunk_result.status().ToString()); |
| 254 | +} |
| 255 | +``` |
| 256 | + |
| 257 | +## Documentation |
| 258 | + |
| 259 | +For more information, see [C++ Paimon Documentation](https://alibaba.github.io/paimon-cpp/index.html). |
0 commit comments