|
| 1 | +<!--- |
| 2 | + Copyright 2024-present Alibaba Inc. |
| 3 | +
|
| 4 | + Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + you may not use this file except in compliance with the License. |
| 6 | + You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | + Unless required by applicable law or agreed to in writing, software |
| 11 | + distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + See the License for the specific language governing permissions and |
| 14 | + limitations under the License. |
| 15 | +--> |
| 16 | + |
| 17 | +# Paimon C++ |
| 18 | + |
| 19 | +Paimon C++ is a high-performance C++ implementation of [Apache Paimon](https://paimon.apache.org). Paimon C++ aims to provide a native, high-performance and extensible implementation that allows native engines to access the Paimon datalake format with maximum efficiency. |
| 20 | + |
| 21 | +## What's in the Paimon C++ library |
| 22 | + |
| 23 | +* Write: Paimon append table and primary key table write (without compaction). |
| 24 | +* Commit: Paimon append table commit. (Note: Limited support — only works for simple append-only tables; tables with compaction, indexes, changelogs, and stats are not supported.) |
| 25 | +* Scan: Paimon append and primary key table batch and stream scan (without changelog). |
| 26 | +* Read: Paimon append table read, primary key table read with deletion vectors (raw read), and primary key table read with merge-on-read (merge read). |
| 27 | +* Batch read and write interface using the [Arrow Columnar In-Memory Format](https://arrow.apache.org) to increase throughput. |
| 28 | +* IO interfaces to file systems, with built-in local and jindo file system implementations. |
| 29 | +* File format interfaces to customize different formats, with built-in orc, parquet and lance format implementations. |
| 30 | +* Memory pool interfaces and a default implementation. |
| 31 | +* Thread pool executor interfaces and a default implementation. |
| 32 | +* Compatible with Java Paimon format and communication protocol (e.g., commit message, data splits, manifests). |
| 33 | +* Only the x86 platform is supported. |
| 34 | + |
| 35 | +## Write And Commit Example |
| 36 | + |
| 37 | +The writing is divided into two stages: |
| 38 | + |
| 39 | +1. Write records: write records in distributed tasks, generate commit messages. |
| 40 | +2. Commit/Abort: collect all commit messages and commit them in a global node (called the 'Coordinator', 'Driver', or 'Committer'). When the commit fails for some reason, abort the unsuccessful commit via the commit messages. |
| 41 | + |
| 42 | + |
| 43 | +```c++ |
| 44 | + std::string table_path = "/tmp/paimon/my.db/test_table/"; |
| 45 | + WriteContextBuilder context_builder(table_path, "commit_user"); |
| 46 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<WriteContext> write_context, |
| 47 | + context_builder.AddOption(Options::TARGET_FILE_SIZE, "1024mb") |
| 48 | + .AddOption(Options::FILE_SYSTEM, "local") |
| 49 | + .Finish()); |
| 50 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<FileStoreWrite> file_store_write, |
| 51 | + FileStoreWrite::Create(std::move(write_context))); |
| 52 | + |
| 53 | + ::ArrowArray arrow_array; |
| 54 | + // prepare your arrow array |
| 55 | + // ... |
| 56 | + RecordBatchBuilder batch_builder(&arrow_array); |
| 57 | + batch_builder.SetPartition({{"col1", "20240813"}, {"col2", "23"}}).SetBucket(1); |
| 58 | + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> batch, batch_builder.Finish()); |
| 59 | + PAIMON_RETURN_NOT_OK(file_store_write->Write(batch)); |
| 60 | + PAIMON_ASSIGN_OR_RAISE(std::vector<std::shared_ptr<CommitMessage>> commit_messages, |
| 61 | + file_store_write->PrepareCommit()); |
| 62 | + |
| 63 | + CommitContextBuilder commit_context_builder(table_path, "commit_user"); |
| 64 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<CommitContext> commit_context, |
| 65 | + commit_context_builder.AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") |
| 66 | + .AddOption(Options::FILE_SYSTEM, "local") |
| 67 | + .IgnoreEmptyCommit(false) |
| 68 | + .Finish()); |
| 69 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<FileStoreCommit> commit, FileStoreCommit::Create(std::move(commit_context))); |
| 70 | + PAIMON_RETURN_NOT_OK(commit->Commit(commit_messages)); |
| 71 | + |
| 72 | +``` |
| 73 | +
|
| 74 | +## Scan and Read Example |
| 75 | +
|
| 76 | +The reading is divided into two stages: |
| 77 | +
|
| 78 | +1. Scan: read snapshot, parse manifests, filter target file set by statistical information, and generate query plan data splits. |
| 79 | +2. Read: read the data files according to data splits, and perform schema evolution adjustment and predicate push-down optimization. |
| 80 | +
|
| 81 | +```c++ |
| 82 | + std::string table_path = "/tmp/paimon/my.db/test_table/"; |
| 83 | + ScanContextBuilder context_builder(table_path); |
| 84 | + // prepare predicate if needed |
| 85 | + std::shared_ptr<Predicate> predicate = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", |
| 86 | + /*field_type=*/FieldType::INT, Literal(10)); |
| 87 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<ScanContext> scan_context, |
| 88 | + context_builder.SetPredicate(predicate) |
| 89 | + .AddOption(Options::SCAN_SNAPSHOT_ID, "2") |
| 90 | + .AddOption(Options::FILE_SYSTEM, "local") |
| 91 | + .Finish()); |
| 92 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<TableScan> table_scan, |
| 93 | + TableScan::Create(std::move(scan_context))); |
| 94 | + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<Plan> plan, table_scan->CreatePlan()); |
| 95 | +
|
| 96 | + ReadContextBuilder read_context_builder(table_path, /*schema_id=*/0); |
| 97 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<ReadContext> read_context, |
| 98 | + read_context_builder.SetReadSchema({"f0", "f1"}) |
| 99 | + .SetPredicate(predicate) |
| 100 | + .AddOption(Options::FILE_SYSTEM, "local") |
| 101 | + .EnablePrefetch(true) |
| 102 | + .Finish()); |
| 103 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<TableRead> table_read, TableRead::Create(std::move(read_context))); |
| 104 | + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<BatchReader> batch_reader, table_read->CreateReader(plan->Splits())); |
| 105 | +
|
| 106 | + while (true) { |
| 107 | + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatch read_batch, batch_reader->NextBatch()); |
| 108 | + if (BatchReader::IsEofBatch(read_batch)) { |
| 109 | + break; |
| 110 | + } |
| 111 | + auto& [c_array, c_schema] = read_batch; |
| 112 | + // process the arrow array |
| 113 | + auto arrow_result = arrow::ImportArray(c_array.get(), c_schema.get()); |
| 114 | + } |
| 115 | +
|
| 116 | +``` |
| 117 | + |
| 118 | +## Getting Started |
| 119 | + |
| 120 | +## Development |
| 121 | + |
| 122 | +### CMake |
| 123 | + |
| 124 | +``` |
| 125 | +$ mkdir build |
| 126 | +$ cd build |
| 127 | +$ cmake .. |
| 128 | +$ make |
| 129 | +``` |
| 130 | +### Linting |
| 131 | + |
| 132 | +Install the python package `pre-commit` and run once `pre-commit install`. |
| 133 | + |
| 134 | +``` |
| 135 | +pip install pre-commit |
| 136 | +pre-commit install |
| 137 | +``` |
| 138 | + |
| 139 | +This will setup a git pre-commit-hook that is executed on each commit and will report the linting problems. To run all hooks on all files use `pre-commit run -a`. |
| 140 | + |
| 141 | +### Dev Containers |
| 142 | + |
| 143 | +We provide Dev Container configuration file templates. |
| 144 | + |
| 145 | +To use a Dev Container as your development environment, follow the steps below, then select `Dev Containers: Reopen in Container` from VS Code's Command Palette. |
| 146 | + |
| 147 | +``` |
| 148 | +cd .devcontainer |
| 149 | +cp Dockerfile.template Dockerfile |
| 150 | +cp devcontainer.json.template devcontainer.json |
| 151 | +``` |
| 152 | + |
| 153 | +If you make improvements that could benefit all developers, please update the template files and submit a pull request. |
| 154 | + |
| 155 | +## License |
| 156 | + |
| 157 | +Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) |
0 commit comments