Skip to content

Commit 5ff2fea

Browse files
committed
feat(catalog): add LoadTableSchema interface
1 parent f19f854 commit 5ff2fea

File tree

10 files changed

+288
-0
lines changed

10 files changed

+288
-0
lines changed

include/paimon/catalog/catalog.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include "paimon/catalog/identifier.h"
2525
#include "paimon/result.h"
26+
#include "paimon/schema/schema.h"
2627
#include "paimon/status.h"
2728
#include "paimon/type_fwd.h"
2829
#include "paimon/visibility.h"
@@ -93,6 +94,16 @@ class PAIMON_EXPORT Catalog {
9394
/// @return A result containing a vector of table names in the specified database, or an error
9495
/// status.
9596
virtual Result<std::vector<std::string>> ListTables(const std::string& db_name) const = 0;
97+
98+
/// Loads the latest schema of a specified table.
99+
///
100+
/// @note System tables will not be supported.
101+
///
102+
/// @param identifier The identifier (database and table name) of the table to load.
103+
/// @return A result containing table schema if the table exists, or std::nullopt if it
104+
/// doesn't, or an error status on failure.
105+
virtual Result<std::optional<std::shared_ptr<Schema>>> LoadTableSchema(
106+
const Identifier& identifier) const = 0;
96107
};
97108

98109
} // namespace paimon

include/paimon/schema/schema.h

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <map>
20+
#include <memory>
21+
#include <string>
22+
#include <vector>
23+
24+
#include "arrow/api.h"
25+
#include "arrow/c/bridge.h"
26+
#include "paimon/result.h"
27+
#include "paimon/visibility.h"
28+
struct ArrowSchema;
29+
30+
namespace paimon {
31+
class SchemaImpl;
32+
/// This interface provides access to TableSchema-related information.
33+
class PAIMON_EXPORT Schema {
34+
public:
35+
explicit Schema(std::unique_ptr<SchemaImpl>&& impl);
36+
~Schema();
37+
38+
public:
39+
/// Get the Arrow C schema representation of this table schema.
40+
/// @return A result containing an ArrowSchema, or an error status if conversion fails.
41+
Result<std::unique_ptr<::ArrowSchema>> GetArrowSchema() const;
42+
43+
/// Get the names of all fields in the table schema.
44+
/// @return A vector of field names.
45+
std::vector<std::string> FieldNames() const;
46+
47+
/// Get the unique identifier of this table schema.
48+
/// @return The schema ID
49+
int64_t Id() const;
50+
51+
/// Get the list of primary key field names.
52+
/// @return A reference to the vector of primary key names; empty if no primary keys are
53+
/// defined.
54+
const std::vector<std::string>& PrimaryKeys() const;
55+
56+
/// Get the list of partition key field names.
57+
/// @return A reference to the vector of partition key names; empty if the table is not
58+
/// partitioned.
59+
const std::vector<std::string>& PartitionKeys() const;
60+
61+
/// Get the list of bucket key field names used for bucketing.
62+
/// @return A reference to the vector of bucket key names.
63+
const std::vector<std::string>& BucketKeys() const;
64+
65+
/// Get the number of buckets configured for this table.
66+
/// @return The number of buckets;
67+
int32_t NumBuckets() const;
68+
69+
/// Get the highest field ID assigned in this schema.
70+
/// @return The maximum field ID.
71+
int32_t HighestFieldId() const;
72+
73+
/// Get the table-level options associated with this schema.
74+
/// @return A reference to the map of option key-value pairs (e.g., file format, filesystem).
75+
const std::map<std::string, std::string>& Options() const;
76+
77+
/// Get an optional comment describing the table.
78+
/// @return The table comment if set, or std::nullopt otherwise.
79+
std::optional<std::string> Comment() const;
80+
81+
private:
82+
std::unique_ptr<SchemaImpl> impl_;
83+
};
84+
85+
} // namespace paimon

src/paimon/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ set(PAIMON_CORE_SRCS
218218
core/schema/schema_manager.cpp
219219
core/schema/schema_validation.cpp
220220
core/schema/table_schema.cpp
221+
core/schema/schema.cpp
221222
core/snapshot.cpp
222223
core/stats/simple_stats_collector.cpp
223224
core/stats/simple_stats_converter.cpp

src/paimon/core/catalog/file_system_catalog.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "paimon/common/utils/arrow/status_utils.h"
2828
#include "paimon/common/utils/path_util.h"
2929
#include "paimon/common/utils/string_utils.h"
30+
#include "paimon/core/schema/schema_impl.h"
3031
#include "paimon/core/schema/schema_manager.h"
3132
#include "paimon/fs/file_system.h"
3233
#include "paimon/logging.h"
@@ -211,4 +212,20 @@ Result<bool> FileSystemCatalog::TableExistsInFileSystem(const std::string& table
211212
}
212213
}
213214

215+
Result<std::optional<std::shared_ptr<Schema>>> FileSystemCatalog::LoadTableSchema(
216+
const Identifier& identifier) const {
217+
if (IsSystemTable(identifier)) {
218+
return Status::NotImplemented("do not support loading schema for system table.");
219+
}
220+
SchemaManager schema_manager(fs_, NewDataTablePath(warehouse_, identifier));
221+
PAIMON_ASSIGN_OR_RAISE(std::optional<std::shared_ptr<TableSchema>> latest_schema,
222+
schema_manager.Latest());
223+
if (latest_schema.has_value()) {
224+
std::unique_ptr<SchemaImpl> schema_impl = std::make_unique<SchemaImpl>(*latest_schema);
225+
std::shared_ptr<Schema> schema = std::make_shared<Schema>(std::move(schema_impl));
226+
return std::optional<std::shared_ptr<Schema>>(schema);
227+
}
228+
return std::optional<std::shared_ptr<Schema>>();
229+
}
230+
214231
} // namespace paimon

src/paimon/core/catalog/file_system_catalog.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ class FileSystemCatalog : public Catalog {
4949

5050
Result<std::vector<std::string>> ListDatabases() const override;
5151
Result<std::vector<std::string>> ListTables(const std::string& database_names) const override;
52+
Result<std::optional<std::shared_ptr<Schema>>> LoadTableSchema(
53+
const Identifier& identifier) const override;
5254

5355
private:
5456
static std::string NewDatabasePath(const std::string& warehouse, const std::string& db_name);

src/paimon/core/catalog/file_system_catalog_test.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ TEST(FileSystemCatalogTest, TestCreateTableWithBlob) {
180180
ASSERT_OK_AND_ASSIGN(std::vector<std::string> table_names, catalog.ListTables("db1"));
181181
ASSERT_EQ(1, table_names.size());
182182
ASSERT_EQ(table_names[0], "tbl1");
183+
ASSERT_OK_AND_ASSIGN(std::optional<std::shared_ptr<Schema>> table_schema,
184+
catalog.LoadTableSchema(Identifier("db1", "tbl1")));
185+
ASSERT_TRUE(table_schema.has_value());
186+
ASSERT_OK_AND_ASSIGN(auto arrow_schema, (*table_schema)->GetArrowSchema());
187+
auto loaded_schema = arrow::ImportSchema(arrow_schema.get()).ValueOrDie();
188+
ASSERT_TRUE(typed_schema.Equals(loaded_schema));
183189
ArrowSchemaRelease(&schema);
184190
}
185191

@@ -309,4 +315,56 @@ TEST(FileSystemCatalogTest, TestInvalidList) {
309315
"do not support listing tables for system database.");
310316
}
311317

318+
TEST(FileSystemCatalogTest, TestValidateTableSchema) {
319+
std::map<std::string, std::string> options;
320+
options[Options::FILE_SYSTEM] = "local";
321+
options[Options::FILE_FORMAT] = "orc";
322+
ASSERT_OK_AND_ASSIGN(auto core_options, CoreOptions::FromMap(options));
323+
auto dir = UniqueTestDirectory::Create();
324+
ASSERT_TRUE(dir);
325+
FileSystemCatalog catalog(core_options.GetFileSystem(), dir->Str());
326+
{
327+
ASSERT_OK(catalog.CreateDatabase("db1", options, /*ignore_if_exists=*/true));
328+
arrow::FieldVector fields = {
329+
arrow::field("f0", arrow::utf8()),
330+
arrow::field("f1", arrow::int32()),
331+
arrow::field("f2", arrow::int32()),
332+
arrow::field("f3", arrow::float64()),
333+
};
334+
arrow::Schema typed_schema(fields);
335+
::ArrowSchema schema;
336+
ASSERT_TRUE(arrow::ExportSchema(typed_schema, &schema).ok());
337+
ASSERT_OK(catalog.CreateTable(Identifier("db1", "tbl1"), &schema, {"f1"}, {}, options,
338+
/*ignore_if_exists=*/false));
339+
340+
ASSERT_OK_AND_ASSIGN(std::optional<std::shared_ptr<Schema>> table_schema,
341+
catalog.LoadTableSchema(Identifier("db0", "tbl0")));
342+
ASSERT_FALSE(table_schema.has_value());
343+
ASSERT_OK_AND_ASSIGN(table_schema, catalog.LoadTableSchema(Identifier("db1", "tbl1")));
344+
ASSERT_TRUE(table_schema.has_value());
345+
ASSERT_EQ(0, (*table_schema)->Id());
346+
ASSERT_EQ(3, (*table_schema)->HighestFieldId());
347+
ASSERT_EQ(1, (*table_schema)->PartitionKeys().size());
348+
ASSERT_EQ(0, (*table_schema)->PrimaryKeys().size());
349+
ASSERT_EQ(-1, (*table_schema)->NumBuckets());
350+
ASSERT_FALSE((*table_schema)->Comment().has_value());
351+
std::vector<std::string> field_names = (*table_schema)->FieldNames();
352+
std::vector<std::string> expected_field_names = {"f0", "f1", "f2", "f3"};
353+
ASSERT_EQ(field_names, expected_field_names);
354+
355+
ASSERT_OK_AND_ASSIGN(auto arrow_schema, (*table_schema)->GetArrowSchema());
356+
auto loaded_schema = arrow::ImportSchema(arrow_schema.get()).ValueOrDie();
357+
ASSERT_TRUE(typed_schema.Equals(loaded_schema));
358+
359+
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
360+
ASSERT_OK(fs->Delete(PathUtil::JoinPath(dir->Str(), "db1.db/tbl1/schema/schema-0")));
361+
ASSERT_OK_AND_ASSIGN(table_schema, catalog.LoadTableSchema(Identifier("db1", "tbl1")));
362+
ASSERT_FALSE(table_schema.has_value());
363+
364+
ASSERT_NOK_WITH_MSG(catalog.LoadTableSchema(Identifier("db1", "tbl$11")),
365+
"do not support loading schema for system table.");
366+
ArrowSchemaRelease(&schema);
367+
}
368+
}
369+
312370
} // namespace paimon::test

src/paimon/core/schema/schema.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "paimon/schema/schema.h"
18+
19+
#include "paimon/catalog/identifier.h"
20+
#include "paimon/core/catalog/file_system_catalog.h"
21+
#include "paimon/core/schema/schema_impl.h"
22+
#include "paimon/core/schema/schema_manager.h"
23+
#include "paimon/core/schema/table_schema.h"
24+
25+
namespace paimon {
26+
27+
Schema::Schema(std::unique_ptr<SchemaImpl>&& impl) : impl_(std::move(impl)) {}
28+
Schema::~Schema() = default;
29+
Result<std::unique_ptr<::ArrowSchema>> Schema::GetArrowSchema() const {
30+
return impl_->table_schema_->GetArrowSchema();
31+
}
32+
33+
std::vector<std::string> Schema::FieldNames() const {
34+
return impl_->table_schema_->FieldNames();
35+
}
36+
37+
int64_t Schema::Id() const {
38+
return impl_->table_schema_->Id();
39+
}
40+
41+
const std::vector<std::string>& Schema::PrimaryKeys() const {
42+
return impl_->table_schema_->PrimaryKeys();
43+
}
44+
45+
const std::vector<std::string>& Schema::PartitionKeys() const {
46+
return impl_->table_schema_->PartitionKeys();
47+
}
48+
49+
const std::vector<std::string>& Schema::BucketKeys() const {
50+
return impl_->table_schema_->BucketKeys();
51+
}
52+
53+
int32_t Schema::NumBuckets() const {
54+
return impl_->table_schema_->NumBuckets();
55+
}
56+
57+
int32_t Schema::HighestFieldId() const {
58+
return impl_->table_schema_->HighestFieldId();
59+
}
60+
61+
const std::map<std::string, std::string>& Schema::Options() const {
62+
return impl_->table_schema_->Options();
63+
}
64+
65+
std::optional<std::string> Schema::Comment() const {
66+
return impl_->table_schema_->Commit();
67+
}
68+
69+
} // namespace paimon
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <memory>
20+
21+
#include "paimon/core/schema/table_schema.h"
22+
23+
namespace paimon {
24+
/// PIMPL implementation for Schema
25+
class SchemaImpl {
26+
public:
27+
friend class Schema;
28+
explicit SchemaImpl(const std::shared_ptr<TableSchema>& table_schema)
29+
: table_schema_(table_schema) {}
30+
31+
private:
32+
std::shared_ptr<TableSchema> table_schema_;
33+
};
34+
35+
} // namespace paimon

src/paimon/core/schema/table_schema.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <utility>
2323

2424
#include "arrow/api.h"
25+
#include "arrow/c/bridge.h"
2526
#include "arrow/util/checked_cast.h"
2627
#include "fmt/format.h"
2728
#include "paimon/common/utils/arrow/status_utils.h"
@@ -333,4 +334,11 @@ bool TableSchema::CrossPartitionUpdate() const {
333334
return !ObjectUtils::ContainsAll(primary_keys_, partition_keys_);
334335
}
335336

337+
Result<std::unique_ptr<::ArrowSchema>> TableSchema::GetArrowSchema() const {
338+
std::shared_ptr<arrow::Schema> schema = DataField::ConvertDataFieldsToArrowSchema(fields_);
339+
auto arrow_schema = std::make_unique<::ArrowSchema>();
340+
PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*schema, arrow_schema.get()));
341+
return arrow_schema;
342+
}
343+
336344
} // namespace paimon

src/paimon/core/schema/table_schema.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ class TableSchema : public Jsonizable<TableSchema> {
9898

9999
bool CrossPartitionUpdate() const;
100100

101+
Result<std::unique_ptr<::ArrowSchema>> GetArrowSchema() const;
102+
101103
private:
102104
JSONIZABLE_FRIEND_AND_DEFAULT_CTOR(TableSchema);
103105

0 commit comments

Comments
 (0)