Skip to content

Commit 1ecea8e

Browse files
authored
feat(catalog): add LoadTableSchema interface (#10)
1 parent 08e8be7 commit 1ecea8e

File tree

8 files changed

+244
-1
lines changed

8 files changed

+244
-1
lines changed

include/paimon/catalog/catalog.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include "paimon/catalog/identifier.h"
2525
#include "paimon/result.h"
26+
#include "paimon/schema/schema.h"
2627
#include "paimon/status.h"
2728
#include "paimon/type_fwd.h"
2829
#include "paimon/visibility.h"
@@ -93,6 +94,16 @@ class PAIMON_EXPORT Catalog {
9394
/// @return A result containing a vector of table names in the specified database, or an error
9495
/// status.
9596
virtual Result<std::vector<std::string>> ListTables(const std::string& db_name) const = 0;
97+
98+
/// Loads the latest schema of a specified table.
99+
///
100+
/// @note System tables will not be supported.
101+
///
102+
/// @param identifier The identifier (database and table name) of the table to load.
103+
/// @return A result containing table schema if the table exists, or std::nullopt if it
104+
/// doesn't, or an error status on failure.
105+
virtual Result<std::optional<std::shared_ptr<Schema>>> LoadTableSchema(
106+
const Identifier& identifier) const = 0;
96107
};
97108

98109
} // namespace paimon

include/paimon/schema/schema.h

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <map>
20+
#include <memory>
21+
#include <string>
22+
#include <vector>
23+
24+
#include "arrow/api.h"
25+
#include "arrow/c/bridge.h"
26+
#include "paimon/result.h"
27+
#include "paimon/visibility.h"
28+
29+
struct ArrowSchema;
30+
31+
namespace paimon {
32+
33+
/// This interface provides access to TableSchema-related information.
34+
class PAIMON_EXPORT Schema {
35+
public:
36+
virtual ~Schema() = default;
37+
38+
/// Get the Arrow C schema representation of this table schema.
39+
/// @return A result containing an ArrowSchema, or an error status if conversion fails.
40+
virtual Result<std::unique_ptr<::ArrowSchema>> GetArrowSchema() const = 0;
41+
42+
/// Get the names of all fields in the table schema.
43+
/// @return A vector of field names.
44+
virtual std::vector<std::string> FieldNames() const = 0;
45+
46+
/// Get the unique identifier of this table schema.
47+
/// @return The schema ID
48+
virtual int64_t Id() const = 0;
49+
50+
/// Get the list of primary key field names.
51+
/// @return A reference to the vector of primary key names; empty if no primary keys are
52+
/// defined.
53+
virtual const std::vector<std::string>& PrimaryKeys() const = 0;
54+
55+
/// Get the list of partition key field names.
56+
/// @return A reference to the vector of partition key names; empty if the table is not
57+
/// partitioned.
58+
virtual const std::vector<std::string>& PartitionKeys() const = 0;
59+
60+
/// Get the list of bucket key field names used for bucketing.
61+
/// @return A reference to the vector of bucket key names.
62+
virtual const std::vector<std::string>& BucketKeys() const = 0;
63+
64+
/// Get the number of buckets configured for this table.
65+
/// @return The number of buckets.
66+
virtual int32_t NumBuckets() const = 0;
67+
68+
/// Get the highest field ID assigned in this schema.
69+
/// @return The maximum field ID.
70+
virtual int32_t HighestFieldId() const = 0;
71+
72+
/// Get the table-level options associated with this schema.
73+
/// @return A reference to the map of option key-value pairs (e.g., file format, filesystem).
74+
virtual const std::map<std::string, std::string>& Options() const = 0;
75+
76+
/// Get an optional comment describing the table.
77+
/// @return The table comment if set, or std::nullopt otherwise.
78+
virtual std::optional<std::string> Comment() const = 0;
79+
};
80+
81+
} // namespace paimon

src/paimon/core/catalog/file_system_catalog.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "paimon/common/utils/arrow/status_utils.h"
2828
#include "paimon/common/utils/path_util.h"
2929
#include "paimon/common/utils/string_utils.h"
30+
#include "paimon/core/schema/schema_impl.h"
3031
#include "paimon/core/schema/schema_manager.h"
3132
#include "paimon/fs/file_system.h"
3233
#include "paimon/logging.h"
@@ -211,4 +212,19 @@ Result<bool> FileSystemCatalog::TableExistsInFileSystem(const std::string& table
211212
}
212213
}
213214

215+
Result<std::optional<std::shared_ptr<Schema>>> FileSystemCatalog::LoadTableSchema(
216+
const Identifier& identifier) const {
217+
if (IsSystemTable(identifier)) {
218+
return Status::NotImplemented("do not support loading schema for system table.");
219+
}
220+
SchemaManager schema_manager(fs_, NewDataTablePath(warehouse_, identifier));
221+
PAIMON_ASSIGN_OR_RAISE(std::optional<std::shared_ptr<TableSchema>> latest_schema,
222+
schema_manager.Latest());
223+
if (latest_schema.has_value()) {
224+
std::shared_ptr<Schema> schema = std::make_shared<SchemaImpl>(*latest_schema);
225+
return std::optional<std::shared_ptr<Schema>>(schema);
226+
}
227+
return std::optional<std::shared_ptr<Schema>>();
228+
}
229+
214230
} // namespace paimon

src/paimon/core/catalog/file_system_catalog.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ class FileSystemCatalog : public Catalog {
4949

5050
Result<std::vector<std::string>> ListDatabases() const override;
5151
Result<std::vector<std::string>> ListTables(const std::string& database_names) const override;
52+
Result<std::optional<std::shared_ptr<Schema>>> LoadTableSchema(
53+
const Identifier& identifier) const override;
5254

5355
private:
5456
static std::string NewDatabasePath(const std::string& warehouse, const std::string& db_name);

src/paimon/core/catalog/file_system_catalog_test.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ TEST(FileSystemCatalogTest, TestCreateTableWithBlob) {
180180
ASSERT_OK_AND_ASSIGN(std::vector<std::string> table_names, catalog.ListTables("db1"));
181181
ASSERT_EQ(1, table_names.size());
182182
ASSERT_EQ(table_names[0], "tbl1");
183+
ASSERT_OK_AND_ASSIGN(std::optional<std::shared_ptr<Schema>> table_schema,
184+
catalog.LoadTableSchema(Identifier("db1", "tbl1")));
185+
ASSERT_TRUE(table_schema.has_value());
186+
ASSERT_OK_AND_ASSIGN(auto arrow_schema, (*table_schema)->GetArrowSchema());
187+
auto loaded_schema = arrow::ImportSchema(arrow_schema.get()).ValueOrDie();
188+
ASSERT_TRUE(typed_schema.Equals(loaded_schema));
183189
ArrowSchemaRelease(&schema);
184190
}
185191

@@ -309,4 +315,54 @@ TEST(FileSystemCatalogTest, TestInvalidList) {
309315
"do not support listing tables for system database.");
310316
}
311317

318+
TEST(FileSystemCatalogTest, TestValidateTableSchema) {
319+
std::map<std::string, std::string> options;
320+
options[Options::FILE_SYSTEM] = "local";
321+
options[Options::FILE_FORMAT] = "orc";
322+
ASSERT_OK_AND_ASSIGN(auto core_options, CoreOptions::FromMap(options));
323+
auto dir = UniqueTestDirectory::Create();
324+
ASSERT_TRUE(dir);
325+
FileSystemCatalog catalog(core_options.GetFileSystem(), dir->Str());
326+
ASSERT_OK(catalog.CreateDatabase("db1", options, /*ignore_if_exists=*/true));
327+
arrow::FieldVector fields = {
328+
arrow::field("f0", arrow::utf8()),
329+
arrow::field("f1", arrow::int32()),
330+
arrow::field("f2", arrow::int32()),
331+
arrow::field("f3", arrow::float64()),
332+
};
333+
arrow::Schema typed_schema(fields);
334+
::ArrowSchema schema;
335+
ASSERT_TRUE(arrow::ExportSchema(typed_schema, &schema).ok());
336+
ASSERT_OK(catalog.CreateTable(Identifier("db1", "tbl1"), &schema, {"f1"}, {}, options,
337+
/*ignore_if_exists=*/false));
338+
339+
ASSERT_OK_AND_ASSIGN(std::optional<std::shared_ptr<Schema>> table_schema,
340+
catalog.LoadTableSchema(Identifier("db0", "tbl0")));
341+
ASSERT_FALSE(table_schema.has_value());
342+
ASSERT_OK_AND_ASSIGN(table_schema, catalog.LoadTableSchema(Identifier("db1", "tbl1")));
343+
ASSERT_TRUE(table_schema.has_value());
344+
ASSERT_EQ(0, (*table_schema)->Id());
345+
ASSERT_EQ(3, (*table_schema)->HighestFieldId());
346+
ASSERT_EQ(1, (*table_schema)->PartitionKeys().size());
347+
ASSERT_EQ(0, (*table_schema)->PrimaryKeys().size());
348+
ASSERT_EQ(-1, (*table_schema)->NumBuckets());
349+
ASSERT_FALSE((*table_schema)->Comment().has_value());
350+
std::vector<std::string> field_names = (*table_schema)->FieldNames();
351+
std::vector<std::string> expected_field_names = {"f0", "f1", "f2", "f3"};
352+
ASSERT_EQ(field_names, expected_field_names);
353+
354+
ASSERT_OK_AND_ASSIGN(auto arrow_schema, (*table_schema)->GetArrowSchema());
355+
auto loaded_schema = arrow::ImportSchema(arrow_schema.get()).ValueOrDie();
356+
ASSERT_TRUE(typed_schema.Equals(loaded_schema));
357+
358+
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
359+
ASSERT_OK(fs->Delete(PathUtil::JoinPath(dir->Str(), "db1.db/tbl1/schema/schema-0")));
360+
ASSERT_OK_AND_ASSIGN(table_schema, catalog.LoadTableSchema(Identifier("db1", "tbl1")));
361+
ASSERT_FALSE(table_schema.has_value());
362+
363+
ASSERT_NOK_WITH_MSG(catalog.LoadTableSchema(Identifier("db1", "tbl$11")),
364+
"do not support loading schema for system table.");
365+
ArrowSchemaRelease(&schema);
366+
}
367+
312368
} // namespace paimon::test
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <map>
20+
#include <memory>
21+
#include <string>
22+
#include <vector>
23+
24+
#include "paimon/core/schema/table_schema.h"
25+
26+
namespace paimon {
27+
28+
class SchemaImpl : public Schema {
29+
public:
30+
explicit SchemaImpl(const std::shared_ptr<TableSchema>& table_schema)
31+
: table_schema_(table_schema) {}
32+
Result<std::unique_ptr<::ArrowSchema>> GetArrowSchema() const override {
33+
return table_schema_->GetArrowSchema();
34+
}
35+
std::vector<std::string> FieldNames() const override {
36+
return table_schema_->FieldNames();
37+
}
38+
int64_t Id() const override {
39+
return table_schema_->Id();
40+
}
41+
const std::vector<std::string>& PrimaryKeys() const override {
42+
return table_schema_->PrimaryKeys();
43+
}
44+
const std::vector<std::string>& PartitionKeys() const override {
45+
return table_schema_->PartitionKeys();
46+
}
47+
const std::vector<std::string>& BucketKeys() const override {
48+
return table_schema_->BucketKeys();
49+
}
50+
int32_t NumBuckets() const override {
51+
return table_schema_->NumBuckets();
52+
}
53+
int32_t HighestFieldId() const override {
54+
return table_schema_->HighestFieldId();
55+
}
56+
const std::map<std::string, std::string>& Options() const override {
57+
return table_schema_->Options();
58+
}
59+
std::optional<std::string> Comment() const override {
60+
return table_schema_->Comment();
61+
}
62+
63+
private:
64+
std::shared_ptr<TableSchema> table_schema_;
65+
};
66+
67+
} // namespace paimon

src/paimon/core/schema/table_schema.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <utility>
2323

2424
#include "arrow/api.h"
25+
#include "arrow/c/bridge.h"
2526
#include "arrow/util/checked_cast.h"
2627
#include "fmt/format.h"
2728
#include "paimon/common/utils/arrow/status_utils.h"
@@ -333,4 +334,11 @@ bool TableSchema::CrossPartitionUpdate() const {
333334
return !ObjectUtils::ContainsAll(primary_keys_, partition_keys_);
334335
}
335336

337+
Result<std::unique_ptr<::ArrowSchema>> TableSchema::GetArrowSchema() const {
338+
std::shared_ptr<arrow::Schema> schema = DataField::ConvertDataFieldsToArrowSchema(fields_);
339+
auto arrow_schema = std::make_unique<::ArrowSchema>();
340+
PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*schema, arrow_schema.get()));
341+
return arrow_schema;
342+
}
343+
336344
} // namespace paimon

src/paimon/core/schema/table_schema.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,14 @@ class TableSchema : public Jsonizable<TableSchema> {
9292

9393
Result<std::vector<std::string>> TrimmedPrimaryKeys() const;
9494

95-
std::optional<std::string> Commit() const {
95+
std::optional<std::string> Comment() const {
9696
return comment_;
9797
}
9898

9999
bool CrossPartitionUpdate() const;
100100

101+
Result<std::unique_ptr<::ArrowSchema>> GetArrowSchema() const;
102+
101103
private:
102104
JSONIZABLE_FRIEND_AND_DEFAULT_CTOR(TableSchema);
103105

0 commit comments

Comments
 (0)