Skip to content

Commit be1351b

Browse files
committed
feat: add metadata columns definition
1 parent 1c2530c commit be1351b

File tree

3 files changed

+199
-0
lines changed

3 files changed

+199
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ set(ICEBERG_SOURCES
2323
expression/expression.cc
2424
file_reader.cc
2525
json_internal.cc
26+
metadata_columns.cc
2627
name_mapping.cc
2728
partition_field.cc
2829
partition_spec.cc

src/iceberg/metadata_columns.cc

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/metadata_columns.h"
21+
22+
#include <unordered_map>
23+
24+
namespace iceberg {
25+
26+
namespace {
27+
28+
using MetadataColumnMap = std::unordered_map<std::string_view, const SchemaField*>;
29+
30+
const MetadataColumnMap& GetMetadataColumnMap() {
31+
static const MetadataColumnMap kMetadataColumnMap = {
32+
{MetadataColumns::kFilePath.name(), &MetadataColumns::kFilePath},
33+
{MetadataColumns::kRowPosition.name(), &MetadataColumns::kRowPosition},
34+
{MetadataColumns::kIsDeleted.name(), &MetadataColumns::kIsDeleted},
35+
{MetadataColumns::kSpecId.name(), &MetadataColumns::kSpecId},
36+
{MetadataColumns::kRowId.name(), &MetadataColumns::kRowId},
37+
{MetadataColumns::kLastUpdatedSequenceNumber.name(),
38+
&MetadataColumns::kLastUpdatedSequenceNumber}};
39+
return kMetadataColumnMap;
40+
}
41+
42+
const std::set<int32_t>& GetMetadataFieldIdSet() {
43+
static const std::set<int32_t> kMetadataFieldIds = {
44+
MetadataColumns::kFilePath.field_id(),
45+
MetadataColumns::kRowPosition.field_id(),
46+
MetadataColumns::kIsDeleted.field_id(),
47+
MetadataColumns::kSpecId.field_id(),
48+
MetadataColumns::kPartitionColumnId,
49+
MetadataColumns::kRowId.field_id(),
50+
MetadataColumns::kLastUpdatedSequenceNumber.field_id()};
51+
return kMetadataFieldIds;
52+
}
53+
54+
} // namespace
55+
56+
const std::set<int32_t>& MetadataColumns::MetadataFieldIds() {
57+
return GetMetadataFieldIdSet();
58+
}
59+
60+
bool MetadataColumns::IsMetadataColumn(std::string_view name) {
61+
return name == kPartitionColumnName ||
62+
GetMetadataColumnMap().find(name) != GetMetadataColumnMap().end();
63+
}
64+
65+
bool MetadataColumns::IsMetadataColumn(int32_t id) {
66+
return GetMetadataFieldIdSet().find(id) != GetMetadataFieldIdSet().end();
67+
}
68+
69+
/// Look up a metadata column definition by name.
///
/// Returns a pointer to the matching static SchemaField, or an InvalidArgument
/// error when `name` is not in the name lookup table. Note that `_partition` is
/// not in that table, so it is reported as unknown here even though
/// IsMetadataColumn(name) accepts it.
Result<const SchemaField*> MetadataColumns::MetadataColumn(std::string_view name) {
  const auto& columns_by_name = GetMetadataColumnMap();
  if (const auto entry = columns_by_name.find(name); entry != columns_by_name.end()) {
    return entry->second;
  }
  return InvalidArgument("Unknown metadata column: {}", name);
}
77+
78+
} // namespace iceberg

src/iceberg/metadata_columns.h

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/metadata_columns.h
23+
/// Metadata columns for reading Iceberg data files.
24+
25+
#include <limits>
26+
#include <memory>
27+
#include <set>
28+
#include <string_view>
29+
30+
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/result.h"
32+
#include "iceberg/schema_field.h"
33+
#include "iceberg/type.h"
34+
35+
namespace iceberg {
36+
37+
/// \brief A class containing constants and utility methods for metadata columns
38+
struct ICEBERG_EXPORT MetadataColumns {
39+
constexpr static int32_t kInt32Max = std::numeric_limits<int32_t>::max();
40+
41+
// IDs kInt32Max - (1-100) are used for metadata columns
42+
inline static const SchemaField kFilePath =
43+
SchemaField::MakeRequired(kInt32Max - 1, "_file", std::make_shared<StringType>(),
44+
"Path of the file in which a row is stored");
45+
46+
inline static const SchemaField kRowPosition =
47+
SchemaField::MakeRequired(kInt32Max - 2, "_pos", std::make_shared<LongType>(),
48+
"Ordinal position of a row in the source data file");
49+
50+
inline static const SchemaField kIsDeleted = SchemaField::MakeRequired(
51+
kInt32Max - 3, "_deleted", std::make_shared<BooleanType>(),
52+
"Whether the row has been deleted");
53+
54+
inline static const SchemaField kSpecId =
55+
SchemaField::MakeRequired(kInt32Max - 4, "_spec_id", std::make_shared<IntType>(),
56+
"Spec ID used to track the file containing a row");
57+
58+
// The partition column type depends on all specs in the table
59+
constexpr static int32_t kPartitionColumnId = kInt32Max - 5;
60+
constexpr static std::string_view kPartitionColumnName = "_partition";
61+
constexpr static std::string_view kPartitionColumnDoc =
62+
"Partition to which a row belongs to";
63+
64+
constexpr static int32_t kContentOffsetColumnId = kInt32Max - 6;
65+
constexpr static int32_t kContentSizeInBytesColumnId = kInt32Max - 7;
66+
67+
// IDs kInt32Max - (101-200) are used for reserved columns
68+
inline static const SchemaField kDeleteFilePath = SchemaField::MakeRequired(
69+
kInt32Max - 101, "file_path", std::make_shared<StringType>(),
70+
"Path of a file in which a deleted row is stored");
71+
72+
inline static const SchemaField kDeleteFilePos =
73+
SchemaField::MakeRequired(kInt32Max - 102, "pos", std::make_shared<LongType>(),
74+
"Ordinal position of a deleted row in the data file");
75+
76+
// The row column type depends on the table schema
77+
constexpr static int32_t kDeleteFileRowFieldId = kInt32Max - 103;
78+
constexpr static std::string_view kDeleteFileRowFieldName = "row";
79+
constexpr static std::string_view kDeleteFileRowDoc = "Deleted row values";
80+
81+
inline static const SchemaField kChangeType = SchemaField::MakeRequired(
82+
kInt32Max - 104, "_change_type", std::make_shared<StringType>(),
83+
"Record type in changelog");
84+
85+
inline static const SchemaField kChangeOrdinal = SchemaField::MakeOptional(
86+
kInt32Max - 105, "_change_ordinal", std::make_shared<IntType>(),
87+
"Change ordinal in changelog");
88+
89+
inline static const SchemaField kCommitSnapshotId =
90+
SchemaField::MakeOptional(kInt32Max - 106, "_commit_snapshot_id",
91+
std::make_shared<LongType>(), "Commit snapshot ID");
92+
93+
inline static const SchemaField kRowId =
94+
SchemaField::MakeOptional(kInt32Max - 107, "_row_id", std::make_shared<LongType>(),
95+
"Implicit row ID that is automatically assigned");
96+
97+
inline static const SchemaField kLastUpdatedSequenceNumber = SchemaField::MakeOptional(
98+
kInt32Max - 108, "_last_updated_sequence_number", std::make_shared<LongType>(),
99+
"Sequence number when the row was last updated");
100+
101+
/// \brief Get the set of metadata field IDs.
102+
static const std::set<int32_t>& MetadataFieldIds();
103+
104+
/// \brief Check if a column name is a metadata column.
105+
static bool IsMetadataColumn(std::string_view name);
106+
107+
/// \brief Check if a column ID is a metadata column.
108+
static bool IsMetadataColumn(int32_t id);
109+
110+
/// \brief Get a metadata column by name.
111+
///
112+
/// \param name The name of the metadata column.
113+
/// \return The metadata column, or an error if the name does not refer to a metadata
114+
/// column. The returned pointer is guaranteed to be valid.
115+
static Result<const SchemaField*> MetadataColumn(std::string_view name);
116+
117+
/// TODO(gangwu): add functions to build partition columns from a table schema
118+
};
119+
120+
} // namespace iceberg

0 commit comments

Comments
 (0)