Skip to content

Commit d47d8e3

Browse files
committed
feat: add metadata columns definition
1 parent 1c2530c commit d47d8e3

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ set(ICEBERG_SOURCES
2323
expression/expression.cc
2424
file_reader.cc
2525
json_internal.cc
26+
metadata_columns.cc
2627
name_mapping.cc
2728
partition_field.cc
2829
partition_spec.cc

src/iceberg/metadata_columns.cc

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/metadata_columns.h"
21+
22+
#include <unordered_map>
23+
24+
#include <iceberg/result.h>
25+
26+
namespace iceberg {
27+
28+
namespace {
29+
30+
using MetadataColumnMap = std::unordered_map<std::string_view, const SchemaField*>;
31+
32+
const MetadataColumnMap& GetMetadataColumnMap() {
33+
static const MetadataColumnMap kMetadataColumnMap = {
34+
{MetadataColumns::kFilePath.name(), &MetadataColumns::kFilePath},
35+
{MetadataColumns::kRowPosition.name(), &MetadataColumns::kRowPosition},
36+
{MetadataColumns::kIsDeleted.name(), &MetadataColumns::kIsDeleted},
37+
{MetadataColumns::kSpecId.name(), &MetadataColumns::kSpecId},
38+
{MetadataColumns::kRowId.name(), &MetadataColumns::kRowId},
39+
{MetadataColumns::kLastUpdatedSequenceNumber.name(),
40+
&MetadataColumns::kLastUpdatedSequenceNumber}};
41+
return kMetadataColumnMap;
42+
}
43+
44+
const std::set<int32_t>& GetMetadataFieldIdSet() {
45+
static const std::set<int32_t> kMetadataFieldIds = {
46+
MetadataColumns::kFilePath.field_id(),
47+
MetadataColumns::kRowPosition.field_id(),
48+
MetadataColumns::kIsDeleted.field_id(),
49+
MetadataColumns::kSpecId.field_id(),
50+
MetadataColumns::kPartitionColumnId,
51+
MetadataColumns::kRowId.field_id(),
52+
MetadataColumns::kLastUpdatedSequenceNumber.field_id()};
53+
return kMetadataFieldIds;
54+
}
55+
56+
} // namespace
57+
58+
const std::set<int32_t>& MetadataColumns::MetadataFieldIds() {
59+
return GetMetadataFieldIdSet();
60+
}
61+
62+
bool MetadataColumns::IsMetadataColumn(std::string_view name) {
63+
return name == kPartitionColumnName ||
64+
GetMetadataColumnMap().find(name) != GetMetadataColumnMap().end();
65+
}
66+
67+
bool MetadataColumns::IsMetadataColumn(int32_t id) {
68+
return GetMetadataFieldIdSet().find(id) != GetMetadataFieldIdSet().end();
69+
}
70+
71+
/// Resolves a metadata column by name.
///
/// On success the result refers to one of the static SchemaField constants. Any name
/// missing from the lookup table produces an InvalidArgument error. Note that
/// kPartitionColumnName is accepted by IsMetadataColumn(name) but is not present in
/// the lookup table, so it also ends up on the error path here.
Result<std::reference_wrapper<const SchemaField>> MetadataColumns::MetadataColumn(
    std::string_view name) {
  const auto& columns = GetMetadataColumnMap();
  if (const auto entry = columns.find(name); entry != columns.end()) {
    return *entry->second;
  }
  return InvalidArgument("Unknown metadata column: {}", name);
}
80+
81+
} // namespace iceberg

src/iceberg/metadata_columns.h

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/metadata_columns.h
23+
/// Metadata columns for reading Iceberg data files.
24+
25+
#include <functional>
26+
#include <limits>
27+
#include <memory>
28+
#include <set>
29+
#include <string_view>
30+
31+
#include "iceberg/iceberg_export.h"
32+
#include "iceberg/result.h"
33+
#include "iceberg/schema_field.h"
34+
#include "iceberg/type.h"
35+
36+
namespace iceberg {
37+
38+
/// \brief A class containing constants and utility methods for metadata columns
39+
struct ICEBERG_EXPORT MetadataColumns {
40+
constexpr static int32_t kInt32Max = std::numeric_limits<int32_t>::max();
41+
42+
// IDs kInt32Max - (1-100) are used for metadata columns
43+
inline static const SchemaField kFilePath =
44+
SchemaField::MakeRequired(kInt32Max - 1, "_file", std::make_shared<StringType>(),
45+
"Path of the file in which a row is stored");
46+
47+
inline static const SchemaField kRowPosition =
48+
SchemaField::MakeRequired(kInt32Max - 2, "_pos", std::make_shared<LongType>(),
49+
"Ordinal position of a row in the source data file");
50+
51+
inline static const SchemaField kIsDeleted = SchemaField::MakeRequired(
52+
kInt32Max - 3, "_deleted", std::make_shared<BooleanType>(),
53+
"Whether the row has been deleted");
54+
55+
inline static const SchemaField kSpecId =
56+
SchemaField::MakeRequired(kInt32Max - 4, "_spec_id", std::make_shared<IntType>(),
57+
"Spec ID used to track the file containing a row");
58+
59+
// The partition column type depends on all specs in the table
60+
constexpr static int32_t kPartitionColumnId = kInt32Max - 5;
61+
constexpr static std::string_view kPartitionColumnName = "_partition";
62+
constexpr static std::string_view kPartitionColumnDoc =
63+
"Partition to which a row belongs to";
64+
65+
constexpr static int32_t kContentOffsetColumnId = kInt32Max - 6;
66+
constexpr static int32_t kContentSizeInBytesColumnId = kInt32Max - 7;
67+
68+
// IDs kInt32Max - (101-200) are used for reserved columns
69+
inline static const SchemaField kDeleteFilePath = SchemaField::MakeRequired(
70+
kInt32Max - 101, "file_path", std::make_shared<StringType>(),
71+
"Path of a file in which a deleted row is stored");
72+
73+
inline static const SchemaField kDeleteFilePos =
74+
SchemaField::MakeRequired(kInt32Max - 102, "pos", std::make_shared<LongType>(),
75+
"Ordinal position of a deleted row in the data file");
76+
77+
// The row column type depends on the table schema
78+
constexpr static int32_t kDeleteFileRowFieldId = kInt32Max - 103;
79+
constexpr static std::string_view kDeleteFileRowFieldName = "row";
80+
constexpr static std::string_view kDeleteFileRowDoc = "Deleted row values";
81+
82+
inline static const SchemaField kChangeType = SchemaField::MakeRequired(
83+
kInt32Max - 104, "_change_type", std::make_shared<StringType>(),
84+
"Record type in changelog");
85+
86+
inline static const SchemaField kChangeOrdinal = SchemaField::MakeOptional(
87+
kInt32Max - 105, "_change_ordinal", std::make_shared<IntType>(),
88+
"Change ordinal in changelog");
89+
90+
inline static const SchemaField kCommitSnapshotId =
91+
SchemaField::MakeOptional(kInt32Max - 106, "_commit_snapshot_id",
92+
std::make_shared<LongType>(), "Commit snapshot ID");
93+
94+
inline static const SchemaField kRowId =
95+
SchemaField::MakeOptional(kInt32Max - 107, "_row_id", std::make_shared<LongType>(),
96+
"Implicit row ID that is automatically assigned");
97+
98+
inline static const SchemaField kLastUpdatedSequenceNumber = SchemaField::MakeOptional(
99+
kInt32Max - 108, "_last_updated_sequence_number", std::make_shared<LongType>(),
100+
"Sequence number when the row was last updated");
101+
102+
/// \brief Get the set of metadata field IDs.
103+
static const std::set<int32_t>& MetadataFieldIds();
104+
105+
/// \brief Check if a column name is a metadata column.
106+
static bool IsMetadataColumn(std::string_view name);
107+
108+
/// \brief Check if a column ID is a metadata column.
109+
static bool IsMetadataColumn(int32_t id);
110+
111+
/// \brief Get a metadata column by name.
112+
///
113+
/// \param name The name of the metadata column.
114+
/// \return The metadata column, or an error if the name does not refer to a metadata
115+
/// column.
116+
static Result<std::reference_wrapper<const SchemaField>> MetadataColumn(
117+
std::string_view name);
118+
119+
/// TODO(gangwu): add functions to build partition columns from a table schema
120+
};
121+
122+
} // namespace iceberg

0 commit comments

Comments
 (0)